In [None]:
import pandas as pd
import json
import hypernetx as hnx
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import os
import glob
from tqdm.notebook import tqdm

## Load Data

In [None]:
# Pool the records found under ["hits"]["hits"] of every JSON file in ./data.
paps = []
# glob returns paths in arbitrary, OS-dependent order; sorting keeps the order
# of `paps` (and everything indexed from it later) reproducible between runs.
for f_name in sorted(glob.glob("./data/*.json")):
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(f_name, "r", encoding="utf-8") as f:
        dat = json.load(f)
    # extend() appends in place; `paps = paps + ...` re-copied the whole
    # accumulated list for every file.
    paps.extend(dat["hits"]["hits"])
print(f"{len(paps)} papers loaded")

### How to Get Different Elements

In [None]:
def get_authors(pap):
    """Return the ``full_name`` of every author listed in a paper record.

    A record whose metadata has no ``"authors"`` key yields an empty list.
    """
    metadata = pap["metadata"]
    if "authors" not in metadata:
        return []
    names = []
    for author in metadata["authors"]:
        names.append(author["full_name"])
    return names
def get_title(pap):
    """Return the ``title`` text of the first entry in the record's ``titles`` list."""
    first_entry = pap["metadata"]["titles"][0]
    return first_entry["title"]

def get_type(pap):
    """Return the record's ``primary_arxiv_category`` metadata entry unchanged."""
    metadata = pap["metadata"]
    return metadata["primary_arxiv_category"]
    
def get_id(pap):
    """Return the record's top-level ``id`` field."""
    record_id = pap["id"]
    return record_id

## HyperGraph

In [None]:
# Paper id -> list of author names; this dict is the input of the hypergraph.
# If two records share an id, the later one wins.
paps_dict = dict((get_id(p), list(get_authors(p))) for p in paps)

H = hnx.Hypergraph(paps_dict)
H.shape

### Simple distributions

In [None]:
# Plain list of the hypergraph's node items (author names, given how paps_dict was built).
authors = list(H.nodes.items)

In [None]:
def plot_loghist(x, bins, ax=None, title = None):
    """Draw a histogram of the positive values of ``x`` using log-spaced bins.

    Parameters
    ----------
    x : iterable of numbers
        Values to plot. Entries that are not strictly positive are dropped,
        since they cannot be placed on a logarithmic axis.
    bins : int or sequence
        Passed to ``np.histogram_bin_edges``; it determines the overall range
        and the number of bin edges, which are then re-spaced logarithmically.
    ax : axes-like, optional
        Axes to draw on; defaults to ``plt.gca()``.
    title : str, optional
        Text used as the x-axis label (not as the axes title).

    Returns
    -------
    None
    """
    if ax is None:
        ax = plt.gca()
    positive = [x_ for x_ in x if x_ > 0]
    # With nothing left to plot, skip the histogram: the linear edges would
    # start at 0 and log10(0) is not a usable bin edge.
    if positive:
        edges = np.histogram_bin_edges(positive, bins=bins)
        # numpy widens the range when all values are equal, which can push the
        # first edge to <= 0; fall back to the smallest datum in that case.
        lower = edges[0] if edges[0] > 0 else min(positive)
        logbins = np.logspace(np.log10(lower), np.log10(edges[-1]), len(edges))
        ax.hist(positive, bins=logbins)
    ax.set_xscale('log')
    if title:
        ax.set_xlabel(title)

In [None]:
# Two log-binned histograms side by side: edge sizes (left) and node degrees (right).
fig, axes = plt.subplots(1,2, figsize=(8,4))
edge_sizes = hnx.edge_size_dist(H)
node_sizes = [H.degree(n, s=1) for n in H.nodes]
# plot_loghist uses its `title` argument as the x-axis label, not as an axes title.
plot_loghist(edge_sizes, 20, axes[0], title = "# authors for paper")
plot_loghist(node_sizes, 20, axes[1], title = "# papers for authors")
plt.show()

### Author List

### Connected Components

In [None]:
# For each component keep [shape, node items, edge items]; the node items are
# what later cells pass to H.restrict_to_nodes to rebuild the component.
comp_sizes = [[comp.shape, comp.nodes.items, comp.edges.items] for comp in tqdm(H.s_component_subgraphs())]
print("There are ", len(comp_sizes), " connected components")
# Ascending sort on the shape tuple, then reversed: largest shape first
# (the reversal also reverses the relative order of equal shapes).
sorted_comps = sorted(comp_sizes, key=lambda x: x[0])[::-1]

In [None]:
# One bar per component, in sorted order; bar height is the first entry of its shape.
plt.bar(range(len(sorted_comps)), [c[0][0] for c in sorted_comps])
# Fixed y-range: bars taller than 30 are cut off.
plt.ylim(0, 30)
plt.title("Connected Components")
plt.show()

Here are plots of six of the connected components, starting at position 20 of the size-sorted list.

In [None]:
# Fixed seed, presumably so the drawing layout is reproducible -- TODO confirm.
np.random.seed(1)
# Index in sorted_comps of the first component to draw; components min_ .. min_+5 are shown.
# NOTE(review): raises IndexError if sorted_comps has fewer than min_ + 6 entries.
min_ = 20
fig = plt.figure(figsize=(16, 8))
# fig, axes = plt.subplots(nrows=1, ncols=3, figsize = (12, 4))
for i in range(6):
    # `h` is a notebook-global name: it is rebound on every pass and keeps the
    # last component after the loop.
    h = H.restrict_to_nodes( sorted_comps[min_+i][1])
    # h = h.toplexes(return_hyp=True)
    plt.subplot(2, 3, i+1)
    hnx.draw(h)
    # Title: position in sorted_comps and the first entry of the component's shape.
    plt.title("[#" + str(min_+i) + ']='+str(sorted_comps[min_+i][0][0]))


### Diameters and Distances

#### The largest component

In [None]:
# Diameter of the largest component (the first entry of sorted_comps)
h_max = H.restrict_to_nodes(sorted_comps[0][1])
print("[]=", h_max.shape)
print("edge_diameter = ", h_max.edge_diameter())

Working with the full component takes a long time, so we collapse its nodes and edges first.

In [None]:
# Collapsed version of the largest component; its shape is printed for comparison with h_max.
h_max_c = h_max.collapse_nodes_and_edges()
print("[] = ", h_max_c.shape)
# Cell output: [node diameter, edge diameter] of the collapsed component.
[h_max_c.diameter(), h_max_c.edge_diameter()]

Here is a distance between some authors

In [None]:
# Pick two nodes of the collapsed component by position (indices 1 and 10 are
# arbitrary) and print the distance between them.
au = list(h_max_c.nodes)
a1, a2 = au[1], au[10]
print("|(", a1, ")-(", a2, ")|=", h_max_c.distance(a1, a2))

#### Smaller Cluster

In [None]:
# Fixed seed, presumably so the drawing layout is reproducible -- TODO confirm.
np.random.seed(122)
# Rebind the notebook-global `h` to the component at position 5 of sorted_comps;
# the later cells in view (bipartite view, walks, centrality) use this `h`.
h = H.restrict_to_nodes(sorted_comps[5][1])
# h = h.toplexes(return_hyp=True)
hnx.draw(h)
# Cell output: [node diameter at s=1, edge diameter].
[h.diameter(s=1), h.edge_diameter()]

In [None]:
# Two nodes of `h` chosen by position (indices 16 and 9 are arbitrary and
# assume the component has at least 17 nodes); print their distance.
au = list(h.nodes.items)
a1, a2 = au[16], au[9]
print("|'", a1, "'-'", a2, "'|=", h.distance(a1, a2))

With s=2 this cluster is not connected; here are the diameters and contents of the sub-clusters.

#### My Cluster

In [None]:
"Luchinsky, A.V." in sorted_comps[0][1]

In [None]:
# Positions in sorted_comps of the components whose node items contain this author.
np.where(["Luchinsky, A.V." in cl[1] for cl in sorted_comps])

In [None]:
# Take the first component containing the author and rebuild it as a sub-hypergraph.
# NOTE(review): cl_luch[0] raises IndexError if no component contains the name.
cl_luch = [cl for cl in sorted_comps if "Luchinsky, A.V." in cl[1]]
h_Luch = H.restrict_to_nodes(cl_luch[0][1])
h_Luch.shape

## LineGraphs

In [None]:
# Draw `h` twice with one shared layout: as a plain networkx bipartite graph
# (left) and with hypernetx's bipartite/Euler drawing (right).
fig, axes = plt.subplots(1, 2, figsize = (12, 4))
bh = h.bipartite()
# One side of the two-colouring is used as the "top" set of the layout.
top = nx.bipartite.sets(bh)[0]
pos = nx.bipartite_layout(bh, top)
nx.draw(bh, with_labels = True, ax=axes[0], pos = pos)
hnx.draw_bipartite_using_euler(h, pos=pos, ax=axes[1])

## Walks

In [None]:
# Diameter of the current cluster `h`, for reference in the walk examples below.
h.diameter()

In [None]:
# Adjacency matrix of `h` at s=1, plus the index used below to translate
# matrix positions back into node names.
mat, rindex = h.adjacency_matrix(s=1, index=True)
# toarray() gives a plain ndarray, which is what nx.from_numpy_array expects;
# todense() returned the legacy np.matrix type.
G = nx.from_numpy_array(mat.toarray())

In [None]:
# Shape of the incidence table of `h`.
h.incidence_dataframe().shape

In [None]:
# Show the index returned alongside the adjacency matrix.
rindex

In [None]:
# Two arbitrary positions in the adjacency index, translated to node names.
i1, i2  = 1, 12
name1, name2 = rindex[i1], rindex[i2]
# Hypergraph distance between the two chosen nodes.
h.distance(name1, name2)

In [None]:
# Shortest path in the bipartite graph of `h`; the cells below split it into
# the entries that are nodes of `h` and those that are edges of `h`.
hb = h.bipartite()
path = nx.shortest_path(hb, source=name1, target=name2)
path

In [None]:
# # Check
# np.all([
#     h.incidence_dataframe().loc[name1,]['1637222'] == 1,
#     h.incidence_dataframe().loc['Zhao, Qiang',]['1637222'] == 1,
#     h.incidence_dataframe().loc['Zhao, Qiang',]['1254432'] == 1,
#     h.incidence_dataframe().loc['Guo, Feng-Kun',]['1254432'] == 1,
#     h.incidence_dataframe().loc['Guo, Feng-Kun',]['2778309'] == 1,
#     h.incidence_dataframe().loc[name2,]['2778309'] == 1
# ])    

In [None]:
# Split the bipartite path into its node entries and its edge entries.
path_nodes = [a for a in path if a in h.nodes]
path_edges = [a for a in path if a in h.edges]
# Sub-hypergraph of the full hypergraph H restricted to the path's nodes and edges.
H_path = H.restrict_to_nodes(path_nodes).restrict_to_edges(path_edges)

In [None]:
# 1. Find the shortest path between name1 and name2 in the bipartite graph `hb`
#    (the same query as in the earlier cell that first computed `path`)
path = nx.shortest_path(hb, name1, name2)
print(f"The shortest path is: {path}")

In [None]:
# 3. Prepare the path membership lists used for colouring the plot
# Separate the path into nodes and hyperedges
path_nodes = [n for n in path if n in h.nodes()]
path_edges = [e for e in path if e in h.edges()]

In [None]:
# Set custom colors for the nodes and edges: members of the path are
# highlighted, everything else is greyed out
node_colors = ["blue" if node in path_nodes else "gray" for node in h.nodes()]
edge_colors = ["red" if edge in path_edges else "lightgray" for edge in h.edges()]
# Fixed seed, presumably so the layout is reproducible -- TODO confirm
np.random.seed(122)
# 4. Plot the hypergraph with highlighted path
ax = plt.gca()
hnx.draw(h,  ax=ax,with_edge_labels = False, 
             nodes_kwargs={"facecolors": node_colors},
    edges_kwargs={"facecolors": edge_colors, "edgecolors": "black"},

        )
# The title names the first and last node entries of the path
ax.set_title(f"Shortest Path from {path_nodes[0]} to {path_nodes[-1]}")
plt.show()
print(f"The shortest path is: {path}")

### Centrality

In [None]:
# Diameter of the cluster `h` whose centralities are computed in this section.
h.diameter()

In [None]:
# Draw the cluster and print its shape and both diameters for reference.
hnx.draw(h)
print(" shape", h.shape)
print(f" diameter: {h.diameter()},\t edges diameter: {h.edge_diameter()}")

In [None]:
# Label -> hypernetx centrality function. The label is what get_cent_df
# receives as `cent_func_name` and stores in the "func_name" column.
cent_func_dict = {
    "closeness_centrality":hnx.s_closeness_centrality,
    "betweenness_centrality": hnx.s_betweenness_centrality,
    "harmonic_centrality":hnx.s_harmonic_centrality
    # "harmonic_closeness_centrality":hnx.s_harmonic_closeness_centrality
}
def get_cent_df(h, s=1, cent_func_name = "closeness_centrality"):
    """Tabulate one centrality measure of ``h`` at a given ``s``.

    The function registered under ``cent_func_name`` in ``cent_func_dict`` is
    called with ``edges=0`` and the requested ``s``. Its result (a mapping
    from key to score) becomes a DataFrame with the columns ``author``
    (the mapping keys), ``cent`` (the scores), ``s`` and ``func_name``.
    """
    centrality_func = cent_func_dict[cent_func_name]
    scores = centrality_func(h, edges=0, s=s)
    table = (
        pd.DataFrame.from_dict(scores, orient="index", columns=["cent"])
        .reset_index()
        .rename(columns={"index":"author"})
    )
    # Tag every row so tables for different s / measures can be stacked.
    table["s"] = s
    table["func_name"] = cent_func_name
    return table


In [None]:
# Long-format table: one row per (author, s, centrality function) for s = 1..4.
# The per-combination frames are collected first and concatenated once:
# growing a DataFrame with pd.concat inside the loop re-copies all earlier
# rows on every pass and starts from an empty frame.
centrality_frames = []
for cent_func_name in cent_func_dict.keys():
    print(cent_func_name)
    for s in range(1, 5):
        centrality_frames.append(get_cent_df(h, s=s, cent_func_name=cent_func_name))
# No ignore_index: each chunk keeps its own 0..n-1 index, as before.
df_centrality = pd.concat(centrality_frames)

In [None]:
# Display the stacked centrality table.
df_centrality

In [None]:
import plotnine as pq
from plotnine import ggplot, aes, geom_point, geom_line, facet_wrap, theme

In [None]:
import seaborn as sns

In [None]:
# Only the rows produced by the closeness centrality function.
df_centrality.query("func_name == 'closeness_centrality'")

In [None]:
# One author's scores: rows are s, one column per centrality function.
pd.pivot(df_centrality.query("author == 'Monteiro, A.P.'"), index = "s", columns = ["func_name"], values = "cent")

In [None]:
# Harmonic centrality per author (rows) and s (columns), sorted by the s=1 column, highest first.
pd.pivot(df_centrality.query("func_name == 'harmonic_centrality'"), index = "author", columns = ["s"], values = "cent").sort_values(1, ascending=False)

In [None]:
# Centrality against s, one coloured line per author and one facet per
# centrality function (independent axis scales); the legend is hidden.
ggplot(
    df_centrality
) + aes(x="s", y="cent", color = "author") + geom_line() + geom_point() + theme(legend_position="none") + facet_wrap("func_name", scales="free")

### My attempt

In [None]:
# Node items of the cluster `h` as a list, and one arbitrary node (position 2) to experiment with.
nodes = list(h.nodes.items)
v = nodes[2]

In [None]:
# IPython help lookup: shows the docstring of h.distance (interactive aid only).
?h.distance

In [None]:
import warnings

In [None]:
# Library closeness centrality on `h` with edges=0 and s=4, for comparison with the hand-written version below.
hnx.s_closeness_centrality(h, edges=0, s=4)

In [None]:
# IPython help lookup: shows the docstring of hnx.s_closeness_centrality (interactive aid only).
?hnx.s_closeness_centrality

In [None]:
# The original `with` statement had no body, which is a SyntaxError.
# Presumably the intent was to repeat the s=4 closeness computation with its
# warnings silenced -- TODO confirm.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    closeness_s4 = hnx.s_closeness_centrality(h, edges=0, s=4)
closeness_s4

In [None]:
def centrality(h, v, s=1):
    """Hand-written closeness-style score of node ``v`` in hypergraph ``h``.

    Peers are the nodes ``n != v`` whose default-``s`` distance from ``v`` is
    at least ``s``. The score is ``(number of peers - 1)`` divided by the sum
    of the ``s``-distances from ``v`` to those peers, or 0 when there are no
    peers.

    NOTE(review): the peer filter compares the default-``s`` distance with the
    value of ``s`` itself, and one is subtracted although ``v`` is already
    excluded; confirm both against the intended definition.
    """
    peers = []
    for candidate in h.nodes:
        if h.distance(v, candidate) >= s and candidate != v:
            peers.append(candidate)
    s_distances = [h.distance(v, peer, s=s) for peer in peers]
    if not s_distances:
        return 0
    return (len(peers)-1)/sum(s_distances)

In [None]:
def harmonic_centrality(h, v, s=1):
    """Hand-written harmonic-style score of node ``v`` in hypergraph ``h``.

    Peers are the nodes ``n != v`` whose default-``s`` distance from ``v`` is
    at least ``s``. The score is the sum of the reciprocal ``s``-distances to
    the peers divided by ``(number of peers - 1)``, or 0 when there are fewer
    than two peers.

    NOTE(review): the peer filter compares the default-``s`` distance with the
    value of ``s`` itself, and the normalisation subtracts one although ``v``
    is already excluded; confirm both against the intended definition.
    """
    peers = []
    for candidate in h.nodes:
        if h.distance(v, candidate) >= s and candidate != v:
            peers.append(candidate)
    s_distances = [h.distance(v, peer, s=s) for peer in peers]
    if len(s_distances) <= 1:
        return 0
    scale = 1/(len(peers)-1)
    return scale*sum(1/d for d in s_distances)

In [None]:
# Score every node with the hand-written closeness at s=2 and show the
# (node, score) pair with the highest score (the first one on ties).
cent_list = [(n, centrality(h, n, s=2)) for n in h.nodes]
cent_list[np.argmax([c[1] for c in cent_list])]

In [None]:
# IPython help lookup: shows the docstring of h.distance (interactive aid only).
?h.distance

In [None]:
# s=2 distance between the first two entries of `nodes`.
h.distance(nodes[0], nodes[1], s=2)

In [None]:
# Same ranking with the hand-written harmonic score at s=3; this rebinds `cent_list`.
cent_list = [(n, harmonic_centrality(h, n, s=3)) for n in h.nodes]
cent_list[np.argmax([c[1] for c in cent_list])]