In [None]:
import pandas as pd
import json
import hypernetx as hnx
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import os
import glob
from tqdm.notebook import tqdm

## Load Data

In [None]:
# Development aid: display the documentation of os.listdir.
# Written as a plain help() call instead of IPython's `?` syntax so the cell
# is valid Python; it can be deleted once it is no longer needed.
help(os.listdir)

In [None]:
# Pool the records of every JSON file found under ./data into one list.
# Sorting the paths keeps the load order (and anything derived from it)
# independent of the order in which the filesystem lists the files.
paps = []
for f_name in sorted(glob.glob("./data/*.json")):
    # Explicit UTF-8 so the result does not depend on the platform's default encoding.
    with open(f_name, "r", encoding="utf-8") as f:
        dat = json.load(f)
    # The records of one file sit under hits -> hits; extend() appends them
    # in place instead of rebuilding the whole list for every file.
    paps.extend(dat["hits"]["hits"])
print(f"{len(paps)} papers loaded")

In [None]:
# file downloaded using
#     wget "https://inspirehep.net/api/literature?sort=mostrecent&size=1000&page=1&q=find%20a%20t%20psi%20or%20t%20chi%20or%20t%20charmonium%20or%20t%20charmonia" -O papers_1000_1.json
# with open("./data/papers_1000_1.json", "r") as f:
#     dat = json.load(f)
#     paps = dat["hits"]["hits"]

### How to Get Different Elements

In [None]:
def get_authors(pap):
    """Return the full names of a paper's authors.

    Collects the ``"full_name"`` of every entry in
    ``pap["metadata"]["authors"]``; a record whose metadata has no
    ``"authors"`` key yields an empty list.
    """
    metadata = pap["metadata"]
    if "authors" not in metadata.keys():
        return []
    return [author["full_name"] for author in metadata["authors"]]
def get_title(pap):
    """Return the ``"title"`` text of the first entry in the record's ``titles`` list."""
    first_title = pap["metadata"]["titles"][0]
    return first_title["title"]

def get_type(pap):
    """Return the value stored under ``primary_arxiv_category`` in the record's metadata.

    The lookup is a plain subscript, so a dict record without that key raises ``KeyError``.
    """
    metadata = pap["metadata"]
    return metadata["primary_arxiv_category"]
    
def get_id(pap):
    """Return the identifier stored under the record's top-level ``"id"`` key."""
    paper_id = pap["id"]
    return paper_id

## HyperGraph

In [None]:
# Map every paper id to a fresh list of that paper's author names.
paps_dict = {get_id(paper): list(get_authors(paper)) for paper in paps}

In [None]:
# Build the hypergraph from the {paper id: [author names]} mapping and show its shape.
# NOTE(review): presumably each paper id becomes a hyperedge and each author a node — confirm against the hypernetx constructor docs.
H = hnx.Hypergraph(paps_dict)
H.shape

### Simple distributions

In [None]:
# Materialise the items of the hypergraph's node collection as a list.
# NOTE(review): `authors` is not referenced again in the visible cells.
authors = list(H.nodes.items)

In [None]:
def plot_loghist(x, bins, ax=None, title=None):
    """Draw a histogram of the positive values of ``x`` using log-spaced bins.

    Parameters
    ----------
    x : iterable of numbers
        Values to plot. Entries that are not strictly positive are dropped
        because they cannot be placed on a logarithmic axis.
    bins : int or sequence
        Passed to ``np.histogram_bin_edges`` to obtain the linear bin edges;
        only the first edge, the last edge and the number of edges are used
        to build the logarithmic edges.
    ax : axes object, optional
        Axes to draw on; defaults to ``plt.gca()``.
    title : str, optional
        When truthy, used as the x-axis label.

    Returns
    -------
    None
    """
    if ax is None:
        ax = plt.gca()
    positive = [value for value in x if value > 0]
    if positive:
        edges = np.histogram_bin_edges(positive, bins=bins)
        # A non-positive first edge (possible with explicit bin sequences or
        # very small identical values) has no logarithm; start at the
        # smallest datum instead.
        lowest = edges[0] if edges[0] > 0 else min(positive)
        logbins = np.logspace(np.log10(lowest), np.log10(edges[-1]), len(edges))
        ax.hist(positive, bins=logbins)
    # With no positive values there is nothing to draw; the axis is still
    # configured so the panel looks consistent with its neighbours.
    ax.set_xscale('log')
    if title:
        ax.set_xlabel(title)

In [None]:
# Two log-binned histograms side by side: hyperedge sizes (left) and node degrees (right).
fig, axes = plt.subplots(1,2, figsize=(8,4))
edge_sizes = hnx.edge_size_dist(H)
# One s=1 degree per node of the hypergraph.
node_sizes = [H.degree(n, s=1) for n in H.nodes]
plot_loghist(edge_sizes, 20, axes[0], title = "# authors for paper")
plot_loghist(node_sizes, 20, axes[1], title = "# papers for authors")
plt.show()

### Author List

In [None]:
"Luchinsky, A.V." in H.nodes.items

### Connected Components

In [None]:
# Count the s=1 component subgraphs. They are iterated once and only the
# tally is kept, so the subgraphs themselves are not retained.
print("1-component subgraphs")
n = sum(1 for _ in tqdm(H.s_component_subgraphs(s=1)))
# Report the tally; previously it was computed but never shown.
print(f"{n} components found")

In [None]:
# For every s=3 component subgraph keep a record [shape, node items, edge items].
# NOTE(review): despite its name, `comp_sizes` holds these three-element records, not bare sizes.
comp_sizes = [[comp.shape, comp.nodes.items, comp.edges.items] for comp in tqdm(H.s_component_subgraphs(s=3))]
print("There are ", len(comp_sizes), " connected components")

In [None]:
# Order the component records by their shape entry, largest first.
# The list is sorted ascending and then reversed, so records with equal shape end up in reverse of their original order.
sorted_comps = sorted(comp_sizes, key=lambda x: x[0])[::-1]

In [None]:
# Bar chart of the first shape entry of every component, in sorted (descending) order.
plt.bar(range(len(sorted_comps)), [c[0][0] for c in sorted_comps])
# The y-axis is capped at 25, so taller bars are clipped at the top of the plot.
plt.ylim(0, 25)
plt.title("Connected Components")
plt.show()

Here are some plots

In [None]:
# Draw six consecutive components from the sorted list, starting at position `min_`, on a 2x3 grid.
# NOTE(review): the seed is fixed presumably so the drawing layout is repeatable — confirm.
# NOTE(review): indexing sorted_comps[min_+i] needs at least min_+6 components; verify for other datasets.
np.random.seed(1)
min_ = 13
fig = plt.figure(figsize=(16, 8))
# fig, axes = plt.subplots(nrows=1, ncols=3, figsize = (12, 4))
for i in range(6):
    # Entry [1] of a component record holds that component's node items.
    h = H.restrict_to_nodes( sorted_comps[min_+i][1])
    # h = h.toplexes(return_hyp=True)
    plt.subplot(2, 3, i+1)
    hnx.draw(h)
    # Title shows the position in the sorted list and the first entry of the component's shape.
    plt.title("[#" + str(min_+i) + ']='+str(sorted_comps[min_+i][0][0]))


### Diameters and Distances

#### The largest component

In [None]:
# Diameter of the largest component
# sorted_comps[0] is the first record of the descending list; its entry [1] holds the node items.
h_max = H.restrict_to_nodes(sorted_comps[0][1])
print("[]=", h_max.shape)
print("edge_diameter = ", h_max.edge_diameter())

It requires a lot of time to work with this component, so we collapse its edges and nodes first

In [None]:
# Collapse the largest component, print the collapsed shape, and show its node and edge diameters.
# NOTE(review): presumably collapsing merges duplicate nodes/edges to shrink the hypergraph — confirm in the hypernetx docs.
h_max_c = h_max.collapse_nodes_and_edges()
print("[] = ", h_max_c.shape)
[h_max_c.diameter(), h_max_c.edge_diameter()]

Here is the distance between two of the authors

In [None]:
# Pick two nodes of the collapsed component by list position and print the distance between them.
# The positions 1 and 10 are hard-coded, so the node list must have at least 11 entries.
au = list(h_max_c.nodes)
a1, a2 = au[1], au[10]
print("|(", a1, ")-(", a2, ")|=", h_max_c.distance(a1, a2))

#### Smaller Cluster

In [None]:
# Shape entry of the component at position 2 of the sorted list.
# NOTE(review): the following cell works with position 5, not 2 — confirm which one is intended.
sorted_comps[2][0]

In [None]:
# Restrict H to the nodes of the component at position 5 of the sorted list, draw it, and show its diameters.
# NOTE(review): the seed is fixed presumably so the drawing layout is repeatable — confirm.
np.random.seed(122)
h = H.restrict_to_nodes(sorted_comps[5][1])
# h = h.toplexes(return_hyp=True)
hnx.draw(h)
# Cell output: [node diameter at s=1, edge diameter].
[h.diameter(s=1), h.edge_diameter()]

In [None]:
# Pick two nodes of the cluster by list position and print the distance between them.
# The positions 1 and 4 are hard-coded, so the cluster must have at least 5 nodes.
au = list(h.nodes.items)
a1, a2 = au[1], au[4]
print("|'", a1, "'-'", a2, "'|=", h.distance(a1, a2))

With s=2 this cluster is not connected; here are the diameters and contents of the sub-clusters

In [None]:
# Node diameters of the cluster evaluated at s=2.
h.node_diameters(s=2)

## LineGraphs

In [None]:
# Two drawings of the same cluster: its bipartite graph via networkx (left axes)
# and hypernetx's draw_bipartite_using_euler (right axes), both given the same `pos`.
fig, axes = plt.subplots(1, 2, figsize = (12, 4))
bh = h.bipartite()
# First of the two node sets of the bipartite graph; bipartite_layout uses it to place that set on one side.
top = nx.bipartite.sets(bh)[0]
pos = nx.bipartite_layout(bh, top)
nx.draw(bh, with_labels = True, ax=axes[0], pos = pos)
hnx.draw_bipartite_using_euler(h, pos=pos, ax=axes[1])

## Walks

In [None]:
# Diameter of the cluster, using the method's default arguments.
h.diameter()

In [None]:
# s=1 adjacency matrix of the cluster; with index=True the call also returns
# `rindex`, which later cells use to look up a node name from a matrix index.
mat, rindex = h.adjacency_matrix(s=1, index=True)
# toarray() always yields a plain ndarray, whereas todense() can return the
# legacy np.matrix type, which from_numpy_array is not designed for.
G = nx.from_numpy_array(mat.toarray())

In [None]:
# Shape of the cluster's incidence dataframe.
h.incidence_dataframe().shape

In [None]:
# Display the index object returned together with the adjacency matrix.
rindex

In [None]:
# Look up two node names by their adjacency-matrix indices and compute their distance in the cluster.
# NOTE(review): the indices 1 and 12 are hard-coded; both must be valid positions in `rindex`.
i1, i2  = 1, 12
name1, name2 = rindex[i1], rindex[i2]
h.distance(name1, name2)

In [None]:
# Shortest path between the two chosen nodes in the bipartite graph of the cluster.
# NOTE(review): presumably the path alternates between author and paper vertices; the later cells split it into nodes and edges.
hb = h.bipartite()
path = nx.shortest_path(hb, source=name1, target=name2)
path

In [None]:
# # Check
# np.all([
#     h.incidence_dataframe().loc[name1,]['1637222'] == 1,
#     h.incidence_dataframe().loc['Zhao, Qiang',]['1637222'] == 1,
#     h.incidence_dataframe().loc['Zhao, Qiang',]['1254432'] == 1,
#     h.incidence_dataframe().loc['Guo, Feng-Kun',]['1254432'] == 1,
#     h.incidence_dataframe().loc['Guo, Feng-Kun',]['2778309'] == 1,
#     h.incidence_dataframe().loc[name2,]['2778309'] == 1
# ])    

In [None]:
# Split the bipartite path into the entries that are hypergraph nodes and those that are hyperedges.
path_nodes = [a for a in path if a in h.nodes]
path_edges = [a for a in path if a in h.edges]
# Restrict the full hypergraph H (not the cluster h) to the path's nodes and then to its edges.
# NOTE(review): H_path is not used again in the visible cells.
H_path = H.restrict_to_nodes(path_nodes).restrict_to_edges(path_edges)

In [None]:
# 1. Set up the hypergraph and find the path
# NOTE(review): this repeats the earlier shortest_path call on the same graph and endpoints, here with positional arguments.
path = nx.shortest_path(hb, name1, name2)
print(f"The shortest path is: {path}")

In [None]:
# 3. Create a subgraph and set colors for plotting
# Separate the path into nodes and hyperedges
# NOTE(review): same split as the earlier cell, but membership is tested against the call forms h.nodes() / h.edges().
path_nodes = [n for n in path if n in h.nodes()]
path_edges = [e for e in path if e in h.edges()]

In [None]:
# Set custom colors for the nodes and edges
# One color per node/edge, in the iteration order of h.nodes() / h.edges():
# path members are highlighted (blue nodes, red edges), everything else is gray.
node_colors = ["blue" if node in path_nodes else "gray" for node in h.nodes()]
edge_colors = ["red" if edge in path_edges else "lightgray" for edge in h.edges()]
# NOTE(review): the seed is fixed presumably so the drawing layout is repeatable — confirm.
np.random.seed(122)
# 4. Plot the hypergraph with highlighted path
ax = plt.gca()
hnx.draw(h,  ax=ax,with_edge_labels = False, 
             nodes_kwargs={"facecolors": node_colors},
    edges_kwargs={"facecolors": edge_colors, "edgecolors": "black"},

        )
# The title names the first and last node entries of the path.
ax.set_title(f"Shortest Path from {path_nodes[0]} to {path_nodes[-1]}")
plt.show()
print(f"The shortest path is: {path}")