In [None]:
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import kagglehub
import networkx as nx
from networkx.readwrite import json_graph
from kagglehub import KaggleDatasetAdapter
from sentence_transformers import SentenceTransformer

file_path = kagglehub.dataset_download("Cornell-University/arxiv")
print("Downloaded to:", file_path)
json_path = os.path.join(file_path, "arxiv-metadata-oai-snapshot.json")
print("JSON snapshot path:", json_path)

subset_df = pd.read_json(json_path, lines=True, nrows=10_000)
print("Subset shape:", subset_df.shape)
print(subset_df[["id", "title", "categories"]].head(3))
os.makedirs("data", exist_ok=True)
subset_path = "data/arxiv_subset_10k.jsonl"
subset_df.to_json(subset_path, orient="records", lines=True)
print(f"Saved trimmed subset → {subset_path}")

os.remove(json_path)
print("Deleted large original file.")
df = pd.read_json("data/arxiv_subset_10k.jsonl", lines=True)

Downloading from https://www.kaggle.com/api/v1/datasets/download/Cornell-University/arxiv?dataset_version_number=256...


100%|██████████| 1.51G/1.51G [02:54<00:00, 9.33MB/s]

Extracting files...





Downloaded to: C:\Users\amkah\.cache\kagglehub\datasets\Cornell-University\arxiv\versions\256
JSON snapshot path: C:\Users\amkah\.cache\kagglehub\datasets\Cornell-University\arxiv\versions\256\arxiv-metadata-oai-snapshot.json
Subset shape: (10000, 14)
         id                                              title      categories
0  704.0001  Calculation of prompt diphoton production cros...          hep-ph
1  704.0002           Sparsity-certifying Graph Decompositions   math.CO cs.CG
2  704.0003  The evolution of the Earth-Moon system based o...  physics.gen-ph
Saved trimmed subset → data/arxiv_subset_10k.json
Deleted large original file.


In [None]:
embedder = SentenceTransformer("all-MiniLM-L6-v2")
print("SBERT model loaded for embedding abstracts.\n\n")
print(df["categories"].value_counts().head(10), '\n')

SBERT model loaded for embedding abstracts.


categories
hep-th                            7
hep-ph                            6
astro-ph                          6
physics.optics physics.comp-ph    4
cond-mat.mes-hall                 3
gr-qc                             3
math.CO                           3
math.NT                           2
cond-mat.mtrl-sci                 2
math.CA math.FA                   2
Name: count, dtype: int64 



In [64]:
def save(G, fname):
    data = {
        "nodes": list(G.nodes(data=True)), 
        "edges": list(G.edges(data=True))
    }
    with open(fname, 'w') as f: json.dump(data, f, indent=2)

def load(fname):
    G = nx.DiGraph()
    d = json.load(open(fname))
    G.add_nodes_from(d['nodes'])
    G.add_edges_from(d['edges'])
    return G

def handle_author(author_parsed_instance):
    # convert [["Ortega-Cerda","Joaquim",""]] to Joa.Ortega-Cerda
    first = author_parsed_instance[1]
    last = author_parsed_instance[0]
    author_clean = f"{first}|{last}"
    return author_clean

def generate_tag(df, node_type="article", edge_type="cites", out_dir="data", limit=None):
    if limit: df = df.head(limit)
    os.makedirs(out_dir, exist_ok=True)
    G = nx.Graph(name=f"TAG_{node_type}_{edge_type}")

    # NODE CONSTRUCTION
    if node_type == "article":
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Adding article nodes"):
            node_id = row["id"]
            text = row["title"] + ":\n" + row["abstract"]
            G.add_node(
                node_id,
                type="article",
                text=text,
                embedding=embedder.encode(text).tolist(),
                category=row["categories"],
            )
    elif node_type == "author":
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Adding author nodes"):
            authors = row["authors_parsed"]
            for author in authors:
                author_clean = handle_author(author)
                if not G.has_node(author_clean):
                    G.add_node(author_clean, type="author", embedding=None)

    # EDGE CONSTRUCTION

    if edge_type == "coauthor":
        if node_type != "author": 
            print("You made a mistake. Coauthor edges require author nodes. Change node_type to 'author'.")
            return None
        
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Adding coauthor edges"):
            authors = row["authors_parsed"]
            for i, a1 in enumerate(authors):
                author1_clean = handle_author(a1)
                for a2 in authors[i + 1:]:
                    author2_clean = handle_author(a2)
                    G.add_edge(author1_clean, author2_clean, type="coauthor", paper=row["id"])
                    
    elif edge_type == "cites":
        ids = df["id"].tolist()
        for i in range(len(ids) - 1):
            G.add_edge(ids[i], ids[i + 1], type="cites")

    elif edge_type == "co_citation":
        for cat, group in df.groupby("categories"):
            ids = group["id"].tolist()
            for i in range(len(ids) - 1):
                G.add_edge(ids[i], ids[i + 1], type="co_citation")

    print(G)

    # SAVE STUFF
    out_path = os.path.join(out_dir, f"TAG_{node_type}_{edge_type}.json")
    save(G, out_path)
    print(f"✅ Saved {len(G)} nodes, {G.number_of_edges()} edges → {out_path}")
    return G


In [None]:
# METHOD [1]: COAUTHORSHIP GRAPH

G_coauthorship = nx.Graph()
article_data = {}

for _, row in tqdm(df.iterrows(), total=len(df), desc="Building graph topology and external repo"):
    article_id = row["id"]
    title = row["title"]
    abstract = row["abstract"]
    article_data[article_id] = {
        "abstract": title + "\n" + abstract,
        "vector": None
    }
    authors = row["authors_parsed"]
    for i, a1 in enumerate(authors):
        author1_clean = handle_author(a1)
        for a2 in authors[i + 1:]:
            author2_clean = handle_author(a2)
            # ADD NODES
            if not G_coauthorship.has_node(author1_clean): G_coauthorship.add_node(author1_clean, type="author")
            if not G_coauthorship.has_node(author2_clean): G_coauthorship.add_node(author2_clean, type="author")

            # ADD/UPDATE EDGES
            if G_coauthorship.has_edge(author1_clean, author2_clean):
                G_coauthorship[author1_clean][author2_clean]["paper_ids"].append(article_id)
                G_coauthorship[author1_clean][author2_clean]["weight"] += 1
            else:
                G_coauthorship.add_edge(author1_clean, author2_clean, paper_ids=[article_id], weight=1)

for article_id, data in tqdm(article_data.items(), desc="Generating SBERT embeddings"):
    abstract = data["abstract"]
    vector = embedder.encode(abstract)
    article_data[article_id]["vector"] = vector

for u, v, data in tqdm(G_coauthorship.edges(data=True), desc="Enriching graph edges with topic embeddings"):
    paper_ids = data["paper_ids"]
    vectors_to_average = []
    for pid in paper_ids:
        vector = article_data[pid]["vector"]
        vectors_to_average.append(vector)
    mean_vector = sum(vectors_to_average) / len(vectors_to_average)
    data["topic_embedding"] = mean_vector.tolist()

embeddings = []
for article_id, data in article_data.items():
    vector = data["vector"]
    if vector is not None:
        embeddings.append(vector)

save(G_coauthorship, "data/TAG_author_coauthor_enriched.json")
embeddings_array = np.array(embeddings)
np.save("data/article_embeddings.npy", embeddings_array)

In [None]:
# METHOD [2]: COCITATION GRAPH

G_cocitation = nx.Graph()
article_data = {}

for _, row in tqdm(df.iterrows(), total=len(df), desc="Building graph topology and external repo"):
    article_id = row["id"]
    title = row["title"]
    abstract = row["abstract"]
    article_data[article_id] = {
        "abstract": title + "\n" + abstract,
        "vector": None
    }
    categories = row["categories"].split()
    for i, cat1 in enumerate(categories):
        for cat2 in categories[i + 1:]:
            # ADD NODES
            if not G_cocitation.has_node(cat1): G_cocitation.add_node(cat1, type="category")
            if not G_cocitation.has_node(cat2): G_cocitation.add_node(cat2, type="category")

            # ADD/UPDATE EDGES
            if G_cocitation.has_edge(cat1, cat2):
                G_cocitation[cat1][cat2]["paper_ids"].append(article_id)
                G_cocitation[cat1][cat2]["weight"] += 1
            else:
                G_cocitation.add_edge(cat1, cat2, paper_ids=[article_id], weight=1)

In [85]:
print(df.columns)   

Index(['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi',
       'report-no', 'categories', 'license', 'abstract', 'versions',
       'update_date', 'authors_parsed'],
      dtype='object')


In [69]:
# save article embeddings from article_data as numpy file
import numpy as np

# Extract article embeddings and save as numpy array
embeddings = []
for article_id, data in article_data.items():
    vector = data["vector"]
    if vector is not None:
        embeddings.append(vector)

# Convert to numpy array and save
embeddings_array = np.array(embeddings)
np.save("data/article_embeddings.npy", embeddings_array)
