In [None]:
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import kagglehub
import networkx as nx
from networkx.readwrite import json_graph
from kagglehub import KaggleDatasetAdapter
from sentence_transformers import SentenceTransformer

# IF YOU NEED TO DOWNLOAD THE ARXIV DATASET, UNCOMMENT BELOW:
# file_path = kagglehub.dataset_download("Cornell-University/arxiv")
# print("Downloaded to:", file_path)
# json_path = os.path.join(file_path, "arxiv-metadata-oai-snapshot.json")
# print("JSON snapshot path:", json_path)

# subset_df = pd.read_json(json_path, lines=True, nrows=10_000)
# print("Subset shape:", subset_df.shape)
# print(subset_df[["id", "title", "categories"]].head(3))
# os.makedirs("data", exist_ok=True)
# subset_path = "data/arxiv_subset_10k.jsonl"
# subset_df.to_json(subset_path, orient="records", lines=True)
# print(f"Saved trimmed subset → {subset_path}")

# os.remove(json_path)
# print("Deleted large original file.")

# Load the trimmed arXiv subset (JSON Lines). The file is only created by the
# commented-out download/trim steps above, so it must already exist on disk.
df = pd.read_json("data/arxiv_subset_10k.jsonl", lines=True)

In [None]:
# Sentence-embedding model shared by every later cell that encodes text.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
print("SBERT model loaded for embedding abstracts.\n\n")

# Ten most frequent raw category strings, as a quick look at the label space.
category_counts = df["categories"].value_counts()
print(category_counts.head(10), '\n')

In [None]:
import ast
import json
from tqdm import tqdm

# Name of the DataFrame column holding the parsed author entries
# (each entry is read below as [last, first, ...]).
AUTHOR_COL = 'authors_parsed'

# --- 1. Parse the authors_parsed column safely ---
def parse_authors_field(val):
    """Return the author entries of one cell as a Python list.

    Lists pass through untouched. Strings are decoded first as a Python
    literal and then as JSON; the first decoder that produces a list wins.
    Every other input (or a string that never decodes to a list) gives [].
    """
    if isinstance(val, list):
        return val
    if not isinstance(val, str):
        return []
    for decode in (ast.literal_eval, json.loads):
        try:
            candidate = decode(val)
        except Exception:
            # Best effort: fall through to the next decoder.
            continue
        if isinstance(candidate, list):
            return candidate
    return []

# --- Normalise the author column so every cell holds a list ---
df[AUTHOR_COL] = df[AUTHOR_COL].apply(parse_authors_field)

# --- 2. Per-paper author count, used as a sanity check ---
def _author_count(entry):
    """Length of a parsed author list; anything that is not a list counts as 0."""
    if isinstance(entry, list):
        return len(entry)
    return 0

df['n_authors'] = df[AUTHOR_COL].apply(_author_count)
author_count_summary = df['n_authors'].describe()
print("Summary of author counts per article:\n", author_count_summary)

# --- 3. Define cleaning helper ---
def handle_author(author_parsed_instance):
    """Build a 'First|Last' key from a [last, first, ...] author entry.

    Example: ['Ortega-Cerda', 'Joaquim', ''] -> 'Joaquim|Ortega-Cerda'.
    A missing position becomes an empty string; if the entry cannot be read
    at all (no length, non-string parts, ...) the result is ''.
    """
    try:
        n_parts = len(author_parsed_instance)
        first = ''
        if n_parts > 1:
            first = author_parsed_instance[1].strip()
        last = ''
        if n_parts > 0:
            last = author_parsed_instance[0].strip()
    except Exception:
        return ""
    return f"{first}|{last}"

# --- 4. Drop corrupted rows, then collect unique authors ---
# Rows whose author count reaches this bound are treated as corrupted.
MAX_AUTHORS_PER_PAPER = 10000

# Filter *before* collecting names so unique_authors only contains authors of
# papers that are still in df (the graph cells below iterate the filtered df).
n_rows_before = len(df)
df = df[df['n_authors'] < MAX_AUTHORS_PER_PAPER].copy()
print(f"Dropped {n_rows_before - len(df)} corrupted rows.")

unique_authors = set()
for authors in tqdm(df[AUTHOR_COL], total=len(df)):
    if not isinstance(authors, list):
        continue
    for a in authors:
        author_clean = handle_author(a)
        # handle_author returns "" for entries it cannot read; skip those.
        if author_clean:
            unique_authors.add(author_clean)

# --- 5. Print diagnostics ---
print(f"\n✅ Total rows processed: {len(df)}")
print(f"✅ Total unique authors: {len(unique_authors)}")
print("Example authors:", list(unique_authors)[:10])

# Optionally: sanity check extreme rows
print("\nTop 5 articles with the most authors:")
print(df.nlargest(5, 'n_authors')[[AUTHOR_COL, 'n_authors']])

In [None]:
from collections import defaultdict
import json

# Running sum of paper embeddings and number of papers, per author id.
# Sum vectors are created lazily from the first embedding seen, so the
# embedding width is taken from the model output instead of being hard-coded.
author_sums = {}
author_counts = defaultdict(int)

# Iterate once over all rows
for row in tqdm(df.itertuples(index=False), total=len(df)):
    authors = getattr(row, 'authors_parsed', [])
    if not isinstance(authors, list) or len(authors) == 0:
        continue

    # Combine title + abstract text (either may be missing or non-string)
    text_parts = []
    if hasattr(row, 'title') and isinstance(row.title, str):
        text_parts.append(row.title)
    if hasattr(row, 'abstract') and isinstance(row.abstract, str):
        text_parts.append(row.abstract)
    if not text_parts:
        continue
    combined_text = " ".join(text_parts)

    # One embedding per paper, shared by all of its authors
    emb = embedder.encode(combined_text, show_progress_bar=False)

    for a in authors:
        author_id = handle_author(a)
        if not author_id:
            continue
        if author_id not in author_sums:
            # float64 accumulator, matching the precision of the original sums
            author_sums[author_id] = np.zeros(emb.shape, dtype=np.float64)
        author_sums[author_id] += emb
        author_counts[author_id] += 1

# Average: every key in author_sums has been counted at least once.
author_embeddings = {
    author_id: sum_vec / author_counts[author_id]
    for author_id, sum_vec in author_sums.items()
}

print(f"✅ Computed embeddings for {len(author_embeddings)} authors")

# Convert to numpy array and save; row i of the array belongs to author_list[i].
author_list = list(author_embeddings.keys())
if not author_list:
    raise ValueError("No author embeddings were computed; check that df has authors and text.")
embeddings = np.vstack([author_embeddings[a] for a in author_list])
np.save("author_embeddings.npy", embeddings)

# Save the index -> author name mapping next to the array
with open("author_index.json", "w") as f:
    json.dump(author_list, f)


In [None]:
import networkx as nx
from itertools import combinations
from tqdm import tqdm
G_coauthorship = nx.Graph()

# --- 1. Add author nodes ---
for author in tqdm(unique_authors):
    G_coauthorship.add_node(author, type="author", embedding=None)

# --- 2. Add coauthorship edges ---
for row in tqdm(df.itertuples(index=False), total=len(df)):
    authors = getattr(row, 'authors_parsed', [])
    if not isinstance(authors, list) or len(authors) < 2:
        continue

    # Clean ids for this paper: drop unreadable entries ("" from
    # handle_author) and de-duplicate while keeping order, so that a name
    # listed twice on one paper cannot inflate an edge weight or append the
    # same paper id twice.
    clean_authors = list(dict.fromkeys(
        author_id
        for author_id in (handle_author(a) for a in authors if a and isinstance(a, list))
        if author_id
    ))
    paper_id = getattr(row, 'id', None)

    # Every unordered pair of distinct authors on this paper
    for a1, a2 in combinations(clean_authors, 2):
        if G_coauthorship.has_edge(a1, a2):
            # Track how many papers the pair shares, and which ones
            G_coauthorship[a1][a2]["weight"] += 1
            G_coauthorship[a1][a2]["papers"].append(paper_id)
        else:
            G_coauthorship.add_edge(a1, a2,
                       type="coauthor",
                       weight=1,
                       papers=[paper_id])

In [None]:
# Sanity check: report the size of the coauthorship graph.
n_nodes_coauthorship = G_coauthorship.number_of_nodes()
n_edges_coauthorship = G_coauthorship.number_of_edges()
print(f"\n✅ Coauthorship graph has {n_nodes_coauthorship} nodes and {n_edges_coauthorship} edges.")

In [None]:
import numpy as np
# Degree of every node in the coauthorship graph, in node order.
degrees = list(dict(G_coauthorship.degree()).values())
print(f"Min degree: {np.min(degrees)}")
print(f"Median degree: {np.median(degrees)}")
print(f"Mean degree: {np.mean(degrees):.2f}")
print(f"Max degree: {np.max(degrees)}")

In [None]:
# Connected components of the coauthorship graph, largest first.
components = list(nx.connected_components(G_coauthorship))
components.sort(key=len, reverse=True)
print(f"Number of components: {len(components)}")
print(f"Largest component size: {len(components[0])}")

In [None]:
# Edge weights = number of papers each author pair shares.
weights = [attrs['weight'] for _u, _v, attrs in G_coauthorship.edges(data=True)]
n_repeat_pairs = sum(w > 1 for w in weights)
print(f"Edges with >1 coauthored paper: {n_repeat_pairs}")

In [None]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
import networkx as nx
import json
from tqdm import tqdm

# --- Load embeddings and author list (row i belongs to author_list[i]) ---
embeddings = np.load("author_embeddings.npy")
with open("author_index.json") as f:
    author_list = json.load(f)

# Normalize embeddings for cosine similarity. All-zero rows keep norm 1 so
# they stay zero instead of turning into NaN (which the neighbour search rejects).
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
norms[norms == 0] = 1.0
embeddings = embeddings / norms

# --- Find nearest neighbors ---
k = 10  # how many similar authors to connect each to
SIM_THRESHOLD = 0.75  # minimum cosine similarity for an edge
# Ask for one extra neighbour because the query point is normally returned
# as well; never ask for more neighbours than there are authors.
n_query = min(k + 1, len(author_list))
nbrs = NearestNeighbors(n_neighbors=n_query, metric='cosine', n_jobs=-1)
nbrs.fit(embeddings)
distances, indices = nbrs.kneighbors(embeddings)

# --- Build graph ---
G_cosine = nx.Graph()

for author in tqdm(unique_authors):
    G_cosine.add_node(author, type="author", embedding=None)

for i, author in tqdm(enumerate(author_list), total=len(author_list)):
    kept = 0
    for j, dist in zip(indices[i], distances[i]):
        # Skip the query itself by index rather than by position: authors who
        # only share one paper have identical vectors, so the query is not
        # guaranteed to be the first result.
        if j == i:
            continue
        if kept == k:
            break
        kept += 1
        sim = 1 - dist
        if sim < SIM_THRESHOLD:
            continue
        neighbor = author_list[int(j)]
        # Plain float keeps the edge attribute JSON-serialisable.
        G_cosine.add_edge(author, neighbor, type="cosine", weight=float(sim))

print(f"✅ Cosine graph has {G_cosine.number_of_nodes()} nodes and {G_cosine.number_of_edges()} edges.")


In [None]:
# Node degrees of both graphs, in node order.
degrees_coauthorship = [deg for _node, deg in G_coauthorship.degree()]
degrees_cosine = [deg for _node, deg in G_cosine.degree()]

def _degree_summary(degs):
    """Four-line min/median/mean/max summary used as a tick label."""
    return f"Min: {np.min(degs)}\nMedian: {np.median(degs)}\nMean: {np.mean(degs):.4f}\nMax: {np.max(degs)}"

stats_coauthorship = _degree_summary(degrees_coauthorship)
stats_cosine = _degree_summary(degrees_cosine)

import matplotlib.pyplot as plt
# One box per graph; the summary text is embedded in each tick label.
box_labels = [
    f'Coauthorship_TAG\n\n{stats_coauthorship}',
    f'Cosine_Similarity_TAG\n\n{stats_cosine}',
]
plt.boxplot([degrees_coauthorship, degrees_cosine], labels=box_labels)
plt.ylabel('Node Degree')
plt.title('Degree Distribution Comparison')
# plt.yscale('log')
plt.show()

In [None]:
# Edge density of each graph, printed with six decimals.
density_coauthorship, density_cosine = (
    nx.density(graph) for graph in (G_coauthorship, G_cosine)
)
print(f"Coauthorship Graph Density: {density_coauthorship:.6f}")
print(f"Cosine Similarity Graph Density: {density_cosine:.6f}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Node degrees of both graphs
degrees_coauthorship = [deg for _node, deg in G_coauthorship.degree()]
degrees_cosine = [deg for _node, deg in G_cosine.degree()]

def _graph_summary(graph, degs):
    """Multi-line size/degree summary (ends with a newline) used in tick labels."""
    return (
        f"Nodes: {graph.number_of_nodes()}\n"
        f"Edges: {graph.number_of_edges()}\n"
        f"Median: {np.median(degs)}\n"
        f"Mean: {np.mean(degs):.1f}\n"
    )

# Summary text for each graph
stats_coauthorship = _graph_summary(G_coauthorship, degrees_coauthorship)
stats_cosine = _graph_summary(G_cosine, degrees_cosine)

def degree_freq(degrees):
    """Count how many nodes have each degree.

    Args:
        degrees: iterable of node degrees.

    Returns:
        (x, y): two tuples of equal length, where x holds the distinct
        degrees in ascending order and y the number of nodes with that
        degree. Both are empty tuples when `degrees` is empty.
    """
    freq = Counter(degrees)
    if not freq:
        # zip(*[]) yields nothing, so unpacking into two names would raise.
        return (), ()
    x, y = zip(*sorted(freq.items()))
    return x, y

x_co, y_co = degree_freq(degrees_coauthorship)
x_cos, y_cos = degree_freq(degrees_cosine)

# --- Visualization ---
plt.figure(figsize=(14, 6))

# Boxplot comparison (outliers hidden so the boxes stay readable)
plt.subplot(1, 2, 1)
plt.boxplot(
    [degrees_coauthorship, degrees_cosine],
    # Cosine edges are kept when similarity is at least 0.75, so the label
    # states ">=0.75" rather than "<0.75".
    labels=[f"Coauthorship TAG\n\n{stats_coauthorship}", f"Cosine Similarity (>=0.75) TAG\n\n{stats_cosine}"],
    showfliers=False,
)
plt.ylabel("Node Degree")
plt.title("Degree Distribution Comparison | Node = Author")
plt.grid(True, alpha=0.3)

# Degree frequency curves on linear axes
plt.subplot(1, 2, 2)
plt.plot(x_co, y_co, 'o-', label='Coauthorship TAG', color='tab:orange', alpha=0.8)
plt.plot(x_cos, y_cos, 'o-', label='Cosine TAG', color='tab:blue', alpha=0.8)
plt.xlabel(f'Degree Count\n\nMaximum ~ Coauthor: {np.max(degrees_coauthorship)} | Cosine: {np.max(degrees_cosine)}\nDensity ~ Coauthor: {nx.density(G_coauthorship):.6f} | Cosine: {nx.density(G_cosine):.6f}')
plt.ylabel('Number of nodes')
plt.title('Degree Frequency Distribution')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
import networkx as nx
from collections import defaultdict

# --- Step 1: index the set of paper ids written by each author ---
author_papers = defaultdict(set)
for record in df.itertuples(index=False):
    paper_id = getattr(record, 'id', None)
    authors = getattr(record, 'authors_parsed', [])
    if paper_id is not None and authors:
        for entry in authors:
            author_key = handle_author(entry)
            if author_key:
                author_papers[author_key].add(paper_id)

# --- Step 2: new graph with the same nodes (and node attributes) ---
G_jaccard = nx.Graph()
G_jaccard.add_nodes_from(G_coauthorship.nodes(data=True))

# --- Step 3: re-weight every coauthor edge by Jaccard overlap of paper sets ---
for u, v in G_coauthorship.edges():
    papers_u = author_papers[u]
    papers_v = author_papers[v]
    if papers_u and papers_v:
        n_shared = len(papers_u & papers_v)
        n_total = len(papers_u | papers_v)
        if n_total != 0:
            G_jaccard.add_edge(u, v, type='coauthor', weight=n_shared / n_total)

print(f"✅ Jaccard-weighted coauthorship graph has {G_jaccard.number_of_edges()} edges.")


In [None]:
# Inspect which columns df currently has (earlier cells add 'n_authors').
print(df.columns)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Compute degree frequency distributions
deg_freq_coauthorship = Counter(degrees_coauthorship)
deg_freq_cosine = Counter(degrees_cosine)

def _loglog_points(freq):
    """Sorted degree/count columns restricted to degree > 0.

    A degree of 0 has no position on a logarithmic axis, so isolated nodes
    are left out of the curve (their count is printed below instead).
    Returns two empty tuples when nothing remains.
    """
    points = sorted((deg, cnt) for deg, cnt in freq.items() if deg > 0)
    if not points:
        return (), ()
    return tuple(zip(*points))

# Convert to sorted arrays
x_co, y_co = _loglog_points(deg_freq_coauthorship)
x_cos, y_cos = _loglog_points(deg_freq_cosine)
print(
    f"Isolated nodes not shown on the log axes ~ Coauthor: {deg_freq_coauthorship.get(0, 0)}"
    f" | Cosine: {deg_freq_cosine.get(0, 0)}"
)

plt.figure(figsize=(7, 6))
plt.loglog(x_co, y_co, 'o-', label='Coauthorship TAG', color='tab:blue', alpha=0.8)
plt.loglog(x_cos, y_cos, 'o-', label='Cosine TAG', color='tab:orange', alpha=0.8)
plt.xlabel('Degree (log scale)')
plt.ylabel('Number of nodes (log scale)')
plt.title('Degree Frequency Distribution (log–log)')
plt.legend()
plt.grid(True, which="both", ls="--", alpha=0.4)
plt.show()



In [None]:
# Summary statistics of the coauthorship degrees (displayed as the cell output).
pd.Series(degrees_coauthorship).describe()

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

# Degree -> number of nodes, for each graph
deg_freq_coauthorship = Counter(degrees_coauthorship)
deg_freq_cosine = Counter(degrees_cosine)

# Split the sorted (degree, count) pairs into x / y columns
x_co, y_co = zip(*sorted(deg_freq_coauthorship.items()))
x_cos, y_cos = zip(*sorted(deg_freq_cosine.items()))

plt.figure(figsize=(8,6))
# Draw both curves with the same marker style; only data, label and colour differ.
for xs, ys, series_label, series_color in (
    (x_co, y_co, 'Coauthorship TAG', 'tab:blue'),
    (x_cos, y_cos, 'Cosine TAG', 'tab:orange'),
):
    plt.plot(xs, ys, 'o-', label=series_label, color=series_color, alpha=0.8)
plt.xlabel('Degree')
plt.ylabel('Number of nodes')
plt.title('Degree Frequency Distribution (Linear Scale)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()


In [None]:
def save(G, fname):
    """Write the graph's nodes and edges, with their attribute dicts, to `fname` as indented JSON.

    The file holds one object with two keys: "nodes" (the items of
    G.nodes(data=True)) and "edges" (the items of G.edges(data=True)).
    """
    payload = {}
    payload["nodes"] = list(G.nodes(data=True))
    payload["edges"] = list(G.edges(data=True))
    with open(fname, 'w') as handle:
        json.dump(payload, handle, indent=2)

def load(fname, directed=False):
    """Rebuild a graph from a JSON file written by `save`.

    Args:
        fname: path of the JSON file; it must contain "nodes" as
            [node, attrs] pairs and "edges" as [u, v, attrs] triples.
        directed: build an nx.DiGraph when True. The default is an
            undirected nx.Graph, matching the nx.Graph objects this
            notebook passes to `save`; loading them as directed would keep
            only one direction of every edge.

    Returns:
        The reconstructed networkx graph.
    """
    # Context manager so the file handle is closed after reading.
    with open(fname) as f:
        d = json.load(f)
    G = nx.DiGraph() if directed else nx.Graph()
    G.add_nodes_from((node, attrs) for node, attrs in d['nodes'])
    G.add_edges_from((u, v, attrs) for u, v, attrs in d['edges'])
    return G

def handle_author(author_parsed_instance):
    """Build a 'First|Last' key from a [last, first, ...] author entry.

    Example: ['Ortega-Cerda', 'Joaquim', ''] -> 'Joaquim|Ortega-Cerda'.

    This redefinition replaces the earlier `handle_author` for every cell
    that runs after it, so it applies the same rules: both parts are
    stripped, a missing position becomes an empty string, and an entry that
    cannot be read at all yields ''. Without that, ids built after this cell
    would not match the ids built before it.
    """
    try:
        first = author_parsed_instance[1].strip() if len(author_parsed_instance) > 1 else ''
        last = author_parsed_instance[0].strip() if len(author_parsed_instance) > 0 else ''
    except Exception:
        return ""
    return f"{first}|{last}"

def generate_tag(df, node_type="article", edge_type="cites", out_dir="data", limit=None):
    """Build a graph from `df`, save it as JSON under `out_dir`, and return it.

    Args:
        df: DataFrame read via the "id", "title", "abstract", "categories"
            and "authors_parsed" columns, depending on the options below.
        node_type: "article" (one node per row, with text and an embedding
            from the global `embedder`) or "author" (one node per distinct
            `handle_author` key).
        edge_type: "coauthor", "cites" or "co_citation" (see inline notes).
        out_dir: directory for the output file; created if missing.
        limit: if truthy, only the first `limit` rows are used.

    Returns:
        The networkx graph, or None when `edge_type == "coauthor"` is
        combined with a `node_type` other than "author".
    """
    if limit: df = df.head(limit)
    os.makedirs(out_dir, exist_ok=True)
    G = nx.Graph(name=f"TAG_{node_type}_{edge_type}")

    # NODE CONSTRUCTION
    if node_type == "article":
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Adding article nodes"):
            node_id = row["id"]
            text = row["title"] + ":\n" + row["abstract"]
            G.add_node(
                node_id,
                type="article",
                text=text,
                # one encode() call per row; stored as a plain list so `save` can JSON-encode it
                embedding=embedder.encode(text).tolist(),
                category=row["categories"],
            )
    elif node_type == "author":
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Adding author nodes"):
            authors = row["authors_parsed"]
            for author in authors:
                author_clean = handle_author(author)
                if not G.has_node(author_clean):
                    G.add_node(author_clean, type="author", embedding=None)

    # EDGE CONSTRUCTION

    if edge_type == "coauthor":
        if node_type != "author": 
            print("You made a mistake. Coauthor edges require author nodes. Change node_type to 'author'.")
            return None
        
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Adding coauthor edges"):
            authors = row["authors_parsed"]
            for i, a1 in enumerate(authors):
                author1_clean = handle_author(a1)
                for a2 in authors[i + 1:]:
                    author2_clean = handle_author(a2)
                    # add_edge on an existing pair overwrites `paper`, so only the last shared paper id is kept
                    G.add_edge(author1_clean, author2_clean, type="coauthor", paper=row["id"])
                    
    elif edge_type == "cites":
        # Links each row's id to the next row's id in df order.
        # NOTE(review): no citation data is read here, so this looks like a placeholder topology — confirm.
        ids = df["id"].tolist()
        for i in range(len(ids) - 1):
            G.add_edge(ids[i], ids[i + 1], type="cites")

    elif edge_type == "co_citation":
        # Chains consecutive rows that share the exact same "categories" string.
        for cat, group in df.groupby("categories"):
            ids = group["id"].tolist()
            for i in range(len(ids) - 1):
                G.add_edge(ids[i], ids[i + 1], type="co_citation")

    print(G)

    # SAVE STUFF
    out_path = os.path.join(out_dir, f"TAG_{node_type}_{edge_type}.json")
    save(G, out_path)
    print(f"✅ Saved {len(G)} nodes, {G.number_of_edges()} edges → {out_path}")
    return G

In [None]:
# METHOD [1]: COAUTHORSHIP GRAPH
# Rebuilds G_coauthorship from scratch (replacing any earlier graph of that
# name), embeds every article, attaches the mean article embedding to each
# coauthor edge, then saves the graph and the article embedding matrix.

G_coauthorship = nx.Graph()
# article id -> {"abstract": title + abstract text, "vector": embedding or None}
article_data = {}

for _, row in tqdm(df.iterrows(), total=len(df), desc="Building graph topology and external repo"):
    article_id = row["id"]
    title = row["title"]
    abstract = row["abstract"]
    article_data[article_id] = {
        "abstract": title + "\n" + abstract,
        "vector": None
    }
    authors = row["authors_parsed"]
    for i, a1 in enumerate(authors):
        author1_clean = handle_author(a1)
        for a2 in authors[i + 1:]:
            author2_clean = handle_author(a2)
            # ADD NODES
            # Nodes are only created inside the pair loop, so authors who never
            # appear with a coauthor do not become nodes of this graph.
            if not G_coauthorship.has_node(author1_clean): G_coauthorship.add_node(author1_clean, type="author")
            if not G_coauthorship.has_node(author2_clean): G_coauthorship.add_node(author2_clean, type="author")

            # ADD/UPDATE EDGES
            if G_coauthorship.has_edge(author1_clean, author2_clean):
                G_coauthorship[author1_clean][author2_clean]["paper_ids"].append(article_id)
                G_coauthorship[author1_clean][author2_clean]["weight"] += 1
            else:
                G_coauthorship.add_edge(author1_clean, author2_clean, paper_ids=[article_id], weight=1)

# Encode each article text once; vectors are looked up by id below.
for article_id, data in tqdm(article_data.items(), desc="Generating SBERT embeddings"):
    abstract = data["abstract"]
    vector = embedder.encode(abstract)
    article_data[article_id]["vector"] = vector

# `data` is the live edge-attribute dict, so the assignment below mutates the graph in place.
for u, v, data in tqdm(G_coauthorship.edges(data=True), desc="Enriching graph edges with topic embeddings"):
    paper_ids = data["paper_ids"]
    vectors_to_average = []
    for pid in paper_ids:
        vector = article_data[pid]["vector"]
        vectors_to_average.append(vector)
    mean_vector = sum(vectors_to_average) / len(vectors_to_average)
    # stored as a plain list so `save` can JSON-encode it
    data["topic_embedding"] = mean_vector.tolist()

# NOTE: `embeddings` is a plain Python list here (and replaces any earlier
# variable of that name); the numpy version is `embeddings_array`.
embeddings = []
for article_id, data in article_data.items():
    vector = data["vector"]
    if vector is not None:
        embeddings.append(vector)

save(G_coauthorship, "data/TAG_author_coauthor_enriched.json")
embeddings_array = np.array(embeddings)
np.save("data/article_embeddings.npy", embeddings_array)

In [None]:
# `embeddings` is a plain list at this point and has no .shape; the stacked
# article-embedding matrix is `embeddings_array`.
embeddings_array.shape

In [None]:
# Replace the in-memory graph with the copy read back from disk and report
# how many author nodes it contains.
G_coauthorship = load("data/TAG_author_coauthor_enriched.json")
print("num authors:", G_coauthorship.number_of_nodes())

In [None]:
# Create the graph here so this cell no longer depends on a later cell
# having defined G_topic_similarity.
G_topic_similarity = nx.Graph()

author_list = []
# Set-based membership test; the list only records first-seen order.
seen_authors = set()
for i, row in tqdm(df.iterrows(), total=len(df), desc="Adding nodes to topical similarity graph"):
    authors = row["authors_parsed"]
    for a in authors:
        author_clean = handle_author(a)
        if author_clean in seen_authors:
            continue
        seen_authors.add(author_clean)
        author_list.append(author_clean)
        G_topic_similarity.add_node(author_clean, type="author", embedding=None)
print(len(author_list))

In [None]:
# METHOD [2]: TOPICAL SIMILARITY GRAPH

G_topic_similarity = nx.Graph()
article_data = {}

embeddings = np.load("data/article_embeddings.npy")

# nodes: cleaned_author, type, embeddings
# edges: cleaned_author1, cleaned_author2, type, weight, topic_embedding
# cleaned_author examples ~ "P.|Bezzon", "R.|Menegazzo",

author_list = []
# Set-based membership test; the list only records first-seen order.
seen_authors = set()
for i, row in tqdm(df.iterrows(), total=len(df), desc="Adding nodes to topical similarity graph"):
    authors = row["authors_parsed"]
    for a in authors:
        author_clean = handle_author(a)
        if author_clean in seen_authors:
            continue
        seen_authors.add(author_clean)
        author_list.append(author_clean)
        G_topic_similarity.add_node(author_clean, type="author", embedding=None)

similarities = []

# TODO: edge construction is not implemented yet. Two unfinished `for`
# statements stood here and made the whole cell a SyntaxError; they were
# removed so the cell runs. Until the step below is written, the graph holds
# author nodes only and `similarities` stays empty.
#
# Earlier draft, written for article nodes (kept for reference):
#   for i, u in enumerate(G_topic_similarity.nodes()):
#       vec_u = article_data[u]["vector"]
#       for j, v in enumerate(G_topic_similarity.nodes()):
#           if i >= j: continue
#           vec_v = article_data[v]["vector"]
#           similarity = np.dot(vec_u, vec_v) / (np.linalg.norm(vec_u) * np.linalg.norm(vec_v))
#           similarities.append(similarity)
#           if similarity >= 0.25:
#               G_topic_similarity.add_edge(u, v, type="topic_similarity", weight=similarity)
#
# ADD EDGES BETWEEN AUTHORS BASED ON TOPICAL SIMILARITY

save(G_topic_similarity, "data/TAG_author_topic_similarity.json")
# Explicit dtype so an empty list still produces a numeric summary.
print(pd.Series(similarities, dtype=float).describe())

In [None]:
# save(G_topic_similarity, "data/TAG_article_topic_similarity.json")
# Writes the article embedding matrix (embeddings_array) to a second file.
np.save("data/t.npy", embeddings_array)

In [None]:
# Summary statistics of the collected pairwise similarities (cell output).
pd.Series(similarities).describe()

In [None]:
# Share of similarities at or above the 0.25 edge threshold; NaN when nothing
# has been collected, instead of dividing by zero.
sum(1 for s in similarities if s >= 0.25) / len(similarities) if len(similarities) else float("nan")

In [None]:
# Header line only; no statistics are computed or printed in this cell yet.
print("Similarity stats:")


In [None]:
# Inspect the current df columns again before deriving category labels.
print(df.columns)   

In [None]:
# Distinct raw category strings: how many there are, then the values themselves.
unique_categories = set(df['categories'])
print(len(unique_categories))
print(unique_categories)

In [None]:
# Collect the text before the first '.' of every distinct category string.
# NOTE(review): if a category string lists several categories separated by
# spaces, the prefix can span more than one of them — confirm against the data.
category_broad = set()
for raw_category in set(df['categories']):
    prefix = raw_category.split('.')[0]
    print(raw_category, '|', prefix)
    category_broad.add(prefix)

print(len(category_broad))
print(category_broad)

In [None]:
# This bare expression is not the last statement of the cell, so its result
# is not displayed; the print below shows the match (empty list if absent).
'cs' in category_broad
# where is cs in category_broad
print([category for category in category_broad if category == 'cs'])

In [None]:
# Add category_broad to df as integer labels.
def _broad_category(categories):
    """Archive of the primary category: 'hep-th math.AG' -> 'hep-th', 'cs.LG stat.ML' -> 'cs'.

    The categories string lists one or more space-separated categories with
    the primary one first. Splitting the whole string on '.' alone would
    produce labels such as 'hep-th math', so the first whitespace-separated
    token is isolated before the archive prefix is taken.
    """
    tokens = categories.split()
    primary = tokens[0] if tokens else categories
    return primary.split('.')[0]

broad_labels = df['categories'].apply(_broad_category)
# Labels are numbered in sorted order so the mapping is reproducible.
category_to_int = {cat: idx for idx, cat in enumerate(sorted(set(broad_labels)))}
df['category_broad'] = broad_labels.map(category_to_int)

In [None]:
# Persist the current G_coauthorship and G_cosine objects as JSON via `save`.
save(G_coauthorship, "data/TAG_author_coauthorship.json")
save(G_cosine, "data/TAG_author_cosine.json")

In [None]:
# === GNN training cell: PyTorch Geometric ===
import random
import json
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from torch_geometric.utils import from_networkx, to_undirected
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
import networkx as nx
from collections import defaultdict
from tqdm import tqdm

# --------------------
# CONFIG
# --------------------
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 256
NUM_LAYERS = 2
DROPOUT = 0.5
LR = 1e-3
WEIGHT_DECAY = 1e-5
EPOCHS = 200
RANDOM_SEED = 42
# Number of category_broad classes, taken from the labels actually present
# in df (integer ids starting at 0) so it cannot drift from the data.
NUM_CLASSES = int(df["category_broad"].max()) + 1
USE_DEG_FEATURE = False  # optionally append degree as a scalar feature

# Seed every random number generator used below.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# --------------------
# Utilities: author -> paper ids, and author -> category histogram
# --------------------
# author -> set of ids of the papers they appear on
author_papers = defaultdict(set)
for record in df.itertuples(index=False):
    pid = getattr(record, 'id', None)
    authors = getattr(record, 'authors_parsed', [])
    if pid is not None:
        for entry in authors:
            auth = handle_author(entry)
            if auth:
                author_papers[auth].add(pid)

# author -> raw count of papers per category_broad label.
# Assumption: df has 'category_broad' with integer labels in [0, NUM_CLASSES-1].
def _empty_histogram():
    """Fresh all-zero float32 vector with one slot per class."""
    return np.zeros(NUM_CLASSES, dtype=np.float32)

author_category_counts = defaultdict(_empty_histogram)
for record in df.itertuples(index=False):
    pid = getattr(record, 'id', None)
    cat = getattr(record, 'category_broad', None)
    if pid is not None and cat is not None:
        authors = getattr(record, 'authors_parsed', [])
        for entry in authors:
            auth = handle_author(entry)
            if auth:
                author_category_counts[auth][int(cat)] += 1.0

# Author order and embeddings are read back from the files written by the
# author-embedding cell, rather than from the kernel variables `author_list`
# and `embeddings`: later cells reassign both names (to a rebuilt author list
# and to article embeddings), which would misalign rows and authors.
author_embeddings_raw = np.load("author_embeddings.npy")
with open("author_index.json") as f:
    author_list_all = json.load(f)  # row i of the embedding matrix belongs to author_list_all[i]
if author_embeddings_raw.shape[0] != len(author_list_all):
    raise ValueError(
        f"author_embeddings.npy has {author_embeddings_raw.shape[0]} rows but "
        f"author_index.json lists {len(author_list_all)} authors"
    )

author_to_idx = {a: i for i, a in enumerate(author_list_all)}
N = len(author_list_all)

# Targets: per-author category histogram normalised to sum to 1.
# Authors without any counted paper keep an all-zero row (masked out later).
Y = np.zeros((N, NUM_CLASSES), dtype=np.float32)
for a, idx in author_to_idx.items():
    if a not in author_category_counts:
        continue
    counts = author_category_counts[a]
    s = counts.sum()
    if s > 0:
        Y[idx] = counts / s

# --------------------
# Node features: embeddings (+ optional degree)
# --------------------
# L2-normalise each author's embedding row
embeddings = normalize(author_embeddings_raw, axis=1)
X = embeddings.astype(np.float32)

if USE_DEG_FEATURE:
    # Degree in the coauthorship graph as one extra feature column
    degs = np.zeros((N, 1), dtype=np.float32)
    for a, i in author_to_idx.items():
        d = G_coauthorship.degree(a) if a in G_coauthorship else 0
        degs[i, 0] = float(d)
    # log scaling, then standardisation
    degs = np.log1p(degs)
    degs = (degs - degs.mean()) / (degs.std() + 1e-9)
    X = np.concatenate([X, degs], axis=1)

FEATURE_DIM = X.shape[1]

# --------------------
# Helper: convert networkx graph -> torch_geometric Data
# --------------------
def build_data_from_nx(G_nx, x_array, y_array, train_frac=0.7, val_frac=0.15, test_frac=0.15):
    """Turn a networkx author graph into a torch_geometric `Data` object.

    Node i of the result is `author_list_all[i]`; graph nodes are mapped
    through the global `author_to_idx`, and authors missing from `G_nx`
    simply have no edges.

    Args:
        G_nx: networkx graph whose nodes are author ids.
        x_array: (N, feature_dim) node feature array.
        y_array: (N, num_classes) target distributions; all-zero rows mark
            nodes without a target and are left out of every split.
        train_frac, val_frac, test_frac: relative sizes of the three node
            splits (normalised by their sum).

    Returns:
        `Data` with x, y, edge_index, num_nodes and boolean train/val/test
        masks, all on DEVICE.

    Raises:
        ValueError: if the arrays do not have N rows, or the fractions are
            negative or do not sum to a positive number.
    """
    if x_array.shape[0] != N or y_array.shape[0] != N:
        raise ValueError(
            f"expected {N} rows, got x_array: {x_array.shape[0]}, y_array: {y_array.shape[0]}"
        )
    total_frac = train_frac + val_frac + test_frac
    if min(train_frac, val_frac, test_frac) < 0 or total_frac <= 0:
        raise ValueError("train/val/test fractions must be non-negative and sum to a positive number")

    # Edge list in index space. Edges touching a node that is not a known
    # author (no entry in author_to_idx) are skipped instead of raising KeyError.
    edges = [
        (author_to_idx[u], author_to_idx[v])
        for u, v in G_nx.edges()
        if u in author_to_idx and v in author_to_idx
    ]
    if len(edges) == 0:
        edge_index = torch.empty((2, 0), dtype=torch.long)
    else:
        edge_index = torch.tensor(np.array(edges).T, dtype=torch.long)
    # Store both directions of every edge
    edge_index = to_undirected(edge_index)

    # Build data object
    data = Data()
    data.num_nodes = N
    data.edge_index = edge_index.to(DEVICE)
    data.x = torch.tensor(x_array, dtype=torch.float32).to(DEVICE)
    data.y = torch.tensor(y_array, dtype=torch.float32).to(DEVICE)

    # Random split over nodes that have a non-zero target. The test set is
    # carved off first, then the remainder is divided into train/val, so all
    # three fractions are honoured.
    idx_with_target = np.where(np.asarray(y_array).sum(axis=1) > 0)[0]
    empty_idx = np.array([], dtype=int)
    if test_frac > 0:
        rest_idx, test_idx = train_test_split(
            idx_with_target, test_size=test_frac / total_frac, random_state=RANDOM_SEED
        )
    else:
        rest_idx, test_idx = idx_with_target, empty_idx
    if val_frac > 0 and train_frac > 0:
        train_idx, val_idx = train_test_split(
            rest_idx, test_size=val_frac / (train_frac + val_frac), random_state=RANDOM_SEED
        )
    elif val_frac > 0:
        train_idx, val_idx = empty_idx, rest_idx
    else:
        train_idx, val_idx = rest_idx, empty_idx

    mask_train = np.zeros(N, dtype=bool)
    mask_val = np.zeros(N, dtype=bool)
    mask_test = np.zeros(N, dtype=bool)
    mask_train[train_idx] = True
    mask_val[val_idx] = True
    mask_test[test_idx] = True
    data.train_mask = torch.tensor(mask_train, dtype=torch.bool).to(DEVICE)
    data.val_mask = torch.tensor(mask_val, dtype=torch.bool).to(DEVICE)
    data.test_mask = torch.tensor(mask_test, dtype=torch.bool).to(DEVICE)
    return data

# Build Data objects for both graphs
data_co = build_data_from_nx(G_coauthorship, X, Y)
data_cos = build_data_from_nx(G_cosine, X, Y)

In [None]:
# --------------------
# Model: 2-layer GraphSAGE + MLP head
# --------------------
class SAGEPredictor(nn.Module):
    """Two GraphSAGE layers followed by an MLP head.

    Each SAGE layer is followed by batch norm, ReLU and dropout. The forward
    pass returns per-node log-probabilities over `out_dim` classes.
    """

    def __init__(self, in_dim, hidden_dim, out_dim, dropout=0.5):
        super().__init__()
        self.conv1 = SAGEConv(in_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        half_dim = hidden_dim//2
        self.mlp = nn.Sequential(
            nn.Linear(hidden_dim, half_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(half_dim, out_dim)
        )
        self.dropout = dropout

    def _sage_block(self, conv, norm, h, edge_index):
        """conv -> batch norm -> ReLU -> dropout (dropout only in training mode)."""
        h = F.relu(norm(conv(h, edge_index)))
        return F.dropout(h, p=self.dropout, training=self.training)

    def forward(self, x, edge_index):
        h = self._sage_block(self.conv1, self.bn1, x, edge_index)
        h = self._sage_block(self.conv2, self.bn2, h, edge_index)
        logits = self.mlp(h)
        # log-probabilities, the input form KLDivLoss expects
        return F.log_softmax(logits, dim=-1)

# --------------------
# Training & eval helpers
# --------------------
from torch import optim
from torch.nn import KLDivLoss
from sklearn.metrics import top_k_accuracy_score
import math

def train_one(model, data, optimizer, criterion):
    """Run one full-batch optimisation step and return the training loss as a float.

    The model is evaluated on the whole graph, but the loss only covers the
    nodes selected by `data.train_mask`.
    """
    model.train()
    optimizer.zero_grad()
    train_nodes = data.train_mask
    log_probs = model(data.x, data.edge_index)
    # criterion compares predicted log-probabilities with target probabilities
    loss = criterion(log_probs[train_nodes], data.y[train_nodes])
    loss.backward()
    optimizer.step()
    return loss.item()

@torch.no_grad()
def evaluate(model, data, mask):
    """Score the model on the nodes selected by `mask`.

    Returns a dict with the mean KL divergence ("kl"), top-1 and top-3
    accuracy against the arg-max of each target row ("top1", "top3"), and the
    mean cosine similarity between predicted and target vectors ("cosine").
    All four values are NaN when the mask selects no node.
    """
    model.eval()
    log_probs = model(data.x, data.edge_index)
    selected = mask.cpu().numpy()
    if not selected.any():
        return dict.fromkeys(("kl", "top1", "top3", "cosine"), float('nan'))

    pred = log_probs.exp().cpu().numpy()[selected]
    targ = data.y.cpu().numpy()[selected]

    # KL(target || prediction), summed over classes and averaged over nodes;
    # both arguments are floored before the log to avoid log(0).
    eps = 1e-12
    log_ratio = np.log(np.maximum(targ, eps)) - np.log(np.maximum(pred, eps))
    kl = float(np.sum(targ * log_ratio) / len(pred))

    # Accuracy against the most likely target class
    y_true_idx = targ.argmax(axis=1)
    top1 = (pred.argmax(axis=1) == y_true_idx).mean()
    top3 = top_k_accuracy_score(y_true_idx, pred, k=3, labels=np.arange(NUM_CLASSES))

    # Mean cosine similarity between predicted and target vectors
    numerator = (pred * targ).sum(axis=1)
    denominator = np.linalg.norm(pred, axis=1) * np.linalg.norm(targ, axis=1) + eps
    cosine_mean = float(np.nanmean(numerator / denominator))

    return {"kl": kl, "top1": top1, "top3": top3, "cosine": cosine_mean}

# --------------------
# Training loop function: train on a given data object (coauthorship or cosine)
# --------------------
def run_training(data, name="graph"):
    """Train a fresh ``SAGEPredictor`` on ``data`` and report test metrics.

    The model is trained for ``EPOCHS`` full-graph steps with Adam and a
    batch-mean KL-divergence loss. After every epoch the train and validation
    splits are evaluated; the weights with the lowest validation KL are
    restored before the final evaluation on the test split.

    Args:
        data: Graph object exposing ``x``, ``edge_index``, ``y`` and the
            ``train_mask`` / ``val_mask`` / ``test_mask`` attributes.
        name: Label used as a prefix in the printed progress lines.

    Returns:
        Tuple ``(model, test_metrics)`` where ``test_metrics`` is the dict
        returned by ``evaluate`` for the test split.
    """
    model = SAGEPredictor(FEATURE_DIM, HIDDEN_DIM, NUM_CLASSES, dropout=DROPOUT).to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    criterion = KLDivLoss(reduction='batchmean')  # expects log_prob inputs and target probs
    best_val_kl = float('inf')
    best_state = None

    for epoch in range(1, EPOCHS + 1):
        loss = train_one(model, data, optimizer, criterion)
        train_metrics = evaluate(model, data, data.train_mask)
        val_metrics = evaluate(model, data, data.val_mask)
        if val_metrics["kl"] < best_val_kl:
            best_val_kl = val_metrics["kl"]
            # state_dict() hands back references to the live parameter tensors,
            # which the optimizer keeps updating in place. Clone them so the
            # snapshot really holds this epoch's weights, not the final ones.
            best_state = {
                key: tensor.detach().clone()
                for key, tensor in model.state_dict().items()
            }
        if epoch % 10 == 0 or epoch == 1:
            print(f"[{name}] Epoch {epoch:03d} | Loss {loss:.4f} | Train KL {train_metrics['kl']:.4f} | Val KL {val_metrics['kl']:.4f} | Val top1 {val_metrics['top1']:.3f} | Val cos {val_metrics['cosine']:.3f}")

    # Restore the best-validation snapshot. best_state stays None when the
    # validation KL never improved on infinity (e.g. it was NaN every epoch),
    # in which case the last-epoch weights are kept.
    if best_state is not None:
        model.load_state_dict(best_state)
    test_metrics = evaluate(model, data, data.test_mask)
    print(f"*** [{name}] TEST KL {test_metrics['kl']:.4f} | top1 {test_metrics['top1']:.3f} | top3 {test_metrics['top3']:.3f} | cos {test_metrics['cosine']:.3f}")
    return model, test_metrics

# --------------------
# Run experiments on both TAGs
# --------------------
# Fit one model per graph variant; run_training prints its own progress lines.
print("Training on Coauthorship TAG...")
model_co, metrics_co = run_training(data_co, name="Coauthorship")

print("\nTraining on Cosine TAG...")
model_cos, metrics_cos = run_training(data_cos, name="Cosine")

# Write the weights of both trained models to the working directory.
for trained_model, weights_path in (
    (model_co, "gnn_coauthorship.pt"),
    (model_cos, "gnn_cosine.pt"),
):
    torch.save(trained_model.state_dict(), weights_path)
print("Models saved.")

In [None]:
# Model variants, one bar group each along the x-axis.
sub_columns = ['gnn_only', 'llm_only', 'sequential', 'dual']
# One row per model variant; columns are (coauthorship, cosine, topic_similarity).
sub_accuracies = [[0.891, 0.891, 0], [0,0,0], [0,0,0], [0,0,0]]

# Grouped bar chart: three side-by-side bars (one per TAG type) per variant.
x = np.arange(len(sub_columns))
width = 0.2
fig, ax = plt.subplots(figsize=(10,6))

# Transpose the row-per-variant table into one accuracy list per TAG type.
coauthor_acc, cosine_acc, topic_acc = (list(per_tag) for per_tag in zip(*sub_accuracies))

bars1 = ax.bar(x - width, coauthor_acc, width, label='Coauthorship TAG', color='tab:red')
bars2 = ax.bar(x, cosine_acc, width, label='Cosine TAG', color='tab:blue')
bars3 = ax.bar(x + width, topic_acc, width, label='Topic_Similarity TAG', color='tab:green')

ax.set(ylabel='Accuracy', title='GNN Subtask Accuracies by TAG Type')
ax.set_xticks(x)
ax.set_xticklabels(sub_columns)
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# ---------------------------
# Setup placeholder data
# ---------------------------
graph_types = ["Coauthorship", "Cosine", "Topic Similarity"]
models = ["GNN Only", "LLM Only", "Sequential", "Dual"]
metrics = ["KL ↓", "Top-1 ↑", "Top-3 ↑", "Cosine ↑"]

# Metric values per graph variant and model, listed in the order of `metrics`.
# A value of 0 is a placeholder and is rendered as "–" in the table.
results = {
    "Coauthorship": {
        "GNN Only":  [0.4555, 0.891, 0.946, 0.908],
        "LLM Only":  [0, 0, 0, 0],
        "Sequential": [0, 0, 0, 0],
        "Dual":      [0, 0, 0, 0],
    },
    "Cosine": {
        "GNN Only":  [0.4844, 0.891, 0.942, 0.905],
        "LLM Only":  [0, 0, 0, 0],
        "Sequential": [0, 0, 0, 0],
        "Dual":      [0, 0, 0, 0],
    },
    "Topic Similarity": {
        "GNN Only":  [0, 0, 0, 0],
        "LLM Only":  [0, 0, 0, 0],
        "Sequential": [0, 0, 0, 0],
        "Dual":      [0, 0, 0, 0],
    },
}

# ---------------------------
# Build table data with spacer rows
# ---------------------------
columns = ["Graph", "Model"] + metrics

table_data = []
for i, graph in enumerate(graph_types):
    for model_name in models:
        formatted = [f"{v:.3f}" if v != 0 else "–" for v in results[graph][model_name]]
        table_data.append([graph, model_name] + formatted)
    # Insert an empty spacer row between graph sections (not after the last one).
    if i < len(graph_types) - 1:
        table_data.append([""] * len(columns))

# Independent copy kept under its own name; this is what gets plotted.
table_data_with_spacing = [list(cells) for cells in table_data]

# ---------------------------
# Plot table
# ---------------------------
fig, ax = plt.subplots(figsize=(9, 5))
ax.axis("off")
tbl = ax.table(
    cellText=table_data_with_spacing,
    colLabels=columns,
    loc="center",
    cellLoc="center",
)
tbl.auto_set_font_size(False)
tbl.set_fontsize(10)
tbl.scale(1.2, 1.6)  # slightly taller rows for visual spacing

# Style the header row and the spacer rows. Table row 0 is the header, so table
# row r corresponds to table_data_with_spacing[r - 1].
for (row, col), cell in tbl.get_celld().items():
    if row == 0:
        cell.set_text_props(weight="bold", color="white")
        cell.set_facecolor("#4C72B0")
    elif all(text == "" for text in table_data_with_spacing[row - 1]):
        # Spacer rows are recognised by their content rather than by a fixed
        # row stride, so the light-gray fill stays on the right rows when the
        # number of models per graph changes.
        cell.set_facecolor("#f0f0f0")

plt.title("Performance Summary Across Graph Variants and Models", pad=20)
plt.show()


In [None]:
from sklearn.metrics import top_k_accuracy_score

# Assume NUM_CLASSES = 174
