# Collaboration Networks â€” Data Loading + Graph Summary + LCC

This notebook:

- Loads `.mtx` adjacency matrices and converts them to NetworkX graphs
- Prints key statistics (Nodes, Edges, Avg Degree, Avg Clustering, Approx Avg Path Length)
- Extracts the Largest Connected Component (LCC)
- Compares metrics before vs after LCC


In [2]:
import os
import time
import random
import numpy as np
import pandas as pd
import networkx as nx

from scipy.io import mmread
from scipy.sparse import csr_matrix

# Reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

pd.set_option("display.max_columns", 50)
pd.set_option("display.width", 160)


In [3]:
DATASETS = {
    "ca-CondMat": "../data/ca-CondMat/ca-CondMat.mtx",
    "ca-GrQc":    "../data/ca-GrQc/ca-GrQc.mtx",
    "ca-HepPh":   "../data/ca-HepPh/ca-HepPh.mtx",
}
# Basic sanity check
missing = [name for name, path in DATASETS.items() if not os.path.exists(path)]
if missing:
    raise FileNotFoundError(f"Missing dataset(s): {missing}\nPlease check the file paths.")
print("Dataset paths OK")

Dataset paths OK


In [4]:
def load_graph_from_mtx(path: str) -> tuple[nx.Graph, float]:
    """
    Reads a Matrix Market (.mtx) adjacency matrix and builds an undirected simple graph.
    - Converts to CSR for efficiency
    - Builds a NetworkX Graph from the sparse matrix
    - Removes self-loops
    - Ensures a simple undirected Graph
    Returns: (Graph, load_time_seconds)
    """
    t0 = time.time()
    A = mmread(path)

    if not isinstance(A, csr_matrix):
        A = A.tocsr()

    G = nx.from_scipy_sparse_array(A)  # Graph for symmetric adjacency

    # Remove self-loops if any
    G.remove_edges_from(nx.selfloop_edges(G))

    # Ensure simple Graph
    if isinstance(G, (nx.MultiGraph, nx.MultiDiGraph)):
        G = nx.Graph(G)

    return G, (time.time() - t0)


def get_lcc(G: nx.Graph) -> nx.Graph:
    """Return the Largest Connected Component (induced subgraph) as a copy."""
    if G.number_of_nodes() == 0:
        return G.copy()
    largest_cc_nodes = max(nx.connected_components(G), key=len)
    return G.subgraph(largest_cc_nodes).copy()


def approx_average_path_length(G: nx.Graph, sample_size: int = 60, cutoff: int = None) -> float:
    """
    Approximate average shortest path length by sampling BFS from random source nodes.
    For disconnected graphs, we average over reachable pairs from sampled sources.
    Returns NaN if graph is too small or no reachable pairs are found.
    """
    n = G.number_of_nodes()
    if n < 2:
        return float("nan")

    nodes = list(G.nodes())
    k = min(sample_size, n)
    sources = random.sample(nodes, k)

    dists = []
    for s in sources:
        sp = nx.single_source_shortest_path_length(G, s, cutoff=cutoff)
        for _, dist in sp.items():
            if dist > 0:
                dists.append(dist)

    return float(np.mean(dists)) if dists else float("nan")


def graph_metrics(G: nx.Graph, path_sample_size: int = 60) -> dict:
    """
    Compute requested graph-level stats:
    - Nodes, Edges
    - Average Degree
    - Average Clustering
    - Approx. Average Shortest Path Length
    """
    n = G.number_of_nodes()
    m = G.number_of_edges()

    avg_degree = (2.0 * m / n) if n > 0 else float("nan")
    avg_clustering = nx.average_clustering(G) if n > 0 else float("nan")
    avg_path_approx = approx_average_path_length(G, sample_size=path_sample_size)

    return {
        "Nodes": n,
        "Edges": m,
        "Average Degree": avg_degree,
        "Average Clustering": avg_clustering,
        "Average Path (approx)": avg_path_approx,
    }


In [5]:
graphs = {}
load_times = {}

for name, path in DATASETS.items():
    G, load_s = load_graph_from_mtx(path)
    graphs[name] = G
    load_times[name] = load_s
    print(f"{name}: loaded in {load_s:.2f}s | nodes={G.number_of_nodes():,} | edges={G.number_of_edges():,}")

ca-CondMat: loaded in 0.26s | nodes=21,363 | edges=91,286
ca-GrQc: loaded in 0.09s | nodes=4,158 | edges=13,422
ca-HepPh: loaded in 0.23s | nodes=11,204 | edges=117,619


In [6]:
rows_before = []
for name, G in graphs.items():
    stats = graph_metrics(G, path_sample_size=60)
    stats["Dataset"] = name
    stats["Load Time (s)"] = load_times[name]
    rows_before.append(stats)

df_before = pd.DataFrame(rows_before).set_index("Dataset")
df_before


Unnamed: 0_level_0,Nodes,Edges,Average Degree,Average Clustering,Average Path (approx),Load Time (s)
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ca-CondMat,21363,91286,8.546178,0.641732,5.17629,0.25766
ca-GrQc,4158,13422,6.455988,0.556878,6.065243,0.089085
ca-HepPh,11204,117619,20.995894,0.621582,4.643917,0.233351


In [7]:
graphs_lcc = {}
lcc_coverage = {}

for name, G in graphs.items():
    Glcc = get_lcc(G)
    graphs_lcc[name] = Glcc

    lcc_coverage[name] = {
        "LCC Node %": 100.0 * Glcc.number_of_nodes() / max(1, G.number_of_nodes()),
        "LCC Edge %": 100.0 * Glcc.number_of_edges() / max(1, G.number_of_edges()),
    }

    print(
        f"{name}: LCC nodes={Glcc.number_of_nodes():,} ({lcc_coverage[name]['LCC Node %']:.2f}%) | "
        f"edges={Glcc.number_of_edges():,} ({lcc_coverage[name]['LCC Edge %']:.2f}%)"
    )


ca-CondMat: LCC nodes=21,363 (100.00%) | edges=91,286 (100.00%)
ca-GrQc: LCC nodes=4,158 (100.00%) | edges=13,422 (100.00%)
ca-HepPh: LCC nodes=11,204 (100.00%) | edges=117,619 (100.00%)


In [9]:
rows_after = []
for name, G in graphs_lcc.items():
    stats = graph_metrics(G, path_sample_size=60)
    stats["Dataset"] = name
    rows_after.append(stats)

df_after = pd.DataFrame(rows_after).set_index("Dataset")
df_after


Unnamed: 0_level_0,Nodes,Edges,Average Degree,Average Clustering,Average Path (approx)
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ca-CondMat,21363,91286,8.546178,0.641732,5.262096
ca-GrQc,4158,13422,6.455988,0.556878,6.161463
ca-HepPh,11204,117619,20.995894,0.621582,4.679209


## Notes

- In all three datasets, the original graphs are already connected; therefore, extracting the Largest Connected Component (LCC) does not change the number of nodes or edges.
- Despite the graphs being connected, computing the exact average shortest path length is computationally expensive for large networks.
- For this reason, **Average Path (approx)** is estimated by sampling BFS from randomly selected source nodes and averaging distances over reachable node pairs.
