In [None]:
!pip install ogb
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv \
  -f https://data.pyg.org/whl/torch-2.4.0+cu124.html

!pip install -U torch-geometric

Looking in links: https://data.pyg.org/whl/torch-2.4.0+cu124.html


In [None]:
import os
from torch_geometric.datasets import TUDataset, PPI
from ogb.graphproppred import PygGraphPropPredDataset
from torch_geometric.data import DataLoader, HeteroData
import torch
import bz2, json
from tqdm import tqdm
from collections import defaultdict

# Loading our datasets

### MUTAG, PROTEINS, and PPI

In [None]:
# Instantiate each dataset once so that its files end up under data/.
# `dataset` is rebound on every step, so after this cell it refers to the
# PPI dataset (constructed without an explicit split argument).
for tu_name in ('MUTAG', 'PROTEINS'):
    dataset = TUDataset(root='data/TUDataset', name=tu_name)
dataset = PPI(root='data/PPI')

Downloading https://data.dgl.ai/dataset/ppi.zip
Extracting data/PPI/ppi.zip
Processing...
The default value will be changed to `edges="edges" in NetworkX 3.6.


  nx.node_link_graph(data, edges="links") to preserve current behavior, or
  nx.node_link_graph(data, edges="edges") for forward compatibility.
Done!


In [None]:
import numpy as np
import torch
from torch.utils.data import ConcatDataset
from torch_geometric.datasets import TUDataset, PPI

def _undirected_unique_edge_count(edge_index: torch.Tensor) -> int:
    if edge_index is None or edge_index.numel() == 0:
        return 0
    u, v = edge_index
    lo = torch.minimum(u, v)
    hi = torch.maximum(u, v)
    pairs = torch.stack([lo, hi], dim=1)           # [E, 2]
    uniq = torch.unique(pairs, dim=0)              # unique undirected edges
    return int(uniq.size(0))

def summarize_dataset(dataset, name: str):
    """Print size, degree and density statistics for a collection of graphs.

    Works for TUDataset, PPI splits, and ConcatDataset of PyG datasets. Edges
    are counted as unique undirected pairs via ``_undirected_unique_edge_count``.

    Args:
        dataset: Sized collection of graph objects exposing ``num_nodes`` and,
            optionally, ``edge_index``.
        name: Label used in the summary header.

    Returns:
        None. The summary is written to stdout.
    """
    num_graphs = len(dataset)
    if num_graphs == 0:
        # mean()/max() are undefined for an empty collection: report and stop
        # instead of printing NaNs and then failing on max([]).
        print(f"==== {name} Summary ====")
        print(f"Graphs: {num_graphs}")
        print("No graphs to summarize.\n")
        return

    if isinstance(dataset, ConcatDataset):
        # Index the ConcatDataset explicitly rather than relying on iteration.
        it = (dataset[i] for i in range(num_graphs))
    else:
        it = iter(dataset)

    num_nodes_list, num_edges_list, densities, avg_degrees = [], [], [], []
    for g in it:
        n = int(g.num_nodes)
        # A graph without edges may carry no edge_index at all; the helper
        # treats None as "zero edges", so only cast when a tensor is present.
        edge_index = getattr(g, "edge_index", None)
        e = _undirected_unique_edge_count(None if edge_index is None else edge_index.long())
        num_nodes_list.append(n)
        num_edges_list.append(e)
        # Undirected conventions: mean degree 2E/N, density 2E/(N(N-1)).
        avg_degrees.append(0.0 if n == 0 else (2.0 * e) / n)
        densities.append(0.0 if n <= 1 else (2.0 * e) / (n * (n - 1)))

    print(f"==== {name} Summary ====")
    print(f"Graphs: {num_graphs}")
    print(f"Total nodes: {sum(num_nodes_list):,}")
    print(f"Total edges: {sum(num_edges_list):,}")
    print(f"Avg nodes per graph: {np.mean(num_nodes_list):.2f}")
    print(f"Avg edges per graph: {np.mean(num_edges_list):.2f}")
    print(f"Avg degree (mean over graphs): {np.mean(avg_degrees):.2f}")
    print(f"Avg density (mean over graphs): {np.mean(densities):.6f}")
    print(f"Max nodes in a graph: {max(num_nodes_list)}")
    print(f"Max edges in a graph: {max(num_edges_list)}\n")

# ---- Load datasets ----
# MUTAG has 188 graphs, PROTEINS has 1,113 graphs.
mutag, proteins = (
    TUDataset(root='data/TUDataset', name=tu_name) for tu_name in ('MUTAG', 'PROTEINS')
)

# PPI: load ALL splits, not just train (20 train + 2 val + 2 test = 24 graphs total)
ppi_train, ppi_val, ppi_test = (
    PPI(root='data/PPI', split=split_name) for split_name in ('train', 'val', 'test')
)
ppi_all = ConcatDataset([ppi_train, ppi_val, ppi_test])

# ---- Summaries ----
for graphs, label in (
    (mutag, "MUTAG"),
    (proteins, "PROTEINS"),
    (ppi_all, "PPI (all splits)"),
):
    summarize_dataset(graphs, label)

==== MUTAG Summary ====
Graphs: 188
Total nodes: 3,371
Total edges: 3,721
Avg nodes per graph: 17.93
Avg edges per graph: 19.79
Avg degree (mean over graphs): 2.19
Avg density (mean over graphs): 0.138454
Max nodes in a graph: 28
Max edges in a graph: 33

==== PROTEINS Summary ====
Graphs: 1113
Total nodes: 43,471
Total edges: 81,044
Avg nodes per graph: 39.06
Avg edges per graph: 72.82
Avg degree (mean over graphs): 3.73
Avg density (mean over graphs): 0.212176
Max nodes in a graph: 620
Max edges in a graph: 1049

==== PPI (all splits) Summary ====
Graphs: 24
Total nodes: 56,944
Total edges: 793,632
Avg nodes per graph: 2372.67
Avg edges per graph: 33068.00
Avg degree (mean over graphs): 26.44
Avg density (mean over graphs): 0.012015
Max nodes in a graph: 3480
Max edges in a graph: 53377



### ogbg-molhiv

In [None]:
# === Summary stats from archive_extracted/raw with 2-col edge.csv (no headers, no graph_id) ===
#
# Reads raw/edge.csv, raw/num-node-list.csv and raw/num-edge-list.csv, cuts the
# flat edge table into consecutive per-graph slices and prints per-graph
# summary statistics (unique undirected edges, mean degree, density).

import os, zipfile
import numpy as np
import pandas as pd

zip_path = "archive.zip"
extract_dir = "archive_extracted"
raw_dir = os.path.join(extract_dir, "raw")

# 0) Ensure extracted
if not os.path.exists(raw_dir):
    assert os.path.exists(zip_path), f"Couldn't find {zip_path} in CWD"
    with zipfile.ZipFile(zip_path, "r") as z:
        z.extractall(extract_dir)
    assert os.path.exists(raw_dir), f"Couldn't find {raw_dir} after unzip"

edges_csv = os.path.join(raw_dir, "edge.csv")
num_nodes_csv = os.path.join(raw_dir, "num-node-list.csv")
num_edges_csv = os.path.join(raw_dir, "num-edge-list.csv")

# 1) Load files
assert os.path.exists(edges_csv), f"Missing {edges_csv}"
assert os.path.exists(num_nodes_csv), f"Missing {num_nodes_csv}"
assert os.path.exists(num_edges_csv), f"Missing {num_edges_csv}"

# edge.csv has NO header and only two columns -> read as src,dst
edges = pd.read_csv(edges_csv, header=None, names=["src", "dst"])
e_counts = pd.read_csv(num_edges_csv, header=None)[0].astype(int).tolist()
n_counts = pd.read_csv(num_nodes_csv, header=None)[0].astype(int).tolist()

# Sanity checks
total_edges_listed = len(edges)
total_edges_declared = sum(e_counts)

# Number of edge.csv rows that make up one declared edge. It becomes 2 when the
# file holds twice as many rows as declared, so that each graph's slice spans
# all of its rows instead of only the first half.
# NOTE(review): this assumes both directions of a graph are stored contiguously
# inside that graph's own slice -- confirm against the dump format.
rows_per_edge = 1
if total_edges_declared != total_edges_listed:
    # Some dumps store each undirected edge twice (u,v) + (v,u). Try halving if that matches.
    if total_edges_declared * 2 == total_edges_listed:
        print("Note: edge.csv appears to store both directions; will unique within each graph slice.")
        rows_per_edge = 2
    else:
        raise ValueError(
            f"num-edge-list.csv sums to {total_edges_declared} but edge.csv has {total_edges_listed} rows."
        )

if len(e_counts) != len(n_counts):
    raise ValueError(
        f"num-edge-list.csv has {len(e_counts)} rows but num-node-list.csv has {len(n_counts)} rows."
    )

# 2) Per-graph computation by slicing edge.csv according to e_counts
num_graphs = len(e_counts)
num_nodes_list, num_edges_list, avg_degrees, densities = [], [], [], []

# Convert once; slicing the array per graph avoids a DataFrame -> ndarray
# conversion on every iteration.
edge_array = edges.to_numpy(dtype=np.int64)

start = 0
for i in range(num_graphs):
    n = int(n_counts[i])
    cnt_e = int(e_counts[i])

    # slice this graph's edges (stride accounts for doubled storage)
    end = start + cnt_e * rows_per_edge
    sub = edge_array[start:end]
    start = end

    if sub.size == 0:
        e_undir = 0
    else:
        u = sub[:, 0]
        v = sub[:, 1]
        # order the endpoints so (u, v) and (v, u) become the same row
        lo = np.minimum(u, v)
        hi = np.maximum(u, v)
        pairs = np.stack([lo, hi], axis=1)
        # unique undirected edges
        pairs_unique = np.unique(pairs, axis=0)
        e_undir = int(pairs_unique.shape[0])

    num_nodes_list.append(n)
    num_edges_list.append(e_undir)
    avg_degrees.append(0.0 if n == 0 else (2.0 * e_undir) / n)
    densities.append(0.0 if n <= 1 else (2.0 * e_undir) / (n * (n - 1)))

# 3) Print in your format
title = os.path.basename(os.path.abspath(extract_dir))
print(f"==== {title} Summary ====")
print(f"Graphs: {num_graphs}")
print(f"Total nodes: {sum(num_nodes_list):,}")
print(f"Total edges: {sum(num_edges_list):,}")
print(f"Avg nodes per graph: {np.mean(num_nodes_list):.2f}")
print(f"Avg edges per graph: {np.mean(num_edges_list):.2f}")
print(f"Avg degree (mean over graphs): {np.mean(avg_degrees):.2f}")
print(f"Avg density (mean over graphs): {np.mean(densities):.6f}")
print(f"Max nodes in a graph: {max(num_nodes_list)}")
print(f"Max edges in a graph: {max(num_edges_list)}")


AssertionError: Couldn't find archive.zip in CWD

### Hetionet

In [None]:
def norm_id(x):
    """Return a string form of a node identifier.

    Lists and tuples are joined with ``"::"`` (e.g. ``["Gene", 1234]`` becomes
    ``"Gene::1234"``); any other value is passed through ``str``.
    """
    if not isinstance(x, (list, tuple)):
        return str(x)
    return "::".join(str(part) for part in x)

def first_key(d, candidates, *, required=False, context=""):
    """Return the first entry of ``candidates`` that is a key of ``d``.

    Args:
        d: Mapping to probe.
        candidates: Key names to try, in priority order.
        required: When true, a miss raises instead of returning ``None``.
        context: Short label for the error message; ``'object'`` is used when empty.

    Returns:
        The first matching candidate, or ``None`` if nothing matches and
        ``required`` is false.

    Raises:
        KeyError: If nothing matches and ``required`` is true.
    """
    no_match = object()
    found = next((name for name in candidates if name in d), no_match)
    if found is not no_match:
        return found
    if not required:
        return None
    raise KeyError(f"None of {candidates} found in {context or 'object'}; available keys: {list(d.keys())[:10]}")

# Metagraph: only used here to show a sample of the metanode kinds.
with open("hetionet-v1.0-metagraph.json") as f:
    metagraph = json.load(f)
print("Metanode kinds (sample):", metagraph.get("metanode_kinds", [])[:8])

# Full graph dump (bz2-compressed JSON, read in text mode).
with bz2.open("hetionet-v1.0.json.bz2", "rt") as f:
    het = json.load(f)

# Both top-level containers are needed by everything that follows.
nodes_container, edges_container = (het.get(section) for section in ("nodes", "edges"))
if any(container is None for container in (nodes_container, edges_container)):
    raise KeyError("Top-level keys 'nodes' and/or 'edges' not found in the JSON.")

# --- Detect node storage style ---
node_id_key_candidates = ["id", "identifier", "node_id"]
node_type_key_candidates = ["kind", "type", "label", "node_type"]

node_is_dict_map = isinstance(nodes_container, dict)
if node_is_dict_map:
    # Peek at one (id, node) entry through a throwaway iterator, then hand a
    # fresh items() view to the indexing pass.
    sample_id, sample_node = next(iter(nodes_container.items()))
    nodes_iter = nodes_container.items()
else:
    # Same peek for the sequence form; the enumerate used for peeking is
    # discarded so the indexing pass starts from the first element again.
    _, sample_node = next(enumerate(nodes_container))
    nodes_iter = enumerate(nodes_container)
    # ensure node dict has an id-like key (raises KeyError otherwise)
    _ = first_key(sample_node, node_id_key_candidates, required=True, context="node")

node_type_key = first_key(sample_node, node_type_key_candidates, required=True, context="node")

# --- Pass 1: collect ids per type (and build aliasable identity strings) ---
# Both storage styles yield (key, node) pairs: the dict key for a dict map, the
# position for a list. A single loop therefore serves both.
type_to_ids = defaultdict(list)     # canonical IDs per type (base id without type prefix)
canon_to_type = {}                  # canonical id -> type (e.g., "UBERON:0000178" -> "Anatomy")

print("Indexing nodes (with canonical IDs)...")
for raw_id, n in tqdm(nodes_iter, total=len(nodes_container)):
    ntype = n.get(node_type_key)
    if ntype is None:
        raise KeyError(f"Node missing '{node_type_key}'. Keys present: {list(n.keys())[:10]}")

    # canonical id: prefer the node's own id field. Only a dict map may fall
    # back to its dict key; a list position is not an identifier, so list-style
    # nodes must carry an id field (first_key raises KeyError otherwise).
    id_key = first_key(n, node_id_key_candidates, required=not node_is_dict_map, context="node")
    nid = str(n[id_key]) if id_key is not None else str(raw_id)

    # If nid looks like "Anatomy::UBERON:0000178", strip prefix for canonical
    canon = nid.split("::", 1)[1] if "::" in nid else nid
    # NOTE(review): a canonical id repeated under another type overwrites its
    # canon_to_type entry -- ids are assumed unique across node types.
    canon_to_type[canon] = ntype
    type_to_ids[ntype].append(canon)

# --- Assign per-type contiguous indices ---
# id_to_local[type][canonical_id] -> position of that id within its type;
# type_counts[type] -> number of ids collected for that type.
id_to_local, type_counts = {}, {}
print("Assigning per-type node indices...")
for ntype, canon_ids in tqdm(type_to_ids.items()):
    id_to_local[ntype] = {canon_id: position for position, canon_id in enumerate(canon_ids)}
    type_counts[ntype] = len(canon_ids)

# --- Build a resolver that accepts plain ids and qualified ids ---
def resolve_type_and_index(any_id_str):
    """Map a node id string to ``(node_type, local_index)``.

    Two spellings are accepted:
       - plain:     'UBERON:0000178'  (type looked up in ``canon_to_type``)
       - qualified: 'Anatomy::UBERON:0000178'  (type is the part before the first '::')

    Raises KeyError when the type or the id is not present in the lookup tables.
    """
    ntype, separator, canon = any_id_str.partition("::")
    if not separator:
        # No type prefix: the whole string is the canonical id.
        canon = any_id_str
        ntype = canon_to_type[canon]
    return ntype, id_to_local[ntype][canon]

# --- Prepare edges iterable and detect edge keys ---
if not isinstance(edges_container, dict):
    edges_iterable = edges_container
else:
    # Dict form maps relation name -> list of edges. Flatten it, copying each
    # edge and recording the relation under "kind" unless the edge has one.
    flat_edges = []
    for rel, elist in edges_container.items():
        for e in elist:
            edge_copy = dict(e)
            if "kind" not in edge_copy:
                edge_copy["kind"] = rel
            flat_edges.append(edge_copy)
    edges_iterable = flat_edges

# The first edge is taken as representative for choosing the field names.
first_edge = edges_iterable[0]
edge_src_key = first_key(first_edge, ["source_id", "source", "src", "start"], required=True, context="edge")
edge_dst_key = first_key(first_edge, ["target_id", "target", "dst", "end"], required=True, context="edge")
edge_rel_key = first_key(first_edge, ["kind", "type", "relation", "rel"], required=True, context="edge")

# --- Group edges by (src_type, relation, dst_type) ---
# edge_lists[(src_type, rel, dst_type)] -> list of [src_local_idx, dst_local_idx]
edge_lists = defaultdict(list)

print("Grouping edges...")
for e in tqdm(edges_iterable):
    src_raw = e[edge_src_key]
    dst_raw = e[edge_dst_key]
    rel     = e[edge_rel_key]

    src_id = norm_id(src_raw)  # may become "Anatomy::UBERON:0000178"
    dst_id = norm_id(dst_raw)

    # Track which endpoint is being resolved so a failed lookup names the id
    # that actually failed, whether it is plain or type-qualified.
    failing_id = src_id
    try:
        src_type, src_idx = resolve_type_and_index(src_id)
        failing_id = dst_id
        dst_type, dst_idx = resolve_type_and_index(dst_id)
    except KeyError as ke:
        raise KeyError(
            f"Edge references unknown node id: {failing_id}. "
            f"If your nodes store plain ids (e.g., 'UBERON:...') and edges use qualified ids "
            f"(e.g., 'Anatomy::UBERON:...'), this loader should handle it. "
            f"Otherwise, inspect node/edge keys."
        ) from ke

    edge_lists[(src_type, rel, dst_type)].append([src_idx, dst_idx])

# --- Build HeteroData ---
data = HeteroData()

# Node stores carry only a node count.
for node_type, node_count in type_counts.items():
    data[node_type].num_nodes = node_count

# Each relation's list of [src, dst] rows is transposed into a contiguous
# tensor with one column per edge.
for edge_type, index_pairs in tqdm(edge_lists.items(), desc="Finalizing edge_index"):
    data[edge_type].edge_index = torch.tensor(index_pairs, dtype=torch.long).t().contiguous()

print("✅ Loaded Hetionet into HeteroData")
print(data)

Metanode kinds (sample): ['Anatomy', 'Biological Process', 'Cellular Component', 'Compound', 'Disease', 'Gene', 'Molecular Function', 'Pathway']
Indexing nodes (with canonical IDs)...


100%|██████████| 47031/47031 [00:00<00:00, 674871.74it/s]


Assigning per-type node indices...


100%|██████████| 11/11 [00:00<00:00, 763.43it/s]


Grouping edges...


100%|██████████| 2250197/2250197 [00:23<00:00, 97069.90it/s] 
Finalizing edge_index: 100%|██████████| 24/24 [00:00<00:00, 27.01it/s]

✅ Loaded Hetionet into HeteroData
HeteroData(
  Molecular Function={ num_nodes=2884 },
  Side Effect={ num_nodes=5734 },
  Gene={ num_nodes=20945 },
  Biological Process={ num_nodes=11381 },
  Compound={ num_nodes=1552 },
  Pathway={ num_nodes=1822 },
  Anatomy={ num_nodes=402 },
  Cellular Component={ num_nodes=1391 },
  Symptom={ num_nodes=438 },
  Disease={ num_nodes=137 },
  Pharmacologic Class={ num_nodes=345 },
  (Anatomy, upregulates, Gene)={ edge_index=[2, 97848] },
  (Anatomy, expresses, Gene)={ edge_index=[2, 526407] },
  (Gene, interacts, Gene)={ edge_index=[2, 147164] },
  (Gene, participates, Pathway)={ edge_index=[2, 84372] },
  (Anatomy, downregulates, Gene)={ edge_index=[2, 102240] },
  (Compound, upregulates, Gene)={ edge_index=[2, 18756] },
  (Compound, causes, Side Effect)={ edge_index=[2, 138944] },
  (Gene, participates, Molecular Function)={ edge_index=[2, 97222] },
  (Gene, participates, Biological Process)={ edge_index=[2, 559504] },
  (Compound, binds, Gene)={ 




In [None]:
import numpy as np

print("==== Hetionet Summary ====")

# --- Type counts ---
num_node_types = len(data.node_types)
num_edge_types = len(data.edge_types)
print(f"Node types: {num_node_types}")
print(f"Edge relation types: {num_edge_types}\n")


# --- Totals (robust to missing edge_index) ---
total_nodes = sum(int(data[nt].num_nodes) for nt in data.node_types)

total_edges = 0
for et in data.edge_types:
    store = data[et]
    # an edge store without an edge_index contributes no edges
    if "edge_index" in store:
        total_edges += int(store.edge_index.size(1))
print(f"Total nodes: {total_nodes:,}")
print(f"Total edges: {total_edges:,}\n")

# --- Global averages/density (undirected-style; density is crude) ---
# Both figures use the undirected convention, 2E/N and 2E/(N(N-1)), matching
# the per-graph summaries computed for the other datasets in this notebook.
# The density is crude because it treats every node pair as a possible edge,
# regardless of node type.
avg_deg = (2 * total_edges / total_nodes) if total_nodes > 0 else float("nan")
density = (2 * total_edges / (total_nodes * (total_nodes - 1))) if total_nodes > 1 else float("nan")

print(f"Average degree    = {avg_deg:.2f}")
print(f"Graph density (approx) ≈ {density:.8f}")

==== Hetionet Summary ====
Node types: 11
Edge relation types: 25

Total nodes: 47,031
Total edges: 2,250,197

Average degree    = 95.69
Graph density (approx) ≈ 0.00101733
