In [2]:
!pip install --upgrade pip
!pip3 install torch==2.3.1 --index-url https://download.pytorch.org/whl/cu121
!pip install torch_geometric
!pip install torch_cluster torch_scatter -f https://data.pyg.org/whl/torch-2.3.1+cu121.html
!pip install gdown

Looking in indexes: https://download.pytorch.org/whl/cu121
Looking in links: https://data.pyg.org/whl/torch-2.3.1+cu121.html


In [3]:
!pip -q install --force-reinstall --no-deps fsspec==2023.6.0



In [4]:
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import random_split
from torch_geometric.data import Data, InMemoryDataset, DataLoader
from torch_geometric.nn import GINEConv, global_add_pool, global_mean_pool, global_max_pool
from torch_geometric.utils import dense_to_sparse
import random
import math
import numpy as np
from typing import List
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
import math, random
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device



device(type='cpu')

Dataset (TU)

In [21]:
from torch_geometric.datasets import TUDataset
dataset = TUDataset(root='data/TUD', name='MUTAG')  # 188 graphs
sizes = [data.num_nodes for data in dataset]
idx = int(np.argmax([n if n >= 25 else 0 for n in sizes]))  # pick a larger graph
data = dataset[idx]
print(f"Graph index {idx}: nodes={data.num_nodes}, edges={data.num_edges // 2} (undirected)")
print(data)

Graph index 5: nodes=28, edges=31 (undirected)
Data(edge_index=[2, 62], x=[28, 7], edge_attr=[62, 4], y=[1])



# MUTAG Data Preprocessing
Adapted from LinkTeller code for consistency

In [15]:
dataset = TUDataset(root="data/TUDataset", name="MUTAG", use_node_attr=True)
# MUTAG typically has node attributes; if not present, dataset.data.x may be None
if dataset.num_node_features == 0:
    # fallback: use one-hot of node labels if available, else create constant feature
    if dataset.num_node_labels > 0:
        print("No node features found — using node labels one-hot")
        # transform each graph: replace x by one-hot of node_label (PyG stores node labels in data.x sometimes)
        for data in dataset:
            # some TUDataset encodes node labels in data.x as integers inside x, or in data.node_label
            # here we'll convert data.x (assumed scalar) -> one-hot
            if data.x is not None and data.x.dim() == 1:
                num_cat = int(data.x.max().item()) + 1
                one_hot = F.one_hot(data.x.long(), num_classes=num_cat).to(torch.float)
                data.x = one_hot
            else:
                # create constant feature
                data.x = torch.ones((data.num_nodes, 1), dtype=torch.float)
    else:
        for data in dataset:
            data.x = torch.ones((data.num_nodes, 1), dtype=torch.float)


Downloading https://www.chrsmrrs.com/graphkerneldatasets/MUTAG.zip
Processing...
Done!


# GIN Model
Customizable GINE model built around the pyg GINEConv layer. Use GINE over GIN to train on edge features

In [49]:
class EdgeMLP(nn.Module):
    """Small MLP to combine node and edge features inside each GINE layer."""
    def __init__(self, input_dim, hidden_dim, dropout=0.0):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class GINEModel(nn.Module):
    def __init__(self, node_input_dim, edge_input_dim, hidden_dim=64,
                 num_layers=3, dropout=0.2, num_classes=2):
        super().__init__()
        self.num_layers = num_layers
        self.convs = nn.ModuleList()
        self.bns = nn.ModuleList()
        self.edge_transform = nn.ModuleList()
        for i in range(num_layers):
            in_dim = node_input_dim if i == 0 else hidden_dim
            mlp = EdgeMLP(in_dim, hidden_dim, dropout)
            #don't explicitly define edge_dim here; transform edge features into
            #the same dimension as the GINE in the forward pass with edge_layer
            conv = GINEConv(nn=mlp, train_eps=True)
            edge_layer = nn.Linear(edge_input_dim, in_dim)
            self.convs.append(conv)
            self.bns.append(nn.BatchNorm1d(hidden_dim))
            self.edge_transform.append(edge_layer)

        self.pool = global_add_pool
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, data):
        x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch
        if edge_attr is None:
            #if dataset doesn’t have edge attributes, use zeros
            edge_attr = torch.zeros((edge_index.size(1), 1), device=x.device)
        for conv, bn, edge_transform in zip(self.convs, self.bns, self.edge_transform):
            #transform edge features to match node input dimensions
            edge_attr_trans = edge_transform(edge_attr)
            x = conv(x, edge_index, edge_attr_trans)
            x = bn(x)
            x = F.relu(x)
        g = self.pool(x, batch)
        out = self.classifier(g)
        return out


# Training Utility Functions

In [33]:
def train_epoch(model, loader, optimizer, device):
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0
    for data in loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data)
        loss = F.cross_entropy(out, data.y.view(-1).to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * data.num_graphs
        preds = out.argmax(dim=1)
        correct += (preds == data.y.to(device)).sum().item()
        total += data.num_graphs
    return total_loss / total, correct / total

In [34]:
@torch.no_grad()
def evaluate(model, loader, device):
    model.eval()
    correct = 0
    total = 0
    total_loss = 0.0
    for data in loader:
        data = data.to(device)
        out = model(data)
        loss = F.cross_entropy(out, data.y.view(-1).to(device))
        preds = out.argmax(dim=1)
        correct += (preds == data.y.to(device)).sum().item()
        total += data.num_graphs
        total_loss += loss.item() * data.num_graphs
    return total_loss / total, correct / total





# Baseline Training & Evaluation

In [50]:
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import Subset

#hyperparams
seed = 42
epochs = 100
batch_size = 32
lr = 1e-3
weight_decay = 1e-5
#initialize random #s
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


X = list(range(len(dataset)))
y = np.array([int(dataset[i].y.item()) for i in range(len(dataset))])
#StratifiedKFold for cross-validation (90/10 train-test split each time)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

fold_accuracies = []
#iterate over each k-fold
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
    print(f"\n=== Fold {fold} ===")
    train_subset = Subset(dataset, train_idx)
    test_subset = Subset(dataset, test_idx)

    train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_subset, batch_size=batch_size, shuffle=False)
    #reinitialize model from scratch each time
    model = GINEModel(node_input_dim=dataset[0].x.shape[1],
                      edge_input_dim=dataset[0].edge_attr.shape[1],
                      hidden_dim=32, num_layers=2,
                      dropout=0.2,
                      num_classes=dataset.num_classes).to(device)
    optimizer = Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    best_test_acc = 0.0
    #iterate over epochs
    for epoch in range(1, epochs + 1):
        train_loss, train_acc = train_epoch(model, train_loader, optimizer, device)
        test_loss, test_acc = evaluate(model, test_loader, device)
        if test_acc > best_test_acc:
            best_test_acc = test_acc
        if epoch % 20 == 0 or epoch == 1 or epoch == epochs:
            print(f"Epoch {epoch:03d} | Train loss {train_loss:.4f} acc {train_acc:.4f} | Test loss {test_loss:.4f} acc {test_acc:.4f}")
    print(f"Fold {fold} best test acc: {best_test_acc:.4f}")
    fold_accuracies.append(best_test_acc)

mean_acc = np.mean(fold_accuracies)
std_acc = np.std(fold_accuracies)
print(f"\n10-fold CV accuracy: {mean_acc:.4f} ± {std_acc:.4f}")


=== Fold 1 ===
torch.Size([38, 4]) torch.Size([17, 7])
Epoch 001 | Train loss 1.0070 acc 0.6627 | Test loss 0.7204 acc 0.2105


  train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
  test_loader = DataLoader(test_subset, batch_size=batch_size, shuffle=False)


Epoch 020 | Train loss 0.4377 acc 0.8047 | Test loss 0.8784 acc 0.3158
Epoch 040 | Train loss 0.3106 acc 0.8402 | Test loss 1.7432 acc 0.3158
Epoch 060 | Train loss 0.3174 acc 0.8580 | Test loss 0.6960 acc 0.7368
Epoch 080 | Train loss 0.2738 acc 0.8698 | Test loss 3.3051 acc 0.3158
Epoch 100 | Train loss 0.2569 acc 0.8876 | Test loss 0.5477 acc 0.7368
Fold 1 best test acc: 1.0000

=== Fold 2 ===
torch.Size([38, 4]) torch.Size([17, 7])
Epoch 001 | Train loss 0.7076 acc 0.6036 | Test loss 0.6345 acc 0.7368
Epoch 020 | Train loss 0.3814 acc 0.7751 | Test loss 0.6836 acc 0.5789
Epoch 040 | Train loss 0.3019 acc 0.8580 | Test loss 0.7886 acc 0.3684
Epoch 060 | Train loss 0.2716 acc 0.8994 | Test loss 1.1395 acc 0.3684
Epoch 080 | Train loss 0.2494 acc 0.8876 | Test loss 1.2231 acc 0.3158
Epoch 100 | Train loss 0.1823 acc 0.9231 | Test loss 1.4674 acc 0.3158
Fold 2 best test acc: 0.8947

=== Fold 3 ===
torch.Size([38, 4]) torch.Size([17, 7])
Epoch 001 | Train loss 0.8095 acc 0.5917 | Test l