## Molecular graph prediction
- Loads the Tox21 dataset where each molecule is a graph.

- Trains a GCN model to predict multiple chemical properties.

- Uses binary cross-entropy loss and sigmoid for multi-label classification.

- Handles missing labels via masking.


Description:
In molecular graph prediction, each molecule is represented as a graph where atoms are nodes and bonds are edges. 
The goal is to predict chemical properties like toxicity, solubility, or bioactivity. 
Graph Neural Networks (GNNs) excel at this because they respect molecular structure. In this project, we'll use a GCN model to predict molecular properties from the Tox21 dataset.


In [3]:
import torch
import torch.nn.functional as F
from torch_geometric.datasets import MoleculeNet
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from sklearn.metrics import roc_auc_score
import numpy as np

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tox21 from MoleculeNet, rooted at ./data.
dataset = MoleculeNet(name="Tox21", root="data")
# Number of label columns, read from the first graph's target tensor.
num_tasks = dataset[0].y.shape[1]

# Ordered 80/20 split: the leading 80% of graphs train, the remainder test.
# NOTE: the dataset is not shuffled before splitting.
split_at = int(0.8 * len(dataset))
train_dataset = dataset[:split_at]
test_dataset = dataset[split_at:]

# Only the training loader shuffles its batches.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

# Model
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network with a graph-level readout.

    Node features pass through two ``GCNConv`` layers (each followed by
    ReLU), are mean-pooled per graph, and are mapped by a linear layer to
    one output per task. The outputs are raw logits; no sigmoid is applied
    here.
    """

    def __init__(self, num_node_features, num_tasks, hidden_channels=64):
        """Build the layers.

        Args:
            num_node_features: Size of each input node feature vector.
            num_tasks: Number of outputs produced per graph.
            hidden_channels: Width of both graph convolution layers.
                Defaults to 64.
        """
        super().__init__()
        self.conv1 = GCNConv(num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.fc = torch.nn.Linear(hidden_channels, num_tasks)

    def forward(self, x, edge_index, batch):
        """Return per-graph logits.

        Args:
            x: Node feature tensor; cast to float before the convolutions.
            edge_index: Edge index tensor passed to both ``GCNConv`` layers.
            batch: Vector assigning each node to a graph, used for pooling.

        Returns:
            The output of the final linear layer (one row per pooled graph,
            ``num_tasks`` columns).
        """
        # The convolutions need floating-point input.
        # NOTE(review): presumably the dataset stores integer node features - confirm.
        x = x.float()
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        # Collapse node embeddings into a single embedding per graph.
        x = global_mean_pool(x, batch)
        return self.fc(x)

# Build the network and move its parameters to the selected device.
model = GCN(num_node_features=dataset.num_node_features, num_tasks=num_tasks)
model = model.to(device)
# Adam over all model parameters with a 1e-3 learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# Binary cross-entropy that takes raw logits as input.
criterion = torch.nn.BCEWithLogitsLoss()

# Training
def train():
    """Run one training epoch over ``train_loader``.

    Missing labels (NaN entries in ``batch.y``) are masked out of the loss.
    A batch in which every label is missing is skipped, because the
    mean-reduced loss over an empty selection is NaN and would poison the
    epoch average.

    Returns:
        The mean of the per-batch losses over the batches that contributed,
        or NaN when no batch contained a valid label.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        batch = batch.to(device)

        # True where a label is present.
        mask = ~torch.isnan(batch.y)
        if not mask.any():
            # Nothing to learn from; skip rather than record a NaN loss.
            continue

        optimizer.zero_grad()
        out = model(batch.x, batch.edge_index, batch.batch)
        loss = criterion(out[mask], batch.y[mask])

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        return float("nan")
    return total_loss / num_batches

# Evaluation
def test():
    """Score the model on ``test_loader`` with ROC-AUC.

    Entries whose label is NaN are dropped. The remaining labels and
    sigmoid probabilities from every batch and every task are pooled into
    two flat arrays, and a single ROC-AUC is computed over that pool (it is
    not a mean of per-task scores).

    Returns:
        The value returned by ``roc_auc_score`` for the pooled arrays.
    """
    model.eval()
    label_chunks = []
    prob_chunks = []

    with torch.no_grad():
        for graphs in test_loader:
            graphs = graphs.to(device)
            logits = model(graphs.x, graphs.edge_index, graphs.batch)

            # True where a label is present.
            observed = torch.isnan(graphs.y).logical_not()
            label_chunks.append(graphs.y[observed].cpu())
            prob_chunks.append(logits[observed].sigmoid().cpu())

    y_true = torch.cat(label_chunks).numpy()
    y_pred = torch.cat(prob_chunks).numpy()
    return roc_auc_score(y_true, y_pred)

# Train for a fixed number of epochs, evaluating on the test split after each one.
num_epochs = 20
for epoch in range(num_epochs):
    # The tuple is evaluated left to right, so training runs before evaluation.
    loss, auc = train(), test()
    print(f"Epoch {epoch+1}: Loss={loss:.4f}, Test ROC-AUC={auc:.4f}")

Epoch 1: Loss=0.2809, Test ROC-AUC=0.6904
Epoch 2: Loss=0.2503, Test ROC-AUC=0.7158
Epoch 3: Loss=0.2441, Test ROC-AUC=0.7323
Epoch 4: Loss=0.2407, Test ROC-AUC=0.7432
Epoch 5: Loss=0.2372, Test ROC-AUC=0.7481
Epoch 6: Loss=0.2360, Test ROC-AUC=0.7508
Epoch 7: Loss=0.2355, Test ROC-AUC=0.7544
Epoch 8: Loss=0.2344, Test ROC-AUC=0.7531
Epoch 9: Loss=0.2339, Test ROC-AUC=0.7563
Epoch 10: Loss=0.2339, Test ROC-AUC=0.7565
Epoch 11: Loss=0.2336, Test ROC-AUC=0.7559
Epoch 12: Loss=0.2325, Test ROC-AUC=0.7567
Epoch 13: Loss=0.2335, Test ROC-AUC=0.7559
Epoch 14: Loss=0.2331, Test ROC-AUC=0.7597
Epoch 15: Loss=0.2327, Test ROC-AUC=0.7611
Epoch 16: Loss=0.2323, Test ROC-AUC=0.7585
Epoch 17: Loss=0.2318, Test ROC-AUC=0.7602
Epoch 18: Loss=0.2319, Test ROC-AUC=0.7595
Epoch 19: Loss=0.2317, Test ROC-AUC=0.7594
Epoch 20: Loss=0.2317, Test ROC-AUC=0.7616
