# Mutag: GCN graph classification on the Mutagenicity dataset

In [1]:
import pickle

import torch
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader

from GNN import GCN_Mutag

# Seed PyTorch's global RNG so later random operations (e.g. the shuffled
# training loader) are repeatable from a fresh kernel.
torch.manual_seed(seed=12345)

<torch._C.Generator at 0x7f403701bf90>

## Data

In [2]:
# Load the Mutagenicity graph-classification dataset through PyG's TUDataset
# wrapper, rooted at the shared data directory.
dataset = TUDataset(root='../../../data/', name='Mutagenicity')

# Dataset-level summary.
print(
    '',
    f'Dataset: {dataset}:',
    '====================',
    f'Number of graphs: {len(dataset)}',
    f'Number of features: {dataset.num_features}',
    f'Number of classes: {dataset.num_classes}',
    sep='\n',
)

data = dataset[0]  # Get the first graph object.

print(
    '',
    data,
    '=============================================================',
    sep='\n',
)

# Gather some statistics about the first graph.
print(
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.num_edges}',
    f'Average node degree: {data.num_edges / data.num_nodes:.2f}',
    f'Has isolated nodes: {data.has_isolated_nodes()}',
    f'Has self-loops: {data.has_self_loops()}',
    f'Is undirected: {data.is_undirected()}',
    sep='\n',
)


Dataset: Mutagenicity(2301):
Number of graphs: 2301
Number of features: 10
Number of classes: 2

Data(edge_index=[2, 154], x=[72, 10], y=[1])
Number of nodes: 72
Number of edges: 154
Average node degree: 2.14
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [3]:
# Read the stored split definition (keys: idx_train / idx_val / idx_test) that
# lives next to the processed dataset files.
# NOTE(review): unpickling executes code embedded in the file, so only load an
# index.pkl that comes from a trusted source.
with open("../../../data/Mutagenicity/processed/index.pkl", mode="rb") as file:
    index = pickle.loads(file.read())
print(index.keys())

dict_keys(['idx_train', 'idx_val', 'idx_test'])


In [4]:
# Build the three splits by indexing the dataset with the stored index
# collections.
# NOTE(review): the recorded run reports 1150 + 282 + 1132 = 2564 graphs across
# the splits but only 2301 graphs in the dataset, so the index sets appear to
# overlap — verify that no test graphs are also used for training.
train_dataset, val_dataset, test_dataset = (
    dataset[index[key]] for key in ('idx_train', 'idx_val', 'idx_test')
)

print(
    f'Number of training graphs: {len(train_dataset)}',
    f'Number of val graphs: {len(val_dataset)}',
    f'Number of test graphs: {len(test_dataset)}',
    sep='\n',
)

Number of training graphs: 1150
Number of val graphs: 282
Number of test graphs: 1132


In [5]:
# Mini-batch loaders of 64 graphs each; only the training split is shuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=64)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=64)

# Preview every training batch once to show how graphs are collated.
for step, data in enumerate(train_loader):
    print(
        f'Step {step + 1}:',
        '=======',
        f'Number of graphs in the current batch: {data.num_graphs}',
        data,
        '',
        sep='\n',
    )

Step 1:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 4094], x=[1985, 10], y=[64], batch=[1985], ptr=[65])

Step 2:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 3868], x=[1877, 10], y=[64], batch=[1877], ptr=[65])

Step 3:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 4240], x=[2064, 10], y=[64], batch=[2064], ptr=[65])

Step 4:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 3866], x=[1893, 10], y=[64], batch=[1893], ptr=[65])

Step 5:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 4208], x=[2058, 10], y=[64], batch=[2058], ptr=[65])

Step 6:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 4084], x=[1975, 10], y=[64], batch=[1975], ptr=[65])

Step 7:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 4084], x=[2014, 10], y=[64], batch=[2014], ptr=[65])

Step 8:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 4232], x=[2064, 10], y

## Model

In [6]:
# Instantiate the classifier: input width follows the dataset's node-feature
# dimension, hidden width is fixed at 128.
model = GCN_Mutag(h_features=128, in_features=dataset.num_node_features)
print(model)

GCN_Mutag(
  (conv1): GraphConvolution (10 -> 128)
  (conv2): GraphConvolution (128 -> 128)
  (conv3): GraphConvolution (128 -> 128)
  (dense1): Linear(in_features=128, out_features=16, bias=True)
  (dense2): Linear(in_features=16, out_features=8, bias=True)
  (dense3): Linear(in_features=8, out_features=1, bias=True)
)


## Train

In [7]:
# Binary cross-entropy on the model's scalar output.
# NOTE(review): BCELoss expects probabilities in [0, 1], so the model
# presumably ends in a sigmoid — confirm in the GNN module.
criterion = torch.nn.BCELoss()
# Adam over every model parameter with a fixed learning rate of 0.01.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

def train():
    """Run one optimisation epoch over ``train_loader``.

    Operates on the notebook-level ``model``, ``optimizer`` and ``criterion``.
    Returns nothing; the model parameters are updated in place.
    """
    model.train()
    # Start from clean gradients: if a previous run of this function was
    # interrupted between backward() and step(), stale gradients would
    # otherwise be accumulated into the first update.
    optimizer.zero_grad()
    for data in train_loader:  # Iterate in batches over the training dataset.
        # reshape(-1) keeps exactly one prediction per graph. A bare squeeze()
        # would collapse a single-graph batch to a 0-d tensor, which BCELoss
        # rejects because it no longer matches the shape of data.y.
        out = model(data.x, data.edge_index, data.batch).reshape(-1)
        loss = criterion(out, data.y.float())  # BCELoss needs float targets.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

def test(loader):
    """Return the classification accuracy of ``model`` on ``loader``.

    Args:
        loader: iterable of batches exposing ``x``, ``edge_index``, ``batch``
            and ``y``; it must also provide ``loader.dataset`` so the number
            of evaluated graphs is known.

    Returns:
        Fraction of graphs whose rounded prediction equals the label.
    """
    model.eval()
    correct = 0
    # Evaluation needs no gradients; disabling autograd avoids building a
    # graph for every batch and keeps memory use flat.
    with torch.no_grad():
        for data in loader:  # Iterate in batches over the training/test dataset.
            # One prediction per graph, also for a single-graph batch.
            out = model(data.x, data.edge_index, data.batch).reshape(-1)
            pred = out.round()  # Round the model output to a hard 0/1 label.
            correct += int((pred == data.y).sum())
    return correct / len(loader.dataset)

In [8]:
# NOTE: despite its name, `best_test_acc` tracks the best *validation*
# accuracy seen so far; the name is kept so other cells that read it keep
# working.
best_test_acc = 0.0
for epoch in range(1, 201):
    train()
    train_acc = test(train_loader)
    val_acc = test(val_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')
    # `>=` means a later epoch that ties the best score replaces the checkpoint.
    if val_acc >= best_test_acc:
        best_test_acc = val_acc
        # state_dict() returns references to the live parameter tensors, which
        # the optimizer keeps updating in place. Clone every tensor so the
        # checkpoint really is a frozen snapshot of this epoch rather than an
        # alias of whatever the model looks like after the last epoch.
        best_model_params = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        print("Checkpoint saved!")

Epoch: 001, Train Acc: 0.7913, Val Acc: 0.8652
Checkpoint saved!
Epoch: 002, Train Acc: 0.8452, Val Acc: 0.8617
Epoch: 003, Train Acc: 0.8739, Val Acc: 0.9255
Checkpoint saved!
Epoch: 004, Train Acc: 0.8852, Val Acc: 0.9255
Checkpoint saved!
Epoch: 005, Train Acc: 0.8939, Val Acc: 0.8723
Epoch: 006, Train Acc: 0.9296, Val Acc: 0.9291
Checkpoint saved!
Epoch: 007, Train Acc: 0.9470, Val Acc: 0.9362
Checkpoint saved!
Epoch: 008, Train Acc: 0.9513, Val Acc: 0.9504
Checkpoint saved!
Epoch: 009, Train Acc: 0.9409, Val Acc: 0.9397
Epoch: 010, Train Acc: 0.9504, Val Acc: 0.9610
Checkpoint saved!
Epoch: 011, Train Acc: 0.9609, Val Acc: 0.9362
Epoch: 012, Train Acc: 0.9565, Val Acc: 0.9610
Checkpoint saved!
Epoch: 013, Train Acc: 0.9626, Val Acc: 0.9397
Epoch: 014, Train Acc: 0.9704, Val Acc: 0.9397
Epoch: 015, Train Acc: 0.9739, Val Acc: 0.9397
Epoch: 016, Train Acc: 0.9748, Val Acc: 0.9574
Epoch: 017, Train Acc: 0.9400, Val Acc: 0.9610
Checkpoint saved!
Epoch: 018, Train Acc: 0.9739, Val Acc:

## Eval

In [9]:
# Prefer the weights stored on disk so the reported numbers are reproducible.
# On a fresh checkout that file does not exist yet (it is written by the
# "Save Weights" section), so fall back to the checkpoint the training loop
# left in `best_model_params` instead of failing.
try:
    best_model_params = torch.load("mutag_weights.pt")
except FileNotFoundError:
    print("mutag_weights.pt not found; using the in-memory checkpoint from training.")
model.load_state_dict(best_model_params)
model.eval()

GCN_Mutag(
  (conv1): GraphConvolution (10 -> 128)
  (conv2): GraphConvolution (128 -> 128)
  (conv3): GraphConvolution (128 -> 128)
  (dense1): Linear(in_features=128, out_features=16, bias=True)
  (dense2): Linear(in_features=16, out_features=8, bias=True)
  (dense3): Linear(in_features=8, out_features=1, bias=True)
)

In [10]:
# Report accuracy per split. The loaders are paired with their names
# explicitly: no eval() on a constructed string, and the loop variable no
# longer overwrites the notebook-level `dataset` (the TUDataset object).
for split_name, split_loader in (
    ('train', train_loader),
    ('val', val_loader),
    ('test', test_loader),
):
    acc = test(split_loader)
    print(f"{split_name} accuracy: {100 * acc:.2f} %")

train accuracy: 98.96 %
val accuracy: 96.45 %
test accuracy: 96.64 %


## Save Weights

In [43]:
# Persist the selected checkpoint next to the notebook.
# NOTE(review): the Eval section rebinds `best_model_params` from this same
# file when it loads it, so running this cell after Eval rewrites the loaded
# weights rather than freshly trained ones — confirm the intended order.
torch.save(obj=best_model_params, f="mutag_weights.pt")

## Rough work: validation label distribution

In [11]:
# Collect the label of every validation graph into one float tensor.
# The comprehension variable stays local, so the notebook-level `dataset`
# (the TUDataset object) is not overwritten. The explicit float dtype matches
# what the legacy torch.Tensor(list) constructor produced.
labels = torch.tensor(
    [graph.y.item() for graph in val_dataset],
    dtype=torch.float,
)

In [12]:
# Distinct validation labels and how many graphs carry each of them.
torch.unique(labels, return_counts=True)

(tensor([0., 1.]), tensor([ 38, 244]))