# Mutag

In [1]:
import pickle

import torch
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader

from GNN import GCN_Mutag

torch.manual_seed(12345)

<torch._C.Generator at 0x7f61b8045530>

## Data

In [2]:
dataset = TUDataset(
    root='../../../data/',
    name='Mutagenicity',
    
)

print()
print(f'Dataset: {dataset}:')
print('====================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

data = dataset[0]  # Get the first graph object.

print()
print(data)
print('=============================================================')

# Gather some statistics about the first graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')


Dataset: Mutagenicity(2301):
Number of graphs: 2301
Number of features: 10
Number of classes: 2

Data(edge_index=[2, 154], x=[72, 10], y=[1])
Number of nodes: 72
Number of edges: 154
Average node degree: 2.14
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [3]:
with open("../../../data/Mutagenicity/processed/index.pkl", "rb") as file:
    index = pickle.load(file)
print(index.keys())

dict_keys(['idx_train', 'idx_val', 'idx_test'])


In [4]:
train_dataset = dataset[index['idx_train']]
val_dataset = dataset[index['idx_val']]
test_dataset = dataset[index['idx_test']]

print(f'Number of training graphs: {len(train_dataset)}')
print(f'Number of val graphs: {len(val_dataset)}')
print(f'Number of test graphs: {len(test_dataset)}')

Number of training graphs: 1150
Number of val graphs: 282
Number of test graphs: 1132


In [25]:
graphs = [len(set(index)) for index in [index['idx_train'], index['idx_val'], index['idx_test']]]
[round(100 * i/sum(graphs), 2) for i in graphs]

[44.85, 11.0, 44.15]

In [5]:
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# for step, data in enumerate(train_loader):
#     print(f'Step {step + 1}:')
#     print('=======')
#     print(f'Number of graphs in the current batch: {data.num_graphs}')
#     print(data)
#     print()

## Model

In [6]:
model = GCN_Mutag(
    in_features=dataset.num_node_features,
    h_features=64,
)
print(model)

GCN_Mutag(
  (conv1): GraphConvolution (10 -> 64)
  (conv2): GraphConvolution (64 -> 64)
  (conv3): GraphConvolution (64 -> 64)
  (conv4): GraphConvolution (64 -> 64)
  (conv5): GraphConvolution (64 -> 64)
  (dense1): Linear(in_features=64, out_features=16, bias=True)
  (dense2): Linear(in_features=16, out_features=8, bias=True)
  (dense3): Linear(in_features=8, out_features=2, bias=True)
)


## Train

In [14]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.NLLLoss()

def train():
    model.train()
    for data in train_loader:  # Iterate in batches over the training dataset.
        out = model(data.x, data.edge_index, data.batch)
        loss = criterion(out, data.y.long())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

def test(loader):
    model.eval()
    correct = 0
    for data in loader:  # Iterate in batches over the training/test dataset.
        out = model(data.x, data.edge_index, data.batch)
        pred = torch.argmax(out, dim=-1)
        correct += int((pred == data.y.long()).sum())
    return correct / len(loader.dataset)

In [8]:
best_test_acc = 0.0
for epoch in range(1, 201):
    train()
    train_acc = test(train_loader)
    val_acc = test(val_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')
    if val_acc >= best_test_acc:
        best_test_acc = val_acc
        best_model_params = model.state_dict()
        print("Checkpoint saved!")

Epoch: 001, Train Acc: 0.7913, Val Acc: 0.8652
Checkpoint saved!
Epoch: 002, Train Acc: 0.7913, Val Acc: 0.8652
Checkpoint saved!
Epoch: 003, Train Acc: 0.8400, Val Acc: 0.9078
Checkpoint saved!
Epoch: 004, Train Acc: 0.8565, Val Acc: 0.8936
Epoch: 005, Train Acc: 0.9087, Val Acc: 0.9255
Checkpoint saved!
Epoch: 006, Train Acc: 0.9226, Val Acc: 0.9326
Checkpoint saved!
Epoch: 007, Train Acc: 0.8948, Val Acc: 0.8830
Epoch: 008, Train Acc: 0.9426, Val Acc: 0.9326
Checkpoint saved!
Epoch: 009, Train Acc: 0.9530, Val Acc: 0.9433
Checkpoint saved!
Epoch: 010, Train Acc: 0.8748, Val Acc: 0.9220
Epoch: 011, Train Acc: 0.9609, Val Acc: 0.9433
Checkpoint saved!
Epoch: 012, Train Acc: 0.9713, Val Acc: 0.9574
Checkpoint saved!
Epoch: 013, Train Acc: 0.9452, Val Acc: 0.9220
Epoch: 014, Train Acc: 0.9600, Val Acc: 0.9539
Epoch: 015, Train Acc: 0.9557, Val Acc: 0.9645
Checkpoint saved!
Epoch: 016, Train Acc: 0.9826, Val Acc: 0.9752
Checkpoint saved!
Epoch: 017, Train Acc: 0.9609, Val Acc: 0.9468
Epo

## Eval

In [12]:
best_model_params = torch.load("mutag_weights.pt")
model.load_state_dict(best_model_params)
model.eval()

GCN_Mutag(
  (conv1): GraphConvolution (10 -> 64)
  (conv2): GraphConvolution (64 -> 64)
  (conv3): GraphConvolution (64 -> 64)
  (conv4): GraphConvolution (64 -> 64)
  (conv5): GraphConvolution (64 -> 64)
  (dense1): Linear(in_features=64, out_features=16, bias=True)
  (dense2): Linear(in_features=16, out_features=8, bias=True)
  (dense3): Linear(in_features=8, out_features=2, bias=True)
)

In [19]:
len(train_loader.dataset)

1150

In [21]:
for dataset in ['train', 'val', 'test']:
    print(eval(f"len({dataset}_loader.dataset)"))
    acc = test(eval(f"{dataset}_loader"))
    print(f"{dataset} accuracy: {100 * acc:.2f} %")

1150
train accuracy: 100.00 %
282
val accuracy: 98.94 %
1132
test accuracy: 98.94 %


## Save Weights

In [14]:
torch.save(best_model_params, "mutag_weights.pt")

## Rough

In [16]:
torch.Tensor([int(data.y) for data in dataset]).unique(return_counts=True)

(tensor([0., 1.]), tensor([ 448, 1853]))

In [12]:
dataset

'test'