In [13]:
# Install required packages.
import os
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


In [14]:
data_dir = 'c:/Users/david/MT_data/extracted_patches/mutant_graphs/'

# Show an example Graph

In [15]:
path = os.path.join(data_dir, 'AAAH_GraphPatch.pkl')

In [16]:
from f_helper_functions import *

patch = load_object(path)
print(patch.edge_index.shape)
print(patch.edge_weight.shape)
print(patch.fitness)
print(patch.A.shape)
print(patch.mutant)
print()
print(patch)

(2, 6565)
(6565, 1)
0.0
(1073, 1073)
AAAH

            Number of Nodes: 1073
            Features: (1073, 16)
            Adjacency Matrix: (1073, 1073)
            Edge Weights (Geodesic Distances): (6565, 1)
            Edge Index: (2, 6565)
            Fitness: 0.0
            Coordinates of Points: (1073, 3)
            Mutant Name: AAAH


## Define customized Dataset

In [17]:
#from torch.utils.data import Dataset
from c_PatchDataset_regr import PatchDataset

batch_size = 8
dataset = PatchDataset(data_dir = data_dir)
len(dataset)

1500

In [18]:
print()
print(f'Dataset: {dataset}:')
print('====================')
print(f'Number of graphs: {len(dataset)}')


data = dataset[0]  # Get the first graph object.

print()
print(data)
print('=============================================================')

# Gather some statistics about the first graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of node features: {data.num_node_features}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Contains self-loops: {data.has_self_loops()}')


Dataset: PatchDataset(1500):
Number of graphs: 1500

Data(x=[1049, 16], edge_index=[2, 6435], edge_attr=[6435, 1], y=1.611610009, pos=[1049, 3])
Number of nodes: 1049
Number of node features: 16
Number of edges: 6435
Average node degree: 6.13
Contains self-loops: True


In [19]:
data.y

tensor(1.6116, dtype=torch.float64)

In [20]:
data.edge_attr

tensor([[0.2276],
        [0.1386],
        [0.1039],
        ...,
        [0.0660],
        [0.0647],
        [0.0574]], dtype=torch.float64)

In [21]:
data.edge_index

tensor([[   0,    0,    0,  ..., 1048, 1048, 1048],
        [   0,    4,    2,  ..., 1037, 1046, 1024]])

In [22]:
data.x

tensor([[-0.3169,  0.0202, -0.0707,  ..., -0.4254, -0.0092, -0.0602],
        [-0.2913,  0.0608, -0.1049,  ..., -0.3702,  0.0567, -0.1223],
        [-0.2959,  0.0752, -0.1396,  ..., -0.3760,  0.0288, -0.0965],
        ...,
        [-0.1887,  0.0367, -0.0865,  ..., -0.3686, -0.0074, -0.3037],
        [-0.1788,  0.0696, -0.1506,  ..., -0.3097,  0.0227, -0.2670],
        [-0.1877,  0.0759, -0.1713,  ..., -0.2963,  0.0164, -0.2393]])

In [23]:
data.pos

tensor([[-6.3507,  3.2255,  9.1402],
        [-5.6844,  2.0925,  9.0088],
        [-5.7997,  2.8937,  9.1833],
        ...,
        [11.6539,  3.1238,  2.1397],
        [11.5864,  3.7266,  2.1783],
        [11.5499,  4.1333,  2.0281]])

In [24]:
n_train = int(len(dataset)*0.8)
n_test = len(dataset) - n_train 

trainset, testset = torch.utils.data.random_split(dataset, [n_train, n_test])

print(f'Number of training graphs: {len(trainset)}')
print(f'Number of test graphs: {len(testset)}')

Number of training graphs: 1200
Number of test graphs: 300


In [25]:
from torch_geometric.loader import DataLoader

trainloader = DataLoader(dataset = trainset, batch_size= batch_size, shuffle = True)
testloader = DataLoader(dataset = testset, batch_size= batch_size, shuffle = True)

## Training a Graph Neural Network (GNN)

Training a GNN for graph classification usually follows a simple recipe:

1. Embed each node by performing multiple rounds of message passing
2. Aggregate node embeddings into a unified graph embedding (**readout layer**)
3. Train a final classifier on the graph embedding

There exists multiple **readout layers** in literature, but the most common one is to simply take the average of node embeddings:

$$
\mathbf{x}_{\mathcal{G}} = \frac{1}{|\mathcal{V}|} \sum_{v \in \mathcal{V}} \mathcal{x}^{(L)}_v
$$

PyTorch Geometric provides this functionality via [`torch_geometric.nn.global_mean_pool`](https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html#torch_geometric.nn.glob.global_mean_pool), which takes in the node embeddings of all nodes in the mini-batch and the assignment vector `batch` to compute a graph embedding of size `[batch_size, hidden_channels]` for each graph in the batch.

The final architecture for applying GNNs to the task of graph classification then looks as follows and allows for complete end-to-end training:

In [26]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_max_pool


class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(16, hidden_channels, add_self_loops=False)                   # 16-->64 node features
        self.conv2 = GCNConv(hidden_channels, hidden_channels, add_self_loops=False)      # 64-->64 node features
        self.conv3 = GCNConv(hidden_channels, hidden_channels, add_self_loops=False)      # 64-->64 node features
        self.lin = Linear(hidden_channels, 1)                                             # 64-->1 node features fully connected layer

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        # Take the sum over all nodes in each graph = 16 values per graph
        x = global_max_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = self.lin(x)
        
        return x

model = GCN(hidden_channels=64)
print(model)

GCN(
  (conv1): GCNConv(16, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=1, bias=True)
)


Here, we again make use of the [`GCNConv`](https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html#torch_geometric.nn.conv.GCNConv) with $\mathrm{ReLU}(x) = \max(x, 0)$ activation for obtaining localized node embeddings, before we apply our final classifier on top of a graph readout layer.

Let's train our network for a few epochs to see how well it performs on the training as well as test set:

In [27]:
num_epochs = 100
learning_rate = 0.001

criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)#, momentum=0.9)

iters, losses = [], []

for epoch in range(num_epochs):  # loop over the dataset multiple times
    running_loss = 0
    for data in trainloader:  # Iterate in batches over the training dataset.
        out = model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
        loss = criterion(out, data.y)  # Compute the loss.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.
        optimizer.zero_grad()  # Clear gradients.
        running_loss += loss

    epoch_loss = running_loss/batch_size
    losses.append(epoch_loss) # compute *average* loss
    iters.append(epoch+1)
    print(f'epoch {epoch + 1}, loss {epoch_loss:f}')

print('Finished Training')

# plotting
#plt.title("Training Curve (batch_size={}, lr={})".format(batch_size, learning_rate)) 
#plt.plot(iters, losses, label="Train")
#plt.xlabel("Iterations")
#plt.ylabel("Loss")
#plt.show()

  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: Found dtype Double but expected Float