In [1]:
%matplotlib inline

# Lab 4: Graph Neural Networks (GNNs)

In [2]:
# Import packages
import dgl
import torch
import torch.nn.functional as F
import numpy as np
from dgl.dataloading import GraphDataLoader
from dgl.nn import GraphConv
from IPython.display import Latex
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


# Part I: Node Classification

In [3]:
# Import datasat
dataset = dgl.data.AmazonCoBuyPhotoDataset()
print('Number of classes:', dataset.num_classes)

# A DGL Dataset object may contain one or multiple graphs. The Amazon
# dataset used in this lab only consists of one single graph.
graph = dataset[0]
graph = dgl.add_self_loop(graph)

print('Number of nodes:', graph.num_nodes())
print('Number of edges:', graph.num_edges())

Downloading /home/vahan/.dgl/amazon_co_buy_photo.zip from https://data.dgl.ai/dataset/amazon_co_buy_photo.zip...
Extracting file to /home/vahan/.dgl/amazon_co_buy_photo
Number of classes: 8
Number of nodes: 7650
Number of edges: 245813


A DGL graph can store node features in a
dictionary-like attribute called ``ndata``.
In the DGL Amazon co-buy dataset, the graph contains the following node features:

- ``label``: The ground truth node category.

-  ``feat``: The node features.

In [4]:
print('Node labels and features')
print(graph.ndata)

Node labels and features
{'feat': tensor([[0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 1.],
        [1., 1., 0.,  ..., 1., 0., 1.]]), 'label': tensor([2, 2, 2,  ..., 3, 1, 1])}


## Exercise 1:

### 1.1: Retrieve key properties of the dataset

In [5]:
# Define key graph variables
X = graph.ndata['feat']
y = graph.ndata['label']
num_classes = dataset.num_classes
num_feat = X.shape[1]
N = graph.number_of_nodes()

print('Number of features: ', num_feat)

Number of features:  745


In [6]:
def split_dataset(N, train_ratio, seed=4):
    """ Creates train/val/test masks

    Args:
        N (int): dataset size
        train_ratio (float): proportion of the training set
        seed (int, optional): Fixes random. Defaults to 10

    Return: 
        [tensors]: returns boolean tensors for train/val/test set
        True indicates that a node belong to this set, False otherwise
    """

    train_size = int(train_ratio * N)
    val_size = int((N - train_size)/2)
    test_size = N - train_size - val_size

    # split dataset
    subsets = torch.utils.data.random_split(range(N), lengths = [train_size, val_size, test_size], generator=torch.Generator().manual_seed(seed))
    train_inds, val_inds, test_inds = [torch.Tensor(subset.indices) for subset in subsets]

    # create tensors of masks for each subset
    dataset_inds = torch.arange(N)
    train_mask = torch.isin(dataset_inds, train_inds)
    val_mask = torch.isin(dataset_inds, val_inds)
    test_mask = torch.isin(dataset_inds, test_inds)

    return train_mask, val_mask, test_mask

train_mask, val_mask, test_mask = split_dataset(N, train_ratio=0.8)


### 1.2 Implement a Graph Convolutional Network

$$H^{(l+1)} = f(H^{(l)}, A) = \sigma( \tilde{D}^{-\frac{1}{2}} \tilde{A} \tilde{D}^{-\frac{1}{2}}H^{(l)}W^{(l)})$$



<center><img src="./gcn_web.png"/></center>

In [7]:
class GNN_model(torch.nn.Module):
    """
    Define a Graph Convolution Network 
    """
    def __init__(self, num_layers, input_size, hidden_size, output_size, dropout):
        super(GNN_model, self).__init__()

        # Define GNN components
        self.convs = torch.nn.ModuleList() # holds GraphConv layers in a list
        self.convs.append(
            GraphConv(input_size, hidden_size, activation=F.relu)) # You can either define the activation at the layer level or call it inside the forward
        for i in range(num_layers-2):
            self.convs.append(
                GraphConv(hidden_size, hidden_size, activation=F.relu))
        self.convs.append(GraphConv(hidden_size, output_size))

        self.dropout = dropout

    def forward(self, graph, x):
        # Implement the forward function that takes the graph,
        # the features tensor x and returns the output tensor as shown in figure 1
        for conv in self.convs:
            x = conv(graph, x)
    
        output = F.log_softmax(x, dim=1) # Log_softmax is more stable numerically in comparison to softmax
        return output

### 1.3 Training

In [8]:
def train(model, graph, x, labels, num_epochs, optimizer, train_mask, val_mask, test_mask):
    """ Train the GNN model 

    Args:
        model: GNN model defined in pytorch
        graph (dgl.graph): dataset on which the task is performed
        x (tensor): node feature matrix 
        labels (tensor): node labels
        num_epochs (int): number of epochs
        optimizer: Adam optimizer
        train_mask (tensor): boolean mask for training nodes
        val_mask (tensor): boolean mask for validation set
    """
    
    # Train the model (pytorch specific)
    best_val_acc = 0
    best_test_acc = 0
    nll_loss = torch.nn.NLLLoss()

    model.train()
    for epoch in range(num_epochs):
        # Forward
        pred = model(graph, x)

        # Compute loss
        # Note that you should only compute the losses of the nodes in the training set.
        loss = nll_loss(pred[train_mask], labels[train_mask])

        # Compute accuracy on training/validation/test
        pred = torch.argmax(pred, dim=1)
        train_acc = (pred[train_mask] == labels[train_mask]).float().mean()
        val_acc = (pred[val_mask] == labels[val_mask]).float().mean()
        test_acc = (pred[test_mask] == labels[test_mask]).float().mean()

        # Save the best validation accuracy and the corresponding test accuracy.
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            best_test_acc = test_acc

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            print('Epoch {}: loss {:.3f}, train Acc: {:.3f}, val acc: {:.3f}, test acc: {:.3f}'.format(
                epoch, loss, train_acc, val_acc, test_acc))

In [9]:
# Instanciate model
num_layers=3
hidden_size=16
dropout=0.3
num_epochs=300
lr=0.01
weight_decay=0.005
train_ratio=0.8
seed=4

model = GNN_model(num_layers, num_feat, hidden_size, num_classes, dropout)

In [10]:
# Define an optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=weight_decay)

# Train model
train(model, graph, X, y, num_epochs, optimizer, train_mask, val_mask, test_mask)

Epoch 0: loss 2.203, train Acc: 0.121, val acc: 0.114, test acc: 0.119
Epoch 10: loss 1.810, train Acc: 0.423, val acc: 0.431, test acc: 0.455
Epoch 20: loss 1.505, train Acc: 0.428, val acc: 0.437, test acc: 0.456
Epoch 30: loss 1.134, train Acc: 0.632, val acc: 0.633, test acc: 0.665
Epoch 40: loss 0.761, train Acc: 0.808, val acc: 0.818, test acc: 0.829
Epoch 50: loss 0.530, train Acc: 0.853, val acc: 0.855, test acc: 0.859
Epoch 60: loss 0.438, train Acc: 0.894, val acc: 0.898, test acc: 0.902
Epoch 70: loss 0.393, train Acc: 0.907, val acc: 0.902, test acc: 0.903
Epoch 80: loss 0.356, train Acc: 0.911, val acc: 0.908, test acc: 0.911
Epoch 90: loss 0.344, train Acc: 0.913, val acc: 0.911, test acc: 0.915
Epoch 100: loss 0.324, train Acc: 0.918, val acc: 0.905, test acc: 0.907
Epoch 110: loss 0.310, train Acc: 0.923, val acc: 0.915, test acc: 0.914
Epoch 120: loss 0.314, train Acc: 0.917, val acc: 0.907, test acc: 0.919
Epoch 130: loss 0.297, train Acc: 0.927, val acc: 0.923, test 

# Part II: Graph Classification

## Exercise 3

### 3.1: Load dataset

In [11]:
dataset = dgl.data.TUDataset(name='ENZYMES')

# Add self loop to each graph
dataset.graph_lists = [dgl.add_self_loop(graph) for graph in dataset.graph_lists]

Downloading /home/vahan/.dgl/ENZYMES.zip from https://www.chrsmrrs.com/graphkerneldatasets/ENZYMES.zip...
Extracting file to /home/vahan/.dgl/ENZYMES


In [12]:
dataset[0]

(Graph(num_nodes=37, num_edges=205,
       ndata_schemes={'node_labels': Scheme(shape=(1,), dtype=torch.int64), 'node_attr': Scheme(shape=(18,), dtype=torch.float64), '_ID': Scheme(shape=(), dtype=torch.int64)}
       edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)}),
 tensor([5]))

In [13]:
print('Number of graph categories:', dataset.num_labels)
print('Dimension of nodes features', dataset[0][0].ndata['node_attr'].shape[1])

Number of graph categories: 6
Dimension of nodes features 18


In [14]:
# Split dataset into train, validation and test sets
train_sampler, val_sampler, test_sampler = dgl.data.utils.split_dataset(
        dataset, frac_list=[0.6, 0.2, 0.2], shuffle=True)

In [15]:
# batch graphs with GraphDataLoader
train_dataloader = GraphDataLoader(
        train_sampler, batch_size=5, drop_last=False)
val_dataloader = GraphDataLoader(
    val_sampler, batch_size=5, drop_last=False)
test_dataloader = GraphDataLoader(
    test_sampler, batch_size=5, drop_last=False)

### 2.1: Create GNN model for graph classification

In [16]:
class BasicGraphModel(torch.nn.Module):

    def __init__(self, n_layers, input_size, hidden_size, output_size):
        super(BasicGraphModel, self).__init__()

        # Define GNN components
        self.convs = torch.nn.ModuleList()
        self.convs.append(GraphConv(input_size, hidden_size))
        for i in range(n_layers-1):
            self.convs.append(GraphConv(hidden_size, hidden_size))
        self.linear = torch.nn.Linear(hidden_size, output_size)

    def forward(self, g, x):
        # Message Passing -- Learn node representations via GCN
        for conv in self.convs[:-1]:
            x = conv(g, x)
            x = F.elu(x)
        x = self.convs[-1](g, x)
        # Readout -- average all node representations to get graph embedding
        g.ndata['h'] = x
        x = dgl.mean_nodes(g, 'h')
        # Apply linear layer to classify graph representation
        x = self.linear(x)
        return x

### 2.1 Training and evaluation

In [17]:
def train(model, loss_fcn, optimizer, train_dataloader, val_dataloader, num_epochs):
    model = model.double()
    model.train()

    for epoch in range(num_epochs):
        losses = []
        for batch, batched_graph in enumerate(train_dataloader):
            batched_graph, labels = batched_graph
            logits = model(batched_graph, batched_graph.ndata['node_attr'].double())
            loss = loss_fcn(logits, labels.T[0])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        loss_data = np.mean(losses)

        if epoch % 5 == 0:
            print("Epoch {} | Loss: {:.4f}".format(epoch, loss_data))
            test(model, loss_fcn, val_dataloader)

In [18]:
def test(model, loss_fcn, dataloader):
    scores = []
    for batch, batched_graph in enumerate(dataloader):
        batched_graph, labels = batched_graph
        scores.append(
            evaluate(model, batched_graph, labels, loss_fcn))
    mean_scores = np.mean(scores)
    print("Accuracy score: {:.4f}".format(mean_scores))

In [19]:
def evaluate(model, batched_graph, labels, loss_fcn):
    model = model.double()
    model.eval()
    with torch.no_grad():
        output = model(batched_graph, batched_graph.ndata['node_attr'].double())

    labels = labels.T[0]
    loss = loss_fcn(output, labels)
    predict = output.argmax(dim=1)
    score = (labels == predict).sum().item() / len(labels)

    return score

In [30]:
list(train_dataloader)[0]

[Graph(num_nodes=174, num_edges=882,
       ndata_schemes={'node_labels': Scheme(shape=(1,), dtype=torch.int64), 'node_attr': Scheme(shape=(18,), dtype=torch.float64), '_ID': Scheme(shape=(), dtype=torch.int64)}
       edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)}),
 tensor([[3],
         [0],
         [0],
         [5],
         [3]])]

In [27]:
# Store features
n_features, n_classes = dataset[0][0].ndata['node_attr'].shape[1], \
    dataset.num_labels
hidden_size = 64

# Define model, loss function and optimizer
model = BasicGraphModel(n_layers=3, input_size=n_features,
                        hidden_size=hidden_size, output_size=n_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)
loss_fcn = torch.nn.CrossEntropyLoss()

# Train and test
train(model, loss_fcn, optimizer,
        train_dataloader, val_dataloader, num_epochs=150)
test(model, loss_fcn, test_dataloader)


Epoch 0 | Loss: 2.2191
Accuracy score: 0.2167
Epoch 5 | Loss: 1.6944
Accuracy score: 0.2750
Epoch 10 | Loss: 1.6418
Accuracy score: 0.2750
Epoch 15 | Loss: 1.6025
Accuracy score: 0.2750
Epoch 20 | Loss: 1.5811
Accuracy score: 0.3083
Epoch 25 | Loss: 1.5498
Accuracy score: 0.3333
Epoch 30 | Loss: 1.5137
Accuracy score: 0.3417
Epoch 35 | Loss: 1.4738
Accuracy score: 0.3833
Epoch 40 | Loss: 1.4339
Accuracy score: 0.4167
Epoch 45 | Loss: 1.3850
Accuracy score: 0.4667
Epoch 50 | Loss: 1.3271
Accuracy score: 0.4750
Epoch 55 | Loss: 1.2644
Accuracy score: 0.4833
Epoch 60 | Loss: 1.2146
Accuracy score: 0.4667
Epoch 65 | Loss: 1.1823
Accuracy score: 0.4833
Epoch 70 | Loss: 1.1199
Accuracy score: 0.5333
Epoch 75 | Loss: 1.0671
Accuracy score: 0.4833
Epoch 80 | Loss: 1.0408
Accuracy score: 0.4750
Epoch 85 | Loss: 0.9944
Accuracy score: 0.4583
Epoch 90 | Loss: 0.9759
Accuracy score: 0.4583
Epoch 95 | Loss: 0.9195
Accuracy score: 0.4000
Epoch 100 | Loss: 1.0335
Accuracy score: 0.4917
Epoch 105 | Lo