In [1]:
# !pip install torchnet
# !pip install torch-geometric
# !pip install pytorch_lightning

In [116]:
## Standard libraries
import os
import json
import time

from tqdm.notebook import tqdm
import torchnet as tnt
## PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim

import torch_geometric
import torch_geometric.nn as geom_nn
import torch_geometric.data as geom_data
from torch_geometric.utils import to_dense_adj
# PyTorch Lightning
import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint

# Path to the folder where the datasets are/should be downloaded (e.g. CIFAR10)
DATASET_PATH = "../data"
# Path to the folder where the pretrained models are saved
CHECKPOINT_PATH = "../saved_models/tutorial7"

# Setting the seed
pl.seed_everything(123)

# Ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.determinstic = True
torch.backends.cudnn.benchmark = False

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

Global seed set to 123


cpu


In [118]:
!pip install -q torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install -q torch-sparse -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install -q torch-geometric

In [120]:
gnn_layer_by_name = {
    "GCN": geom_nn.GCNConv,
    "GAT": geom_nn.GATConv,
    "GraphConv": geom_nn.GraphConv
}

In [121]:
# from google.colab import drive
# drive.mount('/content/drive')

In [122]:
# !git clone https://github.com/Clement-nshimiyimana/gcn.git

In [123]:

Pubmed = torch_geometric.datasets.Planetoid(root='/', name='Pubmed')
print(f'Number of graphs: {len(Pubmed)}')
print(f'Number of features: {Pubmed.num_features}')
print(f'Number of classes: {Pubmed.num_classes}')
print()

data = Pubmed[0]
print(data)
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Number of training nodes: {data.train_mask.sum()}')
print(f'Number of validation nodes: {data.val_mask.sum()}')
print(f'Number of test nodes: {data.test_mask.sum()}')
print()

print('Train mask:')
print(data.train_mask)
print('Edge index:')
print(data.edge_index)
print('Corresponding adjacency matrix (using to_dense_adj):')
print(to_dense_adj(data.edge_index))
print()

# Some more utility functions on a Data object 
print(f'Contains isolated nodes: {data.contains_isolated_nodes()}')
print(f'Contains self-loops: {data.contains_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

Number of graphs: 1
Number of features: 500
Number of classes: 3

Data(edge_index=[2, 88648], test_mask=[19717], train_mask=[19717], val_mask=[19717], x=[19717, 500], y=[19717])
Number of nodes: 19717
Number of edges: 88648
Average node degree: 4.50
Number of training nodes: 60
Number of validation nodes: 500
Number of test nodes: 1000

Train mask:
tensor([ True,  True,  True,  ..., False, False, False])
Edge index:
tensor([[    0,     0,     0,  ..., 19714, 19715, 19716],
        [ 1378,  1544,  6092,  ..., 12278,  4284, 16030]])
Corresponding adjacency matrix (using to_dense_adj):
tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]])

Contains isolated nodes: False
Contains self-loops: False
Is undirected: True


In [124]:
# seed = 123
# np.random.seed(seed)

In [125]:
import torch_geometric
from torch_geometric.utils import to_dense_adj

# Fetch the Dataset object
dataset = torch_geometric.datasets.Planetoid(root='/', name='Cora')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')
print()

# One graph is one Data object, with the following attributes:
#   edge_index:     the adjacency list of shape [2, num_edges] (COO format)
#   x:              the feature matrix of shape [num_nodes, num_features]
#   y:              node labels of shape [num_nodes]
#   train_mask:     a boolean mask of shape [num_nodes], indicating the train set
#                   (similarly for val_mask and test_mask)
data = dataset[0]
print(data)
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Number of training nodes: {data.train_mask.sum()}')
print(f'Number of validation nodes: {data.val_mask.sum()}')
print(f'Number of test nodes: {data.test_mask.sum()}')
print()

print('Train mask:')
print(data.train_mask)
print('Edge index:')
print(data.edge_index)
print('Corresponding adjacency matrix (using to_dense_adj):')
print(to_dense_adj(data.edge_index))
print()

# Some more utility functions on a Data object 
print(f'Contains isolated nodes: {data.contains_isolated_nodes()}')
print(f'Contains self-loops: {data.contains_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

Number of graphs: 1
Number of features: 1433
Number of classes: 7

Data(edge_index=[2, 10556], test_mask=[2708], train_mask=[2708], val_mask=[2708], x=[2708, 1433], y=[2708])
Number of nodes: 2708
Number of edges: 10556
Average node degree: 3.90
Number of training nodes: 140
Number of validation nodes: 500
Number of test nodes: 1000

Train mask:
tensor([ True,  True,  True,  ..., False, False, False])
Edge index:
tensor([[   0,    0,    0,  ..., 2707, 2707, 2707],
        [ 633, 1862, 2582,  ...,  598, 1473, 2706]])
Corresponding adjacency matrix (using to_dense_adj):
tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 1.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 1., 0.]]])

Contains isolated nodes: False
Contains self-loops: False
Is undirected: True


In [126]:
cit = torch_geometric.datasets.Planetoid(root='/', name='Citeseer')
data_cit = cit[0]
print(f'Number of graphs: {len(cit)}')
print(f'Number of features: {cit.num_features}')
print(f'Number of classes: {cit.num_classes}')
print()

print(data_cit)
print(f'Number of nodes: {data_cit.num_nodes}')
print(f'Number of edges: {data_cit.num_edges}')
print(f'Average node degree: {data_cit.num_edges / data.num_nodes:.2f}')
print(f'Number of training nodes: {data_cit.train_mask.sum()}')
print(f'Number of validation nodes: {data_cit.val_mask.sum()}')
print(f'Number of test nodes: {data_cit.test_mask.sum()}')
print()

print('Train mask:')
print(data_cit.train_mask)
print('Edge index:')
print(data_cit.edge_index)
print('Corresponding adjacency matrix (using to_dense_adj):')
print(to_dense_adj(data_cit.edge_index))
print()

# Some more utility functions on a Data object 
print(f'Contains isolated nodes: {data_cit.contains_isolated_nodes()}')
print(f'Contains self-loops: {data_cit.contains_self_loops()}')
print(f'Is undirected: {data_cit.is_undirected()}')

Number of graphs: 1
Number of features: 3703
Number of classes: 6

Data(edge_index=[2, 9104], test_mask=[3327], train_mask=[3327], val_mask=[3327], x=[3327, 3703], y=[3327])
Number of nodes: 3327
Number of edges: 9104
Average node degree: 3.36
Number of training nodes: 120
Number of validation nodes: 500
Number of test nodes: 1000

Train mask:
tensor([ True,  True,  True,  ..., False, False, False])
Edge index:
tensor([[   0,    1,    1,  ..., 3324, 3325, 3326],
        [ 628,  158,  486,  ..., 2820, 1643,   33]])
Corresponding adjacency matrix (using to_dense_adj):
tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]])

Contains isolated nodes: True
Contains self-loops: False
Is undirected: True


In [128]:
class GNNModel(nn.Module):
    
    def __init__(self, c_in, c_hidden, c_out, num_layers=2, layer_name="GCN", dp_rate=0.5, **kwargs):
        """
        Inputs:
            c_in - Dimension of input features
            c_hidden - Dimension of hidden features
            c_out - Dimension of the output features. Usually number of classes in classification
            num_layers - Number of "hidden" graph layers
            layer_name - String of the graph layer to use
            dp_rate - Dropout rate to apply throughout the network
            kwargs - Additional arguments for the graph layer (e.g. number of heads for GAT)
        """
        super().__init__()
        gnn_layer = gnn_layer_by_name[layer_name]
        
        layers = []
        in_channels, out_channels = c_in, c_hidden
        for l_idx in range(num_layers-1):
            layers += [
                gnn_layer(in_channels=in_channels, 
                          out_channels=out_channels,
                          **kwargs),
                nn.ReLU(inplace=True),
                nn.Dropout(dp_rate)
            ]
            in_channels = c_hidden
        layers += [gnn_layer(in_channels=in_channels, 
                             out_channels=c_out,
                             **kwargs)]
        self.layers = nn.ModuleList(layers)
    
    def forward(self, x, edge_index):
        """
        Inputs:
            x - Input features per node
            edge_index - List of vertex index pairs representing the edges in the graph (PyTorch geometric notation)
        """
        for l in self.layers:
            # For graph layers, we need to add the "edge_index" tensor as additional input
            # All PyTorch Geometric graph layer inherit the class "MessagePassing", hence
            # we can simply check the class type.
            if isinstance(l, geom_nn.MessagePassing):
                x = l(x, edge_index)
            else:
                x = l(x)
        return x

In [129]:
class MLPModel(nn.Module):
    
    def __init__(self, c_in, c_hidden, c_out, num_layers=2, dp_rate=0.5):
        """
        Inputs:
            c_in - Dimension of input features
            c_hidden - Dimension of hidden features
            c_out - Dimension of the output features. Usually number of classes in classification
            num_layers - Number of hidden layers
            dp_rate - Dropout rate to apply throughout the network
        """
        super().__init__()
        layers = []
        in_channels, out_channels = c_in, c_hidden
        for l_idx in range(num_layers-1):
            layers += [
                nn.Linear(in_channels, out_channels),
                nn.ReLU(inplace=True),
                nn.Dropout(dp_rate)
            ]
            in_channels = c_hidden
        layers += [nn.Linear(in_channels, c_out)]
        self.layers = nn.Sequential(*layers)
    
    def forward(self, x, *args, **kwargs):
        """
        Inputs:
            x - Input features per node
        """
        return self.layers(x)

In [130]:
class NodeLevelGNN(pl.LightningModule):
    
    def __init__(self, model_name, **model_kwargs):
        super().__init__()
        # Saving hyperparameters
        self.save_hyperparameters()
        
        if model_name == "MLP":
            self.model = MLPModel(**model_kwargs)
        else:
            self.model = GNNModel(**model_kwargs)
        self.loss_module = nn.CrossEntropyLoss()
        
    
    def forward(self, data, mode="train"):
        x, edge_index = data.x, data.edge_index
        x = self.model(x, edge_index)
        
        # Only calculate the loss on the nodes corresponding to the mask
        if mode == "train":
            mask = data.train_mask
        elif mode == "val":
            mask = data.val_mask
        elif mode == "test":
            mask = data.test_mask
        else:
            assert False, "Unknown forward mode: %s" % mode
        
        loss = self.loss_module(x[mask], data.y[mask])
        acc = (x[mask].argmax(dim=-1) == data.y[mask]).sum().float() / mask.sum()
        return loss, acc
        
        
    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=0.01, weight_decay=5e-4)
        return optimizer
        
        
    def training_step(self, batch, batch_idx):
        loss, acc = self.forward(batch, mode="train")
        self.log('train_loss', loss)
        self.log('train_acc', acc)
        return loss
        
        
    def validation_step(self, batch, batch_idx):
        _, acc = self.forward(batch, mode="val")
        self.log('val_acc', acc)
        
        
    def test_step(self, batch, batch_idx):
        _, acc = self.forward(batch, mode="test")
        self.log('test_acc', acc)

In [131]:
def train_node_classifier(model_name, dataset, **model_kwargs):
    pl.seed_everything(123)
    node_data_loader = geom_data.DataLoader(dataset, batch_size=1)
    
    # Create a PyTorch Lightning trainer with the generation callback
    root_dir = os.path.join(CHECKPOINT_PATH, "NodeLevel" + model_name)
    os.makedirs(root_dir, exist_ok=True)
    trainer = pl.Trainer(default_root_dir=root_dir,
                         callbacks=[ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc")],
                         gpus=1 if str(device).startswith("cuda") else 0,
                         max_epochs=200,
                         progress_bar_refresh_rate=0) # 0 because epoch size is 1
    trainer.logger._default_hp_metric = None # Optional logging argument that we don't need

    # Check whether pretrained model exists. If yes, load it and skip training
    pretrained_filename = os.path.join(CHECKPOINT_PATH, "NodeLevel%s.ckpt" % model_name)
    if os.path.isfile(pretrained_filename):
        print("Found pretrained model, loading...")
        model = NodeLevelGNN.load_from_checkpoint(pretrained_filename)
    else:
        pl.seed_everything()
        model = NodeLevelGNN(model_name=model_name, c_in=dataset.num_node_features, c_out=dataset.num_classes, **model_kwargs)
        trainer.fit(model, node_data_loader, node_data_loader)
        model = NodeLevelGNN.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
    
    # Test best model on the test set
    test_result = trainer.test(model, test_dataloaders=node_data_loader, verbose=False)
    batch = next(iter(node_data_loader))
    batch = batch.to(model.device)
    train_err, train_acc = model.forward(batch, mode="train")
    _, val_acc = model.forward(batch, mode="val")
    result = {"err": train_err,
              "train": train_acc,
              "val": val_acc,
              "test": test_result[0]['test_acc']}
    return model, result

In [132]:
# Function for printing the test scores
def print_results(result_dict):
    if "train" in result_dict:
        print("Loss: ",(result_dict['err']).item())
        print("Train accuracy: %4.2f%%" % (100.0*result_dict["train"]))
    if "val" in result_dict:
        print("Val accuracy:   %4.2f%%" % (100.0*result_dict["val"]))
    print("Test accuracy:  %4.2f%%" % (100.0*result_dict["test"]))

Cora training

In [133]:
node_mlp_model, node_mlp_result = train_node_classifier(model_name="MLP",
                                                        dataset=dataset,
                                                        c_hidden=16,
                                                        num_layers=2,
                                                        dp_rate=0.5)

print_results(node_mlp_result)

Global seed set to 123
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
Global seed set to 123

  | Name        | Type             | Params
-------------------------------------------------
0 | model       | MLPModel         | 23.1 K
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
23.1 K    Trainable params
0         Non-trainable params
23.1 K    Total params
0.092     Total estimated model params size (MB)
Global seed set to 123


Loss:  0.15342400968074799
Train accuracy: 94.29%
Val accuracy:   41.60%
Test accuracy:  52.10%


In [134]:
node_gnn_model, node_gnn_result = train_node_classifier(model_name="GNN",
                                                        layer_name="GCN",
                                                        dataset=dataset, 
                                                        c_hidden=16, 
                                                        num_layers=2,
                                                        dp_rate=0.5)
print_results(node_gnn_result)

Global seed set to 123
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
Global seed set to 123

  | Name        | Type             | Params
-------------------------------------------------
0 | model       | GNNModel         | 23.1 K
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
23.1 K    Trainable params
0         Non-trainable params
23.1 K    Total params
0.092     Total estimated model params size (MB)
Global seed set to 123


Loss:  0.05266476050019264
Train accuracy: 99.29%
Val accuracy:   71.00%
Test accuracy:  80.80%


Training on Citeseer 

In [135]:
node_mlp_model, node_mlp_result = train_node_classifier(model_name="MLP",
                                                        dataset=cit,
                                                        c_hidden=16,
                                                        num_layers=2,
                                                        dp_rate=0.5)

print_results(node_mlp_result)

Global seed set to 123
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
Global seed set to 123

  | Name        | Type             | Params
-------------------------------------------------
0 | model       | MLPModel         | 59.4 K
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
59.4 K    Trainable params
0         Non-trainable params
59.4 K    Total params
0.237     Total estimated model params size (MB)
Global seed set to 123


Loss:  0.1774747222661972
Train accuracy: 90.83%
Val accuracy:   42.00%
Test accuracy:  51.30%


In [136]:
node_gnn_model, node_gnn_result = train_node_classifier(model_name="GNN",
                                                        layer_name="GCN",
                                                        dataset=cit, 
                                                        c_hidden=16, 
                                                        num_layers=2,
                                                        dp_rate=0.5)
print_results(node_gnn_result)

Global seed set to 123
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
Global seed set to 123

  | Name        | Type             | Params
-------------------------------------------------
0 | model       | GNNModel         | 59.4 K
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
59.4 K    Trainable params
0         Non-trainable params
59.4 K    Total params
0.237     Total estimated model params size (MB)
Global seed set to 123


Loss:  0.04643867909908295
Train accuracy: 98.33%
Val accuracy:   64.20%
Test accuracy:  68.10%


Training on Pubmed

In [137]:
node_mlp_model, node_mlp_result = train_node_classifier(model_name="MLP",
                                                        dataset=Pubmed,
                                                        c_hidden=16,
                                                        num_layers=2,
                                                        dp_rate=0.5)

print_results(node_mlp_result)

Global seed set to 123
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
Global seed set to 123

  | Name        | Type             | Params
-------------------------------------------------
0 | model       | MLPModel         | 8.1 K 
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
8.1 K     Trainable params
0         Non-trainable params
8.1 K     Total params
0.032     Total estimated model params size (MB)
Global seed set to 123


Loss:  0.09004615247249603
Train accuracy: 100.00%
Val accuracy:   63.60%
Test accuracy:  71.80%


In [138]:
node_gnn_model, node_gnn_result = train_node_classifier(model_name="GNN",
                                                        layer_name="GCN",
                                                        dataset=Pubmed, 
                                                        c_hidden=16, 
                                                        num_layers=2,
                                                        dp_rate=0.5)
print_results(node_gnn_result)

Global seed set to 123
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
Global seed set to 123

  | Name        | Type             | Params
-------------------------------------------------
0 | model       | GNNModel         | 8.1 K 
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
8.1 K     Trainable params
0         Non-trainable params
8.1 K     Total params
0.032     Total estimated model params size (MB)
Global seed set to 123


Loss:  0.11121226847171783
Train accuracy: 100.00%
Val accuracy:   76.40%
Test accuracy:  78.70%
