In [179]:
import torch
torchversion = torch.__version__

In [180]:
torchversion

'2.0.0+cu118'

In [181]:
# Install PyTorch Scatter, PyTorch Sparse, and PyTorch Geometric
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-{torchversion}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-{torchversion}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

# Visualization
import networkx as nx
import matplotlib.pyplot as plt

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


##Paper Review Code: HOW POWERFUL ARE GRAPH NEURAL NETWORKS?

In [182]:
#@title Load Dataset - ENZMES
from torch_geometric.datasets import TUDataset, GNNBenchmarkDataset, Planetoid

dataset = TUDataset(root='.', name='PROTEINS').shuffle()             # You may also use paper's dataset, such as PROTEINS ENZYMES for test
# dataset = GNNBenchmarkDataset(root='.', name='MNIST').shuffle()   # The large dataset MINIST 100 epoches take over 8 hours to train

# Print information about the dataset
print(f'Dataset: {dataset}')
print('-------------------')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of nodes: {dataset[0].x.shape[0]}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

Dataset: PROTEINS(1113)
-------------------
Number of graphs: 1113
Number of nodes: 36
Number of features: 3
Number of classes: 2


In [183]:
#@title Data split
from torch_geometric.loader import DataLoader

# Create training, validation, and test sets
train_dataset = dataset[:int(len(dataset)*0.8)]
val_dataset   = dataset[int(len(dataset)*0.8):int(len(dataset)*0.9)]
test_dataset  = dataset[int(len(dataset)*0.9):]

print(f'Training set   = {len(train_dataset)} graphs')
print(f'Validation set = {len(val_dataset)} graphs')
print(f'Test set       = {len(test_dataset)} graphs')

# Create mini-batches
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

print('\nTrain loader:')
for i, subgraph in enumerate(train_loader):
    print(f' - Subgraph {i}: {subgraph}')

print('\nValidation loader:')
for i, subgraph in enumerate(val_loader):
    print(f' - Subgraph {i}: {subgraph}')

print('\nTest loader:')
for i, subgraph in enumerate(test_loader):
    print(f' - Subgraph {i}: {subgraph}')

Training set   = 890 graphs
Validation set = 111 graphs
Test set       = 112 graphs

Train loader:
 - Subgraph 0: DataBatch(edge_index=[2, 3498], x=[1013, 3], y=[32], batch=[1013], ptr=[33])
 - Subgraph 1: DataBatch(edge_index=[2, 5784], x=[1613, 3], y=[32], batch=[1613], ptr=[33])
 - Subgraph 2: DataBatch(edge_index=[2, 5260], x=[1415, 3], y=[32], batch=[1415], ptr=[33])
 - Subgraph 3: DataBatch(edge_index=[2, 4954], x=[1274, 3], y=[32], batch=[1274], ptr=[33])
 - Subgraph 4: DataBatch(edge_index=[2, 3488], x=[948, 3], y=[32], batch=[948], ptr=[33])
 - Subgraph 5: DataBatch(edge_index=[2, 3980], x=[1068, 3], y=[32], batch=[1068], ptr=[33])
 - Subgraph 6: DataBatch(edge_index=[2, 2920], x=[792, 3], y=[32], batch=[792], ptr=[33])
 - Subgraph 7: DataBatch(edge_index=[2, 5434], x=[1421, 3], y=[32], batch=[1421], ptr=[33])
 - Subgraph 8: DataBatch(edge_index=[2, 5378], x=[1497, 3], y=[32], batch=[1497], ptr=[33])
 - Subgraph 9: DataBatch(edge_index=[2, 4210], x=[1091, 3], y=[32], batch=[10

In [184]:
#@title GNNs
from torch.nn import Linear, Sequential, BatchNorm1d, ReLU, Dropout, LeakyReLU
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GraphSAGE, GINConv, SAGEConv
from torch_geometric.nn import global_mean_pool, global_add_pool
from torch_scatter import scatter_add


class GCN(torch.nn.Module):
    def __init__(self, dim_h):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(dataset.num_node_features, dim_h)
        self.conv2 = GCNConv(dim_h, dim_h)
        self.lin = Linear(dim_h, dataset.num_classes)

    def forward(self, x, edge_index, batch): 
        h = self.conv1(x, edge_index)
        h = h.relu()
        h = self.conv2(h, edge_index)
        h = h.relu()

        hG = global_mean_pool(h, batch)

        h = F.dropout(hG, p=0.2, training=self.training)
        h = self.lin(h)
        
        return hG, F.log_softmax(h, dim=1)


class GraphSAGENet(torch.nn.Module):
    def __init__(self, dim_h):
        super(GraphSAGENet, self).__init__()
        self.conv1 = SAGEConv(dataset.num_node_features, dim_h, aggr='mean')
        self.conv2 = SAGEConv(dim_h, dim_h, aggr='mean')
        self.lin = Linear(dim_h, dataset.num_classes)

    def forward(self, x, edge_index, batch):
        h = self.conv1(x, edge_index)
        h = h.relu()
        h = self.conv2(h, edge_index)
        h = h.relu()

        hG = global_mean_pool(h, batch)

        h = F.dropout(hG, p=0.2, training=self.training)
        h = self.lin(h)
        
        return hG, torch.log(F.softmax(h, dim=1))
        

class GIN(torch.nn.Module):
    def __init__(self, dim_h, num_layers=5):
        super(GIN, self).__init__()
        self.num_layers = num_layers
        self.mlps = torch.nn.ModuleList()
        for i in range(num_layers):
            if i == 0:
                self.mlps.append(Sequential(Linear(dataset.num_node_features, dim_h),
                                            BatchNorm1d(dim_h), LeakyReLU(),
                                            Linear(dim_h, dim_h), LeakyReLU()))
            else:
                self.mlps.append(Sequential(Linear(dim_h, dim_h),
                                            BatchNorm1d(dim_h), LeakyReLU(),
                                            Linear(dim_h, dim_h), LeakyReLU()))
        self.lin1 = Linear(dim_h*num_layers, dim_h*num_layers)
        self.lin2 = Linear(dim_h*num_layers, dim_h*num_layers)
        self.lin3 = Linear(dim_h*num_layers, dataset.num_classes)

    def forward(self, x, edge_index, batch):

        h_list = []
        for i in range(self.num_layers):
            if i == 0:
                h_list.append(self.mlps[i](x))
            else:
                h_list.append(self.mlps[i](h_list[i-1]))

        h_list = [scatter_add(h, batch, dim=0) for h in h_list]
        
        h = torch.cat(h_list, dim=1)

        h = self.lin1(h)
        h = F.leaky_relu(h, negative_slope=0.1)
        h = F.dropout(h, p=0.2, training=self.training)
        h = self.lin2(h)
        h = F.leaky_relu(h, negative_slope=0.1)
        h = F.dropout(h, p=0.2, training=self.training)
        h = self.lin3(h)
        return h, torch.log(F.softmax(h, dim=1))

dim_h = 32
gcn = GCN(dim_h=dim_h)
graphsagenet = GraphSAGENet(dim_h=dim_h)
gin = GIN(dim_h=dim_h)

In [185]:
#@title Train models
def train(model, loader):
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(),
                                      lr=0.01,
                                      weight_decay=0.01)
    epochs = 100

    model.train()
    for epoch in range(epochs+1):
      total_loss = 0
      acc = 0
      val_loss = 0
      val_acc = 0

      # Train on batches
      for data in loader:
        optimizer.zero_grad()
        _, out = model(data.x, data.edge_index, data.batch)
        loss = criterion(out, data.y)
        total_loss += loss / len(loader)
        acc += accuracy(out.argmax(dim=1), data.y) / len(loader)
        loss.backward()
        optimizer.step()

        # Validation
        val_loss, val_acc = test(model, val_loader)

    # Print metrics every 10 epochs
    if(epoch % 10 == 0):
        print(f'Epoch {epoch:>3} | Train Loss: {total_loss:.2f} '
              f'| Train Acc: {acc*100:>5.2f}% '
              f'| Val Loss: {val_loss:.2f} '
              f'| Val Acc: {val_acc*100:.2f}%')
          
    test_loss, test_acc = test(model, test_loader)
    print(f'Test Loss: {test_loss:.2f} | Test Acc: {test_acc*100:.2f}%')
    
    return model

def test(model, loader):
    criterion = torch.nn.CrossEntropyLoss()
    model.eval()
    loss = 0
    acc = 0

    for data in loader:
      _, out = model(data.x, data.edge_index, data.batch)
      loss += criterion(out, data.y) / len(loader)
      acc += accuracy(out.argmax(dim=1), data.y) / len(loader)

    return loss, acc

def accuracy(pred_y, y):
    """Calculate accuracy."""
    return ((pred_y == y).sum() / len(y)).item()

gcn = train(gcn, train_loader)
graphsagenet = train(graphsagenet, train_loader)
gin = train(gin, train_loader)

Epoch 100 | Train Loss: 0.68 | Train Acc: 58.37% | Val Loss: 0.68 | Val Acc: 59.43%
Test Loss: 0.65 | Test Acc: 65.62%
Epoch 100 | Train Loss: 0.66 | Train Acc: 65.73% | Val Loss: 0.64 | Val Acc: 68.44%
Test Loss: 0.64 | Test Acc: 66.41%
Epoch 100 | Train Loss: 0.57 | Train Acc: 70.94% | Val Loss: 0.49 | Val Acc: 80.83%
Test Loss: 0.53 | Test Acc: 72.66%


In [190]:
#@title Test and compare - GIN outperformed GCN and GraphSAGE
gcn.eval()
graphsagenet.eval()
gin.eval()
acc_gcn = 0
acc_graphsage = 0
acc_gin = 0

for data in test_loader:
    # Get classifications
    _, out_gcn = gcn(data.x, data.edge_index, data.batch)
    _, out_graphsage = graphsagenet(data.x, data.edge_index, data.batch)
    _, out_gin = gin(data.x, data.edge_index, data.batch)

    # Calculate accuracy scores
    acc_gcn += accuracy(out_gcn.argmax(dim=1), data.y) / len(test_loader)
    acc_graphsage += accuracy(out_graphsage.argmax(dim=1), data.y) / len(test_loader)
    acc_gin += accuracy(out_gin.argmax(dim=1), data.y) / len(test_loader)

# Print results
print(f'GCN accuracy:     {acc_gcn*100:.2f}%')
print(f'GraphSAGE accuracy:     {acc_graphsage*100:.2f}%')
print(f'GIN accuracy:     {acc_gin*100:.2f}%')

GCN accuracy:     65.62%
GraphSAGE accuracy:     66.41%
GIN accuracy:     72.66%
