# Installs

In [1]:
!pip install torch
import torch
print(torch.__version__)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
1.12.1+cu113


Change torch.__version__ of following installation command.




In [2]:
!pip install torch-geometric torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.12.1+cu113.html
Collecting torch-geometric
  Downloading torch_geometric-2.1.0.post1.tar.gz (467 kB)
[K     |████████████████████████████████| 467 kB 5.3 MB/s 
[?25hCollecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-1.12.0%2Bcu113/torch_scatter-2.0.9-cp37-cp37m-linux_x86_64.whl (7.9 MB)
[K     |████████████████████████████████| 7.9 MB 5.3 MB/s 
[?25hCollecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-1.12.0%2Bcu113/torch_sparse-0.6.15-cp37-cp37m-linux_x86_64.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 8.2 MB/s 
Building wheels for collected packages: torch-geometric
  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone
  Created wheel for torch-geometric: filename=torch_geometric-2.1.0.post1-py3-none-any.whl size=689857 sha256=93f98ea72d9f919575bd10ec51f30759991442a

# Dataset Setup

In [3]:
import os.path as osp

import torch

import torch_geometric.transforms as T
from torch_geometric.datasets import TUDataset
from torch_geometric.utils import degree

In [4]:
dataset = TUDataset(root='/tmp/imdb', name='IMDB-BINARY')

Downloading https://www.chrsmrrs.com/graphkerneldatasets/IMDB-BINARY.zip
Extracting /tmp/imdb/IMDB-BINARY/IMDB-BINARY.zip
Processing...
Done!


In [5]:
dataset

IMDB-BINARY(1000)

In [6]:
dataset[0]

Data(edge_index=[2, 146], y=[1], num_nodes=20)

## add code for using the node degree as input features.

In [7]:
# class NormalizedDegree(object):
#     def __init__(self, mean, std):
#         self.mean = mean
#         self.std = std

#     def __call__(self, data):
#         deg = degree(data.edge_index[0], dtype=torch.float)
#         deg = (deg - self.mean) / self.std
#         data.x = deg.view(-1, 1)
#         return data

In [8]:
if dataset.data.x is None:
        max_degree = 0
        degs = []
        for data in dataset:
            degs += [degree(data.edge_index[0], dtype=torch.long)]
            max_degree = max(max_degree, degs[-1].max().item())

        if max_degree < 1000:
            dataset.transform = T.OneHotDegree(max_degree)
        # else:
        #     print("HERE")
        #     deg = torch.cat(degs, dim=0).to(torch.float)
        #     mean, std = deg.mean().item(), deg.std().item()
        #     dataset.transform = NormalizedDegree(mean, std)

In [9]:
dataset[0]

Data(edge_index=[2, 146], y=[1], num_nodes=20, x=[20, 136])

## add train, val, test loader.

In [10]:
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader

In [11]:
dataset = dataset.shuffle()
dataset

IMDB-BINARY(1000)

In [12]:
train_perc = 0.8
val_perc = 0.1
test_perc = 0.1

In [13]:
training_dataset = dataset[:800]
val_test_dataset = dataset[800:]
val_dataset = val_test_dataset[:100]
test_dataset = val_test_dataset[100:]

In [14]:
train_loader = DataLoader(training_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

In [15]:
for batch in train_loader:
  print(batch)
  print(batch.num_graphs)

DataBatch(edge_index=[2, 5512], y=[32], num_nodes=564, x=[564, 136], batch=[564], ptr=[33])
32
DataBatch(edge_index=[2, 7370], y=[32], num_nodes=551, x=[551, 136], batch=[551], ptr=[33])
32
DataBatch(edge_index=[2, 4284], y=[32], num_nodes=542, x=[542, 136], batch=[542], ptr=[33])
32
DataBatch(edge_index=[2, 6296], y=[32], num_nodes=562, x=[562, 136], batch=[562], ptr=[33])
32
DataBatch(edge_index=[2, 7356], y=[32], num_nodes=701, x=[701, 136], batch=[701], ptr=[33])
32
DataBatch(edge_index=[2, 6364], y=[32], num_nodes=592, x=[592, 136], batch=[592], ptr=[33])
32
DataBatch(edge_index=[2, 6158], y=[32], num_nodes=636, x=[636, 136], batch=[636], ptr=[33])
32
DataBatch(edge_index=[2, 7328], y=[32], num_nodes=731, x=[731, 136], batch=[731], ptr=[33])
32
DataBatch(edge_index=[2, 6184], y=[32], num_nodes=694, x=[694, 136], batch=[694], ptr=[33])
32
DataBatch(edge_index=[2, 8788], y=[32], num_nodes=765, x=[765, 136], batch=[765], ptr=[33])
32
DataBatch(edge_index=[2, 7454], y=[32], num_nodes=

Model

In [16]:
import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, aggr
from torch.optim import Adam

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(dataset.num_node_features, 16)
        self.lin1 = Linear(16, dataset.num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        ### You can add more layers or alter the model structure. See geometric documents which layer or model you can use.
        x = self.conv1(x, edge_index)

        
        ### aggregate node embeddings into one representation
        mean_aggr = aggr.MeanAggregation()
        x = mean_aggr(x, data.batch)


        ### Pass aggregated representation to linear layer to make final prediction
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.lin1(x)
        return F.log_softmax(x, dim=1)

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

In [18]:
print(device)
print(model)
print(optimizer)

cpu
GCN(
  (conv1): GCNConv(136, 16)
  (lin1): Linear(in_features=16, out_features=2, bias=True)
)
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    lr: 0.01
    maximize: False
    weight_decay: 0.0005
)


Train function

In [19]:
def train():
    model.train()
    loss_all = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, data.y.view(-1))
        loss.backward()
        loss_all += loss.item() * data.num_graphs
        optimizer.step()
    return loss_all / len(training_dataset)

Validation function

In [20]:
def val(loader):
    model.eval()
    loss_all = 0
    for data in loader:
        data = data.to(device)
        output = model(data)
        loss = F.nll_loss(output, data.y.view(-1))
        loss_all += loss.item() * data.num_graphs
    return loss_all / len(training_dataset)

Test function

In [21]:
def test(loader):
    model.eval()
    correct = 0
    for data in loader:
        data = data.to(device)
        output = model(data)
        pred = output.max(dim=1)[1]
        correct += pred.eq(data.y).sum().item()
    return correct / len(loader.dataset)


Main code

In [22]:
model.train()

number_of_epochs = 300 # You can change.

lowest_val_loss = float('inf')
best_model = None
for epoch in range(number_of_epochs):
    train_loss = train()
    val_loss = val(val_loader)

    # Choose the lowest validation loss checkpoint (you can implement early stopping as well)
    if val_loss < lowest_val_loss:
      lowest_val_loss = val_loss
      break
      # torch.save(model, "./model/model.pt")

# model = torch.load("./model/model.pt")
# Load the lowest validation loss checkpoint and check the performance.
test_acc = test(test_loader)
print(test_acc)

0.75
