In [1]:
import os
# Directory for saving visualizations and model checkpoints.
# exist_ok=True makes the cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('training_outputs/', exist_ok=True)

In [2]:
!pip install torch==2.2.2
!pip install torch-cluster==1.6.3
!pip install torch-geometric==2.6.1
!pip install torch-scatter==2.1.2
!pip install torch-sparse==0.6.18
!pip install torch-spline-conv==1.2.2

!pip install ogb
!pip install numpy

Collecting torch==2.2.2
  Downloading torch-2.2.2-cp311-cp311-manylinux1_x86_64.whl.metadata (25 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.2)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.2)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.2)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.2.2)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.2.2)
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch==2.2.2)
  Downloading nvidia_cufft_cu12-11.0.2.54-py3

In [3]:

import torch
import torch.nn.functional as F
from torch.nn import Sequential, Linear, BatchNorm1d, ReLU

from torch_geometric.utils import negative_sampling
from torch_geometric.data import Data

import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, SAGEConv, GINConv, GATConv

from ogb.linkproppred import PygLinkPropPredDataset, Evaluator
# import custom dataset_pyg to avoid weights_only errors
# from dataset_pyg import PygLinkPropPredDataset
from torch_geometric.loader import DataLoader
from torch_geometric.data.data import DataEdgeAttr, GlobalStorage, DataTensorAttr

from torch_geometric.nn import Node2Vec

import pandas as pd
import shutil, os
import os.path as osp
import numpy as np

#from logger import Logger
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import random

In [4]:
# Run on the GPU whenever CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Device: {}'.format(device))

Device: cuda


In [5]:
# class using PyG's GATConv layer
class GAN(torch.nn.Module):
    ''' Two-layer graph attention encoder built from PyG GATConv layers.

    Args:
        in_channels: size of each input node feature vector.
        hidden_size: per-head output size of the first attention layer.
        out_channels: size of the node embeddings produced by the second layer.
        in_head: number of attention heads in the first layer (their outputs
            are concatenated, so the second layer sees hidden_size * in_head).
        num_layers: accepted for signature parity with other models but not
            used; the network always has exactly two GATConv layers.
        dropout: dropout probability, applied to the inputs of both layers and
            forwarded to each GATConv as its own dropout argument.
    '''
    def __init__(self, in_channels, hidden_size, out_channels, in_head,
                 num_layers, dropout):
        super().__init__()
        self.out_head = 1
        self.dropout = dropout
        # First layer: multi-head attention with concatenated head outputs.
        self.conv1 = GATConv(in_channels, hidden_size,
                             heads=in_head, dropout=self.dropout)
        # Second layer: heads are not concatenated (concat=False), so the
        # output width is exactly out_channels.
        self.conv2 = GATConv(hidden_size * in_head, out_channels,
                             heads=self.out_head, concat=False,
                             dropout=self.dropout)

    def reset_parameters(self):
        ''' Re-initialise the weights of both attention layers. '''
        for conv in (self.conv1, self.conv2):
            conv.reset_parameters()

    def forward(self, x, edge_index):
        ''' dropout -> conv1 -> relu -> dropout -> conv2; returns node embeddings. '''
        h = F.dropout(x, p=self.dropout, training=self.training)
        h = F.relu(self.conv1(h, edge_index))
        h = F.dropout(h, p=self.dropout, training=self.training)
        return self.conv2(h, edge_index)

In [6]:
# class in order to predict whether a link exists between two nodes using
# their embeddings, x_i and x_j
class LinkPredictor(torch.nn.Module):
    ''' MLP that scores a candidate link (interaction) between nodes i and j
    from their embeddings x_i and x_j.

    The two embeddings are combined with an element-wise product and pushed
    through a stack of Linear layers; the final activation is a sigmoid, so
    the output is a probability-like score in (0, 1).
    '''
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers,
                 dropout):
        super().__init__()

        # Layer widths: input -> hidden, (num_layers - 2) hidden -> hidden,
        # then hidden -> output. A non-positive repeat count yields no middle
        # layers, so there are always at least two Linear layers.
        widths = ([in_channels, hidden_channels]
                  + [hidden_channels] * (num_layers - 2)
                  + [out_channels])
        self.lins = torch.nn.ModuleList(
            [torch.nn.Linear(fan_in, fan_out)
             for fan_in, fan_out in zip(widths[:-1], widths[1:])])

        self.dropout = dropout

    def reset_parameters(self):
        ''' Re-initialise every Linear layer. '''
        for layer in self.lins:
            layer.reset_parameters()

    def forward(self, x_i, x_j):
        ''' Return the sigmoid score for each (x_i, x_j) row pair. '''
        *hidden_layers, output_layer = self.lins
        h = x_i * x_j  # hadamard product of the two endpoint embeddings
        for layer in hidden_layers:  # linear -> relu -> dropout
            h = F.dropout(F.relu(layer(h)), p=self.dropout,
                          training=self.training)
        # sigmoid maps the final logit to the probability that the edge exists
        return torch.sigmoid(output_layer(h))

In [7]:
def train(model, predictor, x, adj_t, split_edge, optimizer, batch_size):
    """Run one training epoch and return the mean loss per positive edge.

    Args:
        model: GNN encoder, called as ``model(x, adj_t)`` to get node embeddings.
        predictor: link scorer, called on pairs of node embeddings.
        x: node feature tensor; its gradient is clipped along with the
            model and predictor parameters.
        adj_t: sparse adjacency exposing ``.coo()``.
        split_edge: dict whose ``['train']['edge']`` holds the positive edges.
        optimizer: optimizer stepped once per mini-batch.
        batch_size: number of positive edges per mini-batch.
    """
    # Rebuild a (2, E) edge_index for negative sampling; row/col are swapped,
    # presumably because adj_t stores the transposed adjacency.
    row, col, _ = adj_t.coo()
    edge_index = torch.stack([col, row], dim=0)

    model.train()
    predictor.train()

    pos_train_edge = split_edge['train']['edge'].to(x.device)

    loss_sum = 0
    seen = 0
    batches = DataLoader(range(pos_train_edge.size(0)), batch_size,
                         shuffle=True)
    for perm in batches:
        optimizer.zero_grad()

        # Embeddings are recomputed every step because the weights just changed.
        h = model(x, adj_t)

        # Loss on the positive (observed) edges of this mini-batch.
        src, dst = pos_train_edge[perm].t()
        pos_out = predictor(h[src], h[dst])
        pos_loss = -torch.log(pos_out + 1e-15).mean()

        # Loss on an equal number of freshly sampled negative edges.
        neg_edge = negative_sampling(edge_index, num_nodes=x.size(0),
                                     num_neg_samples=perm.size(0),
                                     method='dense')
        neg_out = predictor(h[neg_edge[0]], h[neg_edge[1]])
        neg_loss = -torch.log(1 - neg_out + 1e-15).mean()

        loss = pos_loss + neg_loss
        loss.backward()

        # Clip each parameter group independently to a max norm of 1.0.
        for params in (x, model.parameters(), predictor.parameters()):
            torch.nn.utils.clip_grad_norm_(params, 1.0)

        optimizer.step()

        batch_count = pos_out.size(0)
        loss_sum += loss.item() * batch_count
        seen += batch_count

    return loss_sum / seen


@torch.no_grad()
def test(model, predictor, x, adj_t, split_edge, evaluator, batch_size):
    model.eval()
    predictor.eval()

    h = model(x, adj_t)

    pos_train_edge = split_edge['eval_train']['edge'].to(x.device)
    pos_valid_edge = split_edge['valid']['edge'].to(x.device)
    neg_valid_edge = split_edge['valid']['edge_neg'].to(x.device)
    pos_test_edge = split_edge['test']['edge'].to(x.device)
    neg_test_edge = split_edge['test']['edge_neg'].to(x.device)

    # store what the link predictor outputs for each positive and negative
    # edge in order to compute the hits@K
    pos_train_preds = []
    for perm in DataLoader(range(pos_train_edge.size(0)), batch_size):
        edge = pos_train_edge[perm].t()
        pos_train_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()]
    pos_train_pred = torch.cat(pos_train_preds, dim=0)

    pos_valid_preds = []
    for perm in DataLoader(range(pos_valid_edge.size(0)), batch_size):
        edge = pos_valid_edge[perm].t()
        pos_valid_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()]
    pos_valid_pred = torch.cat(pos_valid_preds, dim=0)

    neg_valid_preds = []
    for perm in DataLoader(range(neg_valid_edge.size(0)), batch_size):
        edge = neg_valid_edge[perm].t()
        neg_valid_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()]
    neg_valid_pred = torch.cat(neg_valid_preds, dim=0)

    pos_test_preds = []
    for perm in DataLoader(range(pos_test_edge.size(0)), batch_size):
        edge = pos_test_edge[perm].t()
        pos_test_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()]
    pos_test_pred = torch.cat(pos_test_preds, dim=0)

    neg_test_preds = []
    for perm in DataLoader(range(neg_test_edge.size(0)), batch_size):
        edge = neg_test_edge[perm].t()
        neg_test_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()]
    neg_test_pred = torch.cat(neg_test_preds, dim=0)

    # compute the hits@K for training, validation, and test
    results = {}
    for K in [10, 20]:
        evaluator.K = K
        train_hits = evaluator.eval({
            'y_pred_pos': pos_train_pred,
            'y_pred_neg': neg_valid_pred,
        })[f'hits@{K}']
        valid_hits = evaluator.eval({
            'y_pred_pos': pos_valid_pred,
            'y_pred_neg': neg_valid_pred,
        })[f'hits@{K}']
        test_hits = evaluator.eval({
            'y_pred_pos': pos_test_pred,
            'y_pred_neg': neg_test_pred,
        })[f'hits@{K}']

        results[f'Hits@{K}'] = (train_hits, valid_hits, test_hits)

    return results

In [8]:
def train_model(model, emb, gnn_args, predictor, model_name, num_runs=2):
    '''
    Train the given GNN model for several independent runs and report hits@20.

    Each run re-initialises the embeddings, the model and the predictor, then
    trains for gnn_args['epochs'] epochs, evaluating every
    gnn_args['eval_steps'] epochs. Whenever the validation hits@20 matches or
    beats the best value seen so far in the current run, a checkpoint is
    written via save_model_ckpt.

    Args:
        model: GNN encoder exposing reset_parameters().
        emb: torch.nn.Embedding holding the learnable node features.
        gnn_args: dict of hyperparameters; the keys read here are 'lr',
            'epochs', 'batch_size', 'eval_steps' and 'log_steps'. The
            optimizer is built with the learning rate only.
        predictor: link predictor exposing reset_parameters().
        model_name: stem used for the checkpoint file names under
            training_outputs/.
        num_runs: number of independent training runs (default 2, the value
            that used to be hard-coded).

    Raises:
        ValueError: if num_runs is smaller than 1.

    Notes:
        * Reads the notebook-level globals adj_t and split_edge.
        * All runs write to the same checkpoint paths, so the files left on
          disk belong to the last run that improved its own validation score.
    '''
    if num_runs < 1:
        raise ValueError('num_runs must be at least 1')

    train_hits_arr, val_hits_arr, test_hits_arr = [], [], []

    # Checkpoint locations do not depend on the run or epoch.
    model_path = f"training_outputs/{model_name}.pt"
    emb_path = f'training_outputs/{model_name}_init_emb.pt'

    evaluator = Evaluator(name='ogbl-ddi')
    for run in range(num_runs):
        max_valhits, train_hits_run, test_hits_run = float('-inf'), 0, 0

        # Fresh start for every run.
        torch.nn.init.xavier_uniform_(emb.weight)
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(emb.parameters()) +
            list(predictor.parameters()), lr=gnn_args['lr'])

        for epoch in range(1, 1 + gnn_args['epochs']):
            loss = train(model, predictor, emb.weight, adj_t, split_edge,
                         optimizer, gnn_args['batch_size'])

            if epoch % gnn_args['eval_steps'] == 0:
                results = test(model, predictor, emb.weight, adj_t, split_edge,
                               evaluator, gnn_args['batch_size'])

                if epoch % gnn_args['log_steps'] == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')
                    print('---')

                # Model selection is driven by validation hits@20.
                train_hits, valid_hits, test_hits = results['Hits@20']
                if valid_hits >= max_valhits:
                    max_valhits = valid_hits
                    train_hits_run = train_hits
                    test_hits_run = test_hits
                    # Save model checkpoint for current run.
                    save_model_ckpt(model, emb, optimizer, predictor, loss,
                                    emb_path, model_path)

        train_hits_arr.append(train_hits_run)
        test_hits_arr.append(test_hits_run)
        val_hits_arr.append(max_valhits)

    # Print overall stats arrays for best model based on val hits@20
    print("Val_hits@20: ", val_hits_arr)
    print("Test_hits@20: ", test_hits_arr)
    print("Train_hits@20: ", train_hits_arr)

    # Print best model stats (based on val hits@20)
    val_max = max(val_hits_arr)
    print("Best model val hits@20: ", max(val_hits_arr))
    max_idx = val_hits_arr.index(val_max)
    print('Best model test hits@20: ', test_hits_arr[max_idx])
    # Report the train score of the best run (previously printed the val score).
    print('Best model train hits@20: ', train_hits_arr[max_idx])

    # convert to numpy array
    val_hits_arr = np.array(val_hits_arr)
    test_hits_arr = np.array(test_hits_arr)
    train_hits_arr = np.array(train_hits_arr)

    # Print average stats + variance
    print(f"Average best train hits@20: {np.mean(train_hits_arr)}; var: {np.var(train_hits_arr)}")
    print(f"Average best val hits@20: {np.mean(val_hits_arr)}; var: {np.var(val_hits_arr)}")
    print(f"Average best test hits@20: {np.mean(test_hits_arr)}; var: {np.var(test_hits_arr)}")

In [9]:
def save_model_ckpt(model, emb, optimizer, predictor, loss, emb_path, model_path,
                    epoch=100):
    ''' Save model and embedding checkpoints.

    Args:
        model: GNN whose state_dict is stored under 'model_state_dict'.
        emb: embedding module; its weight tensor is written to emb_path.
        optimizer: optimizer whose state_dict is stored under
            'optimizer_state_dict'.
        predictor: link predictor stored under 'predictor_state_dict'.
        loss: training loss recorded in the checkpoint under 'loss'.
        emb_path: destination file for the embedding weights.
        model_path: destination file for the model/optimizer/predictor bundle.
        epoch: epoch number recorded under 'epoch'. Defaults to 100, the value
            that used to be hard-coded, so existing callers are unaffected.
    '''
    # Save model params
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'predictor_state_dict': predictor.state_dict(),
        'loss': loss,
    }, model_path)
    # Save the embedding weights as they are at call time (moved to CPU).
    # These are the current values, not the pre-training initialisation.
    torch.save(emb.weight.data.cpu(), emb_path)

In [10]:
# Load ogbl-ddi with the ToSparseTensor transform so that the graph object
# carries an `adj_t` attribute, the sparse adjacency handed to the GNN layers.
# The three disabled lines below would allow-list PyG storage classes for
# torch's serialization; presumably only needed when torch.load runs in
# weights_only mode -- TODO confirm before re-enabling.
# torch.serialization.add_safe_globals([DataEdgeAttr])
# torch.serialization.add_safe_globals([DataTensorAttr])
# torch.serialization.add_safe_globals([GlobalStorage])
dataset = PygLinkPropPredDataset(name='ogbl-ddi', transform=T.ToSparseTensor()) # downloads/processes ogbl-ddi on first use
print('Task type: {}'.format(dataset.task_type))
graph = dataset[0]
adj_t = graph.adj_t.to(device) # sparse adjacency of the graph, moved to the compute device

Downloading http://snap.stanford.edu/ogb/data/linkproppred/ddi.zip


Downloaded 0.04 GB: 100%|██████████| 46/46 [00:00<00:00, 52.99it/s]


Extracting dataset/ddi.zip


Processing...


Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 1/1 [00:00<00:00, 56.72it/s]


Converting graphs into PyG objects...


100%|██████████| 1/1 [00:00<00:00, 3618.90it/s]
Done!


Saving...
Task type: link prediction


In [11]:
# Reload the dataset WITHOUT the ToSparseTensor transform. This rebinds
# `dataset`, so later cells that use `dataset` / `data` see the untransformed
# graph, while `adj_t` from the previous cell keeps the sparse adjacency.
dataset = PygLinkPropPredDataset(name='ogbl-ddi')
data = dataset[0]

In [12]:
# Fetch the train / validation / test edge splits.
split_edge = dataset.get_edge_split()
train_edges = split_edge['train']['edge']

# Build 'eval_train': a fixed random subset of the training edges with as many
# edges as the validation split, used to report train-set metrics.
torch.manual_seed(70)  # fixed seed so the same subset is drawn on every run
idx = torch.randperm(split_edge['train']['edge'].size(0))[
    :split_edge['valid']['edge'].size(0)]
split_edge['eval_train'] = {'edge': split_edge['train']['edge'][idx]}

In [13]:
gnn_args = { # GNN hyperparameters shared by the model, predictor and training loop
    # compute device; not read by any code visible in this excerpt (the
    # module-level `device` variable is used instead)
    'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    # embedding width; used as input, hidden and output size of the GNN and
    # as input/hidden size of the link predictor
    'hidden_size': 16,
    # dropout probability for both the GNN and the link predictor
    'dropout': 0.5,
    # training epochs per run
    'epochs': 100,
    # NOTE(review): not passed to the optimizer in the visible code
    'weight_decay': 1e-5,
    # Adam learning rate
    'lr': 0.005,
    # NOTE(review): not read by any code visible in this excerpt
    'attn_size': 32,
    # number of attention heads in the first GAT layer
    'attn_head': 1,
    # number of Linear layers in the link predictor (the GAN class accepts
    # this value but always builds two conv layers)
    'num_layers':2,
    # results are printed when an evaluated epoch is a multiple of this
    'log_steps':1,
    # evaluate every this many epochs
    'eval_steps':5,
    # NOTE(review): not read in the visible code; train_model loops over a
    # hard-coded range(2)
    'runs':10,
    # positive edges per training mini-batch / edges per evaluation batch
    'batch_size': 1024,
}

In [14]:
# TRAIN GAN with random features
# The node features are a learnable embedding table, so the GNN's input width
# equals the embedding width (hidden_size).
gan_model = GAN(gnn_args['hidden_size'], gnn_args['hidden_size'],
                gnn_args['hidden_size'], gnn_args['attn_head'],
                gnn_args['num_layers'], gnn_args['dropout']).to(device)
predictor = LinkPredictor(gnn_args['hidden_size'], gnn_args['hidden_size'], 1,
                          gnn_args['num_layers'], gnn_args['dropout']).to(device)
# Read the node count from the already-loaded graph (`data = dataset[0]`)
# rather than `dataset.data`, the internal-storage accessor that PyG
# discourages and warns about.
gan_emb_rand = torch.nn.Embedding(data.num_nodes, gnn_args['hidden_size']).to(device)
train_model(gan_model, gan_emb_rand, gnn_args, predictor, 'gan_rand_feat')



Hits@10
Run: 01, Epoch: 05, Loss: 0.6390, Train: 13.96%, Valid: 13.31%, Test: 5.98%
Hits@20
Run: 01, Epoch: 05, Loss: 0.6390, Train: 20.27%, Valid: 19.62%, Test: 9.93%
---
Hits@10
Run: 01, Epoch: 10, Loss: 0.5835, Train: 12.90%, Valid: 12.38%, Test: 4.15%
Hits@20
Run: 01, Epoch: 10, Loss: 0.5835, Train: 17.36%, Valid: 16.80%, Test: 9.07%
---
Hits@10
Run: 01, Epoch: 15, Loss: 0.5541, Train: 13.88%, Valid: 13.28%, Test: 7.03%
Hits@20
Run: 01, Epoch: 15, Loss: 0.5541, Train: 21.19%, Valid: 20.48%, Test: 9.72%
---
Hits@10
Run: 01, Epoch: 20, Loss: 0.5378, Train: 17.54%, Valid: 16.59%, Test: 5.35%
Hits@20
Run: 01, Epoch: 20, Loss: 0.5378, Train: 21.03%, Valid: 19.99%, Test: 6.87%
---
Hits@10
Run: 01, Epoch: 25, Loss: 0.5236, Train: 14.96%, Valid: 14.16%, Test: 4.83%
Hits@20
Run: 01, Epoch: 25, Loss: 0.5236, Train: 20.76%, Valid: 19.76%, Test: 6.62%
---
Hits@10
Run: 01, Epoch: 30, Loss: 0.5163, Train: 13.78%, Valid: 12.94%, Test: 3.84%
Hits@20
Run: 01, Epoch: 30, Loss: 0.5163, Train: 16.08%,