<a href="https://colab.research.google.com/github/C8XY66/GIN/blob/main/GIN_Log_Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

In [None]:
!pip install pytorch-lightning

In [None]:
!pip install git+https://github.com/optuna/optuna.git
!pip install optuna-dashboard

In [None]:
import os
import datetime
import pytz
import numpy as np
from typing import Optional
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn.functional as F
from torchmetrics import Accuracy

import torch_geometric.transforms as T
from torch_geometric.data.lightning import LightningDataset
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GIN, MLP, global_add_pool
from torch_geometric.data import InMemoryDataset

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
# Load the TensorBoard notebook extension
%load_ext tensorboard

import optuna
from optuna.integration import PyTorchLightningPruningCallback
from optuna.visualization.matplotlib import plot_contour, plot_edf, plot_intermediate_values, plot_optimization_history, plot_parallel_coordinate, plot_param_importances, plot_slice

from google.colab import drive
# Mount Google Drive under /content/gdrive; MAIN_DIR below points into that mount.
drive.mount('/content/gdrive')
# Root folder under which create_log_dir() builds "logs/..." and which TensorBoard is pointed at.
MAIN_DIR = "/content/gdrive/My Drive/ColabNotebooks/" 
# Run-level log directory; stays None until create_log_dir() assigns it on its first call.
PARENT_DIR = None


import logging
# Earlier attempts at adjusting the Lightning loggers, left disabled:
#logging.getLogger("pytorch_lightning").setLevel(logging.WARNING)
#logging.getLogger("lightning").setLevel(logging.ERROR)
#logging.getLogger("lightning").setLevel(logging.CRITICAL)
#logging.getLogger('lightning').setLevel(0)
#logging.getLogger("lightning.pytorch").setLevel(logging.ERROR)
# Raise the threshold of these three named loggers to WARNING so that their
# lower-severity messages are not emitted.
logging.getLogger("pytorch_lightning.utilities.rank_zero").setLevel(logging.WARNING)
logging.getLogger("pytorch_lightning.accelerators.cuda").setLevel(logging.WARNING)
logging.getLogger("pytorch_lightning.callbacks.early_stopping").setLevel(logging.WARNING)



Mounted at /content/gdrive


In [None]:
# Select the compute device: the GPU when CUDA is available, the CPU otherwise.
# `device` is a torch.device object naming the chosen backend.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [None]:
# Hyperparameters
# Notebook-wide experiment settings read by the cells below.

DATASET_NAME = 'NCI1'  # TUDataset name given to GraphDataModule; also part of the log directory name
num_layers = 5  # default value of GNNModel's `num_layers` argument
lr = 0.01  # learning rate -- NOTE(review): confirm the optimizer actually reads this value
EPOCHS = 20  # passed to the trainer as max_epochs (final = 1000)
SEED = 42  # seed handed to the Optuna TPE sampler
N_SPLITS = 3  # number of folds iterated per repetition in the main loop
REP = 2  # number of repetitions of the whole cross-validation loop

In [None]:
class GNNModel(pl.LightningModule):
    """Graph classifier: GIN encoder, global add pooling, MLP head.

    Args:
        in_channels: Size of the input node features.
        out_channels: Number of target classes (size of the logits).
        hidden_channels: Hidden width of the GIN layers and of the MLP head.
        dropout: Dropout probability used in both the GIN and the MLP.
        num_layers: Number of GIN layers; defaults to the notebook-level
            ``num_layers`` setting.
        lr: Learning rate for Adam; defaults to the notebook-level ``lr``
            setting instead of a value hard-coded in ``configure_optimizers``.
    """

    def __init__(self, in_channels: int, out_channels: int,
                 hidden_channels: int, dropout, num_layers=num_layers,
                 lr: float = lr):
        super().__init__()
        self.lr = lr

        self.gnn = GIN(in_channels, hidden_channels, num_layers,
                       dropout=dropout, jk='cat')

        self.classifier = MLP([hidden_channels, hidden_channels, out_channels],
                              norm="batch_norm", dropout=dropout)

        # One metric object per stage so that their running states never mix.
        self.train_acc = Accuracy(task='multiclass', num_classes=out_channels)
        self.val_acc = Accuracy(task='multiclass', num_classes=out_channels)
        self.test_acc = Accuracy(task='multiclass', num_classes=out_channels)

    def forward(self, x, edge_index, batch):
        """Return one row of class logits per graph in the batch."""
        x = self.gnn(x, edge_index)
        # Sum node embeddings per graph, using `batch` as the node-to-graph assignment.
        x = global_add_pool(x, batch)
        x = self.classifier(x)
        return x

    def _shared_step(self, data, stage, metric):
        """Compute the loss, update `metric`, and log `<stage>_loss` / `<stage>_acc` per epoch."""
        y_hat = self(data.x, data.edge_index, data.batch)
        loss = F.cross_entropy(y_hat, data.y)
        metric(y_hat.softmax(dim=-1), data.y)
        self.log(f'{stage}_loss', loss, prog_bar=True, on_step=False, on_epoch=True)
        self.log(f'{stage}_acc', metric, prog_bar=True, on_step=False, on_epoch=True)
        return loss

    def training_step(self, data, batch_idx):
        """Run one training batch and return the loss to optimise."""
        return self._shared_step(data, 'train', self.train_acc)

    def validation_step(self, data, batch_idx):
        """Run one validation batch; only logs, returns nothing."""
        self._shared_step(data, 'val', self.val_acc)

    def test_step(self, data, batch_idx):
        """Run one test batch; only logs, returns nothing."""
        self._shared_step(data, 'test', self.test_acc)

    def configure_optimizers(self):
        """Return an Adam optimiser over all parameters, using the configured learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)


In [None]:
class GraphDataModule(pl.LightningDataModule):
    """Stratified k-fold data module for a TUDataset.

    For the selected fold, the held-out part of the split is the test set and
    the remaining part is divided 90/10 into training and validation subsets.

    Args:
        dataset_name: Name of the TUDataset to load.
        n_splits: Number of stratified folds.
        fold: Index of the fold used until `setup` is given another one.
        batch_size: Batch size used until `setup` is given another one.
    """

    def __init__(self, dataset_name, n_splits=10, fold=0, batch_size=32):
        super().__init__()
        self.dataset_name = dataset_name
        self.n_splits = n_splits
        self.fold = fold
        self.batch_size = batch_size

    def prepare_data(self):
        """Load the dataset (downloading it if needed) and build the fold splitter."""
        self.dataset = TUDataset(root='data/TUDataset', name=self.dataset_name)
        # NOTE(review): this takes the first 1000 graphs without shuffling. If the
        # dataset is stored grouped by label, the slice may contain a single class,
        # which would make every accuracy trivially 1.0 -- verify the label mix.
        self.dataset = self.dataset[:1000] #for quick experiments
        self.skf = StratifiedKFold(n_splits=self.n_splits)

    def setup(self, stage: Optional[str] = None, fold: Optional[int] = None,
              batch_size: Optional[int] = None):
        """Build the train/val/test subsets for the current fold.

        The Trainer calls this hook itself as ``setup(stage=...)`` from
        ``fit``/``test``. An omitted ``fold`` or ``batch_size`` therefore has to
        mean "keep the current value"; resetting to fixed defaults would
        silently discard the configuration chosen by the caller.

        Args:
            stage: Stage name supplied by the Trainer; not used here.
            fold: Fold index to switch to, or None to keep the current fold.
            batch_size: Batch size to switch to, or None to keep the current one.
        """
        if fold is not None:
            self.fold = fold
        if batch_size is not None:
            self.batch_size = batch_size

        y = [data.y.item() for data in self.dataset]

        # Only the labels matter for stratification, so a dummy feature array is passed.
        train_indices, test_indices = list(self.skf.split(torch.zeros(len(y)), y))[self.fold]
        train_dataset = self.dataset[train_indices]

        # Hold out 10% of the training part for validation. The split is drawn
        # anew on every call because no generator is passed to random_split.
        num_val = int(len(train_dataset) * 0.1)
        num_train = len(train_dataset) - num_val

        self.train_dataset, self.val_dataset = torch.utils.data.random_split(train_dataset, [num_train, num_val])
        self.test_dataset = self.dataset[test_indices]

    def train_dataloader(self):
        """Return a shuffled loader over the training subset."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Return an unshuffled loader over the validation subset."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False)

    def test_dataloader(self):
        """Return an unshuffled loader over the held-out fold."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False)

    @property
    def num_node_features(self):
        """Number of node features reported by the loaded dataset."""
        return self.dataset.num_node_features

    @property
    def num_classes(self):
        """Number of classes reported by the loaded dataset."""
        return self.dataset.num_classes


In [None]:
def create_log_dir(repetition_index, fold_index, extra_info=''):
    """Return the log directory for one repetition/fold, creating it if needed.

    The first call fixes the module-level ``PARENT_DIR`` to a timestamped
    folder under ``MAIN_DIR``; later calls reuse it, so all runs of one
    experiment share a parent directory.

    Args:
        repetition_index: Repetition number, or None.
        fold_index: Fold number, or None.
        extra_info: Suffix appended to the sub-directory name.

    Returns:
        Path of the ``rep_<r>_fold_<f><extra_info>`` sub-directory, or of the
        parent directory itself when either index is None.
    """
    global PARENT_DIR

    if PARENT_DIR is None:
        # The timestamp is only needed when the parent directory is first named.
        now = datetime.datetime.now(pytz.timezone('Europe/Zurich')).strftime("%Y-%m-%d_%H-%M")
        parent_dir_info = f"{DATASET_NAME}_reps_{REP}_folds_{N_SPLITS}_epochs_{EPOCHS}"
        PARENT_DIR = f"{MAIN_DIR}logs/{parent_dir_info}_{now}"

    # Subdirectory for the specific repetition and fold
    if repetition_index is not None and fold_index is not None:
        sub_dir = f"{PARENT_DIR}/rep_{repetition_index}_fold_{fold_index}{extra_info}"
    else:
        sub_dir = PARENT_DIR

    # exist_ok avoids the check-then-create race and also creates missing parents,
    # so the returned path exists even if PARENT_DIR was set but is absent on disk.
    os.makedirs(sub_dir, exist_ok=True)

    return sub_dir

In [None]:
def create_trainer(log_dir, epochs, pruning_callback=None):
    """Build a Trainer that logs to TensorBoard and early-stops on ``val_acc``.

    Args:
        log_dir: Directory handed to the TensorBoard logger as ``save_dir``.
        epochs: Maximum number of training epochs.
        pruning_callback: Optional extra callback appended after early stopping.

    Returns:
        The configured ``pl.Trainer`` (progress bar and model summary disabled).
    """
    stop_on_plateau = EarlyStopping(monitor="val_acc", mode="max", patience=10, verbose=True)
    extra_callbacks = [] if pruning_callback is None else [pruning_callback]

    trainer_kwargs = dict(
        callbacks=[stop_on_plateau] + extra_callbacks,
        max_epochs=epochs,
        log_every_n_steps=5,
        logger=TensorBoardLogger(save_dir=log_dir),
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    return pl.Trainer(**trainer_kwargs)

In [None]:
def objective(trial, datamodule, epochs, repetition_index, fold_index):
    """Optuna objective: train one sampled configuration and return its ``val_acc``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        datamodule: Data module that provides the fold's data loaders.
        epochs: Maximum number of training epochs.
        repetition_index: Repetition number, used for the log directory.
        fold_index: Fold to run model selection on; also used for the log directory.

    Returns:
        The ``val_acc`` value found in the trainer's callback metrics after fitting.
    """
    # Optimise hyperparameters
    hidden_channels = trial.suggest_categorical('hidden_channels', [16, 32])
    batch_size = trial.suggest_categorical('batch_size', [32, 128])
    dropout = trial.suggest_categorical('dropout', [0.0, 0.5])

    # Model and DataModule
    # Select on the fold that is being assessed rather than always on fold 0.
    datamodule.setup(fold=fold_index, batch_size=batch_size)
    model = GNNModel(in_channels=datamodule.num_node_features, out_channels=datamodule.num_classes, hidden_channels=hidden_channels, dropout=dropout)

    # Training
    pruning_callback = PyTorchLightningPruningCallback(trial, monitor="val_acc") #from optuna-pl-integration

    log_dir = create_log_dir(repetition_index, fold_index, extra_info='_selection')
    trainer = create_trainer(log_dir, epochs=epochs, pruning_callback=pruning_callback)

    # Record the sampled configuration next to the TensorBoard curves.
    hyperparameters = dict(hidden_channels=hidden_channels, batch_size=batch_size, epochs=epochs, dropout=dropout)
    trainer.logger.log_hyperparams(hyperparameters)

    trainer.fit(model, datamodule=datamodule)

    return trainer.callback_metrics['val_acc'].item()
    

In [None]:
def retrain_and_evaluate(model, datamodule, epochs, repetition_index, fold_index):
    """Fit ``model`` on the datamodule, test it, and return ``test_acc``.

    Logs are written to the ``_assessment`` directory of the given
    repetition and fold. No pruning callback is attached.
    """
    assessment_dir = create_log_dir(repetition_index, fold_index, extra_info='_assessment')
    trainer = create_trainer(assessment_dir, epochs=epochs, pruning_callback=None)

    # Train first, then evaluate on the test loader.
    trainer.fit(model, datamodule=datamodule)
    trainer.test(model, datamodule=datamodule)

    test_accuracy = trainer.callback_metrics['test_acc']
    return test_accuracy.item()

In [None]:
if __name__ == '__main__':
    # Repeated k-fold assessment: for each fold, select hyperparameters with
    # Optuna, then retrain with the best configuration and score on the test fold.

    # Number of Optuna trials run per fold during model selection.
    n_trials = 8

    # Build the splitter with the same number of folds that the loop below
    # iterates over, instead of relying on the data module's own default.
    datamodule = GraphDataModule(dataset_name=DATASET_NAME, n_splits=N_SPLITS)

    overall_performances = []
    for r in range(REP):
        # Seed the global RNGs per repetition so runs are reproducible while
        # repetitions still differ from each other.
        pl.seed_everything(SEED + r)
        datamodule.prepare_data()
        fold_performances = []
        for fold in range(N_SPLITS):
            # Create a new study object for each fold
            study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner(), sampler=optuna.samplers.TPESampler(seed=SEED),)
            datamodule.setup("fit", fold)
            study.optimize(lambda trial: objective(trial, datamodule, EPOCHS, r, fold), n_trials=n_trials)
            print(f"Best trial for fold {fold}: {study.best_trial.value}")

            # Retrain the model with the best hyperparameters
            best_params = study.best_trial.params
            model = GNNModel(in_channels=datamodule.num_node_features,
                          out_channels=datamodule.num_classes,
                          hidden_channels=best_params['hidden_channels'],
                          dropout=best_params['dropout'])
            datamodule.setup("fit", fold, batch_size=best_params['batch_size'])
            test_acc = retrain_and_evaluate(model, datamodule, EPOCHS, r, fold)
            fold_performances.append(test_acc)

        avg_performance = np.mean(fold_performances)
        print(f"Average performance for repetition {r}: {avg_performance}")
        overall_performances.append(avg_performance)

    print(f"Overall average performance: {np.mean(overall_performances)}")


Downloading https://www.chrsmrrs.com/graphkerneldatasets/NCI1.zip
Extracting data/TUDataset/NCI1/NCI1.zip
Processing...
Done!
[I 2023-04-19 17:03:03,245] A new study created in memory with name: no-name-3e54b8d3-47d1-4ab3-9252-c191fd4decc6
[I 2023-04-19 17:03:20,292] Trial 0 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 32, 'dropout': 0.0}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:03:24,764] Trial 1 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:03:28,188] Trial 2 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:03:31,615] Trial 3 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 32, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:03:36,009] Trial 4 finished with value: 1.0 and parameters: {'hid

Best trial for fold 0: 1.0


[I 2023-04-19 17:03:51,514] A new study created in memory with name: no-name-1f46eceb-0457-4ef3-85cb-e054747edafc
[I 2023-04-19 17:03:55,224] Trial 0 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 32, 'dropout': 0.0}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:03:58,626] Trial 1 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:04:02,026] Trial 2 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:04:06,744] Trial 3 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 32, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:04:10,074] Trial 4 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 128, 'dropout': 0.0}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:04:13,443] Trial 5 fi

Best trial for fold 1: 1.0


[I 2023-04-19 17:04:25,718] A new study created in memory with name: no-name-23dc2762-86c8-4b89-b02b-789492af87e1
[I 2023-04-19 17:04:29,479] Trial 0 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 32, 'dropout': 0.0}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:04:34,204] Trial 1 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:04:38,194] Trial 2 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:04:41,653] Trial 3 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 32, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:04:45,044] Trial 4 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 128, 'dropout': 0.0}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:04:49,720] Trial 5 fi

Best trial for fold 2: 1.0


[I 2023-04-19 17:05:00,563] A new study created in memory with name: no-name-32fdd9f1-8cd1-4d70-bc6a-cea5a44860e4


Average performance for repetition 0: 1.0


[I 2023-04-19 17:05:06,011] Trial 0 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 32, 'dropout': 0.0}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:05:09,495] Trial 1 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:05:13,280] Trial 2 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:05:17,424] Trial 3 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 32, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:05:21,646] Trial 4 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 128, 'dropout': 0.0}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:05:25,032] Trial 5 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 128, 'dropout': 0.0}. Best is trial 0

Best trial for fold 0: 1.0


[I 2023-04-19 17:05:36,764] A new study created in memory with name: no-name-9a6b3afd-de63-4bc0-8b5d-195c2a07bdba
[I 2023-04-19 17:05:40,133] Trial 0 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 32, 'dropout': 0.0}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:05:43,584] Trial 1 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:05:48,327] Trial 2 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:05:51,839] Trial 3 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 32, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:05:55,267] Trial 4 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 128, 'dropout': 0.0}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:05:58,658] Trial 5 fi

Best trial for fold 1: 1.0


[I 2023-04-19 17:06:10,650] A new study created in memory with name: no-name-1767c93d-9c48-4a62-9e19-912ead10e1f5
[I 2023-04-19 17:06:14,680] Trial 0 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 32, 'dropout': 0.0}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:06:18,986] Trial 1 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:06:22,396] Trial 2 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:06:25,805] Trial 3 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 32, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:06:30,034] Trial 4 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 128, 'dropout': 0.0}. Best is trial 0 with value: 1.0.
[I 2023-04-19 17:06:33,896] Trial 5 fi

Best trial for fold 2: 1.0


Average performance for repetition 1: 1.0
Overall average performance: 1.0


In [None]:
#%tensorboard --logdir '/content/gdrive/My Drive/ColabNotebooks/'
%tensorboard --logdir '{MAIN_DIR}'