<a href="https://colab.research.google.com/github/C8XY66/GNN/blob/main/GIN_ModelCheckpoint_EvFr_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

In [None]:
!pip install pytorch-lightning

In [None]:
!pip install git+https://github.com/optuna/optuna.git
!pip install optuna-dashboard

In [None]:
import os
import re
import datetime
import pytz
import numpy as np
from typing import Optional
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn.functional as F
from torchmetrics import Accuracy

import torch_geometric.transforms as T
from torch_geometric.data.lightning import LightningDataset
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GIN, MLP, global_add_pool
from torch_geometric.data import InMemoryDataset

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
# Load the TensorBoard notebook extension
%load_ext tensorboard

import optuna
from optuna.integration import PyTorchLightningPruningCallback
from optuna.visualization.matplotlib import plot_contour, plot_edf, plot_intermediate_values, plot_optimization_history, plot_parallel_coordinate, plot_param_importances, plot_slice

from google.colab import drive
# Mount Google Drive so that logs and checkpoints written below survive the Colab session.
drive.mount('/content/gdrive')
# Root folder on the mounted drive under which run logs and checkpoints are written.
MAIN_DIR = "/content/gdrive/My Drive/ColabNotebooks/" 
# Directory of the current run; stays None until create_log_dir() assigns it on first use.
PARENT_DIR = None


import logging
# Raise the level of the chattiest Lightning loggers to WARNING so that the
# per-trial Optuna output is not buried under INFO messages.
logging.getLogger("pytorch_lightning.utilities.rank_zero").setLevel(logging.WARNING)
logging.getLogger("pytorch_lightning.accelerators.cuda").setLevel(logging.WARNING)
logging.getLogger("pytorch_lightning.callbacks.early_stopping").setLevel(logging.WARNING)



Mounted at /content/gdrive


In [None]:
# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is a plain torch.device object naming the target device.
# NOTE(review): no other cell visible in this notebook reads `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [None]:
# Hyperparameters / experiment configuration

DATASET_NAME = 'NCI1'   # name passed to TUDataset
# NOTE(review): `lr` is not read by any code visible in this notebook.
lr = 0.01 
EPOCHS = 20 #final = 1000  (max_epochs handed to every Trainer)
SEED = 42               # seed for the Optuna TPE sampler
N_SPLITS = 3            # number of cross-validation folds iterated per repetition
REP = 2                 # number of repetitions of the whole cross-validation

# Resume settings: when LOAD_FROM_CHECKPOINT is True the driver cell looks for the
# most recent checkpoint inside CHECKPOINT_PATH (the "checkpoints" folder of run RUN).
LOAD_FROM_CHECKPOINT = False
RUN = "NCI1_reps_2_folds_3_epochs_20_2023-04-20_14-17"
CHECKPOINT_PATH = os.path.join(MAIN_DIR, "logs", RUN, "checkpoints")



In [None]:
class GNNModel(pl.LightningModule):
    """Graph classifier: GIN encoder, sum pooling over nodes, MLP head.

    Args:
        in_channels: Size of the input node features.
        out_channels: Number of target classes.
        hidden_channels: Width of the GIN layers and of the MLP hidden layer.
        dropout: Dropout probability used in both the GIN and the MLP.
        num_layers: Number of GIN layers.
        lr: Learning rate for the Adam optimizer (defaults to the value that
            was previously hard-coded in ``configure_optimizers``).
    """

    def __init__(self, in_channels: int, out_channels: int,
                 hidden_channels: int, dropout, num_layers=5, lr: float = 0.01):
        super().__init__()
        # Stores all constructor arguments in self.hparams (also written to checkpoints).
        self.save_hyperparameters()
        self.gnn = GIN(in_channels, hidden_channels, num_layers,
                       dropout=dropout, jk='cat')

        self.classifier = MLP([hidden_channels, hidden_channels, out_channels],
                              norm="batch_norm", dropout=dropout)

        # One metric object per stage so that their running states never mix.
        self.train_acc = Accuracy(task='multiclass', num_classes=out_channels)
        self.val_acc = Accuracy(task='multiclass', num_classes=out_channels)
        self.test_acc = Accuracy(task='multiclass', num_classes=out_channels)

    def forward(self, x, edge_index, batch):
        """Return class logits, one row per graph in the batch."""
        x = self.gnn(x, edge_index)
        x = global_add_pool(x, batch)  # sum node embeddings per graph
        x = self.classifier(x)
        return x

    def _shared_step(self, data, stage: str, metric):
        """Compute the loss, update ``metric`` and log ``{stage}_loss`` / ``{stage}_acc``."""
        y_hat = self(data.x, data.edge_index, data.batch)
        loss = F.cross_entropy(y_hat, data.y)
        metric(y_hat.softmax(dim=-1), data.y)
        # Pass the number of graphs explicitly so the epoch-level mean is weighted
        # per graph instead of relying on Lightning's batch-size inference.
        # Assumes `data` is a torch_geometric Batch exposing `num_graphs`.
        num_graphs = data.num_graphs
        self.log(f'{stage}_loss', loss, prog_bar=True, on_step=False,
                 on_epoch=True, batch_size=num_graphs)
        self.log(f'{stage}_acc', metric, prog_bar=True, on_step=False,
                 on_epoch=True, batch_size=num_graphs)
        return loss

    def training_step(self, data, batch_idx):
        return self._shared_step(data, 'train', self.train_acc)

    def validation_step(self, data, batch_idx):
        self._shared_step(data, 'val', self.val_acc)

    def test_step(self, data, batch_idx):
        self._shared_step(data, 'test', self.test_acc)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

    def on_save_checkpoint(self, checkpoint):
        # Saved as a plain dict so that GNNModel(**checkpoint["init_args"]) can
        # rebuild the model without depending on Lightning's hparams container.
        checkpoint["init_args"] = dict(self.hparams)

In [None]:
class GraphDataModule(pl.LightningDataModule):
    """Serves one stratified cross-validation fold of a TUDataset.

    Args:
        dataset_name: Name passed to ``TUDataset``.
        n_splits: Number of stratified folds.
        fold: Index of the fold whose held-out part is the test set.
        batch_size: Batch size of all three dataloaders.
        seed: Base seed of the train/validation split, so that repeated
            ``setup`` calls for the same fold yield the same split.
    """

    def __init__(self, dataset_name, n_splits=10, fold=0, batch_size=32, seed=42):
        super().__init__()
        self.dataset_name = dataset_name
        self.n_splits = n_splits
        self.fold = fold
        self.batch_size = batch_size
        self.seed = seed

    def prepare_data(self):
        # NOTE(review): Lightning recommends assigning state in setup() rather than
        # prepare_data(); kept here because callers read num_node_features /
        # num_classes right after calling prepare_data().
        self.dataset = TUDataset(root='data/TUDataset', name=self.dataset_name)
        self.dataset = self.dataset[:1000] #for quick experiments
        self.skf = StratifiedKFold(n_splits=self.n_splits)

    def setup(self, stage: Optional[str] = None, fold: Optional[int] = None,
              batch_size: Optional[int] = None):
        """Build the train/val/test subsets for the current fold.

        ``fold`` and ``batch_size`` only overwrite the stored values when they
        are given. The Trainer calls ``setup(stage=...)`` on its own at the start
        of fit/test; with the former defaults (``fold=0``, ``batch_size=32``)
        that call silently reset both values chosen by the caller.
        """
        if fold is not None:
            self.fold = fold
        if batch_size is not None:
            self.batch_size = batch_size

        y = [data.y.item() for data in self.dataset]

        train_indices, test_indices = list(self.skf.split(torch.zeros(len(y)), y))[self.fold]
        train_dataset = self.dataset[train_indices]

        # Hold out 10% of the training part for validation.
        num_val = int(len(train_dataset) * 0.1)
        num_train = len(train_dataset) - num_val

        # Seeded per fold: every setup() call for this fold returns the same split,
        # so all trials of a fold are compared on identical validation data.
        split_rng = torch.Generator().manual_seed(self.seed + self.fold)
        self.train_dataset, self.val_dataset = torch.utils.data.random_split(
            train_dataset, [num_train, num_val], generator=split_rng)
        self.test_dataset = self.dataset[test_indices]

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False)

    @property
    def num_node_features(self):
        return self.dataset.num_node_features

    @property
    def num_classes(self):
        return self.dataset.num_classes


In [None]:
def create_log_dir(repetition_index, fold_index):
    """Return the log directory for one repetition/fold, creating it if needed.

    On the first call the run's parent directory
    ``<MAIN_DIR>/logs/<dataset>_reps_<REP>_folds_<N_SPLITS>_epochs_<EPOCHS>_<timestamp>``
    is created and remembered in the global ``PARENT_DIR``; later calls reuse it.

    Args:
        repetition_index: Repetition number, or None.
        fold_index: Fold number, or None.

    Returns:
        ``<PARENT_DIR>/rep_<r>_fold_<f>`` when both indices are given,
        otherwise ``PARENT_DIR`` itself.
    """
    global PARENT_DIR

    if PARENT_DIR is None:
        # The timestamp is only needed when the run directory is first named.
        now = datetime.datetime.now(pytz.timezone('Europe/Zurich')).strftime("%Y-%m-%d_%H-%M")
        parent_dir_info = f"{DATASET_NAME}_reps_{REP}_folds_{N_SPLITS}_epochs_{EPOCHS}"
        # os.path.join (as used for CHECKPOINT_PATH) works whether or not
        # MAIN_DIR ends with a path separator.
        PARENT_DIR = os.path.join(MAIN_DIR, "logs", f"{parent_dir_info}_{now}")
        os.makedirs(PARENT_DIR, exist_ok=True)

    if repetition_index is None or fold_index is None:
        return PARENT_DIR

    # Subdirectory for the specific repetition and fold
    sub_dir = os.path.join(PARENT_DIR, f"rep_{repetition_index}_fold_{fold_index}")
    os.makedirs(sub_dir, exist_ok=True)
    return sub_dir

In [None]:
class SaveBestModelCallback(optuna.integration.PyTorchLightningPruningCallback):
    """Pruning callback that also checkpoints models matching or beating the study's best.

    Args:
        trial: The Optuna trial being run.
        monitor: Name of the logged metric to watch (treated as higher-is-better).
        repetition_index: Repetition number, used in the checkpoint filename.
        fold_index: Fold number, used in the checkpoint filename.
        study: The Optuna study the trial belongs to.
    """

    def __init__(self, trial, monitor, repetition_index, fold_index, study):
        super().__init__(trial, monitor)
        self.repetition_index = repetition_index
        self.fold_index = fold_index
        self.study = study

    # This hook used to be defined *inside* __init__, which made it a local
    # function that Lightning never called; it has to live at class level.
    def on_validation_end(self, trainer, pl_module):
        # Report to Optuna first; this may raise optuna.TrialPruned.
        super().on_validation_end(trainer, pl_module)

        if trainer.sanity_checking:
            return
        current = trainer.callback_metrics.get(self.monitor)
        if current is None:
            return
        current_value = float(current)

        # study.best_value raises ValueError while no trial has completed yet;
        # in that case the running trial is the best one by definition.
        try:
            best_completed = self.study.best_value
        except ValueError:
            best_completed = None

        if best_completed is not None and current_value < best_completed:
            return

        # Save through the trainer directly: a ModelCheckpoint instantiated on
        # the fly is never attached to the trainer and does not reliably write
        # a file from this hook.
        checkpoint_dir = os.path.join(create_log_dir(None, None), "checkpoints")
        os.makedirs(checkpoint_dir, exist_ok=True)
        filename = (f"best_model_rep_{self.repetition_index}"
                    f"_fold_{self.fold_index}_trial_{self._trial.number}.ckpt")
        trainer.save_checkpoint(os.path.join(checkpoint_dir, filename))

In [None]:
def create_trainer(log_dir, epochs, pruning_callback=None, save_best_model_callback=None, testing=False):
    """Build a quiet ``pl.Trainer`` that logs to TensorBoard under ``log_dir``.

    Args:
        log_dir: Directory handed to ``TensorBoardLogger`` as ``save_dir``.
        epochs: Upper bound on the number of epochs.
        pruning_callback: Optional callback added after early stopping.
        save_best_model_callback: Optional callback added last.
        testing: When True no callbacks at all are attached.

    Returns:
        The configured trainer.
    """
    if testing:
        active_callbacks = []
    else:
        # Training run: early stopping on validation accuracy comes first,
        # followed by whichever optional callbacks were supplied.
        stopper = EarlyStopping(monitor="val_acc", mode="max", patience=10, verbose=True)
        extras = (pruning_callback, save_best_model_callback)
        active_callbacks = [stopper] + [cb for cb in extras if cb is not None]

    tb_logger = TensorBoardLogger(save_dir=log_dir)

    return pl.Trainer(
        callbacks=active_callbacks,
        max_epochs=epochs,
        log_every_n_steps=10,
        logger=tb_logger,
        enable_progress_bar=False,
        enable_model_summary=False,
    )

In [None]:
def objective(trial, datamodule, epochs, repetition_index, fold_index, study):
    """Optuna objective: train one model on the given fold and return its ``val_acc``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        datamodule: Data module providing the fold's dataloaders.
        epochs: Maximum number of training epochs.
        repetition_index: Repetition number (log directory / checkpoint names).
        fold_index: Fold to train on.
        study: The study this trial belongs to (handed to the checkpoint callback).

    Returns:
        The ``val_acc`` value found in ``trainer.callback_metrics`` after fitting.
    """
    # Optimise hyperparameters
    hidden_channels = trial.suggest_categorical('hidden_channels', [16, 32])
    batch_size = trial.suggest_categorical('batch_size', [32, 128])
    dropout = trial.suggest_categorical('dropout', [0.0, 0.5])

    # Model and DataModule. The fold is passed explicitly: setup() defaults its
    # `fold` argument, so omitting it would switch the data back to fold 0.
    datamodule.setup(fold=fold_index, batch_size=batch_size)
    model = GNNModel(in_channels=datamodule.num_node_features,
                     out_channels=datamodule.num_classes,
                     hidden_channels=hidden_channels,
                     dropout=dropout)

    # SaveBestModelCallback subclasses PyTorchLightningPruningCallback, so it
    # already reports to Optuna and prunes. Registering a second, plain pruning
    # callback made every step get reported twice for the same trial.
    save_best_model_callback = SaveBestModelCallback(trial, monitor="val_acc",
                                                     repetition_index=repetition_index,
                                                     fold_index=fold_index,
                                                     study=study)

    log_dir = create_log_dir(repetition_index, fold_index)
    trainer = create_trainer(log_dir, epochs=epochs,
                             save_best_model_callback=save_best_model_callback)

    hyperparameters = dict(hidden_channels=hidden_channels, batch_size=batch_size, epochs=epochs, dropout=dropout)
    trainer.logger.log_hyperparams(hyperparameters)

    trainer.fit(model, datamodule=datamodule)

    return trainer.callback_metrics['val_acc'].item()

In [None]:
def evaluate(model, datamodule, epochs, repetition_index, fold_index, load_from_checkpoint=False):
    """Run the test loop for ``model`` on ``datamodule`` and return ``test_acc``.

    A callback-free trainer is created that logs into the directory of the
    given repetition/fold. ``load_from_checkpoint`` is accepted but not used.
    """
    test_trainer = create_trainer(
        create_log_dir(repetition_index, fold_index),
        epochs=epochs,
        testing=True,
    )
    test_trainer.test(model, datamodule=datamodule)

    test_accuracy = test_trainer.callback_metrics['test_acc']
    return test_accuracy.item()

In [None]:
def load_last_checkpoint(checkpoint_path):
    """Find the most recently modified ``best_model_rep_<r>_fold_<f>`` file.

    Only regular files whose *name* matches the pattern are considered, so
    stray files or sub-directories in the folder cannot be mistaken for the
    latest checkpoint, and a matching directory name further up the path
    cannot supply the repetition/fold numbers.

    Args:
        checkpoint_path: Directory containing the checkpoint files.

    Returns:
        Tuple ``(path, repetition, fold)`` of the newest matching checkpoint.

    Raises:
        FileNotFoundError: If the directory is empty.
        ValueError: If no file name carries repetition and fold numbers.
    """
    name_pattern = re.compile(r"best_model_rep_(\d+)_fold_(\d+)")

    entries = os.listdir(checkpoint_path)
    if not entries:
        raise FileNotFoundError(f"No checkpoint files found in {checkpoint_path}")

    candidates = []
    for name in entries:
        full_path = os.path.join(checkpoint_path, name)
        match = name_pattern.search(name)
        if match is not None and os.path.isfile(full_path):
            candidates.append((os.path.getmtime(full_path), name, full_path, match))

    if not candidates:
        raise ValueError("Could not extract repetition and fold numbers from the checkpoint filename")

    # Newest modification time wins; the file name breaks ties deterministically.
    _, _, last_checkpoint, match = max(candidates, key=lambda item: (item[0], item[1]))

    starting_rep = int(match.group(1))
    starting_fold = int(match.group(2))
    return last_checkpoint, starting_rep, starting_fold

In [None]:
if __name__ == '__main__':
    # Repeated k-fold cross-validation. For every fold an Optuna study selects
    # the hyperparameters, a fresh model is trained with the best ones and its
    # accuracy on the fold's held-out test split is recorded.

    N_TRIALS = 8  # Optuna trials per fold

    # Seed python / numpy / torch once so that runs are repeatable.
    pl.seed_everything(SEED)

    # Use the same number of folds that the loop below iterates over.
    datamodule = GraphDataModule(dataset_name=DATASET_NAME, n_splits=N_SPLITS)

    overall_performances = []
    starting_rep, starting_fold = 0, 0

    if LOAD_FROM_CHECKPOINT:
        checkpoint_path, starting_rep, starting_fold = load_last_checkpoint(CHECKPOINT_PATH)
        # map_location keeps loading independent of the device the file was saved from.
        checkpoint = torch.load(checkpoint_path, map_location="cpu")
        init_args = checkpoint["init_args"]  # constructor arguments saved by GNNModel.on_save_checkpoint
        model = GNNModel(**init_args)
        # Restore the trained weights; without this the rebuilt model would be
        # evaluated with its random initialisation.
        model.load_state_dict(checkpoint["state_dict"])

    for r in range(starting_rep, REP):
        datamodule.prepare_data()
        fold_performances = []
        first_fold = starting_fold if r == starting_rep else 0
        for fold in range(first_fold, N_SPLITS):
            resume_here = LOAD_FROM_CHECKPOINT and r == starting_rep and fold == starting_fold
            if resume_here:
                # Point the data module at the fold the checkpoint belongs to.
                datamodule.setup("test", fold)
                test_acc = evaluate(model, datamodule, EPOCHS, r, fold)
            else:
                # Create a new study object for each fold
                study = optuna.create_study(direction="maximize",
                                            pruner=optuna.pruners.MedianPruner(),
                                            sampler=optuna.samplers.TPESampler(seed=SEED))
                datamodule.setup("fit", fold)
                study.optimize(lambda trial: objective(trial, datamodule, EPOCHS, r, fold, study),
                               n_trials=N_TRIALS)
                print(f"Best trial for fold {fold}: {study.best_trial.value}")

                # Retrain the model with the best hyperparameters
                best_params = study.best_trial.params
                model = GNNModel(in_channels=datamodule.num_node_features,
                                 out_channels=datamodule.num_classes,
                                 hidden_channels=best_params['hidden_channels'],
                                 dropout=best_params['dropout'])
                datamodule.setup("fit", fold, batch_size=best_params['batch_size'])
                # The fresh model has to be fitted before it is tested; it was
                # previously evaluated straight after construction.
                retrain_trainer = create_trainer(create_log_dir(r, fold), epochs=EPOCHS)
                retrain_trainer.fit(model, datamodule=datamodule)
                test_acc = evaluate(model, datamodule, EPOCHS, r, fold)
            fold_performances.append(test_acc)

        avg_performance = np.mean(fold_performances)
        print(f"Average performance for repetition {r}: {avg_performance}")
        overall_performances.append(avg_performance)

    print(f"Overall average performance: {np.mean(overall_performances)}")

[I 2023-04-21 11:36:33,354] A new study created in memory with name: no-name-44e53144-852b-4498-92ff-e603cdf89725
[I 2023-04-21 11:36:38,777] Trial 0 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 32, 'dropout': 0.0}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:36:42,604] Trial 1 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:36:46,876] Trial 2 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:36:51,705] Trial 3 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 32, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:36:55,951] Trial 4 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 128, 'dropout': 0.0}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:36:59,623] Trial 5 fi

Best trial for fold 0: 1.0


[I 2023-04-21 11:37:08,789] A new study created in memory with name: no-name-4c26003d-44f6-47e7-aad7-9769e9e2cd55
[I 2023-04-21 11:37:12,860] Trial 0 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 32, 'dropout': 0.0}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:37:16,595] Trial 1 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:37:21,402] Trial 2 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:37:25,449] Trial 3 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 32, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:37:29,123] Trial 4 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 128, 'dropout': 0.0}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:37:32,811] Trial 5 fi

Best trial for fold 1: 1.0


[I 2023-04-21 11:37:41,700] A new study created in memory with name: no-name-7141b6cc-4bd3-484a-9b93-8a49afe7779e
[I 2023-04-21 11:37:45,337] Trial 0 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 32, 'dropout': 0.0}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:37:50,045] Trial 1 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
[I 2023-04-21 11:37:52,522] Trial 2 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:37:56,172] Trial 3 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 32, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
[I 2023-04-21 11:37:57,320] Trial 4 finished wit

Best trial for fold 2: 1.0


[I 2023-04-21 11:38:10,586] A new study created in memory with name: no-name-530e2d56-304c-4e81-9c07-77a2c179e9cd


Average performance for repetition 0: 1.0


[I 2023-04-21 11:38:14,447] Trial 0 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 32, 'dropout': 0.0}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:38:19,024] Trial 1 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:38:23,391] Trial 2 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:38:27,014] Trial 3 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 32, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:38:30,694] Trial 4 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 128, 'dropout': 0.0}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:38:35,939] Trial 5 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 128, 'dropout': 0.0}. Best is trial 0

Best trial for fold 0: 1.0


[I 2023-04-21 11:38:44,332] A new study created in memory with name: no-name-9f779168-9784-468f-becd-ba3c77e4bffc
[I 2023-04-21 11:38:49,456] Trial 0 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 32, 'dropout': 0.0}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:38:53,978] Trial 1 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:38:57,844] Trial 2 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:39:01,823] Trial 3 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 32, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:39:06,783] Trial 4 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 128, 'dropout': 0.0}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:39:10,651] Trial 5 fi

Best trial for fold 1: 1.0


[I 2023-04-21 11:39:20,879] A new study created in memory with name: no-name-ca0d7c42-310b-4602-8a91-62c65c0ef194
[I 2023-04-21 11:39:24,816] Trial 0 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 32, 'dropout': 0.0}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:39:28,507] Trial 1 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:39:33,119] Trial 2 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 128, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:39:37,488] Trial 3 finished with value: 1.0 and parameters: {'hidden_channels': 16, 'batch_size': 32, 'dropout': 0.5}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:39:41,114] Trial 4 finished with value: 1.0 and parameters: {'hidden_channels': 32, 'batch_size': 128, 'dropout': 0.0}. Best is trial 0 with value: 1.0.
[I 2023-04-21 11:39:44,771] Trial 5 fi

Best trial for fold 2: 1.0


Average performance for repetition 1: 1.0
Overall average performance: 1.0


In [None]:
# Open TensorBoard inside the notebook on the Drive root folder;
# IPython substitutes the value of MAIN_DIR for {MAIN_DIR} before running the magic.
%tensorboard --logdir '{MAIN_DIR}'