<a href="https://colab.research.google.com/github/Clearbloo/Feynman_GNN/blob/main/Feynman_GNN_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Install and import libraries**


In [1]:
## Standard libraries
import os
import os.path as osp
import json
import math
import numpy as np 
import time
import pandas as pd
import ast
import random as rnd
from typing import Optional
from functools import partial
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, f1_score, \
    accuracy_score, precision_score, recall_score, roc_auc_score
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)

## Imports for plotting
import matplotlib.pyplot as plt
%matplotlib inline 
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg', 'pdf') # For export
from matplotlib.colors import to_rgb
import matplotlib
matplotlib.rcParams['lines.linewidth'] = 2.0
import seaborn as sns
sns.reset_orig()
sns.set()

# Load the TensorBoard notebook extension
%load_ext tensorboard

## Progress bar
from tqdm.notebook import tqdm

## PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim
from torch.nn import MSELoss
# Torchvision
import torchvision
from torchvision import transforms
# PyTorch Lightning
try:
    import pytorch_lightning as pl
except ModuleNotFoundError: # Google Colab does not have PyTorch Lightning installed by default. Hence, we do it here if necessary
    !pip install pytorch-lightning>=1.4
    import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

## Ray
try:
    import ray
except ModuleNotFoundError: # Google Colab does not have Ray installed by default.
    !pip install ray
    import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining
from ray.tune.integration.pytorch_lightning import TuneReportCallback, TuneReportCheckpointCallback

## Tensorboard
try:
  import tensorboardX
except ModuleNotFoundError:
  !pip install tensorboardX
  import tensorboardX

# Root folder of the dataset on the mounted Google Drive (raw and processed files live below it)
DATASET_PATH = "/content/gdrive/MyDrive/Part_III_Project/data/"
# Path to the folder where the pretrained models are saved
CHECKPOINT_PATH = "/content/gdrive/MyDrive/Part_III_Project/saved_models/"

# Setting the seed.
# NOTE(review): no seed is passed, so Lightning picks a random one and prints it
# ("No seed found, seed set to ..."); pass an explicit integer for repeatable runs.
pl.seed_everything()

# Ensure that all operations are deterministic on GPU (if used) for reproducibility.
# The attribute is spelled `deterministic`; a misspelled name is silently accepted by
# Python and leaves the real flag untouched.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# torch geometric
try: 
    import torch_geometric
except ModuleNotFoundError:
    # Installing torch geometric packages with specific CUDA+PyTorch version. 
    # See https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html for details 
    TORCH = torch.__version__.split('+')[0]
    CUDA = 'cu' + torch.version.cuda.replace('.','')

    !pip install --quiet torch-scatter     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
    !pip install --quiet torch-sparse      -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
    !pip install --quiet torch-cluster     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
    !pip install --quiet torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
    !pip install --quiet torch-geometric 
    import torch_geometric
import torch_geometric.nn as geom_nn
import torch_geometric.data as geom_data
from torch_geometric.data import Dataset, Data, InMemoryDataset
from torch_geometric.loader import DataLoader
from torch.nn import Linear, BatchNorm1d, ModuleList
from torch_geometric.nn import TopKPooling 
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp


Mounted at /content/gdrive
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.8.0 requires tf-estimator-nightly==2.8.0.dev2021122109, which is not installed.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
Collecting ray
  Downloading ray-1.11.0-cp37-cp37m-manylinux2014_x86_64.whl (52.7 MB)
[K     |████████████████████████████████| 52.7 MB 85 kB/s 
[?25hCollecting redis>=3.5.0
  Downloading redis-4.1.4-py3-none-any.whl (175 kB)
[K     |████████████████████████████████| 175 kB 62.4 MB/s 
Collecting grpcio<=1.43.0,>=1.28.1
  Downloading grpcio-1.43.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.1 MB)
[K     |████████████████████████████████| 4.1 MB 48.7 MB/s 
Collecting deprecated>=1.2.3
  Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Installing collected packages: de

  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 1602145498


cpu
[K     |████████████████████████████████| 7.9 MB 7.2 MB/s 
[K     |████████████████████████████████| 3.5 MB 6.9 MB/s 
[K     |████████████████████████████████| 2.5 MB 7.4 MB/s 
[K     |████████████████████████████████| 750 kB 7.6 MB/s 
[K     |████████████████████████████████| 407 kB 9.3 MB/s 
[?25h  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone


# **My own dataset class**

In [5]:
class FeynmanDataset(Dataset):
    """
    Dataset of Feynman diagrams, stored as one PyTorch Geometric ``Data`` file per graph.

    The raw data is a CSV file with the columns 'x', 'edge_attr', 'edge_index' and 'y'.
    ``process`` turns the first ``dataset_size`` rows into ``Data`` objects and saves
    each one in ``processed_dir`` as ``{label}_data_{idx}.pt``; ``get`` loads them back.

    NOTE(review): ``process`` always takes the *first* ``dataset_size`` rows of the CSV,
    whatever the split label is, so differently labelled datasets built from the same
    CSV contain overlapping rows.
    """

    def __init__(self, dataset_size, filename, reprocess: bool = False, root=DATASET_PATH, test: bool = False, train: bool = False, val: bool = False, pred: bool = False, transform=None, pre_transform=None, pre_filter=None):
        """
        dataset_size = number of graphs in this dataset (rows taken from the CSV)
        filename = name of the raw CSV file
        reprocess = if True (and confirmed interactively), force ``process`` to run again
        root = directory where dataset should be stored. Contains raw data in raw_dir and processed data in processed_dir
        test, train, val, pred = bools, what type of dataset you want. default all false ("full").
            If several are set the precedence is pred > test > val > train.
        transform, pre_transform, pre_filter = forwarded unchanged to the base class
        """
        self.filename = filename
        self.trunc_size = 10000  # rows per chunk while processing
        self.test = test
        self.train = train
        self.val = val
        self.pred = pred
        self.reproc = reprocess

        # Later checks override earlier ones, which gives the precedence documented above.
        self.label = "full"
        if self.train:
            self.label = "train"
        if self.val:
            self.label = "val"
        if self.test:
            self.label = "test"
        if self.pred:
            self.label = "pred"

        if self.reproc:
            overwrite_conf = input("This may overwrite old dataset files. Are you happy for this to happen? Type 'yes' to confirm. \n")
            # Anything other than an explicit 'yes' cancels the reprocessing request.
            if overwrite_conf != 'yes':
                self.reproc = False

        self.dataset_size = dataset_size
        # The base-class constructor triggers download()/process() when files are missing,
        # so every attribute used by those methods has to be set before this call.
        super().__init__(root, transform, pre_transform, pre_filter)

    @property
    def raw_file_names(self):
        """Raw file(s) expected in raw_dir; download is skipped if they are found."""
        return self.filename

    @property
    def processed_file_names(self):
        """
        Processed files expected in processed_dir; process() is skipped if all are found.

        When reprocessing was requested a placeholder name is returned instead, so that
        the existence check fails and process() runs.
        """
        print("looking for files")
        if self.reproc:
            return "empty.pt"
        return [f'{self.label}_data_{idx}.pt' for idx in range(self.dataset_size)]

    def download(self):
        """Nothing is downloaded; the raw CSV has to be placed in raw_dir by hand."""
        # In the future I will make this call a python file to build the dataset as a csv
        print("No files to download")

    def process(self):
        """
        Convert the first ``dataset_size`` CSV rows into ``Data`` objects and save them.

        Every object carries the raw target ``y`` and ``y_norm``, the target min-max
        scaled to [0, 1] over the selected rows.
        """
        print("processing starting")
        # NOTE(review): one-minute pause kept from the original code; presumably a grace
        # period to interrupt before existing files are overwritten -- TODO confirm.
        time.sleep(60)
        print("reading data")
        self.full_data = pd.read_csv(self.raw_paths[0])
        print("sampling data")
        self.data = self.full_data[0:self.dataset_size]

        # Target range, used for the min-max normalisation below.
        print("creating list of y values")
        all_y_values = self.data['y'].tolist()
        y_max = max(all_y_values)
        y_min = min(all_y_values)
        y_range = y_max - y_min

        # Work through the dataframe in chunks of at most trunc_size rows.
        split_size = math.ceil(self.dataset_size / self.trunc_size)
        self.trunc_data_list = np.array_split(self.data, split_size)

        idx = 0  # running file index across all chunks
        for k in tqdm(range(len(self.trunc_data_list))):
            self.trunc_data = self.trunc_data_list[k]
            for row, feyndiag in tqdm(self.trunc_data.iterrows(), total=self.trunc_data.shape[0]):
                x = self._get_node_features(feyndiag)
                edge_attr = self._get_edge_features(feyndiag)
                edge_index = self._get_adj_list(feyndiag)
                y = self._get_targets(feyndiag)

                # Normalise targets to the interval [0,1]; if every target is identical
                # the range is zero, so fall back to 0 instead of dividing by zero.
                if y_range != 0:
                    y_norm = (y - y_min) / y_range
                else:
                    y_norm = torch.zeros_like(y)

                data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y, y_norm=y_norm)

                start = time.time()
                torch.save(data, osp.join(self.processed_dir, f'{self.label}_data_{idx}.pt'))
                end = time.time()

                # Occasional progress report on how long a single save takes.
                if idx % 10000 == 0:
                    print("save time is: ", end - start)
                idx += 1

    def _get_node_features(self, diagram):
        """
        This will return a tensor of the node feature vectors (which are 1D)
        [Number of Nodes, 1]
        """
        x = ast.literal_eval(diagram.loc['x'])
        return torch.tensor(x, dtype=torch.float).view(-1, 1)

    def _get_edge_features(self, diagram):
        """
        This will return a tensor of the edge feature vectors (which are 11D)
        [Number of Edges, 11]
        """
        attr = ast.literal_eval(diagram.loc['edge_attr'])
        return torch.tensor(attr, dtype=torch.float).view(-1, 11)

    def _get_adj_list(self, diagram):
        """
        This will return a tensor of the adjacency vectors (which are 2D)
        [2, Number of Edges]
        """
        adj_list = ast.literal_eval(diagram.loc['edge_index'])
        return torch.tensor(adj_list, dtype=torch.long).view(2, -1)

    def _get_targets(self, diagram):
        """This will return the target value 'y' of the row as a float tensor."""
        y = diagram.loc['y']
        return torch.tensor(y, dtype=torch.float)

    def len(self):
        """
        Number of graphs in the dataset.

        Returned directly from ``dataset_size``: going through ``processed_file_names``
        would rebuild the whole filename list on every call and would give the length
        of the placeholder string when reprocessing was requested.
        """
        return self.dataset_size

    def get(self, idx):
        """Load the ``Data`` object with index ``idx`` from processed_dir."""
        data = torch.load(osp.join(self.processed_dir, f'{self.label}_data_{idx}.pt'))
        return data

In [3]:
class MyDataset(torch.utils.data.Dataset):
    """
    Thin adapter exposing a FeynmanDataset-like object through the plain
    ``torch.utils.data.Dataset`` protocol.

    The wrapped object has to provide a ``len()`` method and integer indexing.
    """

    def __init__(self, dataset):
        """Keep a reference to the wrapped dataset."""
        self.dataset = dataset

    def __len__(self):
        """Number of items, as reported by the wrapped dataset's ``len()`` method."""
        wrapped = self.dataset
        return wrapped.len()

    def __getitem__(self, idx):
        """Return item ``idx`` of the wrapped dataset."""
        wrapped = self.dataset
        return wrapped[idx]


# **Load the dataset**

In [6]:
# Build the unlabelled ("full") dataset from the raw CSV file.
#drive.mount("/content/gdrive", force_remount=True)
filename = 'QED_data.csv'
full_dataset = FeynmanDataset(
    filename=filename,
    dataset_size=1_800_000,
)

looking for files


KeyboardInterrupt: ignored

In [None]:
# One dataset object per split; the flag decides the label used in the processed file names.
# NOTE(review): FeynmanDataset.process takes the first `dataset_size` rows of the CSV for
# every label, so these splits are built from overlapping rows -- confirm this is intended.
print("Loading datasets...")
train_dataset = FeynmanDataset(dataset_size=1000000, filename=filename, reprocess=False, train=True)
test_dataset = FeynmanDataset(dataset_size=100, filename=filename, reprocess=False, test=True)
val_dataset = FeynmanDataset(dataset_size=50, filename=filename, reprocess=False, val=True)
pred_dataset = FeynmanDataset(dataset_size=10, filename=filename, reprocess=False, pred=True)
print("Finished all!")

In [None]:
# Random permutation of the 1,800,000 row indices.
# np.random.permutation shuffles in compiled code; the cell deliberately ends on an
# assignment so the 1.8M-element result is not dumped into the notebook output.
a = np.random.permutation(1800000)

In [37]:
# Preview the first ten entries of the shuffled index array
a[0:10]

array([1421008,  959689,  696138,  234761, 1558025,  556954,  224539,
       1727901, 1030543, 1711408])

In [None]:
# Batch the graph datasets. Only the training loader reshuffles its data.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=2,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=64,
    num_workers=2,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=64,
    num_workers=2,
)
# keep this batch_size as one to get predictions to work
pred_loader = DataLoader(
    dataset=pred_dataset,
    batch_size=1,
)

# **Some loss functions**

Defining some loss functions.

In [None]:
class LogCoshLoss(torch.nn.Module):
    """Log-cosh loss: the mean of ``log(cosh(y_t - y_prime_t))`` over all elements."""

    def __init__(self):
        super().__init__()

    def forward(self, y_t, y_prime_t):
        """
        y_t, y_prime_t = tensors of matching (broadcastable) shape
        returns = scalar tensor with the mean log-cosh of their difference
        """
        abs_err = torch.abs(y_t - y_prime_t)
        # Evaluating cosh directly overflows to inf for large errors. Use the identity
        #   log(cosh(x)) = |x| + log(1 + exp(-2|x|)) - log(2)
        # instead, which stays finite for any finite error.
        log_cosh = abs_err + torch.nn.functional.softplus(-2.0 * abs_err) - math.log(2.0)
        return torch.mean(log_cosh)


In [None]:
class RMSLELoss(nn.Module):
    """Root mean squared logarithmic error: ``sqrt(MSE(log(1 + pred), log(1 + actual)))``."""

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, pred, actual):
        """
        pred, actual = tensors of matching shape with values greater than -1
        returns = scalar tensor with the RMSLE
        """
        # log1p(x) evaluates log(1 + x) without first rounding 1 + x, so values much
        # smaller than 1 are not collapsed to log(1) = 0.
        return torch.sqrt(self.mse(torch.log1p(pred), torch.log1p(actual)))

In [None]:
class RMSELoss(nn.Module):
    """Root mean squared error: the square root of ``nn.MSELoss``."""

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, pred, actual):
        """Return ``sqrt(mean((pred - actual) ** 2))`` as a scalar tensor."""
        mean_sq_err = self.mse(pred, actual)
        return torch.sqrt(mean_sq_err)

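# **Training code and GNN model using Lightning Module**

* Lightning training module
* The convolution layer type is selected by name; the training functions below use the graph attention layer (`"GAT"`), and `"Trans"` selects the Transformer convolution layer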

In [None]:
class FeynModel(pl.LightningModule):
    """
    Graph-level regression model for Feynman diagrams as a Lightning module.

    A stack of graph convolution layers (type chosen by ``layer_name``) is interleaved
    with TopK pooling. After every pooling step a max+mean readout of the graph is taken;
    the readouts are summed, concatenated with the momentum columns of the edge features
    and passed through three linear layers ending in a sigmoid. The training target is
    the normalised value stored under ``'y_norm'`` in each batch.
    """

    def __init__(self, c_in, c_out, layer_name, model_params, filename='QED_data.csv'):
        """
        c_in = channels in (feature dimensions, e.g. RGB is 3)
        c_out = channels out (target dimension, e.g. classification is 1)
        layer_name = key into the module-level ``gnn_layer_by_name`` dictionary
        model_params = dictionary of hyperparameters with "model_..." keys
        filename = name of the raw data file (only stored on the module)
        """
        super().__init__()
        self.filename = filename
        self.batch_size = model_params["model_batch_size"]
        embedding_size = model_params["model_embedding_size"]
        n_heads = model_params["model_attention_heads"]
        self.n_layers = model_params["model_layers"]
        dropout_rate = model_params["model_dropout_rate"]
        top_k_ratio = model_params["model_top_k_ratio"]
        self.top_k_every_n = model_params["model_top_k_every_n"]
        dense_neurons = model_params["model_dense_neurons"]
        # remove momenta from edge_attr: the 3 momentum columns bypass the convolutions
        self.edge_dim = model_params["model_edge_dim"] - 3
        edge_dim = self.edge_dim
        edge_num = 5  # need to update this

        gnn_layer = gnn_layer_by_name[layer_name]
        self.lr = model_params["model_learning_rate"]
        self.weight_decay = model_params["model_weight_decay"]
        self.lin_dropout_prob = model_params["model_lin_dropout_prob"]
        self.save_hyperparameters()
        self.loss_fn = MSELoss()

        self.conv_layers = ModuleList([])
        self.transf_layers = ModuleList([])
        self.pooling_layers = ModuleList([])
        self.bn_layers = ModuleList([])

        # Transformation layer
        self.conv1 = gnn_layer(in_channels=c_in,
                               out_channels=embedding_size,
                               heads=n_heads,
                               dropout=dropout_rate,
                               edge_dim=edge_dim,
                               )

        # Projects the concatenated attention heads back to embedding_size
        self.transf1 = Linear(embedding_size * n_heads, embedding_size)
        self.bn1 = BatchNorm1d(embedding_size)

        # Other layers
        for i in range(self.n_layers):
            self.conv_layers.append(gnn_layer(embedding_size,
                                              embedding_size,
                                              heads=n_heads,
                                              dropout=dropout_rate,
                                              edge_dim=edge_dim,
                                              ))
            self.transf_layers.append(Linear(embedding_size * n_heads, embedding_size))
            self.bn_layers.append(BatchNorm1d(embedding_size))
            # One pooling layer per pooling step, in the order forward() uses them.
            if self._pools_after_layer(i):
                self.pooling_layers.append(TopKPooling(embedding_size, ratio=top_k_ratio))

        # Linear layers.
        # linear0 takes the max+mean readout (2 * embedding_size) plus the flattened momenta.
        # NOTE(review): 3*2*edge_num assumes every graph has exactly 2*edge_num edge rows
        # with 3 momentum components each -- confirm against the dataset.
        self.linear0 = Linear(embedding_size * 2 + 3 * 2 * edge_num, embedding_size * 2)
        self.linear1 = Linear(embedding_size * 2, dense_neurons)
        self.linear2 = Linear(dense_neurons, c_out)

        # Ideas: could use super node instead of topKPooling and linear layers,
        # or more topK pooling rather than linear layers

    def _pools_after_layer(self, i):
        """True if layer ``i`` is followed by pooling: every top_k_every_n-th layer and always the last one."""
        return i % self.top_k_every_n == 0 or i == self.n_layers - 1

    def forward(self, x, edge_index, edge_attr, batch_index):
        """
        x = node features
        edge_index = adjacency list
        edge_attr = edge features; the first ``edge_dim`` columns feed the convolutions,
            the following 3 columns are the momenta
        batch_index = graph id of every node
        returns = tensor with one row of c_out sigmoid outputs per graph
        """
        num_graphs = int(batch_index.max()) + 1

        # Remove momenta from edge features and keep them as one flat row per graph
        p = edge_attr[:, self.edge_dim:self.edge_dim + 3]
        p = p.reshape(num_graphs, -1)
        edge_attr = edge_attr[:, :self.edge_dim]

        # Initial transformation
        x = self.conv1(x, edge_index, edge_attr)
        x = F.leaky_relu(self.transf1(x))
        x = self.bn1(x)

        # Holds the intermediate graph representations
        global_representation = []

        pool_idx = 0  # position of the next pooling layer to use
        for i in range(self.n_layers):
            x = self.conv_layers[i](x, edge_index, edge_attr)
            x = F.leaky_relu(self.transf_layers[i](x))
            x = self.bn_layers[i](x)
            # Always aggregate last layer
            if self._pools_after_layer(i):
                x, edge_index, edge_attr, batch_index, _, _ = self.pooling_layers[pool_idx](
                    x, edge_index, edge_attr, batch_index
                    )
                pool_idx += 1
                # Add current representation
                global_representation.append(torch.cat([gmp(x, batch_index), gap(x, batch_index)], dim=1))

        x = sum(global_representation)

        # add momenta on
        x = torch.cat((x, p), 1)

        # Output block
        x = F.leaky_relu(self.linear0(x))
        x = F.dropout(x, p=self.lin_dropout_prob, training=self.training)
        x = F.leaky_relu(self.linear1(x))
        x = F.dropout(x, p=self.lin_dropout_prob, training=self.training)
        x = torch.sigmoid(self.linear2(x))

        return x

    def _shared_step(self, batch):
        """Run the model on ``batch``; return the loss against 'y_norm' and the number of graphs."""
        x, edge_index, edge_attr, y = batch['x'], batch['edge_index'], batch['edge_attr'], batch['y_norm']
        batch_index = batch['batch']
        y_hat = self(x, edge_index, edge_attr, batch_index)
        loss = self.loss_fn(y_hat, y.view(-1, 1))
        # Plain int, so the logger does not receive a tensor as batch size.
        num_graphs = int(batch_index.max()) + 1
        return loss, num_graphs

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss (logged per step)."""
        loss, num_graphs = self._shared_step(batch)
        self.log("train_loss", loss, prog_bar=True, on_step=True, on_epoch=False, batch_size=num_graphs)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss (logged per epoch)."""
        loss, num_graphs = self._shared_step(batch)
        self.log("val_loss", loss, prog_bar=True, on_step=False, on_epoch=True, batch_size=num_graphs)
        return loss

    def test_step(self, batch, batch_idx):
        """Compute and log the test loss (logged per step)."""
        loss, num_graphs = self._shared_step(batch)
        self.log("test_loss", loss, prog_bar=True, on_step=True, on_epoch=False, batch_size=num_graphs)
        return loss

    def predict_step(self, batch, batch_idx):
        """
        Return (prediction, normalised target) as Python floats.

        Uses ``.item()``, so it only works for batches that contain a single graph.
        """
        x, edge_index, edge_attr, y = batch['x'], batch['edge_index'], batch['edge_attr'], batch['y_norm']
        batch_index = batch['batch']
        y_hat = self(x, edge_index, edge_attr, batch_index)
        return y_hat.item(), y.item()

    def configure_optimizers(self):
        """Adam with the learning rate and weight decay from the hyperparameters."""
        return torch.optim.Adam(self.parameters(),
                                lr=self.lr,
                                weight_decay=self.weight_decay,
                                )


# **Tuning the hyperparameters with Ray Tune**

In [None]:
def train_feyn_no_tune(params, num_gpus, num_epochs=10):
    """
    Train, validate and test the Feynman GNN with one fixed set of hyperparameters.

    params = hyperparameter dictionary in the "model_..." notation; every value is a
        sequence whose first element is used
    num_gpus = number of GPUs, rounded up to a whole number
    num_epochs = maximum number of training epochs
    returns = (model, trainer)

    Uses the module-level train_loader, val_loader and test_loader.
    """
    # need to make layer type a hyperparameter
    model_params = {}
    for key, values in params.items():
        if key.startswith("model_"):
            model_params[key] = values[0]

    model = FeynModel(
        c_in=-1,  # train_dataset.num_node_features
        c_out=1,  # train_dataset.num_classes
        layer_name="GAT",
        model_params=model_params,
    )

    tb_logger = TensorBoardLogger(CHECKPOINT_PATH, name="tb_logs")
    # Stop once val_loss has not improved for 10 validation checks
    early_stopping = EarlyStopping('val_loss', patience=10)
    trainer = pl.Trainer(
        logger=tb_logger,
        max_epochs=num_epochs,
        gpus=math.ceil(num_gpus),
        log_every_n_steps=10,
        callbacks=[early_stopping],
    )

    trainer.fit(model, train_loader, val_loader)
    trainer.validate(model, val_loader)
    trainer.test(model, test_loader)
    return model, trainer


In [None]:
def train_feyn_tune(config, num_epochs=10, num_gpus=0, checkpoint_dir=None):
    """
    Single training run for Ray Tune; called by the tuning function for every trial.

    config = hyperparameters of this trial ("model_..." keys), passed to FeynModel as is
    num_epochs = maximum number of training epochs
    num_gpus = number of GPUs, rounded up to a whole number
    checkpoint_dir = accepted but not used

    Uses the module-level train_loader and val_loader.
    """
    model = FeynModel(
        c_in=-1,  # train_dataset.num_node_features
        c_out=1,  # train_dataset.num_classes
        layer_name="GAT",
        model_params=config,
    )

    # Log straight into the directory of the current trial
    trial_logger = TensorBoardLogger(save_dir=tune.get_trial_dir(), name="", version=".")
    # Report val_loss to Tune under the name "loss" after every validation run
    report_to_tune = TuneReportCallback({"loss": "val_loss"}, on="validation_end")

    trainer = pl.Trainer(
        logger=trial_logger,
        max_epochs=num_epochs,
        gpus=math.ceil(num_gpus),
        log_every_n_steps=10,
        callbacks=[report_to_tune],
    )
    trainer.fit(model, train_loader, val_loader)


def tune_feyn_asha(config, gpus_per_trial=0, num_epochs=10, num_samples=10):
    """
    Hyperparameter search with Ray Tune and the ASHA scheduler.

    config = Ray Tune search space with "model_..." keys
    gpus_per_trial = GPUs reserved for each trial
    num_epochs = maximum number of epochs per trial
    num_samples = number of hyperparameter samples to try
    returns = the analysis object from ``tune.run`` (its best configuration is also
        printed), so the caller can inspect or reuse the results
    """
    scheduler = ASHAScheduler(
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=[
                           "model_batch_size",
                           "model_weight_decay",
                           "model_learning_rate",
                           "model_embedding_size",
                           "model_attention_heads",
                           "model_layers",
                           "model_dropout_rate",
                           "model_top_k_ratio",
                           "model_top_k_every_n",
                           "model_dense_neurons",
                           "model_edge_dim",
                           "model_lin_dropout_prob"],
        metric_columns=["loss", "training_iteration"])

    # Bind the fixed arguments; Tune supplies `config` for every trial
    train_fn_with_parameters = tune.with_parameters(train_feyn_tune,
                                                    num_epochs=num_epochs,
                                                    num_gpus=gpus_per_trial,
                                                    )

    resources_per_trial = {"cpu": 1, "gpu": gpus_per_trial}

    # NOTE(review): the experiment name still says "mnist"; it is kept unchanged because
    # it is part of the results path of existing runs.
    analysis = tune.run(train_fn_with_parameters,
        resources_per_trial=resources_per_trial,
        metric="loss",
        mode="min",
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_mnist_asha")

    print("Best hyperparameters found were: ", analysis.best_config)
    return analysis


In [None]:
# Report the library versions and whether a GPU can be used
print(f"Torch version: {torch.__version__}")
print(f"Cuda available: {torch.cuda.is_available()}")
print(f"Torch geometric version: {torch_geometric.__version__}")

# Number of GPUs handed to the trainer: one if CUDA is available, otherwise none
gpus = 1 if torch.cuda.is_available() else 0

Torch version: 1.10.0+cu111
Cuda available: False
Torch geometric version: 2.0.3


# **Create layer dictionary and Hyperparameters**


In [None]:
#layer name dictionary
gnn_layer_by_name = {
    "GCN": geom_nn.GCNConv,
    "GAT": geom_nn.GATConv,
    "GraphConv": geom_nn.GraphConv,
    "NNConv": geom_nn.NNConv,
    "RGCN": geom_nn.RGCNConv,
    "Trans": geom_nn.TransformerConv
}

#Hyperparameters to use if not tuning
HYPERPARAMETERS = {
    "model_batch_size": [80],
    "model_weight_decay": [0.000001],
    "model_learning_rate": [0.0001],
    "model_embedding_size": [16],
    "model_attention_heads": [4],
    "model_layers": [5],
    "model_dropout_rate": [0.6],
    "model_top_k_ratio": [0.5],
    "model_top_k_every_n": [1],
    "model_dense_neurons": [4],
    "model_edge_dim": [11],
    "model_lin_dropout_prob": [0.8],
    }

# Search space handed to Ray Tune. Every entry except the learning rate is
# pinned to a single candidate through tune.choice; the learning rate is
# sampled log-uniformly between 1e-4 and 1e-1. Key order matches the
# fixed-run HYPERPARAMETERS dictionary.
config = {
    **{
        name: tune.choice([value])
        for name, value in (
            ("model_batch_size", 64),
            ("model_weight_decay", 1e-6),
        )
    },
    "model_learning_rate": tune.loguniform(1e-4, 1e-1),
    **{
        name: tune.choice([value])
        for name, value in (
            ("model_embedding_size", 4),
            ("model_attention_heads", 4),
            ("model_layers", 3),
            ("model_dropout_rate", 0.5),
            ("model_top_k_ratio", 0.2),
            ("model_top_k_every_n", 1),
            ("model_dense_neurons", 4),
            ("model_edge_dim", 11),
            ("model_lin_dropout_prob", 0.3),
        )
    },
}

In [None]:
# Train once with the fixed HYPERPARAMETERS (no tuning) and keep the returned model and trainer.
# NOTE(review): the literal 200 is presumably the epoch budget — confirm against
# the signature of train_feyn_no_tune.
model, trainer = train_feyn_no_tune(HYPERPARAMETERS,gpus,200)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  "A layer with UninitializedParameter was found. "

   | Name           | Type        | Params
------------------------------------------------
0  | loss_fn        | MSELoss     | 0     
1  | conv_layers    | ModuleList  | 9.0 K 
2  | transf_layers  | ModuleList  | 5.2 K 
3  | pooling_layers | ModuleList  | 80    
4  | bn_layers      | ModuleList  | 160   
5  | conv1          | GATConv     | 768   
6  | transf1        | Linear      | 1.0 K 
7  | bn1            | BatchNorm1d | 32    
8  | linear0        | Linear      | 2.0 K 
9  | linear1        | Linear      | 132   
10 | linear2        | Linear      | 5     
------------------------------------------------
18.4 K    Trainable params
0         Non-trainable params
18.4 K    Total params
0.074     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 3125819298
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_loss': 0.4919202923774719}
--------------------------------------------------------------------------------


Testing: 0it [00:00, ?it/s]

In [None]:
# Display the weight parameter of the trained model's `linear0` layer.
model.linear0.weight

In [None]:
# Start the Ray Tune search over `config`, requesting `gpus` GPUs per trial.
tune_feyn_asha(config, gpus_per_trial=gpus)

== Status ==
Current time: 2022-03-10 16:04:13 (running for 00:00:00.29)
Memory usage on this node: 2.2/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 1.0/2 CPUs, 0/0 GPUs, 0.0/6.24 GiB heap, 0.0/3.12 GiB objects
Result logdir: /root/ray_results/tune_mnist_asha
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+-----------------------------+----------+-----------------+--------------------+----------------------+-----------------------+------------------------+-------------------------+----------------+----------------------+---------------------+-----------------------+-----------------------+------------------+--------------------------+
| Trial name                  | status   | loc             |   model_batch_size |   model_weight_decay |   model_learning_rate |   model_embedding_size |   model_attention_heads |   model_layers |   model_dropout_rate |   model_top_k_ratio |   model_top

[2m[36m(train_feyn_tune pid=1155)[0m GPU available: False, used: False
[2m[36m(train_feyn_tune pid=1155)[0m TPU available: False, using: 0 TPU cores
[2m[36m(train_feyn_tune pid=1155)[0m IPU available: False, using: 0 IPUs
[2m[36m(train_feyn_tune pid=1155)[0m   "The `on_keyboard_interrupt` callback hook was deprecated in v1.5 and will be removed in v1.7."
[2m[36m(train_feyn_tune pid=1155)[0m   "A layer with UninitializedParameter was found. "
[2m[36m(train_feyn_tune pid=1155)[0m 
[2m[36m(train_feyn_tune pid=1155)[0m    | Name           | Type        | Params
[2m[36m(train_feyn_tune pid=1155)[0m ------------------------------------------------
[2m[36m(train_feyn_tune pid=1155)[0m 0  | loss_fn        | MSELoss     | 0     
[2m[36m(train_feyn_tune pid=1155)[0m 1  | conv_layers    | ModuleList  | 768   
[2m[36m(train_feyn_tune pid=1155)[0m 2  | transf_layers  | ModuleList  | 204   
[2m[36m(train_feyn_tune pid=1155)[0m 3  | pooling_layers | ModuleList  | 1

== Status ==
Current time: 2022-03-10 16:04:18 (running for 00:00:05.34)
Memory usage on this node: 2.5/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 0/0 GPUs, 0.0/6.24 GiB heap, 0.0/3.12 GiB objects
Result logdir: /root/ray_results/tune_mnist_asha
Number of trials: 10/10 (8 PENDING, 2 RUNNING)
+-----------------------------+----------+-----------------+--------------------+----------------------+-----------------------+------------------------+-------------------------+----------------+----------------------+---------------------+-----------------------+-----------------------+------------------+--------------------------+
| Trial name                  | status   | loc             |   model_batch_size |   model_weight_decay |   model_learning_rate |   model_embedding_size |   model_attention_heads |   model_layers |   model_dropout_rate |   model_top_k_ratio |   model_top

[2m[36m(train_feyn_tune pid=1154)[0m   "A layer with UninitializedParameter was found. "
[2m[36m(train_feyn_tune pid=1154)[0m 
[2m[36m(train_feyn_tune pid=1154)[0m    | Name           | Type        | Params
[2m[36m(train_feyn_tune pid=1154)[0m ------------------------------------------------
[2m[36m(train_feyn_tune pid=1154)[0m 0  | loss_fn        | MSELoss     | 0     
[2m[36m(train_feyn_tune pid=1154)[0m 1  | conv_layers    | ModuleList  | 768   
[2m[36m(train_feyn_tune pid=1154)[0m 2  | transf_layers  | ModuleList  | 204   
[2m[36m(train_feyn_tune pid=1154)[0m 3  | pooling_layers | ModuleList  | 12    
[2m[36m(train_feyn_tune pid=1154)[0m 4  | bn_layers      | ModuleList  | 24    
[2m[36m(train_feyn_tune pid=1154)[0m 5  | conv1          | GATConv     | 192   
[2m[36m(train_feyn_tune pid=1154)[0m 6  | transf1        | Linear      | 68    
[2m[36m(train_feyn_tune pid=1154)[0m 7  | bn1            | BatchNorm1d | 8     
[2m[36m(train_feyn_tune pid=1

[2m[36m(train_feyn_tune pid=1154)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/1 [00:00<?, ?it/s]


2022-03-10 16:04:24,798	ERROR trial_runner.py:920 -- Trial train_feyn_tune_ba3ed_00001: Error processing event.
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/trial_runner.py", line 886, in _process_trial
    results = self.trial_executor.fetch_result(trial)
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/ray_trial_executor.py", line 675, in fetch_result
    result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
  File "/usr/local/lib/python3.7/dist-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/ray/worker.py", line 1763, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(TuneError): [36mray::ImplicitFunc.train()[39m (pid=1154, ip=172.28.0.2, repr=train_feyn_tune)
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/trainable.py", line 319, in train
    result = self.step()
  File "/usr/local/lib/python3

Result for train_feyn_tune_ba3ed_00001:
  date: 2022-03-10_16-04-21
  experiment_id: a855dffaa3dc446b9f2b087ef945401d
  hostname: 5fbc48a868f9
  node_ip: 172.28.0.2
  pid: 1154
  timestamp: 1646928261
  trial_id: ba3ed_00001
  
== Status ==
Current time: 2022-03-10 16:04:29 (running for 00:00:16.78)
Memory usage on this node: 3.0/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 1.0/2 CPUs, 0/0 GPUs, 0.0/6.24 GiB heap, 0.0/3.12 GiB objects
Result logdir: /root/ray_results/tune_mnist_asha
Number of trials: 10/10 (2 ERROR, 7 PENDING, 1 RUNNING)
+-----------------------------+----------+-----------------+--------------------+----------------------+-----------------------+------------------------+-------------------------+----------------+----------------------+---------------------+-----------------------+-----------------------+------------------+--------------------------+
| Trial name    

[2m[36m(train_feyn_tune pid=1276)[0m GPU available: False, used: False
[2m[36m(train_feyn_tune pid=1276)[0m TPU available: False, using: 0 TPU cores
[2m[36m(train_feyn_tune pid=1276)[0m IPU available: False, using: 0 IPUs
[2m[36m(train_feyn_tune pid=1276)[0m   "The `on_keyboard_interrupt` callback hook was deprecated in v1.5 and will be removed in v1.7."
[2m[36m(train_feyn_tune pid=1276)[0m   "A layer with UninitializedParameter was found. "
[2m[36m(train_feyn_tune pid=1276)[0m 
[2m[36m(train_feyn_tune pid=1276)[0m    | Name           | Type        | Params
[2m[36m(train_feyn_tune pid=1276)[0m ------------------------------------------------
[2m[36m(train_feyn_tune pid=1276)[0m 0  | loss_fn        | MSELoss     | 0     
[2m[36m(train_feyn_tune pid=1276)[0m 1  | conv_layers    | ModuleList  | 768   
[2m[36m(train_feyn_tune pid=1276)[0m 2  | transf_layers  | ModuleList  | 204   
[2m[36m(train_feyn_tune pid=1276)[0m 3  | pooling_layers | ModuleList  | 1

[2m[36m(train_feyn_tune pid=1276)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/1 [00:00<?, ?it/s]
== Status ==
Current time: 2022-03-10 16:04:34 (running for 00:00:21.81)
Memory usage on this node: 3.0/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 0/0 GPUs, 0.0/6.24 GiB heap, 0.0/3.12 GiB objects
Result logdir: /root/ray_results/tune_mnist_asha
Number of trials: 10/10 (2 ERROR, 6 PENDING, 2 RUNNING)
+-----------------------------+----------+-----------------+--------------------+----------------------+-----------------------+------------------------+-------------------------+----------------+----------------------+---------------------+-----------------------+-----------------------+------------------+--------------------------+
| Trial name                  | status   | loc             |   model_batch_size |   model_weight

[2m[36m(train_feyn_tune pid=1304)[0m   "A layer with UninitializedParameter was found. "
[2m[36m(train_feyn_tune pid=1304)[0m 
[2m[36m(train_feyn_tune pid=1304)[0m    | Name           | Type        | Params
[2m[36m(train_feyn_tune pid=1304)[0m ------------------------------------------------
[2m[36m(train_feyn_tune pid=1304)[0m 0  | loss_fn        | MSELoss     | 0     
[2m[36m(train_feyn_tune pid=1304)[0m 1  | conv_layers    | ModuleList  | 768   
[2m[36m(train_feyn_tune pid=1304)[0m 2  | transf_layers  | ModuleList  | 204   
[2m[36m(train_feyn_tune pid=1304)[0m 3  | pooling_layers | ModuleList  | 12    
[2m[36m(train_feyn_tune pid=1304)[0m 4  | bn_layers      | ModuleList  | 24    
[2m[36m(train_feyn_tune pid=1304)[0m 5  | conv1          | GATConv     | 192   
[2m[36m(train_feyn_tune pid=1304)[0m 6  | transf1        | Linear      | 68    
[2m[36m(train_feyn_tune pid=1304)[0m 7  | bn1            | BatchNorm1d | 8     
[2m[36m(train_feyn_tune pid=1

[2m[36m(train_feyn_tune pid=1304)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/1 [00:00<?, ?it/s]
Result for train_feyn_tune_ba3ed_00003:
  date: 2022-03-10_16-04-34
  experiment_id: dda33b23f3a945d284a3b214dde0ceba
  hostname: 5fbc48a868f9
  node_ip: 172.28.0.2
  pid: 1304
  timestamp: 1646928274
  trial_id: ba3ed_00003
  
== Status ==
Current time: 2022-03-10 16:04:40 (running for 00:00:27.28)
Memory usage on this node: 2.9/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 1.0/2 CPUs, 0/0 GPUs, 0.0/6.24 GiB heap, 0.0/3.12 GiB objects
Result logdir: /root/ray_results/tune_mnist_asha
Number of trials: 10/10 (4 ERROR, 5 PENDING, 1 RUNNING)
+-----------------------------+----------+-----------------+--------------------+----------------------+-----------------------+------------------------+-------------------------+----------------+--------

[2m[36m(train_feyn_tune pid=1354)[0m GPU available: False, used: False
[2m[36m(train_feyn_tune pid=1354)[0m TPU available: False, using: 0 TPU cores
[2m[36m(train_feyn_tune pid=1354)[0m IPU available: False, using: 0 IPUs
[2m[36m(train_feyn_tune pid=1354)[0m   "The `on_keyboard_interrupt` callback hook was deprecated in v1.5 and will be removed in v1.7."


== Status ==
Current time: 2022-03-10 16:04:45 (running for 00:00:32.35)
Memory usage on this node: 2.9/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 0/0 GPUs, 0.0/6.24 GiB heap, 0.0/3.12 GiB objects
Result logdir: /root/ray_results/tune_mnist_asha
Number of trials: 10/10 (4 ERROR, 4 PENDING, 2 RUNNING)
+-----------------------------+----------+-----------------+--------------------+----------------------+-----------------------+------------------------+-------------------------+----------------+----------------------+---------------------+-----------------------+-----------------------+------------------+--------------------------+
| Trial name                  | status   | loc             |   model_batch_size |   model_weight_decay |   model_learning_rate |   model_embedding_size |   model_attention_heads |   model_layers |   model_dropout_rate |   model_top_k_ratio |   

[2m[36m(train_feyn_tune pid=1354)[0m   "A layer with UninitializedParameter was found. "
[2m[36m(train_feyn_tune pid=1354)[0m 
[2m[36m(train_feyn_tune pid=1354)[0m    | Name           | Type        | Params
[2m[36m(train_feyn_tune pid=1354)[0m ------------------------------------------------
[2m[36m(train_feyn_tune pid=1354)[0m 0  | loss_fn        | MSELoss     | 0     
[2m[36m(train_feyn_tune pid=1354)[0m 1  | conv_layers    | ModuleList  | 768   
[2m[36m(train_feyn_tune pid=1354)[0m 2  | transf_layers  | ModuleList  | 204   
[2m[36m(train_feyn_tune pid=1354)[0m 3  | pooling_layers | ModuleList  | 12    
[2m[36m(train_feyn_tune pid=1354)[0m 4  | bn_layers      | ModuleList  | 24    
[2m[36m(train_feyn_tune pid=1354)[0m 5  | conv1          | GATConv     | 192   
[2m[36m(train_feyn_tune pid=1354)[0m 6  | transf1        | Linear      | 68    
[2m[36m(train_feyn_tune pid=1354)[0m 7  | bn1            | BatchNorm1d | 8     
[2m[36m(train_feyn_tune pid=1

Result for train_feyn_tune_ba3ed_00004:
  date: 2022-03-10_16-04-41
  experiment_id: 9ad1488b08224b0bb61ba208b1667917
  hostname: 5fbc48a868f9
  node_ip: 172.28.0.2
  pid: 1354
  timestamp: 1646928281
  trial_id: ba3ed_00004
  


2022-03-10 16:04:48,517	ERROR trial_runner.py:920 -- Trial train_feyn_tune_ba3ed_00005: Error processing event.
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/trial_runner.py", line 886, in _process_trial
    results = self.trial_executor.fetch_result(trial)
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/ray_trial_executor.py", line 675, in fetch_result
    result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
  File "/usr/local/lib/python3.7/dist-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/ray/worker.py", line 1763, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(TuneError): [36mray::ImplicitFunc.train()[39m (pid=1398, ip=172.28.0.2, repr=train_feyn_tune)
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/trainable.py", line 319, in train
    result = self.step()
  File "/usr/local/lib/python3

Result for train_feyn_tune_ba3ed_00005:
  date: 2022-03-10_16-04-45
  experiment_id: 5644c6ad3124404da9b81c6bdad21837
  hostname: 5fbc48a868f9
  node_ip: 172.28.0.2
  pid: 1398
  timestamp: 1646928285
  trial_id: ba3ed_00005
  
[2m[36m(train_feyn_tune pid=1398)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/1 [00:00<?, ?it/s]
== Status ==
Current time: 2022-03-10 16:04:50 (running for 00:00:38.24)
Memory usage on this node: 3.3/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 1.0/2 CPUs, 0/0 GPUs, 0.0/6.24 GiB heap, 0.0/3.12 GiB objects
Result logdir: /root/ray_results/tune_mnist_asha
Number of trials: 10/10 (6 ERROR, 3 PENDING, 1 RUNNING)
+-----------------------------+----------+-----------------+--------------------+----------------------+-----------------------+------------------------+-------------------------+----------------+--------

[2m[36m(train_feyn_tune pid=1460)[0m GPU available: False, used: False
[2m[36m(train_feyn_tune pid=1460)[0m TPU available: False, using: 0 TPU cores
[2m[36m(train_feyn_tune pid=1460)[0m IPU available: False, using: 0 IPUs
[2m[36m(train_feyn_tune pid=1460)[0m   "The `on_keyboard_interrupt` callback hook was deprecated in v1.5 and will be removed in v1.7."
[2m[36m(train_feyn_tune pid=1460)[0m   "A layer with UninitializedParameter was found. "
[2m[36m(train_feyn_tune pid=1460)[0m 
[2m[36m(train_feyn_tune pid=1460)[0m    | Name           | Type        | Params
[2m[36m(train_feyn_tune pid=1460)[0m ------------------------------------------------
[2m[36m(train_feyn_tune pid=1460)[0m 0  | loss_fn        | MSELoss     | 0     
[2m[36m(train_feyn_tune pid=1460)[0m 1  | conv_layers    | ModuleList  | 768   
[2m[36m(train_feyn_tune pid=1460)[0m 2  | transf_layers  | ModuleList  | 204   
[2m[36m(train_feyn_tune pid=1460)[0m 3  | pooling_layers | ModuleList  | 1

[2m[36m(train_feyn_tune pid=1460)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/1 [00:00<?, ?it/s]
== Status ==
Current time: 2022-03-10 16:04:56 (running for 00:00:43.29)
Memory usage on this node: 2.9/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 0/0 GPUs, 0.0/6.24 GiB heap, 0.0/3.12 GiB objects
Result logdir: /root/ray_results/tune_mnist_asha
Number of trials: 10/10 (6 ERROR, 2 PENDING, 2 RUNNING)
+-----------------------------+----------+-----------------+--------------------+----------------------+-----------------------+------------------------+-------------------------+----------------+----------------------+---------------------+-----------------------+-----------------------+------------------+--------------------------+
| Trial name                  | status   | loc             |   model_batch_size |   model_weight

2022-03-10 16:04:59,579	ERROR trial_runner.py:920 -- Trial train_feyn_tune_ba3ed_00007: Error processing event.
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/trial_runner.py", line 886, in _process_trial
    results = self.trial_executor.fetch_result(trial)
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/ray_trial_executor.py", line 675, in fetch_result
    result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
  File "/usr/local/lib/python3.7/dist-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/ray/worker.py", line 1763, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(TuneError): [36mray::ImplicitFunc.train()[39m (pid=1493, ip=172.28.0.2, repr=train_feyn_tune)
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/trainable.py", line 319, in train
    result = self.step()
  File "/usr/local/lib/python3

Result for train_feyn_tune_ba3ed_00007:
  date: 2022-03-10_16-04-56
  experiment_id: 11a545c9c9dc443fb2b745a65bebedc1
  hostname: 5fbc48a868f9
  node_ip: 172.28.0.2
  pid: 1493
  timestamp: 1646928296
  trial_id: ba3ed_00007
  
[2m[36m(train_feyn_tune pid=1493)[0m Validation sanity check: 0it [00:00, ?it/s]Validation sanity check:   0%|          | 0/1 [00:00<?, ?it/s]
== Status ==
Current time: 2022-03-10 16:05:01 (running for 00:00:48.29)
Memory usage on this node: 3.2/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/2 CPUs, 0/0 GPUs, 0.0/6.24 GiB heap, 0.0/3.12 GiB objects
Result logdir: /root/ray_results/tune_mnist_asha
Number of trials: 10/10 (8 ERROR, 2 PENDING)
+-----------------------------+----------+-----------------+--------------------+----------------------+-----------------------+------------------------+-------------------------+----------------+---------------------

# **Predict with last model**

In [None]:
#out = trainer.predict(model, dataloaders=pred_loader)
#print(out)

# **TensorBoard Logs and running training**


In [None]:
# Launch TensorBoard on the Lightning logs stored under the mounted Google Drive.
%tensorboard --logdir /content/gdrive/MyDrive/Part_III_Project/saved_models/lightning_logs