<a href="https://colab.research.google.com/github/akanksha-ahuja/fcc-final-notebooks/blob/main/fcc_experiment_2_x_model_GCNX_2_hyperparameters_dropout_0_1_0_2_0_3_0_4_0_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libs

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import timeit
import random
import torch.nn.functional as F
import itertools
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from scipy.spatial import distance
from sklearn.neighbors import NearestNeighbors
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Install required packages.
!pip install -q torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install -q torch-sparse -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install -q torch-geometric
!pip install graphlime
from torch_geometric.data import Data, DataLoader
from graphlime import GraphLIME
from torch_geometric.utils import to_networkx

[K     |████████████████████████████████| 3.0 MB 348 kB/s 
[K     |████████████████████████████████| 1.6 MB 242 kB/s 
[K     |████████████████████████████████| 222 kB 4.3 MB/s 
[K     |████████████████████████████████| 376 kB 35.3 MB/s 
[K     |████████████████████████████████| 45 kB 3.3 MB/s 
[?25h  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone
Collecting graphlime
  Downloading graphlime-1.2.0.tar.gz (3.3 kB)
Building wheels for collected packages: graphlime
  Building wheel for graphlime (setup.py) ... [?25l[?25hdone
  Created wheel for graphlime: filename=graphlime-1.2.0-py3-none-any.whl size=2616 sha256=ccc1c1d83f8df60c35cff0a883e2effde1987ab111cea483c56e119af98d948d
  Stored in directory: /root/.cache/pip/wheels/33/29/94/9835c557e2def18b58369cda0032935a3263acfa9266aaeb5d
Successfully built graphlime
Installing collected packages: graphlime
Successfully installed graphlime-1.2.0


In [None]:
from torch_geometric.nn import GCNConv, TAGConv, SAGEConv, ChebConv
from torch_geometric.nn import GATConv
from torch_geometric.nn import GINConv
from torch_geometric.nn import JumpingKnowledge, GCN2Conv
from torch.nn import Sequential, Linear, BatchNorm1d, ReLU
from torch_geometric.nn import GNNExplainer

# Connect G drive

In [None]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Load df and process the data

In [None]:
# Data Processing Functions
def load_df(path_to_file):
  df = pd.read_csv(path_to_file)
  return df

def set_constants(TOTAL_EVENTS, MAX_LENGTH_EVENT=150):
  TOTAL_EVENTS = TOTAL_EVENTS
  MAX_LENGTH_EVENT = MAX_LENGTH_EVENT
  return TOTAL_EVENTS, MAX_LENGTH_EVENT

def create_labels(df):
  conditions = [(df['isHiggs'] == True),(df['isZ'] == True), (df['isOther'] == True) ]
  # create a list of the values we want to assign for each condition
  values = [0, 1, 2] 

  # create a new column and use np.select to assign values to it using our lists as arguments
  df['label'] = np.select(conditions, values)
  return df


def normalise_x_features(df):
  # Normalise the features in the dataset 
  df_id = df[['event_list']]
  df_x = df[['pid', 'pos_r', 'pos_theta', 'pos_phi', 'pos_t', 'mom_p', 'mom_theta', 'mom_phi', 'mom_mass']]
  df_y = df[['label']]

  # Create a list of labels for the new dataframe
  new_columns = ['event_list', 'pid', 'pos_r', 'pos_theta', 'pos_phi', 'pos_t', 'mom_p', 'mom_theta', 'mom_phi', 'mom_mass', 'label']

  x = df_x.values # returns numpy 
  min_max_scaler = preprocessing.MinMaxScaler()
  x_scaled = min_max_scaler.fit_transform(x)
  df_x = pd.DataFrame(x_scaled)

  # Concatenate normalised x features and un-normalised y labels and event ids
  df_normalised_features = pd.concat([df_id, df_x, df_y], axis=1)
  df_normalised_features.columns = new_columns # You need to mention the axis
  return df_normalised_features

def split_df_by_event(df_normalised_features, TOTAL_EVENTS):
  # Dataframes split by event 
  df_event_list = []
  for i in range(TOTAL_EVENTS):
    df_event = df_normalised_features[df_normalised_features['event_list']==i]
    df_event_list.append(df_event)

  # A list of number of stable particles per event 
  length_of_each_event = [len(df_event_list[i]) for i in range(len(df_event_list))]
  return df_event_list, length_of_each_event

def create_source_target_for_COO(df_event_list):
  # Add two columns of source, target over all dataframes in df_event_list to make it compatible with pygn Data Object.
  df_event_source_target_list = []
  for i in range(len(df_event_list)):
    df_event_list[i]['source'] = None
    df_event_list[i]['target'] = None
    df_event_source_target_list.append(df_event_list[i])
  return df_event_source_target_list

#Generate Data.x and Data.y for pytorch geometric 

In [None]:
# VARIABLE SIZE GRAPHS
def generate_graph_data_x_y_for_already_normalised_features(df_event_processed_list_cleaned, TOTAL_EVENTS):
  # Generating data.x and data.y for pytorch geomteric 
  graph_data_x_list = []
  graph_data_y_list = []
  for i in range(TOTAL_EVENTS):
    df_graph = df_event_processed_list_cleaned[i]
    # Extract node features and labels from cleaned processed fixed size event list and convert to numpy 
    data_x = df_graph[['pid', 'pos_r', 'pos_theta', 'pos_phi', 'pos_t', 'mom_p', 'mom_theta', 'mom_phi', 'mom_mass']].to_numpy()
    data_y = df_graph[['label']].to_numpy()

    # Convert numpy objects into tensors for data loaders 
    graph_data_x_list.append(torch.Tensor(data_x))
    graph_data_y_list.append(torch.Tensor(data_y))
  return graph_data_x_list, graph_data_y_list

# Create graph nodes and labels 

In [None]:
def create_graph_nodes_and_labels_for_variable_graphs(path_to_file, TOTAL_EVENTS):
  df = load_df(path_to_file) # You can specify path to file here 
  TOTAL_EVENTS, _ = set_constants(TOTAL_EVENTS) # you can pass the constants here 
  df = create_labels(df) 
  df_normalised_features = normalise_x_features(df) # Don't call this if you are normalising features when creating graph dataset for FIXED GRAPHS  
  df_event_list, length_of_each_event = split_df_by_event(df_normalised_features, TOTAL_EVENTS)
  df_event_source_target_list = create_source_target_for_COO(df_event_list) 
  df_event_processed_list_cleaned = df_event_source_target_list
  graph_data_x_list, graph_data_y_list = generate_graph_data_x_y_for_already_normalised_features(df_event_processed_list_cleaned, TOTAL_EVENTS)
  return df, df_normalised_features, df_event_list, length_of_each_event, df_event_processed_list_cleaned, graph_data_x_list, graph_data_y_list

# KNN 


In [None]:
def define_knn(num_neighbours=8):
  knn = NearestNeighbors(n_neighbors=num_neighbours, radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=None)
  return knn

def generate_X_list_knn(knn, df_event_list, TOTAL_EVENTS, num_neighbours=8):
  X_list_knn = [] 
  for event_id in range(TOTAL_EVENTS):
    X = df_event_list[event_id]
    X = X.drop(columns=['source', 'target'])
    knn.fit(X)
    neighbour = knn.kneighbors(X, n_neighbors=num_neighbours, return_distance=False)
    target = neighbour
    source = np.zeros((neighbour.shape))
    X['source'] = None
    X['target'] = None
    for i in range(len(X)):
      X['source'].iloc[i] = np.ones((neighbour.shape)) * i
      X['target'].iloc[i] = target[i]
    X_list_knn.append(X)
  return X_list_knn

def convert_COO_for_knn_events(df, num_neighbours=8):
  source_list = [] 
  for i in range(len(df)):
    for _ in range(num_neighbours):
      source_list.append(i)
  target_list = list(itertools.chain.from_iterable(df['target'].to_numpy()))
  edge_index= torch.tensor([source_list, target_list], dtype=torch.long)
  return edge_index

def create_COO_format_data_knn_list(X_list_knn, graph_data_x_list, graph_data_y_list, TOTAL_EVENTS, num_neighbours=8):
  # Data Represented as edges with K-nearest neighbours as 8 
  # A list of graph data items to be passed on to the data loader 
  data_knn_list = []
  for event_id in range(TOTAL_EVENTS):
    data_item = Data(x = graph_data_x_list[event_id], 
                    y = graph_data_y_list[event_id], 
                    edge_index = convert_COO_for_knn_events(X_list_knn[event_id], num_neighbours=8))
    data_knn_list.append(data_item)
  return data_knn_list

In [None]:
def create_data_knn_list(df_event_list, graph_data_x_list, graph_data_y_list, TOTAL_EVENTS, num_neighbours=8):
  # Create data knn list 
  knn = define_knn(num_neighbours=8)
  X_list_knn = generate_X_list_knn(knn, df_event_list, TOTAL_EVENTS, num_neighbours=8)
  data_knn_list = create_COO_format_data_knn_list(X_list_knn, graph_data_x_list, graph_data_y_list, TOTAL_EVENTS, num_neighbours=8)
  return data_knn_list 

# Create KNN Dataset 

In [None]:
def get_data_lists_for_variable_size_graphs(path_to_file, TOTAL_EVENTS, num_neighbours):
  # VARIABLE SIZE GRAPHS
  df, df_normalised_features, df_event_list, length_of_each_event, df_event_processed_list_cleaned, graph_data_x_list, graph_data_y_list = create_graph_nodes_and_labels_for_variable_graphs(path_to_file, TOTAL_EVENTS)
  data_knn_list  = create_data_knn_list(df_event_list,graph_data_x_list, graph_data_y_list, TOTAL_EVENTS, num_neighbours)
  return data_knn_list, df_event_list, length_of_each_event, df_event_processed_list_cleaned

# calculate true mass and predicted mass for variable graphs 

In [None]:
def get_true_mass_list(data_list, df_event_processed_list_cleaned, TOTAL_EVENTS):
  total_true_mass_H_list = []
  total_true_mass_Z_list = []
  total_true_mass_O_list = []
  column_index_for_mass = -1 # fixed 
  column_index_for_label = 0 # fixed for true labels
  for event_id in range(TOTAL_EVENTS):
    true_mass_H_list = []
    true_mass_Z_list = []
    true_mass_O_list = []
    event_length = len(df_event_processed_list_cleaned[event_id])
    for node_id in range(event_length):
      if data_list[event_id].y[node_id].numpy()[column_index_for_label] == 0:
        true_mass_H_list.append(data_list[event_id].x[node_id].numpy()[column_index_for_mass])
      elif data_list[event_id].y[node_id].numpy()[column_index_for_label] == 1:
        true_mass_Z_list.append(data_list[event_id].x[node_id].numpy()[column_index_for_mass])
      elif data_list[event_id].y[node_id].numpy()[column_index_for_label] == 2:
        true_mass_O_list.append(data_list[event_id].x[node_id].numpy()[column_index_for_mass])
    total_true_mass_H_list.append(true_mass_H_list)
    total_true_mass_Z_list.append(true_mass_Z_list)
    total_true_mass_O_list.append(true_mass_O_list)
  return total_true_mass_H_list, total_true_mass_Z_list, total_true_mass_O_list

In [None]:
def calculate_predicted_mass(data_list, df_event_processed_list_cleaned, pred_list, TOTAL_EVENTS):
  total_pred_mass_H_list = []
  total_pred_mass_Z_list = []
  total_pred_mass_O_list = []
  column_index_for_mass = -1 # fixed 
  for event_id in range(TOTAL_EVENTS):
    pred_mass_H_list = []
    pred_mass_Z_list = []
    pred_mass_O_list = []
    event_length = len(df_event_processed_list_cleaned[event_id])
    for node_id in range(event_length):
      pred = pred_list[event_id][node_id].numpy()
      if pred == 0:
        pred_mass_H_list.append(data_list[event_id].x[node_id].numpy()[column_index_for_mass])
      elif pred == 1:
        pred_mass_Z_list.append(data_list[event_id].x[node_id].numpy()[column_index_for_mass])
      elif pred == 2:
        pred_mass_O_list.append(data_list[event_id].x[node_id].numpy()[column_index_for_mass])
    total_pred_mass_H_list.append(pred_mass_H_list)
    total_pred_mass_Z_list.append(pred_mass_Z_list)
    total_pred_mass_O_list.append(pred_mass_O_list)
  return total_pred_mass_H_list, total_pred_mass_Z_list, total_pred_mass_O_list

# Model

In [None]:
# For Variable Graph Size 
def set_model_constants_variable():
  dataset_num_features = 9 
  dataset_num_classes = 3
  return dataset_num_features, dataset_num_classes

In [None]:
# Define the models
class GCNX_relu(torch.nn.Module):
    def __init__(self, dataset_num_features, hidden_channels, dataset_num_classes, dropout=0.5, num_layers=2):
        super(GCNX_relu, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(dataset_num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, dataset_num_classes )
        self.convx = torch.nn.ModuleList([GCNConv(hidden_channels, hidden_channels) for _ in range(num_layers-2)])
        self.dropout_p = dropout
    
    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        for iter_layer in self.convx:
            x = F.dropout(x, p= self.dropout_p, training=self.training)
            x = F.relu(iter_layer(x, edge_index))
        x = F.dropout(x, p= self.dropout_p, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)


# Define the models
class GCNX_elu(torch.nn.Module):
    def __init__(self, dataset_num_features, hidden_channels, dataset_num_classes, dropout=0.5, num_layers=2):
        super(GCNX_elu, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(dataset_num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, dataset_num_classes )
        self.convx = torch.nn.ModuleList([GCNConv(hidden_channels, hidden_channels) for _ in range(num_layers-2)])
        self.dropout_p = dropout
    
    def forward(self, x, edge_index):
        x = F.elu(self.conv1(x, edge_index))
        for iter_layer in self.convx:
            x = F.dropout(x, p= self.dropout_p, training=self.training)
            x = F.elu(iter_layer(x, edge_index))
        x = F.dropout(x, p= self.dropout_p, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Define the models
class GCNX_selu(torch.nn.Module):
    def __init__(self, dataset_num_features, hidden_channels, dataset_num_classes, dropout=0.5, num_layers=2):
        super(GCNX_selu, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(dataset_num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, dataset_num_classes )
        self.convx = torch.nn.ModuleList([GCNConv(hidden_channels, hidden_channels) for _ in range(num_layers-2)])
        self.dropout_p = dropout
    
    def forward(self, x, edge_index):
        x = F.selu(self.conv1(x, edge_index))
        for iter_layer in self.convx:
            x = F.dropout(x, p= self.dropout_p, training=self.training)
            x = F.selu(iter_layer(x, edge_index))
        x = F.dropout(x, p= self.dropout_p, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Define the models
class GCNX_gelu(torch.nn.Module):
    def __init__(self, dataset_num_features, hidden_channels, dataset_num_classes, dropout=0.5, num_layers=2):
        super(GCNX_gelu, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(dataset_num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, dataset_num_classes )
        self.convx = torch.nn.ModuleList([GCNConv(hidden_channels, hidden_channels) for _ in range(num_layers-2)])
        self.dropout_p = dropout
    
    def forward(self, x, edge_index):
        x = F.gelu(self.conv1(x, edge_index))
        for iter_layer in self.convx:
            x = F.dropout(x, p= self.dropout_p, training=self.training)
            x = F.gelu(iter_layer(x, edge_index))
        x = F.dropout(x, p= self.dropout_p, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Define the models
class GCNX_leakyrelu(torch.nn.Module):
    def __init__(self, dataset_num_features, hidden_channels, dataset_num_classes, dropout=0.5, num_layers=2):
        super(GCNX_leakyrelu, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(dataset_num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, dataset_num_classes )
        self.convx = torch.nn.ModuleList([GCNConv(hidden_channels, hidden_channels) for _ in range(num_layers-2)])
        self.dropout_p = dropout
    
    def forward(self, x, edge_index):
        x = F.leaky_relu(self.conv1(x, edge_index))
        for iter_layer in self.convx:
            x = F.dropout(x, p= self.dropout_p, training=self.training)
            x = F.leaky_relu(iter_layer(x, edge_index))
        x = F.dropout(x, p= self.dropout_p, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)


# Define the models
class GCNX_tanh(torch.nn.Module):
    def __init__(self, dataset_num_features, hidden_channels, dataset_num_classes, dropout=0.5, num_layers=2):
        super(GCNX_tanh, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(dataset_num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, dataset_num_classes )
        self.convx = torch.nn.ModuleList([GCNConv(hidden_channels, hidden_channels) for _ in range(num_layers-2)])
        self.dropout_p = dropout
    
    def forward(self, x, edge_index):
        x = F.tanh(self.conv1(x, edge_index))
        for iter_layer in self.convx:
            x = F.dropout(x, p= self.dropout_p, training=self.training)
            x = F.tanh(iter_layer(x, edge_index))
        x = F.dropout(x, p= self.dropout_p, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)


# Data Loaders

In [None]:
def create_data_loaders(data_list, train_split=0.6):
  # get length of the list
  lenth_of_list = len(data_list)
  train_split = 0.6
  valid_split = 0.2
  test_split = 0.2

  # create indices for loaders 
  train_end_index = int(lenth_of_list * train_split)
  validate_start_index = train_end_index
  validate_end_index = validate_start_index + int(lenth_of_list * valid_split)
  test_start_index = validate_end_index

  train_loader = DataLoader(data_list[:train_end_index], batch_size=1, shuffle=False)
  validation_loader = DataLoader(data_list[validate_start_index:validate_end_index], batch_size=1, shuffle=False)
  test_loader = DataLoader(data_list[test_start_index:], batch_size=1, shuffle=False)
  
  return train_loader, validation_loader, test_loader, train_end_index, validate_start_index, validate_end_index, test_start_index 

# Train, Validate, Test, Cal Accuracy and Evaluate

In [None]:
def train(model, optimizer, loader):
  model.train()
  pred_list, y_list, loss_list, y_list_numpy, pred_list_numpy = [], [],[], [], []
  for batch in loader:
    x, edges = batch.x, batch.edge_index
    y = batch.y
    y_list.append(y.squeeze(1).type(torch.LongTensor))
    y_list_numpy.append(y.squeeze(1).type(torch.LongTensor).numpy())
    optimizer.zero_grad()
    out = model(x, edges)
    loss = criterion(out, y.squeeze(1).type(torch.LongTensor))
    loss_list.append(loss)
    pred = out.argmax(dim=1)
    pred_list.append(pred)
    pred_list_numpy.append(pred.numpy()) 
    loss.backward()
    optimizer.step()  
  return loss_list, pred_list, y_list, y_list_numpy,  pred_list_numpy

def validate(model, optimizer, loader):
  model.train()
  pred_list, y_list, loss_list, y_list_numpy, pred_list_numpy = [], [],[], [], []
  for batch in loader:
    x, edges = batch.x, batch.edge_index
    y = batch.y
    y_list.append(y.squeeze(1).type(torch.LongTensor))
    y_list_numpy.append(y.squeeze(1).type(torch.LongTensor).numpy())
    optimizer.zero_grad()
    out = model(x, edges)
    loss = criterion(out, y.squeeze(1).type(torch.LongTensor))
    loss_list.append(loss)
    pred = out.argmax(dim=1)
    # convert tensor to numpy
    pred_list.append(pred) 
    pred_list_numpy.append(pred.numpy()) 
    loss.backward()
    optimizer.step()  
  return loss_list, pred_list, y_list, y_list_numpy, pred_list_numpy

def accuracy(pred_list, y_list):
  accuracy_list = []
  for event_id in range(len(pred_list)): # Because this works on the length of pred_list this logically surpasses the length mismatch issue
    length_of_each_event = len(pred_list[event_id])
    num_correct_list = []
    for node_id in range(length_of_each_event):
      p = pred_list[event_id][node_id].numpy()
      y = y_list[event_id][node_id].numpy()
      num_correct = p == y 
      num_correct_list.append(num_correct)
    acc = sum(num_correct_list)/length_of_each_event
    accuracy_list.append(acc)
  return accuracy_list

def train_on_n_epochs(n_epochs, model, optimizer, loader):
  # running for multiple epochs
  total_train_acc_list = []
  total_train_loss_list, total_train_pred_list, total_train_y_list, total_train_y_list_numpy, total_train_pred_list_numpy = [], [], [], [], []
  time_per_epoch_list = []
  for _ in range(n_epochs):
    train_acc_list = []
    starttime = timeit.default_timer()
    train_loss_list, train_pred_list, train_y_list, train_y_list_numpy, train_pred_list_numpy = train(model, optimizer, loader)
    time_per_epoch_list.append(timeit.default_timer() - starttime) 
    train_acc_list = accuracy(train_pred_list, train_y_list)
    total_train_acc_list.append(train_acc_list)
    total_train_loss_list.append(train_loss_list)
    total_train_y_list.append(train_y_list)
    total_train_pred_list.append(train_pred_list)
    total_train_y_list_numpy.append(train_y_list_numpy)
    total_train_pred_list_numpy.append(train_pred_list_numpy)
  return total_train_acc_list, total_train_loss_list, total_train_pred_list, total_train_y_list, total_train_y_list_numpy, total_train_pred_list_numpy, time_per_epoch_list

def test(model, loader):
  model.eval()
  pred_list, y_list = [], []
  y_list_numpy = []
  for batch in loader:
    x, edges = batch.x, batch.edge_index
    y = batch.y
    y_list.append(y.squeeze(1).type(torch.LongTensor))
    y_list_numpy.append(y.squeeze(1).type(torch.LongTensor).numpy())
    out = model(x, edges)
    pred = out.argmax(dim=1)
    pred_list.append(pred)    
  return pred_list, y_list, y_list_numpy

def evaluate_model_performace(model, loader):
  test_pred_list, test_y_list, test_y_list_numpy = test(model, loader)
  test_acc_list = []
  x = accuracy(test_pred_list, test_y_list)
  test_acc_list.append(x)
  return test_acc_list, test_pred_list, test_y_list, test_y_list_numpy

# Init Models

In [None]:
# Inits model and Adam optimizer 
# bool variable, int hidden_channels, float dropout, 
# int num_layers, string non_linearity, float learning_rate, float weight_decay

# you can add an if else condition based on the type of optimiser passed as a string

def init_GCNX_model(variable=True, non_linearity='relu', optimizer='adam', hidden_channels=16, dropout=0.5, num_layers=2,learning_rate=0.01, weight_decay=0.0005):
  if variable == True:
    dataset_num_features, dataset_num_classes = set_model_constants_variable()
  elif variable == False:
    dataset_num_features, dataset_num_classes = set_model_constants()
  
  if non_linearity == 'relu':
    model = GCNX_relu(dataset_num_features, hidden_channels, dataset_num_classes, dropout, num_layers)
  elif non_linearity == 'elu':
    model = GCNX_elu(dataset_num_features, hidden_channels, dataset_num_classes, dropout, num_layers)
  elif non_linearity == 'selu':
    model = GCNX_selu(dataset_num_features, hidden_channels, dataset_num_classes, dropout, num_layers)
  elif non_linearity == 'gelu':
    model = GCNX_gelu(dataset_num_features, hidden_channels, dataset_num_classes, dropout, num_layers)
  elif non_linearity == 'leakyrelu':
    model = GCNX_leakyrelu(dataset_num_features, hidden_channels, dataset_num_classes, dropout, num_layers)
  elif non_linearity == 'tanh':
    model = GCNX_tanh(dataset_num_features, hidden_channels, dataset_num_classes, dropout, num_layers)

  if optimizer == 'adam':
    optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate, weight_decay=weight_decay)
  elif optimizer == 'sgd':
    optimizer = torch.optim.SGD(model.parameters(), lr= learning_rate, weight_decay=weight_decay)
  elif optimizer == 'rmsprop':
    optimizer = torch.optim.RMSprop(model.parameters(),lr=learning_rate, weight_decay=weight_decay)

  criterion = torch.nn.CrossEntropyLoss()

  return dataset_num_features, dataset_num_classes, model, optimizer, criterion

# Experiment  KNN to save values

In [None]:
def save_to_experiment_path_as_dataframe(saved_list, path, file_name, dtype):
  df = pd.DataFrame(data=saved_list, dtype=dtype)
  df.to_csv(path + file_name)
  return "Saved as " + path + file_name

# KNN

In [None]:
def experiment_knn(data_list, df_event_list, model_knn, optimizer_knn, n_epochs = 1000, path_to_experiment = '/content/'):
    # Data loaders for KNN (60:20:20)
    train_loader, validation_loader, test_loader,  train_end_index, validate_start_index, validate_end_index, test_start_index = create_data_loaders(data_list)

    # Calculate True Mass of each particle for each train/valid/test 
    train = df_event_list[:train_end_index]
    valid = df_event_list[validate_start_index:validate_end_index]
    test = df_event_list[test_start_index:]
    # Split the data lists of graph events by (60:20:20)
    data_list_train = data_list[:train_end_index]
    data_list_valid = data_list[validate_start_index:validate_end_index]
    data_list_test = data_list[test_start_index:]
    # Call method over the 3 lists 
    total_true_mass_H_list_train, total_true_mass_Z_list_train, total_true_mass_O_list_train = get_true_mass_list(data_list_train, train, len(train))
    total_true_mass_H_list_valid, total_true_mass_Z_list_valid, total_true_mass_O_list_valid = get_true_mass_list(data_list_valid, valid, len(valid))
    total_true_mass_H_list_test, total_true_mass_Z_list_test, total_true_mass_O_list_test = get_true_mass_list(data_list_test, test, len(test))

    # Training on KNN Dataset
    total_train_acc_list, total_train_loss_list, total_train_pred_list, total_train_y_list, total_train_y_list_numpy, total_train_pred_list_numpy, train_time_per_epoch_list = train_on_n_epochs(n_epochs, model_knn, optimizer_knn, train_loader)
    total_valid_acc_list, total_valid_loss_list, total_valid_pred_list, total_valid_y_list, total_valid_y_list_numpy, total_valid_pred_list_numpy, valid_time_per_epoch_list = train_on_n_epochs(n_epochs, model_knn, optimizer_knn, validation_loader)
    mean_train_epoch_time = sum(train_time_per_epoch_list)/len(train_time_per_epoch_list)
    mean_valid_epoch_time = sum(valid_time_per_epoch_list)/len(valid_time_per_epoch_list)

    # Testing on KNN Data set 
    test_acc_list, test_pred_list, test_y_list, test_y_list_numpy = evaluate_model_performace(model_knn, test_loader)
    
    # Calculate the set of predicted mass for each particle
    total_pred_mass_H_list_train, total_pred_mass_Z_list_train, total_pred_mass_O_list_train = calculate_predicted_mass(data_list_train, train, total_train_pred_list[-1], len(train))
    total_pred_mass_H_list_valid, total_pred_mass_Z_list_valid, total_pred_mass_O_list_valid = calculate_predicted_mass(data_list_valid, valid, total_valid_pred_list[-1], len(valid))
    total_pred_mass_H_list_test, total_pred_mass_Z_list_test, total_pred_mass_O_list_test = calculate_predicted_mass(data_list_test, test, test_pred_list, len(test))

    # Results
    train_acc_knn = ['knn_train', np.mean(total_train_acc_list[-1]),np.std(total_train_acc_list[-1])]
    valid_acc_knn = ['knn_valid', np.mean(total_valid_acc_list[-1]),np.std(total_valid_acc_list[-1])]
    test_acc_knn = ['knn_test', np.mean(test_acc_list), np.std(test_acc_list) ]

    df_result_knn = pd.DataFrame([train_acc_knn, valid_acc_knn, test_acc_knn ], columns=['type','mean', 'stdev'])

    # save to drive 
    df_result_knn.to_csv(path_to_experiment  + 'df_result_knn.csv' )
    print(df_result_knn)

    # fix file names 
    file_name1 = 'df_knn_train_acc.csv'
    file_name2 = 'df_knn_test_acc.csv'
    file_name3 = 'df_knn_train_loss.csv'
    file_name4 = 'df_knn_test_pred.csv'
    file_name5 = 'df_knn_test_y.csv'
    file_name6 = 'df_knn_train_pred.csv'
    file_name7 = 'df_knn_train_y.csv'
    file_name8 = 'df_knn_train_time_per_epoch_list.csv'
    file_name9 = 'df_knn_valid_time_per_epoch_list.csv'
    file_name10 = 'df_knn_valid_acc.csv'
    file_name11 = 'df_knn_valid_loss.csv'
    file_name12 = 'df_knn_valid_pred.csv'
    file_name13 = 'df_knn_valid_y.csv'
    file_name14 = 'df_knn_test_y_numpy.csv'
    file_name15 = 'df_knn_train_y_numpy.csv'
    file_name16 = 'df_knn_valid_y_numpy.csv'
    file_name17 = "df_knn_true_mass_H_list_train"
    file_name18 = "df_knn_true_mass_Z_list_train"
    file_name19 = "df_knn_true_mass_O_list_train"
    file_name20 = "df_knn_true_mass_H_list_valid"
    file_name21 = "df_knn_true_mass_Z_list_valid"
    file_name22 = "df_knn_true_mass_O_list_valid"
    file_name23 = "df_knn_true_mass_H_list_test"
    file_name24 = "df_knn_true_mass_Z_list_test"
    file_name25 = "df_knn_true_mass_O_list_test"
    file_name26 = "df_knn_pred_mass_H_list_train"
    file_name27 = "df_knn_pred_mass_Z_list_train"
    file_name28 = "df_knn_pred_mass_O_list_train"
    file_name29 = "df_knn_pred_mass_H_list_valid"
    file_name30 = "df_knn_pred_mass_Z_list_valid"
    file_name31 = "df_knn_pred_mass_O_list_valid"
    file_name32 = "df_knn_pred_mass_H_list_test"
    file_name33 = "df_knn_pred_mass_Z_list_test"
    file_name34 = "df_knn_pred_mass_O_list_test"
    file_name35 = 'df_knn_train_pred_numpy.csv'
    file_name36 = 'df_knn_valid_pred_numpy.csv'

    # use path_to_experiment
    save_to_experiment_path_as_dataframe(total_train_acc_list,  path_to_experiment, file_name1, dtype=float)
    save_to_experiment_path_as_dataframe(test_acc_list, path_to_experiment, file_name2, dtype=float)
    save_to_experiment_path_as_dataframe(total_train_loss_list, path_to_experiment, file_name3, dtype=float)
    save_to_experiment_path_as_dataframe(test_pred_list, path_to_experiment, file_name4, dtype=float)
    save_to_experiment_path_as_dataframe(test_y_list, path_to_experiment, file_name5, dtype=object)
    save_to_experiment_path_as_dataframe(total_train_pred_list, path_to_experiment, file_name6, dtype=object)
    save_to_experiment_path_as_dataframe(total_train_y_list, path_to_experiment, file_name7, dtype=object)
    save_to_experiment_path_as_dataframe(train_time_per_epoch_list, path_to_experiment, file_name8, dtype=float)
    save_to_experiment_path_as_dataframe(valid_time_per_epoch_list, path_to_experiment, file_name9, dtype=float)
    save_to_experiment_path_as_dataframe(total_valid_acc_list,  path_to_experiment, file_name10, dtype=float)
    save_to_experiment_path_as_dataframe(total_valid_loss_list, path_to_experiment, file_name11, dtype=float)
    save_to_experiment_path_as_dataframe(total_valid_pred_list, path_to_experiment, file_name12, dtype=object)
    save_to_experiment_path_as_dataframe(total_valid_y_list, path_to_experiment, file_name13, dtype=object)
    # test y labels 
    save_to_experiment_path_as_dataframe(test_y_list_numpy, path_to_experiment, file_name14, dtype=float)
    save_to_experiment_path_as_dataframe(total_train_y_list_numpy, path_to_experiment, file_name15, dtype=float)
    save_to_experiment_path_as_dataframe(total_valid_y_list_numpy, path_to_experiment, file_name16, dtype=float)
    # mass true
    save_to_experiment_path_as_dataframe(total_true_mass_H_list_train,  path_to_experiment, file_name17, dtype=float)
    save_to_experiment_path_as_dataframe(total_true_mass_Z_list_train,  path_to_experiment, file_name18, dtype=float)
    save_to_experiment_path_as_dataframe(total_true_mass_O_list_train,  path_to_experiment, file_name19, dtype=float)
    save_to_experiment_path_as_dataframe(total_true_mass_H_list_valid,  path_to_experiment, file_name20, dtype=float)
    save_to_experiment_path_as_dataframe(total_true_mass_Z_list_valid,  path_to_experiment, file_name21, dtype=float)
    save_to_experiment_path_as_dataframe(total_true_mass_O_list_valid,  path_to_experiment, file_name22, dtype=float)
    save_to_experiment_path_as_dataframe(total_true_mass_H_list_test,  path_to_experiment, file_name23, dtype=float)
    save_to_experiment_path_as_dataframe(total_true_mass_Z_list_test,  path_to_experiment, file_name24, dtype=float)
    save_to_experiment_path_as_dataframe(total_true_mass_O_list_test,  path_to_experiment, file_name25, dtype=float)

    # mass calculated
    save_to_experiment_path_as_dataframe(total_pred_mass_H_list_train,  path_to_experiment, file_name26, dtype=float)
    save_to_experiment_path_as_dataframe(total_pred_mass_Z_list_train,  path_to_experiment, file_name27, dtype=float)
    save_to_experiment_path_as_dataframe(total_pred_mass_O_list_train,  path_to_experiment, file_name28, dtype=float)
    save_to_experiment_path_as_dataframe(total_pred_mass_H_list_valid,  path_to_experiment, file_name29, dtype=float)
    save_to_experiment_path_as_dataframe(total_pred_mass_Z_list_valid,  path_to_experiment, file_name30, dtype=float)
    save_to_experiment_path_as_dataframe(total_pred_mass_O_list_valid,  path_to_experiment, file_name31, dtype=float)
    save_to_experiment_path_as_dataframe(total_pred_mass_H_list_test,  path_to_experiment, file_name32, dtype=float)
    save_to_experiment_path_as_dataframe(total_pred_mass_Z_list_test,  path_to_experiment, file_name33, dtype=float)
    save_to_experiment_path_as_dataframe(total_pred_mass_O_list_test,  path_to_experiment, file_name34, dtype=float)
    # numpy pred labels 
    save_to_experiment_path_as_dataframe(total_train_pred_list_numpy, path_to_experiment, file_name35, dtype=float)
    save_to_experiment_path_as_dataframe(total_valid_pred_list_numpy, path_to_experiment, file_name36, dtype=float)

    # Save model 
    torch.save(model_knn.state_dict(), path_to_experiment + 'model_knn.pickle')
    return mean_train_epoch_time, mean_valid_epoch_time, df_result_knn

# Create KNN Dataset

In [None]:
# Create the data_knn_list_k_2
path_to_file = '/content/drive/MyDrive/FCC_Experiments_2021/dataset/processed_csv_files/output_11_07_2021.csv'
data_knn_list_variable, df_event_list_variable, length_of_each_event_variable, df_event_processed_list_cleaned_variable = get_data_lists_for_variable_size_graphs(path_to_file, TOTAL_EVENTS=10000, num_neighbours=8)

# GCN 2 layer variable model on the KNN Dataset

We will only **run the experiments on KNN dataset** as they have proved to be the most effective and accurate over other datasets.

The hyperparameters that will be used to test the performance of the model are as follows:
1. K nearest neighbours
2. Non-Linearity
3. Hidden Channels
4. Dropout
5. Learning Rate
6. Weight Decay 
7. Optimizer

In [None]:
# SET PATHS and FILE NAMES
path_to_experiment_variable_depth_2_gcn_dropout_1 = '/content/drive/MyDrive/FCC_Experiments_2021/model_gcn/variable/depth_2/hyperparameters/dropout_x/dropout_0.1/'

path_to_experiment_variable_depth_2_gcn_dropout_2 = '/content/drive/MyDrive/FCC_Experiments_2021/model_gcn/variable/depth_2/hyperparameters/dropout_x/dropout_0.2/'

path_to_experiment_variable_depth_2_gcn_dropout_3 = '/content/drive/MyDrive/FCC_Experiments_2021/model_gcn/variable/depth_2/hyperparameters/dropout_x/dropout_0.3/'

path_to_experiment_variable_depth_2_gcn_dropout_4 = '/content/drive/MyDrive/FCC_Experiments_2021/model_gcn/variable/depth_2/hyperparameters/dropout_x/dropout_0.4/'

path_to_experiment_variable_depth_2_gcn_dropout_5 = '/content/drive/MyDrive/FCC_Experiments_2021/model_gcn/variable/depth_2/hyperparameters/dropout_x/dropout_0.5/'


# GCN - Dropout 
1. Dropout =0.1
2. Dropout =0.2
3. Dropout =0.3
4. Dropout =0.4
5. Dropout =0.5

# Dropout = 0.1

In [None]:
dataset_num_features, dataset_num_classes, model_knn_variable_depth_2_gcn, optimizer_knn_variable_depth_2_gcn, criterion= init_GCNX_model(variable=True, non_linearity='relu', optimizer='adam', hidden_channels=16, dropout=0.1, num_layers=2,learning_rate=0.01, weight_decay=0.0005)
path_to_experiment_variable_depth_2_gcn_dropout_1 = '/content/drive/MyDrive/FCC_Experiments_2021/model_gcn/variable/depth_2/hyperparameters/dropout_x/dropout_0.1/'
train_epoch_knn_2, valid_epoch_knn_2, df_result_knn_2 = experiment_knn(data_knn_list_variable, df_event_processed_list_cleaned_variable, model_knn_variable_depth_2_gcn, optimizer_knn_variable_depth_2_gcn, n_epochs = 51, path_to_experiment = path_to_experiment_variable_depth_2_gcn_dropout_1)

        type      mean     stdev
0  knn_train  0.997477  0.015262
1  knn_valid  0.997704  0.012563
2   knn_test  0.997852  0.006924


#Dropout = 0.2

In [None]:
dataset_num_features, dataset_num_classes, model_knn_variable_depth_2_gcn, optimizer_knn_variable_depth_2_gcn, criterion= init_GCNX_model(variable=True, non_linearity='relu', optimizer='adam', hidden_channels=16, dropout=0.2, num_layers=2,learning_rate=0.01, weight_decay=0.0005)
path_to_experiment_variable_depth_2_gcn_dropout_2 = '/content/drive/MyDrive/FCC_Experiments_2021/model_gcn/variable/depth_2/hyperparameters/dropout_x/dropout_0.2/'
train_epoch_knn_2, valid_epoch_knn_2, df_result_knn_2 = experiment_knn(data_knn_list_variable, df_event_processed_list_cleaned_variable, model_knn_variable_depth_2_gcn, optimizer_knn_variable_depth_2_gcn, n_epochs = 51, path_to_experiment = path_to_experiment_variable_depth_2_gcn_dropout_2)

        type      mean     stdev
0  knn_train  0.997453  0.015351
1  knn_valid  0.997809  0.012281
2   knn_test  0.997987  0.006747


# Dropout = 0.3

In [None]:
dataset_num_features, dataset_num_classes, model_knn_variable_depth_2_gcn, optimizer_knn_variable_depth_2_gcn, criterion= init_GCNX_model(variable=True, non_linearity='relu', optimizer='adam', hidden_channels=16, dropout=0.3, num_layers=2,learning_rate=0.01, weight_decay=0.0005)
path_to_experiment_variable_depth_2_gcn_dropout_3 = '/content/drive/MyDrive/FCC_Experiments_2021/model_gcn/variable/depth_2/hyperparameters/dropout_x/dropout_0.3/'

train_epoch_knn_2, valid_epoch_knn_2, df_result_knn_2 = experiment_knn(data_knn_list_variable, df_event_processed_list_cleaned_variable, model_knn_variable_depth_2_gcn, optimizer_knn_variable_depth_2_gcn, n_epochs = 51, path_to_experiment =path_to_experiment_variable_depth_2_gcn_dropout_3)

        type      mean     stdev
0  knn_train  0.997558  0.015224
1  knn_valid  0.997726  0.012410
2   knn_test  0.998032  0.006670


# Dropout = 0.4 

In [None]:
dataset_num_features, dataset_num_classes, model_knn_variable_depth_2_gcn, optimizer_knn_variable_depth_2_gcn, criterion= init_GCNX_model(variable=True, non_linearity='relu', optimizer='adam', hidden_channels=16, dropout=0.4, num_layers=2,learning_rate=0.01, weight_decay=0.0005)
path_to_experiment_variable_depth_2_gcn_dropout_4 = '/content/drive/MyDrive/FCC_Experiments_2021/model_gcn/variable/depth_2/hyperparameters/dropout_x/dropout_0.4/'

train_epoch_knn_2, valid_epoch_knn_2, df_result_knn_2 = experiment_knn(data_knn_list_variable, df_event_processed_list_cleaned_variable, model_knn_variable_depth_2_gcn, optimizer_knn_variable_depth_2_gcn, n_epochs = 51, path_to_experiment = path_to_experiment_variable_depth_2_gcn_dropout_4 )

        type      mean     stdev
0  knn_train  0.997507  0.015306
1  knn_valid  0.997782  0.012378
2   knn_test  0.998467  0.005931


# Dropout = 0.5 

In [None]:
dataset_num_features, dataset_num_classes, model_knn_variable_depth_2_gcn, optimizer_knn_variable_depth_2_gcn, criterion= init_GCNX_model(variable=True, non_linearity='relu', optimizer='adam', hidden_channels=16, dropout=0.5, num_layers=2,learning_rate=0.01, weight_decay=0.0005)
path_to_experiment_variable_depth_2_gcn_dropout_5 = '/content/drive/MyDrive/FCC_Experiments_2021/model_gcn/variable/depth_2/hyperparameters/dropout_x/dropout_0.5/'

train_epoch_knn_2, valid_epoch_knn_2, df_result_knn_2 = experiment_knn(data_knn_list_variable, df_event_processed_list_cleaned_variable, model_knn_variable_depth_2_gcn, optimizer_knn_variable_depth_2_gcn, n_epochs = 51, path_to_experiment = path_to_experiment_variable_depth_2_gcn_dropout_5 )

        type      mean     stdev
0  knn_train  0.997351  0.016184
1  knn_valid  0.997796  0.012294
2   knn_test  0.998618  0.005674
