<a href="https://colab.research.google.com/github/akanksha-ahuja/fcc-final-notebooks/blob/main/fcc_experiment_1_x_save_variable_graphs_as_torch_data_objects.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import timeit
import random
import torch.nn.functional as F
import itertools
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from scipy.spatial import distance
from sklearn.neighbors import NearestNeighbors
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Install required packages.
!pip install -q torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install -q torch-sparse -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install -q torch-geometric
!pip install graphlime
from torch_geometric.data import Data, DataLoader
from graphlime import GraphLIME
from torch_geometric.utils import to_networkx

[K     |████████████████████████████████| 3.0 MB 5.1 MB/s 
[K     |████████████████████████████████| 1.6 MB 5.0 MB/s 
[K     |████████████████████████████████| 222 kB 4.3 MB/s 
[K     |████████████████████████████████| 376 kB 52.2 MB/s 
[K     |████████████████████████████████| 45 kB 3.4 MB/s 
[?25h  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone
Collecting graphlime
  Downloading graphlime-1.2.0.tar.gz (3.3 kB)
Building wheels for collected packages: graphlime
  Building wheel for graphlime (setup.py) ... [?25l[?25hdone
  Created wheel for graphlime: filename=graphlime-1.2.0-py3-none-any.whl size=2616 sha256=e5ae09cf5f667211e96e6675b39ac50e73ca97a5751af28d1130214c3756141b
  Stored in directory: /root/.cache/pip/wheels/33/29/94/9835c557e2def18b58369cda0032935a3263acfa9266aaeb5d
Successfully built graphlime
Installing collected packages: graphlime
Successfully installed graphlime-1.2.0


In [None]:
from torch_geometric.nn import GCNConv, TAGConv, SAGEConv, ChebConv
from torch_geometric.nn import GATConv
from torch_geometric.nn import GINConv
from torch_geometric.nn import JumpingKnowledge, GCN2Conv
from torch.nn import Sequential, Linear, BatchNorm1d, ReLU
from torch_geometric.nn import GNNExplainer

# Connect to G-drive

In [None]:
# Load the Colab Drive helper and mount Google Drive at /content/drive.
# The save paths defined later in this notebook point inside /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Load and process the DataFrame

In [None]:
# Data Processing Functions
def load_df(path_to_file):
  """Read the CSV at ``path_to_file`` with pandas and return the DataFrame."""
  return pd.read_csv(path_to_file)

def set_constants(TOTAL_EVENTS, MAX_LENGTH_EVENT=150):
  """Return the two dataset constants unchanged.

  Returns:
    The tuple ``(TOTAL_EVENTS, MAX_LENGTH_EVENT)``; ``MAX_LENGTH_EVENT``
    falls back to 150 when the caller does not supply it.
  """
  constants = (TOTAL_EVENTS, MAX_LENGTH_EVENT)
  return constants

def create_labels(df):
  """Add an integer ``label`` column derived from the boolean particle flags.

  ``isHiggs`` maps to 0, ``isZ`` to 1 and ``isOther`` to 2; when several flags
  are set the first one in that order wins. Rows where no flag is set receive
  ``np.select``'s default of 0, i.e. they end up in the Higgs class.

  The column is written onto ``df`` itself, and that same object is returned.
  """
  flag_to_label = (('isHiggs', 0), ('isZ', 1), ('isOther', 2))
  masks = [df[flag_column] == True for flag_column, _ in flag_to_label]
  codes = [code for _, code in flag_to_label]

  # np.select picks, per row, the code belonging to the first mask that is True
  df['label'] = np.select(masks, codes)
  return df


def normalise_x_features(df):
  """Min-max scale the nine particle feature columns.

  Every feature column (``pid`` included) is scaled independently with
  sklearn's ``MinMaxScaler`` fitted on the whole frame. ``event_list`` and
  ``label`` are carried over unscaled.

  Args:
    df: DataFrame holding ``event_list``, the nine feature columns and ``label``.

  Returns:
    A new DataFrame with the columns ``event_list``, the nine scaled features
    and ``label`` (in that order), indexed like ``df``.
  """
  feature_columns = ['pid', 'pos_r', 'pos_theta', 'pos_phi', 'pos_t', 'mom_p', 'mom_theta', 'mom_phi', 'mom_mass']

  min_max_scaler = preprocessing.MinMaxScaler()
  x_scaled = min_max_scaler.fit_transform(df[feature_columns].values)

  # The scaler returns a bare numpy array. Re-attach the caller's index so the
  # column-wise concat below lines rows up positionally; with a fresh
  # RangeIndex any non-default index on df would be outer-joined and yield
  # misaligned, NaN-padded rows.
  df_x = pd.DataFrame(x_scaled, index=df.index, columns=feature_columns)

  # Concatenate event ids, normalised x features and un-normalised y labels
  df_normalised_features = pd.concat([df[['event_list']], df_x, df[['label']]], axis=1)
  return df_normalised_features

def split_df_by_event(df_normalised_features, TOTAL_EVENTS):
  """Split the flat particle table into one DataFrame per event.

  Event ids ``0 .. TOTAL_EVENTS - 1`` are looked up in the ``event_list``
  column; an id with no rows yields an empty frame.

  Returns:
    ``(df_event_list, length_of_each_event)`` - the per-event frames and the
    number of rows in each of them.
  """
  event_ids = df_normalised_features['event_list']
  df_event_list = [
      df_normalised_features[event_ids == event_id]
      for event_id in range(TOTAL_EVENTS)
  ]

  # Number of particles (rows) per event
  length_of_each_event = list(map(len, df_event_list))
  return df_event_list, length_of_each_event

def create_source_target_for_COO(df_event_list):
  """Give every per-event frame empty ``source`` and ``target`` columns.

  Both columns are filled with ``None`` as placeholders for edge endpoints.
  The frames are modified in place; the returned list holds the very same
  frame objects in the same order.
  """
  df_event_source_target_list = []
  for df_event in df_event_list:
    for edge_column in ('source', 'target'):
      df_event[edge_column] = None
    df_event_source_target_list.append(df_event)
  return df_event_source_target_list

# Generate Data.x and Data.y for PyTorch Geometric

In [None]:
def generate_graph_data_x_y_for_already_normalised_features(df_event_processed_list_cleaned, TOTAL_EVENTS):
  """Turn the first ``TOTAL_EVENTS`` event frames into node-feature and label tensors.

  Returns:
    ``(graph_data_x_list, graph_data_y_list)``. Each x entry is built with
    ``torch.Tensor`` from the nine feature columns (one row per particle);
    each y entry is built the same way from the single ``label`` column, so it
    keeps a trailing dimension of size 1.
  """
  feature_columns = ['pid', 'pos_r', 'pos_theta', 'pos_phi', 'pos_t', 'mom_p', 'mom_theta', 'mom_phi', 'mom_mass']
  events = [df_event_processed_list_cleaned[event_id] for event_id in range(TOTAL_EVENTS)]

  # DataFrame -> numpy -> tensor, once for the features and once for the labels
  graph_data_x_list = [torch.Tensor(df_graph[feature_columns].to_numpy()) for df_graph in events]
  graph_data_y_list = [torch.Tensor(df_graph[['label']].to_numpy()) for df_graph in events]
  return graph_data_x_list, graph_data_y_list

# Create graph nodes and labels 

In [None]:
def create_graph_nodes_and_labels_for_variable_graphs(path_to_file, TOTAL_EVENTS):
  """Run the full preprocessing chain for variable-size graphs.

  Steps: load the CSV, attach integer labels, min-max normalise the features,
  split by event, add placeholder ``source``/``target`` columns and build the
  per-event x / y tensors.

  Returns:
    ``(df, df_normalised_features, df_event_list, length_of_each_event,
    df_event_processed_list_cleaned, graph_data_x_list, graph_data_y_list)``.
    ``df_event_processed_list_cleaned`` is the list returned by
    ``create_source_target_for_COO``.
  """
  df = create_labels(load_df(path_to_file))
  TOTAL_EVENTS, _ = set_constants(TOTAL_EVENTS)

  # Do not normalise here when the features are normalised while building the
  # FIXED-size graph dataset instead.
  df_normalised_features = normalise_x_features(df)

  df_event_list, length_of_each_event = split_df_by_event(df_normalised_features, TOTAL_EVENTS)
  df_event_processed_list_cleaned = create_source_target_for_COO(df_event_list)
  graph_data_x_list, graph_data_y_list = generate_graph_data_x_y_for_already_normalised_features(
      df_event_processed_list_cleaned, TOTAL_EVENTS)

  return (df, df_normalised_features, df_event_list, length_of_each_event,
          df_event_processed_list_cleaned, graph_data_x_list, graph_data_y_list)

# Label 

In [None]:
def generate_particle_lists_for_label_connections(df_event_list):
  """Split every event frame into its Higgs, Z and other particles.

  Args:
    df_event_list: per-event DataFrames carrying an integer ``label`` column
      (0 = Higgs, 1 = Z, 2 = other).

  Returns:
    ``(h_list, z_list, o_list)`` - three lists with one DataFrame per event,
    holding the rows with label 0, 1 and 2 respectively. Each returned frame
    has a fresh 0..k-1 index.
  """
  h_list = []  # Higgs particles of each event
  z_list = []  # Z particles of each event
  o_list = []  # all other particles of each event

  for df_event in df_event_list:
    labels = df_event['label']
    # reset_index returns a new frame rather than modifying in place, so the
    # result has to be kept for the re-indexing to take effect.
    h_list.append(df_event[labels == 0].reset_index(drop=True))
    z_list.append(df_event[labels == 1].reset_index(drop=True))
    o_list.append(df_event[labels == 2].reset_index(drop=True))

  return h_list, z_list, o_list

def generate_X_label_list(h_list, z_list, o_list, df_event_list, TOTAL_EVENTS):
  """Fill ``source``/``target`` so that every node is linked to all nodes sharing its label.

  For each node the ``source`` cell holds its own position repeated once per
  edge, and the ``target`` cell holds the row positions (within the event) of
  every node with the same label, the node itself included.

  Args:
    h_list, z_list, o_list: per-event label subsets. Kept for interface
      compatibility; group membership is derived from ``df_event_list``
      directly, because the subsets do not expose the positions of their rows
      inside the event.
    df_event_list: per-event DataFrames with an integer ``label`` column.
    TOTAL_EVENTS: number of leading events to process.

  Returns:
    The first ``TOTAL_EVENTS`` frames of ``df_event_list`` (modified in place)
    with their ``source`` and ``target`` columns populated.

  Raises:
    ValueError: if a node carries a label other than 0, 1 or 2.
  """
  Z_BOSON = int(1)
  H_BOSON = int(0)
  NO_BOSON = int(2)

  X_list = []
  for event_id in range(TOTAL_EVENTS):
    X = df_event_list[event_id]
    labels = X['label'].to_numpy()

    # Row positions of the members of each label group inside this event.
    # Targets must be these positions: counting 0..k-1 would point at the
    # first k rows of the event regardless of their label.
    members_by_label = {
        label_value: np.flatnonzero(labels == label_value).tolist()
        for label_value in (H_BOSON, Z_BOSON, NO_BOSON)
    }

    source_list, target_list = [], []
    for node_id, node_label in enumerate(labels):
      if node_label not in members_by_label:
        raise ValueError("Unexpected label " + repr(node_label) + " for node " + str(node_id) + " of event " + str(event_id))
      target = members_by_label[node_label]
      source_list.append([node_id] * len(target))
      target_list.append(list(target))

    # One list of edge endpoints per node, stored as two columns of the event
    X['source'] = source_list
    X['target'] = target_list
    X_list.append(X)

  return X_list

def convert_coo_format_for_label_events(df):
  """Flatten the per-node ``source``/``target`` lists of one event into an edge index.

  Returns:
    A ``torch.long`` tensor with two rows: all source ids followed row-wise by
    all target ids, concatenated in node order.
  """
  endpoints = [
      [node for per_node in df[column].to_numpy() for node in per_node]
      for column in ('source', 'target')
  ]
  return torch.tensor(endpoints, dtype=torch.long)
    
def create_COO_format_data_label_list(X_list, graph_data_x_list, graph_data_y_list, TOTAL_EVENTS):
  """Build one ``Data`` object per event for the same-label graphs.

  Each object combines the event's x tensor, its y tensor and the edge index
  obtained from the event's ``source``/``target`` columns. The returned list
  is meant to be handed to a data loader.
  """
  return [
      Data(x = graph_data_x_list[event_id],
           y = graph_data_y_list[event_id],
           edge_index = convert_coo_format_for_label_events(X_list[event_id]))
      for event_id in range(TOTAL_EVENTS)
  ]

def create_data_label_list(df_event_list, graph_data_x_list, graph_data_y_list, TOTAL_EVENTS):
  """Create the list of same-label graphs.

  Chains the three steps: split each event by label, write the resulting
  edges into the event frames, and wrap everything into ``Data`` objects.
  """
  particle_lists = generate_particle_lists_for_label_connections(df_event_list)
  X_list = generate_X_label_list(*particle_lists, df_event_list, TOTAL_EVENTS)
  return create_COO_format_data_label_list(X_list, graph_data_x_list, graph_data_y_list, TOTAL_EVENTS)

# Radius

In [None]:
def get_features_extraction_list(df_event_list, TOTAL_EVENTS):
  """Return, for each of the first ``TOTAL_EVENTS`` events, a frame holding only the nine feature columns."""
  feature_columns = ["pid", "pos_r", "pos_theta", "pos_phi", "pos_t", "mom_p", "mom_theta", "mom_phi", "mom_mass"]
  return [df_event_list[event_id][feature_columns] for event_id in range(TOTAL_EVENTS)]

def get_PCA_transformed_features(feature_extraction_list, TOTAL_EVENTS):
  """Project each event's features onto its own principal components.

  A fresh ``PCA()`` with default settings is fitted per event, so the
  components of different events are unrelated to each other.

  Returns:
    A list with one transformed array per event.
  """
  return [
      PCA().fit_transform(feature_extraction_list[event_id])
      for event_id in range(TOTAL_EVENTS)
  ]

def get_2_D_coordinates(X_pca_list, length_of_each_event, TOTAL_EVENTS):
  """Enumerate every (source, target) node pair of each event with a 2-tuple of PCA values.

  For the pair ``(s, t)`` the tuple is ``(component 5 of node s, component 6
  of node t)``. Pairs are listed source-major, so entry ``s * length + t``
  belongs to the pair ``(s, t)``.

  Returns:
    ``(point_event_list, index_event_list)`` - per event, the list of value
    tuples and the matching list of ``(s, t)`` index tuples.
  """
  # Column indices of the two principal components, chosen after data exploration
  first_component, second_component = 5, 6

  point_event_list = []  # value tuples of every pair, per event
  index_event_list = []  # (source, target) indices of every pair, per event
  for event_id in range(TOTAL_EVENTS):
    node_range = range(length_of_each_event[event_id])
    index_list = list(itertools.product(node_range, repeat=2))
    points = [
        (X_pca_list[event_id][node_id_source, first_component],
         X_pca_list[event_id][node_id_target, second_component])
        for node_id_source, node_id_target in index_list
    ]
    point_event_list.append(points)
    index_event_list.append(index_list)
  return point_event_list, index_event_list

def calculate_euclidean_distance(point_event_list, length_of_each_event, TOTAL_EVENTS):
  """Compute the pairwise 2-D Euclidean distance between the nodes of each event.

  ``point_event_list[event_id]`` is expected in the layout produced by
  ``get_2_D_coordinates``: a flat, source-major list over all (source, target)
  pairs whose entry ``s * length + t`` is ``(x of node s, y of node t)``. The
  2-D point of node ``i`` is therefore the diagonal entry ``i * length + i``.

  Indexing that flat list directly by node id only ever reads pairs whose
  source is node 0, which made every row of the distance table identical and
  compared an x value against a y value.

  Returns:
    A list with one flat, source-major list of ``length ** 2`` distances per
    event.
  """
  distance_event_list = []
  for event_id in range(TOTAL_EVENTS):
    event_length = length_of_each_event[event_id]
    points = point_event_list[event_id]

    # Recover each node's own (x, y) from the diagonal of the pair list
    node_coordinates = [points[node_id * event_length + node_id] for node_id in range(event_length)]

    distance_list = [
        distance.euclidean(node_coordinates[node_id_source], node_coordinates[node_id_target])
        for node_id_source in range(event_length)
        for node_id_target in range(event_length)
    ]
    distance_event_list.append(distance_list)
  return distance_event_list

def calculate_node_distances_by_event(distance_event_list, length_of_each_event, TOTAL_EVENTS):
  """Cut each event's flat distance list into one chunk per node.

  ``np.array_split`` divides the flat list into ``length_of_each_event[event_id]``
  sections, so chunk ``i`` holds the distances whose source is node ``i``.
  """
  return [
      np.array_split(distance_event_list[event_id], length_of_each_event[event_id])
      for event_id in range(TOTAL_EVENTS)
  ]

def calculate_edges_by_radius(distance_each_particle_event_list, length_of_each_event, TOTAL_EVENTS, radius = 0.2):
  """Keep every node pair whose distance does not exceed ``radius``.

  Pairs are scanned source-major; a pair ``(s, t)`` becomes an edge when
  ``distance[event][s][t] <= radius``.

  Returns:
    ``(neighbour_event_list, target_event_list, source_event_list)`` - per
    event, the accepted ``(s, t)`` tuples, their target ids and their source
    ids (note the order: targets come before sources).
  """
  neighbour_event_list, target_event_list, source_event_list = [], [], []
  for event_id in range(TOTAL_EVENTS):
    node_range = range(length_of_each_event[event_id])
    neighbour_list = [
        (node_id_source, node_id_target)
        for node_id_source in node_range
        for node_id_target in node_range
        if distance_each_particle_event_list[event_id][node_id_source][node_id_target] <= radius
    ]
    neighbour_event_list.append(neighbour_list)
    target_event_list.append([node_id_target for _, node_id_target in neighbour_list])
    source_event_list.append([node_id_source for node_id_source, _ in neighbour_list])
  return neighbour_event_list, target_event_list, source_event_list

def convert_coo_format_for_radius_events(source_event_list,target_event_list, event_id):
  """Stack the source and target id lists of event ``event_id`` into a 2-row ``torch.long`` edge index."""
  endpoints = [source_event_list[event_id], target_event_list[event_id]]
  return torch.tensor(endpoints, dtype=torch.long)

def create_COO_format_data_radius_list(source_event_list,target_event_list, graph_data_x_list, graph_data_y_list, TOTAL_EVENTS):
  """Build one ``Data`` object per event for the radius graphs.

  Each object combines the event's x tensor, its y tensor and the edge index
  made from the event's radius-filtered source/target ids. The returned list
  is meant to be handed to a data loader.
  """
  return [
      Data(x = graph_data_x_list[event_id],
           y = graph_data_y_list[event_id],
           edge_index = convert_coo_format_for_radius_events(source_event_list,target_event_list, event_id))
      for event_id in range(TOTAL_EVENTS)
  ]

def create_data_radius_list(df_event_list, length_of_each_event, graph_data_x_list, graph_data_y_list, TOTAL_EVENTS, radius = 0.2):
  """Create the list of radius graphs.

  Pipeline: extract the feature columns, PCA-transform them per event, derive
  pairwise distances and connect every node pair whose distance is at most
  ``radius``.

  Args:
    df_event_list: per-event DataFrames.
    length_of_each_event: number of particles in each event.
    graph_data_x_list, graph_data_y_list: per-event node feature / label tensors.
    TOTAL_EVENTS: number of leading events to process.
    radius: distance threshold for creating an edge (default 0.2).

  Returns:
    A list with one ``Data`` object per event.
  """
  feature_extraction_list = get_features_extraction_list(df_event_list, TOTAL_EVENTS)
  X_pca_list = get_PCA_transformed_features(feature_extraction_list, TOTAL_EVENTS)
  point_event_list, index_event_list = get_2_D_coordinates(X_pca_list, length_of_each_event, TOTAL_EVENTS)
  distance_event_list = calculate_euclidean_distance(point_event_list, length_of_each_event, TOTAL_EVENTS)
  distance_each_particle_event_list = calculate_node_distances_by_event(distance_event_list, length_of_each_event,  TOTAL_EVENTS)
  # Forward the caller's radius; a literal 0.2 here would silently override it.
  neighbour_event_list, target_event_list, source_event_list = calculate_edges_by_radius(distance_each_particle_event_list, length_of_each_event, TOTAL_EVENTS, radius = radius)
  data_radius_list = create_COO_format_data_radius_list(source_event_list,target_event_list, graph_data_x_list, graph_data_y_list, TOTAL_EVENTS)
  return data_radius_list

# KNN

In [None]:
def define_knn(num_neighbours=8):
  """Create an unfitted ``NearestNeighbors`` searcher returning ``num_neighbours`` neighbours.

  Uses the Minkowski metric with ``p=2``; all settings are passed explicitly.
  """
  knn_options = dict(
      n_neighbors=num_neighbours,
      radius=1.0,
      algorithm='auto',
      leaf_size=30,
      metric='minkowski',
      p=2,
      metric_params=None,
      n_jobs=None,
  )
  return NearestNeighbors(**knn_options)

def generate_X_list_knn(knn, df_event_list, TOTAL_EVENTS, num_neighbours=8):
  """Attach each node's k nearest neighbours to the per-event frames.

  For every event the ``source``/``target`` columns are dropped, ``knn`` is
  fitted on the remaining columns and queried for ``num_neighbours``
  neighbours of every row.

  Args:
    knn: object exposing ``fit(X)`` and
      ``kneighbors(X, n_neighbors=..., return_distance=False)``.
    df_event_list: per-event DataFrames that contain ``source`` and ``target``
      columns.
    TOTAL_EVENTS: number of leading events to process.
    num_neighbours: neighbours requested per node.

  Returns:
    A list of new per-event DataFrames (the inputs are not modified). In each,
    ``target`` holds the array of neighbour row positions of the node and
    ``source`` an equally long array filled with the node's own position.
  """
  X_list_knn = []
  for event_id in range(TOTAL_EVENTS):
    # drop() returns a new frame, so the caller's event frame stays untouched
    X = df_event_list[event_id].drop(columns=['source', 'target'])

    # NOTE(review): every remaining column is used as a KNN coordinate. If the
    # frames still carry 'event_list' and 'label' at this point, the class
    # label influences the neighbourhoods - confirm this is intended.
    knn.fit(X)
    neighbour = knn.kneighbors(X, n_neighbors=num_neighbours, return_distance=False)

    # Whole-column assignment instead of `X['col'].iloc[i] = ...`: chained
    # assignment writes to a temporary under pandas Copy-on-Write and would
    # leave the columns empty. Each source cell holds only this node's own ids
    # rather than an (n_nodes, k) block.
    X['source'] = [np.full(len(neighbour_row), node_id) for node_id, neighbour_row in enumerate(neighbour)]
    X['target'] = list(neighbour)
    X_list_knn.append(X)
  return X_list_knn

def convert_COO_for_knn_events(df, num_neighbours=8):
  """Build the 2-row ``torch.long`` edge index of one KNN event.

  The source row repeats every node position ``num_neighbours`` times; the
  target row is the concatenation of the per-node ``target`` cells, so each
  cell is expected to hold exactly ``num_neighbours`` ids.
  """
  source_list = [node_id for node_id in range(len(df)) for _ in range(num_neighbours)]
  target_list = [neighbour_id for per_node in df['target'].to_numpy() for neighbour_id in per_node]
  return torch.tensor([source_list, target_list], dtype=torch.long)

def create_COO_format_data_knn_list(X_list_knn, graph_data_x_list, graph_data_y_list, TOTAL_EVENTS, num_neighbours=8):
  """Build one ``Data`` object per event for the K-nearest-neighbour graphs.

  Args:
    X_list_knn: per-event DataFrames whose ``target`` column holds the
      neighbour ids of each node.
    graph_data_x_list, graph_data_y_list: per-event node feature / label tensors.
    TOTAL_EVENTS: number of leading events to process.
    num_neighbours: neighbours stored per node (default 8); must match the
      value used when ``X_list_knn`` was generated.

  Returns:
    A list of ``Data`` items to be passed on to the data loader.
  """
  data_knn_list = []
  for event_id in range(TOTAL_EVENTS):
    data_item = Data(x = graph_data_x_list[event_id],
                    y = graph_data_y_list[event_id],
                    # forward the caller's k instead of a hard-coded 8
                    edge_index = convert_COO_for_knn_events(X_list_knn[event_id], num_neighbours=num_neighbours))
    data_knn_list.append(data_item)
  return data_knn_list

def create_data_knn_list(df_event_list, graph_data_x_list, graph_data_y_list, TOTAL_EVENTS, num_neighbours=8):
  """Create the list of K-nearest-neighbour graphs.

  Defines the neighbour searcher, finds ``num_neighbours`` neighbours for
  every node of every event and wraps the result into ``Data`` objects.

  Args:
    df_event_list: per-event DataFrames.
    graph_data_x_list, graph_data_y_list: per-event node feature / label tensors.
    TOTAL_EVENTS: number of leading events to process.
    num_neighbours: neighbours per node (default 8).

  Returns:
    A list with one ``Data`` object per event.
  """
  # The same k has to reach all three steps; hard-coding 8 here would ignore the argument.
  knn = define_knn(num_neighbours=num_neighbours)
  X_list_knn = generate_X_list_knn(knn, df_event_list, TOTAL_EVENTS, num_neighbours=num_neighbours)
  data_knn_list = create_COO_format_data_knn_list(X_list_knn, graph_data_x_list, graph_data_y_list, TOTAL_EVENTS, num_neighbours=num_neighbours)
  return data_knn_list

# Create dataset

In [None]:
def get_data_lists_for_variable_size_graphs(path_to_file, TOTAL_EVENTS, radius=0.2, num_neighbours=8):
  """Build the label, radius and KNN graph lists for variable-size graphs.

  Args:
    path_to_file: CSV file with the particle table.
    TOTAL_EVENTS: number of events to convert.
    radius: distance threshold handed to the radius-graph builder (default 0.2).
    num_neighbours: k handed to the KNN-graph builder (default 8).

  Returns:
    ``(data_radius_list, data_knn_list, data_label_list, df_event_list,
    length_of_each_event, df_event_processed_list_cleaned)``.
  """
  # VARIABLE SIZE GRAPHS
  df, df_normalised_features, df_event_list, length_of_each_event, df_event_processed_list_cleaned, graph_data_x_list, graph_data_y_list = create_graph_nodes_and_labels_for_variable_graphs(path_to_file, TOTAL_EVENTS)
  data_label_list = create_data_label_list(df_event_list, graph_data_x_list, graph_data_y_list, TOTAL_EVENTS)
  data_radius_list = create_data_radius_list(df_event_list, length_of_each_event, graph_data_x_list, graph_data_y_list, TOTAL_EVENTS, radius = radius)
  data_knn_list  = create_data_knn_list(df_event_list,graph_data_x_list, graph_data_y_list, TOTAL_EVENTS, num_neighbours=num_neighbours)
  return data_radius_list, data_knn_list, data_label_list, df_event_list, length_of_each_event, df_event_processed_list_cleaned

# Creating, saving and loading graphs


In [None]:
def save_data_list_as_torch_data_object(data_list,  path_to_save_data_object, TOTAL_EVENTS):
  """Write the first ``TOTAL_EVENTS`` items of ``data_list`` to disk with ``torch.save``.

  Item ``i`` goes to ``<path_to_save_data_object>event_<i>.pickle``; the path
  argument is used as a plain string prefix.

  Returns:
    A short confirmation message stating how many objects were saved.
  """
  for event_id in range(TOTAL_EVENTS):
    torch.save(data_list[event_id],
               ''.join([path_to_save_data_object, 'event_', str(event_id), '.pickle']))
  return "Saved {} data objects".format(TOTAL_EVENTS)

def load_data_list_as_torch_object( path_to_save_data_object, TOTAL_EVENTS):
  """Load the objects written by ``save_data_list_as_torch_data_object``.

  Args:
    path_to_save_data_object: the same string prefix that was used for saving.
    TOTAL_EVENTS: number of files ``<prefix>event_<i>.pickle`` to read.

  Returns:
    The loaded objects, in event order, exactly as they were saved.
  """
  data_list_loaded_list = []
  for x in range(TOTAL_EVENTS):
    # torch.load already returns the saved object. Wrapping it in Data(...)
    # would store the whole graph as the `x` attribute of a new, otherwise
    # empty Data instance.
    # NOTE: torch.load unpickles the file - only load files from a trusted source.
    data_list_x = torch.load(path_to_save_data_object + 'event_' + str(x) + '.pickle')
    data_list_loaded_list.append(data_list_x)
  return data_list_loaded_list


In [None]:
# Set the number of events to convert and the CSV file they are read from,
# then build the radius, KNN and label graph lists in one go.
path_to_file = "output_11_07_2021.csv"
TOTAL_EVENTS = 10
(
    data_radius_list_variable,
    data_knn_list_variable,
    data_label_list_variable,
    df_event_list_variable,
    length_of_each_event_variable,
    df_event_processed_list_cleaned_variable,
) = get_data_lists_for_variable_size_graphs(path_to_file, TOTAL_EVENTS)

In [None]:
# Google Drive folders that receive the saved graphs, one per edge-construction method
path_to_save_data_object_radius = (
    '/content/drive/MyDrive/FCC_Experiments_2021/dataset/'
    'pytorch_geometric_data_lists/variable/data_radius_list/'
)
path_to_save_data_object_knn = (
    '/content/drive/MyDrive/FCC_Experiments_2021/dataset/'
    'pytorch_geometric_data_lists/variable/data_knn_list/'
)
path_to_save_data_object_label = (
    '/content/drive/MyDrive/FCC_Experiments_2021/dataset/'
    'pytorch_geometric_data_lists/variable/data_label_list/'
)

In [None]:
# Save the radius graphs to their Drive folder
save_data_list_as_torch_data_object(
    data_radius_list_variable,
    path_to_save_data_object_radius + 'data_radius_list_variable_',
    10,
)


'Saved 10 data objects'

In [None]:
# Save the KNN graphs to their Drive folder. This cell used to repeat the
# radius save, which left the KNN list and its path unused.
save_data_list_as_torch_data_object(data_knn_list_variable, path_to_save_data_object= path_to_save_data_object_knn + 'data_knn_list_variable_', TOTAL_EVENTS=10)


'Saved 10 data objects'

In [None]:
# Save the same-label graphs to their Drive folder
save_data_list_as_torch_data_object(
    data_label_list_variable,
    path_to_save_data_object_label + 'data_label_list_variable_',
    10,
)


'Saved 10 data objects'

In [None]:
# save_data_list_as_torch_data_object(data_knn_list_variable,  path_to_save_data_object= path_to_save_data_object_knn + 'data_knn_list_variable_', TOTAL_EVENTS=10)
# save_data_list_as_torch_data_object(data_radius_list_variable, path_to_save_data_object= path_to_save_data_object_radius +'data_radius_list_variable_', TOTAL_EVENTS=10)
# save_data_list_as_torch_data_object(data_label_list_variable , path_to_save_data_object=path_to_save_data_object_label +'data_label_list_variable_', TOTAL_EVENTS=10)
