# Building Dataset 
We build a PyTorch `Dataset` object that loads each event and structures it as graph data:

In [1]:
# import libraries : 
import gc 
import os 
import random 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

# set random seed : 
np.random.seed( 41 )

import torch
import torch.nn as nn 
from torch import cdist
from torch import Tensor 
import torch.functional as F 
import torch.utils.data as data 

import torch_geometric
from torch_geometric.data import Data
from torch_geometric.utils import dense_to_sparse , remove_self_loops

In [2]:
# pick the best available accelerator for training, falling back to the CPU : 
# 'cpu' (not 'gpu') is the fallback because 'gpu' is not a device string torch accepts. 

device = 'cpu'
if torch.cuda.is_available(): 
    device = 'cuda'
elif torch.backends.mps.is_available(): 
    device = 'mps'

device 

'mps'

In [3]:
# sanity check : a NumPy array indexed with an integer torch tensor stays a NumPy array. 
x = torch.tensor([1, 2], dtype=torch.int32)
y = np.array([1, 2, 4, 5])
type(y[x])

numpy.ndarray

In [4]:
# sanity check : booleans become 1.0 / 0.0 once stored in a float tensor. 
x = torch.tensor([True, True, False]).float()
x 

tensor([1., 1., 0.])

In [5]:
# worked example : row-wise L2 normalisation of a small matrix. 
x = torch.tensor([[3., 4.], [12., 5.], [7., 24.]])

# euclidean length of every row, kept as a column so it broadcasts against x : 
norm = torch.linalg.norm(x, ord=2, dim=1, keepdim=True)
print(norm)

# unit vectors : every row divided by its own length 
y = x / norm
print(x)
print(y)

# the dot product of a row with its own unit vector gives back the row's length : 
(x * y).sum(dim=1, keepdim=True)

tensor([[ 5.],
        [13.],
        [25.]])
tensor([[ 3.,  4.],
        [12.,  5.],
        [ 7., 24.]])
tensor([[0.6000, 0.8000],
        [0.9231, 0.3846],
        [0.2800, 0.9600]])


tensor([[ 5.],
        [13.],
        [25.]])

In [6]:
class EventData(data.Dataset): 
    '''
    Dataset that serves one graph per event. 

    Every event is stored on disk as three csv files sharing a common prefix : 
    `<event>-hits.csv`, `<event>-truth.csv` and `<event>-cells.csv`. 
    Indexing the dataset reads the three files of one event and returns a 
    torch_geometric `Data` object whose nodes are the hits of that event. 

    NOTE : GraphData builds dense (nhits x nhits) matrices to derive the edges, 
    so its memory use grows quadratically with the number of hits in an event. 
    '''

    # suffix of the per-event hits file; it is used to discover the events in a folder 
    _HITS_SUFFIX = '-hits.csv'
    
    # initialize the event dataset 
    def __init__(self,path:str,threshold_dist:float=40)->None:
        '''
        Inputs : 
            path: path to the folder that contains the csv files 
                  (with or without a trailing separator).  
            threshold_dist: distance cut between two hits. It is stored on the 
                  instance, but the distance mask that would use it is currently 
                  disabled in GraphData. 
        '''
        super().__init__()
        suffix = self._HITS_SUFFIX
        # os.listdir returns entries in arbitrary order; sorting keeps the 
        # index -> event mapping identical across runs and machines. 
        self.events = sorted(
            name[:-len(suffix)] for name in os.listdir(path) if name.endswith(suffix)
        )
        self.num_events = len(self.events)
        self.threshold_dist  = threshold_dist
        self.path = path 

    # read one of the csv tables ('hits', 'truth' or 'cells') of an event 
    def _read_table(self,eventid:str,kind:str) -> pd.DataFrame :
        return pd.read_csv(os.path.join(self.path , eventid + '-' + kind + '.csv'))
        
    # function returns graph type representation of the event dataset 
    def GraphData(self,idx:int) -> 'Data' :
        '''
        Build the graph of the event stored at position `idx`. 

        Inputs : 
            idx: position of the event in `self.events`. 
        Returns : 
            Data object with 
                x          : (nhits, 10) node features : x, y, z, number of cells of the hit, 
                             (tx, ty, tz) - (x, y, z) and (tpx, tpy, tpz). 
                edge_index : (2, num_edges) directed edges without self loops. 
                edge_attr  : (num_edges, 2) cosine between the momentum at the source hit 
                             and the displacement of the edge, and the length of the edge. 
                label      : (num_edges,) 1. when both hits share a non-zero particle_id, else 0. 
                num_nodes , num_edges : sizes of the graph. 
        Raises : 
            IndexError when `idx` is outside of the available events. 
        '''
        eventid = self.events[idx] 
        
        # read the required csv files : 
        hits = self._read_table(eventid , 'hits')
        truth = self._read_table(eventid , 'truth')
        cells = self._read_table(eventid , 'cells')
        # NOTE(review): the truth rows are combined with the hits rows by position, 
        # which assumes both files list the hits in the same order -- confirm. 
        
        # total number of hits : these form the NODES of our graph. 
        nhits = hits.shape[0] 
        # x , y , z spatial features of the hits:  
        hits_spatial = hits[['x','y','z']].to_numpy()
        # number of cells that detect each hit, aligned with the rows of `hits`. 
        # reindex fills hits without any cell with 0 instead of failing on the missing label. 
        cell_counts = (
            cells.hit_id.value_counts()
            .reindex(hits.hit_id , fill_value = 0)
            .to_numpy()
            .reshape((-1,1))
        )
        # true momentum of the particle at every hit (also reused for the edge attributes) : 
        truth_momentum = truth[['tpx' , 'tpy' ,'tpz']].to_numpy()
        
        # node feature matrix : position, cell count, offset to the true position, true momentum 
        node_fets = np.concatenate(
            (
                hits_spatial , 
                cell_counts , 
                truth[['tx' , 'ty' , 'tz'  ]].to_numpy() - hits_spatial , 
                truth_momentum 
            ), 
            axis = 1 
        )
        node_fets = torch.as_tensor( node_fets , dtype = torch.float32 )
        
        # here we create edge_index's for the graph skeleton : 
        # columns 4 and 5 of the hits table are read positionally as volume id and layer id 
        # (assumed column order -- confirm against the csv schema). 
        ids = hits.iloc[: , [4,5]].to_numpy(dtype = int)
        volume_id = torch.as_tensor( ids[: , 0] , dtype = torch.float32 )
        layer_id = torch.as_tensor( ids[: , 1] , dtype = torch.float32 )
        # entry [i , j] holds id[j] - id[i] ; the volume difference is computed once and reused. 
        volume_diff = volume_id.unsqueeze(0) - volume_id.unsqueeze(1)
        layer_diff = layer_id.unsqueeze(0) - layer_id.unsqueeze(1)
        # hit i is connected to hit j when j lies in a higher volume, 
        # or in the same volume on the same or a higher layer. 
        # (the distance cut with self.threshold_dist is currently disabled.) 
        mask = ( volume_diff > 0 ) | ( ( volume_diff == 0 ) & ( layer_diff >= 0 ) )
        # drop the dense intermediates right away : their memory is released 
        # as soon as the last reference to them disappears. 
        del volume_id , layer_id , volume_diff , layer_diff 
        # create adj_index from the dense adjacency matrix : 
        edge_index , _ = dense_to_sparse(mask.float()) 
        del mask 
        # remove self loops from adj_index (the mask is true on its diagonal) : 
        edge_index, _ = remove_self_loops(edge_index)
        row , col = edge_index 
        # explicit numpy views of the end points, used to index the numpy tables below 
        src , dst = row.numpy() , col.numpy()
        
        # number of edges : 
        num_edges = edge_index.shape[1]
        
        # create edge labels and edge attributes : 
        # Labels : 
            # label == 1 if both hits belong to the same particle ( particle_id != 0 ) 
            # label == 0 otherwise 
        particle_id = truth.particle_id.to_numpy()
        same_particle = ( particle_id[src] == particle_id[dst] ) & ( particle_id[src] != 0 )
        edge_labels = torch.from_numpy( same_particle ).float()
        
        # Attributes : 
            # Angle: cosine between the momentum vector at the source hit and the displacement vector between the hits. 
            # Distance: euclidean distance between the two hits. 
        pVector = torch.as_tensor( truth_momentum[src] , dtype = torch.float32 )
        pVector = pVector/torch.linalg.norm( pVector , ord = 2 , dim = 1 , keepdim= True )
        disp = torch.as_tensor( hits_spatial[src] - hits_spatial[dst] , dtype = torch.float32 )
        dist = torch.linalg.norm( disp , ord = 2 , dim = 1 , keepdim=True )
        angle = torch.sum( pVector*(disp/dist) , dim = 1 , keepdim=True )
        # a zero momentum or a zero displacement gives 0/0 = nan ; such edges get a cosine of 0. 
        angle[torch.isnan(angle)] = 0.
        edge_attr = torch.cat([angle , dist] , dim = 1 )
        del pVector , disp , angle , dist 
        
        # define graph data : 
        graph_data = Data(
            x = node_fets , 
            edge_index=edge_index , 
            edge_attr = edge_attr , 
            label = edge_labels , 
            num_nodes = nhits , 
            num_edges = num_edges 
        )
        
        return graph_data 
    
    def __len__(self)->int: 
        '''Number of events found in the folder.'''
        return self.num_events 
    
    def __getitem__(self,index:int)->'Data':
        '''Return the graph of the event at position `index` (see GraphData).'''
        return self.GraphData(index)

In [None]:
# smoke test : index the training events and report how many were found. 
dataset = EventData(
    path='../data/train_100_events/',
)
size = len(dataset)
size 

100

: 

In [None]:
# draw one event index at random and build the graph of that event : 
rnum = np.random.choice(a=np.arange(size))
random_event = dataset[rnum]