In [30]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn.functional as F
from torch_geometric.data import Data, InMemoryDataset
from torch_geometric.nn import GCNConv, global_mean_pool

In [31]:
def coord_convert(x,y,z):
    """Convert Cartesian coordinates to (phi, eta, z).

    Works element-wise on anything numpy ufuncs accept (scalars, arrays,
    pandas Series).

    Args:
        x, y, z: Cartesian coordinates.

    Returns:
        Tuple ``(phi, eta, z)`` where
        * ``phi`` is the azimuthal angle ``arctan2(y, x)``,
        * ``eta`` is ``-ln(tan(theta / 2))`` with ``theta`` the polar angle
          measured from the +z axis (the pseudorapidity),
        * ``z`` is the input ``z`` passed through unchanged.
    """
    # Distance from the z axis; together with z it fixes the polar angle.
    transverse = np.sqrt(x ** 2 + y ** 2)

    azimuth = np.arctan2(y, x)
    polar = np.arctan2(transverse, z)

    # A point on the +z axis has polar == 0, so tan() is 0 and eta is +inf.
    pseudorapidity = -np.log(np.tan(polar / 2))

    return azimuth, pseudorapidity, z

In [32]:
def sector_splitter(df, n, m):
    """Split a DataFrame of hits into an ``n`` x ``m`` grid of (phi, eta) sectors.

    phi is binned into ``n`` equal-width bins over [-pi, pi] and eta into ``m``
    equal-width bins over [-4.5, 4.5]. Bins are half-open ``[low, high)``,
    except the last bin of each axis, which is closed ``[low, high]``. Every
    row inside the covered range therefore lands in exactly one sector,
    including rows sitting exactly on a bin edge (e.g. phi == pi).

    Rows whose eta lies outside [-4.5, 4.5] (or is NaN) belong to no sector.

    Args:
        df: DataFrame with 'phi' and 'eta' columns.
        n: Number of phi bins.
        m: Number of eta bins.

    Returns:
        Nested list ``out[i][j]`` holding the rows of ``df`` in phi bin ``i``
        and eta bin ``j``. The original index of ``df`` is preserved.
    """
    phi_bins = np.linspace(-np.pi, np.pi, n+1)  # bins for phi angles
    eta_bins = np.linspace(-4.5, 4.5, m+1)  # bins for eta angles

    def _bin_masks(values, edges):
        # One boolean mask per bin; only the last bin includes its upper edge.
        n_bins = len(edges) - 1
        masks = []
        for k in range(n_bins):
            above_low = values >= edges[k]
            if k == n_bins - 1:
                below_high = values <= edges[k+1]
            else:
                below_high = values < edges[k+1]
            masks.append(above_low & below_high)
        return masks

    # Each axis mask is computed once instead of once per (i, j) pair.
    phi_masks = _bin_masks(df['phi'], phi_bins)
    eta_masks = _bin_masks(df['eta'], eta_bins)

    return [
        [df[phi_mask & eta_mask] for eta_mask in eta_masks]
        for phi_mask in phi_masks
    ]

In [33]:
def create_graph(df):
    """Build an undirected graph connecting hits that are close in (phi, eta).

    Every row of ``df`` becomes a node labelled by its positional index. Two
    nodes ``i < j`` are connected when
    ``dR = sqrt(dphi**2 + deta**2) < 1.7``, and the edge stores
    ``attr = (dphi, deta)`` with ``dphi = phi[i] - phi[j]`` and
    ``deta = eta[i] - eta[j]``.

    Args:
        df: DataFrame with 'phi' and 'eta' columns.

    Returns:
        The populated graph object created by ``nx.Graph()``.
    """
    # NOTE(review): `nx` is not imported in the visible import cell; this
    # presumably needs `import networkx as nx` somewhere -- confirm.
    G = nx.Graph()

    n_hits = len(df)

    # Add nodes
    # NOTE(review): the meaning of the `attr_dict=` keyword depends on the
    # networkx version (unpacked vs. stored as one attribute) -- confirm
    # which one downstream code expects.
    for i in range(n_hits):
        G.add_node(i, attr_dict=df.iloc[i].to_dict())

    # With fewer than two hits there are no pairs to test.
    if n_hits < 2:
        return G

    # Pull the two columns out once; looking rows up through df.iloc inside
    # the O(N^2) pair loop builds a new Series per access and dominates runtime.
    phi = df["phi"].to_numpy()
    eta = df["eta"].to_numpy()

    # Add edges (each unordered pair is visited once because j > i)
    for i in range(n_hits):
        for j in range(i+1, n_hits):
            dphi = phi[i] - phi[j]
            deta = eta[i] - eta[j]
            dR = np.sqrt(dphi**2 + deta**2)

            # Only connect hits that are close enough in the (phi, eta) plane.
            if dR < 1.7:
                G.add_edge(i, j, attr=(dphi, deta))

    return G

In [34]:
def create_graph_tensor(X, Y):
    """Build a PyTorch Geometric ``Data`` graph from one sector of hits.

    Every row of ``X`` becomes a node whose features are all columns of ``X``.
    Two hits are joined by an edge when their separation
    ``dR = sqrt(dphi**2 + deta**2)`` is below 1.7. Each unordered pair is
    stored once, as ``(i, j)`` with ``i < j``; self-pairs are excluded.

    Args:
        X: DataFrame with at least 'phi' and 'eta' columns (one row per hit).
        Y: DataFrame with a 'particle_id' column. Its rows are expected to
            correspond one-to-one with the rows of ``X``; this is not checked
            here.

    Returns:
        ``Data`` with
        * ``x``: (N, number of columns of X) float32 node features,
        * ``edge_index``: (2, E) int64 node index pairs,
        * ``edge_attr``: (E, 2) float32 holding
          ``(phi[j] - phi[i], eta[j] - eta[i])`` for each edge ``(i, j)``,
        * ``y``: the 'particle_id' values of ``Y``.
        All tensors live on the same device (CUDA when available, else CPU).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Node features and labels go to the same device as the edge tensors so
    # the returned Data object is not split across CPU and GPU.
    node_features = torch.tensor(X.values).float().to(device)
    output_features = torch.tensor(Y['particle_id'].values).to(device)

    # 1-D coordinate vectors; column vectors of shape (N, 1) would leak a
    # spurious trailing axis into every pairwise quantity below.
    phi = torch.tensor(X['phi'].values).to(device)
    eta = torch.tensor(X['eta'].values).to(device)
    n_nodes = phi.shape[0]

    # Pairwise differences, shape (N, N): entry [i, j] is value[j] - value[i].
    # NOTE: these are dense N x N matrices, so memory grows quadratically.
    phi_diff = phi.unsqueeze(0) - phi.unsqueeze(-1)
    eta_diff = eta.unsqueeze(0) - eta.unsqueeze(-1)
    # NOTE(review): dphi is not wrapped into [-pi, pi]; that is harmless only
    # if a sector never straddles the +/-pi seam -- confirm for other callers.

    #this is deltaR
    diff_norm = torch.sqrt(phi_diff**2 + eta_diff**2)

    # Keep each unordered pair exactly once by requiring i < j. Filtering on
    # (dphi > 0) & (deta > 0) instead would also discard every pair whose
    # dphi and deta have opposite signs, not just the duplicates.
    node_ids = torch.arange(n_nodes, device=device)
    upper_triangle = node_ids.unsqueeze(0) > node_ids.unsqueeze(-1)
    adjacency_matrix = (diff_norm < 1.7) & upper_triangle

    # Convert the adjacency matrix to a (2, E) tensor of edge indices
    edge_indices = adjacency_matrix.nonzero(as_tuple=False).t()

    # Edge features: (dphi, deta) of each retained pair, shape (E, 2),
    # cast to float32 to match the node features.
    edge_phi = phi_diff[edge_indices[0], edge_indices[1]]
    edge_eta = eta_diff[edge_indices[0], edge_indices[1]]
    edge_features = torch.cat((
        edge_phi.unsqueeze(-1),
        edge_eta.unsqueeze(-1)
    ), -1).float()

    # Create a PyTorch Geometric Data object
    data = Data(
        x          = node_features,
        edge_index = edge_indices.long(),
        edge_attr  = edge_features,
        y          = output_features
    )

    return data

In [35]:
def dataMaker(datamax,
              directory='/home/aaportel/teams/group-3/data/',
              folders=('train_1/', 'train_2/', 'train_5/')):
    """Read hit/truth CSV pairs and turn them into per-sector graphs.

    For every ``*hits.csv`` file in each folder, the ``*truth.csv`` file with
    the same event ID (the file name part before the first '-') is loaded.
    Hits are converted to (phi, eta, z), split into an 8 x 4 grid of
    (phi, eta) sectors, and one graph per sector is built with
    ``create_graph_tensor``. Graphs are moved to the CPU before being
    collected.

    The truth rows of a sector are the rows with the same index as the hits in
    that sector, so node features and ``particle_id`` labels stay aligned.
    NOTE(review): this assumes row k of the hits file and row k of the truth
    file describe the same hit -- confirm against the dataset format.

    Args:
        datamax: Maximum number of graphs to produce; values <= 0 yield an
            empty list.
        directory: Root directory that contains the event folders.
        folders: Sub-folders of ``directory`` to scan, in order.

    Returns:
        List of at most ``datamax`` graph objects, all on the CPU.
    """
    data = []
    if datamax <= 0:
        return data

    dataCutoff = 0
    n, m = 8, 4

    for folder in folders:
        print(folder)

        folder_path = os.path.join(directory, folder)
        filenames = os.listdir(folder_path)
        hit_files = sorted(f for f in filenames if f.endswith('hits.csv'))
        # Pair files by event ID rather than by position in two sorted lists:
        # a single missing file would otherwise mispair every later event.
        truth_by_id = {
            f.split('-')[0]: f for f in filenames if f.endswith('truth.csv')
        }

        for hit_file in hit_files:
            ID = hit_file.split('-')[0]
            truth_file = truth_by_id.get(ID)
            if truth_file is None:
                print(f'{hit_file}: no matching truth file, skipping')
                continue

            print(hit_file)
            # read a CSV file into a DataFrame
            X = pd.read_csv(os.path.join(folder_path, hit_file), usecols=['x','y','z'])
            Y = pd.read_csv(os.path.join(folder_path, truth_file), usecols=['tx','ty','tz','particle_id'])

            if len(X) != len(Y):
                # Labels are matched to hits by row, so the counts must agree.
                print(f'{hit_file}: {len(X)} hits but {len(Y)} truth rows, skipping')
                continue

            # calculate phi, eta, and z from x, y, and z
            phiX, etaX, zX = coord_convert(X['x'],X['y'],X['z'])
            phiY, etaY, zY = coord_convert(Y['tx'],Y['ty'],Y['tz'])

            # create new DataFrames with phi, eta, and z columns
            X = pd.DataFrame({
                'phi': phiX,
                'eta': etaX,
                'z': zX
            })

            Y = pd.DataFrame({
                'phi': phiY,
                'eta': etaY,
                'z': zY,
                'particle_id': Y['particle_id']
            })

            # Sectors are defined by the measured hit positions only. Splitting
            # the truth table by its own coordinates would give sectors whose
            # rows do not match the hits.
            X_list = sector_splitter(X, n, m)

            for i in range(n):
                for j in range(m):
                    outfile = f'{ID}-graph_phi{i}_eta{j}.pt'

                    X_sector = X_list[i][j]
                    Y_sector = Y.loc[X_sector.index]

                    G = create_graph_tensor(X_sector, Y_sector)
                    g = G.cpu()
                    # Drop the original graph and release cached GPU memory
                    # before the next sector is built.
                    del G
                    torch.cuda.empty_cache()

                    #torch.save(G, directory + folder + outfile)

                    print(f'{outfile}')
                    data.append(g)

                    dataCutoff+=1
                    if dataCutoff >= datamax:
                        print('cutoff')
                        return data
    return data