## Required package

In [114]:
#import uproot
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from collections import namedtuple, defaultdict
#import open3d as o3d
import random
random.seed(42)
import h5py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch_geometric.utils import add_self_loops
from torch_geometric.transforms import ToUndirected
from torchvision import transforms
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import joblib
from torch.nn import BatchNorm1d
from torch.optim.lr_scheduler import LambdaLR

## Required data generated by GNNonCalo_Scaling_DataPreparation.ipynb

In [115]:
hf_cellFeaturesScaled_neighbor= h5py.File("./cellFeaturesScaled_train_500evs.hdf5", 'r')
hf_train_edge_source_BD = h5py.File("./train_edge_source_BD_500evs.hdf5", 'r')
hf_train_edge_dest_BD = h5py.File("./train_edge_dest_BD_500evs.hdf5", 'r')
hf_train_edge_source_noBD = h5py.File("./train_edge_source_noBD_500evs.hdf5", 'r')
hf_train_edge_dest_noBD = h5py.File("./train_edge_dest_noBD_500evs.hdf5", 'r')
hf_truth_label_train_neighbor= h5py.File("./truth_label_train_500evs.hdf5", 'r')

In [116]:
cellFeaturesScaled = hf_cellFeaturesScaled_neighbor.get("cellFeatures_trainS")[:]
train_edge_source_BD = hf_train_edge_source_BD.get("train_edge_source_BD")[:]
train_edge_dest_BD = hf_train_edge_dest_BD.get("train_edge_dest_BD")[:]
train_edge_source_noBD = hf_train_edge_source_noBD.get("train_edge_source_noBD")[:]
train_edge_dest_noBD = hf_train_edge_dest_noBD.get("train_edge_dest_noBD")[:]
truth_label_train = hf_truth_label_train_neighbor.get("truth_label_train")[:]

In [117]:
hf_cellFeaturesScaled_neighbor.close()
hf_train_edge_source_BD.close()
hf_train_edge_dest_BD.close()
hf_train_edge_source_noBD.close()
hf_train_edge_dest_noBD.close()
hf_truth_label_train_neighbor.close()

In [118]:
cellFeaturesScaled.shape

(500, 187652, 8)

In [119]:
cellFeaturesScaled[2][1]

array([0.58421445, 0.51289224, 0.16380877, 0.23466876, 0.52418755,
       0.26086957, 0.09700815, 0.14570355])

In [120]:
x = torch.tensor(cellFeaturesScaled, dtype=torch.float)

In [121]:
x[0].shape

torch.Size([187652, 8])

In [122]:
train_edge_source_BD.shape

(500, 60000)

## Preparing bi directional edges (align source and destination) for GNN

In [123]:
edge_index = torch.tensor([train_edge_source_BD, train_edge_dest_BD], dtype=torch.long)

In [124]:
edge_index.shape

torch.Size([2, 500, 60000])

In [125]:
edge_index_ch = edge_index.permute(1, 0, 2)

In [126]:
edge_index_ch.shape

torch.Size([500, 2, 60000])

## Preparing uni directional edges for final binary classification

In [127]:
edge_index_out = torch.tensor([train_edge_source_noBD, train_edge_dest_noBD], dtype=torch.long)

In [128]:
edge_index_out.shape

torch.Size([2, 500, 30000])

In [129]:
edge_index_out_ch = edge_index_out.permute(1, 0, 2)

In [130]:
edge_index_out_ch.shape

torch.Size([500, 2, 30000])

## Preparing label (true/Fake) tensor 

In [131]:
truth_label_train = np.expand_dims(truth_label_train, axis=1)

In [132]:
truth_label_train.shape

(500, 1, 30000)

In [133]:
y_train = torch.tensor(truth_label_train, dtype=torch.float)

In [134]:
y_train.shape

torch.Size([500, 1, 30000])

## Data customization specific to pytorch 

In [135]:
# Generate data_list
data_list = []
for i in range(500):
    x_mat = x[i]
    edge_index = edge_index_ch[i]
    edge_index, _ = add_self_loops(edge_index)
    data = Data(x=x_mat, edge_index=edge_index, edge_index_out = edge_index_out_ch[i], y=y_train[i])
    data = ToUndirected()(data)
    data_list.append(data)

In [136]:
ind0 = data_list[0].edge_index
ind1 = data_list[1].edge_index
ind2 = data_list[2].edge_index

In [137]:
print(ind0.shape)
print(ind1.shape)
print(ind2.shape)

torch.Size([2, 247649])
torch.Size([2, 247651])
torch.Size([2, 247648])


In [141]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data_list):
        self.data_list = data_list

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        return self.data_list[idx]

In [142]:
def collate(data_list):
    batch_x = [data.x for data in data_list]
    batch_edge_index = [data.edge_index for data in data_list]
    batch_edge_index_out = [data.edge_index_out for data in data_list]
    batch_y = [data.y for data in data_list]

    return batch_x, batch_edge_index, batch_edge_index_out, batch_y

In [143]:
custom_dataset = CustomDataset(data_list)

In [144]:
batch_size = 20
data_loader = torch.utils.data.DataLoader(custom_dataset, batch_size=batch_size, collate_fn=collate)

In [145]:
for batch_x, batch_edge_index, batch_edge_index_out, _ in data_loader:
    print(len(batch_edge_index))

20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20


In [146]:
#y_test.shape

In [147]:
data.edge_index.shape

torch.Size([2, 247650])

In [148]:
data.y.shape

torch.Size([1, 30000])

In [149]:
x.size(1)

187652

## Edge Classifier Model

In [163]:
# Define the GNN model for edge classification
class EdgeClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(EdgeClassifier, self).__init__()

        # Node embedding layer
        self.node_embedding = nn.Linear(input_dim, hidden_dim)

        # Graph convolutional layers
        self.conv1 = GCNConv(hidden_dim, 128)
        self.bn1 = BatchNorm1d(128)
        
        self.conv2 = GCNConv(128, 64)
        self.bn2 = BatchNorm1d(64)
        
        # Edge classification layer
        self.fc = nn.Linear(128 , output_dim)

    def forward(self, x, edge_index, edge_index_out):
        edge_index = edge_index
        x = self.node_embedding(x)
        x = self.conv1(x, edge_index)
        x = self.bn1(x)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        x = self.bn2(x)
        x = torch.relu(x)

        # Edge representations
        edge_index_to_compare = edge_index_out
        edge_rep = torch.cat([x[edge_index_to_compare[0]], x[edge_index_to_compare[1]]], dim=1)

        # Edge classification
        edge_scores = torch.sigmoid(self.fc(edge_rep))

        return edge_scores

In [164]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Instantiate the model
input_dim = 8
hidden_dim = 256
output_dim = 1  # Binary classification (citing or not citing)
model = EdgeClassifier(input_dim, hidden_dim, output_dim)

# Define the loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

def lr_schedule(epoch):
    if epoch < 10:
        return 1.0  # No change for the first 10 epochs
    else:
        return 0.1  # Decrease learning rate by a factor of 10 after 10 epochs

# Define the learning rate scheduler
scheduler = LambdaLR(optimizer, lr_lambda=lr_schedule)



In [165]:
def train(model, device, data_loader, optimizer, criterion):
    model.train()
    model.to(device)
    #output = []
    totalLossPerEpoch = []
    for batch_x, batch_edge_index, batch_edge_index_out, batch_y in data_loader:
        batch_x = torch.stack(batch_x).to(device)
        batch_edge_index = [edge_index.to(device) for edge_index in batch_edge_index]
        #print(batch_edge_index[0].shape)
        batch_edge_index_out = [edge_index.to(device) for edge_index in batch_edge_index_out]
        batch_y = [y.to(device) for y in batch_y]
        #print(len(batch_y))
        optimizer.zero_grad()
        loss_per_batch = []
        for i in range(len(batch_edge_index)):
            _output = model(batch_x[i], batch_edge_index[i], batch_edge_index_out[i])
            #print(len(_output))
            #output.append(_output)
            loss = criterion(_output.squeeze(), batch_y[i].squeeze())
            #print(loss)
            loss_per_batch.append(loss)
        #print(loss_per_batch)
        #loss_per = torch.tensor(loss_per, dtype=torch.float)
        total_loss_per_batch = sum(loss_per_batch)/ len(loss_per_batch)
        totalLossPerEpoch.append(total_loss_per_batch)
        #total_loss = torch.tensor(total_loss, requires_grad=True) 
        #print("total_loss_per_batch: ",total_loss_per_batch)
        #total_loss.backward()
        total_loss_per_batch.backward()
        optimizer.step()
    #print("totalLossPerEpoch: ",totalLossPerEpoch)
    total_loss_per_epoch = sum(totalLossPerEpoch)/len(totalLossPerEpoch)
    print("total_loss_per_epoch:",total_loss_per_epoch)
    return total_loss_per_epoch

In [None]:
num_epochs = 500
lossPerEpoch = []
for epoch in range(num_epochs):
    lossPerEpoch.append(train(model, device, data_loader, optimizer, criterion))
    # Update the learning rate at the end of each epoch
    scheduler.step()

total_loss_per_epoch: tensor(0.5586, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.5096, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.5009, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.4931, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.4886, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.4941, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.4899, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.4900, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.4860, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.4882, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.4828, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.4753, device='cuda:0', grad_fn=<DivBackward0>)
total_loss_per_epoch: tensor(0.4714, device='cuda:0', grad_fn=<D

In [157]:
lossPerEpoch = [tensor.cpu() for tensor in lossPerEpoch]

In [161]:
lossPerEpoch = [tensor.detach().numpy() for tensor in lossPerEpoch]