In [3]:
import sys
import os

In [4]:
sys.path.insert(0, "..")

In [5]:
#imports
import torch
import torch.nn as nn
import torch_geometric
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
from tqdm.notebook import tqdm
import numpy as np
local = True

In [6]:
import tensorflow.keras as keras
import numpy as np
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import uproot
from config.utils import *
import tensorflow
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, BatchNormalization, GlobalAveragePooling1D
import tensorflow.keras.backend as K
from tqdm.notebook import tqdm
import XRootD

In [7]:
import yaml

with open('../config/definitions.yml') as file:
    # The FullLoader parameter handles the conversion from YAML
    # scalar values to Python the dictionary format
    definitions = yaml.load(file, Loader=yaml.FullLoader)
    
features = definitions['features']
spectators = definitions['spectators']
labels = definitions['labels']

nfeatures = definitions['nfeatures']
nspectators = definitions['nspectators']
nlabels = definitions['nlabels']
ntracks = definitions['ntracks']

## Graph Neural Network (with GENConv)

In [8]:
#adding config path for importing GraphDataset class
sys.path.insert(0, "../config")

In [9]:
# Load Dataset
from GraphDataset import GraphDataset
if local:
    file_names = ['~/teams/DSC180A_FA21_A00/a11/train/ntuple_merged_10.root']
    file_names_test = ['~/teams/DSC180A_FA21_A00/a11/test/ntuple_merged_0.root']
else:
    file_names = ['root://eospublic.cern.ch//eos/opendata/cms/datascience/HiggsToBBNtupleProducerTool/HiggsToBBNTuple_HiggsToBB_QCD_RunII_13TeV_MC/train/ntuple_merged_10.root']
    file_names_test = ['root://eospublic.cern.ch//eos/opendata/cms/datascience/HiggsToBBNtupleProducerTool/HiggsToBBNTuple_HiggsToBB_QCD_RunII_13TeV_MC/test/ntuple_merged_0.root']

graph_dataset = GraphDataset('gdata_train', features, labels, spectators, n_events=1000, n_events_merge=1, 
                             file_names=file_names)

test_dataset = GraphDataset('gdata_test', features, labels, spectators, n_events=2000, n_events_merge=1, 
                             file_names=file_names_test)

## Generators for Training, Testing and Validation

In [10]:
#create train/validation/test sets

import uproot
trainroot = uproot.open(file_names[0])
traintree = trainroot['deepntuplizer/tree']
trainlabels = traintree.arrays(['label_QCD_b', 
                      'label_QCD_bb', 
                      'label_QCD_c', 
                      'label_QCD_cc', 
                      'label_QCD_others', 
                      'label_H_bb', 
                      'sample_isQCD'], 
                     entry_stop=20000,
                     library='np')
testroot = uproot.open(file_names_test[0])
testtree = testroot['deepntuplizer/tree']
testlabels = testtree.arrays(['label_QCD_b', 
                      'label_QCD_bb', 
                      'label_QCD_c', 
                      'label_QCD_cc', 
                      'label_QCD_others', 
                      'label_H_bb', 
                      'sample_isQCD'], 
                     entry_stop=20000,
                     library='np')

In [11]:
y_train = trainlabels['label_H_bb']
y_test = testlabels['label_H_bb']

In [14]:
from torch_geometric.loader import RandomNodeSampler

In [15]:
from torch_geometric.data import Data, Batch
from torch_geometric.loader import DataListLoader
from torch.utils.data import random_split

def collate(items):
    l = sum(items, [])
    return Batch.from_data_list(l)

torch.manual_seed(0)
valid_frac = 0.20
full_length = len(graph_dataset)
valid_num = int(valid_frac*full_length)
batch_size = 32

train_dataset, valid_dataset = random_split(graph_dataset, [full_length-valid_num,valid_num])

train_loader = DataListLoader(train_dataset, batch_size=batch_size, pin_memory=True, shuffle=True)
train_loader.collate_fn = collate
valid_loader = DataListLoader(valid_dataset, batch_size=batch_size, pin_memory=True, shuffle=False)
valid_loader.collate_fn = collate
test_loader = DataListLoader(test_dataset, batch_size=batch_size, pin_memory=True, shuffle=False)
test_loader.collate_fn = collate


train_samples = len(train_dataset)
valid_samples = len(valid_dataset)
test_samples = len(test_dataset)
print(full_length)
print(train_samples)
print(valid_samples)
print(test_samples)

920
736
184
1889


In [16]:
inputs = 48
hidden = 128
outputs = 2
#Normalization → ReLU → GraphConv → Addition
#class GNN(nn.Module):
#    def __init__(self, input_dim = inputs, hidden_dim = hidden, a):

In [17]:
from torch_geometric.nn import GENConv

In [18]:
class GENConv_Classifier(nn.Module):

    def __init__(self, width = hidden, n_inputs = inputs):
        super(GENConv_Classifier, self).__init__()
        self.width = width
        self.act = nn.ReLU

        # Initial linear layers
        self.nn1 = nn.Sequential(
            nn.BatchNorm1d(n_inputs),
            self.act(),
            nn.Linear(n_inputs, width),
            self.act(),
            nn.Linear(width, width),
            self.act(),
            nn.Linear(width, width)                   
        )
        # Generalized Convolutional layer
        self.conv = GENConv(width, width, num_layers=2, t=1, learn_t=True)

        # Pre-final linear layers
        self.nn2 = nn.Sequential(
            nn.Linear(width, width),
            self.act(),
            nn.Linear(width, width),
            self.act(),
            nn.Linear(width, width),
            self.act(),
            nn.Linear(width, width),
        )

        # output layer
        self.output = nn.Linear(width, outputs)

    def forward(self, X, edge_index, edge_attr):
        #Normalization → ReLU → GraphConv → Addition
        x0 = X
        # input layer
        x1 = self.nn1(x0)
        #GENConv
        x2 = self.conv(x1, edge_index, edge_attr)
        # hidden layers
        x3 = self.nn2(x2)

        # output layer
        x = torch.sigmoid(self.output(x3))

        return x

In [19]:
model = GENConv_Classifier().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.BCEWithLogitsLoss()

## Training/Test Loop as Functions

In [20]:
@torch.no_grad()

def train(epoch):
    model.train()

    pbar = tqdm(total=len(train_loader))
    pbar.set_description(f'Training epoch: {epoch:04d}')

    total_loss = total_examples = 0
    for data in train_loader:
        optimizer.zero_grad()
        data = data.to(device)
        out = model(data.x, data.edge_index, data.edge_attr)
        loss = criterion(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()

        total_loss += float(loss) * int(data.train_mask.sum())
        total_examples += int(data.train_mask.sum())

        pbar.update(1)

    pbar.close()

    return total_loss / total_examples

def test():
    model.eval()

    y_true = {'train': [], 'valid': [], 'test': []}
    y_pred = {'train': [], 'valid': [], 'test': []}

    pbar = tqdm(total=len(test_loader))
    pbar.set_description(f'Evaluating epoch: {epoch:04d}')

    for data in test_loader:
        data = data.to(device)
        out = model(data.x, data.edge_index, data.edge_attr)

        for split in y_true.keys():
            mask = data[f'{split}_mask']
            y_true[split].append(data.y[mask].cpu())
            y_pred[split].append(out[mask].cpu())

        pbar.update(1)

    pbar.close()

    train_rocauc = evaluator.eval({
        'y_true': torch.cat(y_true['train'], dim=0),
        'y_pred': torch.cat(y_pred['train'], dim=0),
    })['rocauc']

    valid_rocauc = evaluator.eval({
        'y_true': torch.cat(y_true['valid'], dim=0),
        'y_pred': torch.cat(y_pred['valid'], dim=0),
    })['rocauc']

    test_rocauc = evaluator.eval({
        'y_true': torch.cat(y_true['test'], dim=0),
        'y_pred': torch.cat(y_pred['test'], dim=0),
    })['rocauc']

    return train_rocauc, valid_rocauc, test_rocauc


In [21]:
for epoch in range(1, 1001):
    loss = train(epoch)
    train_rocauc, valid_rocauc, test_rocauc = test()
    print(f'Loss: {loss:.4f}, Train: {train_rocauc:.4f}, '
          f'Val: {valid_rocauc:.4f}, Test: {test_rocauc:.4f}')

    

  0%|          | 0/23 [00:00<?, ?it/s]

AttributeError: 'GlobalStorage' object has no attribute 'train_mask'