In [1]:
from libs.GNNjetTagger import GNNjetTagger

2024-04-04 10:26:00.227236: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from jetgraphs.transforms import BuildEdges
from jetgraphs.transforms import GraphFilter
from torch_geometric.transforms import Compose
from jetgraphs.transforms import NumLayers, LayersNum


edge_builder = BuildEdges(
    directed=False, 
    self_loop_weight=0,
    same_layer_threshold=0.6, 
    consecutive_layer_threshold=0.6,
    distance_p=2)

graph_filter = GraphFilter(min_num_nodes=2) # only graphs with at least 3 nodes will be accepted

optional_transforms = Compose([NumLayers(), LayersNum()])

In [3]:
from jetgraphs.JetGraphDataset import JetGraphDatasetInMemory_v2

#datasets_root = "content/jetgraphdataset"
datasets_root = "../data/GNN"
raw_data_url = "https://cernbox.cern.ch/s/PYurUUzcNdXEGpz/download"

jet_graph_dataset = JetGraphDatasetInMemory_v2(
    root = datasets_root,           # directory where to download data 
    url = raw_data_url,             # url to raw data
    subset = '10%',                # which subset of the intial 100k graph to consider, default is 100%
    pre_filter = graph_filter,      
    pre_transform = edge_builder,   # edge_builder should be passed as pre_transform to keep data on disk.
    post_filter = graph_filter,
    transform = optional_transforms,
    verbose=True
    )

Loaded dataset containing subset of 10%


In [4]:
from torch_geometric.loader import DataLoader
from sklearn.model_selection import train_test_split

train_val_idx, test_idx = train_test_split(range(len(jet_graph_dataset)), stratify=[m.y[0].item() for m in jet_graph_dataset], test_size=0.2, random_state=77)
train_val_dataset = jet_graph_dataset[train_val_idx]

train_idx, val_idx = train_test_split(range(len(train_val_dataset)), stratify=[m.y[0].item() for m in train_val_dataset], test_size=0.4, random_state=77)
train_loader = DataLoader(jet_graph_dataset[train_idx], batch_size=512, num_workers= 96, shuffle=True)
val_loader = DataLoader(jet_graph_dataset[val_idx], batch_size=512, num_workers= 96)


train_dataset = train_val_dataset[train_idx]
val_dataset = train_val_dataset[val_idx]
test_dataset = jet_graph_dataset[test_idx]

jet_graph_dataset.stats()
train_val_dataset.stats()




*** JetGraph Dataset ***

Number of classes: 2
Number of graphs: 19931
Dataset is directed: False
Number of node features: 4
Number of edge features: 1
Number of positive samples:9941.00

*** JetGraph Dataset ***

Number of classes: 2
Number of graphs: 15944
Dataset is directed: False
Number of node features: 4
Number of edge features: 1
Number of positive samples:7952.00


In [5]:
model = GNNjetTagger(model_name="GNNjetTagger_0")
model.create_model(hidden_channels=256, node_features_size=jet_graph_dataset[0].x.shape[1])

/cvmfs/sft.cern.ch/lcg/views/LCG_105/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/pytorch_lightning/utilities/parsing.py:198: Attribute 'loss_func' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss_func'])`.


In [6]:
model.train_model(train_loader, val_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type              | Params
--------------------------------------------
0 | loss  | BCEWithLogitsLoss | 0     
1 | norm  | BatchNorm         | 8     
2 | conv1 | ARMAConv          | 203 K 
3 | conv2 | ARMAConv          | 590 K 
4 | conv3 | ARMAConv          | 590 K 
5 | lin0  | Linear            | 65.8 K
6 | lin   | Linear            | 257   
--------------------------------------------
1.5 M     Trainable params
0         Non-trainable params
1.5 M     Total params
5.803     Total estimated model params size (MB)


Sanity Checking: |                                                          | 0/? [00:00<?, ?it/s]

Training: |                                                                 | 0/? [00:00<?, ?it/s]

Validation: |                                                               | 0/? [00:00<?, ?it/s]

Metric val_loss improved. New best score: 0.539


Validation: |                                                               | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.044 >= min_delta = 0.0. New best score: 0.495


Validation: |                                                               | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.061 >= min_delta = 0.0. New best score: 0.434


Validation: |                                                               | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.050 >= min_delta = 0.0. New best score: 0.384


Validation: |                                                               | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.012 >= min_delta = 0.0. New best score: 0.373


Validation: |                                                               | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.020 >= min_delta = 0.0. New best score: 0.353


Validation: |                                                               | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.003 >= min_delta = 0.0. New best score: 0.350


Validation: |                                                               | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.007 >= min_delta = 0.0. New best score: 0.343


Validation: |                                                               | 0/? [00:00<?, ?it/s]

Validation: |                                                               | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.008 >= min_delta = 0.0. New best score: 0.335
`Trainer.fit` stopped: `max_epochs=10` reached.


In [None]:
import numpy as np
import sklearn.metrics
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from mpl_toolkits.mplot3d import Axes3D
import warnings
import jetgraphs
from jetgraphs.utils import plot_jet_graph, plot_metrics
from jetgraphs.models import Arma
from torch_geometric.loader import DataLoader



# Predict.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    pretrained_filename = f'./checkpoints/gcn-epoch=19.ckpt'
    model = Arma.load_from_checkpoint(pretrained_filename)
    full_loader = DataLoader(jet_graph_dataset, batch_size=512, num_workers=96)

    weights = trainer.predict(model, full_loader)
    truths = []

for data in jet_graph_dataset:
    truths.append((data['y'].unsqueeze(1).float()).tolist())

weights = np.array(sum(weights, []))
truths = np.array(sum(truths, []))



In [7]:
# 1. GNNgraphMaker (similar to CNNimageMaker)
# 2. dataset converter (similar to CNNdatasetConverter)
# 3. graph conversion while loading data from root files
# 4. measure_performance into superclass (incl. plots from GNN notebook)