In [1]:
import logging
from datetime import datetime
import argparse
import glob
import os
from pathlib import Path

import numpy as np
import torch

import h5py

import tqdm
import yaml
from scipy.special import softmax
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

from src.data.h5data import H5Data
from src.models.InteractionNet import InteractionNetSingleTagger, InteractionNetTagger
from src.models.pretrain_vicreg import Projector, VICReg, get_backbones

In [2]:
!pwd

/ssl-jet-vol-v2/hbb_interaction_network/notebooks


In [3]:
project_dir = "/ssl-jet-vol-v2/hbb_interaction_network"
definitions = f"{project_dir}/src/data/definitions.yml"
with open(definitions) as yaml_file:
    defn = yaml.load(yaml_file, Loader=yaml.FullLoader)

N = defn["nobj_2"]  # number of charged particles
N_sv = defn["nobj_3"]  # number of SVs
n_targets = len(defn["reduced_labels"])  # number of classes
spectators = defn["spectators"]
params = defn["features_2"]
params_sv = defn["features_3"]
spectators = defn["spectators"]
labels = defn["labels"]
n_feature_sets = defn["n_feature_sets"]

In [4]:
logger = logging.getLogger(__name__)

device = "cuda"
batch_size = 1024

files_test = glob.glob(os.path.join("/ssl-jet-vol-v2/hbb_interaction_network/data/processed/max_pT_1500/train/", "newdata_*.h5"))

dataset = "train"

data_test = H5Data(
    batch_size=batch_size,
    cache=None,
    preloading=0,
    features_name=f"{dataset}ing_subgroup",
    labels_name="target_subgroup",
    spectators_name="spectator_subgroup",
)
data_test.set_file_names(files_test)
n_test = data_test.count_data()
print(f"test data: {n_test}")

min_pt = 300.0
max_pt = 2000.0
min_eta = -999.0
max_eta = 999.0
min_msd = 40.0
max_msd = 200

test data: 7796508


In [5]:
model = InteractionNetTagger(
                pf_dims=N,
                sv_dims=N_sv,
                num_classes=n_targets,
                pf_features_dims=len(params),
                sv_features_dims=len(params_sv),
                hidden=128,
                De=32,
                Do=64,
            ).to(device)
model.load_state_dict(torch.load("/ssl-jet-vol-v2/hbb_interaction_network/models/trained_models/gnn_min_pt_psv_best.pth"))
model.eval()
print(f"Parameters = {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

Parameters = 93786


In [6]:
eval_path = "max_pt_inf_h5_exp"
iterator = data_test.generate_data()
total_ = int(n_test / batch_size)
pbar = tqdm.tqdm(iterator, total=total_)
for j, element in enumerate(pbar):
    j += 1
    print(j)
    (sub_X, sub_Y, sub_Z) = element
    print(sub_X)
    training = sub_X[2]
    training_sv = sub_X[3]
    target = sub_Y[0]
    spectator = sub_Z[0]
    print(f"target: {target}")
    print(f"spectator: {spectator}")
    break


  0%|                                                                                                               | 0/7613 [01:25<?, ?it/s]

1
[array([[[10.        ],
        [ 2.        ],
        [ 2.1717873 ],
        ...,
        [ 0.73734796],
        [-0.39374384],
        [ 0.95334494]],

       [[10.        ],
        [ 2.        ],
        [ 2.1717873 ],
        ...,
        [ 0.73734796],
        [-0.39374384],
        [ 0.95334494]],

       [[10.        ],
        [ 2.        ],
        [ 2.1717873 ],
        ...,
        [ 0.73734796],
        [-0.39374384],
        [ 0.95334494]],

       ...,

       [[10.        ],
        [ 2.        ],
        [ 2.1717873 ],
        ...,
        [ 0.73734796],
        [-0.39374384],
        [ 0.95334494]],

       [[10.        ],
        [ 2.        ],
        [ 2.1717873 ],
        ...,
        [ 0.73734796],
        [-0.39374384],
        [ 0.95334494]],

       [[10.        ],
        [ 2.        ],
        [ 2.1717873 ],
        ...,
        [ 0.73734796],
        [-0.39374384],
        [ 0.95334494]]], dtype=float32), array([[[ 2.24791914e-01,  1.81695297e-01,  8.8779




In [10]:
spectator.shape

(1024, 1, 11)

In [11]:
target.shape

(1024, 2)

In [13]:
new_tar = np.concatenate((target, target), axis=0)
new_tar.shape

(2048, 2)

In [14]:
new_spec = np.concatenate((spectator, spectator), axis=0)
new_spec.shape

(2048, 1, 11)

In [7]:
# # mask away selection
#     fj_pt = spectator[:, 0, 0]
#     fj_eta = spectator[:, 0, 1]
#     fj_sdmass = spectator[:, 0, 2]
#     if args.no_undef:
#         no_undef = np.sum(target, axis=1) == 1
#     else:
#         no_undef = fj_pt > -999  # no cut
#     mask = (
#         (fj_sdmass > min_msd)
#         & (fj_sdmass < max_msd)
#         & (fj_eta > min_eta)
#         & (fj_eta < max_eta)
#         & (fj_pt > min_pt)
#         & (fj_pt < max_pt)
#         & no_undef
#     )
#     training = training[mask]
#     training_sv = training_sv[mask]
#     target = target[mask]
#     spectator = spectator[mask]

#     trainingv = torch.tensor(training, dtype=torch.float, device=device)
#     trainingv_sv = torch.tensor(training_sv, dtype=torch.float, device=device)

#     if args.load_vicreg_path:
#         representation, representation_sv = vicreg(trainingv, trainingv_sv)
#         out_test = model(torch.cat((representation, representation_sv), dim=-1))
#     else:
#         if args.just_svs:
#             out_test = model(trainingv_sv)
#         elif args.just_tracks:
#             out_test = model(trainingv)
#         else:
#             out_test = model(trainingv, trainingv_sv)
#     out_test = out_test.cpu().data.numpy()
#     out_test = softmax(out_test, axis=1)
#     if args.argmax:
#         out_test = np.argmax(out_test, axis=1)

#     if j == 1:
#         # initialize the arrays
#         prediction = out_test
#         target_test = target
#         feature_arrays = sub_X
#         target_array = prediction
#         spec_array = spectator
#     else:
#         prediction = np.concatenate((prediction, out_test), axis=0)
#         target_test = np.concatenate((target_test, target))

#     if args.save_h5:
#         if j % 500 == 0 or j == total_:
#             # save the model
#             # save the feature_arrays, target_array, and spec_array to h5 file
#             model_pred_loc = f"{args.outdir}/model_predictions/" + eval_path
#             os.makedirs(model_pred_loc, exist_ok=True)
#             model_name = Path(args.load_path).stem
#             real_batch_size = len(target)
#             feature_arrays = sub_X
#             target_array = out_test
#             spec_array = spectator
#             with h5py.File(f"{model_pred_loc}/newdata_{j}.h5", "w") as h5:
#                 logger.info(f"creating {h5.filename} h5 file with {real_batch_size} events")
#                 feature_data = h5.create_group(f"{dataset}ing_subgroup")
#                 target_data = h5.create_group("target_subgroup")
#                 spec_data = h5.create_group("spectator_subgroup")
#                 for i in range(n_feature_sets):
#                     feature_data.create_dataset(
#                         f"{dataset}ing_{i}",
#                         data=feature_arrays[i].astype("float32"),
#                     )
#                     np.save(
#                         f"{model_pred_loc}/{dataset}_{j}_features_{i}.npy",
#                         feature_arrays[i].astype("float32"),
#                     )  # save the features
#                 target_data.create_dataset("target", data=target_array.astype("float32"))
#                 np.save(
#                     f"{model_pred_loc}/{dataset}_{i}_truth.npy",
#                     target_array.astype("float32"),
#                 )  # saving the labels
#                 spec_data.create_dataset("spectators", data=spec_array.astype("float32"))
#                 np.save(
#                     f"{model_pred_loc}/{dataset}_{i}_spectators.npy",
#                     spec_array.astype("float32"),
#                 )  # saving the spectators
#                 print(f"saved {h5.filename} h5 file with {real_batch_size} events")
#                 h5.close()  # close the h5 file
#             # re-initialize the arrays
#             feature_arrays = sub_X
#             target_array = prediction
#             spec_array = spectator
#         else:
#             # Don't save the model, just add to the arrays.
#             pass