In [1]:
import os, sys, pathlib, importlib
sys.path.append('../')

# Load the package and modules for training and plotting
import nsbi_common_utils
from nsbi_common_utils import plotting, training
from nsbi_common_utils.training import TrainEvaluate_NN, TrainEvaluatePreselNN


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Nadam
import mplhep as hep
import matplotlib.pyplot as plt
import pickle
import yaml
import random

from utils import preselection_using_score, calculate_preselection_observable

from coffea.analysis_tools import PackedSelection

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

hep.style.use(hep.style.ATLAS)

  hep.set_style("ATLAS")
2025-08-08 19:09:29.026914: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-08 19:09:29.876201: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754680170.158881  362940 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754680170.254788  362940 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-08 19:09:31.023289: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized t

In [7]:
# Parse the YAML configuration that the rest of the notebook reads its settings from
with open("config.yml", "r") as config_file:
    config = yaml.safe_load(config_file)

In [8]:
# User-configured locations
PATH_PREFIX = config["PATH_PREFIX"]
PATH_TO_SAVED_DATA = config["PATH_SAVED_DATA"]

# Directory of the preselection model, built by plain string concatenation
# (assumes PATH_TO_SAVED_DATA already ends with a path separator — TODO confirm)
PATH_PRESEL_MODEL = PATH_TO_SAVED_DATA + 'preselection_model/'

# Keep the derived path in the in-memory config as well
config["PATH_PRESEL_MODEL"] = PATH_PRESEL_MODEL

In [9]:
# Dictionary relating each process to the label value stored in `train_labels`
labels_dict = config["PROCESS_TO_INT_LABELS_DICT"]

# Processes treated as signal and as background in the model
signal_processes = config["SIGNAL_PROCESSES"]
background_processes = config["BACKGROUND_PROCESSES"]

# One list per line: signal first, then background
print(signal_processes, background_processes, sep="\n")

['htautau']
['ttbar', 'ztautau']


In [19]:
# Training inputs, and the subset of them that is standardized before training
features = config["TRAINING_FEATURES"]
features_scaling = config["TRAINING_FEATURES_TO_SCALE"]

# Set to True to load the already trained and saved preselection NN instead of training it
USE_SAVED_MODEL_PRESEL = False

# Set to True to load the saved numpy array of preselection NN predictions instead of evaluating the NN
USE_SAVED_PRESEL_PREDICTIONS = False


In [20]:
# Nominal dataset saved by the pre-processing notebook; opened read-only
path_to_nominal_dataframe = config["PATH_TO_nominal_DATA"]
dataset_incl_nominal = pd.read_hdf(
    path_to_nominal_dataframe,
    key="dataset",
    mode="r",
)

In [21]:
# MC/data weights and the training labels identifying the processes, as numpy arrays
weights, train_labels = (
    dataset_incl_nominal[column].to_numpy() for column in ("weights", "train_labels")
)

In [22]:
# Rescale the weights so that every process sums to one: the training then
# discriminates on shapes only, not on the relative yields
weights_normed = weights.copy()

for process_label in labels_dict.values():
    # Events belonging to this process
    process_mask = train_labels == process_label
    weights_normed[process_mask] /= weights[process_mask].sum()

dataset_incl_nominal['weights_normed'] = weights_normed


Training the Preselection NN
===

**Choice of reference sample**

The density ratios need to be trained on phase space regions with support for the reference hypothesis $p_{ref}(x) > 0$.

To ensure this, we make a selection that keeps events in the phase space regions with $p_{ref}(x) > 0$, i.e. that removes regions where $p_c(x) \gg p_{ref}(x)$, and only perform the NSBI fit in this selected analysis region. **A natural choice for the reference hypothesis is then the signal-rich hypotheses**. This is referred to in the ATLAS publications as the Search-Oriented Mixture Models approach: 

$$p_{ref}(x) = \frac{1}{\sum_S \nu_S} \sum_S \frac{d\sigma_S}{dx} = \frac{1}{\nu_{H \to \tau\tau}} \frac{d\sigma_{H \to \tau\tau}}{dx}$$

where the sum runs over all signal hypotheses in the model and the second equality is due to the sole signal hypothesis in our toy model, $H \to \tau\tau$. 

In [23]:
# Show the signal processes of the user-provided model (the SIGNAL_PROCESSES entry of the config)
print(signal_processes)

['htautau']


In [24]:
# The reference hypothesis is built from the processes listed under REFERENCE_PROCESSES in the config
# NOTE(review): the list is read from the config as-is and is not derived from `signal_processes` —
# confirm that the configured reference is the intended one
ref_processes = config["REFERENCE_PROCESSES"]
print(ref_processes)

['htautau', 'ttbar']


**Selecting out regions with $p_{ref}\sim 0$**

A multi-class classification NN, with softmax output, is trained to output a score:

$$ \text{NN}_\text{presel} = \log \left[\frac{\sum_S P_S (x)}{\sum_B P_B(x)} \right]$$

where $P_c$ are the probability scores output by the softmax layer of the trained NN.



In [25]:
# Reload the training module, then rebind the class from the reloaded module
# so that the object below is built from the current code
importlib.reload(sys.modules['nsbi_common_utils.training'])
from nsbi_common_utils.training import TrainEvaluatePreselNN

# One class per entry of the label dictionary
num_classes = len(labels_dict)

preselectionTraining = TrainEvaluatePreselNN(
    dataset_incl_nominal,
    num_classes,
    features,
    features_scaling,
)

In [27]:
# NOTE(review): `pre_factor_preselection_score` is not assigned in any cell visible in this
# notebook — confirm it is defined before this cell when running on a fresh kernel
if USE_SAVED_PRESEL_PREDICTIONS:

    # Reuse the predictions stored by an earlier run and rebuild the score from them
    pred_NN_incl = np.load(f"{PATH_TO_SAVED_DATA}pred_NN_incl.npy")
    presel_score = calculate_preselection_observable(
        pred_NN_incl,
        labels_dict,
        signal_processes,
        background_processes,
        pre_factor_dict=pre_factor_preselection_score,
    )

else:
    if USE_SAVED_MODEL_PRESEL:
        # A trained model is already on disk: load it
        preselectionTraining.get_trained_model(PATH_PRESEL_MODEL)

    else:
        # Train from scratch and save the model under PATH_PRESEL_MODEL
        preselectionTraining.train(
            test_size=0.2,
            random_state=42,
            path_to_save=PATH_PRESEL_MODEL,
            batch_size=4096,
            epochs=50,
            learning_rate=0.1,
        )

    # Get predictions (softmax outputs)
    pred_NN_incl = preselectionTraining.predict(dataset_incl_nominal)

    presel_score = calculate_preselection_observable(
        pred_NN_incl,
        labels_dict,
        signal_processes,
        background_processes,
        pre_factor_dict=pre_factor_preselection_score,
    )

    # Store both the score and the raw predictions for later runs
    np.save(f"{PATH_TO_SAVED_DATA}presel_score.npy", presel_score)
    np.save(f"{PATH_TO_SAVED_DATA}pred_NN_incl.npy", pred_NN_incl)

I0000 00:00:1754682964.177106  362940 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13377 MB memory:  -> device: 0, name: NVIDIA A16, pci bus id: 0000:08:00.0, compute capability: 8.6
W0000 00:00:1754682964.967370  373648 gpu_backend_lib.cc:579] Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice. This may result in compilation or runtime failures, if the program we try to run uses routines from libdevice.
Searched for CUDA in the following directories:
  ./cuda_sdk_lib
  ipykernel_launcher.runfiles/cuda_nvcc
  ipykern/cuda_nvcc
  
  /usr/local/cuda
  /home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/tensorflow/python/platform/../../../nvidia/cuda_nvcc
  /home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/tensorflow/python/platform/../../../../nvidia/cuda_nvcc
  /home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/tensorflow/python/platform/../../cuda
  .
You can cho

Epoch 1/50


I0000 00:00:1754682966.433520  373525 service.cc:148] XLA service 0x738c080078c0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1754682966.436800  373525 service.cc:156]   StreamExecutor device (0): NVIDIA A16, Compute Capability 8.6
2025-08-08 19:56:06.541030: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1754682966.680128  373525 cuda_dnn.cc:529] Loaded cuDNN version 91002
2025-08-08 19:56:06.782028: W tensorflow/core/framework/op_kernel.cc:1841] OP_REQUIRES failed at xla_ops.cc:577 : NOT_FOUND: Couldn't find a suitable version of ptxas. The following locations were considered: ./cuda_sdk_lib/bin/ptxas, /modules/opt/linux-ubuntu24.04-x86_64/jupyterlab/unity-jupyterlab4.4.3/bin/ptxas, /modules/apps/matlab/r2025a/bin/ptxas, /modules/opt/linux-ubuntu24.04-x86_64/miniforge3/24.7.1/condabin/ptxas, /modules/user-reso

NotFoundError: Graph execution error:

Detected at node StatefulPartitionedCall defined at (most recent call last):
  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/runpy.py", line 198, in _run_module_as_main

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/runpy.py", line 88, in _run_code

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/asyncio/base_events.py", line 608, in run_forever

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/asyncio/base_events.py", line 1936, in _run_once

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/asyncio/events.py", line 84, in _run

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 519, in dispatch_queue

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 508, in process_one

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 400, in dispatch_shell

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 368, in execute_request

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 767, in execute_request

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 455, in do_execute

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/ipykernel/zmqshell.py", line 577, in run_cell

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3077, in run_cell

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3132, in _run_cell

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/IPython/core/async_helpers.py", line 128, in _pseudo_sync_runner

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3336, in run_cell_async

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3519, in run_ast_nodes

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3579, in run_code

  File "/tmp/ipykernel_362940/3396085713.py", line 8, in <module>

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/nsbi_common_utils/training.py", line 107, in train

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/keras/src/backend/tensorflow/trainer.py", line 377, in fit

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/keras/src/backend/tensorflow/trainer.py", line 220, in function

  File "/home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/keras/src/backend/tensorflow/trainer.py", line 133, in multi_step_on_iterator

Couldn't find a suitable version of ptxas. The following locations were considered: ./cuda_sdk_lib/bin/ptxas, /modules/opt/linux-ubuntu24.04-x86_64/jupyterlab/unity-jupyterlab4.4.3/bin/ptxas, /modules/apps/matlab/r2025a/bin/ptxas, /modules/opt/linux-ubuntu24.04-x86_64/miniforge3/24.7.1/condabin/ptxas, /modules/user-resources/bin/ptxas, /usr/local/sbin/ptxas, /usr/local/bin/ptxas, /usr/sbin/ptxas, /usr/bin/ptxas, /sbin/ptxas, /bin/ptxas, /usr/games/ptxas, /usr/local/games/ptxas, /snap/bin/ptxas, ipykernel_launcher.runfiles/cuda_nvcc/bin/ptxas, ipykern/cuda_nvcc/bin/ptxas, bin/ptxas, /usr/local/cuda/bin/ptxas, /home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/tensorflow/python/platform/../../../nvidia/cuda_nvcc/bin/ptxas, /home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/tensorflow/python/platform/../../../../nvidia/cuda_nvcc/bin/ptxas, /home/jsandesara_umass_edu/.conda/envs/NSBI_env/lib/python3.11/site-packages/tensorflow/python/platform/../../cuda/bin/ptxas
	 [[{{node StatefulPartitionedCall}}]] [Op:__inference_multi_step_on_iterator_1883]

In [None]:
# Common binning: 50 equally spaced edges spanning the full range of the score
min_pred, max_pred = np.amin(presel_score), np.amax(presel_score)
bins = np.linspace(min_pred, max_pred, num=50)

# Per-process weighted histograms and, for the uncertainties,
# the per-bin sum of squared weights
hist_NN_output = {}
hist_NN_output_errs = {}

for key, process_label in labels_dict.items():
    process_mask = train_labels == process_label
    process_scores = presel_score[process_mask]
    process_weights = weights[process_mask]

    hist_NN_output[key], _ = np.histogram(process_scores,
                                          weights=process_weights, bins=bins)

    hist_NN_output_errs[key], _ = np.histogram(process_scores,
                                               weights=process_weights**2, bins=bins)


# Overlay the normalised score distributions of all processes
for key in labels_dict:
    hep.histplot(
        hist_NN_output[key],
        bins=bins,
        alpha=0.6,
        label=key,
        density=True,
        linewidth=2.0,
        yerr=np.sqrt(hist_NN_output_errs[key]),
    )

plt.xlabel("Preselection Score", size=18)
plt.ylabel("Density", size=18)
plt.legend()
plt.yscale('log')
plt.show()

Making the cut
===

Make a selection cut for regions with $p_{ref} \gg 0$ for performing the NSBI analysis. The remaining events - which by definition are background-dominated - can be used as a **Control Region** for data-driven background estimation, pre-unblinding validations, etc. 

Moreover, the preselections act as a tuning knob for the tradeoff between selecting as many signal events as possible to go into the **Signal Region** (increasing sensitivity) and the feasibility of training accurate and precise NNs over a large phase space (which needs bigger models and more statistics). **The preselections can also weed out phase space regions with low background statistics to avoid poorly modelled regions.** 

Here's a first cut that you can optimize as much as you like to get the desired final results. 

In [None]:
# Tunable cut values: decrease them if the estimators are unbiased but more sensitivity is
# needed, increase them if the model is biased and the complexity has to be reduced
preselection_cuts = {'upper': 4.5, 'lower': -1.}

# np.save stores the dict as a pickled 0-d object array, so reading the file back
# requires np.load(..., allow_pickle=True)
np.save(f"{PATH_TO_SAVED_DATA}preselection_cuts.npy", preselection_cuts)

# Normalised score distributions of all processes
for key in labels_dict:
    hep.histplot(
        hist_NN_output[key],
        bins=bins,
        alpha=0.6,
        label=key,
        density=True,
        linewidth=2.0,
        yerr=np.sqrt(hist_NN_output_errs[key]),
    )

# Mark each cut with a dashed vertical line; a value of -999 is skipped
for key, cut_value in preselection_cuts.items():
    if cut_value == -999:
        continue
    plt.axvline(cut_value, ymax=0.9, linestyle='--', label=f'preselection cut {key} = {cut_value}')

plt.xlabel("Preselection Score", size=18)
plt.ylabel("Density", size=18)
plt.legend()
plt.yscale('log')
plt.show()

In [None]:
# Unnormalised version of the score distributions (density=False): each bin holds
# the sum of event weights, with sqrt(sum of squared weights) as its uncertainty
for key in labels_dict:  
    hep.histplot(hist_NN_output[key], bins = bins, 
             alpha = 0.6, label = key, 
             density = False, linewidth = 2.0, 
                 yerr = np.sqrt(hist_NN_output_errs[key]))

plt.xlabel("Preselection Score", size=18)

# Mark each cut with a dashed vertical line; a value of -999 is skipped
for key in preselection_cuts:
    if preselection_cuts[key] != -999:
        plt.axvline(preselection_cuts[key], ymax=0.9, linestyle='--', label=f'preselection cut {key} = {preselection_cuts[key]}')

# The histograms are not normalised here, so the axis shows weighted yields, not a density
plt.ylabel("Events", size=18)
plt.legend()
plt.yscale('log')
plt.show()

Signal and Control Regions
===

The high signal over background phase space towards the right of the preselection cut shown above will be categorized as the **Signal Region** where the NSBI analysis is performed.

The low signal phase space towards the left will be used as a **Control Region**, with typical uses such as background estimation, pre-unblinding data-MC checks, etc. In this phase space, we will use a binned summary observable like in any traditional analysis.

# Reload the local utils module, then rebind the helper from the reloaded module
importlib.reload(sys.modules['utils'])
from utils import preselection_using_score


# Attach the score to the dataset as the 'presel_score' column named by the channel definitions
dataset_incl_nominal['presel_score'] = presel_score

# Channel definitions in terms of the preselection score.
# NOTE(review): -999 presumably marks an open (unbounded) side — confirm in preselection_using_score
channel_selections = {}

# Control Region: below the lower cut, binned in the score
channel_selections['CR'] = {
    'observable': 'presel_score',
    'lower_presel': -999,
    'upper_presel': preselection_cuts.get('lower'),
    'num_bins': 4,
}

# Binned Signal Region: above the upper cut, a single bin
channel_selections['SR_binned'] = {
    'observable': 'presel_score',
    'lower_presel': preselection_cuts.get('upper'),
    'upper_presel': -999,
    'num_bins': 1,
}

# Signal Region: between the two cuts, no binned observable
channel_selections['SR'] = {
    'observable': None,
    'upper_presel': preselection_cuts.get('upper'),
    'lower_presel': preselection_cuts.get('lower'),
}

dataset_channels = preselection_using_score(dataset_incl_nominal, channel_selections)

# Drop the name of the inclusive dataset; re-running this cell afterwards requires
# re-running the cells that load the dataset and compute the score first
del dataset_incl_nominal