In [1]:
import sys
import importlib
import pandas as pd
# Make the local nsbi_common_utils package (in the repository's src/ directory) importable
sys.path.append('/workspaces/NSBI-workflow-tutorial/src/')
import nsbi_common_utils.configuration
import nsbi_common_utils.datasets


# Parse the YAML configuration; it carries metadata such as the training features.
config = nsbi_common_utils.configuration.ConfigManager(
    file_path_string='./config.yml',
)

# The config returns the input features used for training together with a
# second value that is handed to the trainer as `features_scaling`.
features, features_scaling = config.get_training_features()

# Read the training features plus the 'presel_score' branch.
branches_to_load = features + ['presel_score']

# Dataset helper built from the same config file, limited to the branches above.
Datasets = nsbi_common_utils.datasets.datasets(
    config_path='./config.yml',
    branches_to_load=branches_to_load,
)

# Load the datasets described in the config with load_systematics switched off.
# NOTE: the input files referenced by the config must already exist on disk;
# the recorded run of this cell stopped here with a FileNotFoundError.
dataset_incl_dict = Datasets.load_datasets_from_config(load_systematics=False)

# Take a copy of the "Nominal" entry and work on that.
dataset_incl_nominal = dataset_incl_dict["Nominal"].copy()

# Keep only the part of the nominal dataset that belongs to the "SR" region.
dataset_SR_nominal = Datasets.filter_region_dataset(
    dataset_incl_nominal,
    region="SR",
)

# Output locations: everything written by the nominal training lives below
# TRAINING_OUTPUT_PATH, which itself sits inside PATH_TO_SAVED_DATA.
PATH_TO_SAVED_DATA = './saved_datasets/'
TRAINING_OUTPUT_PATH = PATH_TO_SAVED_DATA + 'output_training_nominal/'

# Processes that make up the mixture model formula for density ratio estimation.
# Basis samples, as listed in the config:
basis_processes = config.get_basis_samples()
print(basis_processes)

# Reference samples, as listed in the config:
ref_processes = config.get_reference_samples()
print(ref_processes)


##### Training the density ratio estimator model #####
# Import the training module explicitly before reloading it: importlib.reload()
# only works on a module that has already been imported, and nothing earlier in
# this cell imports nsbi_common_utils.training. Looking it up in sys.modules
# could therefore raise KeyError in a fresh session. The reload is kept so that
# edits to the local package are picked up when the cell is re-run.
import nsbi_common_utils.training
importlib.reload(nsbi_common_utils.training)
from nsbi_common_utils.training import TrainEvaluate_NN

# Every reference process gets training label 0 (the denominator of the ratio).
ref_train_label_sample_dict = {ref: 0 for ref in ref_processes}

# Merge all reference processes once; the result is reused for every basis process.
dataset_ref = Datasets.merge_dataframe_dict_for_training(
    dataset_SR_nominal,
    ref_train_label_sample_dict,
    samples_to_merge=ref_processes,
)

# One TrainEvaluate_NN instance per basis process, keyed by process name.
NN_training_mix_model = {}

# Forwarded unchanged to TrainEvaluate_NN as `use_log_loss`.
use_log_loss = False

# Forwarded to TrainEvaluate_NN as `delete_existing_models`. Set to True to
# (presumably) discard previously saved models -- confirm the exact behaviour
# in nsbi_common_utils.training before flipping it.
DELETE_EXISTING_MODELS = False

# Per-process output directories, keyed by process name.
path_to_ratios = {}
path_to_figures = {}
path_to_models = {}

for process_type in basis_processes:

    # The basis process under study gets training label 1 (the numerator).
    _train_label_sample_dict = {process_type: 1}

    dataset_num = Datasets.merge_dataframe_dict_for_training(
        dataset_SR_nominal,
        _train_label_sample_dict,
        samples_to_merge=[process_type],
    )

    # Training dataset for the p_<process_type>/p_<ref_processes> density ratio:
    # numerator events stacked on top of the shared reference events.
    dataset_mix_model = pd.concat([dataset_num, dataset_ref])

    # Output name and directories for this process.
    output_name = f'{process_type}'
    output_dir = f'{TRAINING_OUTPUT_PATH}general_output_{process_type}'
    path_to_ratios[process_type] = f'{TRAINING_OUTPUT_PATH}output_ratios_{process_type}/'
    path_to_figures[process_type] = f'{TRAINING_OUTPUT_PATH}output_figures_{process_type}/'
    path_to_models[process_type] = f'{TRAINING_OUTPUT_PATH}output_model_params_{process_type}/'

    # Positional arguments: dataset, per-event weights, labels, feature names,
    # feature scaling, the two sample names, output directory and output name.
    NN_training_mix_model[process_type] = TrainEvaluate_NN(
        dataset_mix_model,
        dataset_mix_model['weights_normed'].to_numpy(),
        dataset_mix_model['train_labels'].to_numpy(),
        features,
        features_scaling,
        [process_type, 'ref'],
        output_dir,
        output_name,
        path_to_figures=path_to_figures[process_type],
        path_to_ratios=path_to_ratios[process_type],
        path_to_models=path_to_models[process_type],
        use_log_loss=use_log_loss,
        delete_existing_models=DELETE_EXISTING_MODELS,
    )

    # Drop this cell's reference to the concatenated frame before the next iteration.
    del dataset_mix_model


FileNotFoundError: [Errno 2] No such file or directory: '/workspaces/NSBI-workflow-tutorial/FAIR_universe_Higgs_tautau/saved_datasets/dataset_nominal.root'