In [100]:
import os, sys, importlib
# sys.path.append('../')

import nsbi_common_utils
from nsbi_common_utils import plotting, training, inference
import glob
import numpy as np

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
import tensorflow as tf
tf.get_logger().setLevel('ERROR')
from tensorflow.keras.optimizers import Nadam
import mplhep as hep
import pickle
import matplotlib.pyplot as plt
import yaml

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

hep.style.use(hep.style.ATLAS)


In [101]:
# Initialize a skeleton workspace spec
spec = {
    "channels": [],
    "measurements": [],
    "observations": [],
    "version": [],
}

In [102]:
with open("config.yml", "r") as f:
    config = yaml.safe_load(f)

In [103]:
# path prefix for general save directory
path_prefix = config['path_prefix']

# sub-path for saving cached data used between modules
path_saved_data = config['path_saved_data']
saved_data = f'{path_prefix}{path_saved_data}'

# Get the dictionary of labels to processes
labels_dict = config["labels_dict"]

# Signal processes in the model
signal_processeses = config["signal_processes"]

# Background processes in the model
background_processes = config["background_processes"]

mix_model_hypotheses = config["mix_model_hypotheses"]
ref_processes = config["ref_processes"]

all_process = signal_processeses + background_processes

channels_binned = config["channels_binned"]
channels_unbinned = config["channels_unbinned"]

print(channels_binned)
print(channels_unbinned)

['SR_binned', 'CR']
['SR']


In [104]:
# Load the nominal dataset for evaluation
dataset = pd.read_hdf(f"{saved_data}dataset_preselected_nominal_SR.h5", key="dataset", mode='r')

mask_processes = {}
for process_type in all_process:
    mask_processes[process_type] = dataset.type==process_type

# Mask that extracts the expected Asimov dataset
process_asimov = all_process

mask_Asimov = np.logical_or.reduce([mask_processes[process_type] for process_type in process_asimov])
weights_Asimov = np.array(dataset[mask_Asimov].weights.ravel())

  weights_Asimov = np.array(dataset[mask_Asimov].weights.ravel())


In [105]:
path_to_workspace_data_nominal = f'{path_prefix}output_training_nominal/'

In [106]:
path_to_Asimov_weights = f"{path_to_workspace_data_nominal}/weights_nominal_Asimov.npy"
np.save(path_to_Asimov_weights, weights_Asimov)

In [107]:
spec_channels_list = []

spec_channels_list.append(
    {
        "name": "SR",
        "type": "unbinned",
        "weights": path_to_Asimov_weights,
        "samples": []
    }
)

In [108]:
for process in all_process:
    if process in mix_model_hypotheses:
        path_to_ratio = f'{path_to_workspace_data_nominal}output_ratios_{process_type}/ratio_{process_type}.npy'
        spec_channels_list[0]["samples"].append(
            {
                "name": process,
                "data": path_to_ratio,
                "modifiers":[
                    {
                        "data": None,
                        "name": f"mu_{process}",
                        "type": "normfactor",   
                    },
                ]
            }
        )
    else:
        # The case where a sample is used as reference and thus has density ratio = 1 by default
        spec_channels_list[0]["samples"].append(
            {
                "name": process,
                "data": "",
                "modifiers":[
                    {
                        "data": None,
                        "name": f"mu_{process}",
                        "type": "normfactor",   
                    },
                ]
            }
        )
    

In [109]:
for channel in channels_binned:
    spec_channels_list.append(
        {
            "name": channel,
            "type": "binned",
            "samples": []
        }
    )

    with open(f"{saved_data}hist_binned_{channel}.pkl", "rb") as fp:
            hist_channel = pickle.load(fp)
        
    for process in all_process:
        
        spec_channels_list[-1]["samples"].append(
            {
                "name": process,
                "data": hist_channel[process].astype(float).tolist(),
                "modifiers": [{
                    "data": None,
                    "name": f"mu_{process}",
                    "type": "normfactor",  
                }]
            }
    )

In [110]:
spec = {
    "channels": spec_channels_list,
    "measurements": [
        {
            "name": "higgs_measurement",
            "config": {
                "poi": "mu_htautau", "parameters": []
            }
        }
    ],
    "observations": [],
    "version": "1.0.0",
}

In [111]:
import json

with open("workspace_stat_only.json", "w", encoding="utf-8") as f:
    json.dump(spec, f, indent=2)

In [112]:
path_to_dict_systs = f"{saved_data}dict_systs.npy"

# Check if user has provided uncertainty NPs
if "dict_systs" in config:
    dict_systs = config["dict_systs"]
    # Load the SR yield variations
    with open(f"{saved_data}yield_SR_variations.pkl", "rb") as fp:
        nu_var_SR = pickle.load(fp)
    # Save the Control Region variation histogram
    with open(f"{saved_data}hist_binned_variations.pkl", "rb") as fp:
        hist_variations = pickle.load(fp)
else:
    dict_systs = {}

# Full list of systematics
list_syst = [key for key in dict_systs]

print(list_syst)

['TES', 'JES']


In [113]:
dict_systs

{'TES': {'process': ['htautau', 'ttbar', 'ztautau'],
  'directions': ['up', 'dn']},
 'JES': {'process': ['htautau', 'ttbar', 'ztautau'],
  'directions': ['up', 'dn']}}

In [114]:
top_path_systematics = f'{path_prefix}output_training_systematics/'

for syst, data in dict_systs.items():
    
    for process_dict in spec_channels_list[0]["samples"]:
        
        process_name = process_dict["name"]
        
        if process_name in data["process"]:

            if "up" in data["directions"]:
                path = f'{top_path_systematics}output_ratios_{process_name}_{syst}_up/'
                hi_data_path = f'{path}ratio_{syst}_up.npy'
            else:
                hi_data_path = ""
                
            if "dn" in data["directions"]:
                path = f'{top_path_systematics}output_ratios_{process_name}_{syst}_dn/'
                lo_data_path = f'{path}ratio_{syst}_dn.npy'
            else:
                lo_data_path = ""
            
            process_dict["modifiers"].append(
                {
                    "data": {"hi_data": hi_data_path, "lo_data": lo_data_path},
                    "name": f"alpha_{syst}",
                    "type": "shapesys"   
                }
            )

In [119]:
dict_channel_index = {
    'SR': 0,
    'SR_binned': 1,
    'CR': 2
}

In [120]:
len(spec_channels_list[dict_channel_index[channel_selected]]["samples"][0]['data'])

1

In [None]:
channel_selected =  'SR_binned'

for syst, data in dict_systs.items():
    
    for process_dict in spec_channels_list[dict_channel_index[channel_selected]]["samples"]:
        
        process_name = process_dict["name"]
        
        if process_name in data["process"]:

            if "up" in data["directions"]:
                hi_data = hist_variations[channel_selected][process_name][syst]['up']
            else:
                hi_data = np.ones(len(spec_channels_list[dict_channel_index[channel_selected]]["samples"][0]['data'])).astype(float).to_list()
                
            if "dn" in data["directions"]:
                lo_data = hist_variations[channel_selected][process_name][syst]['dn']
            else:
                lo_data = 2.0 - hi_data
            
            process_dict["modifiers"].append(
                {
                    "data": {"hi_data": hi_data, "lo_data": lo_data},
                    "name": f"alpha_{syst}",
                    "type": "shapesys"   
                }
            )

In [124]:
spec_channels_list[1]

{'name': 'SR_binned',
 'type': 'binned',
 'samples': [{'name': 'htautau',
   'data': [1.3404816389083862],
   'modifiers': [{'data': None, 'name': 'mu_htautau', 'type': 'normfactor'},
    {'data': {'hi_data': array([1.0010142], dtype=float32),
      'lo_data': array([0.9900152], dtype=float32)},
     'name': 'alpha_TES',
     'type': 'shapesys'},
    {'data': {'hi_data': array([1.2158968], dtype=float32),
      'lo_data': array([0.7761778], dtype=float32)},
     'name': 'alpha_JES',
     'type': 'shapesys'}]},
  {'name': 'ttbar',
   'data': [0.20330865681171417],
   'modifiers': [{'data': None, 'name': 'mu_ttbar', 'type': 'normfactor'},
    {'data': {'hi_data': array([1.], dtype=float32),
      'lo_data': array([1.], dtype=float32)},
     'name': 'alpha_TES',
     'type': 'shapesys'},
    {'data': {'hi_data': array([1.5734978], dtype=float32),
      'lo_data': array([1.2855403], dtype=float32)},
     'name': 'alpha_JES',
     'type': 'shapesys'}]},
  {'name': 'ztautau',
   'data': [7.2