In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, auc
from sklearn.model_selection import train_test_split

from rapidgbm import RapidGBMTuner

import mplhep as hep
hep.style.use([hep.style.ATLAS])
import pickle


In [4]:
def concatenate_parquet_files(path):
    """
    Concatenate all parquet files in the given directory into a single DataFrame.

    Parameters:
    path (str): The directory containing the parquet files.

    Returns:
    pd.DataFrame: The concatenated DataFrame.
    """
    # List all files in the directory
    files = [f for f in os.listdir(path) if f.endswith('.parquet')]
    
    # Read and concatenate all parquet files
    dataframes = [pd.read_parquet(os.path.join(path, file)) for file in files]
    concatenated_df = pd.concat(dataframes, ignore_index=True)
    
    return concatenated_df

In [5]:
path_zee = '/groups/hep/kinch/data/preprocessed_data/files_for_dual_models/zee_deriv/pairs'
path_ttbar = '/groups/hep/kinch/data/preprocessed_data/files_for_dual_models/ttbar/pairs'

data_ttbar = concatenate_parquet_files(path_ttbar)
data_zeederiv = concatenate_parquet_files(path_zee)

In [6]:
m_ee_ttbar = np.sqrt( (data_ttbar['el1_truth_E'] + data_ttbar['el2_truth_E'])**2 - (data_ttbar['el1_truth_px']+data_ttbar['el2_truth_px'])**2 - (data_ttbar['el1_truth_py']+data_ttbar['el2_truth_py'])**2 - (data_ttbar['el1_truth_pz']+data_ttbar['el2_truth_pz'])**2 ) / 1000
m_ee_zeederiv = np.sqrt( (data_zeederiv['el1_truth_E'] + data_zeederiv['el2_truth_E'])**2 - (data_zeederiv['el1_truth_px']+data_zeederiv['el2_truth_px'])**2 - (data_zeederiv['el1_truth_py']+data_zeederiv['el2_truth_py'])**2 - (data_zeederiv['el1_truth_pz']+data_zeederiv['el2_truth_pz'])**2 ) / 1000


data_ttbar['m_ee_truth'] = m_ee_ttbar
data_zeederiv['m_ee_truth'] = m_ee_zeederiv


In [7]:
param_list1 = ['el1_eta', 'el1_phi', 'el1_m', 'el1_charge', 'el1_f1',
       'el1_neflowisol20', 'el1_truthPdgId', 'el1_truthType',
       'el1_truthOrigin', 'el1_DFCommonElectronsECIDS',
       'el1_DFCommonElectronsECIDSResult', 'el1_DFCommonElectrons_LHLoose',
       'el1_DFCommonElectronsLHLooseIsEMValue',
       'el1_DFCommonElectronsLHVeryLoose',
       'el1_DFCommonElectronsLHVeryLooseIsEMValue',
       'el1_DFCommonElectronsDNN_pel', 'el1_DFCommonElectronsDNN_pcf',
       'el1_DFCommonElectronsDNN_ppc', 'el1_DFCommonElectronsDNN_phf',
       'el1_DFCommonElectronsDNN_ple', 'el1_DFCommonElectronsDNN_plh',
       'el1_GSFTrack_d0', 'el1_GSFTrack_z0', 'el1_GSFTrack_theta',
       'el1_GSFTrack_phi', 'el1_GSFTrack_qOverP', 'el1_GSF_dR',
       'el1_GSF_Track_Var0', 'el1_GSF_Track_Var1', 'el1_GSF_Track_Var2',
       'el1_GSF_Track_Var3', 'el1_GSF_Track_Var4', 'el1_ptvarcone20',
       'el1_topoetcone20', 'el1_topoetcone40',
       'el1_ptvarcone30_Nonprompt_All_MaxWeightTTVALooseCone_pt1000',
       'el1_ptcone20_Nonprompt_All_MaxWeightTTVALooseCone_pt1000',
       'el1_core57cellsEnergyCorrection', 'el1_topoetcone20ptCorrection',
       'el1_ptcone20_Nonprompt_All_MaxWeightTTVALooseCone_pt500',
       'el1_ptvarcone30_Nonprompt_All_MaxWeightTTVALooseCone_pt500','el1_InDetTrack_d0', 'el1_InDetTrack_z0',
       'el1_InDetTrack_theta', 'el1_InDetTrack_phi', 'el1_InDetTrack_covdiag0',
       'el1_InDetTrack_covdiag1', 'el1_InDetTrack_covdiag2',
       'el1_InDetTrack_covdiag3', 'el1_InDetTrack_covdiag4', 'el1_InDet_dR' ]
param_list2 = ['el2_eta', 'el2_phi', 'el2_m', 'el2_charge', 'el2_f1',
       'el2_neflowisol20', 'el2_truthPdgId', 'el2_truthType',
       'el2_truthOrigin', 'el2_DFCommonElectronsECIDS',
       'el2_DFCommonElectronsECIDSResult', 'el2_DFCommonElectrons_LHLoose',
       'el2_DFCommonElectronsLHLooseIsEMValue',
       'el2_DFCommonElectronsLHVeryLoose',
       'el2_DFCommonElectronsLHVeryLooseIsEMValue',
       'el2_DFCommonElectronsDNN_pel', 'el2_DFCommonElectronsDNN_pcf',
       'el2_DFCommonElectronsDNN_ppc', 'el2_DFCommonElectronsDNN_phf',
       'el2_DFCommonElectronsDNN_ple', 'el2_DFCommonElectronsDNN_plh',
       'el2_GSFTrack_d0', 'el2_GSFTrack_z0', 'el2_GSFTrack_theta',
       'el2_GSFTrack_phi', 'el2_GSFTrack_qOverP', 'el2_GSF_dR',
       'el2_GSF_Track_Var0', 'el2_GSF_Track_Var1', 'el2_GSF_Track_Var2',
       'el2_GSF_Track_Var3', 'el2_GSF_Track_Var4', 'el2_ptvarcone20',
       'el2_topoetcone20', 'el2_topoetcone40',
       'el2_ptvarcone30_Nonprompt_All_MaxWeightTTVALooseCone_pt1000',
       'el2_ptcone20_Nonprompt_All_MaxWeightTTVALooseCone_pt1000',
       'el2_core57cellsEnergyCorrection', 'el2_topoetcone20ptCorrection',
       'el2_ptcone20_Nonprompt_All_MaxWeightTTVALooseCone_pt500',
       'el2_ptvarcone30_Nonprompt_All_MaxWeightTTVALooseCone_pt500','el2_InDetTrack_d0', 'el2_InDetTrack_z0',
       'el2_InDetTrack_theta', 'el2_InDetTrack_phi', 'el2_InDetTrack_covdiag0',
       'el2_InDetTrack_covdiag1', 'el2_InDetTrack_covdiag2',
       'el2_InDetTrack_covdiag3', 'el2_InDetTrack_covdiag4', 'el2_InDet_dR' ]

param_list = param_list1 + param_list2
        

In [8]:
mask_ttbar = (data_ttbar['el1_truthType'] == 2) & (data_ttbar['el2_truthType'] == 2)
mask_zeederiv = (data_zeederiv['el1_truthType'] == 2) & (data_zeederiv['el2_truthType'] == 2)

data_ttbar = data_ttbar[mask_ttbar]
data_zeederiv = data_zeederiv[mask_zeederiv]

In [44]:
data = pd.concat([data_ttbar, data_zeederiv]).reset_index(drop=True)


print(((data['eventNumber'] % 4)).value_counts())
train = data[data['eventNumber'] % 4 != 0]
test = data[data['eventNumber'] % 4 == 0]


# train, test = train_test_split(data, test_size=0.25, random_state=1)

# train = data.sample(frac=0.75, random_state=13)
# test = data.drop(train.index)

print('train:', len(train), len(train)/(len(train) + len(test)))
print('test:', len(test), len(test)/(len(train) + len(test)))
print('ratio:', len(test)/len(train))

X_train = train[param_list]
y_train = train['m_ee_truth']

eventNumber
3    164361
2    163744
0    163468
1    163245
Name: count, dtype: int64
train: 491350 0.7503611690576618
test: 163468 0.24963883094233819
ratio: 0.332691564058207


In [19]:
for col in X_train.columns:
    missing_indices = X_train[col].index[X_train[col].isna()]
    for i in missing_indices:
        print(col, i)

In [49]:
testdf = X_train.iloc[660:672]
for col in testdf.columns:
    print(testdf[col])

891   -2.247147
892    0.318419
893    0.321870
894    1.035340
899    0.419433
900    0.555963
901   -0.555268
902    1.407565
903   -0.740946
904    1.115448
905   -1.043603
907   -0.738282
Name: el1_eta, dtype: float32
891    2.959022
892   -1.398195
893    0.475622
894    2.659543
899    2.993395
900    2.995723
901   -1.777446
902   -3.113723
903    2.975203
904    0.025204
905    2.227498
907    2.155857
Name: el1_phi, dtype: float32
891    0.510998
892    0.510998
893    0.510998
894    0.510998
899    0.510998
900    0.510998
901    0.510998
902    0.510998
903    0.510998
904    0.510998
905    0.510998
907    0.510998
Name: el1_m, dtype: float32
891   -1.0
892    1.0
893    1.0
894    1.0
899    1.0
900    1.0
901   -1.0
902   -1.0
903   -1.0
904    1.0
905    1.0
907   -1.0
Name: el1_charge, dtype: float32
891    0.294951
892    0.179821
893    0.301759
894    0.263660
899    0.217422
900    0.278564
901    0.231240
902    0.000000
903    0.407386
904    0.172071
905    0.30

In [48]:
print(y_train[660:672])

891    161.429585
892     98.555354
893     53.418970
894     52.419586
899     49.010613
900     79.501818
901    209.388495
902     53.869788
903     12.121423
904     80.013624
905     43.852136
907     76.788485
Name: m_ee_truth, dtype: float64


In [50]:
tuner = RapidGBMTuner(metric='mae', trials=2, refit=True, verbosity=1, visualization=True, seed=414243)
length = 670
tuner.fit(X_train.head(length), y_train.head(length))

# tuner.fit_optimized(X_train.to_numpy(), y_train.squeeze().to_numpy())


 * Initiating LGBMTuner.fit
     . Settings:
     .. Trying 2 trials
     .. Evaluation metric: mae 
     .. Study direction: minimize mae

     . Trial number: 0 finished
     .. Optimization score (lower-better): mae: 32.71672799066297
 ...........................................................................


ValueError: Input contains NaN.