In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split



In [2]:
sample = pd.read_parquet('HZeeg_ggF_MC_reduced_Zmodel_eepairs_28August_131variables_300events.parquet')

In [3]:
print(sample.shape)
print(sample.columns)

(116813, 64)
Index(['event_index', 'el_index', 'runNumber', 'eventNumber',
       'actualInteractionsPerCrossing', 'averageInteractionsPerCrossing',
       'm_ee', 'isZ', 'el1_pt', 'el1_eta', 'el1_phi', 'el1_m', 'el1_charge',
       'el1_ptvarcone20', 'el1_topoetcone20', 'el1_topoetcone40', 'el1_f1',
       'el1_neflowisol20', 'el1_truthPdgId', 'el1_truthType',
       'el1_truthOrigin', 'el1_DFCommonElectronsECIDS',
       'el1_DFCommonElectronsECIDSResult', 'el1_DFCommonElectrons_pel',
       'el1_DFcommonElectrons_LHLoose', 'el1_GSFTrack_d0', 'el1_GSFTrack_z0',
       'el1_GSFTrack_theta', 'el1_GSFTrack_phi', 'el1_GSFTrack_qOverP',
       'el1_GSF_dR', 'el1_GSF_Track_Var0', 'el1_GSF_Track_Var1',
       'el1_GSF_Track_Var2', 'el1_GSF_Track_Var3', 'el1_GSF_Track_Var4',
       'el2_pt', 'el2_eta', 'el2_phi', 'el2_m', 'el2_charge',
       'el2_ptvarcone20', 'el2_topoetcone20', 'el2_topoetcone40', 'el2_f1',
       'el2_neflowisol20', 'el2_truthPdgId', 'el2_truthType',
       'el2_truthOri

In [4]:
isZ = sample[sample['isZ'] == 1]
isNotZ = sample[sample['isZ'] == 0]
isNotZ_sampled = isNotZ.sample(frac=0.01, random_state=42) 
small_sample = pd.concat([isZ, isNotZ_sampled])

print(f'full sample size:{len(sample)}, number of Z:{len(isZ)}, number of not Z:{len(isNotZ)}')
print(f'small sample size:{len(small_sample)}, number of Z:{len(isZ)}, number of not Z:{len(isNotZ_sampled)}')

full sample size:116813, number of Z:688, number of not Z:116125
small sample size:1849, number of Z:688, number of not Z:1161


In [5]:
drop_list = ['isZ', 'm_ee', 'event_index', 'el_index', 'runNumber', 'eventNumber', 'el1_pt', 'el2_pt'
             , 'el1_truthPdgId', 'el2_truthPdgId', 'el1_truthOrigin', 'el2_truthOrigin', 'el1_truthType', 'el2_truthType']

In [6]:
input_data = small_sample.drop(drop_list, axis=1)
truth_data = small_sample['isZ']

In [7]:
import optuna
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

r = np.random
r.seed(12)

X_train, X_test, y_train, y_test = train_test_split(input_data, truth_data, test_size=0.25, random_state=12)

In [22]:
def objective(trial):
    param = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'max_depth': trial.suggest_int('max_depth', 0, 100),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1,log=True),
        'drop_rate': trial.suggest_float('drop_rate', 0.005, 0.4),
        'verbose': 0
    }

    dtrain = lgb.Dataset(X_train, label=y_train)
    gbm = lgb.train(param, dtrain)
    preds = gbm.predict(X_test)
    auc = roc_auc_score(y_test, preds)
    return auc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)


[I 2024-09-15 08:20:26,774] A new study created in memory with name: no-name-c412c643-64cc-4753-8394-6d4938b2ede9




[I 2024-09-15 08:20:27,320] Trial 0 finished with value: 0.9907972340858247 and parameters: {'max_depth': 52, 'num_leaves': 21, 'learning_rate': 0.02156189861534425, 'drop_rate': 0.2394377827070462}. Best is trial 0 with value: 0.9907972340858247.




[I 2024-09-15 08:20:28,596] Trial 1 finished with value: 0.9908684157006303 and parameters: {'max_depth': 100, 'num_leaves': 129, 'learning_rate': 0.02754474059136628, 'drop_rate': 0.177089995856689}. Best is trial 1 with value: 0.9908684157006303.




[I 2024-09-15 08:20:29,167] Trial 2 finished with value: 0.9840349806792761 and parameters: {'max_depth': 62, 'num_leaves': 228, 'learning_rate': 0.00012040240898773157, 'drop_rate': 0.08912671482944923}. Best is trial 1 with value: 0.9908684157006303.




[I 2024-09-15 08:20:29,651] Trial 3 finished with value: 0.985041692088672 and parameters: {'max_depth': 60, 'num_leaves': 113, 'learning_rate': 0.004560397271404459, 'drop_rate': 0.18339353329106278}. Best is trial 1 with value: 0.9908684157006303.




[I 2024-09-15 08:20:29,948] Trial 4 finished with value: 0.9839637990644702 and parameters: {'max_depth': 5, 'num_leaves': 30, 'learning_rate': 0.0004261297020865831, 'drop_rate': 0.3003763352509431}. Best is trial 1 with value: 0.9908684157006303.




[I 2024-09-15 08:20:31,193] Trial 5 finished with value: 0.9894447834045149 and parameters: {'max_depth': 83, 'num_leaves': 220, 'learning_rate': 0.06296525067022599, 'drop_rate': 0.1383171441936207}. Best is trial 1 with value: 0.9908684157006303.




[I 2024-09-15 08:20:31,943] Trial 6 finished with value: 0.9889363432987595 and parameters: {'max_depth': 9, 'num_leaves': 179, 'learning_rate': 0.07535502812938324, 'drop_rate': 0.17135936233393062}. Best is trial 1 with value: 0.9908684157006303.
[I 2024-09-15 08:20:32,042] Trial 7 finished with value: 0.9870957901159243 and parameters: {'max_depth': 1, 'num_leaves': 105, 'learning_rate': 0.08334776638375886, 'drop_rate': 0.24248851965355533}. Best is trial 1 with value: 0.9908684157006303.




[I 2024-09-15 08:20:32,237] Trial 8 finished with value: 0.9862619483424853 and parameters: {'max_depth': 53, 'num_leaves': 4, 'learning_rate': 0.004326489161659076, 'drop_rate': 0.2524132530209613}. Best is trial 1 with value: 0.9908684157006303.




[I 2024-09-15 08:20:32,666] Trial 9 finished with value: 0.9840349806792761 and parameters: {'max_depth': 35, 'num_leaves': 72, 'learning_rate': 0.0006638971770984752, 'drop_rate': 0.32814983087281585}. Best is trial 1 with value: 0.9908684157006303.




[I 2024-09-15 08:20:33,948] Trial 10 finished with value: 0.9861297539149888 and parameters: {'max_depth': 96, 'num_leaves': 157, 'learning_rate': 0.010250367776548936, 'drop_rate': 0.01187754686637657}. Best is trial 1 with value: 0.9908684157006303.




[I 2024-09-15 08:20:35,208] Trial 11 finished with value: 0.9900549115314216 and parameters: {'max_depth': 31, 'num_leaves': 63, 'learning_rate': 0.01943330765690191, 'drop_rate': 0.24470963490042774}. Best is trial 1 with value: 0.9908684157006303.




[I 2024-09-15 08:20:36,929] Trial 12 finished with value: 0.9902989627821843 and parameters: {'max_depth': 80, 'num_leaves': 180, 'learning_rate': 0.020186841915605047, 'drop_rate': 0.36746444131074263}. Best is trial 1 with value: 0.9908684157006303.




[I 2024-09-15 08:20:37,865] Trial 13 finished with value: 0.9902379499694937 and parameters: {'max_depth': 77, 'num_leaves': 67, 'learning_rate': 0.01999259136499562, 'drop_rate': 0.11319911030661045}. Best is trial 1 with value: 0.9908684157006303.




[I 2024-09-15 08:20:37,941] Trial 14 finished with value: 0.9796013829570877 and parameters: {'max_depth': 99, 'num_leaves': 2, 'learning_rate': 0.0014682022683949881, 'drop_rate': 0.06549449697509711}. Best is trial 1 with value: 0.9908684157006303.




[I 2024-09-15 08:20:38,472] Trial 15 finished with value: 0.9848688224527151 and parameters: {'max_depth': 34, 'num_leaves': 141, 'learning_rate': 0.008962965856712747, 'drop_rate': 0.20545292541518115}. Best is trial 1 with value: 0.9908684157006303.




[I 2024-09-15 08:20:39,724] Trial 16 finished with value: 0.9905023388244865 and parameters: {'max_depth': 24, 'num_leaves': 248, 'learning_rate': 0.043076794908331384, 'drop_rate': 0.3048587537696693}. Best is trial 1 with value: 0.9908684157006303.




[I 2024-09-15 08:20:41,003] Trial 17 finished with value: 0.9914988814317675 and parameters: {'max_depth': 45, 'num_leaves': 93, 'learning_rate': 0.029630772722250533, 'drop_rate': 0.3887193084875688}. Best is trial 17 with value: 0.9914988814317675.




[I 2024-09-15 08:20:41,741] Trial 18 finished with value: 0.9830892820825707 and parameters: {'max_depth': 71, 'num_leaves': 91, 'learning_rate': 0.0018574839923064304, 'drop_rate': 0.39815040633265325}. Best is trial 17 with value: 0.9914988814317675.




[I 2024-09-15 08:20:43,097] Trial 19 finished with value: 0.9910717917429327 and parameters: {'max_depth': 42, 'num_leaves': 130, 'learning_rate': 0.036176407453795426, 'drop_rate': 0.14581345634266465}. Best is trial 17 with value: 0.9914988814317675.




[I 2024-09-15 08:20:43,852] Trial 20 finished with value: 0.9864551555826723 and parameters: {'max_depth': 43, 'num_leaves': 193, 'learning_rate': 0.009890122338220798, 'drop_rate': 0.047255685867313865}. Best is trial 17 with value: 0.9914988814317675.




[I 2024-09-15 08:20:44,789] Trial 21 finished with value: 0.9908684157006306 and parameters: {'max_depth': 15, 'num_leaves': 138, 'learning_rate': 0.03764450781518419, 'drop_rate': 0.1481116638801261}. Best is trial 17 with value: 0.9914988814317675.




[I 2024-09-15 08:20:46,312] Trial 22 finished with value: 0.9910717917429327 and parameters: {'max_depth': 17, 'num_leaves': 149, 'learning_rate': 0.03912935480234904, 'drop_rate': 0.138876718695967}. Best is trial 17 with value: 0.9914988814317675.




[I 2024-09-15 08:20:48,907] Trial 23 finished with value: 0.9904616636160261 and parameters: {'max_depth': 21, 'num_leaves': 162, 'learning_rate': 0.09988033651198795, 'drop_rate': 0.12190403290922952}. Best is trial 17 with value: 0.9914988814317675.




[I 2024-09-15 08:20:50,723] Trial 24 finished with value: 0.9896684970510474 and parameters: {'max_depth': 42, 'num_leaves': 93, 'learning_rate': 0.04791917411316464, 'drop_rate': 0.21446970586382214}. Best is trial 17 with value: 0.9914988814317675.




[I 2024-09-15 08:20:52,076] Trial 25 finished with value: 0.9897498474679682 and parameters: {'max_depth': 41, 'num_leaves': 123, 'learning_rate': 0.01200460511692016, 'drop_rate': 0.08875376133041049}. Best is trial 17 with value: 0.9914988814317675.




[I 2024-09-15 08:20:52,998] Trial 26 finished with value: 0.9848484848484849 and parameters: {'max_depth': 23, 'num_leaves': 153, 'learning_rate': 0.005623674106701633, 'drop_rate': 0.14981015683483553}. Best is trial 17 with value: 0.9914988814317675.




[I 2024-09-15 08:20:57,379] Trial 27 finished with value: 0.9907463900752491 and parameters: {'max_depth': 62, 'num_leaves': 84, 'learning_rate': 0.03332679062379434, 'drop_rate': 0.28005832408454995}. Best is trial 17 with value: 0.9914988814317675.




[I 2024-09-15 08:20:58,014] Trial 28 finished with value: 0.9826011795810454 and parameters: {'max_depth': 12, 'num_leaves': 38, 'learning_rate': 0.0023649183310218984, 'drop_rate': 0.34397507503018593}. Best is trial 17 with value: 0.9914988814317675.




[I 2024-09-15 08:20:58,724] Trial 29 finished with value: 0.9903803131991052 and parameters: {'max_depth': 52, 'num_leaves': 44, 'learning_rate': 0.01591840694606069, 'drop_rate': 0.2109484183499624}. Best is trial 17 with value: 0.9914988814317675.




[I 2024-09-15 08:21:00,473] Trial 30 finished with value: 0.9896278218425869 and parameters: {'max_depth': 48, 'num_leaves': 114, 'learning_rate': 0.06074895726014617, 'drop_rate': 0.1000994169546088}. Best is trial 17 with value: 0.9914988814317675.




[I 2024-09-15 08:21:01,735] Trial 31 finished with value: 0.9895871466341266 and parameters: {'max_depth': 16, 'num_leaves': 140, 'learning_rate': 0.029844383606057692, 'drop_rate': 0.14694120536081007}. Best is trial 17 with value: 0.9914988814317675.




[I 2024-09-15 08:21:03,291] Trial 32 finished with value: 0.9907870652837095 and parameters: {'max_depth': 30, 'num_leaves': 126, 'learning_rate': 0.03800879283338503, 'drop_rate': 0.16897501552301403}. Best is trial 17 with value: 0.9914988814317675.




[I 2024-09-15 08:21:04,155] Trial 33 finished with value: 0.9912344925767744 and parameters: {'max_depth': 13, 'num_leaves': 143, 'learning_rate': 0.027294834903333346, 'drop_rate': 0.1309010578112945}. Best is trial 17 with value: 0.9914988814317675.




[I 2024-09-15 08:21:04,672] Trial 34 finished with value: 0.9850823672971325 and parameters: {'max_depth': 0, 'num_leaves': 172, 'learning_rate': 0.006526486489299868, 'drop_rate': 0.047339454694829614}. Best is trial 17 with value: 0.9914988814317675.




[I 2024-09-15 08:21:05,735] Trial 35 finished with value: 0.9920886719544438 and parameters: {'max_depth': 26, 'num_leaves': 193, 'learning_rate': 0.02574663848876454, 'drop_rate': 0.18915898051159}. Best is trial 35 with value: 0.9920886719544438.




[I 2024-09-15 08:21:06,907] Trial 36 finished with value: 0.9906243644498678 and parameters: {'max_depth': 27, 'num_leaves': 197, 'learning_rate': 0.01466934326623325, 'drop_rate': 0.224344849917915}. Best is trial 35 with value: 0.9920886719544438.




[I 2024-09-15 08:21:07,302] Trial 37 finished with value: 0.9840349806792761 and parameters: {'max_depth': 37, 'num_leaves': 233, 'learning_rate': 0.0001831335631981833, 'drop_rate': 0.1894581501014093}. Best is trial 35 with value: 0.9920886719544438.




[I 2024-09-15 08:21:08,270] Trial 38 finished with value: 0.9917022574740696 and parameters: {'max_depth': 57, 'num_leaves': 207, 'learning_rate': 0.025861929862405583, 'drop_rate': 0.17061185171088594}. Best is trial 35 with value: 0.9920886719544438.




[I 2024-09-15 08:21:09,245] Trial 39 finished with value: 0.9917836078909905 and parameters: {'max_depth': 67, 'num_leaves': 211, 'learning_rate': 0.02402229278557237, 'drop_rate': 0.2717182135967279}. Best is trial 35 with value: 0.9920886719544438.




[I 2024-09-15 08:21:10,725] Trial 40 finished with value: 0.9905633516371771 and parameters: {'max_depth': 67, 'num_leaves': 211, 'learning_rate': 0.05982697996567312, 'drop_rate': 0.2792174237091325}. Best is trial 35 with value: 0.9920886719544438.




[I 2024-09-15 08:21:11,821] Trial 41 finished with value: 0.9916209070571487 and parameters: {'max_depth': 59, 'num_leaves': 211, 'learning_rate': 0.02507761680228392, 'drop_rate': 0.3958440828151637}. Best is trial 35 with value: 0.9920886719544438.




[I 2024-09-15 08:21:12,544] Trial 42 finished with value: 0.9917632702867603 and parameters: {'max_depth': 57, 'num_leaves': 209, 'learning_rate': 0.022909702259356794, 'drop_rate': 0.39820489204404064}. Best is trial 35 with value: 0.9920886719544438.




Exception ignored on calling ctypes callback function: <function _log_callback at 0x7fc32c5e8f40>
Traceback (most recent call last):
  File "/groups/hep/kinch/miniconda3/lib/python3.12/site-packages/lightgbm/basic.py", line 257, in _log_callback
    def _log_callback(msg: bytes) -> None:
    
KeyboardInterrupt: 


No further splits with positive gain, best gain: -inf


[I 2024-09-15 08:21:13,190] Trial 43 finished with value: 0.9906243644498678 and parameters: {'max_depth': 56, 'num_leaves': 211, 'learning_rate': 0.015191973657532007, 'drop_rate': 0.3677761593066839}. Best is trial 35 with value: 0.9920886719544438.




[I 2024-09-15 08:21:13,632] Trial 44 finished with value: 0.9846146023998373 and parameters: {'max_depth': 58, 'num_leaves': 256, 'learning_rate': 0.006829915435039097, 'drop_rate': 0.3377827783298275}. Best is trial 35 with value: 0.9920886719544438.




[I 2024-09-15 08:21:16,754] Trial 45 finished with value: 0.9910514541387024 and parameters: {'max_depth': 67, 'num_leaves': 235, 'learning_rate': 0.020311394098234088, 'drop_rate': 0.36778114556115055}. Best is trial 35 with value: 0.9920886719544438.




[I 2024-09-15 08:21:17,360] Trial 46 finished with value: 0.9850213544844417 and parameters: {'max_depth': 74, 'num_leaves': 194, 'learning_rate': 0.004157859690200806, 'drop_rate': 0.2684691657269115}. Best is trial 35 with value: 0.9920886719544438.




[I 2024-09-15 08:21:18,633] Trial 47 finished with value: 0.9918852959121415 and parameters: {'max_depth': 89, 'num_leaves': 222, 'learning_rate': 0.024378416010063537, 'drop_rate': 0.31063452687383863}. Best is trial 35 with value: 0.9920886719544438.




[I 2024-09-15 08:21:26,039] Trial 48 finished with value: 0.9895057962172057 and parameters: {'max_depth': 88, 'num_leaves': 225, 'learning_rate': 0.0754857912787257, 'drop_rate': 0.30221602032406414}. Best is trial 35 with value: 0.9920886719544438.




[I 2024-09-15 08:21:26,577] Trial 49 finished with value: 0.9840349806792761 and parameters: {'max_depth': 91, 'num_leaves': 242, 'learning_rate': 0.0008854725723016375, 'drop_rate': 0.1882949546661624}. Best is trial 35 with value: 0.9920886719544438.


