In [10]:
from root_pandas import read_root
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, precision_recall_curve, confusion_matrix, precision_score, recall_score
import gc ; gc.enable()
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from tqdm import tqdm
import lightgbm as lgb

In [51]:
ids = [
'runNumber'
,'eventNumber'
,'nCandidate']

features = [
'TwoBody_DIRA_OWNPV'
,'TwoBody_DOCAMAX'
,'TwoBody_ENDVERTEX_CHI2'
,'noexpand:log(TwoBody_FDCHI2_OWNPV)'    
,'TwoBody_FD_OWNPV'
,'TwoBody_M'
,'TwoBody_IPCHI2_OWNPV'
,'TwoBody_Mcorr'
,'TwoBody_PT'
,'noexpand:log(Track1_MINIPCHI2)'
,'Track1_PT'
,'noexpand:log(Track2_MINIPCHI2)'
,'Track2_PT'
,'TwoBody_n_Extra']

labels = [
'TwoBody_FromSameB' ]

cols = ids+ features + labels

In [52]:
df = read_root('TaggingJpsiK2012_tiny.root', columns=cols)
#df = read_root('TaggingJpsiK2012_5pct.root', columns=cols)
#as eventNumber is not unique, but for a given run number, it is, so we can combine the two to get a completely unique id for each event
df.index = df.apply(lambda x:str(int(x.runNumber)) + str(int(x.eventNumber))+'-'+str(int(x.nCandidate)), axis=1 )
df = df.drop(columns=['runNumber', 'eventNumber', 'nCandidate'], axis=0)
#extra features
df['FD_over_FDCHI2'] = df.apply(lambda x: x['TwoBody_FD_OWNPV']/1.5*x['log(TwoBody_FDCHI2_OWNPV)'], axis=1)
print('-Total Number of two_track_vertex candidates: ',df.shape[0])

-Total Number of two_track_vertex candidates:  19107


In [53]:
df.head()

Unnamed: 0,TwoBody_M,TwoBody_Mcorr,TwoBody_PT,Track1_PT,TwoBody_n_Extra,TwoBody_DOCAMAX,TwoBody_ENDVERTEX_CHI2,TwoBody_DIRA_OWNPV,TwoBody_IPCHI2_OWNPV,Track2_PT,TwoBody_FromSameB,TwoBody_FD_OWNPV,log(TwoBody_FDCHI2_OWNPV),log(Track1_MINIPCHI2),log(Track2_MINIPCHI2),FD_over_FDCHI2
5758380686695-0,965.996975,3980.631373,1088.551894,735.016895,26,0.032598,0.251422,0.986617,33.366437,375.623008,0.0,1.282012,3.5507,2.269493,3.226786,3.034694
5758380686695-1,803.724934,2700.606158,1041.253501,1031.496729,7,0.094166,0.557728,0.998899,563.848867,216.595901,0.0,17.710202,6.982014,6.773103,1.964766,82.435252
5758380686695-2,892.712487,2731.500455,1154.322927,1031.496729,7,0.043765,0.52562,0.999298,565.746638,392.828135,0.0,18.742195,6.990956,6.773103,1.726091,87.350578
5758380686695-3,417.081191,1588.27769,1203.702635,1031.496729,5,0.251199,5.76726,0.999661,706.378235,181.251025,1.0,34.10046,7.193034,6.773103,3.428836,163.523855
5758380686695-4,1459.852016,17355.173461,1215.486774,346.628432,4,0.055936,0.405544,0.932815,971.44105,1031.496729,0.0,2.79383,6.919061,4.713366,6.773103,12.88712


In [54]:
feats = [c for c in df.columns if c not in labels]
X = df[feats]
y = df[labels]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=43)
ids1 = X_train.index ; ids2 = X_test.index

oof = pd.Series(np.zeros(y_train.shape[0]), index= ids1)
preds = pd.Series(np.zeros(y_test.shape[0]), index= ids2)

all_data = pd.concat([X_train, X_test])
norm_data = StandardScaler().fit_transform(all_data)
X_train = norm_data[:X_train.shape[0]] ; y_train = y_train.to_numpy().ravel()
X_test = norm_data[X_train.shape[0]:] ; y_test = y_test.to_numpy().ravel()

skf = StratifiedKFold(n_splits=5, random_state=42)


for train_idx, cv_idx in tqdm(skf.split(X_train, y_train), total=skf.n_splits):
    model = LGBMClassifier(metric='roc')   
    model.fit(X_train[train_idx], y_train[train_idx])
    oof.iloc[cv_idx] = model.predict_proba(X_train[cv_idx])[:,1]
    preds.loc[ids2] += model.predict_proba(X_test)[:,1] / skf.n_splits

100%|██████████| 5/5 [00:01<00:00,  2.61it/s]


In [55]:
roc_auc_score(y_train, oof), precision_score(y_train, round(oof)) , recall_score(y_train, round(oof))

(0.9317755325099645, 0.7410468319559229, 0.491324200913242)

In [56]:
roc_auc_score(y_test, preds), precision_score(y_test, round(preds)) , recall_score(y_test, round(preds))

(0.9507322089188837, 0.7662721893491125, 0.5068493150684932)

In [57]:
all_preds = pd.concat([oof, preds])

In [58]:
all_preds[all_preds>0.16].to_csv('second_round_twobody_candidates_tiny.csv')

  """Entry point for launching an IPython kernel.
