# Create & Train ML Models

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from joblib import dump
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler

In [6]:
version = '7.0.1'
train = df = pd.read_csv(f'../data_files/{version}/train.csv', index_col=[0])
X_train = train.drop(['category', 'Lb_M', 'IsSimulated'], axis=1)
y_train = train.category

val = df = pd.read_csv(f'../data_files/{version}/val.csv', index_col=[0])
X_val = val.drop(['category', 'Lb_M', 'IsSimulated'], axis=1)
y_val = val.category

test = df = pd.read_csv(f'../data_files/{version}/test.csv', index_col=[0])
X_test = test.drop(['category', 'Lb_M', 'IsSimulated'], axis=1)
y_test = test.category

## K-Nearest Neighbors Classifier

In [4]:
knn_classifier = KNeighborsClassifier(n_neighbors=70)
knn_classifier.fit(X_train, y_train)
val_pred_classes = knn_classifier.predict(X_test)
val_pred_probs = knn_classifier.predict_proba(X_test)[:,1]

ras = roc_auc_score(y_test, val_pred_probs)
bas = accuracy_score(y_test, val_pred_classes)

print(f"ROC-AUC Score: {ras:.4f}\nAccuracy Score: {bas:.4f}")

# Save the model using joblib
dump(knn_classifier, f'models/KNN_{version}.joblib')  

ROC-AUC Score: 0.9761
Accuracy Score: 0.9145


['models/KNN_7.0.1.joblib']

## Random Forest Classifier

In [7]:
clf = RandomForestClassifier(n_estimators=500, max_depth=12, random_state=0)
clf.fit(X_train, y_train)

val_pred_classes = clf.predict(X_val)
val_pred_probs = clf.predict_proba(X_val)[:,1]

ras = roc_auc_score(y_val, val_pred_probs)
bas = accuracy_score(y_val, val_pred_classes)

print(f"ROC-AUC Score: {ras:.4f}\nAccuracy Score: {bas:.4f}")

# Save the model using joblib
dump(clf, f'models/RFC_{version}.joblib') 

ROC-AUC Score: 0.9859
Accuracy Score: 0.9406


['models/RFC_7.0.1.joblib']

In [10]:
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(clf, random_state=1).fit(X_val, y_val)
eli5.show_weights(perm, feature_names = X_val.columns.tolist())

Weight,Feature
0.0489  ± 0.0028,Lb_DTF_PV_chi2
0.0125  ± 0.0006,Lb_TAUERR
0.0101  ± 0.0040,Lb_IP23
0.0062  ± 0.0019,LStar_ORIVX_CHI2
0.0051  ± 0.0025,SUM_CONE_ISO
0.0048  ± 0.0031,p_TRACK_VeloCHI2NDOF
0.0045  ± 0.0027,Lb_ENDVERTEX_CHI2
0.0038  ± 0.0034,LN_COS_LBDIRA
0.0035  ± 0.0016,Lb_PT
0.0028  ± 0.0011,K_TRACK_VeloCHI2NDOF


## Decision Tree Classifier

In [12]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier(max_depth=8, random_state=0)
dtc.fit(X_train, y_train)

val_pred_classes = dtc.predict(X_val)
val_pred_probs = dtc.predict_proba(X_val)[:,1]

ras = roc_auc_score(y_val, val_pred_probs)
bas = accuracy_score(y_val, val_pred_classes)

print(f"ROC-AUC Score: {ras:.4f}\nAccuracy Score: {bas:.4f}")

# Save the model using joblib
dump(dtc, f'models/DTC_{version}.joblib') 

ROC-AUC Score: 0.9551
Accuracy Score: 0.9132


['models/DTC_7.0.1.joblib']

In [13]:
perm = PermutationImportance(dtc, random_state=1).fit(X_val, y_val)
eli5.show_weights(perm, feature_names = X_val.columns.tolist())

Weight,Feature
0.0784  ± 0.0019,Lb_DTF_PV_chi2
0.0779  ± 0.0046,LStar_ORIVX_CHI2
0.0363  ± 0.0038,Lb_TAUERR
0.0336  ± 0.0027,LN_LB_MINIPCHI2
0.0314  ± 0.0058,LN_COS_LBDIRA
0.0223  ± 0.0020,Lb_IP23
0.0118  ± 0.0047,p_TRACK_VeloCHI2NDOF
0.0110  ± 0.0024,SUM_CONE_ISO
0.0048  ± 0.0016,Lb_IPCHI2_OWNPV
0.0043  ± 0.0012,K_TRACK_VeloCHI2NDOF


## Gradient Boosted Decision Tree

## CURRENTLY BROKEN - BROKEN INSTALL?!

In [6]:
params = {
    'objective': 'binary:logistic',
    'max_depth': 6,
    'learning_rate': 0.05,
    'n_estimators': 500,
    'eval_metric': ['auc', 'logloss'],
    'use_label_encoder': False,
}

xgb_clf = xgb.XGBClassifier(**params)

# Fit the classifier to the training data
xgb_clf.fit(X_train.to_numpy(), y_train.to_numpy(), eval_set=[(X_val.to_numpy(), y_val.to_numpy())], early_stopping_rounds=20, eval_metric='auc')

AttributeError: /cvmfs/lhcbdev.cern.ch/conda/envs/default/2021-09-07_04-06/linux-64/lib/libxgboost.so: undefined symbol: XGDMatrixCreateFromDense

## Random Forest + Boosted Decision Tree