# Create & Train ML Models

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from joblib import dump
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score
import eli5
from eli5.sklearn import PermutationImportance

In [3]:
version = '7.0.2'
train = df = pd.read_csv(f'../data_files/{version}/train.csv', index_col=[0])
X_train = train.drop(['category', 'Lb_M', 'IsSimulated'], axis=1)
y_train = train.category

val = df = pd.read_csv(f'../data_files/{version}/val.csv', index_col=[0])
X_val = val.drop(['category', 'Lb_M', 'IsSimulated'], axis=1)
y_val = val.category

test = df = pd.read_csv(f'../data_files/{version}/test.csv', index_col=[0])
X_test = test.drop(['category', 'Lb_M', 'IsSimulated'], axis=1)
y_test = test.category

## K-Nearest Neighbors Classifier

In [4]:
knn_classifier = KNeighborsClassifier(n_neighbors=70)
knn_classifier.fit(X_train, y_train)
val_pred_classes = knn_classifier.predict(X_test)
val_pred_probs = knn_classifier.predict_proba(X_test)[:,1]

ras = roc_auc_score(y_test, val_pred_probs)
bas = accuracy_score(y_test, val_pred_classes)

print(f"ROC-AUC Score: {ras:.4f}\nAccuracy Score: {bas:.4f}")

# Save the model using joblib
dump(knn_classifier, f'models/KNN_{version}.joblib')  

ROC-AUC Score: 0.9833
Accuracy Score: 0.9335


['models/KNN_7.0.2.joblib']

## Random Forest Classifier

In [5]:
clf = RandomForestClassifier(n_estimators=500, max_depth=12, random_state=0)
clf.fit(X_train, y_train)

val_pred_classes = clf.predict(X_val)
val_pred_probs = clf.predict_proba(X_val)[:,1]

ras = roc_auc_score(y_val, val_pred_probs)
bas = accuracy_score(y_val, val_pred_classes)

print(f"ROC-AUC Score: {ras:.4f}\nAccuracy Score: {bas:.4f}")

# Save the model using joblib
dump(clf, f'models/RFC_{version}.joblib') 

ROC-AUC Score: 0.9868
Accuracy Score: 0.9435


['models/RFC_7.0.2.joblib']

In [7]:
perm = PermutationImportance(clf, random_state=1).fit(X_val, y_val)
eli5.show_weights(perm, feature_names = X_val.columns.tolist())

Weight,Feature
0.0516  ± 0.0052,LN_Lb_DTF_PV_chi2
0.0103  ± 0.0020,Lb_TAUERR
0.0070  ± 0.0016,LStar_IP_ORIVX
0.0056  ± 0.0021,JPs_IP_TOPPV
0.0049  ± 0.0014,p_TRACK_VeloCHI2NDOF
0.0047  ± 0.0026,LN_Lb_IP23
0.0042  ± 0.0014,L1_TRACK_VeloCHI2NDOF
0.0037  ± 0.0013,LN_SUM_CONEISO
0.0037  ± 0.0009,K_TRACK_VeloCHI2NDOF
0.0036  ± 0.0009,Lb_PT


## Decision Tree Classifier

In [8]:
dtc = DecisionTreeClassifier(max_depth=8, random_state=0)
dtc.fit(X_train, y_train)

val_pred_classes = dtc.predict(X_val)
val_pred_probs = dtc.predict_proba(X_val)[:,1]

ras = roc_auc_score(y_val, val_pred_probs)
bas = accuracy_score(y_val, val_pred_classes)

print(f"ROC-AUC Score: {ras:.4f}\nAccuracy Score: {bas:.4f}")

# Save the model using joblib
dump(dtc, f'models/DTC_{version}.joblib') 

ROC-AUC Score: 0.9590
Accuracy Score: 0.9143


['models/DTC_7.0.2.joblib']

In [9]:
perm = PermutationImportance(dtc, random_state=1).fit(X_val, y_val)
eli5.show_weights(perm, feature_names = X_val.columns.tolist())

Weight,Feature
0.0819  ± 0.0056,LN_Lb_DTF_PV_chi2
0.0545  ± 0.0028,JPs_ORIVX_CHI2
0.0510  ± 0.0026,LN_Lb_IPCHI2_OWNPV
0.0271  ± 0.0050,Lb_TAUERR
0.0258  ± 0.0015,LStar_IP_ORIVX
0.0234  ± 0.0047,LN_COS_LBDIRA
0.0160  ± 0.0015,LN_Lb_IP23
0.0134  ± 0.0009,JPs_IP_ORIVX
0.0096  ± 0.0020,JPs_IP_TOPPV
0.0093  ± 0.0023,p_TRACK_VeloCHI2NDOF


## Gradient Boosted Decision Tree

## CURRENTLY BROKEN - BROKEN INSTALL?!

In [8]:
params = {
    'objective': 'binary:logistic',
    'max_depth': 6,
    'learning_rate': 0.05,
    'n_estimators': 500,
    'eval_metric': ['auc', 'logloss'],
    'use_label_encoder': False,
}

xgb_clf = xgb.XGBClassifier(**params)

# Fit the classifier to the training data
xgb_clf.fit(X_train.to_numpy(), y_train.to_numpy(), eval_set=[(X_val.to_numpy(), y_val.to_numpy())], early_stopping_rounds=20, eval_metric='auc')

AttributeError: /cvmfs/lhcbdev.cern.ch/conda/envs/default/2021-09-07_04-06/linux-64/lib/libxgboost.so: undefined symbol: XGDMatrixCreateFromDense

## Random Forest + Boosted Decision Tree