# Create & Train ML Models

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from joblib import dump
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler

In [43]:
version = '7.0.0'
df = pd.read_csv(f'../data_files/{version}/all.csv', index_col=[0])
df.dropna(inplace=True)

info_cols = ['Lb_M', 'IsSimulated']
info_df = df[info_cols]
df.drop(info_cols, inplace=True, axis=1)

val = df.sample(frac=0.15, random_state=0)
X_val = val.drop('category', axis=1)
y_val = val.category
# Reserve some data for validation

df.drop(val.index.to_list(), axis=0, inplace=True)
# Remove validation data from df

y = df.category
X = df.drop('category', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=0)

# Normalise the features
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## K-Nearest Neighbors Classifier

In [38]:
knn_classifier = KNeighborsClassifier(n_neighbors=70)
knn_classifier.fit(X_train, y_train)
val_pred_classes = knn_classifier.predict(X_val)
val_pred_probs = knn_classifier.predict_proba(X_val)[:,1]

ras = roc_auc_score(y_val, val_pred_probs)
bas = accuracy_score(y_val, val_pred_classes)

print(f"ROC-AUC Score: {ras:.4f}\nAccuracy Score: {bas:.4f}")

# Save the model using joblib
dump(knn_classifier, f'models/KNN_{version}.joblib')  

ROC-AUC Score: 0.8335
Accuracy Score: 0.7744


['models/KNN_7.0.0.joblib']

## Random Forest Classifier

In [31]:
clf = RandomForestClassifier(n_estimators=500, max_depth=12, random_state=0)
clf.fit(X_train, y_train)

val_pred_classes = clf.predict(X_val)
val_pred_probs = clf.predict_proba(X_val)[:,1]

ras = roc_auc_score(y_val, val_pred_probs)
bas = accuracy_score(y_val, val_pred_classes)

print(f"ROC-AUC Score: {ras:.4f}\nAccuracy Score: {bas:.4f}")

# Save the model using joblib
dump(clf, f'models/RFC_{version}.joblib') 

ROC-AUC Score: 0.9802
Accuracy Score: 0.9341


['models/RFC_7.0.0.joblib']

## Decision Tree Classifier

In [32]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier(max_depth=8, random_state=0)
dtc.fit(X_train, y_train)

val_pred_classes = dtc.predict(X_val)
val_pred_probs = dtc.predict_proba(X_val)[:,1]

ras = roc_auc_score(y_val, val_pred_probs)
bas = accuracy_score(y_val, val_pred_classes)

print(f"ROC-AUC Score: {ras:.4f}\nAccuracy Score: {bas:.4f}")

# Save the model using joblib
dump(dtc, 'models/DTC_6.0.6.joblib') 

ROC-AUC Score: 0.9432
Accuracy Score: 0.9038


['models/DTC_6.0.6.joblib']

## Gradient Boosted Decision Tree

## CURRENTLY BROKEN - BROKEN INSTALL?!

In [6]:
params = {
    'objective': 'binary:logistic',
    'max_depth': 6,
    'learning_rate': 0.05,
    'n_estimators': 500,
    'eval_metric': ['auc', 'logloss'],
    'use_label_encoder': False,
}

xgb_clf = xgb.XGBClassifier(**params)

# Fit the classifier to the training data
xgb_clf.fit(X_train.to_numpy(), y_train.to_numpy(), eval_set=[(X_val.to_numpy(), y_val.to_numpy())], early_stopping_rounds=20, eval_metric='auc')

AttributeError: /cvmfs/lhcbdev.cern.ch/conda/envs/default/2021-09-07_04-06/linux-64/lib/libxgboost.so: undefined symbol: XGDMatrixCreateFromDense

## Random Forest + Boosted Decision Tree