# Machine Learning Classification Methods
This notebook applies a variety of more basic machine-learning classification methods. We will then be able to evaluate these against the neural network constructed earlier. We can then evaluate the probability distributions for test predictions, the AUC-ROC curves and the Punzi FOM value as a function of probability cut.

In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from matplotlib.ticker import AutoMinorLocator
import matplotlib.pyplot as plt
import xgboost as xgb
import numpy as np
import pandas as pd
import pickle
import sys
sys.path.append('../')
from dataflow import Flow

In [2]:
# Version tag of the dataset; it selects which CSV under ../data_files is read.
version = '6.0.3'
data = Flow(
    None,
    None,
    None,
    csv_path=f'../data_files/{version}.csv',
)
# The split call yields three (features, labels) pairs: train, validation, test.
train_pair, val_pair, test_pair = data.get_train_val_test_split()
X_train, y_train = train_pair
X_val, y_val = val_pair
X_test, y_test = test_pair

In [4]:
def get_metrics(model, use_test=False):
    """
    Evaluate a fitted classifier on the validation split (default) or the
    test split and return binned probability densities plus accuracy.

    Parameters
    ----------
    model : object
        A fitted model exposing ``predict`` and ``predict_proba``.
    use_test : bool, optional
        When True evaluate on the notebook-level ``X_test`` / ``y_test``
        instead of ``X_val`` / ``y_val``. Defaults to False.

    Returns
    -------
    prob_dist_with_bins : list
        ``[[density_col0, bin_edges], [density_col1, bin_edges]]`` where the
        densities come from ``np.histogram(..., density=True)`` over 10
        equal-width bins on [0, 1] and ``bin_edges`` holds the 11 edges.
    binary_accuracy : float
        Fraction of events for which ``model.predict`` equals the true label.

    Raises
    ------
    ValueError
        If the number of predicted classes differs from the number of labels.
    """
    if use_test:
        X, Y = X_test, y_test
    else:
        X, Y = X_val, y_val
    # Flatten the labels to 1-D. A single-column DataFrame converts to shape
    # (N, 1), which would broadcast against the (N,) predictions into an
    # N x N comparison and corrupt the accuracy. np.asarray also accepts
    # plain arrays/lists that have no .to_numpy().
    Y = np.asarray(Y).ravel()

    # One row per event, one column per class.
    predicted_probabilities = model.predict_proba(X)

    # NOTE(review): column 0 is treated as the signal probability and column 1
    # as background; this depends on how the labels are encoded - confirm.
    pred_signal_probs = predicted_probabilities[:, 0]
    pred_bg_probs = predicted_probabilities[:, 1]

    # 11 bin edges i.e. 10 bins across the [0, 1] probability range
    bins = np.linspace(0, 1, 11)
    probability_distribution_s, _ = np.histogram(pred_signal_probs, bins=bins, density=True)
    probability_distribution_b, _ = np.histogram(pred_bg_probs, bins=bins, density=True)
    # Pair each set of densities with the bin edges used to produce it
    prob_dist_with_bins = [[probability_distribution_s, bins], [probability_distribution_b, bins]]

    # Hard class labels as decided by the model itself
    predicted_classes = np.asarray(model.predict(X)).ravel()
    if predicted_classes.shape != Y.shape:
        raise ValueError(
            f'Got {predicted_classes.shape[0]} predictions for {Y.shape[0]} labels'
        )
    n_correct = np.count_nonzero(predicted_classes == Y)
    binary_accuracy = n_correct / len(predicted_classes)

    return prob_dist_with_bins, binary_accuracy
    

## KNN Classifier

In [14]:
# Number of neighbours consulted for each prediction.
K_NEIGHBOURS = 50
knn_classifier = KNeighborsClassifier(n_neighbors=K_NEIGHBOURS)
# Fit on the training split; left as the final expression so the fitted
# estimator is echoed as the cell output.
knn_classifier.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=50)

In [15]:
# Score the fitted KNN model. get_metrics defaults to use_test=False, so this
# evaluates on the validation split. probs receives the binned probability
# distributions and bin_acc the fraction of correctly classified events.
probs, bin_acc = get_metrics(knn_classifier)
print(f'Validation Binary Accuracy: {bin_acc:.5f}')

Validation Binary Accuracy: 0.91921


In [16]:
import os
from joblib import dump

# Derive the file name from `version` so the saved model always matches the
# dataset it was trained on, instead of repeating the version string by hand.
knn_model_path = f'models/KNN_{version}.joblib'
# dump() does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(knn_model_path), exist_ok=True)
dump(knn_classifier, knn_model_path)

['models/KNN_6.0.3.joblib']

## XGBoost + RandomForestClassifier

In [19]:
# Settings handed to xgb.train for the gradient-boosted model.
# NOTE(review): "eta" and "learning_rate" are documented as aliases of the same
# XGBoost parameter but carry different values here (0.3 vs 0.05) - confirm
# which one is intended and drop the other.
params = dict(
    objective="binary:logistic",
    eta=0.3,
    max_depth=6,
    min_child_weight=3,
    subsample=0.7,
    colsample_bytree=0.7,
    learning_rate=0.05,
    seed=1,
)

# Random forest trained on the training split with a fixed random state.
rf = RandomForestClassifier(n_estimators=120, random_state=1).fit(X_train, y_train)

# Boosted trees: num_trees is passed as the number of boosting rounds.
num_trees = 250
train_matrix = xgb.DMatrix(X_train, y_train)
gbm = xgb.train(params, train_matrix, num_trees)

# Average the two models' outputs on the validation split. Despite its name,
# test_probs holds predictions for X_val, not for the test split.
rf_val_probs = rf.predict_proba(X_val)[:, 1]
gbm_val_probs = gbm.predict(xgb.DMatrix(X_val))
test_probs = (rf_val_probs + gbm_val_probs) / 2



In [20]:
# Evaluate the predictions now...
classes = np.where(test_probs > 0.5, 1, 0)
value_counts = np.count_nonzero(classes == y_val)
print(f'Binary Accuracy: {value_counts/len(classes):.5f}')

Binary Accuracy: 0.94298
