In [1]:
# Setup
# autoreload mode 2 re-imports modules before each cell runs, so edits to the
# project-local helper module are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from pathlib import Path

# Import project-local helper methods.
# The parent directory is added to the module search path so that the
# `utilities` package/module can be imported from this notebook.
import sys
sys.path.append("../") # relative entry: resolved against the current working directory
from utilities import utilities as utils

# Silence FutureWarning messages so notices about upcoming library behaviour
# changes do not clutter the cell output.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Project directories (relative paths, resolved against the working directory)
pickle_path = Path('../../pickles')
fig_path = Path('../../figures')

# Load the pickled feature table.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load pickle files produced by this project.
features_a = pd.read_pickle(pickle_path / "features_a.pkl")

# Target: the 'label' column, kept as a single-column DataFrame.
y = features_a[['label']]
# Predictors: every remaining column except 'tot_chrgs'.
X = features_a.drop(columns=['label', 'tot_chrgs'])

# One shared train/test split, reused by every model below.
# NOTE(review): presumably scale_split_data scales the features and then
# splits them; confirm against utilities.scale_split_data.
X_train, X_test, y_train, y_test = utils.scale_split_data(X, y)

## Testing Models
Briefly compare the performance of several binary classification models.   
• All models use the same Train/Test data split (above)  
• All models use their default parameters, initialized with random_state=0 where the estimator accepts it (KNN does not)  

Comparing the following Models:  
• Logistic Regression (LR)  
• Support Vector Machine (SVM)  
• Neural Network (NN)  
• Random Forest (RF)  
• K-Nearest Neighbors (KNN)

In [20]:
# Empty results table: one row per model will be added by the cells below
score_columns = ['Model', 'train', 'test']
scores = pd.DataFrame(columns=score_columns)

In [21]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# np.ravel flattens the label frame into the 1-D array passed to fit().
LR = LogisticRegression(random_state=0).fit(X_train, np.ravel(y_train))

# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# is the documented replacement and produces the same table.
score_row = pd.DataFrame([{'Model': 'Logistic Regression',
                           'train': LR.score(X_train, y_train),
                           'test': LR.score(X_test, y_test)}])
scores = pd.concat([scores, score_row], ignore_index=True)

In [22]:
# SVM (linear support vector classifier)
from sklearn import svm

# np.ravel flattens the label frame into the 1-D array passed to fit().
# NOTE(review): if this cell emits a ConvergenceWarning, the scores below come
# from an unconverged fit -- consider raising max_iter.
SVM = svm.LinearSVC(random_state=0).fit(X_train, np.ravel(y_train))

# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# is the documented replacement and produces the same table.
score_row = pd.DataFrame([{'Model': 'SVM',
                           'train': SVM.score(X_train, y_train),
                           'test': SVM.score(X_test, y_test)}])
scores = pd.concat([scores, score_row], ignore_index=True)



In [23]:
# Neural Network (multi-layer perceptron classifier)
from sklearn.neural_network import MLPClassifier

# np.ravel flattens the label frame into the 1-D array passed to fit().
# NOTE(review): if this cell emits a ConvergenceWarning, the scores below come
# from an unconverged fit -- consider raising max_iter.
NN = MLPClassifier(random_state=0).fit(X_train, np.ravel(y_train))
nn_score = NN.score(X_test, y_test)

# The row is labelled 'Neural Network' so it matches the model list in the
# "Testing Models" notes and the results table used for the report.
# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# is the documented replacement. The test score computed above is reused
# rather than scoring the test set a second time.
score_row = pd.DataFrame([{'Model': 'Neural Network',
                           'train': NN.score(X_train, y_train),
                           'test': nn_score}])
scores = pd.concat([scores, score_row], ignore_index=True)



In [24]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# np.ravel flattens the label frame into the 1-D array passed to fit().
RF = RandomForestClassifier(random_state=0).fit(X_train, np.ravel(y_train))
rf_score = RF.score(X_test, y_test)

# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# is the documented replacement. The test score computed above is reused
# rather than scoring the test set a second time.
score_row = pd.DataFrame([{'Model': 'Random Forest',
                           'train': RF.score(X_train, y_train),
                           'test': rf_score}])
scores = pd.concat([scores, score_row], ignore_index=True)

In [25]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

# np.ravel flattens the label frame into the 1-D array passed to fit().
# KNeighborsClassifier takes no random_state, so none is set here.
neigh = KNeighborsClassifier().fit(X_train, np.ravel(y_train))
knn_score = neigh.score(X_test, y_test)

# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# is the documented replacement. The test score computed above is reused
# rather than scoring the test set a second time.
score_row = pd.DataFrame([{'Model': 'K-Nearest Neighbors',
                           'train': neigh.score(X_train, y_train),
                           'test': knn_score}])
scores = pd.concat([scores, score_row], ignore_index=True)

In [27]:
# Display the comparison table rounded to three decimals.
# To get the same table as CSV text for the report, run instead:
# print(scores.round(3).to_csv())
scores.round(decimals=3)

Unnamed: 0,Model,train,test
0,Logistic Regression,0.861,0.843
1,SVM,0.856,0.843
2,Neural Network,0.876,0.866
3,Random Forest,0.989,0.807
4,K-Nearest Neighbors,0.88,0.835


**Try a few other models**  
Based on a Kaggle article recommendation  
https://www.kaggle.com/klaudiajankowska/binary-classification-methods-comparison  
• Linear Discriminant Analysis  
• Quadratic Discriminant Analysis  
• Gaussian Naive Bayes  

I left these models out of the report because (a) I don't understand them well enough to evaluate their performance effectively and (b) none of them performed well enough to justify using them over the models above. 

In [9]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Fit LDA with default settings on the shared training split, then report
# its score on the held-out test split.
clf = LinearDiscriminantAnalysis()
clf.fit(X_train, np.ravel(y_train))
lda_test_score = clf.score(X_test, y_test)
print("LDA Score: {:.3f}".format(lda_test_score))

LDA Score: 0.830


In [10]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Fit QDA with default settings on the shared training split, then report
# its score on the held-out test split.
clf = QuadraticDiscriminantAnalysis()
clf.fit(X_train, np.ravel(y_train))
qda_test_score = clf.score(X_test, y_test)
print("QDA Score: {:.3f}".format(qda_test_score))

QDA Score: 0.540


In [11]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes with default settings on the shared training
# split, then report its score on the held-out test split.
gnb = GaussianNB()
gnb.fit(X_train, np.ravel(y_train))
gnb_test_score = gnb.score(X_test, y_test)
print("Gaussian NB Score: {:.3f}".format(gnb_test_score))

Gaussian NB Score: 0.350


## Model Tuning
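Tuning the Logistic Regression model:  
• C (inverse of regularization strength): the default value gives the best test score.  
• Solver: 'liblinear' (the default at the time of this run) and 'lbfgs' give the best test scores.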

In [50]:
# Fresh results table for the C (regularization) sweep: one row per setting
tuning_columns = ['Tuning', 'train', 'test']
scores = pd.DataFrame(columns=tuning_columns)

In [51]:
# Baseline for the C sweep: Logistic Regression with default hyperparameters.
LR = LogisticRegression(random_state=0).fit(X_train, np.ravel(y_train))

# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# is the documented replacement and produces the same table.
score_row = pd.DataFrame([{'Tuning': 'None',
                           'train': LR.score(X_train, y_train),
                           'test': LR.score(X_test, y_test)}])
scores = pd.concat([scores, score_row], ignore_index=True)

In [52]:
# Logistic Regression with C=0.5.
LR = LogisticRegression(random_state=0, C=0.5).fit(X_train, np.ravel(y_train))

# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# is the documented replacement and produces the same table.
score_row = pd.DataFrame([{'Tuning': 'C=0.5',
                           'train': LR.score(X_train, y_train),
                           'test': LR.score(X_test, y_test)}])
scores = pd.concat([scores, score_row], ignore_index=True)

In [53]:
# Logistic Regression with C=0.1.
LR = LogisticRegression(random_state=0, C=0.1).fit(X_train, np.ravel(y_train))

# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# is the documented replacement and produces the same table.
score_row = pd.DataFrame([{'Tuning': 'C=0.1',
                           'train': LR.score(X_train, y_train),
                           'test': LR.score(X_test, y_test)}])
scores = pd.concat([scores, score_row], ignore_index=True)

In [54]:
# Logistic Regression with C=2.
LR = LogisticRegression(random_state=0, C=2).fit(X_train, np.ravel(y_train))

# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# is the documented replacement and produces the same table.
score_row = pd.DataFrame([{'Tuning': 'C=2',
                           'train': LR.score(X_train, y_train),
                           'test': LR.score(X_test, y_test)}])
scores = pd.concat([scores, score_row], ignore_index=True)

In [55]:
# Logistic Regression with C=5.
LR = LogisticRegression(random_state=0, C=5).fit(X_train, np.ravel(y_train))

# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# is the documented replacement and produces the same table.
score_row = pd.DataFrame([{'Tuning': 'C=5',
                           'train': LR.score(X_train, y_train),
                           'test': LR.score(X_test, y_test)}])
scores = pd.concat([scores, score_row], ignore_index=True)

In [56]:
# Show the train/test scores recorded for each C setting
scores

Unnamed: 0,Tuning,train,test
0,,0.860825,0.843188
1,C=0.5,0.861684,0.838046
2,C=0.1,0.840206,0.809769
3,C=2,0.859966,0.838046
4,C=5,0.857388,0.840617


In [57]:
# Available solvers: 'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'
# Fresh results table for the solver comparison: one row per solver
tuning_columns = ['Tuning', 'train', 'test']
scores = pd.DataFrame(columns=tuning_columns)

In [58]:
# Baseline for the solver comparison: Logistic Regression with the default solver.
LR = LogisticRegression(random_state=0).fit(X_train, np.ravel(y_train))

# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# is the documented replacement and produces the same table.
score_row = pd.DataFrame([{'Tuning': 'None',
                           'train': LR.score(X_train, y_train),
                           'test': LR.score(X_test, y_test)}])
scores = pd.concat([scores, score_row], ignore_index=True)

In [59]:
# Logistic Regression fitted with the 'newton-cg' solver.
LR = LogisticRegression(random_state=0, solver='newton-cg').fit(X_train, np.ravel(y_train))

# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# is the documented replacement and produces the same table.
score_row = pd.DataFrame([{'Tuning': 'newton-cg',
                           'train': LR.score(X_train, y_train),
                           'test': LR.score(X_test, y_test)}])
scores = pd.concat([scores, score_row], ignore_index=True)

In [60]:
# Logistic Regression fitted with the 'lbfgs' solver.
LR = LogisticRegression(random_state=0, solver='lbfgs').fit(X_train, np.ravel(y_train))

# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# is the documented replacement and produces the same table.
score_row = pd.DataFrame([{'Tuning': 'lbfgs',
                           'train': LR.score(X_train, y_train),
                           'test': LR.score(X_test, y_test)}])
scores = pd.concat([scores, score_row], ignore_index=True)

In [61]:
# Logistic Regression fitted with the 'sag' solver.
# NOTE(review): if this cell emits a ConvergenceWarning, the scores below come
# from an unconverged fit -- consider raising max_iter before comparing solvers.
LR = LogisticRegression(random_state=0, solver='sag').fit(X_train, np.ravel(y_train))

# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# is the documented replacement and produces the same table.
score_row = pd.DataFrame([{'Tuning': 'sag',
                           'train': LR.score(X_train, y_train),
                           'test': LR.score(X_test, y_test)}])
scores = pd.concat([scores, score_row], ignore_index=True)



In [62]:
# Logistic Regression fitted with the 'saga' solver.
# NOTE(review): if this cell emits a ConvergenceWarning, the scores below come
# from an unconverged fit -- consider raising max_iter before comparing solvers.
LR = LogisticRegression(random_state=0, solver='saga').fit(X_train, np.ravel(y_train))

# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# is the documented replacement and produces the same table.
score_row = pd.DataFrame([{'Tuning': 'saga',
                           'train': LR.score(X_train, y_train),
                           'test': LR.score(X_test, y_test)}])
scores = pd.concat([scores, score_row], ignore_index=True)



In [63]:
# Show the train/test scores recorded for each solver
scores

Unnamed: 0,Tuning,train,test
0,,0.860825,0.843188
1,newton-cg,0.859966,0.843188
2,lbfgs,0.859966,0.845758
3,sag,0.853952,0.825193
4,saga,0.843643,0.812339
