In [1]:
# Setup
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from pathlib import Path

# Import custom methods
# The project-local `utilities` package is presumably located in the parent
# directory, so that directory is added to sys.path before the import runs.
import sys
sys.path.append("../") # make the parent dir importable (must precede the import below)
from utilities import utilities as utils

# Silence FutureWarning messages so they do not clutter the cell output
# (only the FutureWarning category is filtered; other warnings still show)
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Setup Paths
# Both data directories hang off a single project root. The root defaults to
# the original local checkout; set the GRIDCURE_ROOT environment variable to
# run this notebook from a different machine or directory layout.
import os
project_root = Path(os.environ.get(
    'GRIDCURE_ROOT',
    '/Users/andreakeane/Documents/DataScience/GridCure_Problems',
))
pickle_path = project_root / 'pickles'
fig_path = project_root / 'figures'

# Read in Data
# NOTE: unpickling can execute arbitrary code -- only load trusted pickle files.
features_a = pd.read_pickle(pickle_path / "features_a.pkl")
# Features: every column except the target ('label') and 'tot_chrgs'
X = features_a.drop(['label', 'tot_chrgs'], axis=1)
# Target is kept as a single-column DataFrame; the model cells flatten it with np.ravel
y = features_a[['label']]

# Split data
# Same features and splits used for all models
X_train, X_test, y_train, y_test = utils.scale_split_data(X, y)

## Testing Models
Briefly compare the performance of several binary classification models.   
• All models use the same Train/Test data split (above)  
• All models use their default parameters, initialized with random_state=0 where the estimator accepts it (KNN takes no random_state)  

Comparing the following Models:  
• Logistic Regression (LR)  
• Support Vector Machine (SVM)  
• Neural Network (NN)  
• Random Forest (RF)  
• K-Nearest Neighbors (KNN)

In [15]:
# Logistic Regression: default hyper-parameters with a fixed seed
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(random_state=0)
LR.fit(X_train, np.ravel(y_train))  # np.ravel flattens the labels to 1-D for fit()
lr_score = LR.score(X_test, y_test)  # score on the test split

In [16]:
# SVM: linear support vector classifier (LinearSVC) with a fixed seed
from sklearn import svm
SVM = svm.LinearSVC(random_state=0)
SVM.fit(X_train, np.ravel(y_train))  # np.ravel flattens the labels to 1-D for fit()
svm_score = SVM.score(X_test, y_test)  # score on the test split



In [17]:
# Neural Network: multi-layer perceptron classifier with a fixed seed
from sklearn.neural_network import MLPClassifier
NN = MLPClassifier(random_state=0)
NN.fit(X_train, np.ravel(y_train))  # np.ravel flattens the labels to 1-D for fit()
nn_score = NN.score(X_test, y_test)  # score on the test split



In [18]:
# Random Forest: default hyper-parameters with a fixed seed
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(random_state=0)
RF.fit(X_train, np.ravel(y_train))  # np.ravel flattens the labels to 1-D for fit()
rf_score = RF.score(X_test, y_test)  # score on the test split

In [19]:
# KNN: k-nearest-neighbours classifier with default hyper-parameters
# (no random_state is passed to this estimator)
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier()
neigh.fit(X_train, np.ravel(y_train))  # np.ravel flattens the labels to 1-D for fit()
knn_score = neigh.score(X_test, y_test)  # score on the test split

In [8]:
# Report the test-split score of every model fitted above, one per line
model_scores = (
    ("Logistic Regression", lr_score),
    ("SVM", svm_score),
    ("Neural Network", nn_score),
    ("Random Forest", rf_score),
    ("K-Nearest Neighbors", knn_score),
)
# Every entry ends with a newline, so print() leaves one blank line after the report
print("".join("{} Score: {:.3f}\n".format(label, score) for label, score in model_scores))


Logistic Regression Score: 0.843
SVM Score: 0.843
Neural Network Score: 0.866
Random Forest Score: 0.807
K-Nearest Neighbors Score: 0.835



**Try a few other models**  
Based on a Kaggle article's recommendation  
https://www.kaggle.com/klaudiajankowska/binary-classification-methods-comparison  
• Linear Discriminant Analysis  
• Quadratic Discriminant Analysis  
• Gaussian Naive Bayes  

I neglected to include these models in the report because (a) I don't have enough understanding to effectively evaluate their performance and (b) none of them performed well enough to justify using these models over the ones above. 

In [9]:
# Linear Discriminant Analysis with default hyper-parameters
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
clf = LinearDiscriminantAnalysis()
clf.fit(X_train, np.ravel(y_train))  # np.ravel flattens the labels to 1-D for fit()
print("LDA Score: {:.3f}".format(clf.score(X_test, y_test)))

LDA Score: 0.830


In [10]:
# Quadratic Discriminant Analysis with default hyper-parameters
# (rebinds `clf`, replacing the model fitted in the previous cell)
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
clf = QuadraticDiscriminantAnalysis()
clf.fit(X_train, np.ravel(y_train))  # np.ravel flattens the labels to 1-D for fit()
print("QDA Score: {:.3f}".format(clf.score(X_test, y_test)))

QDA Score: 0.540


In [11]:
# Gaussian Naive Bayes with default hyper-parameters
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, np.ravel(y_train))  # np.ravel flattens the labels to 1-D for fit()
print("Gaussian NB Score: {:.3f}".format(gnb.score(X_test, y_test)))

Gaussian NB Score: 0.350
