In [16]:
# Setup
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from pathlib import Path
from sklearn.linear_model import LogisticRegression

# Import custom methods
import sys
sys.path.append("../") # go to parent dir
from utilities import utilities as utils

# Ignore deprecation warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
# Read in data, check formatting compatibility
pickle_path = Path('/Users/andreakeane/Documents/DataScience/GridCure_Problems/pickles/')
labels = pd.read_pickle(pickle_path / "labels.pkl")
X = pd.read_pickle(pickle_path / "X_all.pkl")
y = pd.read_pickle(pickle_path / "y_all.pkl")

if X.shape[0] != y.shape[0]: 
    # TODO: Raise Error
    print("ERROR. Dataframe shapes don't match.\n"
          "X Rows: {}"
          "y Rows: {}".format(X.shape, y.shape))

In [18]:
# Remove houses without EVs
houses_woEV = utils.get_pickle(pickle_path / "houses_woEV.pkl")
print("{} Houses without EVs".format(len(houses_woEV)))

y = y.drop(houses_woEV, errors='ignore')
X = X.drop(houses_woEV, errors='ignore')

print("Features:")
for feature in X.columns.tolist(): 
    print("\t" + feature)

1105 Houses without EVs
Features:
	value
	diff
	h8_avg
	h24_avg
	h24_min
	h24_max
	h72_avg
	diff_2
	diff_3
	diff_5


## Testing Models  
Briefly compare the performance of several binary classification models.  
• All models use the same Train/Test data split (above)  
• All models use thei default parameters, initialized with random_state=0  
 
Comparing the following Models:  
• Logistic Regression (LR)  
• Support Vector Machine (SVM)  
• Neural Network (NN)  
• Random Forest (RF)  
• K-Nearest Neighbors (KNN)  

In [45]:
# Split data
# Same features and splits used for all models
X_train, X_test, y_train, y_test = utils.scale_split_data(X, y)

# Setup Scores df
scores = pd.DataFrame(columns=['Model', 'train', 'test'])

In [46]:
# Logistic Regression 
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(random_state=0).fit(X_train, np.ravel(y_train))  

scores = scores.append({'Model': 'Logistic Regression', 
                        'train': LR.score(X_train,y_train), 
                        'test': LR.score(X_test,y_test)}, ignore_index=True)

In [47]:
# SVM
from sklearn import svm
SVM = svm.LinearSVC(random_state=0).fit(X_train, np.ravel(y_train))  

scores = scores.append({'Model': 'SVM', 
                        'train': SVM.score(X_train,y_train), 
                        'test': SVM.score(X_test,y_test)}, ignore_index=True)



In [48]:
# Neural Network
from sklearn.neural_network import MLPClassifier 
NN = MLPClassifier(random_state=0).fit(X_train, np.ravel(y_train))    

scores = scores.append({'Model': 'MLP Classifier', 
                        'train': NN.score(X_train, y_train), 
                        'test': NN.score(X_test, y_test)}, ignore_index=True)

In [49]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(random_state=0).fit(X_train, np.ravel(y_train))  

scores = scores.append({'Model': 'Random Forest', 
                        'train': RF.score(X_train,y_train), 
                        'test': RF.score(X_test,y_test)}, ignore_index=True)

In [53]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier().fit(X_train, y_train)

scores = scores.append({'Model': 'K-Nearest Neighbors', 
                        'train': neigh.score(X_train,y_train), 
                        'test': neigh.score(X_test,y_test)}, ignore_index=True)

In [57]:
# Print results, csv for Report
# print(scores.round(4).to_csv())
scores.round(4)

Unnamed: 0,Model,train,test
0,Logistic Regression,0.9458,0.9455
1,SVM,0.9452,0.9449
2,Neural Network,0.9575,0.9573
3,Random Forest,0.9964,0.9598
4,K-Nearest Neighbors,0.9696,0.9586
5,K-Nearest Neighbors,0.9696,0.9586


In [44]:
# KNN - Extra Testing
from sklearn.neighbors import KNeighborsClassifier
n_neighbors = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
scores = pd.DataFrame(columns=['neighbors', 'train', 'test'])
for n in n_neighbors:
    neigh = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train) 
    train_score = neigh.score(X_train,y_train)
    test_score = neigh.score(X_test,y_test)
    scores = scores.append({'neighbors': n, 
                            'train': train_score, 
                            'test': test_score}, ignore_index=True)

scores

   neighbors  train      test
0        1.0    1.0  0.951015
   neighbors     train      test
0        1.0  1.000000  0.951015
1        3.0  0.975038  0.957246
   neighbors     train      test
0        1.0  1.000000  0.951015
1        3.0  0.975038  0.957246
2        5.0  0.969579  0.958616
   neighbors     train      test
0        1.0  1.000000  0.951015
1        3.0  0.975038  0.957246
2        5.0  0.969579  0.958616
3        7.0  0.966866  0.958491
   neighbors     train      test
0        1.0  1.000000  0.951015
1        3.0  0.975038  0.957246
2        5.0  0.969579  0.958616
3        7.0  0.966866  0.958491
4        9.0  0.965190  0.958362
   neighbors     train      test
0        1.0  1.000000  0.951015
1        3.0  0.975038  0.957246
2        5.0  0.969579  0.958616
3        7.0  0.966866  0.958491
4        9.0  0.965190  0.958362
5       11.0  0.963918  0.958414
   neighbors     train      test
0        1.0  1.000000  0.951015
1        3.0  0.975038  0.957246
2        5.0  0.

Unnamed: 0,neighbors,train,test
0,1.0,1.0,0.951015
1,3.0,0.975038,0.957246
2,5.0,0.969579,0.958616
3,7.0,0.966866,0.958491
4,9.0,0.96519,0.958362
5,11.0,0.963918,0.958414
6,13.0,0.963049,0.958653
7,15.0,0.962377,0.958576
8,17.0,0.961835,0.958476
9,19.0,0.96132,0.95842
