In [16]:
# Setup
# autoreload mode 2 re-imports modules before each cell executes, so edits to
# the local `utilities` package take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from pathlib import Path
from sklearn.linear_model import LogisticRegression

# Import custom methods
# NOTE(review): "../" is a relative entry, so the `utilities` import below
# presumably only works when the kernel's working directory is this
# notebook's folder — confirm.
import sys
sys.path.append("../") # make the parent directory importable
from utilities import utilities as utils

# Silence FutureWarning messages only (that is the category filtered here);
# other warning categories are still shown.
warnings.simplefilter(action='ignore', category=FutureWarning)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
# Read in data, check formatting compatibility
# NOTE(review): absolute, machine-specific path — this cell only runs where that
# directory exists; consider a configurable data directory instead.
pickle_path = Path('/Users/andreakeane/Documents/DataScience/GridCure_Problems/pickles/')
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
labels = pd.read_pickle(pickle_path / "labels.pkl")
X = pd.read_pickle(pickle_path / "X_all.pkl")
y = pd.read_pickle(pickle_path / "y_all.pkl")

# Features and targets must have the same number of rows. Fail fast instead of
# printing, so later cells never fit models on misaligned data.
if X.shape[0] != y.shape[0]:
    raise ValueError(
        "Dataframe shapes don't match.\n"
        "X Rows: {}\n"
        "y Rows: {}".format(X.shape[0], y.shape[0])
    )

In [18]:
# Remove houses without EVs
# The labels to exclude are loaded from a pickle stored alongside the inputs.
houses_woEV = utils.get_pickle(pickle_path / "houses_woEV.pkl")
print("{} Houses without EVs".format(len(houses_woEV)))

# Drop the listed row labels from both frames; errors='ignore' skips any label
# that is not present instead of raising.
X, y = (frame.drop(houses_woEV, errors='ignore') for frame in (X, y))

# List the feature columns that remain available to the models.
print("Features:")
for feature in X.columns.tolist():
    print("\t" + feature)

1105 Houses without EVs
Features:
	value
	diff
	h8_avg
	h24_avg
	h24_min
	h24_max
	h72_avg
	diff_2
	diff_3
	diff_5


In [19]:
# Split data
# One shared split: every model below is fitted and scored on these same four
# objects, so their scores are directly comparable.
(X_train, X_test,
 y_train, y_test) = utils.scale_split_data(X, y)

In [20]:
# Logistic Regression
# LogisticRegression is already imported in the setup cell, so it is not
# re-imported here.
# np.ravel flattens the training target to 1-D before fitting.
LR = LogisticRegression(random_state=0).fit(X_train, np.ravel(y_train))
# Score of the fitted model on the test split.
lr_score = LR.score(X_test, y_test)

In [22]:
# SVM
from sklearn import svm
# Linear support-vector classifier with a fixed seed; fit() trains the
# estimator in place, so SVM is the fitted model afterwards.
SVM = svm.LinearSVC(random_state=0)
SVM.fit(X_train, np.ravel(y_train))
# Score of the fitted model on the test split.
svm_score = SVM.score(X_test, y_test)



In [23]:
# Neural Network
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron with default settings and a fixed seed; fit() trains
# the estimator in place, so NN is the fitted model afterwards.
NN = MLPClassifier(random_state=0)
NN.fit(X_train, np.ravel(y_train))
# Score of the fitted model on the test split.
nn_score = NN.score(X_test, y_test)

In [24]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
# Random forest with default settings and a fixed seed; fit() trains the
# estimator in place, so RF is the fitted model afterwards.
RF = RandomForestClassifier(random_state=0)
RF.fit(X_train, np.ravel(y_train))
# Score of the fitted model on the test split.
rf_score = RF.score(X_test, y_test)

In [25]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
# Flatten the target with np.ravel, exactly as the other model cells do.
# Passing an unflattened single-column target makes scikit-learn emit a
# DataConversionWarning (assumes y_train is a single column — TODO confirm).
neigh = KNeighborsClassifier().fit(X_train, np.ravel(y_train))
# Score of the fitted model on the test split.
knn_score = neigh.score(X_test, y_test)

In [26]:
# Report the score of each model fitted above, three decimals per line.
score_report = (
    "Logistic Regression Score: {:.3f}\n"
    "SVM Score: {:.3f}\n"
    "Neural Network Score: {:.3f}\n"
    "Random Forest Score: {:.3f}\n"
    "K-Nearest Neighbors Score: {:.3f}\n"
)
print(score_report.format(lr_score, svm_score, nn_score, rf_score, knn_score))

Logistic Regression Score: 0.946
SVM Score: 0.945
Neural Network Score: 0.957
Random Forest Score: 0.960
K-Nearest Neighbors Score: 0.959



In [None]:
# KNN - Extra Testing
# Compare train and test scores of k-nearest-neighbours over a range of odd k.
from sklearn.neighbors import KNeighborsClassifier

n_neighbors = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
# 1-D training target, matching how the other model cells call fit().
y_train_flat = np.ravel(y_train)

# Collect one record per k and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
records = []
for n in n_neighbors:
    neigh = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train_flat)
    train_score = neigh.score(X_train, y_train)
    test_score = neigh.score(X_test, y_test)
    records.append({'neighbors': n,
                    'train': train_score,
                    'test': test_score})
    # One compact progress line per k instead of re-printing the whole table.
    print("n_neighbors={:>2}  train={:.3f}  test={:.3f}".format(
        n, train_score, test_score))

scores = pd.DataFrame(records, columns=['neighbors', 'train', 'test'])
scores