In [1]:
# Setup
# autoreload (mode 2) re-imports edited modules before each cell executes,
# so changes to the local `utilities` package are picked up without a restart.
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from pathlib import Path
from sklearn.linear_model import LogisticRegression

# Import custom methods
import sys
sys.path.append("../") # add the parent dir to the import path so `utilities` can be found
from utilities import utilities as utils

# Ignore deprecation warnings
# NOTE(review): this silences every FutureWarning for the whole session, so
# upcoming API removals in the imported libraries will not be announced --
# consider narrowing the filter.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Read in data, check formatting compatibility
# NOTE(review): absolute, machine-specific path -- the notebook only runs where
# this directory exists; consider a configurable or relative data directory.
pickle_path = Path('/Users/andreakeane/Documents/DataScience/GridCure_Problems/pickles/')
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
labels = pd.read_pickle(pickle_path / "labels.pkl")
X = pd.read_pickle(pickle_path / "X_all.pkl")
y = pd.read_pickle(pickle_path / "y_all.pkl")

# Features and targets must have the same number of rows. Fail immediately
# instead of printing and carrying on with misaligned data.
if X.shape[0] != y.shape[0]:
    raise ValueError("Dataframe shapes don't match.\n"
                     "X Rows: {}\n"
                     "y Rows: {}".format(X.shape[0], y.shape[0]))

In [3]:
# Drop the houses that have no EV from both the features and the targets
houses_woEV = utils.get_pickle(pickle_path / "houses_woEV.pkl")
print(f"{len(houses_woEV)} Houses without EVs")

# errors='ignore': labels that are not present are skipped instead of raising
X = X.drop(houses_woEV, errors='ignore')
y = y.drop(houses_woEV, errors='ignore')

# Show which feature columns are available
print("Features:")
for feature in X.columns:
    print("\t" + feature)

1105 Houses without EVs
Features:
	value
	diff
	h8_avg
	h24_avg
	h24_min
	h24_max
	h72_avg
	diff_2
	diff_3
	diff_5


In [10]:
# Candidate feature subsets to compare against the full feature frame.
# All columns of X, for reference:
# ['value', 'diff', 'h8_avg', 'h24_avg', 'h24_min', 'h24_max', 'h72_avg','diff_2', 'diff_3', 'diff_5']
feature_sets = [
    ['value', 'diff', 'h24_avg', 'h72_avg'],     # X0
    ['diff', 'h24_avg', 'h72_avg'],              # X1
    ['value', 'h24_avg', 'h72_avg'],             # X2
    ['value', 'diff', 'h72_avg'],                # X3
    ['value', 'diff', 'h24_avg'],                # X4
    ['value'],                                   # X5
    ['diff'],                                    # X6
    ['diff', 'value'],                           # X7
    ['value', 'h8_avg', 'h24_avg', 'h72_avg'],   # X8
    ['value', 'diff', 'h24_min', 'h24_max'],     # X9
    ['value', 'h24_min', 'h24_max'],             # X10
    ['value', 'diff', 'h8_avg'],                 # X11
    ['diff', 'diff_2', 'diff_3', 'diff_5'],      # X12
    ['value', 'h8_avg', 'diff_3'],               # X13
]

# Select each subset once, then bind the individual names X0..X13
subsets = [X[columns] for columns in feature_sets]
(X0, X1, X2, X3, X4, X5, X6, X7,
 X8, X9, X10, X11, X12, X13) = subsets

# The full frame comes first, followed by the subsets in definition order
cases = [X] + subsets

In [11]:
# Test feature sets: fit one logistic regression per candidate feature set and
# compare the score on the held-out split.
# Rows are collected in a list and turned into a DataFrame once, because
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
records = []
for X_case in cases:
    # Project helper; presumably scales the features and returns a train/test
    # split -- confirm against utilities.scale_split_data.
    X_train, X_test, y_train, y_test = utils.scale_split_data(X_case, y)

    # Train logistic regression model
    logreg = LogisticRegression(solver='liblinear', random_state=0)
    logreg.fit(X_train, y_train)
    records.append({'cols': X_case.columns.tolist(),
                    'score': logreg.score(X_test, y_test)})

# Explicit columns keep the layout stable (and sortable) even if `cases` is empty
scores = pd.DataFrame(records, columns=['cols', 'score'])
scores.sort_values('score', ascending=False)


Unnamed: 0,cols,score
0,"[value, diff, h8_avg, h24_avg, h24_min, h24_ma...",0.945535
9,"[value, h8_avg, h24_avg, h72_avg]",0.944575
3,"[value, h24_avg, h72_avg]",0.94456
1,"[value, diff, h24_avg, h72_avg]",0.944336
5,"[value, diff, h24_avg]",0.943569
4,"[value, diff, h72_avg]",0.943431
11,"[value, h24_min, h24_max]",0.937695
12,"[value, diff, h8_avg]",0.937297
10,"[value, diff, h24_min, h24_max]",0.937086
14,"[value, h8_avg, diff_3]",0.936989


Note: we see a worse score when training only on houses with EVs.