In [1]:
# Setup
import numpy as np
import pandas as pd
import warnings
import utilities as utils

from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression

# Ignore deprecation warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Read in data, check formatting compatibility
# NOTE(review): this is an absolute, user-specific path; consider moving it to
# a configurable data directory so the notebook can run on other machines.
pickle_path = Path('/Users/andreakeane/Documents/DataScience/GridCure_Problems/pickles/')
labels = pd.read_pickle(pickle_path / "labels.pkl")
X = pd.read_pickle(pickle_path / "X_all.pkl")
y = pd.read_pickle(pickle_path / "y_all.pkl")

# Features and targets must have the same number of rows. Fail fast here
# instead of printing and continuing with misaligned data.
if X.shape[0] != y.shape[0]:
    raise ValueError(
        "Dataframe shapes don't match. X rows: {}, y rows: {}".format(
            X.shape[0], y.shape[0]
        )
    )

In [5]:
# Drop the houses that have no EV from both the features and the targets
houses_woEV = utils.get_pickle(pickle_path / "houses_woEV.pkl")
print("{} Houses without EVs".format(len(houses_woEV)))

# errors='ignore' skips any label that is not present instead of raising
X = X.drop(houses_woEV, errors='ignore')
y = y.drop(houses_woEV, errors='ignore')

# List the feature columns that remain available
print("Features:")
for feature_name in X.columns:
    print("\t" + feature_name)

1105 Houses without EVs
Features:
	value
	diff
	h8_avg
	h24_avg
	h24_min
	h24_max
	h72_avg


In [13]:
# Candidate feature subsets to compare against the full feature matrix.
# Available columns:
# ['value', 'diff', 'h8_avg', 'h24_avg', 'h24_min', 'h24_max', 'h72_avg']
feature_sets = [
    ['value', 'diff', 'h24_avg', 'h72_avg'],    # X0
    ['diff', 'h24_avg', 'h72_avg'],             # X1
    ['value', 'h24_avg', 'h72_avg'],            # X2
    ['value', 'diff', 'h72_avg'],               # X3
    ['value', 'diff', 'h24_avg'],               # X4
    ['value'],                                  # X5
    ['diff'],                                   # X6
    ['diff', 'value'],                          # X7
    ['value', 'h8_avg', 'h24_avg', 'h72_avg'],  # X8
    ['value', 'diff', 'h24_min', 'h24_max'],    # X9
    ['value', 'h24_min', 'h24_max'],            # X10
    ['value', 'diff', 'h8_avg'],                # X11
]

# One column-subset frame per entry, bound to the same X0..X11 names
X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11 = [
    X[columns] for columns in feature_sets
]

# The full feature matrix goes first, followed by each subset in order
cases = [X, X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11]

In [14]:
# Test feature sets
# Collect one record per feature set and build the results frame once at the
# end: DataFrame.append was removed in pandas 2.0, and growing a frame row by
# row copies it on every iteration.
score_records = []
for X_case in cases:

    # Standardize each feature to zero mean and unit variance
    # NOTE(review): the scaler is fit on all rows before the split, so
    # test-set statistics leak into training. Consider fitting on X_train
    # only; left as-is here so the recorded scores stay comparable.
    scaler = StandardScaler().fit(X_case)
    X_scaled = pd.DataFrame(scaler.transform(X_case),
                            index=X_case.index,
                            columns=X_case.columns)

    # Split data into training and testing (fixed seed for reproducibility)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=0)

    # Train logistic regression model and score it on the held-out split
    logreg = LogisticRegression(solver='liblinear', random_state=0)
    logreg.fit(X_train, y_train)
    score_records.append({'cols': X_case.columns.tolist(),
                          'score': logreg.score(X_test, y_test)})

# Explicit column order keeps the table layout of the recorded output
# (cols, score) and also yields a well-formed empty frame if `cases` is empty.
scores = pd.DataFrame(score_records, columns=['cols', 'score'])
scores.sort_values('score', ascending=False)


Unnamed: 0,cols,score
0,"[value, diff, h8_avg, h24_avg, h24_min, h24_ma...",0.944621
9,"[value, h8_avg, h24_avg, h72_avg]",0.944575
3,"[value, h24_avg, h72_avg]",0.94456
1,"[value, diff, h24_avg, h72_avg]",0.944336
5,"[value, diff, h24_avg]",0.943569
4,"[value, diff, h72_avg]",0.943431
11,"[value, h24_min, h24_max]",0.937695
12,"[value, diff, h8_avg]",0.937297
10,"[value, diff, h24_min, h24_max]",0.937086
7,[diff],0.925143


Note: we see a worse score when training only on houses with EVs.