In [1]:
# Setup
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import time
import warnings

# Import custom methods
import sys
sys.path.append("../") # go to parent dir
from utilities import utilities as utils

from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression

warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
pickle_path = Path('../../pickles')
test = pd.read_pickle(pickle_path / "test.pkl")
train = pd.read_pickle(pickle_path / "train.pkl")
labels = pd.read_pickle(pickle_path / "labels.pkl")


In [34]:
X = pd.read_pickle(pickle_path / "X_all.pkl")
y = pd.read_pickle(pickle_path / "y_all.pkl")

if X.shape[0] != y.shape[0]: 
    # TODO: Raise Error
    print("ERROR. Dataframe shapes don't match.")
    print("X Rows: {}".format(X.shape))
    print("y Rows: {}".format(y.shape))

In [77]:
# Scale X-data between -1 and 1
scaler = StandardScaler().fit(X)                                    
X_scaled = pd.DataFrame(scaler.transform(X), 
                        index=X.index, 
                        columns=X.columns)

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=0)

# Train logistic regression model
logreg = LogisticRegression(solver='liblinear', random_state=0)
logreg.fit(X_train, y_train)
print('Accuracy of classifier on test set: {:.3f}'.format(logreg.score(X_test, y_test)))


Accuracy of classifier on test set: 0.977


In [86]:
# Prepare df's for joining
# Join df's
results = X_test.join(y_test.rename('label_true'))
results.head()

# Predict bool classification
y_pred = logreg.predict(X_test)
y_pred = pd.DataFrame(y_pred, 
                      index=X_test.index, 
                      columns=['label_pred'])

# Probability for each testing classification
probs = logreg.predict_proba(X_test)[:,1]
probs = pd.DataFrame(probs, 
                     index=X_test.index, 
                     columns=['label_prob'])


results = results.join(y_pred)
results = results.join(probs)
results.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,value,diff,h24_avg,h72_avg,label_true,label_pred,label_prob
House ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
11633733,Interval_1075,1.35233,1.664168,1.256962,1.096586,0.0,0.0,0.006449
11636379,Interval_265,-0.439749,-0.015719,-0.122096,-0.557148,0.0,0.0,0.007096
11653142,Interval_1488,-0.772596,9.2e-05,-0.160336,0.056551,0.0,0.0,0.002139
11631582,Interval_2137,-0.31947,0.029737,-0.270404,-0.211978,0.0,0.0,0.007711
11655055,Interval_1992,0.112475,-0.033506,-0.328146,-0.301984,0.0,0.0,0.020876


In [87]:
# Evaluation
# True Values
labels_mod = results['label_true'].unstack()
labels_mod = pd.DataFrame(labels_mod.sum(axis=1), columns=['sum'])
labels_mod['bool'] = labels_mod['sum'].where(labels_mod['sum'] == 0, other=1)

# Predicted Values
predict = results['label_pred'].unstack()
predict = pd.DataFrame(predict.sum(axis=1), columns=['sum_pred'])
predict['bool_pred'] = predict['sum_pred'].where(predict['sum_pred'] == 0, other=1)

df = predict.join(labels_mod)
df['correctness'] = np.where(df['bool'] == df['bool_pred'], True, False)
df.head(n=20)

# Per-House classification correctness
vc = df['correctness'].value_counts(sort=True)
print("{:.1f}% Incorrect Classifications".format(100 * vc[0]/(vc[1] + vc[0])))


20.7% Incorrect Classifications
