In [3]:
# Setup
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import time
import utilities as utils
import warnings

from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression

warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
pickle_path = Path('/Users/andreakeane/Documents/DataScience/GridCure_Problems/pickles/')

# TODO: Put these somewhere better
# test.to_pickle(folder / "test.pkl")
# train.to_pickle(folder / "train.pkl")
# labels.to_pickle(folder / "labels.pkl")

test = pd.read_pickle(pickle_path / "test.pkl")
train = pd.read_pickle(pickle_path / "train.pkl")
labels = pd.read_pickle(pickle_path / "labels.pkl")



## Outlier Removal

In [5]:
# Determine outliers
outliers = utils.classify_outliers(train)

# Drop Outliers
train_clean = train.drop(outliers.index)
labels_clean = labels.drop(outliers.index)


Number of Outliers: 37
Percent Removed: 2.33%



## Logistic Regression with Feature Engineering
A series with significant dependence among values. In this case we need to use some statistical models like ARIMA to forecast the data.

In [6]:
# 24-hour average
feat_24h_avg = train.rolling(window=(24*2), axis=1).mean()

# Difference from preceding interval
feat_diff = train.diff(axis=1)


In [6]:
features = pd.DataFrame(index=train_clean.stack().index)
print("Total anticipated rows: {}".format(features.shape[0]))
for i, index in enumerate(features.index.tolist()): 
    house_id, interval = index
    features.at[index, 'value'] = train[interval].loc[house_id]
    features.at[index, 'diff'] = feat_diff[interval].loc[house_id]
    features.at[index, 'day_avg'] = feat_24h_avg[interval].loc[house_id]  
    features.at[index, 'label'] = labels_clean[interval].loc[house_id]  
    if i % 1000000 == 0: print(i)

features.to_pickle(pickle_path / "features.pkl")


4471920
0
1000000
2000000
3000000
4000000
                      value   diff   day_avg  label
House ID                                           
11655099 Interval_1   0.950    NaN       NaN    0.0
         Interval_2   0.826 -0.124       NaN    0.0
         Interval_3   0.361 -0.465       NaN    0.0
         Interval_4   0.238 -0.123       NaN    0.0
         Interval_5   0.342  0.104       NaN    0.0
         Interval_6   0.233 -0.109       NaN    0.0
         Interval_7   0.351  0.118       NaN    0.0
         Interval_8   0.194 -0.157       NaN    0.0
         Interval_9   0.292  0.098       NaN    0.0
         Interval_10  0.234 -0.058       NaN    0.0
         Interval_11  0.260  0.026       NaN    0.0
         Interval_12  0.274  0.014       NaN    0.0
         Interval_13  0.192 -0.082       NaN    0.0
         Interval_14  0.329  0.137       NaN    0.0
         Interval_15  0.192 -0.137       NaN    0.0
         Interval_16  0.443  0.251       NaN    0.0
         Interval_17  

NameError: name 'pfolder' is not defined

In [7]:
# Retrieve a clean version of features
features = pd.read_pickle(pickle_path / "features.pkl")

# Split into features and classifications
X = features[['diff', 'value', 'day_avg']]
y = features['label']

# Drop NA-containing rows from X
before = set(X.index.tolist())
X = X.dropna(axis=0)
after = set(X.index.tolist())
removed_id = before - after
print("{} Rows were dropped for NA reasons.".format(len(removed_id)))

# Drop corresponsing rows from y
y = y.drop(index=removed_id)


72991 Rows were dropped for NA reasons.


In [57]:
# Scale X-data between -1 and 1
scaler = StandardScaler().fit(X)                                    
X_scaled = scaler.transform(X)

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=0)

# Train logistic regression model
logreg = LogisticRegression(solver='liblinear', random_state=0)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of classifier on test set: {:.3f}'.format(logreg.score(X_test, y_test)))

Accuracy of classifier on test set: 0.977


In [56]:
logreg.predict_proba(X_test)

array([[9.89212656e-01, 1.07873438e-02],
       [9.99942115e-01, 5.78848335e-05],
       [9.83378421e-01, 1.66215794e-02],
       ...,
       [9.64300724e-01, 3.56992764e-02],
       [9.95584787e-01, 4.41521292e-03],
       [9.96475624e-01, 3.52437590e-03]])

In [12]:
# Results Testing
labels_mod = features['label'].unstack()
labels_mod = pd.DataFrame(labels_mod.sum(axis=1))
labels_mod.rename(columns= {0: 'sum'}, inplace=True)
labels_mod['bool'] = labels_mod['sum'].where(labels_mod['sum'] == 0, other=1)

labels_mod.head()


Unnamed: 0_level_0,sum,bool
House ID,Unnamed: 1_level_1,Unnamed: 2_level_1
11655099,0.0,0.0
11633257,0.0,0.0
11651552,0.0,0.0
11636092,0.0,0.0
11647239,117.0,1.0
