In [3]:
# Setup
import os
import random
import time
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import utilities as utils

# Suppress every FutureWarning for the rest of the session.
# NOTE(review): this is a blanket filter, so any deprecation notice that an
# imported library raises as a FutureWarning is hidden as well.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
# Directory that holds the pickled inputs and receives the pickled outputs.
# Set the GRIDCURE_PICKLE_DIR environment variable to point somewhere else;
# when it is unset, the original author-local directory is used unchanged.
pickle_path = Path(os.environ.get(
    'GRIDCURE_PICKLE_DIR',
    '/Users/andreakeane/Documents/DataScience/GridCure_Problems/pickles/',
))

# NOTE: unpickling can execute arbitrary code -- only load files you created.
test = pd.read_pickle(pickle_path / "test.pkl")
train = pd.read_pickle(pickle_path / "train.pkl")
labels = pd.read_pickle(pickle_path / "labels.pkl")


In [3]:
# Split the house ids into those with and those without an EV.
# A house counts as "with EV" when its row of `labels` sums to more than zero.
temp = pd.DataFrame({'sum': labels.sum(axis=1)})
temp['bool'] = temp['sum'] > 0

houses_wEV = temp[temp['bool']].index.tolist()
houses_woEV = temp[~temp['bool']].index.tolist()

print("{} Houses with EVs.".format(len(houses_wEV)))
print("{} Houses without EVs.".format(len(houses_woEV)))

# Persist both lists so other notebooks can reuse them.
# They are plain Python lists, so the project pickling helper is used
# instead of the pandas `to_pickle` method.
for houses, filename in ((houses_wEV, "houses_wEV.pkl"),
                         (houses_woEV, "houses_woEV.pkl")):
    utils.make_pickle(houses, pickle_path / filename)

485 Houses with EVs.
1105 Houses without EVs.



## Outlier Removal

In [4]:
# Identify the outlier rows of the training data.
# NOTE(review): `utils.classify_outliers` lives outside this notebook; only the
# `.index` of whatever it returns is used below -- confirm its exact contract.
outliers = utils.classify_outliers(train)

# Remove the same rows from the readings and from the labels so that the two
# frames keep matching indexes.
train_clean, labels_clean = (
    frame.drop(index=outliers.index) for frame in (train, labels)
)

Number of Outliers: 37
Percent Removed: 2.33%



## Logistic Regression with Feature Engineering
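The readings form a time series with significant dependence among successive values. A series like this would normally call for a statistical forecasting model such as ARIMA; in this section the dependence is instead captured through engineered features (differences and rolling statistics) that are fed to a logistic regression.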

In [None]:
# Feature Setup
#
# Builds one row per (house, interval) pair of `train_clean`, holding the raw
# reading, several features derived from it, and the matching label, then
# saves the table to `features.pkl`.
#
# The derived frames are computed from the full `train` frame; every statistic
# below works along a single row, so rows that were dropped as outliers cannot
# influence the rows that are kept.

# The original windows were written as 8*2, 24*2 and 72*2 for the 8-, 24- and
# 72-hour features, i.e. two columns per hour.
INTERVALS_PER_HOUR = 2


def _rolling_hours(frame, hours):
    """Return a rolling window of `hours` hours along each row of `frame`.

    The frame is transposed first because `rolling(axis=1)` is deprecated in
    pandas; callers transpose the aggregated result back with `.T`.
    """
    return frame.T.rolling(window=hours * INTERVALS_PER_HOUR)


# Difference from preceding interval
feat_diff = train.diff(axis=1)

# 8-hour average
feat_8h_avg = _rolling_hours(train, 8).mean().T

# 24-hour average, min and max
feat_24h_avg = _rolling_hours(train, 24).mean().T
feat_24h_min = _rolling_hours(train, 24).min().T
feat_24h_max = _rolling_hours(train, 24).max().T

# 72-hour average
feat_72h_avg = _rolling_hours(train, 72).mean().T


# One row per (house_id, interval) entry of the cleaned training data.
features = pd.DataFrame(index=train_clean.stack().index)
print("Total anticipated rows: {}".format(features.shape[0]))

_house_ids = features.index.get_level_values(0)
_intervals = features.index.get_level_values(1)


def _lookup(frame):
    """Return `frame`'s value at every (house_id, interval) pair of `features`.

    Labels are translated to integer positions once and the values are then
    read with a single NumPy fancy-index, instead of one scalar lookup per row.

    Raises:
        KeyError: if a house id or interval is missing from `frame`
            (`get_indexer` reports a missing label as -1, which would
            otherwise silently select the last row/column).
    """
    row_pos = frame.index.get_indexer(_house_ids)
    col_pos = frame.columns.get_indexer(_intervals)
    if (row_pos < 0).any() or (col_pos < 0).any():
        raise KeyError("Some (house_id, interval) pairs are missing from the source frame")
    return frame.to_numpy()[row_pos, col_pos]


# Column name -> frame to read it from; insertion order fixes the column order.
# NOTE(review): each column keeps the dtype of its source frame (including
# 'label'); cast here if downstream code expects a specific dtype.
_feature_sources = {
    'value': train,
    'diff': feat_diff,
    'h8_avg': feat_8h_avg,
    'h24_avg': feat_24h_avg,
    'h24_min': feat_24h_min,
    'h24_max': feat_24h_max,
    'h72_avg': feat_72h_avg,
    'label': labels_clean,
}
for _column, _frame in _feature_sources.items():
    features[_column] = _lookup(_frame)

# Create combinations of features (powers of the interval-to-interval difference)
features['diff_2'] = features['diff']**2
features['diff_3'] = features['diff']**3
features['diff_5'] = features['diff']**5

features.to_pickle(pickle_path / "features.pkl")


Total anticipated rows: 4471920
0
1000000
2000000
3000000
4000000


In [7]:
# Reload the saved feature table so this cell starts from the on-disk copy
features = pd.read_pickle(pickle_path / "features.pkl")

# Separate the target column (y) from the predictor columns (X)
y = features['label']
X = features.drop('label', axis=1)


In [8]:

# Drop every row of X that contains an NA, and the corresponding rows of y.
#
# One boolean mask is computed and applied to both objects. This avoids
# building Python sets of every index tuple just to find the removed rows,
# and keeps X and y aligned by construction.
has_na = X.isna().any(axis=1)

# Index labels of the rows being removed (kept under its original name).
removed_id = X.index[has_na]
print("{} Rows were dropped for NA reasons.".format(len(removed_id)))

keep = ~has_na
X = X.loc[keep]
# pandas aligns the mask on the index and raises if y's index does not match
y = y.loc[keep]

assert X.index.equals(y.index), "X and y are no longer aligned"

print(X.shape)
print(y.shape)

X.to_pickle(pickle_path / "X_all.pkl")
y.to_pickle(pickle_path / "y_all.pkl")

222079 Rows were dropped for NA reasons.
(4249841, 10)
(4249841,)
