In [48]:
%matplotlib inline
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

In [49]:
# Importing sklearn modules
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score

In [50]:
%run "./../1.Load Codes/1.Variables.ipynb"

Stored 'home_path' (str)


# Loading data from previous modules

In [51]:
import pickle

files_data = pickle.load(open(home_path + '/1.Load Codes/files.pickle',"rb"))
acorn_data = files_data['acorn_data']
energy_data = files_data['energy_data']
households_data = files_data['households_data']
weather_data = files_data['weather_data']

# Getting Dataset variables

In [52]:
energy_data_ml = energy_data[['LCLid', 'day','energy_sum']]
households_data_ml = households_data[['LCLid', 'stdorToU', 'Acorn', 'Acorn_grouped']]
weather_data_ml = weather_data[['temperatureMax', 'visibility', 'humidity', 'windSpeed', 'day']]

In [53]:
energy_data_ml.head()

Unnamed: 0,LCLid,day,energy_sum
0,MAC000131,2011-12-15,9.505
1,MAC000131,2011-12-16,14.216
2,MAC000131,2011-12-17,9.111
3,MAC000131,2011-12-18,10.511
4,MAC000131,2011-12-19,15.647


In [54]:
households_data_ml.head()

Unnamed: 0,LCLid,stdorToU,Acorn,Acorn_grouped
2,MAC000002,Std,ACORN-A,Affluent
3,MAC003613,Std,ACORN-A,Affluent
4,MAC003597,Std,ACORN-A,Affluent
5,MAC003579,Std,ACORN-A,Affluent
6,MAC003566,Std,ACORN-A,Affluent


In [55]:
weather_data_ml.head()

Unnamed: 0,temperatureMax,visibility,humidity,windSpeed,day
0,11.96,3.3,0.95,3.88,2011-11-11
1,8.59,12.09,0.88,3.94,2011-12-11
2,10.33,13.39,0.74,3.54,2011-12-27
3,8.07,11.89,0.87,3.0,2011-12-02
4,8.22,13.16,0.8,4.46,2011-12-24


In [57]:
energy_data_ml.loc[:,'day']=  pd.to_datetime(energy_data_ml['day']).dt.date

In [58]:
combined_ml_dataset = pd.merge(energy_data_ml, households_data_ml, on = 'LCLid')
combined_ml_dataset = pd.merge(combined_ml_dataset, weather_data_ml, on = 'day')
combined_ml_dataset = combined_ml_dataset.head(50000)

In [59]:
combined_ml_dataset.head()

Unnamed: 0,LCLid,day,energy_sum,stdorToU,Acorn,Acorn_grouped,temperatureMax,visibility,humidity,windSpeed
0,MAC000131,2011-12-15,9.505,Std,ACORN-E,Affluent,7.97,12.79,0.77,4.71
1,MAC000132,2011-12-15,9.578,Std,ACORN-E,Affluent,7.97,12.79,0.77,4.71
2,MAC000221,2011-12-15,21.145,Std,ACORN-E,Affluent,7.97,12.79,0.77,4.71
3,MAC000228,2011-12-15,9.778,Std,ACORN-E,Affluent,7.97,12.79,0.77,4.71
4,MAC000234,2011-12-15,25.918,Std,ACORN-E,Affluent,7.97,12.79,0.77,4.71


In [60]:
combined_ml_x = combined_ml_dataset.drop(['energy_sum'], axis = 1)
combined_ml_y = combined_ml_dataset['energy_sum']

In [61]:
combined_ml_x['dayOfYear'] = pd.to_datetime(combined_ml_x['day']).dt.dayofyear

# Converting Strings to numerical values

In [62]:
combined_ml_x['LCLid'] = combined_ml_dataset['LCLid'].str.findall("\d+").explode().astype(int)

# Encoding Labeled Class Columns to Numeric Values

In [63]:
categories = ['stdorToU', 'Acorn', 'Acorn_grouped']

In [64]:
enc = OneHotEncoder(drop='first')
enc.fit(combined_ml_x[categories])

OneHotEncoder(drop='first')

In [65]:
#Performing the encoding to numerical columns and merging them with the database

combined_ml_x = pd.concat([combined_ml_x, pd.DataFrame(enc.transform(combined_ml_x[categories]).toarray())], axis = 1)

In [66]:
combined_ml_x.head()

Unnamed: 0,LCLid,day,stdorToU,Acorn,Acorn_grouped,temperatureMax,visibility,humidity,windSpeed,dayOfYear,...,11,12,13,14,15,16,17,18,19,20
0,131,2011-12-15,Std,ACORN-E,Affluent,7.97,12.79,0.77,4.71,349,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,132,2011-12-15,Std,ACORN-E,Affluent,7.97,12.79,0.77,4.71,349,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,221,2011-12-15,Std,ACORN-E,Affluent,7.97,12.79,0.77,4.71,349,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,228,2011-12-15,Std,ACORN-E,Affluent,7.97,12.79,0.77,4.71,349,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,234,2011-12-15,Std,ACORN-E,Affluent,7.97,12.79,0.77,4.71,349,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [67]:
combined_ml_x = combined_ml_x.drop(categories, axis = 1)
combined_ml_x = combined_ml_x.drop(['day'], axis = 1)

In [68]:
combined_ml_x.head()

Unnamed: 0,LCLid,temperatureMax,visibility,humidity,windSpeed,dayOfYear,0,1,2,3,...,11,12,13,14,15,16,17,18,19,20
0,131,7.97,12.79,0.77,4.71,349,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,132,7.97,12.79,0.77,4.71,349,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,221,7.97,12.79,0.77,4.71,349,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,228,7.97,12.79,0.77,4.71,349,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,234,7.97,12.79,0.77,4.71,349,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [69]:
combined_ml_y.head()

0     9.505
1     9.578
2    21.145
3     9.778
4    25.918
Name: energy_sum, dtype: float64

# Train Test Split

In [70]:
X_train, X_test, y_train, y_test = train_test_split(combined_ml_x, combined_ml_y, test_size=0.20, random_state=42)

# Linear Regression

In [71]:
# Performing Linear Regression using selected variables

In [72]:
reg = LinearRegression()

In [73]:
lr_score = cross_val_score(reg, X_train, y_train, cv = 5)

In [74]:
lr_score

array([0.03155138, 0.04035843, 0.04121022, 0.04100787, 0.04505465])

# KNN

In [89]:
n = 10

In [90]:
neigh = KNeighborsRegressor(n_neighbors=n)

In [91]:
knn_score = cross_val_score(neigh, X_train, y_train, cv = 5)

In [92]:
knn_score

array([0.13079803, 0.13412554, 0.13140086, 0.14842407, 0.13527885])

# Selecting best model

In [93]:
best_model = reg.fit(X_train, y_train) if(lr_score.mean() > knn_score.mean()) else neigh.fit(X_train, y_train)   

In [94]:
best_model

KNeighborsRegressor(n_neighbors=10)

# Export training data and models

In [95]:
import pickle

model_data = {
    'test_data': X_test,
    'test_targets': y_test,
    'best_model': best_model
}
save_path = open("model.pickle","wb")
pickle.dump(model_data, save_path)