In [2]:
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from datetime import datetime

from visualisation import *


Load the dataset into a `pandas` DataFrame.

In [4]:
# Read the raw energy readings from CSV, then wrap them so the working frame
# is available under the name `df` used by the rest of the notebook.
EnergySet = pd.read_csv("data/energydata_complete.csv")
df = pd.DataFrame(EnergySet)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [5]:
# split training and test data
def split_data(df, test_ratio=0.15, seed=None):
    """Randomly partition ``df`` into a training frame and a test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Rows are picked positionally with ``iloc``.
    test_ratio : float, default 0.15
        Fraction of rows placed in the test frame. The row count is
        ``int(len(df) * test_ratio)``, i.e. truncated towards zero.
    seed : int or None, default None
        When given, the shuffle uses its own ``np.random.RandomState(seed)``
        so the split is reproducible. When ``None`` the global NumPy random
        state is used, exactly as before.

    Returns
    -------
    tuple
        ``(train_df, test_df)``.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= test_ratio <= 1.0:
        raise ValueError("test_ratio must be between 0 and 1, got %r" % (test_ratio,))

    n_rows = len(df)
    if seed is None:
        # Keep honouring a globally set np.random.seed(...).
        indices = np.random.permutation(n_rows)
    else:
        indices = np.random.RandomState(seed).permutation(n_rows)

    # The first `test_size` shuffled positions form the test set, the rest train.
    test_size = int(n_rows * test_ratio)
    test_index = indices[:test_size]
    train_index = indices[test_size:]
    return (df.iloc[train_index], df.iloc[test_index])

In [8]:
# split data into training and test data
# 80/20 hold-out; the fixed random_state makes the shuffle repeatable.
# NOTE(review): in notebook order this runs before the cyclic time columns are
# added to `df` further down, so `train_df`/`test_df` would not carry those
# columns — confirm the intended cell order.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=35)

# stratified sampling to avoid sampling bias as dataset is relatively small
# use hour sin/cos as base as it has highest correlation
# TODO: the stratify column below was never filled in (`df[]` is incomplete).
#train_df, test_df = train_test_split(df, test_size=0.2, random_state=35, stratify=df[])

# Preprocessing

Convert each timestamp into cyclic (sine/cosine) hour-of-day and day-of-week features.

In [None]:
# cyclic (sin/cos) encoding of the hour of day and the day of week
def get_cyclic_time(timestamp):
    """Encode a timestamp string as sine/cosine pairs for hour and weekday.

    Parameters
    ----------
    timestamp : str
        Timestamp formatted as ``"%Y-%m-%d %H:%M:%S"``.

    Returns
    -------
    tuple
        ``(hour_sin, hour_cos, day_sin, day_cos)``. The hour is mapped onto a
        24-step circle and ``weekday()`` (Monday == 0) onto a 7-step circle.
        Minutes and seconds are parsed but do not affect the result.

    Raises
    ------
    ValueError
        If ``timestamp`` does not match the expected format.
    """
    parsed = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")

    hour_angle = 2*np.pi*parsed.hour/24
    day_angle = 2*np.pi*parsed.weekday()/7

    return (
        np.sin(hour_angle),
        np.cos(hour_angle),
        np.sin(day_angle),
        np.cos(day_angle),
    )


def get_numeric_time(timestamp):
    """Return the plain ``(hour, weekday)`` pair of a timestamp string.

    ``timestamp`` must be formatted as ``"%Y-%m-%d %H:%M:%S"``; the weekday
    follows ``datetime.weekday()`` (Monday == 0, Sunday == 6).
    """
    parsed = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
    return parsed.hour, parsed.weekday()

In [None]:
# convert date to cyclic 'hour of day' and 'day of week'
# Build the four columns once from the list of result tuples; expanding every
# tuple with `.apply(pd.Series)` constructs a Series per row and is far slower.
# The index is passed explicitly so the new columns align with `df` row for row.
df[["hour_sin","hour_cos","day_sin","day_cos"]] = pd.DataFrame(
    df["date"].map(get_cyclic_time).tolist(), index=df.index
)

# plain (non-cyclic) alternative, kept for reference:
#df[["hour_num", "day_num"]] = pd.DataFrame(df["date"].map(get_numeric_time).tolist(), index=df.index)

In [None]:
# get the correlation matrix to see how well each input feature correlates to the output
# Only numeric columns are correlated: the raw `date` column holds strings, and
# pandas >= 2.0 raises on non-numeric columns instead of silently skipping them.

correlation_matrix = df.select_dtypes(include=["number", "bool"]).corr()
correlation_matrix["Appliances"].sort_values(ascending=False)

# Select features

In [15]:
def best_features(df, threshold, output_feature="Appliances"):
    """List the columns whose correlation with the target exceeds a threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the target column and the candidate features.
        Non-numeric columns (e.g. a string timestamp) are ignored.
    threshold : float
        A feature is kept when ``abs(correlation) > threshold``.
    output_feature : str, default "Appliances"
        Name of the target column; it is never part of the result.

    Returns
    -------
    list of str
        Selected feature names, in the frame's column order. Features whose
        correlation is NaN (e.g. constant columns) are not selected.

    Raises
    ------
    KeyError
        If ``output_feature`` is not a numeric column of ``df``.
    """
    # Restrict to numeric data explicitly: pandas >= 2.0 no longer drops
    # non-numeric columns silently inside corr().
    numeric = df.select_dtypes(include=["number", "bool"])
    correlations = numeric.corr()[output_feature]

    # Series.iteritems() was removed in pandas 2.0; items() works on all versions.
    return [
        feature
        for feature, correlation in correlations.items()
        if feature != output_feature and abs(correlation) > threshold
    ]

In [17]:
# Keep every training column whose absolute correlation with "Appliances" exceeds 0.012.
feature_list = best_features(train_df, 0.012)

In [None]:
# `plot_histogram` is not defined in this notebook — presumably it is provided
# by the `from visualisation import *` star-import (verify).
plot_histogram(df, feature_list)

In [35]:
# Training target and training inputs (selected feature columns only).
# NOTE: `input` shadows the Python builtin of the same name; later cells refer
# to this variable by that name, so it is left as is here.
output = train_df["Appliances"]
input = train_df[feature_list]

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler

In [None]:
# simple linear and polynomial prediction models

def get_linreg(intercept, normalise):
    """Build an unfitted ordinary least-squares regressor.

    ``LinearRegression``'s ``normalize`` argument was deprecated in
    scikit-learn 1.0 and removed in 1.2, so passing it raises ``TypeError`` on
    current releases. This follows the migration documented by scikit-learn:
    scale the features in a preceding ``StandardScaler(with_mean=False)`` step.

    Parameters
    ----------
    intercept : bool
        Forwarded as ``fit_intercept``.
    normalise : bool
        Whether to scale the features before fitting. As with the old
        ``normalize`` flag, it has no effect when ``intercept`` is False.

    Returns
    -------
    estimator
        A ``LinearRegression``, or a two-step ``Pipeline`` (scaler + regression)
        when scaling applies. Both expose ``fit`` and ``predict``; with the
        pipeline, coefficients live on the ``"linear"`` step and refer to the
        scaled features.
    """
    linreg = LinearRegression(fit_intercept=intercept, n_jobs=-1)
    if normalise and intercept:
        return Pipeline([
            ("scale", StandardScaler(with_mean=False)),
            ("linear", linreg),
        ])
    return linreg


def linreg_predict(X, y, test_input, intercept=False, normalise=True):
    """Fit a linear model on ``(X, y)`` and predict ``test_input``.

    ``intercept`` and ``normalise`` are passed straight to ``get_linreg``.
    Prints a progress message, then returns whatever the fitted model's
    ``predict`` returns for ``test_input``.
    """
    print("Get linear regression prediction")

    model = get_linreg(intercept, normalise)
    model.fit(X, y)
    predictions = model.predict(test_input)

    return predictions


def polyreg_predict(X, y, test_input, degrees, intercept=False, normalise=True):
    """Fit a polynomial regression on ``(X, y)`` and predict ``test_input``.

    Parameters
    ----------
    X, y
        Training inputs and targets.
    test_input
        Inputs to predict; must have the same number of columns as ``X``.
    degrees : int
        Degree handed to ``PolynomialFeatures``.
    intercept, normalise : bool
        Passed straight to ``get_linreg``.

    Returns
    -------
    Predictions of the fitted model for the expanded ``test_input``.
    """
    print("Get polynomial regression prediction")

    linreg = get_linreg(intercept, normalise)
    polyreg = PolynomialFeatures(degree=degrees)

    # Fit the expansion on the training inputs only, then reuse that fitted
    # transformer for the test inputs. Re-fitting on test data (fit_transform)
    # is the wrong contract and hides a column-count mismatch until predict().
    x_hat = polyreg.fit_transform(X)
    predict_hat = polyreg.transform(test_input)

    linreg.fit(x_hat, y)

    return linreg.predict(predict_hat)


In [22]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that selects columns of a frame and returns their ``.values``.

    Parameters
    ----------
    attribute_names
        Column label(s) used to index the incoming frame in ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Index ``X`` with ``attribute_names`` and return the result's ``.values``."""
        selected = X[self.attribute_names]
        return selected.values

In [46]:
# Shared preprocessing: pick the selected feature columns, then standardise them.
data_pipeline = Pipeline([
    ("selection", DataFrameSelector(feature_list)),
    ("standardisation", StandardScaler())
])

# Preprocessing followed by a plain linear regression.
# Note: the very same `data_pipeline` object is embedded in both model pipelines.
lin_pipeline = Pipeline([
    ("data_preparation", data_pipeline),
    ("linear", LinearRegression())
])

# Preprocessing, degree-3 polynomial expansion, then linear regression.
poly_pipeline = Pipeline([
    ("data_preparation", data_pipeline),
    ("poly", PolynomialFeatures(degree=3)),
    ("linear", LinearRegression())
])

In [32]:
# Fit the selection + standardisation steps on the training frame and keep the
# transformed array. This fits the same `data_pipeline` object that is embedded
# in `lin_pipeline` and `poly_pipeline`.
train_data = data_pipeline.fit_transform(train_df)

In [33]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [None]:
def kfold_train(X, y, k):
    """Unfinished stub: builds a ``KFold`` splitter with ``k`` splits and discards it.

    NOTE(review): ``X`` and ``y`` are never used and the function returns
    ``None`` — the training loop was apparently never written. Confirm whether
    this should be completed or removed.
    """
    kfold = KFold(n_splits=k)
    

In [37]:
# 3-fold splitter, passed as `cv` to both cross-validation runs.
kfold = KFold(n_splits=3)

In [40]:
# Cross-validate both pipelines on the training split with the same folds.
# Scores are negated MSE (scoring="neg_mean_squared_error"): closer to zero is better.
poly_result = cross_val_score(poly_pipeline, input, output, cv=kfold, scoring="neg_mean_squared_error")
lin_result = cross_val_score(lin_pipeline, input, output, cv=kfold, scoring="neg_mean_squared_error")

In [41]:
# Per-fold negated MSE of the polynomial pipeline.
poly_result

array([ -8179.05202422,  -8542.82893843, -11320.29828191])

In [42]:
# Per-fold negated MSE of the linear pipeline.
lin_result

array([-8042.04929689, -8803.82526671, -9386.27040278])

In [None]:

lin_predictions = linreg_predict(input, output, test_input)
poly_predictions = polyreg_predict(input, output, test_input, 3)

In [None]:
mean_squared_error(df.loc[:999, "Appliances"], poly_predictions)
# 3881 - with time numeric

In [None]:
mean_squared_error(df.loc[:999, "Appliances"], lin_predictions)