In [1]:
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

Get the dataset as a `pandas` dataframe.

In [3]:
EnergySet = pd.read_csv("data/energydata_complete.csv")
df = pd.DataFrame(data=EnergySet)

In [4]:
df

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.000000,...,17.033333,45.530000,6.600000,733.500000,92.000000,7.000000,63.000000,5.300000,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.000000,...,17.066667,45.560000,6.483333,733.600000,92.000000,6.666667,59.166667,5.200000,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.890000,46.300000,19.200000,44.626667,19.790000,44.933333,18.926667,...,17.000000,45.500000,6.366667,733.700000,92.000000,6.333333,55.333333,5.100000,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.890000,46.066667,19.200000,44.590000,19.790000,45.000000,18.890000,...,17.000000,45.400000,6.250000,733.800000,92.000000,6.000000,51.500000,5.000000,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.890000,...,17.000000,45.400000,6.133333,733.900000,92.000000,5.666667,47.666667,4.900000,10.084097,10.084097
5,2016-01-11 17:50:00,50,40,19.890000,46.026667,19.200000,44.500000,19.790000,44.933333,18.890000,...,17.000000,45.290000,6.016667,734.000000,92.000000,5.333333,43.833333,4.800000,44.919484,44.919484
6,2016-01-11 18:00:00,60,50,19.890000,45.766667,19.200000,44.500000,19.790000,44.900000,18.890000,...,17.000000,45.290000,5.900000,734.100000,92.000000,5.000000,40.000000,4.700000,47.233763,47.233763
7,2016-01-11 18:10:00,60,50,19.856667,45.560000,19.200000,44.500000,19.730000,44.900000,18.890000,...,17.000000,45.290000,5.916667,734.166667,91.833333,5.166667,40.000000,4.683333,33.039890,33.039890
8,2016-01-11 18:20:00,60,40,19.790000,45.597500,19.200000,44.433333,19.730000,44.790000,18.890000,...,17.000000,45.290000,5.933333,734.233333,91.666667,5.333333,40.000000,4.666667,31.455702,31.455702
9,2016-01-11 18:30:00,70,40,19.856667,46.090000,19.230000,44.400000,19.790000,44.863333,18.890000,...,17.000000,45.290000,5.950000,734.300000,91.500000,5.500000,40.000000,4.650000,3.089314,3.089314


In [5]:
df["RH_1"].describe()

count    19735.000000
mean        40.259739
std          3.979299
min         27.023333
25%         37.333333
50%         39.656667
75%         43.066667
max         63.360000
Name: RH_1, dtype: float64

In [6]:
# get the correlation matrix to see how well each input feature correlates to the output

correlation_matrix = df.corr()
correlation_matrix["Appliances"].sort_values(ascending=False)

Appliances     1.000000
lights         0.197278
T2             0.120073
T6             0.117638
T_out          0.099155
Windspeed      0.087122
RH_1           0.086031
T3             0.085060
T1             0.055447
T4             0.040281
T8             0.039572
RH_3           0.036292
T7             0.025801
T5             0.019760
RH_4           0.016965
Tdewpoint      0.015353
T9             0.010010
RH_5           0.006955
Visibility     0.000230
rv2           -0.011145
rv1           -0.011145
Press_mm_hg   -0.034885
RH_9          -0.051462
RH_7          -0.055642
RH_2          -0.060465
RH_6          -0.083178
RH_8          -0.094039
RH_out        -0.152282
Name: Appliances, dtype: float64

Remove columns (input cleaning)

In [7]:
# remove columns below a certain threshold

feature_correlation_threshold = 0.012

drop_list = []
for feature,correlation in correlation_matrix["Appliances"].iteritems():
    if abs(correlation) < feature_correlation_threshold:
        drop_list.append(feature)
        
drop_list

['RH_5', 'T9', 'Visibility', 'rv1', 'rv2']

In [8]:
df = df.drop(drop_list, axis=1)

In [36]:
from datetime import datetime


In [37]:
df.iloc[0][0]

'2016-01-11 17:00:00'

In [39]:
dt = datetime.strptime(df.iloc[0][0], "%Y-%m-%d %H:%M:%S")

In [40]:
dt

datetime.datetime(2016, 1, 11, 17, 0)

In [41]:
df.iloc[:, 0]

0        2016-01-11 17:00:00
1        2016-01-11 17:10:00
2        2016-01-11 17:20:00
3        2016-01-11 17:30:00
4        2016-01-11 17:40:00
5        2016-01-11 17:50:00
6        2016-01-11 18:00:00
7        2016-01-11 18:10:00
8        2016-01-11 18:20:00
9        2016-01-11 18:30:00
10       2016-01-11 18:40:00
11       2016-01-11 18:50:00
12       2016-01-11 19:00:00
13       2016-01-11 19:10:00
14       2016-01-11 19:20:00
15       2016-01-11 19:30:00
16       2016-01-11 19:40:00
17       2016-01-11 19:50:00
18       2016-01-11 20:00:00
19       2016-01-11 20:10:00
20       2016-01-11 20:20:00
21       2016-01-11 20:30:00
22       2016-01-11 20:40:00
23       2016-01-11 20:50:00
24       2016-01-11 21:00:00
25       2016-01-11 21:10:00
26       2016-01-11 21:20:00
27       2016-01-11 21:30:00
28       2016-01-11 21:40:00
29       2016-01-11 21:50:00
                ...         
19705    2016-05-27 13:10:00
19706    2016-05-27 13:20:00
19707    2016-05-27 13:30:00
19708    2016-

In [26]:
# split training and test data
def split_data(df):
    indices = np.random.permutation(len(df))
    test_size = int(len(df) * 0.15)
    test_index = indices[:test_size]
    train_index = indices[test_size:]
    return (df.iloc[train_index], df.iloc[test_index])

In [20]:
# simple linear and polynomial prediction models

def get_linreg(intercept, normalise):
    return LinearRegression(fit_intercept=intercept, normalize=normalise, n_jobs=-1)


def linreg_predict(X, y, test_input, intercept=False, normalise=True):
    print("Get linear regression prediction")

    linreg = get_linreg(intercept, normalise)
    linreg.fit(X, y)

    return linreg.predict(test_input)


def polyreg_predict(X, y, test_input, degrees, intercept=False, normalise=True):
    print("Get polynomial regression prediction")

    linreg = get_linreg(intercept, normalise)
    polyreg = PolynomialFeatures(degree=degrees)

    x_hat = polyreg.fit_transform(X)
    predict_hat = polyreg.fit_transform(test_input)

    linreg.fit(x_hat, y)

    return linreg.predict(predict_hat)


In [27]:
train_df, test_df = split_data(df)

In [29]:
train_df

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T7,RH_7,T8,RH_8,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Tdewpoint
6871,2016-02-28 10:10:00,50,0,19.700000,35.723333,18.463333,35.126667,20.100000,35.560000,18.963333,...,19.390000,32.384286,20.600000,38.790000,36.566667,1.833333,757.050000,67.666667,7.333333,-3.583333
692,2016-01-16 12:20:00,80,0,22.100000,41.290000,21.566667,40.500000,21.100000,42.700000,20.463333,...,19.445000,43.400000,19.926667,51.096667,45.000000,5.100000,763.833333,80.000000,3.666667,1.800000
1986,2016-01-25 12:00:00,40,0,20.100000,44.000000,19.463333,43.290000,20.200000,44.200000,18.200000,...,16.775000,41.808333,18.116667,47.982222,48.500000,12.100000,763.300000,56.000000,4.000000,3.500000
15292,2016-04-26 21:40:00,70,0,22.000000,36.200000,19.790000,37.326667,22.000000,35.500000,20.200000,...,19.890000,28.730000,22.926667,37.963333,35.463333,1.433333,749.700000,91.666667,4.000000,0.233333
13900,2016-04-17 05:40:00,50,0,21.390000,39.400000,17.790000,44.700000,23.390000,38.000000,20.937143,...,20.830000,32.834000,22.790000,38.260000,45.290000,1.400000,752.000000,98.333333,1.000000,1.133333
2440,2016-01-28 15:40:00,20,0,19.500000,45.230000,18.700000,44.826667,19.790000,44.400000,18.260000,...,18.890556,39.787222,18.790000,45.777222,46.656667,7.066667,765.100000,69.333333,2.000000,1.733333
13439,2016-04-14 00:50:00,50,0,21.633333,41.860000,18.700000,45.790000,23.700000,39.433333,21.700000,...,20.890000,35.790000,23.000000,45.433333,45.966667,3.833333,753.600000,95.333333,0.166667,3.183333
14374,2016-04-20 12:40:00,50,0,21.963333,36.730000,23.700000,31.700000,22.100000,37.030000,22.790000,...,20.640000,28.890000,21.323333,36.526667,37.326667,11.766667,767.300000,56.000000,6.000000,3.266667
2336,2016-01-27 22:20:00,40,0,19.230000,46.826667,18.500000,46.790000,19.790000,45.290000,18.200000,...,17.600000,45.275000,17.616667,51.215000,49.000000,9.800000,755.300000,94.333333,7.666667,8.900000
19589,2016-05-26 17:50:00,250,0,24.390000,45.466667,24.870000,37.200000,27.566667,38.900000,24.600000,...,24.290000,37.997778,24.290000,45.530000,43.621429,21.033333,755.850000,45.833333,2.000000,8.866667


In [30]:
test_df

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T7,RH_7,T8,RH_8,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Tdewpoint
13425,2016-04-13 22:30:00,50,0,21.963333,42.863333,19.463333,45.500000,23.500000,39.200000,21.890000,...,21.000000,36.222500,23.290000,44.642857,44.000000,6.000000,753.200000,95.500000,1.000000,5.300000
13556,2016-04-14 20:20:00,70,0,22.823333,41.530000,21.600000,41.266667,23.500000,39.433333,22.890000,...,21.290000,35.971429,24.171429,44.457143,41.826667,13.566667,751.866667,68.000000,2.000000,7.766667
12977,2016-04-10 19:50:00,130,0,23.100000,39.975000,21.823333,39.760000,22.926667,38.863333,22.100000,...,22.890000,34.826667,24.238571,39.577143,39.590000,14.300000,750.700000,64.666667,4.333333,7.700000
293,2016-01-13 17:50:00,100,0,19.290000,40.730000,18.700000,39.760000,19.290000,42.030000,18.890000,...,18.426667,35.826667,19.426667,44.730000,40.090000,3.916667,757.300000,76.000000,3.000000,0.016667
7988,2016-03-07 04:20:00,50,0,19.390000,37.030000,17.100000,39.700000,19.790000,39.060000,18.890000,...,17.700000,31.890000,20.790000,44.290000,40.230000,0.066667,747.233333,99.333333,1.666667,0.000000
4245,2016-02-10 04:30:00,70,0,21.390000,41.560000,20.760000,39.966667,22.500000,43.090000,18.790000,...,19.878889,41.080000,22.290000,48.602222,46.200000,2.550000,739.100000,92.500000,9.000000,1.450000
999,2016-01-18 15:30:00,30,0,19.290000,41.400000,18.200000,41.400000,19.700000,40.663333,18.790000,...,19.000000,34.700000,19.600000,41.933333,37.790000,-0.450000,756.900000,68.500000,2.500000,-5.600000
18344,2016-05-18 02:20:00,60,0,23.290000,40.790000,21.421429,41.652857,24.777143,39.290000,23.200000,...,23.290000,40.554000,23.840000,44.200000,43.152857,8.933333,755.400000,87.000000,2.000000,6.866667
8910,2016-03-13 14:00:00,40,0,21.390000,35.066667,20.500000,34.163333,20.390000,35.950000,21.000000,...,22.066667,28.033333,24.166667,33.463333,38.590000,8.800000,769.200000,59.000000,6.000000,1.200000
4365,2016-02-11 00:30:00,30,0,21.100000,43.326667,20.600000,41.260000,20.823333,42.200000,20.100000,...,20.200000,40.308333,23.005556,48.015000,43.200000,3.550000,747.500000,97.500000,3.000000,3.150000


In [15]:
output = df["Appliances"]
input = df.iloc[:, 2:]
test_input = df.iloc[:1000, 2:]

In [16]:
output.shape

(19735,)

In [21]:
lin_predictions = linreg_predict(input, output, test_input)
poly_predictions = polyreg_predict(input, output, test_input, 3)

Get linear regression prediction


TypeError: __init__() got an unexpected keyword argument 'dual'

In [18]:
mean_squared_error(df.loc[:999, "Appliances"], poly_predictions)

NameError: name 'poly_predictions' is not defined

In [19]:
mean_squared_error(df.loc[:999, "Appliances"], lin_predictions)

NameError: name 'lin_predictions' is not defined