In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
%matplotlib inline

In [3]:
df = pd.read_csv('energydata_complete.csv')
df

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.000000,...,17.033333,45.5300,6.600000,733.5,92.000000,7.000000,63.000000,5.300000,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.000000,...,17.066667,45.5600,6.483333,733.6,92.000000,6.666667,59.166667,5.200000,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.890000,46.300000,19.200000,44.626667,19.790000,44.933333,18.926667,...,17.000000,45.5000,6.366667,733.7,92.000000,6.333333,55.333333,5.100000,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.890000,46.066667,19.200000,44.590000,19.790000,45.000000,18.890000,...,17.000000,45.4000,6.250000,733.8,92.000000,6.000000,51.500000,5.000000,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.890000,...,17.000000,45.4000,6.133333,733.9,92.000000,5.666667,47.666667,4.900000,10.084097,10.084097
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,2016-05-27 17:20:00,100,0,25.566667,46.560000,25.890000,42.025714,27.200000,41.163333,24.700000,...,23.200000,46.7900,22.733333,755.2,55.666667,3.333333,23.666667,13.333333,43.096812,43.096812
19731,2016-05-27 17:30:00,90,0,25.500000,46.500000,25.754000,42.080000,27.133333,41.223333,24.700000,...,23.200000,46.7900,22.600000,755.2,56.000000,3.500000,24.500000,13.300000,49.282940,49.282940
19732,2016-05-27 17:40:00,270,10,25.500000,46.596667,25.628571,42.768571,27.050000,41.690000,24.700000,...,23.200000,46.7900,22.466667,755.2,56.333333,3.666667,25.333333,13.266667,29.199117,29.199117
19733,2016-05-27 17:50:00,420,10,25.500000,46.990000,25.414000,43.036000,26.890000,41.290000,24.700000,...,23.200000,46.8175,22.333333,755.2,56.666667,3.833333,26.166667,13.233333,6.322784,6.322784


In [4]:
df.isna().sum()

date           0
Appliances     0
lights         0
T1             0
RH_1           0
T2             0
RH_2           0
T3             0
RH_3           0
T4             0
RH_4           0
T5             0
RH_5           0
T6             0
RH_6           0
T7             0
RH_7           0
T8             0
RH_8           0
T9             0
RH_9           0
T_out          0
Press_mm_hg    0
RH_out         0
Windspeed      0
Visibility     0
Tdewpoint      0
rv1            0
rv2            0
dtype: int64

In [5]:
scaler = MinMaxScaler()
norm_df = scaler.fit_transform(df[['T2', 'T6']])
x_1 = norm_df[:,0].reshape(-1,1)
y_1 = norm_df[:,1].reshape(-1,1)
linear_model = LinearRegression()
linear_model.fit(x_1, y_1)

LinearRegression()

In [6]:
linear_predictions = linear_model.predict(x_1)

In [7]:
R2 = r2_score(y_1, linear_predictions)
R2

0.6418990830855493

In [8]:
MAE = mean_absolute_error(y_1, linear_predictions)
MAE

0.08262185186108569

In [9]:
RSS = np.sum(np.square(y_1 - linear_predictions))
RSS

222.08778382822797

In [10]:
RMSE = math.sqrt(mean_squared_error(y_1, linear_predictions))
RMSE

0.10608250581715784

In [11]:
mean_output = y_1.mean()
TSS = np.sum(np.square(y_1-mean_output))

In [12]:
Coeff_Det = 1-(RSS/TSS)
Coeff_Det

0.6418990830855493

In [13]:
linear_model.coef_

array([[0.89097447]])

In [14]:
df_full = df.drop(['date','lights'], axis = 1)


full_scaler = MinMaxScaler()
norm_df_full = pd.DataFrame(full_scaler.fit_transform(df_full), columns = df_full.columns)
features_df = norm_df_full.drop('Appliances', axis = 1)
target = norm_df_full['Appliances']

x_train, x_test, y_train, y_test = train_test_split(features_df, target, test_size = 0.3, random_state = 1)
linear_model_full = LinearRegression()
linear_model_full.fit(x_train, y_train)

LinearRegression()

In [15]:
def get_weights_df(model, feat, col_name):
    weights = pd.Series(model.coef_, feat.columns).sort_values()
    weights_df = pd.DataFrame(weights).reset_index()
    weights_df.columns = ['Features', col_name]
    weights_df[col_name].round(3)
    return weights_df

In [17]:
get_weights_df(linear_model_full, x_train, 'Linear_Weights').sort_values

<bound method DataFrame.sort_values of        Features  Linear_Weights
0           rv2   -2.233287e+11
1          RH_2   -4.584224e-01
2         T_out   -3.302139e-01
3            T2   -2.499689e-01
4            T9   -2.047422e-01
5          RH_8   -1.565790e-01
6        RH_out   -7.612303e-02
7          RH_7   -4.759104e-02
8          RH_9   -3.603543e-02
9   Press_mm_hg    4.666465e-03
10           T1    5.960194e-03
11   Visibility    1.097406e-02
12           T7    1.257681e-02
13           T5    1.268458e-02
14         RH_5    1.628121e-02
15         RH_4    2.665406e-02
16           T4    2.732548e-02
17    Windspeed    2.792343e-02
18         RH_6    3.809413e-02
19           T8    8.855453e-02
20         RH_3    9.562216e-02
21    Tdewpoint    1.194432e-01
22           T6    2.489888e-01
23           T3    2.933060e-01
24         RH_1    5.358454e-01
25          rv1    2.233287e+11>

In [18]:
ridge_reg = Ridge(0.4)
ridge_reg.fit(x_train, y_train)

Ridge(alpha=0.4)

In [19]:
ridge_predictions = ridge_reg.predict(x_test)

In [20]:
full_predictions = linear_model_full.predict(x_test)

In [21]:
math.sqrt(mean_squared_error(y_test, full_predictions))

0.08823257879630612

In [22]:
math.sqrt(mean_squared_error(y_test, ridge_predictions))

0.08825441128574509

In [23]:
lasso_reg = Ridge(0.001)
lasso_reg.fit(x_train, y_train)

Ridge(alpha=0.001)

In [26]:
get_weights_df(lasso_reg, x_train, 'Lasso_Weights').sort_values

<bound method DataFrame.sort_values of        Features  Lasso_Weights
0          RH_2      -0.458049
1         T_out      -0.330184
2            T2      -0.249703
3            T9      -0.204715
4          RH_8      -0.156533
5        RH_out      -0.076148
6          RH_7      -0.047617
7          RH_9      -0.036039
8           rv2      -0.000350
9           rv1      -0.000350
10  Press_mm_hg       0.004641
11           T1       0.005916
12   Visibility       0.010992
13           T7       0.012582
14           T5       0.012603
15         RH_5       0.016277
16         RH_4       0.026654
17           T4       0.027363
18    Windspeed       0.027922
19         RH_6       0.038086
20           T8       0.088499
21         RH_3       0.095535
22    Tdewpoint       0.119372
23           T6       0.248960
24           T3       0.293240
25         RH_1       0.535651>

In [25]:
lasso_predictions = lasso_reg.predict(x_test)
math.sqrt(mean_squared_error(y_test, ridge_predictions))

0.08825441128574509