In [44]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split

In [45]:
data = pd.read_csv('energydata_complete.csv')

In [46]:
data.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [47]:
df = data.drop(columns = ['date', 'lights'])

In [48]:
df.head()

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,60,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,60,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,50,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,50,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,60,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


### Normalize the dataset

In [49]:
# normalise our dataset to a common scale using the min max scaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
normalised_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
X = normalised_df.drop(columns = ['Appliances'])
y = normalised_df['Appliances']

### Train_Test_Split

In [50]:
#split the dataset,  70% for training and 30% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(13814, 26) (5921, 26) (13814,) (5921,)


### Multiple Linear Regression

In [51]:
multiple_linear_model = LinearRegression()
multiple_linear_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [52]:
#obtain linear predictions
multiple_linear_predictions = multiple_linear_model.predict(X_test)
multiple_linear_predictions

array([0.03322207, 0.24411599, 0.03400024, ..., 0.06844707, 0.10032325,
       0.05722198])

### Measuring Multiple Linear Regression Performance

####  Mean Absolute Error

In [55]:
#MAE for multiple linear regression
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, multiple_linear_predictions)

round(mae, 2)

0.05

#### R-Squared

In [59]:
#R-squared error for multiple linear regression
from sklearn.metrics import r2_score
r2_score = r2_score(y_test, multiple_linear_predictions)
round(r2_score, 2)

0.15

####  Residual Sum of Squares

In [56]:
#rss for multiple linear regression
import numpy as np
rss = np.sum(np.square(y_test - multiple_linear_predictions))
round(rss, 2)

45.35

#### Root Mean Square Error

In [57]:
#rmse for multiple linear regression
from sklearn.metrics import  mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, multiple_linear_predictions))
round(rmse, 2)

0.09

### Ridge regression

In [60]:
ridge_reg = Ridge(alpha=0.4)
ridge_reg.fit(X_train, y_train)

Ridge(alpha=0.4, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [61]:
#obtain Ridge regression predictions
ridge_predictions = ridge_reg.predict(X_test)
ridge_predictions

array([0.03321872, 0.24043824, 0.03461337, ..., 0.06872351, 0.10025536,
       0.05851175])

### Measuring Ridge Regression Performance

####  Mean Absolute Error

In [62]:
#MAE for ridge regression
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, ridge_predictions)
round(mae, 3)

0.05

#### R-Squared

In [63]:
#R-squared error for ridge regression
from sklearn.metrics import r2_score
r2_score = r2_score(y_test, ridge_predictions)
round(r2_score, 3)

0.149

####  Residual Sum of Squares

In [64]:
#rss for ridge regression
import numpy as np
rss = np.sum(np.square(y_test - ridge_predictions))
round(rss, 3)

45.368

#### Root Mean Square Error

In [65]:
#rmse for ridge regression
from sklearn.metrics import  mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, ridge_predictions))
round(rmse, 3)

0.088

### Feature Selection and Lasso Regression

In [20]:
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(X_train, y_train)

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [21]:
#obtain Lasso regression predictions
lasso_predictions = lasso_reg.predict(X_test)
lasso_predictions

array([0.07370267, 0.08143458, 0.07716072, ..., 0.07792848, 0.09034412,
       0.08359255])

### Measuring Lasso Regression Performance

####  Mean Absolute Error

In [22]:
#MAE for lasso regression
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, lasso_predictions)
round(mae, 3)

0.055

#### R-Squared

In [23]:
#R-squared error for lasso regression
from sklearn.metrics import r2_score
r2_score = r2_score(y_test, lasso_predictions)
round(r2_score, 3)

0.027

####  Residual Sum of Squares

In [24]:
#rss for lasso regression
import numpy as np
rss = np.sum(np.square(y_test - lasso_predictions))
round(rss, 3)

51.853

#### Root Mean Square Error

In [66]:
#rmse for lasso regression
from sklearn.metrics import  mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, lasso_predictions))
round(rmse, 3)

0.094

## Penalization Methods

### Comparing the Effects of regularisation

In [26]:
#comparing the effects of regularisation
def get_weights_df(model, feat, col_name):
  weights = pd.Series(model.coef_, feat.columns).sort_values()
  weights_df = pd.DataFrame(weights).reset_index()
  weights_df.columns = ['Features', col_name]
  weights_df[col_name].round(3)
  return weights_df

In [27]:
linear_model_weights = get_weights_df(multiple_linear_model, X_train, 'Linear_Model_Weight')
ridge_weights_df = get_weights_df(ridge_reg, X_train, 'Ridge_Weight')
lasso_weights_df = get_weights_df(lasso_reg, X_train, 'Lasso_weight')
#elastic_weights_df = get_weights_df(elastic, X_train, 'Elastic_Weight')

In [28]:
final_weights = pd.merge(linear_model_weights, ridge_weights_df, on='Features')
final_weights = pd.merge(final_weights, lasso_weights_df, on='Features')
#final_weights = pd.merge(final_weights, elastic_weights_df, on='Features')
final_weights

Unnamed: 0,Features,Linear_Model_Weight,Ridge_Weight,Lasso_weight
0,RH_2,-0.456698,-0.401134,-0.0
1,T_out,-0.32186,-0.250765,0.0
2,T2,-0.236178,-0.19388,0.0
3,T9,-0.189941,-0.188584,-0.0
4,RH_8,-0.157595,-0.156596,-0.00011
5,RH_out,-0.077671,-0.050541,-0.049557
6,RH_7,-0.044614,-0.046291,-0.0
7,RH_9,-0.0398,-0.041701,-0.0
8,T5,-0.015657,-0.020727,-0.0
9,T1,-0.003281,-0.021549,0.0


### Simple Linear Regression

In [29]:
data.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [30]:
df2 = df = data.drop(columns = ['date', 'lights'])

In [31]:
#Firstly, we normalise our dataset to a common scale using the min max scaler
#from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
normalised_df2 = pd.DataFrame(scaler.fit_transform(df2), columns=df2.columns)
X1 = normalised_df2[['T2']]
y1 = normalised_df2['T6']

In [32]:
#X1 = normalised_df[['T2']]
#y1 = normalised_df['T6']

In [33]:
#X1 = normalised_df['T2']
#array = np.array([X1])
#X1 = array.reshape(-1, 1)
#X1

In [34]:
#split the dataset,  70% for training and 30% for testing
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.3, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(13814, 1) (5921, 1) (13814,) (5921,)


In [35]:
simple_linear_model = LinearRegression()
simple_linear_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [36]:
#obtain linear predictions
simple_linear_predictions = simple_linear_model.predict(X_test)
simple_linear_predictions

array([0.23928945, 0.46794238, 0.23108472, ..., 0.3001772 , 0.4297256 ,
       0.3217686 ])

### Measuring Simple Linear Regression Performance

####  Mean Absolute Error

In [37]:
#MAE for linear regression
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, simple_linear_predictions)

round(mae, 3)

0.082

#### R-Squared

In [41]:
#R-squared error for linear regression
from sklearn.metrics import r2_score
r2_score = r2_score(y_test, simple_linear_predictions)
round(r2_score, 2)

0.64

####  Residual Sum of Squares

In [39]:
#rss for linear regression
import numpy as np
rss = np.sum(np.square(y_test - simple_linear_predictions))
round(rss, 3)

66.116

#### Root Mean Square Error

In [40]:
#rmse for linear regression
from sklearn.metrics import  mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, simple_linear_predictions))
round(rmse, 3)

0.106