### Import Relevant Libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

### Load The Data

In [3]:
# Read the Data
df = pd.read_csv('energydata_complete.csv')

#### Feature Description

The dataset for the remainder of this quiz is the Appliances Energy Prediction data. The data set is at 10 min for about 4.5 months. The house temperature and humidity conditions were monitored with a ZigBee wireless sensor network. Each wireless node transmitted the temperature and humidity conditions around 3.3 min. Then, the wireless data was averaged for 10 minutes periods. The energy data was logged every 10 minutes with m-bus energy meters. Weather from the nearest airport weather station (Chievres Airport, Belgium) was downloaded from a public data set from Reliable Prognosis (rp5.ru), and merged together with the experimental data sets using the date and time column. Two random variables have been included in the data set for testing the regression models and to filter out non predictive attributes (parameters). The attribute information can be seen below.

Date, time year-month-day hour:minute:second

Appliances, energy use in Wh

lights, energy use of light fixtures in the house in Wh

T1, Temperature in kitchen area, in Celsius

RH_1, Humidity in kitchen area, in %

T2, Temperature in living room area, in Celsius

RH_2, Humidity in living room area, in %

T3, Temperature in laundry room area

RH_3, Humidity in laundry room area, in %

T4, Temperature in office room, in Celsius

RH_4, Humidity in office room, in %

T5, Temperature in bathroom, in Celsius

RH_5, Humidity in bathroom, in %

T6, Temperature outside the building (north side), in Celsius

RH_6, Humidity outside the building (north side), in %

T7, Temperature in ironing room , in Celsius

RH_7, Humidity in ironing room, in %

T8, Temperature in teenager room 2, in Celsius

RH_8, Humidity in teenager room 2, in %

T9, Temperature in parents room, in Celsius

RH_9, Humidity in parents room, in %

To, Temperature outside (from Chievres weather station), in Celsius

Pressure (from Chievres weather station), in mm Hg

RH_out, Humidity outside (from Chievres weather station), in %

Wind speed (from Chievres weather station), in m/s

Visibility (from Chievres weather station), in km

Tdewpoint (from Chievres weather station), Â°C

rv1, Random variable 1, nondimensional

rv2, Random variable 2, nondimensional

### Question 12

In [9]:
#  11
x = df['T2']
y = df['T6']

linear_model = LinearRegression()
# Fitting the model to the training dataset
linear_model.fit(x.values.reshape(-1,1), y)

predicted_values= linear_model.predict(x.values.reshape(-1,1))

from sklearn.metrics import r2_score
r2_score = r2_score(y, predicted_values)
round(r2_score, 3)

0.642

### Question 13

In [11]:
df = df.drop(columns=['date','lights'], axis=1)
df

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,60,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.000000,45.566667,17.166667,...,17.033333,45.5300,6.600000,733.5,92.000000,7.000000,63.000000,5.300000,13.275433,13.275433
1,60,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.000000,45.992500,17.166667,...,17.066667,45.5600,6.483333,733.6,92.000000,6.666667,59.166667,5.200000,18.606195,18.606195
2,50,19.890000,46.300000,19.200000,44.626667,19.790000,44.933333,18.926667,45.890000,17.166667,...,17.000000,45.5000,6.366667,733.7,92.000000,6.333333,55.333333,5.100000,28.642668,28.642668
3,50,19.890000,46.066667,19.200000,44.590000,19.790000,45.000000,18.890000,45.723333,17.166667,...,17.000000,45.4000,6.250000,733.8,92.000000,6.000000,51.500000,5.000000,45.410389,45.410389
4,60,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.890000,45.530000,17.200000,...,17.000000,45.4000,6.133333,733.9,92.000000,5.666667,47.666667,4.900000,10.084097,10.084097
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,100,25.566667,46.560000,25.890000,42.025714,27.200000,41.163333,24.700000,45.590000,23.200000,...,23.200000,46.7900,22.733333,755.2,55.666667,3.333333,23.666667,13.333333,43.096812,43.096812
19731,90,25.500000,46.500000,25.754000,42.080000,27.133333,41.223333,24.700000,45.590000,23.230000,...,23.200000,46.7900,22.600000,755.2,56.000000,3.500000,24.500000,13.300000,49.282940,49.282940
19732,270,25.500000,46.596667,25.628571,42.768571,27.050000,41.690000,24.700000,45.730000,23.230000,...,23.200000,46.7900,22.466667,755.2,56.333333,3.666667,25.333333,13.266667,29.199117,29.199117
19733,420,25.500000,46.990000,25.414000,43.036000,26.890000,41.290000,24.700000,45.790000,23.200000,...,23.200000,46.8175,22.333333,755.2,56.666667,3.833333,26.166667,13.233333,6.322784,6.322784


In [26]:
scaler = MinMaxScaler()
normalised_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

# Creating features
features_df = normalised_df.drop(columns=['Appliances'])
heating_target = normalised_df['Appliances']

# Splitting the Dataset
x_train, x_test, y_train, y_test = train_test_split(features_df, heating_target, 
                                                     test_size = 0.3, random_state=42)

linear_model = LinearRegression()
# Fitting the model to the training dataset
linear_model.fit(x_train, y_train)

# MAE
predicted_values = linear_model.predict(x_test)
# MAE
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, predicted_values)
round(mae, 2)

0.05

### Question 14

In [17]:
# Residual Sum of Squares
rss = np.sum(np.square(y_test - predicted_values))
round(rss, 2)

45.35

### Question 15

In [18]:
# Root Mean Square
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, predicted_values)
rmse = np.sqrt(mse)
round(rmse, 3)

0.088

### Question 16

In [20]:
# Coefficient of determination Or R squared
from sklearn.metrics import r2_score
r2_score = r2_score(y_test, predicted_values)
round(r2_score, 3)

0.149

### Question 17

In [21]:
# Features for the lowest and Highest weights
# Using stats Model.api
import statsmodels.api as sm
x = sm.add_constant(x_train)
results = sm.OLS(y_train,x_train).fit()
results.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,Appliances,R-squared (uncentered):,0.503
Model:,OLS,Adj. R-squared (uncentered):,0.502
Method:,Least Squares,F-statistic:,558.4
Date:,"Mon, 08 Aug 2022",Prob (F-statistic):,0.0
Time:,21:52:42,Log-Likelihood:,13799.0
No. Observations:,13814,AIC:,-27550.0
Df Residuals:,13789,BIC:,-27360.0
Df Model:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
T1,-0.0152,0.020,-0.767,0.443,-0.054,0.024
RH_1,0.5385,0.028,19.321,0.000,0.484,0.593
T2,-0.2069,0.025,-8.228,0.000,-0.256,-0.158
RH_2,-0.3992,0.030,-13.448,0.000,-0.457,-0.341
T3,0.2894,0.014,19.980,0.000,0.261,0.318
RH_3,0.1004,0.016,6.092,0.000,0.068,0.133
T4,0.0328,0.012,2.643,0.008,0.008,0.057
RH_4,0.0187,0.017,1.120,0.263,-0.014,0.051
T5,-0.0301,0.014,-2.190,0.029,-0.057,-0.003

0,1,2,3
Omnibus:,9902.115,Durbin-Watson:,2.021
Prob(Omnibus):,0.0,Jarque-Bera (JB):,154535.153
Skew:,3.367,Prob(JB):,0.0
Kurtosis:,17.938,Cond. No.,1.06e+16


**From the statsmodel summary we have Lowest_weights = RH_2, Highest_weight = RH_1** 

### Question 18

In [22]:
# Ridge Regression Model
from sklearn.linear_model import Ridge
ridge_reg2 = Ridge(alpha=0.4)
ridge_reg2.fit(x_train, y_train)
predicted_ridge= ridge_reg2.predict(x_test)

from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, predicted_ridge)
rmse = np.sqrt(mse)
round(rmse, 3)

0.088

### Question 19

In [24]:
# Lasso Regression
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(x_train, y_train)

# Effects of Regularization
def get_weights_df(model, feat, col_name):
    # This function returns the weight of every feature
    weights = pd.Series(model.coef_, feat.columns).sort_values()
    weights_df = pd.DataFrame(weights).reset_index()
    weights_df.columns = ['Features', col_name]
    weights_df[col_name].round(3)
    return weights_df


lasso_weight_df = get_weights_df(lasso_reg, x_train, 'Lasso_Weight')
lasso_weight_df

Unnamed: 0,Features,Lasso_Weight
0,RH_out,-0.049557
1,RH_8,-0.00011
2,T1,0.0
3,Tdewpoint,0.0
4,Visibility,0.0
5,Press_mm_hg,-0.0
6,T_out,0.0
7,RH_9,-0.0
8,T9,-0.0
9,T8,0.0


### Question 20

In [25]:
# RMSE for Lasso Regression
from sklearn.linear_model import Lasso
lasso_reg2 = Lasso(alpha=0.001)
lasso_reg2.fit(x_train, y_train)
predicted_lasso= lasso_reg2.predict(x_test)

from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, predicted_lasso)
rmse = np.sqrt(mse)
round(rmse, 3)

0.094