# Stage B Test

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the energy dataset and discard the "date" and "lights" columns
# before any further processing.
data = pd.read_csv('energydata_complete.csv').drop(columns=["date", "lights"])

In [3]:
# Preview the first five rows of the trimmed dataset.
data.head(n=5)

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,60,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,60,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,50,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,50,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,60,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [4]:
from sklearn.preprocessing import MinMaxScaler

In [5]:
# Min-max scale every column of `data` (the target column included).
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed in a later cell, so test-row minima/maxima influence the scaling.
scaler = MinMaxScaler()
scaler.fit(data)
normalized_array = scaler.transform(data)

In [6]:
# Wrap the scaled array back into a labelled frame, then separate the
# 'Appliances' column (the prediction target) from the remaining predictors.
normalized_df = pd.DataFrame(data=normalized_array, columns=data.columns)
targets = normalized_df['Appliances']
features = normalized_df.drop('Appliances', axis=1)

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# partition identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.3,
    random_state=42,
)

In [9]:
# Question 12
# Single-feature regression data: 'T2' is the predictor and 'T6' the response.
# NOTE(review): the variable names call these "room" and "building"
# temperature -- confirm against the dataset's data dictionary.

# The predictor is reshaped into a single-column 2-D array for the estimator.
room_temp_x_train = np.reshape(x_train['T2'].values, (-1, 1))
room_temp_x_test = np.reshape(x_test['T2'].values, (-1, 1))
# Both halves of the response come from the feature frames, not y_train/y_test.
building_temp_y_train = x_train['T6']
building_temp_y_test = x_test['T6']

In [10]:
from sklearn.linear_model import LinearRegression

In [11]:
# Fit an ordinary least-squares line of T6 on T2 using the training rows.
reg = LinearRegression()
reg.fit(X=room_temp_x_train, y=building_temp_y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [12]:
# Predict T6 for the held-out rows from their T2 values, then display the result.
building_temp_y_pred = reg.predict(X=room_temp_x_test)
building_temp_y_pred

array([0.23928945, 0.46794238, 0.23108472, ..., 0.3001772 , 0.4297256 ,
       0.3217686 ])

In [13]:
from sklearn.metrics import r2_score

# Coefficient of determination of the T2 -> T6 model on the held-out rows.
r2score = r2_score(y_true=building_temp_y_test, y_pred=building_temp_y_pred)
round(r2score, 2)  # 0.64 to two decimal places

0.64

In [14]:
# Fit a linear regression of the scaled 'Appliances' target on every feature.
model = LinearRegression()
model.fit(X=x_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [15]:
# Predictions of the all-feature linear model on the held-out rows.
y_pred = model.predict(X=x_test)
y_pred

array([0.03322207, 0.24411599, 0.03400024, ..., 0.06844707, 0.10032325,
       0.05722198])

In [16]:
# Question 13
# Mean absolute error of the all-feature linear model on the test set.

from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
round(mae, 2)

0.05

In [17]:
# Question 14
# Residual sum of squares: the squared test-set residuals, summed.

rss = np.square(y_test - y_pred).sum()
round(rss, 2)

45.35

In [18]:
# Question 15
# Root mean squared error, taken as the square root of the test-set MSE.

from sklearn.metrics import mean_squared_error

rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
round(rmse, 3)

0.088

In [19]:
# Question 16
# Coefficient of determination of the all-feature model on the test set.
# This rebinds `r2score`, replacing the Question 12 value; `r2_score` itself
# was imported in an earlier cell.

r2score = r2_score(y_true=y_test, y_pred=y_pred)
round(r2score, 2)

0.15

In [20]:
# Question 17
# Fitted weights of the all-feature linear model, copied into a plain list so
# that list methods can be used on them in the following cells.

coefficients = [weight for weight in model.coef_]
coefficients

[-0.0032810511943450298,
 0.553546599838639,
 -0.23617791869490953,
 -0.45669794833849975,
 0.29062713749703495,
 0.09604827330604843,
 0.028980998554440585,
 0.026385776891802118,
 -0.015656842778628804,
 0.01600578842813373,
 0.23642490886650808,
 0.038048652058950055,
 0.010318776387943096,
 -0.04461363736108866,
 0.10199504524367482,
 -0.15759547759013134,
 -0.18994077300830736,
 -0.03980031661682142,
 -0.3218596681300848,
 0.006839326382765937,
 -0.07767065318166685,
 0.029183129198430136,
 0.01230660897681083,
 0.11775773489644273,
 0.0007700981331769012,
 0.0007700981331765222]

In [21]:
# Positions of the smallest (most negative) and largest coefficients; on ties
# the first occurrence is returned.
min_index = min(range(len(coefficients)), key=coefficients.__getitem__)
max_index = max(range(len(coefficients)), key=coefficients.__getitem__)

In [22]:
# Column name at the position of the lowest (most negative) coefficient: 'RH_2'.
features.columns.tolist()[min_index]

'RH_2'

In [23]:
# Column name at the position of the highest coefficient: 'RH_1'.
features.columns.tolist()[max_index]

'RH_1'

In [24]:
from sklearn.linear_model import Ridge

# L2-regularised linear model with penalty strength alpha = 0.4, fitted on the
# same training data as the plain linear regression.
ridge_reg = Ridge(alpha=0.4)
ridge_reg.fit(X=x_train, y=y_train)

Ridge(alpha=0.4, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [25]:
# Ridge predictions for the held-out rows.
ridge_y_pred = ridge_reg.predict(X=x_test)

In [26]:
# Question 18
# Test-set RMSE of the ridge model (`mean_squared_error` was imported in an
# earlier cell).

ridge_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=ridge_y_pred))
round(ridge_rmse, 3)

0.088

In [27]:
# Question 19
# L1-regularised linear model with penalty strength alpha = 0.001, fitted on
# the same training data as the other models.

from sklearn.linear_model import Lasso

lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(X=x_train, y=y_train)

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [28]:
# Fitted Lasso weights, one per feature column; displayed to see which were
# driven to zero.
lasso_coefficients = lasso_reg.coef_
lasso_coefficients

array([ 0.        ,  0.01787993,  0.        , -0.        ,  0.        ,
        0.        , -0.        ,  0.        , -0.        ,  0.        ,
        0.        , -0.        , -0.        , -0.        ,  0.        ,
       -0.00011004, -0.        , -0.        ,  0.        , -0.        ,
       -0.04955749,  0.00291176,  0.        ,  0.        , -0.        ,
       -0.        ])

In [29]:
# Count the features Lasso kept, i.e. the coefficients that are not exactly zero.
# np.count_nonzero applies the same `!= 0` test as an explicit Python loop, but
# vectorised and without leaving a stray loop variable in the namespace.
num_of_nonzero_coefs = int(np.count_nonzero(lasso_coefficients))

num_of_nonzero_coefs # The total number of nonzero coefficients is 4

4

In [30]:
# Lasso predictions for the held-out rows.
lasso_y_pred = lasso_reg.predict(X=x_test)

In [31]:
# Question 20
# Test-set RMSE of the Lasso model (`mean_squared_error` was imported in an
# earlier cell).

lasso_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=lasso_y_pred))
round(lasso_rmse, 3)

0.094