In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [3]:
df=pd.read_csv('energydata_complete.csv')

### A linear model on the relationship between the temperature in the living room in Celsius (x = T2) and the temperature outside the building (y = T6)

In [4]:
t2 = df['T2']
t6 = df['T6']
t2 = np.array(t2)
t6 = np.array(t6)

#reshaping the data so it can fit the model 
t2 = t2.reshape((-1, 1))
t6 = t6.reshape((-1, 1))

In [5]:
from sklearn.linear_model import LinearRegression
linear_model1 = LinearRegression()
linear_model1.fit(t2, t6)

LinearRegression()

## Calculating the value of the R squared to 2 decimal places

In [6]:
r_squared = linear_model1.score(t2,t6)

round(r_squared, 2 )

0.64

## Dropping the Date and lights Columns

In [7]:
df = df.drop(columns=['date', 'lights'])


### Scaling the Data and dropping the target and Reatining the feauture variables

In [8]:
#Firstly, we normalise our dataset to a common scale using the min max scaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

normalised_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
#Declaring our target variable and retaining the feautures variables
features_df = normalised_df.drop(columns=['Appliances'])
Appliances_target = normalised_df[ 'Appliances' ]

### Splitting the dataset into training and testing data

In [9]:
#Now, we split our dataset into the training and testing dataset. Recall that we
#had earlier segmented the features and target variables.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features_df, Appliances_target,
test_size= 0.3 , random_state= 1 )

### Buiding the linear Regression model with the traing data

In [10]:
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()
#fit the model to the training dataset
linear_model.fit(x_train, y_train)
#obtain predictions
predicted_values = linear_model.predict(x_test)

#### The Mean Absolute Error in 2 decimal places is 0.05

In [11]:
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, predicted_values)
round(mae, 2 )

0.05

### The Residual Sum of Squares is 46.097

In [12]:
import numpy as np
#rss = np.sum(np.square(predicted_values - y_test ))
rss = np.sum(np.square(y_test - predicted_values ))
round(rss, 3 )

46.097

### Root Mean Squared Error (in three decimal places) is 0.088

In [13]:

from sklearn.metrics import mean_squared_error
rmse = np. sqrt(mean_squared_error(y_test,predicted_values))
round(rmse, 3 )

0.088

### Coefficient of Determination(r squared) in two decimal places is 0.16

In [14]:
from sklearn.metrics import r2_score
r2_score = r2_score(y_test, predicted_values)
round(r2_score, 2 )

0.16

### Lasso Regression 

In [15]:
#● Feature Selection and Lasso Regression
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha= 0.001 )
lasso_reg.fit(x_train, y_train)

Lasso(alpha=0.001)

### Ridge Regression with alpha vaue 0.4

In [16]:
#● Ridge Regression
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha= 0.4 )
ridge_reg.fit(x_train, y_train)

Ridge(alpha=0.4)

In [29]:
pred_train_lasso  = lasso_reg.predict(x_test)


### The function that calculates the weights of the feautures

In [17]:
#comparing the effects of regularisation
def get_weights_df(model, feat, col_name) :
#this function returns the weight of every feature
    weights = pd.Series(model.coef_, feat.columns).sort_values()
    weights_df = pd.DataFrame(weights).reset_index()
    weights_df.columns = [ 'Features' , col_name]
    weights_df[col_name].round( 3 )
    return weights_df

In [18]:
linear_model_weights = get_weights_df(linear_model, x_train, 'Linear_Model_Weight' )
ridge_weights_df = get_weights_df(ridge_reg, x_train, 'Ridge_Weight' )
lasso_weights_df = get_weights_df(lasso_reg, x_train, 'Lasso_weight' )
final_weights = pd.merge(linear_model_weights, ridge_weights_df, on= 'Features' )
final_weights = pd.merge(final_weights, lasso_weights_df, on= 'Features' )

## Obtaining the Feauture Weights from linear model 

In [19]:
linear_model_weights

Unnamed: 0,Features,Linear_Model_Weight
0,RH_2,-0.458176
1,T_out,-0.330375
2,T2,-0.249801
3,T9,-0.204718
4,RH_8,-0.156534
5,RH_out,-0.076223
6,RH_7,-0.047614
7,RH_9,-0.036034
8,rv1,-0.00035
9,rv2,-0.00035


### From the table above the lowest and highest Features are RH2 and RH1 respectively

####          Training a ridge regression model with an alpha value of 0.4. Is there any change to the root mean squared error (RMSE) when evaluated on the test set

In [31]:
pred_train_ridge  = ridge_reg.predict(x_test)

In [32]:

from sklearn.metrics import mean_squared_error
rmse = np. sqrt(mean_squared_error(y_test,pred_train_ridge))
round(rmse, 3 )

0.088

### No, there is no change to the root mean square (0.088)

### Lasso Regression model

In [22]:
lasso_weights_df

Unnamed: 0,Features,Lasso_weight
0,RH_out,-0.047062
1,T1,0.0
2,Tdewpoint,0.0
3,Visibility,0.0
4,Press_mm_hg,-0.0
5,T_out,0.0
6,RH_9,-0.0
7,T9,-0.0
8,RH_8,-0.0
9,T8,0.0


### 4 feautures have non zero feauture weights

### The new RMSE with the LASSO regression would be :   0.095

In [30]:
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test,pred_train_lasso))
round(rmse, 3 )
#rmse

0.095

In [25]:
final_weights

Unnamed: 0,Features,Linear_Model_Weight,Ridge_Weight,Lasso_weight
0,RH_2,-0.458176,-0.412703,-0.0
1,T_out,-0.330375,-0.269159,0.0
2,T2,-0.249801,-0.214797,8.4e-05
3,T9,-0.204718,-0.203381,-0.0
4,RH_8,-0.156534,-0.155889,-0.0
5,RH_out,-0.076223,-0.052736,-0.047062
6,RH_7,-0.047614,-0.048777,-0.0
7,RH_9,-0.036034,-0.037753,-0.0
8,rv1,-0.00035,-0.000354,-0.0
9,rv2,-0.00035,-0.000354,-0.0
