In [1]:
import pandas as pd
import seaborn as sb
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
# Load energydata.csv (path is relative to the notebook's working directory)
# into the DataFrame used by the cells below.
csv_path = 'energydata.csv'
df = pd.read_csv(csv_path)

In [4]:
# Question 17:
# Fit a linear model on the relationship between the temperature in the
# living room (x = T2) and the temperature outside the building (y = T6).

# Predictor as a single-column 2-D array, response as a 1-D Series.
x = df['T2'].to_numpy().reshape(-1, 1)
y = df['T6']

# No train/test split in this cell: the model is fitted and scored on the
# full dataset, so the error reported below is an in-sample error.
linear_reg = LinearRegression().fit(x, y)

# Predictions for the same rows the model was fitted on
y_pred = linear_reg.predict(x)

# Mean absolute error between the observed and the predicted T6
mae = mean_absolute_error(y_true=y, y_pred=y_pred)

print('The mean absolute error is: {:.2f}'.format(mae))

The mean absolute error is: 2.84


In [5]:
# Question 18
# Fit a linear regression on all remaining features and report the training MAE.

# Remove the date and lights columns from the dataset.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so a
# second execution would otherwise raise KeyError on the already-dropped columns.
df = df.drop(columns=['date', 'lights'], errors='ignore')

# Predictors are every remaining column except Appliances; Appliances is the response
df_feature = df.drop(columns='Appliances')
target_feature = df['Appliances']

# Hold out 30% of the rows as a test set; the fixed random_state makes the split repeatable
x_train, x_val, y_train, y_val = train_test_split(df_feature, target_feature,
                                                  test_size=0.3, random_state=42)

# Normalize the features: the scaler is fitted on the training set only and
# then applied unchanged to the test set.
# The original row index is carried over so the scaled frames stay aligned
# with y_train / y_val (rebuilding the frame without it resets the index).
scaler = MinMaxScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train),
                       columns=x_train.columns, index=x_train.index)
x_val = pd.DataFrame(scaler.transform(x_val),
                     columns=x_val.columns, index=x_val.index)

# Fit the model to the training set
linear_reg = LinearRegression()
linear_reg.fit(x_train, y_train)

# Predict the training set
y_pred = linear_reg.predict(x_train)

# Evaluate the performance of the model on the training set
mae = mean_absolute_error(y_train, y_pred)

print('The mean absolute error for training set is: {:.3f}'.format(mae))

The mean absolute error for training set is: 53.742


In [6]:
# Question 19
# What is the Root Mean Squared Error (in three decimal places) for the training set?

# Recompute the training-set predictions in this cell: `y_pred` is reassigned
# by other cells (test-set and other-model predictions), so relying on the
# leftover value makes this cell fail or report the wrong number whenever the
# cells are executed out of order.
y_pred = linear_reg.predict(x_train)

# RMSE is the square root of the mean squared error
rmse = np.sqrt(mean_squared_error(y_train, y_pred))

print('The root mean square error for training set is: {:.3f}'.format(rmse))

The root mean square error for training set is: 95.216


In [7]:
# Question 20
# What is the Mean Absolute Error (in three decimal places) for test set?

# Score the fitted linear model on the held-out test features
y_pred = linear_reg.predict(x_val)
mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)

message = 'The mean absolute error is for test set: {:.3f}'.format(mae)
print(message)

The mean absolute error is for test set: 53.643


In [8]:
# Question 21
# What is the Root Mean Squared Error (in three decimal places) for test set?

# Test-set predictions of the linear model (recomputed so the cell does not
# depend on a leftover `y_pred`)
y_pred = linear_reg.predict(x_val)

# RMSE = square root of the mean squared error on the test set
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
rmse = np.sqrt(mse)

print('The root mean square error for test set is: {:.3f}'.format(rmse))


The root mean square error for test set is: 93.640


Question 22

Did the model above overfit to the training set?

Overfitting shows up as a training error that is much lower than the validation (test) error.

MAE on training set = 53.742

MAE on test set = 53.643

Difference between the errors = 0.099

The two errors are practically identical (the test error is even slightly lower), so the model shows no sign of overfitting.


In [9]:
# Question 23
# Train a ridge regression model with alpha = 0.5 and report its RMSE on the test set.
# NOTE: alpha = 0.5 is set explicitly here; it is not scikit-learn's default
# for Ridge (alpha = 1.0).
ridge_reg = Ridge(alpha=0.5).fit(x_train, y_train)

# Predict the held-out test set with the ridge model
y_pred = ridge_reg.predict(x_val)

# RMSE = square root of the mean squared error on the test set
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
rmse = np.sqrt(mse)

print('The root mean square error for test set is: {:.3f}'.format(rmse))

The root mean square error for test set is: 93.667


Linear Regression RMSE = 93.640

Ridge Regression RMSE = 93.667.

Error difference = 0.027

The two errors are approximately equal: ridge regularization (alpha = 0.5) leaves the test RMSE practically unchanged.

In [10]:
# Question 24:
# How many features have non-zero feature weights from lasso regression?

# Fit a Lasso model (alpha = 0.001) to the training set
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(x_train, y_train)

# Fitted coefficients, one per feature column
weight = lasso_reg.coef_

# Tabulate the weights next to their feature names, ordered from the
# smallest to the largest weight
weight_df = pd.DataFrame({'feature_weight': weight, 'features': x_train.columns})
weight_df = weight_df.sort_values(by='feature_weight')
weight_df = weight_df.reset_index()  # pre-sort positions are kept in an 'index' column

# Keep only the rows whose weight is not exactly zero
is_non_zero = weight_df['feature_weight'].ne(0)
non_zero_weight = weight_df[is_non_zero]

print('Number of non_zero_weight: {}'.format(len(non_zero_weight)))

Number of non_zero_weight: 26


In [11]:
# Question 25:
# What is the new RMSE with the Lasso Regression on the test set?

# Test-set predictions of the fitted Lasso model
y_pred = lasso_reg.predict(x_val)

# RMSE = square root of the mean squared error on the test set
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
rmse = np.sqrt(mse)

print('The root mean square error for test set is : {:.3f}'.format(rmse))


The root mean square error for test set is : 93.641
