In [1]:
# import all necessary packages
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
# load the dataset into a pandas DataFrame
df = pd.read_csv('energydata_complete.csv')
df.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [3]:
# display numbers of rows and columns
print(f'The dataset has {df.shape[0]} rows and {df.shape[1]} columns.')

The dataset has 19735 rows and 29 columns.


In [4]:
# importing all necessary methods from sklearn
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [5]:
# create both input variable and target variable
X = df['T2'].values.reshape(-1, 1)
y = df['T6']

In [6]:
# calling LineraRegression() method and fit it on the variables (X and y)
linReg = LinearRegression()
linReg.fit(X,y)

In [7]:
# evaluating the model performance by the root mean squared error (RMSE) on the X
pred = linReg.predict(X)
lin_mse = mean_squared_error(pred, y)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

3.6444644873484573

In [8]:
# create both input variables and target variable
X = df.drop(['date', 'lights', 'Appliances'], axis = 1)
y = df['Appliances']

In [9]:
# calling MinMaxScaler() method and fit and transform it on the predictors
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [10]:
X_scaled = scaler.fit_transform(X)

In [11]:
# splitting the dataset into training set and testing set
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y , test_size= 0.3, random_state= 42)

In [12]:
# calling LineraRegression() method and fit it on the training sets
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

In [13]:
# evaluating the model performance by the mean absolute error (MAE) on the training sets
y_preds = lin_reg.predict(X_train)
lin_mae = mean_absolute_error(y_train, y_preds)
lin_mae

53.74442013710102

In [14]:
# evaluating the model performance by the root mean squared error (RMSE) on the training set
y_preds = lin_reg.predict(X_train)
lin_mse = mean_squared_error(y_train, y_preds)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

95.21569237731437

In [15]:
# evaluating the model performance by the mean absolute error (MAE) on the testing sets
y_preds = lin_reg.predict(X_test)
lin_mae = mean_absolute_error(y_test, y_preds)
lin_mae

53.644365325237374

In [16]:
# evaluating the model performance by the root mean squared error (RMSE) on the testing set
y_preds = lin_reg.predict(X_test)
lin_mse = mean_squared_error(y_test, y_preds)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

93.63882078714602

In [17]:
# calling Ridge() method and fit it on the training sets
ridge = Ridge()
ridge.fit(X_train, y_train)

In [18]:
# evaluating the model performance by the root mean squared error (RMSE) on the testing set
y_preds = ridge.predict(X_test)
ridge_mse = mean_squared_error(y_test, y_preds)
ridge_rmse = np.sqrt(ridge_mse)
ridge_rmse

93.7156533675714

In [19]:
# calling Lasso() method and fit it on the training sets
lasso = Lasso()
lasso.fit(X_train, y_train)

In [20]:
# calling the array of weights of the predictors
lasso.coef_

array([  0.        ,  37.49194311,   0.        ,  -0.        ,
         0.        ,   0.        ,  -0.        ,   0.        ,
        -0.        ,   0.        ,   0.        ,  -0.        ,
        -0.        ,  -0.        ,   0.        , -11.4994144 ,
        -0.        ,  -0.        ,   0.        ,  -0.        ,
       -52.13992414,   5.62458594,   0.        ,   0.        ,
        -0.        ,  -0.        ])

In [21]:
# evaluating the model performance by the root mean squared error (RMSE) on the testing set
y_preds = lasso.predict(X_test)
lasso_mse = mean_squared_error(y_test, y_preds)
lasso_rmse = np.sqrt(lasso_mse)
lasso_rmse

99.81294734273483