## Hamoye Stage B Quiz: Machine Learning-Regression

In [7]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [27]:
# Load the appliance-energy CSV from the working directory and preview the first rows
energy_df = pd.read_csv(filepath_or_buffer="energydata_complete.csv")
energy_df.head(5)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [29]:
# Print the column names, non-null counts and dtypes of the loaded frame
energy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9  

In [31]:
# Parse the 'date' strings into datetime64 values, then re-check the dtypes
parsed_dates = pd.to_datetime(energy_df["date"])
energy_df = energy_df.assign(date=parsed_dates)
energy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         19735 non-null  datetime64[ns]
 1   Appliances   19735 non-null  int64         
 2   lights       19735 non-null  int64         
 3   T1           19735 non-null  float64       
 4   RH_1         19735 non-null  float64       
 5   T2           19735 non-null  float64       
 6   RH_2         19735 non-null  float64       
 7   T3           19735 non-null  float64       
 8   RH_3         19735 non-null  float64       
 9   T4           19735 non-null  float64       
 10  RH_4         19735 non-null  float64       
 11  T5           19735 non-null  float64       
 12  RH_5         19735 non-null  float64       
 13  T6           19735 non-null  float64       
 14  RH_6         19735 non-null  float64       
 15  T7           19735 non-null  float64       
 16  RH_7

In [37]:
# Capitalise the timestamp column name ('date' -> 'Date') and preview the result
energy_df = energy_df.rename(columns={"date": "Date"})
energy_df.head()

Unnamed: 0,Date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [39]:
# Summary statistics (count, mean, min, quartiles, max, std) for each column
energy_df.describe()

Unnamed: 0,Date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
count,19735,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,...,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,2016-03-20 05:30:00,97.694958,3.801875,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,...,19.485828,41.552401,7.411665,755.522602,79.750418,4.039752,38.330834,3.760707,24.988033,24.988033
min,2016-01-11 17:00:00,10.0,0.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,...,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322,0.005322
25%,2016-02-14 23:15:00,50.0,0.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,...,18.0,38.5,3.666667,750.933333,70.333333,2.0,29.0,0.9,12.497889,12.497889
50%,2016-03-20 05:30:00,60.0,0.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,...,19.39,40.9,6.916667,756.1,83.666667,3.666667,40.0,3.433333,24.897653,24.897653
75%,2016-04-23 11:45:00,100.0,0.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,...,20.6,44.338095,10.408333,760.933333,91.666667,5.5,40.0,6.566667,37.583769,37.583769
max,2016-05-27 18:00:00,1080.0,70.0,26.26,63.36,29.856667,56.026667,29.236,50.163333,26.2,...,24.5,53.326667,26.1,772.3,100.0,14.0,66.0,15.5,49.99653,49.99653
std,,102.524891,7.935988,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,...,2.014712,4.151497,5.317409,7.399441,14.901088,2.451221,11.794719,4.194648,14.496634,14.496634


### Q17: Linear model on the relationship between the temperature in the living room in Celsius (x = T2) and the temperature outside the building (y = T6)

In [46]:
# Q17 inputs: T2 is the single predictor (kept 2-D for scikit-learn), T6 is the response.
t2_raw = energy_df[['T2']].to_numpy()
y1 = energy_df['T6'].to_numpy()

# Standardize the predictor to zero mean / unit variance.
# NOTE(review): the scaler is fitted on every row before the train/test split
# performed in the following cell, so test rows contribute to the mean/std.
scaler = StandardScaler().fit(t2_raw)
X1 = scaler.transform(t2_raw)
X1[0:5]

array([[-0.52041138],
       [-0.52041138],
       [-0.52041138],
       [-0.52041138],
       [-0.52041138]])

In [50]:
# Hold out 20% of the rows as the test set; fixed seed keeps the split reproducible
q17_split = train_test_split(X1, y1, test_size=0.2, random_state=1)
X1_train, X1_test, y1_train, y1_test = q17_split

In [54]:
# Fit ordinary least squares on the training rows, then predict T6 for the held-out rows
lr1 = LinearRegression().fit(X1_train, y1_train)
predictions1 = lr1.predict(X1_test)

In [60]:
# Test-set error for the T2 -> T6 model: RMSE is the square root of the MSE
q17_mse = mean_squared_error(y1_test, predictions1)
RMSE = np.sqrt(q17_mse)
r2 = r2_score(y1_test, predictions1)
print(f'RMSE: {RMSE}\nR-Squared: {r2}')

RMSE: 3.6505192908036967
R-Squared: 0.6451487946884393


### Q18: Remove the following columns: [“date”, “lights”]. The target variable is “Appliances”.

In [70]:
# Features: every column except the timestamp, 'lights' and the target itself.
# Target: 'Appliances'. Both are converted to NumPy arrays for scikit-learn.
feature_frame = energy_df.drop(columns=["Date", "lights", "Appliances"])

X = feature_frame.to_numpy()
y = energy_df["Appliances"].to_numpy()

In [76]:
# Rescale every feature column to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows of X before X is split into
# train/test in the following cell, so test rows influence the min/max used.
normalizer = MinMaxScaler().fit(X)
X = normalizer.transform(X)
X[0:5]

array([[0.32734952, 0.56618659, 0.22534529, 0.68403787, 0.21518777,
        0.74606637, 0.35135135, 0.76426234, 0.17550565, 0.38169106,
        0.3810702 , 0.84182676, 0.17059378, 0.6534279 , 0.17332926,
        0.66141193, 0.22303156, 0.67729029, 0.37299035, 0.09767442,
        0.89473684, 0.5       , 0.95384615, 0.53846154, 0.26544891,
        0.26544891],
       [0.32734952, 0.54132648, 0.22534529, 0.68213984, 0.21518777,
        0.74887054, 0.35135135, 0.78243705, 0.17550565, 0.38169106,
        0.37544268, 0.83987192, 0.17059378, 0.65106383, 0.17332926,
        0.66015536, 0.22650017, 0.67853201, 0.36923901, 0.1       ,
        0.89473684, 0.47619048, 0.89487179, 0.53393665, 0.37208289,
        0.37208289],
       [0.32734952, 0.53050179, 0.22534529, 0.67944512, 0.21518777,
        0.7555694 , 0.34474474, 0.77806231, 0.17550565, 0.38003709,
        0.36748654, 0.83070442, 0.17059378, 0.6465721 , 0.17332926,
        0.65558602, 0.21956296, 0.67604857, 0.36548767, 0.10232558,
      

In [80]:
# Hold out 30% of the rows as the test set; fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the resulting shapes as a sanity check
print(f'X_train size: {X_train.shape}; y_train size: {y_train.shape}\nX_test size: {X_test.shape}; y_test size: {y_test.shape}')

X_train size: (13814, 26); y_train size: (13814,)
X_test size: (5921, 26); y_test size: (5921,)


In [84]:
# Ordinary least squares on all scaled features, trained on the training split
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

**Questions 18 & 19**

In [95]:
# Score the fitted linear model on the rows it was trained on
predictions_train = lr.predict(X_train)
MSE_train = mean_squared_error(y_train, predictions_train)
RMSE_train = np.sqrt(MSE_train)  # derived from the MSE computed above
MAE_train = mean_absolute_error(y_train, predictions_train)
r2_train = r2_score(y_train, predictions_train)
print(f'MAE_train: {MAE_train}\nMSE_train: {MSE_train}\nRMSE_train: {RMSE_train}\nR-Squared_train: {r2_train}')

MAE_train: 53.74228107493443
MSE_train: 9066.02188107958
RMSE_train: 95.21565985214606
R-Squared_train: 0.14471942308518737


**Questions 20 & 21**

In [88]:
# Score the fitted linear model on the held-out test rows
predictions = lr.predict(X_test)
MSE = mean_squared_error(y_test, predictions)
RMSE = np.sqrt(MSE)  # derived from the MSE computed above
MAE = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print(f'MAE: {MAE}\nMSE: {MSE}\nRMSE: {RMSE}\nR-Squared: {r2}')

MAE: 53.642977655849485
MSE: 8768.535925051976
RMSE: 93.6404609399803
R-Squared: 0.14890246319303524


**Questions 23 & 25**  
Lasso (L1) and Ridge (L2) regression models with default parameters.

In [122]:
# Ridge (L2) regression with scikit-learn's default hyperparameters,
# trained on the training split
ridge = Ridge().fit(X_train, y_train)

# Ridge predictions for the held-out test rows
ridge_predictions = ridge.predict(X_test)

In [124]:
# Lasso (L1) regression with scikit-learn's default hyperparameters,
# trained on the training split
lasso = Lasso().fit(X_train, y_train)

# Lasso predictions for the held-out test rows
lasso_predictions = lasso.predict(X_test)

In [126]:
# Test-set metrics for both regularised models, grouped by metric.
# RMSE is taken as the square root of the MSE already computed.
ridge_mse = mean_squared_error(y_test, ridge_predictions)
lasso_mse = mean_squared_error(y_test, lasso_predictions)

ridge_rmse = np.sqrt(ridge_mse)
lasso_rmse = np.sqrt(lasso_mse)

ridge_mae = mean_absolute_error(y_test, ridge_predictions)
lasso_mae = mean_absolute_error(y_test, lasso_predictions)

ridge_r2 = r2_score(y_test, ridge_predictions)
lasso_r2 = r2_score(y_test, lasso_predictions)

print(f"Ridge Regression MSE: {ridge_mse}")
print(f"Lasso Regression MSE: {lasso_mse}")
print(f"Ridge Regression RMSE: {ridge_rmse}")
print(f"Lasso Regression RMSE: {lasso_rmse}")

Ridge Regression MSE: 8782.623686110795
Lasso Regression MSE: 9962.624457243555
Ridge Regression RMSE: 93.71565336757139
Lasso Regression RMSE: 99.81294734273483


**Q24: Features with non-zero feature weights using Lasso Regression**

In [131]:
# Raw coefficients and intercept of the fitted Lasso model
print('Coefficients: ', lasso.coef_)
print('Intercept: ', lasso.intercept_)

# X was converted to a NumPy array, so the coefficients carry no labels.
# Rebuild the feature names with the same column selection used to create X
# (all columns except 'Date', 'lights' and the target 'Appliances'); the order
# of lasso.coef_ follows the column order of that selection.
feature_names = energy_df.drop(columns=["Date", "lights", "Appliances"]).columns

# Keep only the features whose Lasso weight is non-zero (0.0 and -0.0 are both dropped)
nonzero_weights = pd.Series(lasso.coef_, index=feature_names).loc[lambda w: w != 0]
print(f'Number of features with non-zero weights: {len(nonzero_weights)}')
nonzero_weights

Coefficients:  [  0.          37.49194311   0.          -0.           0.
   0.          -0.           0.          -0.           0.
   0.          -0.          -0.          -0.           0.
 -11.4994144   -0.          -0.           0.          -0.
 -52.13992414   5.62458594   0.           0.          -0.
  -0.        ]
Intercept:  126.00548429102737
