## EXPLORATORY DATA ANALYSIS

### Import Libraries

In [1]:
%matplotlib inline
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

### Import Dataset

In [2]:
data = pd.read_csv(r"C:\Users\Akunna Anyamkpa\Documents\HAMOYE\PROJECT 3\energy.csv")

In [3]:
data.head(4)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389


In [4]:
data.shape

(19735, 29)

### View Columns

In [5]:
data.columns

Index(['date', 'Appliances', 'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3',
       'RH_3', 'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8',
       'RH_8', 'T9', 'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed',
       'Visibility', 'Tdewpoint', 'rv1', 'rv2'],
      dtype='object')

### Check for datatypes and missing values

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9  

In [7]:
data.isnull().sum()

date           0
Appliances     0
lights         0
T1             0
RH_1           0
T2             0
RH_2           0
T3             0
RH_3           0
T4             0
RH_4           0
T5             0
RH_5           0
T6             0
RH_6           0
T7             0
RH_7           0
T8             0
RH_8           0
T9             0
RH_9           0
T_out          0
Press_mm_hg    0
RH_out         0
Windspeed      0
Visibility     0
Tdewpoint      0
rv1            0
rv2            0
dtype: int64

## Data Profiling 

### Checking for outliers

In [8]:
data.describe()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
count,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,...,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,97.694958,3.801875,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,39.026904,...,19.485828,41.552401,7.411665,755.522602,79.750418,4.039752,38.330834,3.760707,24.988033,24.988033
std,102.524891,7.935988,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,4.341321,...,2.014712,4.151497,5.317409,7.399441,14.901088,2.451221,11.794719,4.194648,14.496634,14.496634
min,10.0,0.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,27.66,...,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322,0.005322
25%,50.0,0.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,35.53,...,18.0,38.5,3.666667,750.933333,70.333333,2.0,29.0,0.9,12.497889,12.497889
50%,60.0,0.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,38.4,...,19.39,40.9,6.916667,756.1,83.666667,3.666667,40.0,3.433333,24.897653,24.897653
75%,100.0,0.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,42.156667,...,20.6,44.338095,10.408333,760.933333,91.666667,5.5,40.0,6.566667,37.583769,37.583769
max,1080.0,70.0,26.26,63.36,29.856667,56.026667,29.236,50.163333,26.2,51.09,...,24.5,53.326667,26.1,772.3,100.0,14.0,66.0,15.5,49.99653,49.99653


In [9]:
data.lights.value_counts()

0     15252
10     2212
20     1624
30      559
40       77
50        9
70        1
60        1
Name: lights, dtype: int64

### Drop columns

In [10]:
data.drop('date', axis='columns', inplace=True)
data.drop('lights', axis='columns', inplace=True)

In [11]:
data['zscore'] = ( data.Windspeed - data.Windspeed.mean() ) / data.Windspeed.std()
data.head(5)

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2,zscore
0,60,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,...,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433,1.207663
1,60,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,...,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195,1.071676
2,50,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,...,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668,0.93569
3,50,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,...,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389,0.799703
4,60,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,...,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097,0.663716


In [12]:
data[(data.zscore<-3) | (data.zscore>3)]

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2,zscore
2661,40,19.230000,44.126667,18.500000,43.900000,19.790000,44.663333,16.600000,46.500000,16.700000,...,48.790000,8.900000,756.400000,85.0,11.500000,40.000000,6.500000,9.316703,9.316703,3.043483
2662,40,19.200000,44.090000,18.500000,43.900000,19.790000,44.663333,16.533333,46.530000,16.700000,...,48.790000,8.833333,756.300000,86.0,11.666667,40.000000,6.600000,10.221086,10.221086,3.111476
2663,50,19.200000,44.163333,18.500000,44.030000,19.790000,44.700000,16.533333,46.590000,16.700000,...,48.790000,8.766667,756.200000,87.0,11.833333,40.000000,6.700000,11.592651,11.592651,3.179470
2664,40,19.166667,44.200000,18.426667,44.030000,19.790000,44.700000,16.500000,46.590000,16.700000,...,48.900000,8.700000,756.100000,88.0,12.000000,40.000000,6.800000,20.305092,20.305092,3.247463
2665,50,19.100000,44.200000,18.390000,44.030000,19.890000,44.826667,16.500000,46.590000,16.700000,...,48.900000,8.666667,756.000000,89.0,11.833333,41.000000,6.916667,43.358476,43.358476,3.179470
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11045,110,21.390000,40.590000,18.790000,45.000000,22.390000,37.900000,20.000000,40.260000,19.790000,...,42.200000,8.850000,739.166667,82.0,12.166667,38.000000,5.950000,10.962173,10.962173,3.315456
11046,100,21.390000,40.730000,18.790000,45.000000,22.390000,37.966667,20.263333,40.626667,19.790000,...,42.090000,8.900000,739.100000,83.0,12.000000,40.000000,6.200000,9.382576,9.382576,3.247463
11047,90,21.390000,40.863333,18.890000,45.000000,22.390000,38.090000,20.530000,40.760000,19.856667,...,42.090000,8.850000,739.216667,84.5,11.833333,43.833333,6.383333,25.963221,25.963221,3.179470
11048,110,21.426667,40.860000,18.963333,45.000000,22.323333,38.090000,20.860000,40.900000,19.856667,...,41.966667,8.800000,739.333333,86.0,11.666667,47.666667,6.566667,21.774027,21.774027,3.111476


## MODEL BUILDING

### Feature Correlation

In [13]:
corr = data.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2,zscore
Appliances,1.0,0.055447,0.086031,0.120073,-0.060465,0.08506,0.036292,0.040281,0.016965,0.01976,0.006955,0.117638,-0.083178,0.025801,-0.055642,0.039572,-0.094039,0.01001,-0.051462,0.099155,-0.034885,-0.152282,0.087122,0.00023,0.015353,-0.011145,-0.011145,0.087122
T1,0.055447,1.0,0.164006,0.836834,-0.002509,0.892402,-0.02855,0.877001,0.097861,0.885247,-0.014782,0.654769,-0.615045,0.838705,0.135182,0.825413,-0.006441,0.844777,0.071756,0.682846,-0.150574,-0.345481,-0.087654,-0.07621,0.571309,-0.006203,-0.006203,-0.087654
RH_1,0.086031,0.164006,1.0,0.269839,0.797535,0.25323,0.844677,0.10618,0.880359,0.205797,0.303258,0.316141,0.245126,0.021397,0.801122,-0.030053,0.736196,0.115263,0.764001,0.340767,-0.293957,0.274126,0.204932,-0.021057,0.639106,-0.000699,-0.000699,0.204932
T2,0.120073,0.836834,0.269839,1.0,-0.16561,0.735245,0.121497,0.762066,0.231563,0.72055,0.029595,0.801186,-0.580372,0.66366,0.229212,0.578191,0.068534,0.675535,0.157346,0.792255,-0.133028,-0.505291,0.052495,-0.069721,0.582602,-0.011087,-0.011087,0.052495
RH_2,-0.060465,-0.002509,0.797535,-0.16561,1.0,0.137319,0.678326,-0.047304,0.721435,0.110409,0.250271,-0.00967,0.389933,-0.051422,0.690584,-0.041023,0.679777,0.054544,0.676467,0.033674,-0.255646,0.584911,0.06919,-0.005368,0.499152,0.006275,0.006275,0.06919
T3,0.08506,0.892402,0.25323,0.735245,0.137319,1.0,-0.011234,0.852778,0.122737,0.888169,-0.066355,0.686882,-0.647672,0.847374,0.172624,0.795283,0.044427,0.901324,0.134602,0.699417,-0.189974,-0.281718,-0.100776,-0.10231,0.645886,-0.005194,-0.005194,-0.100776
RH_3,0.036292,-0.02855,0.844677,0.121497,0.678326,-0.011234,1.0,-0.140457,0.898978,-0.050062,0.375422,0.076833,0.514912,-0.25009,0.832685,-0.283228,0.828822,-0.19527,0.833538,0.118207,-0.233274,0.356192,0.263188,0.017041,0.414387,-0.000477,-0.000477,0.263188
T4,0.040281,0.877001,0.10618,0.762066,-0.047304,0.852778,-0.140457,1.0,-0.04865,0.871813,-0.076489,0.65235,-0.703149,0.877763,0.043527,0.796256,-0.095192,0.889439,-0.025549,0.663478,-0.075292,-0.388602,-0.185747,-0.104768,0.519471,-0.001815,-0.001815,-0.185747
RH_4,0.016965,0.097861,0.880359,0.231563,0.721435,0.122737,0.898978,-0.04865,1.0,0.091812,0.352591,0.259047,0.392178,-0.131204,0.894301,-0.167066,0.847259,-0.044518,0.856591,0.293289,-0.250748,0.336813,0.300192,0.002636,0.616509,-0.001787,-0.001787,0.300192
T5,0.01976,0.885247,0.205797,0.72055,0.110409,0.888169,-0.050062,0.871813,0.091812,1.0,0.032786,0.629161,-0.632384,0.870624,0.148905,0.824981,0.016388,0.911055,0.072308,0.651321,-0.170999,-0.273953,-0.145011,-0.084164,0.588362,-0.00549,-0.00549,-0.145011


### Feature Engineering

In [14]:
# INPUT-FEATURES_INDEPENDENT VARIABLE
#x = data.drop(columns = ['Appliances']) # When input is all features except appliances
x = data[['RH_1','T2','T3','T6','T_out','Windspeed','RH_8','RH_6']]

# OUTPUT FEATURE_DEPENDANT VARIABLE 
y = data['Appliances'] #target variable 

In [15]:
x.columns

Index(['RH_1', 'T2', 'T3', 'T6', 'T_out', 'Windspeed', 'RH_8', 'RH_6'], dtype='object')

### Feature Scaling

In [16]:
Scaler = StandardScaler() 

scaled = Scaler.fit_transform(x) 
scaled  

array([[ 1.84382063, -0.52041138, -1.23506343, ...,  1.20769354,
         1.14157236,  0.95179837],
       [ 1.61680671, -0.52041138, -1.23506343, ...,  1.07170342,
         1.13455378,  0.94559165],
       [ 1.51795932, -0.52041138, -1.23506343, ...,  0.93571331,
         1.10903167,  0.91648425],
       ...,
       [ 1.59251371,  2.41110339,  2.38397142, ..., -0.15220764,
         1.28704839, -1.72105219],
       [ 1.6913611 ,  2.31325594,  2.30421308, ..., -0.08421258,
         1.2600109 , -1.72105219],
       [ 1.5933514 ,  2.24498422,  2.27098043, ..., -0.01621752,
         1.30159599, -1.72105219]])

### Split to train and test data

In [17]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

### Linear Regressor

In [18]:
# Define the range of hyperparameters
params = {
    'fit_intercept': [True, False],
    'copy_X': [True, False],
}
# Create a linear regression object
lr = LinearRegression()

# Perform GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(lr, param_grid=params, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(x_train, y_train)

# Print the best hyperparameters and the corresponding MSE
print("Best hyperparameters: ", grid_search.best_params_)

Best hyperparameters:  {'copy_X': True, 'fit_intercept': True}


In [19]:
# Train the model
lr = LinearRegression(copy_X= True, fit_intercept= True)

In [20]:
lr.fit(x_train, y_train)

In [22]:
# Prediction
predicted = lr.predict(x_test)
predicted[0]

64.11202597405389

In [23]:
print(r2_score(y_test, predicted).round(2)*100,"%") # R-Squared Evaluation Technique 

8.0 %


### Gradient Boosting Regressor

In [24]:
from sklearn.ensemble import GradientBoostingRegressor

gbt = GradientBoostingRegressor(learning_rate=0.1)

y_train_col = y_train.ravel()

gbt = gbt.fit(x_train, y_train_col)

In [25]:
# Accuracy score
gbt.score(x_test, y_test)

0.17643108825654885

### Support Vector Machine

In [26]:
from sklearn.svm import SVR

sv = SVR(kernel = 'rbf')

y_train_col = y_train.ravel()

sv.fit(x_train, y_train_col)

In [27]:
# Accuracy score
sv.score(x_test, y_test)

-0.05575408127021819

### Decision Tree Regressor

In [28]:
param_grid = {'max_depth': [2, 4, 6, 8, 10],
              'min_samples_leaf': [1, 2, 4, 6, 8]}

dt = DecisionTreeRegressor()

grid_search = GridSearchCV(dt, param_grid, cv=5)

grid_search.fit(x_train, y_train)

# Get the best set of hyperparameters
print("Best parameters: ", grid_search.best_params_)

Best parameters:  {'max_depth': 10, 'min_samples_leaf': 4}


In [29]:
dt = DecisionTreeRegressor(max_depth= 10, min_samples_leaf= 4)

dt.fit(x_train,y_train)

In [30]:
pred_DTG = dt.predict(x_test)
pred_DTG 

array([ 54.67105263, 278.33333333,  48.56060606, ...,  88.50152905,
        69.71209213,  78.29268293])

In [31]:
print(r2_score(y_test, pred_DTG).round(2)*100,"%") # R-Squared Evaluation Technique 

20.0 %


In [None]:
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestRegressor()

# Create a grid search object
grid_search = GridSearchCV(rf, param_grid, cv=5)

y_train_col = y_train.ravel()

# Fit the grid search to the training data
grid_search.fit(x_train, y_train_col)

# Get the best set of hyperparameters
print("Best parameters: ", grid_search.best_params_)

# Get the best random forest regression model
best_rf_reg = grid_search.best_estimator_

In [33]:
rf = RandomForestRegressor(max_depth= None, min_samples_leaf= 1, min_samples_split= 2, n_estimators= 100)
rf.fit(x_train,y_train)

In [34]:
pred_RFG = rf.predict(x_test)
pred_RFG

array([ 53.3, 119.5,  49.8, ...,  61.6,  67.1,  71. ])

In [35]:
# Mean square error (MSE) is the average of the square of the errors. The larger the number the larger the error.
mse = mean_squared_error(y_test,pred_RFG)
rmse = np.sqrt(mse)
rmse

67.12286821603725

In [36]:
# The r2 score varies between 0 and 100%
print(r2_score(y_test, pred_RFG).round(2)*100,"%") # R-Squared Evaluation Technique 

55.00000000000001 %


In [38]:
best = rf

In [39]:
import pickle 

pickle.dump(dt, open("energy.pkl","wb"))