# Building a binary classifier model with open-source energy dataset

### Energy dataset is extracted from UCI database (see PDF file)

### 1. Import Python libraries

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

### 2. Import dataset, assigning x-features and y-target, followed by data pre-processing

In [2]:
dataset = pd.read_csv('datasets/ENB2012_data.csv')
for row_number in range(len(dataset)):
    if math.isnan(float(dataset['X1'][row_number])):
        dataset.drop([row_number], inplace=True)
dataset.reset_index(inplace=True)

# There are 2 y-targets. Let's focus on the heating load to demonstrate the analysis.
standard_scaler = StandardScaler()
x_feature = dataset.iloc[:,1:9]
x_feature_scaled = standard_scaler.fit_transform(x_feature)
y_target = dataset.iloc[:,9]
print(x_feature_scaled)
print('')
print(y_target)
print('')
threshold_value = np.average(y_target)

# for heating load <= threshold value, assign instance to class 1, else assign to class 2
y_target_class = [1if value <= threshold_value else 2 for value in y_target]
x_feature_scaled = np.array(x_feature_scaled).reshape(len(x_feature_scaled),-1)
y_target = np.array(y_target_class).reshape(len(y_target_class),-1)

[[ 2.04177671 -1.78587489 -0.56195149 ... -1.34164079 -1.76044698
  -1.81457514]
 [ 2.04177671 -1.78587489 -0.56195149 ... -0.4472136  -1.76044698
  -1.81457514]
 [ 2.04177671 -1.78587489 -0.56195149 ...  0.4472136  -1.76044698
  -1.81457514]
 ...
 [-1.36381225  1.55394308  1.12390297 ... -0.4472136   1.2440492
   1.41133622]
 [-1.36381225  1.55394308  1.12390297 ...  0.4472136   1.2440492
   1.41133622]
 [-1.36381225  1.55394308  1.12390297 ...  1.34164079  1.2440492
   1.41133622]]

0      15.55
1      15.55
2      15.55
3      15.55
4      20.84
5      21.46
6      20.71
7      19.68
8      19.50
9      19.95
10     19.34
11     18.31
12     17.05
13     17.41
14     16.95
15     15.98
16     28.52
17     29.90
18     29.63
19     28.75
20     24.77
21     23.93
22     24.77
23     23.93
24      6.07
25      6.05
26      6.01
27      6.04
28      6.37
29      6.40
       ...  
738    41.09
739    40.79
740    38.82
741    39.72
742    39.31
743    39.86
744    14.41
745    14.19
746

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


### 3. Splitting dataset into training and testing

In [3]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_feature_scaled,
                                                    y_target,
                                                    test_size = 0.20,
                                                    random_state = 42)
print(x_train)
print('')
print(x_test)
print('')
print(y_train)
print('')
print(y_test)
print('')

[[ 0.52818162 -0.67260223  0.         ... -1.34164079 -1.00932293
  -1.16939287]
 [-1.17461286  1.27562492  0.56195149 ...  0.4472136   1.2440492
  -0.5242106 ]
 [ 0.90658039 -0.95092039 -0.56195149 ...  0.4472136   0.11736313
  -0.5242106 ]
 ...
 [-0.51241501  0.44067043 -1.12390297 ...  0.4472136  -1.00932293
   1.41133622]
 [ 2.04177671 -1.78587489 -0.56195149 ...  1.34164079  0.11736313
   0.76615395]
 [ 1.28497917 -1.22923856  0.         ...  0.4472136  -1.00932293
  -0.5242106 ]]

[[-1.36381225  1.55394308  1.12390297 ... -1.34164079  1.2440492
   0.12097168]
 [-0.98541347  0.99730676  0.         ... -1.34164079  0.11736313
  -1.16939287]
 [ 2.04177671 -1.78587489 -0.56195149 ... -1.34164079  1.2440492
   0.12097168]
 ...
 [ 0.90658039 -0.95092039 -0.56195149 ... -1.34164079  0.11736313
  -0.5242106 ]
 [ 0.90658039 -0.95092039 -0.56195149 ... -1.34164079  0.11736313
  -1.16939287]
 [-0.51241501  0.44067043 -1.12390297 ...  0.4472136   0.11736313
   0.76615395]]

[[2]
 [1]
 [2]
 [

### 4. Building binary classifier for class 1 or 2

In [4]:
# training SGDClassifier model
sgd_classifier = SGDClassifier(random_state=42)
sgd_classifier.fit(x_train,y_train)
pred_results_sgd_classifier = sgd_classifier.predict(x_test)

# training LogisticRegression model
logistic_model = LogisticRegression(solver='newton-cg', 
                                    max_iter=500)
# (‘lbfgs’, ‘newton-cg’, ‘liblinear’, ‘sag’, ‘saga’ ----> different types of solver)
logistic_model.fit(x_train,y_train)
pred_results_logistic_classifier = logistic_model.predict(x_test)

visualise_table = pd.DataFrame(columns=['Actual Class (Test dataset)',
                                        'Predicted Class - SGD classifier',
                                        'Predicted Class - logistic classifier'])
visualise_table['Actual Class (Test dataset)'] = [y_test[i][0] for i in range(len(y_test))]
visualise_table['Predicted Class - SGD classifier'] = pred_results_sgd_classifier
visualise_table['Predicted Class - logistic classifier'] = pred_results_logistic_classifier
visualise_table

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Unnamed: 0,Actual Class (Test dataset),Predicted Class - SGD classifier,Predicted Class - logistic classifier
0,1,1,1
1,1,1,1
2,2,2,2
3,2,2,2
4,1,1,1
5,2,2,2
6,2,2,2
7,2,2,2
8,1,1,1
9,2,2,2


### 5. Performance evaluation

#### a. Cross-Validation

In [5]:
number_of_folds = 20
score_1 = cross_val_score(sgd_classifier,
                         x_train,
                         y_train,
                         cv = number_of_folds,
                         scoring = 'accuracy') # cv = 5 means splitting the training dataset into 5 folds, then
# making predictions and evaluating them on each of the 5 folds using the model 
# which is trained on the remaining folds, i.e. 4 folds
score_2 = cross_val_score(logistic_model,
                         x_test,
                         y_test,
                         cv = number_of_folds,
                         scoring = 'accuracy')
score_1 = [round(i,3) *100 for i in (score_1)]
score_2 = [round(j,3) *100 for j in (score_2)]

CV_table = pd.DataFrame(columns=['Fold number','Accuracy (%) for SGD classifier','Accuracy (%) for logistic classifier'])
CV_table['Fold number'] = [i+1 for i in range (number_of_folds)]
CV_table['Accuracy (%) for SGD classifier'] = score_1
CV_table['Accuracy (%) for logistic classifier'] = score_2
CV_table

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Unnamed: 0,Fold number,Accuracy (%) for SGD classifier,Accuracy (%) for logistic classifier
0,1,100.0,100.0
1,2,96.9,100.0
2,3,100.0,100.0
3,4,100.0,100.0
4,5,100.0,100.0
5,6,100.0,100.0
6,7,93.5,100.0
7,8,96.8,100.0
8,9,96.8,100.0
9,10,100.0,87.5


#### b. Confusion Matrix

In [6]:
y_pred_SGD_classifier = cross_val_predict(sgd_classifier,
                         x_train,
                         y_train,
                         cv = number_of_folds)
y_pred_logistic_classifier = cross_val_predict(logistic_model,
                         x_train,
                         y_train,
                         cv = number_of_folds)
score_3 = confusion_matrix(y_train,y_pred_SGD_classifier)
score_4 = confusion_matrix(y_train,y_pred_logistic_classifier)
print(score_3)
print('')
print(score_4)
print('')
# first row represents negative class, i.e. non-8s images (different from powerpoint slides)
# second row represents positive class, i.e. 8s images (different from powerpoint slides)

TN_SGD = score_3[0][0]
FP_SGD = score_3[0][1]
FN_SGD = score_3[1][0]
TP_SGD = score_3[1][1]
print('The estimated TN, TP, FN and TP values for the predictions from the SGD classifier are: '  
      + str(TN_SGD) + ', '+ str(FP_SGD) + ', '+ str(FN_SGD) + ' and '+ str(TP_SGD) + ' respectively.')
print('')
TN_LOG = score_4[0][0]
FP_LOG = score_4[0][1]
FN_LOG = score_4[1][0]
TP_LOG = score_4[1][1]
print('The estimated TN, TP, FN and TP values for the predictions from the logistic classifier are: ' 
      + str(TN_LOG) + ', '+ str(FP_LOG) + ', '+ str(FN_LOG) + ' and '+ str(TP_LOG) + ' respectively.')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[[316   9]
 [  2 287]]

[[318   7]
 [  4 285]]

The estimated TN, TP, FN and TP values for the predictions from the SGD classifier are: 316, 9, 2 and 287 respectively.

The estimated TN, TP, FN and TP values for the predictions from the logistic classifier are: 318, 7, 4 and 285 respectively.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


#### c. Precision, Recall, F1 score

In [7]:
precision_SGD = TP_SGD/(TP_SGD+FP_SGD)
recall_SGD = TP_SGD/(TP_SGD+FN_SGD)
F1_SCORE_SGD = 2* ((precision_SGD*recall_SGD)/(precision_SGD+recall_SGD))
print('Precision value for SGD classifier is '+ str(round(precision_SGD,3))+'.')
print('')
print('Recall value for SGD classifier is '+str(round(recall_SGD,3))+'.')
print('')
print('F1 score for SGD classifier is '+str(round(F1_SCORE_SGD,3))+'.')
print('')
print('-------------------------------------------------------------')
print('')
precision_logistic = TP_LOG/(TP_LOG+FP_LOG)
recall_logistic = TP_LOG/(TP_LOG+FN_LOG)
F1_SCORE_logistic = 2* ((precision_logistic*recall_logistic)/(precision_logistic+recall_logistic))
print('Precision value for logistic classifier is '+ str(round(precision_logistic,3))+'.')
print('')
print('Recall value for logistic classifier is '+str(round(recall_logistic,3))+'.')
print('')
print('F1 score for logistic classifier is '+str(round(F1_SCORE_logistic,3))+'.')
print('')

Precision value for SGD classifier is 0.97.

Recall value for SGD classifier is 0.993.

F1 score for SGD classifier is 0.981.

-------------------------------------------------------------

Precision value for logistic classifier is 0.976.

Recall value for logistic classifier is 0.986.

F1 score for logistic classifier is 0.981.

