# Machine Learning: Classification - Stability of the Grid System

In [1]:
# Electrical grids require a balance between electricity supply and demand in order to be stable. Conventional systems achieve 
# this balance through demand-driven electricity production. For future grids with a high share of inflexible (i.e., renewable)
# energy sources, the concept of demand response is a promising solution. This implies changes in electricity consumption in 
# relation to electricity price changes. In this work, we’ll build a binary classification model to predict if a grid is stable 
# or unstable using the UCI Electrical Grid Stability Simulated dataset.

#Dataset: https://archive.ics.uci.edu/ml/datasets/Electrical+Grid+Stability+Simulated+Data+

In [2]:
# import the relevant libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LeaveOneOut

In [5]:
from imblearn.over_sampling import SMOTE

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [10]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [11]:
from sklearn import metrics

## Loading the raw data

In [12]:
# read the UCI grid-stability data; the CSV path is relative to the working directory
df = pd.read_csv(filepath_or_buffer='Data_for_UCI_named.csv')

In [13]:
# preview the first five rows
df.head(n=5)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


## Data Cleaning

In [14]:
# count how many samples belong to each stability class

df.loc[:, 'stabf'].value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [15]:
# number of missing values in each column

df.isna().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

## Train test split

In [16]:
# Separate the predictors from the targets.
# 'stab' and 'stabf' encode the same information ('stabf' is 'stable' when
# 'stab' <= 0 and 'unstable' otherwise), so both are removed from the features
# and only 'stabf' is kept as the dependent variable.
x = df.drop(['stab', 'stabf'], axis=1)
y = df['stabf']

# hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [17]:
# check how many 'stable' / 'unstable' samples ended up in the training split

y_train.value_counts()

unstable    5092
stable      2908
Name: stabf, dtype: int64

## Standardize the variables

In [18]:
# Standardise the features with StandardScaler. The scaler is fitted on the
# training split only; the test split is transformed later with these same
# fitted statistics.
scaler = StandardScaler()
normalised_train_array = scaler.fit_transform(X_train)

# Wrap the scaled array back into a DataFrame. X_train's row index is passed
# through so the frame stays label-aligned with y_train; without it pandas
# would assign a fresh 0..n-1 index that no longer matches the target Series.
normalised_train_df = pd.DataFrame(
    normalised_train_array,
    columns=X_train.columns,
    index=X_train.index,
)

In [19]:
# Scale the test split with the scaler fitted on the training data
# (transform only, no re-fitting).
normalised_test_array = scaler.transform(X_test)

# Create a DataFrame for the scaled test features. X_test's row index is kept
# so the frame stays label-aligned with y_test instead of receiving a fresh
# 0..n-1 index.
normalised_test_df = pd.DataFrame(
    normalised_test_array,
    columns=X_test.columns,
    index=X_test.index,
)

## Building a random forest classifier model (RFC)

### Training and predicting

In [20]:
# random forest with default hyperparameters, seeded for repeatability and
# fitted on the scaled training data (fit() returns the estimator itself)
rfc = RandomForestClassifier(random_state=1).fit(normalised_train_df, y_train)

# predicted class labels for the scaled test set
predictions_rfc = rfc.predict(normalised_test_df)

### Feature importance (RFC)

In [21]:
# Rank the features by the importance the random forest assigned to them.
# The (importance, feature name) pairs are stored as a sorted list rather than
# a bare zip object, which sorted() would exhaust, so featimp_rfc remains
# usable in later cells.

featimp_rfc = sorted(zip(rfc.feature_importances_, x.columns), reverse=True)
featimp_rfc

[(0.12630691876118694, 'tau2'),
 (0.12443249059568204, 'tau1'),
 (0.12205745554259492, 'tau3'),
 (0.12140144616612955, 'tau4'),
 (0.10032539728386598, 'g3'),
 (0.09768940997668532, 'g2'),
 (0.09704814283764611, 'g4'),
 (0.0907647860703433, 'g1'),
 (0.031411259083211865, 'p2'),
 (0.029955735253050415, 'p3'),
 (0.029474690454250125, 'p4'),
 (0.029132267975353385, 'p1')]

### Evaluation (RFC)

In [22]:
# per-class precision, recall and F1 on the test set, printed to 4 decimals
print(
    metrics.classification_report(y_true=y_test, y_pred=predictions_rfc, digits=4)
)

              precision    recall  f1-score   support

      stable     0.9191    0.8778    0.8980       712
    unstable     0.9341    0.9573    0.9456      1288

    accuracy                         0.9290      2000
   macro avg     0.9266    0.9176    0.9218      2000
weighted avg     0.9288    0.9290    0.9286      2000



In [23]:
# rows are the true classes, columns the predicted classes
print(
    metrics.confusion_matrix(y_true=y_test, y_pred=predictions_rfc)
)

[[ 625   87]
 [  55 1233]]


### Accuracy (RFC)

In [24]:
# share of test samples whose predicted label equals the true label
accuracy_rfc = metrics.accuracy_score(y_true=y_test, y_pred=predictions_rfc)

# shown as a percentage rounded to two decimals
print('Accuracy: {}'.format(
    round(accuracy_rfc * 100, 2)
))

Accuracy: 92.9


### Precision (RFC)

In [25]:
# precision with 'stable' treated as the positive class
precision_rfc = metrics.precision_score(
    y_true=y_test, y_pred=predictions_rfc, pos_label='stable'
)

# shown as a percentage rounded to two decimals
print('Precision: {}'.format(
    round(precision_rfc * 100, 2)
))

Precision: 91.91


### Recall (RFC)

In [26]:
# recall with 'stable' treated as the positive class
recall_rfc = metrics.recall_score(
    y_true=y_test, y_pred=predictions_rfc, pos_label='stable'
)

# shown as a percentage rounded to two decimals
print('Recall: {}'.format(
    round(recall_rfc * 100, 2)
))

Recall: 87.78


### F1-Score (RFC)

In [27]:
# F1 score with 'stable' treated as the positive class
f1_rfc = metrics.f1_score(
    y_true=y_test, y_pred=predictions_rfc, pos_label='stable'
)

# shown as a percentage rounded to two decimals
print('F1: {}'.format(
    round(f1_rfc * 100, 2)
))

F1: 89.8


## Building an extra trees classifier (ETC)

### Training and predicting

In [28]:
# extra-trees ensemble with default hyperparameters, seeded for repeatability
# and fitted on the scaled training data (fit() returns the estimator itself)
etc = ExtraTreesClassifier(random_state=1).fit(normalised_train_df, y_train)

# predicted class labels for the scaled test set
predictions_etc = etc.predict(normalised_test_df)

### Feature importance (ETC)

In [29]:
# Rank the features by the importance the extra-trees model assigned to them.
# The (importance, feature name) pairs are stored as a sorted list rather than
# a bare zip object, which sorted() would exhaust, so featimp_etc remains
# usable in later cells.

featimp_etc = sorted(zip(etc.feature_importances_, x.columns), reverse=True)
featimp_etc

[(0.11844468079199041, 'tau2'),
 (0.11739736493320078, 'tau1'),
 (0.11546569217199552, 'tau4'),
 (0.11316851246674982, 'tau3'),
 (0.09688268324346265, 'g3'),
 (0.09401881529815702, 'g4'),
 (0.09367635844551439, 'g2'),
 (0.08978290601483987, 'g1'),
 (0.040706278296424536, 'p3'),
 (0.040578638540044426, 'p4'),
 (0.04037131556379323, 'p2'),
 (0.039506754233827476, 'p1')]

### Evaluation (ETC)

In [30]:
# per-class precision, recall and F1 on the test set, printed to 4 decimals
print(
    metrics.classification_report(y_true=y_test, y_pred=predictions_etc, digits=4)
)

              precision    recall  f1-score   support

      stable     0.9410    0.8511    0.8938       712
    unstable     0.9218    0.9705    0.9455      1288

    accuracy                         0.9280      2000
   macro avg     0.9314    0.9108    0.9197      2000
weighted avg     0.9287    0.9280    0.9271      2000



In [31]:
# rows are the true classes, columns the predicted classes
print(
    metrics.confusion_matrix(y_true=y_test, y_pred=predictions_etc)
)

[[ 606  106]
 [  38 1250]]


### Accuracy (ETC)

In [32]:
# share of test samples whose predicted label equals the true label
accuracy_etc = metrics.accuracy_score(y_true=y_test, y_pred=predictions_etc)

# shown as a percentage rounded to two decimals
print('Accuracy: {}'.format(
    round(accuracy_etc * 100, 2)
))

Accuracy: 92.8


### Precision (ETC)

In [33]:
# precision with 'stable' treated as the positive class
precision_etc = metrics.precision_score(
    y_true=y_test, y_pred=predictions_etc, pos_label='stable'
)

# shown as a percentage rounded to two decimals
print('Precision: {}'.format(
    round(precision_etc * 100, 2)
))

Precision: 94.1


### Recall (ETC)

In [34]:
# recall with 'stable' treated as the positive class
recall_etc = metrics.recall_score(
    y_true=y_test, y_pred=predictions_etc, pos_label='stable'
)

# shown as a percentage rounded to two decimals
print('Recall: {}'.format(
    round(recall_etc * 100, 2)
))

Recall: 85.11


### F1-Score (ETC)

In [35]:
# F1 score with 'stable' treated as the positive class
f1_etc = metrics.f1_score(
    y_true=y_test, y_pred=predictions_etc, pos_label='stable'
)

# shown as a percentage rounded to two decimals
print('F1: {}'.format(
    round(f1_etc * 100, 2)
))

F1: 89.38


## Building an extra trees classifier with randomized cross validation search

### Training and predicting

In [36]:
# Hyperparameter search space for the extra-trees classifier.
# NOTE: max_features='auto' is rejected by scikit-learn >= 1.3; for classifiers
# it was an alias of 'sqrt', so it is spelled 'sqrt' here. The entry is
# replaced rather than dropped so the grid keeps the same number of
# combinations, which should leave the candidates drawn by the seeded sampler
# unchanged.
param_distributions = {
    'n_estimators': [50, 100, 300, 500, 1000],
    'min_samples_split': [2, 3, 5, 7, 9],
    'min_samples_leaf': [1, 2, 4, 6, 8],
    'max_features': ['sqrt', 'sqrt', 'log2', None],
    'random_state': [1],  # same seed for every candidate model
}

# Try 10 randomly chosen combinations, each scored by 5-fold cross-validated
# accuracy; refit=True retrains the best combination on the full training set.
randomcv = RandomizedSearchCV(ExtraTreesClassifier(), param_distributions, n_iter=10, scoring='accuracy',
                              n_jobs=-1, refit=True, cv=5, verbose=1, random_state=1)

# fit the search on the scaled training dataset
randomcv.fit(normalised_train_df, y_train)

# obtain predictions from the refitted best estimator
predictions_randomcv = randomcv.predict(normalised_test_df)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.2min finished


### Best parameter distribution and feature importance (Extra Trees - RandomizedCV)

In [37]:
# show the hyperparameter combination that scored best in the randomized search

randomcv.best_params_

{'random_state': 1,
 'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

In [38]:
# show the estimator the search selected as best
randomcv.best_estimator_

ExtraTreesClassifier(max_features=None, min_samples_leaf=8, n_estimators=1000,
                     random_state=1)

In [39]:
# raw feature importances of the best estimator found by the search
# (presumably in the column order of the training frame; the next cell pairs
# them with x.columns)

randomcv.best_estimator_.feature_importances_

array([0.13723975, 0.1405075 , 0.13468029, 0.13541676, 0.00368342,
       0.00533686, 0.00542927, 0.00496249, 0.10256244, 0.10757765,
       0.11306268, 0.10954089])

In [40]:
# Rank the features by the importance the tuned extra-trees model assigned to
# them. The (importance, feature name) pairs are stored as a sorted list rather
# than a bare zip object, which sorted() would exhaust, so featimp_randomcv
# remains usable in later cells.
featimp_randomcv = sorted(
    zip(randomcv.best_estimator_.feature_importances_, x.columns), reverse=True
)
featimp_randomcv

[(0.14050750384993677, 'tau2'),
 (0.13723974766109256, 'tau1'),
 (0.1354167630909727, 'tau4'),
 (0.13468028520386593, 'tau3'),
 (0.11306267999167334, 'g3'),
 (0.10954089174337298, 'g4'),
 (0.10757764577478764, 'g2'),
 (0.10256244080927947, 'g1'),
 (0.005429268421191957, 'p3'),
 (0.005336864710946151, 'p2'),
 (0.004962486591192238, 'p4'),
 (0.003683422151688322, 'p1')]

### Evaluation (Extra Trees - RandomizedCV)

In [41]:
# per-class precision, recall and F1 on the test set, printed to 4 decimals
print(
    metrics.classification_report(y_true=y_test, y_pred=predictions_randomcv, digits=4)
)

              precision    recall  f1-score   support

      stable     0.9211    0.8694    0.8945       712
    unstable     0.9300    0.9589    0.9442      1288

    accuracy                         0.9270      2000
   macro avg     0.9256    0.9141    0.9193      2000
weighted avg     0.9268    0.9270    0.9265      2000



In [42]:
# rows are the true classes, columns the predicted classes
print(
    metrics.confusion_matrix(y_true=y_test, y_pred=predictions_randomcv)
)

[[ 619   93]
 [  53 1235]]


### Accuracy (Extra Trees - RandomizedCV)

In [43]:
# share of test samples whose predicted label equals the true label
accuracy_randomcv = metrics.accuracy_score(y_true=y_test, y_pred=predictions_randomcv)

# shown as a percentage rounded to two decimals
print('Accuracy: {}'.format(
    round(accuracy_randomcv * 100, 2)
))

Accuracy: 92.7


### Precision (Extra Trees - RandomizedCV)

In [44]:
# precision with 'stable' treated as the positive class
precision_randomcv = metrics.precision_score(
    y_true=y_test, y_pred=predictions_randomcv, pos_label='stable'
)

# shown as a percentage rounded to two decimals
print('Precision: {}'.format(
    round(precision_randomcv * 100, 2)
))

Precision: 92.11


### Recall (Extra Trees - RandomizedCV)

In [45]:
# recall with 'stable' treated as the positive class
recall_randomcv = metrics.recall_score(
    y_true=y_test, y_pred=predictions_randomcv, pos_label='stable'
)

# shown as a percentage rounded to two decimals
print('Recall: {}'.format(
    round(recall_randomcv * 100, 2)
))

Recall: 86.94


### F1-Score (Extra Trees - RandomizedCV)

In [46]:
# F1 score with 'stable' treated as the positive class
f1_randomcv = metrics.f1_score(
    y_true=y_test, y_pred=predictions_randomcv, pos_label='stable'
)

# shown as a percentage rounded to two decimals
print('F1: {}'.format(
    round(f1_randomcv * 100, 2)
))

F1: 89.45


## Building an extreme gradient boosting model classifier (XGBC)

### Training and predicting

In [47]:
# XGBoost expects integer class labels (0..n_classes-1); recent releases raise
# a ValueError when fitted on the raw 'stable'/'unstable' strings. The labels
# are therefore encoded for training and the predictions decoded afterwards,
# so predictions_xgbc holds the same string labels as y_test and the metric
# cells that use pos_label='stable' keep working.
label_encoder_xgbc = LabelEncoder()
y_train_encoded = label_encoder_xgbc.fit_transform(y_train)

# boosted trees limited to depth 3 with a 0.1 learning rate and a fixed seed
xgbc = XGBClassifier(max_depth=3, learning_rate=0.1, random_state=1)

#fit the model to the training dataset (integer-encoded target)
xgbc.fit(normalised_train_df, y_train_encoded)

#obtain predictions and map the integer codes back to the original labels
predictions_xgbc = label_encoder_xgbc.inverse_transform(xgbc.predict(normalised_test_df))

### Feature importance (XGBC)

In [48]:
# Rank the features by the importance the XGBoost model assigned to them.
# The (importance, feature name) pairs are stored as a sorted list rather than
# a bare zip object, which sorted() would exhaust, so featimp_xgbc remains
# usable in later cells.

featimp_xgbc = sorted(zip(xgbc.feature_importances_, x.columns), reverse=True)
featimp_xgbc

[(0.1479534, 'tau1'),
 (0.14326456, 'tau3'),
 (0.13692562, 'tau4'),
 (0.13667291, 'tau2'),
 (0.11037519, 'g4'),
 (0.10792209, 'g3'),
 (0.10484883, 'g1'),
 (0.10143146, 'g2'),
 (0.010605911, 'p1'),
 (0.0, 'p4'),
 (0.0, 'p3'),
 (0.0, 'p2')]

### Evaluation (XGBC)

In [49]:
# per-class precision, recall and F1 on the test set, printed to 4 decimals
print(
    metrics.classification_report(y_true=y_test, y_pred=predictions_xgbc, digits=4)
)

              precision    recall  f1-score   support

      stable     0.9206    0.8469    0.8822       712
    unstable     0.9190    0.9596    0.9389      1288

    accuracy                         0.9195      2000
   macro avg     0.9198    0.9033    0.9105      2000
weighted avg     0.9195    0.9195    0.9187      2000



In [50]:
# rows are the true classes, columns the predicted classes
print(
    metrics.confusion_matrix(y_true=y_test, y_pred=predictions_xgbc)
)

[[ 603  109]
 [  52 1236]]


### Accuracy (XGBC)

In [51]:
# share of test samples whose predicted label equals the true label
accuracy_xgbc = metrics.accuracy_score(y_true=y_test, y_pred=predictions_xgbc)

# shown as a percentage rounded to two decimals
print('Accuracy: {}'.format(
    round(accuracy_xgbc * 100, 2)
))

Accuracy: 91.95


### Precision (XGBC)

In [52]:
# precision with 'stable' treated as the positive class
precision_xgbc = metrics.precision_score(
    y_true=y_test, y_pred=predictions_xgbc, pos_label='stable'
)

# shown as a percentage rounded to two decimals
print('Precision: {}'.format(
    round(precision_xgbc * 100, 2)
))

Precision: 92.06


### Recall (XGBC)

In [53]:
# recall with 'stable' treated as the positive class
recall_xgbc = metrics.recall_score(
    y_true=y_test, y_pred=predictions_xgbc, pos_label='stable'
)

# shown as a percentage rounded to two decimals
print('Recall: {}'.format(
    round(recall_xgbc * 100, 2)
))

Recall: 84.69


### F1-Score (XGBC)

In [54]:
# F1 score with 'stable' treated as the positive class
f1_xgbc = metrics.f1_score(
    y_true=y_test, y_pred=predictions_xgbc, pos_label='stable'
)

# shown as a percentage rounded to two decimals
print('F1: {}'.format(
    round(f1_xgbc * 100, 2)
))

F1: 88.22


## Building a light gradient boosting model (LGBM)

### Training and predicting

In [55]:
# LightGBM classifier with default hyperparameters and a fixed seed
lgbm = LGBMClassifier(random_state=1)

# train on the scaled training features
lgbm.fit(X=normalised_train_df, y=y_train)

# predicted class labels for the scaled test set
predictions_lgbm = lgbm.predict(X=normalised_test_df)

### Feature importance (LGBM)

In [56]:
# Rank the features by the importance the LightGBM model assigned to them
# (integer values here, presumably split counts, LightGBM's default
# importance type). The (importance, feature name) pairs are stored as a sorted
# list rather than a bare zip object, which sorted() would exhaust, so
# featimp_lgbm remains usable in later cells.

featimp_lgbm = sorted(zip(lgbm.feature_importances_, x.columns), reverse=True)
featimp_lgbm

[(401, 'tau4'),
 (400, 'tau1'),
 (373, 'tau2'),
 (364, 'tau3'),
 (350, 'g2'),
 (349, 'g1'),
 (345, 'g4'),
 (339, 'g3'),
 (25, 'p2'),
 (23, 'p3'),
 (16, 'p1'),
 (15, 'p4')]

### Evaluation (LGBM)

In [57]:
# per-class precision, recall and F1 on the test set, printed to 4 decimals
print(
    metrics.classification_report(y_true=y_test, y_pred=predictions_lgbm, digits=4)
)

              precision    recall  f1-score   support

      stable     0.9297    0.8919    0.9104       712
    unstable     0.9415    0.9627    0.9520      1288

    accuracy                         0.9375      2000
   macro avg     0.9356    0.9273    0.9312      2000
weighted avg     0.9373    0.9375    0.9372      2000



In [58]:
# rows are the true classes, columns the predicted classes
print(
    metrics.confusion_matrix(y_true=y_test, y_pred=predictions_lgbm)
)

[[ 635   77]
 [  48 1240]]


### Accuracy (LGBM)

In [59]:
# share of test samples whose predicted label equals the true label
accuracy_lgbm = metrics.accuracy_score(y_true=y_test, y_pred=predictions_lgbm)

# shown as a percentage rounded to two decimals
print('Accuracy: {}'.format(
    round(accuracy_lgbm * 100, 2)
))

Accuracy: 93.75


### Precision (LGBM)

In [60]:
# precision with 'stable' treated as the positive class
precision_lgbm = metrics.precision_score(
    y_true=y_test, y_pred=predictions_lgbm, pos_label='stable'
)

# shown as a percentage rounded to two decimals
print('Precision: {}'.format(
    round(precision_lgbm * 100, 2)
))

Precision: 92.97


### Recall (LGBM)

In [61]:
# recall with 'stable' treated as the positive class
recall_lgbm = metrics.recall_score(
    y_true=y_test, y_pred=predictions_lgbm, pos_label='stable'
)

# shown as a percentage rounded to two decimals
print('Recall: {}'.format(
    round(recall_lgbm * 100, 2)
))

Recall: 89.19


### F1-Score (LGBM)

In [62]:
# F1 score with 'stable' treated as the positive class
f1_lgbm = metrics.f1_score(
    y_true=y_test, y_pred=predictions_lgbm, pos_label='stable'
)

# shown as a percentage rounded to two decimals
print('F1: {}'.format(
    round(f1_lgbm * 100, 2)
))

F1: 91.04


## Comparing Evaluation reports for the 'stable' class

In [63]:
# collect the test-set metrics of every model, one list per metric
# (precision, recall and F1 were computed for the 'stable' class)
stable_dict = {
    'Classifier': ['RFC', 'ETC', 'ExtraTreeRandomizedCV', 'XGBC', 'LGBM'],
    'Accuracy': [accuracy_rfc, accuracy_etc, accuracy_randomcv, accuracy_xgbc, accuracy_lgbm],
    'Precision': [precision_rfc, precision_etc, precision_randomcv, precision_xgbc, precision_lgbm],
    'Recall': [recall_rfc, recall_etc, recall_randomcv, recall_xgbc, recall_lgbm],
    'F1': [f1_rfc, f1_etc, f1_randomcv, f1_xgbc, f1_lgbm],
}

# tabulate the metrics with the classifier name as the row label
result = pd.DataFrame(stable_dict).set_index('Classifier')

result

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1
Classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RFC,0.929,0.919118,0.877809,0.897989
ETC,0.928,0.940994,0.851124,0.893805
ExtraTreeRandomizedCV,0.927,0.921131,0.869382,0.894509
XGBC,0.9195,0.920611,0.84691,0.882224
LGBM,0.9375,0.929722,0.891854,0.910394
