# Machine Learning Classification - Managing the Quality Metric of Global Ecological Footprint

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


%matplotlib inline

In [2]:
df = pd.read_csv('C:/Users/Cyriaque Matthews/Documents/datasets/Hamoye Datasets/ML Classification - Managing the Quality Metric of Global Ecological Footprint/electricalGridStability.csv')

#### Data Set Description
Electrical Grid Stability Simulated Data Set. The local stability of the 4-node star systems implementing Decentral Smart Grid Control concept
* Data Set Characteristics: Multivariate
* Number of Instances: 10000
* Number of Attributes: 14
* Missing Values? N/A

#### Attribute Information
Predictive features
1. `tau[x]`: reaction time of participant (real from the range [0.5,10]s). Tau1 - the value for electricity producer.
2. `p[x]`: nominal power consumed(negative)/produced(positive)(real). For consumers from the range [-0.5,-2]s^-2; p1 = abs(p2 + p3 + p4)
3. `g[x]`: coefficient (gamma) proportional to price elasticity (real from the range [0.05,1]s^-1). g1 - the value for electricity producer.

Outcome variables
1. `stab`: the maximal real part of the characteristic equation root (if positive - the system is linearly unstable)(real)
2. `stabf`: the stability label of the system (categorical: stable/unstable)

In [3]:
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


In [5]:
df.nunique()

tau1     10000
tau2     10000
tau3     10000
tau4     10000
p1       10000
p2       10000
p3       10000
p4       10000
g1       10000
g2       10000
g3       10000
g4       10000
stab     10000
stabf        2
dtype: int64

In [6]:
df.isna().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [7]:
# drop off 'stab' because it is directly related to 'stabf'
df = df.drop('stab', axis =1)

In [8]:
df.describe()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5.25,5.250001,5.250004,5.249997,3.75,-1.25,-1.25,-1.25,0.525,0.525,0.525,0.525
std,2.742548,2.742549,2.742549,2.742556,0.75216,0.433035,0.433035,0.433035,0.274256,0.274255,0.274255,0.274255
min,0.500793,0.500141,0.500788,0.500473,1.58259,-1.999891,-1.999945,-1.999926,0.050009,0.050053,0.050054,0.050028
25%,2.874892,2.87514,2.875522,2.87495,3.2183,-1.624901,-1.625025,-1.62496,0.287521,0.287552,0.287514,0.287494
50%,5.250004,5.249981,5.249979,5.249734,3.751025,-1.249966,-1.249974,-1.250007,0.525009,0.525003,0.525015,0.525002
75%,7.62469,7.624893,7.624948,7.624838,4.28242,-0.874977,-0.875043,-0.875065,0.762435,0.76249,0.76244,0.762433
max,9.999469,9.999837,9.99945,9.999443,5.864418,-0.500108,-0.500072,-0.500025,0.999937,0.999944,0.999982,0.99993


In [9]:
df['stabf'].value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

### Preprocessing and Target and Feature Variables Selection

In [10]:
X = df.drop(columns=['stabf'])
y = df.stabf

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.8, random_state=1)

In [11]:
print('X_train shape: {}'.format(X_train.shape))
print('y_train shape: {}'.format(y_train.shape))
print('X_test shape: {}'.format(X_test.shape))
print('y_test shape: {}'.format(y_test.shape))

X_train shape: (8000, 12)
y_train shape: (8000,)
X_test shape: (2000, 12)
y_test shape: (2000,)


In [12]:
#transform train and test set using standard scaler

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled = pd.DataFrame(X_train_scaled, columns = X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns = X_test.columns)

### Building Models and Evaluation
#### RandomForestClassifier

In [13]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state = 1)
rf.fit(X_train_scaled, y_train)
rf_pred = rf.predict(X_test_scaled)

#model accuracy
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, rf_pred)
print('Accuracy: {}'.format(round(accuracy*100), 4))

from sklearn.metrics import recall_score, precision_score, f1_score, confusion_matrix, classification_report

#precision
precision = precision_score(y_test, rf_pred, pos_label='stable')
print('Precision: {}'.format(round(precision*100), 2))  

#recall
recall = recall_score(y_test, rf_pred, pos_label='stable')
print('Recall: {}'.format(round(recall*100), 2))

#F1 score
f1 = f1_score(y_test, rf_pred, pos_label='stable')
print('F1: {}'.format(round(f1*100), 2))

#classification report
print('Classification Report:\n', classification_report(y_test,rf_pred, digits =4))

#confusion matrix
rf_cnf_mat = confusion_matrix(y_test, rf_pred, labels=['unstable', 'stable'])
print('Confusion Matrix:\n', rf_cnf_mat)

Accuracy: 93
Precision: 92
Recall: 88
F1: 90
Classification Report:
               precision    recall  f1-score   support

      stable     0.9191    0.8778    0.8980       712
    unstable     0.9341    0.9573    0.9456      1288

    accuracy                         0.9290      2000
   macro avg     0.9266    0.9176    0.9218      2000
weighted avg     0.9288    0.9290    0.9286      2000

Confusion Matrix:
 [[1233   55]
 [  87  625]]


#### LGBM Classifier

In [14]:
from lightgbm import LGBMClassifier

lgbm= LGBMClassifier(random_state = 1)
lgbm.fit(X_train_scaled, y_train)
lgbm_pred = lgbm.predict(X_test_scaled)

#model accuracy
lgbm_accuracy = accuracy_score(y_test, lgbm_pred)
print('Accuracy: {}'.format(round(lgbm_accuracy*100), 2))

#precision
lgbm_precision = precision_score(y_test, lgbm_pred, pos_label='stable')
print('Precision: {}'.format(round(lgbm_precision*100), 2))  

#recall
lgbm_recall = recall_score(y_test, lgbm_pred, pos_label='stable')
print('Recall: {}'.format(round(lgbm_recall*100), 2))

#F1 score
lgbm_f1 = f1_score(y_test, lgbm_pred, pos_label='stable')
print('F1: {}'.format(round(lgbm_f1*100), 2))

#classification report
print('Classification Report:\n', classification_report(y_test,lgbm_pred, digits =4))

#confusion matrix
lgbm_cnf_mat = confusion_matrix(y_test, lgbm_pred)
print('Confusion Matrix:\n', lgbm_cnf_mat)

Accuracy: 94
Precision: 93
Recall: 90
F1: 91
Classification Report:
               precision    recall  f1-score   support

      stable     0.9276    0.9003    0.9138       712
    unstable     0.9458    0.9612    0.9534      1288

    accuracy                         0.9395      2000
   macro avg     0.9367    0.9307    0.9336      2000
weighted avg     0.9393    0.9395    0.9393      2000

Confusion Matrix:
 [[ 641   71]
 [  50 1238]]


#### ExtraTreesClassifier

In [15]:
from sklearn.ensemble import ExtraTreesClassifier

etc = ExtraTreesClassifier(random_state = 1)
etc.fit(X_train_scaled, y_train)
etc_pred = etc.predict(X_test_scaled)

#model accuracy
etc_accuracy = accuracy_score(y_test, etc_pred)
print('Accuracy: {}'.format(round(etc_accuracy*100), 2))

#precision
etc_precision = precision_score(y_test, etc_pred, pos_label='stable')
print('Precision: {}'.format(round(etc_precision*100), 2))  

#recall
etc_recall = recall_score(y_test, etc_pred, pos_label='stable')
print('Recall: {}'.format(round(etc_recall*100), 2))

#F1 score
etc_f1 = f1_score(y_test, etc_pred, pos_label='stable')
print('F1: {}'.format(round(etc_f1*100), 2))

#classification report
print('Classification Report:\n', classification_report(y_test,etc_pred, digits =4))

#confusion matrix
etc_cnf_mat = confusion_matrix(y_test, etc_pred)
print('Confusion Matrix:\n', etc_cnf_mat)

Accuracy: 93
Precision: 94
Recall: 85
F1: 89
Classification Report:
               precision    recall  f1-score   support

      stable     0.9410    0.8511    0.8938       712
    unstable     0.9218    0.9705    0.9455      1288

    accuracy                         0.9280      2000
   macro avg     0.9314    0.9108    0.9197      2000
weighted avg     0.9287    0.9280    0.9271      2000

Confusion Matrix:
 [[ 606  106]
 [  38 1250]]


#### XGBoost

In [17]:
from xgboost import XGBClassifier

xbc= XGBClassifier(random_state = 1)
xbc.fit(X_train_scaled, y_train)
xbc_pred = xbc.predict(X_test_scaled)

#model accuracy
xbc_accuracy = accuracy_score(y_test, xbc_pred)
print('Accuracy: {}'.format(round(xbc_accuracy*100), 2))

#precision
xbc_precision = precision_score(y_test, xbc_pred, pos_label='stable')
print('Precision: {}'.format(round(xbc_precision*100), 2))  

#recall
xbc_recall = recall_score(y_test, xbc_pred, pos_label='stable')
print('Recall: {}'.format(round(xbc_recall*100), 2))

#F1 score
xbc_f1 = f1_score(y_test, xbc_pred, pos_label='stable')
print('F1: {}'.format(round(xbc_f1*100), 2))

#classification report
print('Classification Report:\n', classification_report(y_test,xbc_pred, digits =4))

#confusion matrix
xbc_cnf_mat = confusion_matrix(y_test, xbc_pred)
print('Confusion Matrix:\n', xbc_cnf_mat)



Accuracy: 95
Precision: 94
Recall: 91
F1: 92
Classification Report:
               precision    recall  f1-score   support

      stable     0.9351    0.9101    0.9224       712
    unstable     0.9510    0.9651    0.9580      1288

    accuracy                         0.9455      2000
   macro avg     0.9430    0.9376    0.9402      2000
weighted avg     0.9453    0.9455    0.9453      2000

Confusion Matrix:
 [[ 648   64]
 [  45 1243]]


##### Improving ExtraTreesClassifier

In [18]:
#combination of hyperparameters
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter_grid = {'n_estimators': n_estimators, 'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,'max_features': max_features}

from sklearn.model_selection import RandomizedSearchCV

randomcv = RandomizedSearchCV(estimator = etc,  param_distributions = hyperparameter_grid, 
                              cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1,
                              random_state = 1)
search = randomcv.fit(X_train_scaled, y_train)
# get best parameters
search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

In [19]:
#Evaluate ExtraTreesClassifier on test set using  best params
etc2 = ExtraTreesClassifier(max_features = None, 
                            min_samples_leaf= 8,
                            min_samples_split= 2,
                            n_estimators= 1000, 
                            random_state = 1)

#fit on train set
etc2.fit(X_train_scaled, y_train)
etc2_pred = etc2.predict(X_test_scaled)

#classification report
print('Classification Report:\n', classification_report(y_test,etc2_pred, digits =4))

Classification Report:
               precision    recall  f1-score   support

      stable     0.9211    0.8694    0.8945       712
    unstable     0.9300    0.9589    0.9442      1288

    accuracy                         0.9270      2000
   macro avg     0.9256    0.9141    0.9193      2000
weighted avg     0.9268    0.9270    0.9265      2000



In [20]:
#check feature importances
importance = etc2.feature_importances_

xyz=pd.DataFrame(importance, index=X_train_scaled.columns, columns=['Importance'])
print("Max Importance is of: ", xyz[['Importance']].idxmax())
print("Min Importance is of: ", xyz[['Importance']].idxmin())

Max Importance is of:  Importance    tau2
dtype: object
Min Importance is of:  Importance    p1
dtype: object
