In [28]:
#importing important libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Attribute Information:

11 predictive attributes, 1 non-predictive(p1), 2 goal fields:
1. tau[x]: reaction time of participant (real from the range [0.5,10]s). Tau1 - the value for electricity producer.
2. p[x]: nominal power consumed(negative)/produced(positive)(real). For consumers from the range [-0.5,-2]s^-2; p1 = abs(p2 + p3 + p4)
3. g[x]: coefficient (gamma) proportional to price elasticity (real from the range [0.05,1]s^-1). g1 - the value for electricity producer.
4. stab: the maximal real part of the characteristic equation root (if positive - the system is linearly unstable)(real)
5. stabf: the stability label of the system (categorical: stable/unstable)


Predictive features:

'tau1' to 'tau4': the reaction time of each network participant, a real value within the range 0.5 to 10 ('tau1' corresponds to the supplier node, 'tau2' to 'tau4' to the consumer nodes);
'p1' to 'p4': nominal power produced (positive) or consumed (negative) by each network participant, a real value within the range -2.0 to -0.5 for consumers ('p2' to 'p4'). As the total power consumed equals the total power generated, p1 (supplier node) = - (p2 + p3 + p4);
'g1' to 'g4': price elasticity coefficient for each network participant, a real value within the range 0.05 to 1.00 ('g1' corresponds to the supplier node, 'g2' to 'g4' to the consumer nodes; 'g' stands for 'gamma');
Dependent variables:

'stab': the maximum real part of the characteristic differential equation root (if positive, the system is linearly unstable; if negative, linearly stable);
'stabf': a categorical (binary) label ('stable' or 'unstable').


In [128]:
#importing dataset
df=pd.read_csv("electricity_grid.csv")

In [129]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
tau1     10000 non-null float64
tau2     10000 non-null float64
tau3     10000 non-null float64
tau4     10000 non-null float64
p1       10000 non-null float64
p2       10000 non-null float64
p3       10000 non-null float64
p4       10000 non-null float64
g1       10000 non-null float64
g2       10000 non-null float64
g3       10000 non-null float64
g4       10000 non-null float64
stab     10000 non-null float64
stabf    10000 non-null object
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


In [130]:
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


Because of the direct relationship between 'stab' and 'stabf' ('stabf' = 'stable' if 'stab' <= 0, 'unstable' otherwise), 'stab' should be dropped and 'stabf' will remain as the sole dependent variable (binary classification).

In [131]:
df.shape

(10000, 14)

In [132]:
#droping the stab column
df.drop(columns="stab",axis=1,inplace=True)

In [133]:
#to see if the target label is balanced
df.stabf.value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [134]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

In [135]:
#seperating the dataset into predictor and target label
x=df.iloc[:,:12]
y=df.iloc[:,12:]

In [136]:
y

Unnamed: 0,stabf
0,unstable
1,stable
2,unstable
3,unstable
4,unstable
...,...
9995,unstable
9996,stable
9997,stable
9998,unstable


In [137]:
#train_test_split into train and test spit using 80% of the data as train
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2, random_state=1)

In [138]:
#normalize the data to put all the values in the same range
scaler=StandardScaler().fit(x_train)
x_trained_scaled=scaler.transform(x_train)
x_test_scaled=scaler.transform(x_test)

In [139]:
y_test

Unnamed: 0,stabf
9953,unstable
3850,unstable
4962,stable
3886,stable
5437,unstable
...,...
3919,stable
162,stable
7903,stable
2242,unstable


Model building

In [140]:
#random forest
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(random_state=1)
rf.fit(x_trained_scaled,y_train)
rf.score(x_test_scaled,y_test)


  after removing the cwd from sys.path.


0.929

In [106]:
#y=df.stabf=="stable"

In [16]:
#x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=1,test_size=0.2)
#scaler=StandardScaler().fit(x_train)
#x_trained_dummy=scaler.transform(x_train)
#x_test_dummy=scaler.transform(x_test)

In [17]:
#to test if the imbalane of the dataset has an effect:
#from sklearn.dummy import DummyClassifier
#dummy_majority = DummyClassifier(strategy='most_frequent').fit(x_trained_dummy, y_train)
#frequent = dummy_majority.predict(x_test_dummy)
#print("Unique predicted labels: {}".format(np.unique(frequent)))
#print("Test score: {:.2f}".format(dummy_majority.score(x_test_dummy, y_test)))

even without learning anything the model gave a prediction of 64%. More reasons why accuracy is not always a good evaluation metric

In [107]:
#Confusion Matrix
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score,confusion_matrix
new_predictions = rf.predict(x_test_scaled)
cnf_mat = confusion_matrix(y_true=y_test, y_pred=new_predictions, labels=[ 'stable' , 'unstable' ])
cnf_mat

array([[ 625,   87],
       [  55, 1233]], dtype=int64)

Question 14

In [119]:
#Accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=new_predictions)
print( 'Accuracy: {}' .format(round(accuracy,4) )) 

#Recall
recall = recall_score(y_true=y_test, y_pred=new_predictions, pos_label= 'stable' )
print( 'Recall: {}' .format(round(recall* 100 ), 2 )) 
#precision
precision = precision_score(y_true=y_test, y_pred=new_predictions, pos_label= 'stable' )
print( 'Precision: {}' .format(round(precision* 100 ), 2 ))

#f1_score
f1_scores=f1_score(y_true=y_test,y_pred=new_predictions,pos_label='stable')
print(f1_scores)

Accuracy: 0.929
Recall: 88.0
Precision: 92.0
0.8979885057471264


In [47]:
from sklearn.metrics import classification_report

In [48]:
#to get a summary of the whole report 
print(classification_report(y_test, new_predictions,
 target_names=["stable", "unstable"]))


              precision    recall  f1-score   support

      stable       0.92      0.88      0.90       712
    unstable       0.93      0.96      0.95      1288

    accuracy                           0.93      2000
   macro avg       0.93      0.92      0.92      2000
weighted avg       0.93      0.93      0.93      2000



Extra Trees Classifier

In [49]:
from sklearn.ensemble import ExtraTreesClassifier

In [141]:
ext=ExtraTreesClassifier(random_state=1)
ext.fit(x_trained_scaled,y_train)
ext.score(x_test_scaled,y_test)

  


0.928

Evaluatioin metric for extra tree classifier

In [142]:
ext_prediction=ext.predict(x_test_scaled)


In [143]:
#Accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=ext_prediction)
print( 'Accuracy: {}' .format(round(accuracy,4) )) 

#Recall
recall = recall_score(y_true=y_test, y_pred=ext_prediction, pos_label= 'stable' )
print( 'Recall: {}' .format(round(recall* 100 ), 2 )) 

#precision
precision = precision_score(y_true=y_test, y_pred=ext_prediction, pos_label= 'stable' )
print( 'Precision: {}' .format(round(precision* 100 ), 2 ))

#f1_score
f1_scores=f1_score(y_true=y_test,y_pred=ext_prediction,pos_label='stable')
print(f1_scores)

Accuracy: 0.928
Recall: 85.0
Precision: 94.0
0.8938053097345133


In [144]:
print(classification_report(y_test, ext_prediction,
 target_names=["stable", "unstable"]))

              precision    recall  f1-score   support

      stable       0.94      0.85      0.89       712
    unstable       0.92      0.97      0.95      1288

    accuracy                           0.93      2000
   macro avg       0.93      0.91      0.92      2000
weighted avg       0.93      0.93      0.93      2000



### XGBOOST

In [145]:
from xgboost.sklearn import XGBClassifier

In [146]:
xgb=XGBClassifier(random_state=1)
xgb.fit(x_trained_scaled,y_train)
xgb.score(x_test_scaled,y_test)

  return f(**kwargs)


0.9455

In [166]:
xgb_prediction=xgb.predict(x_test_scaled)

# Question 15

In [167]:
accuracy = accuracy_score(y_true=y_test, y_pred=xgb_prediction)
print( 'Accuracy: {}' .format(round(accuracy,4) )) 

#Recall
recall = recall_score(y_true=y_test, y_pred=xgb_prediction, pos_label= 'stable' )
print( 'Recall: {}' .format(round(recall* 100 ), 2 )) 

#precision
precision = precision_score(y_true=y_test, y_pred=xgb_prediction, pos_label= 'stable' )
print( 'Precision: {}' .format(round(precision* 100 ), 2 ))

#f1_score
f1_scores=f1_score(y_true=y_test,y_pred=xgb_prediction,pos_label='stable')
print(f1_scores)

Accuracy: 0.9455
Recall: 91.0
Precision: 94.0
0.9224199288256227


In [21]:
! pip install setuptools wheel numpy scipy scikit-learn -U --user

Requirement already up-to-date: setuptools in c:\users\homet\anaconda3\lib\site-packages (49.6.0)
Requirement already up-to-date: wheel in c:\users\homet\anaconda3\lib\site-packages (0.35.1)
Requirement already up-to-date: numpy in c:\users\homet\anaconda3\lib\site-packages (1.19.1)
Requirement already up-to-date: scipy in c:\users\homet\anaconda3\lib\site-packages (1.5.2)
Requirement already up-to-date: scikit-learn in c:\users\homet\anaconda3\lib\site-packages (0.23.2)


In [24]:
!pip install lightgbm

Collecting lightgbm
  Downloading https://files.pythonhosted.org/packages/1f/cb/a8ec24334c35a7d0c87b4e4e056bd2137573c7c1bd81c760b79a2f370254/lightgbm-2.3.1-py2.py3-none-win_amd64.whl (544kB)
Installing collected packages: lightgbm
Successfully installed lightgbm-2.3.1


In [67]:
from lightgbm import LGBMClassifier

### Using Lightgbm

In [126]:
df=pd.read_csv('electricity_grid.csv')

df.drop(columns="stab",axis=1,inplace=True)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

#seperating the dataset into predictor and target label
x=df.iloc[:,:12]
y=df.iloc[:,12:]

y_binary=pd.get_dummies(y)
#lightgmb only accepts boolean,int or float as the labels
y_binary.drop('stabf_stable',axis=1,inplace=True)

x_train,x_test,y_train,y_test=train_test_split(x,y_binary,test_size=0.2, random_state=1)

#normalize the data to put all the values in the same range
scaler=StandardScaler().fit(x_train)
x_trained_scaled=scaler.transform(x_train)
x_test_scaled=scaler.transform(x_test)

Lgb = LGBMClassifier(random_state =1)

Lgb.fit(x_trained_scaled,y_train)
Lgb.score(x_test_scaled,y_test)


lgb_pred=Lgb.predict(x_test_scaled)


  return f(**kwargs)


### Question 16

In [127]:
accuracy = accuracy_score(y_true=y_test, y_pred=lgb_pred)
print( 'Accuracy: {}' .format(round(accuracy,4) )) 

#Recall
recall = recall_score(y_true=y_test, y_pred=lgb_pred)
print( 'Recall: {}' .format(round(recall* 100 ), 2 )) 

#precision
precision = precision_score(y_true=y_test, y_pred=lgb_pred)
print( 'Precision: {}' .format(round(precision* 100 ), 2 ))

#f1_score
f1_scores=f1_score(y_true=y_test,y_pred=lgb_pred)
print(f1_scores)

Accuracy: 0.9375
Recall: 96.0
Precision: 94.0
0.9520153550863725


### Using a random search cv on Extratrees classifier

In [75]:
n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]



min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter_grid = {'n_estimators': n_estimators,

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,
            

                       'max_features': max_features}

In [150]:
random_ext_search=RandomizedSearchCV(ext,param_distributions=hyperparameter_grid,cv=5,n_jobs=-1,
                                    scoring = 'accuracy', verbose = 1,random_state=1,n_iter=10)

In [151]:
random_ext_search.fit(x_trained_scaled,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.4min finished
  self.best_estimator_.fit(X, y, **fit_params)


RandomizedSearchCV(cv=5, estimator=ExtraTreesClassifier(random_state=1),
                   n_jobs=-1,
                   param_distributions={'max_features': ['auto', 'sqrt', 'log2',
                                                         None],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [2, 3, 5, 7, 9],
                                        'n_estimators': [50, 100, 300, 500,
                                                         1000]},
                   random_state=1, scoring='accuracy', verbose=1)

In [153]:
random_ext_search.score(x_test_scaled,y_test)

0.927

### Question 17

In [154]:
print(random_ext_search.best_params_)

{'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 8, 'max_features': None}


In [84]:
random_predict=random_ext_search.predict(x_test_scaled)

In [87]:
accuracy = accuracy_score(y_true=y_test, y_pred=random_predict)
print( 'Accuracy: {}' .format(round(accuracy* 100 ), 2 )) 

#Recall
recall = recall_score(y_true=y_test, y_pred=random_predict)
print( 'Recall: {}' .format(round(recall* 100 ), 2 )) 

#precision
precision = precision_score(y_true=y_test, y_pred=random_predict)
print( 'Precision: {}' .format(round(precision* 100 ), 2 ))

#f1_score
f1_scores=f1_score(y_true=y_test,y_pred=random_predict)
print(f1_scores)

Accuracy: 93.0
Recall: 95.0
Precision: 94.0
0.9446579554189085


### Question 18
 Retraining a new extra tree classifier based on the optimal parameters

In [155]:
ext2=ExtraTreesClassifier(random_state=1,n_estimators=1000, min_samples_split=2, min_samples_leaf= 8, max_features= None)
ext2.fit(x_trained_scaled,y_train)
ext2.score(x_test_scaled,y_test)

  


0.927

In [156]:
ex2_pred=ext2.predict(x_test_scaled)

In [159]:
accuracy = accuracy_score(y_true=y_test, y_pred=ex2_pred)
print( 'Accuracy: {}' .format(round(accuracy,4 ) )) 



Accuracy: 0.927


Question 20


In [161]:
print("Feature importances:\n{}".format(ext2.feature_importances_))

Feature importances:
[0.13723975 0.1405075  0.13468029 0.13541676 0.00368342 0.00533686
 0.00542927 0.00496249 0.10256244 0.10757765 0.11306268 0.10954089]


In [165]:
df.columns

Index(['tau1', 'tau2', 'tau3', 'tau4', 'p1', 'p2', 'p3', 'p4', 'g1', 'g2',
       'g3', 'g4', 'stabf'],
      dtype='object')