<a href="https://colab.research.google.com/github/ClaraOdiri/Hamoye-Data-Science-Internship/blob/master/Stage_C_Quiz_Clara_odiri.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Stage C Quiz

#### Dataset Description

Predictive features:

1. 'tau1' to 'tau4': the reaction time of each network participant, a real value within the range 0.5 to 10 ('tau1' corresponds to the supplier node, 'tau2' to 'tau4' to the consumer nodes);
2. 'p1' to 'p4': nominal power produced (positive) or consumed (negative) by each network participant, a real value within the range -2.0 to -0.5 for consumers ('p2' to 'p4'). As the total power consumed equals the total power generated, p1 (supplier node) = - (p2 + p3 + p4);
3. 'g1' to 'g4': price elasticity coefficient for each network participant, a real value within the range 0.05 to 1.00 ('g1' corresponds to the supplier node, 'g2' to 'g4' to the consumer nodes; 'g' stands for 'gamma');

Dependent variables:

1. 'stab': the maximum real part of the characteristic differential equation root (if positive, the system is linearly unstable; if negative, linearly stable);
2. 'stabf': a categorical (binary) label ('stable' or 'unstable').

In [2]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
from google.colab import files
uploaded = files.upload()

Saving Data_for_UCI_named.csv to Data_for_UCI_named.csv


In [5]:
#load data into pandas dataframe

import io
uci_data = pd.read_csv(io.BytesIO(uploaded['Data_for_UCI_named.csv']))
uci_data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [6]:
#quick description of the data
uci_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


In [7]:
#drop 'stab' ecause of the direct relationship with 'stabf'
uci_data = uci_data.drop('stab', axis =1)

In [8]:
#check distribution of target variable
uci_data['stabf'].value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [9]:
uci_data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,unstable


In [10]:
#get features and labels
x = uci_data.drop(columns=['stabf'])

y = uci_data['stabf']

In [11]:
#Split the dataset
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

print('X_train shape: {}'.format(x_train.shape))
print('y_train shape: {}'.format(y_train.shape))
print('X_test shape: {}'.format(x_test.shape))
print('y_test shape: {}'.format(y_test.shape))

X_train shape: (8000, 12)
y_train shape: (8000,)
X_test shape: (2000, 12)
y_test shape: (2000,)


In [12]:
#transform train and test set using standard scaler
#ignore scaling for y since sklarn preprocessing ignores the y variable and it is also an object type.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x_train)

x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [13]:
#put the scaled sets into a daataframe

x_train_scaled = pd.DataFrame(x_train_scaled, columns = x_train.columns)
x_test_scaled = pd.DataFrame(x_test_scaled, columns = x_test.columns)

### Building Models

#### RandomForestClassifier

In [14]:
#train a RandomForestClassifier 
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state = 1)

#fit on train set
rf.fit(x_train_scaled, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [15]:
#make predictions on test set
rf_pred = rf.predict(x_test_scaled)

#### Measuring Model Performance for RandomForestClassifier

In [54]:
#model accuracy
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, rf_pred)
print('Accuracy: {}'.format(round(accuracy*100), 4))

from sklearn.metrics import recall_score, precision_score, f1_score, confusion_matrix, classification_report

#precision
precision = precision_score(y_test, rf_pred, pos_label='stable')
print('Precision: {}'.format(round(precision*100), 2))  

#recall
recall = recall_score(y_test, rf_pred, pos_label='stable')
print('Recall: {}'.format(round(recall*100), 2))

#F1 score
f1 = f1_score(y_test, rf_pred, pos_label='stable')
print('F1: {}'.format(round(f1*100), 2))

#classification report
print('Classification Report:\n', classification_report(y_test,rf_pred, digits =4))

#confusion matrix
rf_cnf_mat = confusion_matrix(y_test, rf_pred, labels=['unstable', 'stable'])
print('Confusion Matrix:\n', rf_cnf_mat)

Accuracy: 93.0
Precision: 92.0
Recall: 88.0
F1: 90.0
Classification Report:
               precision    recall  f1-score   support

      stable     0.9191    0.8778    0.8980       712
    unstable     0.9341    0.9573    0.9456      1288

    accuracy                         0.9290      2000
   macro avg     0.9266    0.9176    0.9218      2000
weighted avg     0.9288    0.9290    0.9286      2000

Confusion Matrix:
 [[1233   55]
 [  87  625]]


In [17]:
print("Training set score: {:.3f}".format(rf.score(x_train_scaled, y_train)))
print("Test set score: {:.3f}".format(rf.score(x_test_scaled, y_test)))


Training set score: 1.000
Test set score: 0.929


#### ExtraTreeClassifier

In [39]:
from sklearn.ensemble import ExtraTreesClassifier

etc = ExtraTreesClassifier(random_state = 1)

#fit on the train set
etc.fit(x_train_scaled, y_train)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=1, verbose=0,
                     warm_start=False)

In [40]:
#predict on test set
etc_pred = etc.predict(x_test_scaled)

#### Measuring Model Performance for ExtraTreeClassifier

In [41]:
#model accuracy
etc_accuracy = accuracy_score(y_test, etc_pred)
print('Accuracy: {}'.format(round(etc_accuracy*100), 2))

#precision
etc_precision = precision_score(y_test, etc_pred, pos_label='stable')
print('Precision: {}'.format(round(etc_precision*100), 2))  

#recall
etc_recall = recall_score(y_test, etc_pred, pos_label='stable')
print('Recall: {}'.format(round(etc_recall*100), 2))

#F1 score
etc_f1 = f1_score(y_test, etc_pred, pos_label='stable')
print('F1: {}'.format(round(etc_f1*100), 2))

#classification report
print('Classification Report:\n', classification_report(y_test,etc_pred, digits =4))

#confusion matrix
etc_cnf_mat = confusion_matrix(y_test, etc_pred)
print('Confusion Matrix:\n', etc_cnf_mat)


Accuracy: 93.0
Precision: 94.0
Recall: 85.0
F1: 89.0
Classification Report:
               precision    recall  f1-score   support

      stable     0.9410    0.8511    0.8938       712
    unstable     0.9218    0.9705    0.9455      1288

    accuracy                         0.9280      2000
   macro avg     0.9314    0.9108    0.9197      2000
weighted avg     0.9287    0.9280    0.9271      2000

Confusion Matrix:
 [[ 606  106]
 [  38 1250]]


In [47]:
print("Training set score: {:.3f}".format(etc.score(x_train_scaled, y_train)))
print("Test set score: {:.3f}".format(etc.score(x_test_scaled, y_test)))


Training set score: 1.000
Test set score: 0.928


#### XGBoost

In [33]:
from xgboost import XGBClassifier

xbc= XGBClassifier(random_state = 1)

#fit on train set
xbc.fit(x_train_scaled, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [34]:
#predict on test set
xbc_pred = xbc.predict(x_test_scaled)

#### Measuring Model Performace for XGB Classifier

In [56]:
#model accuracy
xbc_accuracy = accuracy_score(y_test, xbc_pred)
print('Accuracy: {}'.format(round(xbc_accuracy*100), 2))

#precision
xbc_precision = precision_score(y_test, xbc_pred, pos_label='stable')
print('Precision: {}'.format(round(xbc_precision*100), 2))  

#recall
xbc_recall = recall_score(y_test, xbc_pred, pos_label='stable')
print('Recall: {}'.format(round(xbc_recall*100), 2))

#F1 score
xbc_f1 = f1_score(y_test, xbc_pred, pos_label='stable')
print('F1: {}'.format(round(xbc_f1*100), 2))

#classification report
print('Classification Report:\n', classification_report(y_test,xbc_pred, digits =4))

#confusion matrix
c_cnf_mat = confusion_matrix(y_test, xbc_pred)
print('Confusion Matrix:\n', xbc_cnf_mat)


Accuracy: 92.0
Precision: 92.0
Recall: 85.0
F1: 88.0
Classification Report:
               precision    recall  f1-score   support

      stable     0.9206    0.8469    0.8822       712
    unstable     0.9190    0.9596    0.9389      1288

    accuracy                         0.9195      2000
   macro avg     0.9198    0.9033    0.9105      2000
weighted avg     0.9195    0.9195    0.9187      2000

Confusion Matrix:
 [[ 603  109]
 [  52 1236]]


### LightGBM Classifier

In [36]:
from lightgbm import LGBMClassifier

lgbm= LGBMClassifier(random_state = 1)

#fit on train set
lgbm.fit(x_train_scaled, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=1, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [37]:
#predict on test set
lgbm_pred = lgbm.predict(x_test_scaled)

#### Model Performance for LGBM Classifier

In [38]:
#model accuracy
lgbm_accuracy = accuracy_score(y_test, lgbm_pred)
print('Accuracy: {}'.format(round(lgbm_accuracy*100), 2))

#precision
lgbm_precision = precision_score(y_test, lgbm_pred, pos_label='stable')
print('Precision: {}'.format(round(lgbm_precision*100), 2))  

#recall
lgbm_recall = recall_score(y_test, lgbm_pred, pos_label='stable')
print('Recall: {}'.format(round(lgbm_recall*100), 2))

#F1 score
lgbm_f1 = f1_score(y_test, lgbm_pred, pos_label='stable')
print('F1: {}'.format(round(lgbm_f1*100), 2))

#classification report
print('Classification Report:\n', classification_report(y_test,lgbm_pred, digits =4))

#confusion matrix
lgbm_cnf_mat = confusion_matrix(y_test, lgbm_pred)
print('Confusion Matrix:\n', lgbm_cnf_mat)


Accuracy: 94.0
Precision: 93.0
Recall: 89.0
F1: 91.0
Classification Report:
               precision    recall  f1-score   support

      stable     0.9297    0.8919    0.9104       712
    unstable     0.9415    0.9627    0.9520      1288

    accuracy                         0.9375      2000
   macro avg     0.9356    0.9273    0.9312      2000
weighted avg     0.9373    0.9375    0.9372      2000

Confusion Matrix:
 [[ 635   77]
 [  48 1240]]


#### Improving ExtraTreesClassifier

In [42]:
#combination of hyperparameters
n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter_grid = {'n_estimators': n_estimators,

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,

                       'max_features': max_features}




In [57]:
from sklearn.model_selection import RandomizedSearchCV

#etc = ExtraTreesClassifier(random_state = 1)

#set up randomsearch with 5folds

randomcv = RandomizedSearchCV(estimator = etc, 
                              param_distributions = hyperparameter_grid, cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1,
                              random_state = 1)


In [58]:
#fit on the training data
search = randomcv.fit(x_train_scaled, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.4min finished


In [59]:
#get best parameters
search.best_params_

{'max_features': None,
 'min_samples_leaf': 8,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [60]:
#check feature importances
importance = etc2.feature_importances_

In [62]:
#print feature importances
for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))

Feature: 0, Score: 0.13724
Feature: 1, Score: 0.14051
Feature: 2, Score: 0.13468
Feature: 3, Score: 0.13542
Feature: 4, Score: 0.00368
Feature: 5, Score: 0.00534
Feature: 6, Score: 0.00543
Feature: 7, Score: 0.00496
Feature: 8, Score: 0.10256
Feature: 9, Score: 0.10758
Feature: 10, Score: 0.11306
Feature: 11, Score: 0.10954


In [70]:
x_train_scaled.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,0.367327,-0.986042,0.650447,1.547527,-0.29149,0.061535,1.293862,-0.845074,0.160918,0.339859,0.585568,0.492239
1,-0.064659,0.089437,1.035079,-1.641494,0.619865,-0.067235,-1.502925,0.486613,-0.293143,-1.558488,1.429649,-1.443521
2,-1.46785,1.298418,-0.502536,1.166046,-0.180521,0.490603,0.68256,-0.855302,1.39935,1.451534,-1.045743,0.492489
3,0.820081,0.52992,1.299657,-1.141975,-0.812854,-0.763632,1.521579,0.65878,-0.958319,1.361958,1.60414,0.275303
4,0.665424,-1.425627,0.3123,0.919137,-1.614296,0.760315,1.422019,0.639243,1.676895,0.69566,1.137504,-1.312575


In [64]:
#get best score
search.best_score_

0.9241249999999999

In [65]:
#Evaluate ExtraTreesClassifier on test set using  best params
etc2 = ExtraTreesClassifier(max_features = None, 
                            min_samples_leaf= 8,
                            min_samples_split= 2,
                            n_estimators= 1000, 
                            random_state = 1)

#fit on train set
etc2.fit(x_train_scaled, y_train)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features=None,
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=8, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=1000,
                     n_jobs=None, oob_score=False, random_state=1, verbose=0,
                     warm_start=False)

In [66]:
#predict on test set
etc2_pred = etc2.predict(x_test_scaled)

In [68]:
#classification report
print('Classification Report:\n', classification_report(y_test,etc2_pred, digits =4))


Classification Report:
               precision    recall  f1-score   support

      stable     0.9211    0.8694    0.8945       712
    unstable     0.9300    0.9589    0.9442      1288

    accuracy                         0.9270      2000
   macro avg     0.9256    0.9141    0.9193      2000
weighted avg     0.9268    0.9270    0.9265      2000

