In [3]:
#Import all useful libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.utils
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, LeaveOneOut, RandomizedSearchCV
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
%matplotlib inline

In [5]:
#Import the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00471/Data_for_UCI_named.csv'
gridDF = pd.read_csv(url)
gridDF.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [6]:
#check for missing data
gridDF.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [7]:
#drop the stab column
grid_DF = gridDF.drop(columns = ['stab'])
grid_DF.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,unstable


In [9]:
#check the data distribution between the two classes
grid_DF.stabf.value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

Data Split and Feature Scaling

In [10]:
#split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(grid_DF.drop(columns = 'stabf'), grid_DF['stabf'], test_size = 0.20, random_state = 1)

In [12]:
#scale features with standard scaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaledDF = pd.DataFrame(X_train_scaled, columns = X_train.columns)
X_test_scaled = scaler.transform(X_test)
X_test_scaledDF = pd.DataFrame(X_test_scaled, columns = X_test.columns)

In [57]:
#Train a random forest classifier and predict
RF = RandomForestClassifier(random_state = 1)
RF.fit(X_train_scaledDF, y_train)
RFpred = RF.predict(X_test_scaledDF)
RF_Eval_dict = {'accuracy':accuracy_score(y_test, RFpred), 'recall':recall_score(y_test, RFpred, pos_label = 'stable'),
               'precision':precision_score(y_test, RFpred, pos_label = 'stable'),'f1':f1_score(y_test, RFpred, pos_label = 'stable'),
               'Confusion Matrix': confusion_matrix(y_test, RFpred, labels = ['stable', 'unstable'])}
RF_Eval_dict
    
    

{'accuracy': 0.929,
 'recall': 0.8778089887640449,
 'precision': 0.9191176470588235,
 'f1': 0.8979885057471264,
 'Confusion Matrix': array([[ 625,   87],
        [  55, 1233]], dtype=int64)}

Model Training

In [22]:
#train an extra trees classifier and predict
ET = ExtraTreesClassifier(random_state = 1)
ET.fit(X_train_scaledDF, y_train)
ETpred = ET.predict(X_test_scaledDF)
ET_Eval_dict = {'accuracy':accuracy_score(y_test, ETpred), 'recall':recall_score(y_test, ETpred, pos_label = 'stable'),
               'precision':precision_score(y_test, ETpred, pos_label = 'stable'),'f1':f1_score(y_test, ETpred, pos_label = 'stable'),
               'Confusion Matrix': confusion_matrix(y_test, ETpred, labels = ['stable', 'unstable'])}
ET_Eval_dict

{'accuracy': 0.928,
 'recall': 0.851123595505618,
 'precision': 0.9409937888198758,
 'f1': 0.8938053097345133,
 'Confusion Matrix': array([[ 606,  106],
        [  38, 1250]], dtype=int64)}

In [23]:
#train an XGB classifier and predict
XG = XGBClassifier(random_state = 1)
XG.fit(X_train_scaledDF, y_train)
XGpred = XG.predict(X_test_scaledDF)
XG_Eval_dict = {'accuracy':accuracy_score(y_test, XGpred), 'recall':recall_score(y_test, XGpred, pos_label = 'stable'),
               'precision':precision_score(y_test, XGpred, pos_label = 'stable'),'f1':f1_score(y_test, XGpred, pos_label = 'stable'),
               'Confusion Matrix': confusion_matrix(y_test, XGpred, labels = ['stable', 'unstable'])}
XG_Eval_dict

{'accuracy': 0.9195,
 'recall': 0.8469101123595506,
 'precision': 0.9206106870229007,
 'f1': 0.8822238478419898,
 'Confusion Matrix': array([[ 603,  109],
        [  52, 1236]], dtype=int64)}

In [24]:
#train a LGB classifier and predict
LGBM = LGBMClassifier(random_state = 1)
LGBM.fit(X_train_scaledDF, y_train)
LGBMpred = LGBM.predict(X_test_scaledDF)
LGBM_Eval_dict = {'accuracy':accuracy_score(y_test, LGBMpred), 'recall':recall_score(y_test, LGBMpred, pos_label = 'stable'),
               'precision':precision_score(y_test, LGBMpred, pos_label = 'stable'),'f1':f1_score(y_test, LGBMpred, pos_label = 'stable'),
               'Confusion Matrix': confusion_matrix(y_test, LGBMpred, labels = ['stable', 'unstable'])}
LGBM_Eval_dict

{'accuracy': 0.9375,
 'recall': 0.8918539325842697,
 'precision': 0.9297218155197657,
 'f1': 0.910394265232975,
 'Confusion Matrix': array([[ 635,   77],
        [  48, 1240]], dtype=int64)}

RANDOMIZED CROSS VALIDATION SEARCH

In [25]:
n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter_grid = {'n_estimators': n_estimators,

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,

                       'max_features': max_features}

In [None]:
ET_CV = RandomizedSearchCV(ET, hyperparameter_grid, random_state = 1)
search = ET_CV.fit(X_train_scaledDF, y_train)

In [28]:
print(search.best_params_)
print(search.best_score_)

{'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 8, 'max_features': None}
0.9241249999999999


In [58]:
#train an extra trees classifier with optimal hyperparameters and predict
ETn = ExtraTreesClassifier(n_estimators= 1000, min_samples_split= 2, min_samples_leaf= 8, max_features= None, random_state = 2)
ETn.fit(X_train_scaledDF, y_train)
ETnpred = ETn.predict(X_test_scaledDF)
ETn_Eval_dict = {'accuracy':accuracy_score(y_test, ETnpred), 'recall':recall_score(y_test, ETnpred, pos_label = 'stable'),
               'precision':precision_score(y_test, ETnpred, pos_label = 'stable'),'f1':f1_score(y_test, ETnpred, pos_label = 'stable'),
               'Confusion Matrix': confusion_matrix(y_test, ETnpred, labels = ['stable', 'unstable'])}
ETn_Eval_dict

{'accuracy': 0.929,
 'recall': 0.8693820224719101,
 'precision': 0.9266467065868264,
 'f1': 0.8971014492753623,
 'Confusion Matrix': array([[ 619,   93],
        [  49, 1239]], dtype=int64)}

In [37]:
feature_importance = ETn.feature_importances_
feature_importance

array([0.13773375, 0.14066142, 0.1345629 , 0.13459055, 0.0037122 ,
       0.00551087, 0.00531595, 0.00514791, 0.10226596, 0.10827646,
       0.1125278 , 0.10969422])

In [54]:
importance_DF = pd.DataFrame(feature_importance, grid_DF.drop(columns = 'stabf').columns, columns = ['importance'])

In [51]:
importance_DF

Unnamed: 0,importance
tau1,0.137734
tau2,0.140661
tau3,0.134563
tau4,0.134591
p1,0.003712
p2,0.005511
p3,0.005316
p4,0.005148
g1,0.102266
g2,0.108276


In [55]:
importance_DF.max()

importance    0.140661
dtype: float64

In [56]:
importance_DF.min()

importance    0.003712
dtype: float64