In [1]:
#Import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv("Data_for_UCI_named.csv")
data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
tau1     10000 non-null float64
tau2     10000 non-null float64
tau3     10000 non-null float64
tau4     10000 non-null float64
p1       10000 non-null float64
p2       10000 non-null float64
p3       10000 non-null float64
p4       10000 non-null float64
g1       10000 non-null float64
g2       10000 non-null float64
g3       10000 non-null float64
g4       10000 non-null float64
stab     10000 non-null float64
stabf    10000 non-null object
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


In [4]:
data.drop('stab', axis=1, inplace=True) #drop column "stab"

In [5]:
#split the data into features, X and target, y
X = data.drop('stabf', axis=1)
y = data.stabf

In [6]:
#Train test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [7]:
#Normalize the features data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_normalize = scaler.fit_transform(x_train)
x_train_normalize = pd.DataFrame(x_train_normalize, columns = x_train.columns)

x_test_normalize = scaler.transform(x_test)
x_test_normalize = pd.DataFrame(x_test_normalize, columns=x_test.columns)

In [8]:
from sklearn.ensemble import RandomForestClassifier

random_fc = RandomForestClassifier(random_state=1)
random_fc.fit(x_train_normalize, y_train)
random_fc_pred = random_fc.predict(x_test_normalize)

In [9]:
from sklearn.metrics import classification_report, accuracy_score
print(classification_report(y_test, random_fc_pred, digits=4))

              precision    recall  f1-score   support

      stable     0.9191    0.8778    0.8980       712
    unstable     0.9341    0.9573    0.9456      1288

    accuracy                         0.9290      2000
   macro avg     0.9266    0.9176    0.9218      2000
weighted avg     0.9288    0.9290    0.9286      2000



In [10]:
accuracy_score(y_test, random_fc_pred)

0.929

In [11]:
from sklearn.ensemble import ExtraTreesClassifier
ETC = ExtraTreesClassifier(random_state=1)
ETC.fit(x_train_normalize, y_train)
ETC_pred = ETC.predict(x_test_normalize)

In [12]:
from sklearn.metrics import classification_report
print(classification_report(y_test, ETC_pred, zero_division=True, digits=6))

              precision    recall  f1-score   support

      stable   0.940994  0.851124  0.893805       712
    unstable   0.921829  0.970497  0.945537      1288

    accuracy                       0.928000      2000
   macro avg   0.931411  0.910810  0.919671      2000
weighted avg   0.928652  0.928000  0.927121      2000



In [13]:
from xgboost import XGBClassifier
XGB = XGBClassifier(random_state=1)
XGB.fit(x_train_normalize, y_train)
XGB_pred = XGB.predict(x_test_normalize)

In [14]:
print(classification_report(y_test, XGB_pred, digits=4))

              precision    recall  f1-score   support

      stable     0.9351    0.9101    0.9224       712
    unstable     0.9510    0.9651    0.9580      1288

    accuracy                         0.9455      2000
   macro avg     0.9430    0.9376    0.9402      2000
weighted avg     0.9453    0.9455    0.9453      2000



In [15]:
from lightgbm import LGBMClassifier
LGBM = LGBMClassifier(random_state=1)
LGBM.fit(x_train_normalize, y_train)
LGBM_pred = LGBM.predict(x_test_normalize)

In [16]:
print(classification_report(y_test, LGBM_pred, digits=4))

              precision    recall  f1-score   support

      stable     0.9297    0.8919    0.9104       712
    unstable     0.9415    0.9627    0.9520      1288

    accuracy                         0.9375      2000
   macro avg     0.9356    0.9273    0.9312      2000
weighted avg     0.9373    0.9375    0.9372      2000



In [17]:
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None] 
hyperparameter_grid = {'n_estimators': n_estimators,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}

In [18]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier

ETC2 = ExtraTreesClassifier(random_state=1)
clf = RandomizedSearchCV(ETC2, hyperparameter_grid, cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1, random_state=1)
result = clf.fit(x_train_normalize, y_train)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   21.3s finished


In [19]:
ETC2.fit(x_train_normalize, y_train)
ETC2_pred = ETC2.predict(x_test_normalize)
accuracy_score(y_test, ETC2_pred)

0.928

In [20]:
result.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

In [21]:
ETC3 = ExtraTreesClassifier(n_estimators = 1000, min_samples_split = 2, min_samples_leaf =8, max_features= None)
ETC3.fit(x_train_normalize, y_train)
ETC3_pred = ETC3.predict(x_test_normalize)

In [22]:
accuracy_score(y_test, ETC3_pred)

0.9265

In [23]:
ETC3.feature_importances_   #print the importance of each feature to the optimal ETC model

[0.13775138 0.14061305 0.13412898 0.1351817  0.00364325 0.00545497
 0.00541282 0.00507191 0.10229039 0.10802605 0.11267404 0.10975146]
