In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV

In [7]:
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00471/Data_for_UCI_named.csv')
data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [8]:
data = data.drop(["stab"], axis=1)
X = data.drop(["stabf"], 1)
y = data["stabf"]

In [9]:
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [10]:
sc = StandardScaler()
 
normalised_train_df = pd.DataFrame(sc.fit_transform(X_train), columns=X_train.columns)
 
normalised_test_df = pd.DataFrame(sc.transform(X_test), columns=X_test.columns)

In [11]:
RFC = RandomForestClassifier(random_state = 1)
RFC.fit(normalised_train_df, y_train)
RFC_pred = RFC.predict(normalised_test_df)

In [12]:
CR = classification_report(y_test, RFC_pred, digits = 4)
print(f'{CR}')

              precision    recall  f1-score   support

      stable     0.9191    0.8778    0.8980       712
    unstable     0.9341    0.9573    0.9456      1288

    accuracy                         0.9290      2000
   macro avg     0.9266    0.9176    0.9218      2000
weighted avg     0.9288    0.9290    0.9286      2000



In [13]:
AC = accuracy_score(y_true = y_test, y_pred = RFC_pred)
print(round(AC, 4))

0.929


In [14]:
xgb = XGBClassifier(random_state=1)
xgb.fit(normalised_train_df, y_train)
xgb_pred = xgb.predict(normalised_test_df)

In [15]:
# for the classificaton report
CR = classification_report(y_test, xgb_pred, digits = 4)
print(f'{CR}')

              precision    recall  f1-score   support

      stable     0.9206    0.8469    0.8822       712
    unstable     0.9190    0.9596    0.9389      1288

    accuracy                         0.9195      2000
   macro avg     0.9198    0.9033    0.9105      2000
weighted avg     0.9195    0.9195    0.9187      2000



In [16]:
AC = accuracy_score(y_true = y_test, y_pred = xgb_pred)
print(round(AC, 4))

0.9195


In [19]:
lgbm = LGBMClassifier(random_state=1)
lgbm.fit(normalised_train_df, y_train)
lgbm_pred = lgbm.predict(normalised_test_df)

In [21]:
CR = classification_report(y_test, lgbm_pred, digits = 4)
print(f'{CR}')

              precision    recall  f1-score   support

      stable     0.9297    0.8919    0.9104       712
    unstable     0.9415    0.9627    0.9520      1288

    accuracy                         0.9375      2000
   macro avg     0.9356    0.9273    0.9312      2000
weighted avg     0.9373    0.9375    0.9372      2000



In [22]:
AC = accuracy_score(y_true = y_test, y_pred = lgbm_pred)
print(round(AC, 4))

0.9375


In [24]:
tree = ExtraTreesClassifier(random_state =1)
tree.fit(normalised_train_df, y_train)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=1, verbose=0,
                     warm_start=False)

In [25]:
tree_pred = tree.predict(normalised_test_df)

In [26]:
CR = classification_report(y_test, tree_pred, digits = 4)
print(f'{CR}')

              precision    recall  f1-score   support

      stable     0.9410    0.8511    0.8938       712
    unstable     0.9218    0.9705    0.9455      1288

    accuracy                         0.9280      2000
   macro avg     0.9314    0.9108    0.9197      2000
weighted avg     0.9287    0.9280    0.9271      2000



In [27]:
AC = accuracy_score(y_true = y_test, y_pred = tree_pred)
print(round(AC, 4))

0.928


In [28]:
min_samples_leaf = [1, 2, 4, 6, 8]
min_samples_split = [2, 3, 5, 7, 9]
n_estimators = [50, 100, 300, 500, 1000]
max_features = ['auto', 'sqrt', 'log2', None] 
hyperparameter_grid = {'n_estimators': n_estimators,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}

In [29]:
clf = RandomizedSearchCV(tree, hyperparameter_grid, random_state=1)
search = clf.fit(normalised_train_df, y_train)
search.best_params_.values()

dict_values([1000, 2, 8, None])

In [30]:
hype = ExtraTreesClassifier(verbose = 1,
                                  n_estimators=1000, min_samples_split=2, 
                                 min_samples_leaf=8, max_features=None, random_state=1)
hype.fit(normalised_train_df, y_train)
hype_pred = hype.predict(normalised_test_df)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:   10.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.4s finished


In [31]:
CR = classification_report(y_test, hype_pred, digits = 4)
print(f'{CR}')

              precision    recall  f1-score   support

      stable     0.9211    0.8694    0.8945       712
    unstable     0.9300    0.9589    0.9442      1288

    accuracy                         0.9270      2000
   macro avg     0.9256    0.9141    0.9193      2000
weighted avg     0.9268    0.9270    0.9265      2000



In [32]:
AC = accuracy_score(y_true = y_test, y_pred = hype_pred)
print(round(AC, 4))

0.927


In [34]:
sorted(zip(hype.feature_importances_,data.drop(columns=['stabf'],axis=1)),reverse=True)

[(0.14050750384993677, 'tau2'),
 (0.13723974766109256, 'tau1'),
 (0.1354167630909727, 'tau4'),
 (0.13468028520386593, 'tau3'),
 (0.11306267999167334, 'g3'),
 (0.10954089174337298, 'g4'),
 (0.10757764577478764, 'g2'),
 (0.10256244080927947, 'g1'),
 (0.005429268421191957, 'p3'),
 (0.005336864710946151, 'p2'),
 (0.004962486591192238, 'p4'),
 (0.003683422151688322, 'p1')]