In [74]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns


In [75]:
df = pd.read_csv(r'/content/drive/My Drive/Documents/Data_for_UCI_named.csv')
df.head(2)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable


In [76]:
df = df.drop(columns = "stab", axis = 1)
df.head(2)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable


In [77]:
df_stab = df[df.stabf == 'stable']
df_unstab = df[df.stabf == 'unstable']
data_df = df_stab.append(df_unstab)

import sklearn.utils
data_df = sklearn.utils.shuffle(data_df)
data_df.head(2)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
3287,9.667321,6.237136,8.861572,6.471214,4.231425,-1.154554,-1.261862,-1.815009,0.802051,0.100206,0.490517,0.176005,unstable
8525,6.974846,1.849432,0.702661,0.922617,3.641983,-1.004602,-1.833478,-0.803903,0.632835,0.209192,0.997493,0.777527,stable


In [78]:
data_df = data_df.reset_index(drop = True)

In [79]:
x = df.drop(columns = "stabf")
y = df['stabf']

In [80]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 1)
y_train.value_counts()

unstable    5092
stable      2908
Name: stabf, dtype: int64

In [81]:
from sklearn.preprocessing import StandardScaler
x_train_balanced = pd.DataFrame(x_train, columns = x_train.columns)
scaler = StandardScaler()
sc_train_df = scaler.fit_transform(x_train)
sc_train_df = pd.DataFrame(sc_train_df, columns = x_train_balanced.columns)

In [82]:
x_test = x_test.reset_index(drop =True)
sc_test_df = scaler.transform(x_test)
sc_test_df = pd.DataFrame(sc_test_df, columns = x_test.columns)

In [83]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(sc_train_df, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [84]:
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
predictions = log_reg.predict(sc_test_df)
clf_report = classification_report(y_test, predictions)
print("Classification\n", clf_report)

Classification
               precision    recall  f1-score   support

      stable       0.74      0.69      0.72       712
    unstable       0.84      0.86      0.85      1288

    accuracy                           0.80      2000
   macro avg       0.79      0.78      0.78      2000
weighted avg       0.80      0.80      0.80      2000



In [85]:
cnf = confusion_matrix(y_test, predictions)
print(cnf)

[[ 494  218]
 [ 174 1114]]


In [86]:
precision = precision_score(y_test, predictions, pos_label = 'stable')
print(precision)

0.7395209580838323


In [87]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
rf_clf = RandomForestClassifier(random_state = 1)
y_dict = {'stable': 1}
xtree_clf = ExtraTreesClassifier(random_state = 1, class_weight = y_dict)

rf_clf.fit(sc_train_df, y_train)
xtree_clf.fit(sc_train_df, y_train)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight={'stable': 1},
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=1, verbose=0,
                     warm_start=False)

In [113]:
rf_pred = rf_clf.predict(sc_test_df)
xtree_pred = xtree_clf.predict(sc_test_df)
clf_report_rf = classification_report(y_test, rf_pred)
print("RandomForest Classification:\n", clf_report_rf)

RandomForest Classification:
               precision    recall  f1-score   support

      stable       0.92      0.88      0.90       712
    unstable       0.93      0.96      0.95      1288

    accuracy                           0.93      2000
   macro avg       0.93      0.92      0.92      2000
weighted avg       0.93      0.93      0.93      2000



In [89]:
rf_cnf = confusion_matrix(y_test, rf_pred, labels = ['stable', 'unstable'])
print("RandomForest Confusion: \n", rf_cnf)

RandomForest Confusion: 
 [[ 625   87]
 [  55 1233]]


In [90]:
rf_prec = precision_score(y_test, rf_pred, pos_label = 'stable')
print("RandomForest Precision:", rf_prec.round(2))

RandomForest Precision: 0.92


In [114]:
rf_acc = accuracy_score(y_test, rf_pred)
print("RandomForest Accuracy:  ", rf_acc.round(4))

RandomForest Accuracy:   0.929


In [92]:
clf_report_xtree = classification_report(y_test, xtree_pred)
print("XTrees Classification:\n", clf_report_xtree)

XTrees Classification:
               precision    recall  f1-score   support

      stable       0.94      0.85      0.89       712
    unstable       0.92      0.97      0.95      1288

    accuracy                           0.93      2000
   macro avg       0.93      0.91      0.92      2000
weighted avg       0.93      0.93      0.93      2000



In [93]:
xtree_cnf = confusion_matrix(y_test, xtree_pred)
print("XTrees Confusion:\n", xtree_cnf)

XTrees Confusion:
 [[ 606  106]
 [  38 1250]]


In [94]:
xtree_prec = precision_score(y_test, xtree_pred, pos_label = 'stable')
print("XTrees Precision:", xtree_prec.round(2))

XTrees Precision: 0.94


In [119]:
xtree_acc = accuracy_score(y_test, xtree_pred)
print("ExtraTrees Accuracy:  ", xtree_acc.round(4))

ExtraTrees Accuracy:   0.928


In [95]:
# pip! install lightgbm

from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(random_state = 1)
lgbm.fit(sc_train_df, y_train)

lgbm_pred = lgbm.predict(sc_test_df)

In [96]:
lgbm_clf = classification_report(y_test, lgbm_pred)
print("LGBM Classification: \n", lgbm_clf)

LGBM Classification: 
               precision    recall  f1-score   support

      stable       0.93      0.89      0.91       712
    unstable       0.94      0.96      0.95      1288

    accuracy                           0.94      2000
   macro avg       0.94      0.93      0.93      2000
weighted avg       0.94      0.94      0.94      2000



In [117]:
lgbm_acc = accuracy_score(y_test, lgbm_pred)
print("LGBM Accuracy:  ", lgbm_acc.round(4))

LGBM Accuracy:   0.9375


In [97]:
# pip install xgboost

from xgboost import XGBClassifier

xgb = XGBClassifier(random_state = 1)
xgb.fit(sc_train_df, y_train)

xgb_pred = xgb.predict(sc_test_df)

xgb_clf = classification_report(y_test, xgb_pred)
print("XGBoost Classification: \n", xgb_clf)

XGBoost Classification: 
               precision    recall  f1-score   support

      stable       0.92      0.85      0.88       712
    unstable       0.92      0.96      0.94      1288

    accuracy                           0.92      2000
   macro avg       0.92      0.90      0.91      2000
weighted avg       0.92      0.92      0.92      2000



In [98]:
xgb_cnf = confusion_matrix(y_test, xgb_pred)
print(xgb_cnf)

[[ 603  109]
 [  52 1236]]


In [116]:
xgb_acc = accuracy_score(y_test, xgb_pred)
print("XGBoost Accuracy:  ", xgb_acc.round(4))

XGBoost Accuracy:   0.9195


In [99]:
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]
hyperparameter_grid = {'n_estimators': n_estimators, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf, 'max_features': max_features}

from sklearn.model_selection import RandomizedSearchCV
random_search = RandomizedSearchCV(xtree_clf, hyperparameter_grid, random_state = 1)

random_search.fit(sc_train_df, y_train)

RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=ExtraTreesClassifier(bootstrap=False,
                                                  ccp_alpha=0.0,
                                                  class_weight={'stable': 1},
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  max_samples=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                            

In [100]:
random_search.best_estimator_

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight={'stable': 1},
                     criterion='gini', max_depth=None, max_features=None,
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=8, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=1000,
                     n_jobs=None, oob_score=False, random_state=1, verbose=0,
                     warm_start=False)

In [101]:
random_search.best_params_

{'max_features': None,
 'min_samples_leaf': 8,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [102]:
feature_importances = random_search.best_estimator_.feature_importances_
print("Feature Importances: \n", feature_importances)

Feature Importances: 
 [0.13723975 0.1405075  0.13468029 0.13541676 0.00368342 0.00533686
 0.00542927 0.00496249 0.10256244 0.10757765 0.11306268 0.10954089]


In [103]:
sorted(zip(feature_importances, x), reverse = True)

[(0.14050750384993677, 'tau2'),
 (0.13723974766109256, 'tau1'),
 (0.1354167630909727, 'tau4'),
 (0.13468028520386593, 'tau3'),
 (0.11306267999167334, 'g3'),
 (0.10954089174337298, 'g4'),
 (0.10757764577478764, 'g2'),
 (0.10256244080927947, 'g1'),
 (0.005429268421191957, 'p3'),
 (0.005336864710946151, 'p2'),
 (0.004962486591192238, 'p4'),
 (0.003683422151688322, 'p1')]

In [104]:
final_xtree = random_search.best_estimator_
final_pred = final_xtree.predict(sc_test_df)

final_clf = classification_report(y_test, final_pred)
print("Final Classification:\n", final_clf)

Final Classification:
               precision    recall  f1-score   support

      stable       0.92      0.87      0.89       712
    unstable       0.93      0.96      0.94      1288

    accuracy                           0.93      2000
   macro avg       0.93      0.91      0.92      2000
weighted avg       0.93      0.93      0.93      2000



In [105]:
final_cnf = confusion_matrix(y_test, final_pred)
print(final_cnf)

[[ 619   93]
 [  53 1235]]


In [118]:
final_acc = accuracy_score(y_test, final_pred)
print("Final Model Accuracy:  ", final_acc.round(4))

Final Model Accuracy:   0.927
