In [174]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [175]:
#load data into pandas dataframe

data = pd.read_csv('C:\python\Hamoye\Data_for_UCI_named.csv')

In [176]:
#quick description of the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


In [177]:
#drop 'stab' because it has direct relationship with 'stabf'
data = data.drop('stab', axis =1)

In [178]:
data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,unstable


In [179]:
#get features and labels
features = data.drop(columns=['stabf'])

target = data['stabf']

In [180]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score, confusion_matrix, classification_report

In [181]:
#Split the dataset

x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=1)



In [182]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x_train)

x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [184]:

#train a RandomForestClassifier 
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(random_state = 1)

#fit on train set
forest.fit(x_train_scaled, y_train)

In [185]:
#make predictions on test set
forest_pred = forest.predict(x_test_scaled)

In [186]:
#model accuracy
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, forest_pred)
print('Accuracy: {}'.format(round(accuracy*100), 4))



#precision
precision = precision_score(y_test, forest_pred, pos_label='stable')
print('Precision: {}'.format(round(precision*100), 4))  

#recall
recall = recall_score(y_test, forest_pred, pos_label='stable')
print('Recall: {}'.format(round(recall*100), 4))

#F1 score
f1 = f1_score(y_test, forest_pred, pos_label='stable')
print('F1: {}'.format(round(f1*100), 4))

#classification report
print('Classification Report:\n', classification_report(y_test,forest_pred, digits =4))

#confusion matrix
forest_cnf_mat = confusion_matrix(y_test, forest_pred, labels=['unstable', 'stable'])
print('Confusion Matrix:\n', forest_cnf_mat)

Accuracy: 93
Precision: 92
Recall: 88
F1: 90
Classification Report:
               precision    recall  f1-score   support

      stable     0.9191    0.8778    0.8980       712
    unstable     0.9341    0.9573    0.9456      1288

    accuracy                         0.9290      2000
   macro avg     0.9266    0.9176    0.9218      2000
weighted avg     0.9288    0.9290    0.9286      2000

Confusion Matrix:
 [[1233   55]
 [  87  625]]


In [187]:
print("Training set score: {:.3f}".format(forest.score(x_train_scaled, y_train)))
print("Test set score: {:.3f}".format(forest.score(x_test_scaled, y_test)))

     

Training set score: 1.000
Test set score: 0.929


In [188]:
from sklearn.ensemble import ExtraTreesClassifier

extra_tree = ExtraTreesClassifier(random_state = 1)

#fit on the train set
extra_tree.fit(x_train_scaled, y_train)

In [189]:
#predict on test set
extra_tree_pred = extra_tree.predict(x_test_scaled)

In [190]:

#model accuracy
extra_tree_accuracy = accuracy_score(y_test, extra_tree_pred)
print('Accuracy: {}'.format(round(extra_tree_accuracy*100), 4))

#precision
extra_tree_precision = precision_score(y_test, extra_tree_pred, pos_label='stable')
print('Precision: {}'.format(round(extra_tree_precision*100), 4))  

#recall
extra_tree_recall = recall_score(y_test, extra_tree_pred, pos_label='stable')
print('Recall: {}'.format(round(extra_tree_recall*100), 4))

#F1 score
extra_tree_f1 = f1_score(y_test, extra_tree_pred, pos_label='stable')
print('F1: {}'.format(round(extra_tree_f1*100), 4))

#classification report
print('Classification Report:\n', classification_report(y_test,extra_tree_pred, digits =4))

#confusion matrix
extra_tree_cnf_mat = confusion_matrix(y_test, extra_tree_pred)
print('Confusion Matrix:\n', extra_tree_cnf_mat)

Accuracy: 93
Precision: 94
Recall: 85
F1: 89
Classification Report:
               precision    recall  f1-score   support

      stable     0.9410    0.8511    0.8938       712
    unstable     0.9218    0.9705    0.9455      1288

    accuracy                         0.9280      2000
   macro avg     0.9314    0.9108    0.9197      2000
weighted avg     0.9287    0.9280    0.9271      2000

Confusion Matrix:
 [[ 606  106]
 [  38 1250]]


In [191]:
print("Training set score: {:.3f}".format(extra_tree.score(x_train_scaled, y_train)))
print("Test set score: {:.3f}".format(extra_tree.score(x_test_scaled, y_test)))


Training set score: 1.000
Test set score: 0.928


In [192]:
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
y_train_label = label.fit_transform(y_train)
y_test_label = label.fit_transform(y_test)

In [193]:
from xgboost import XGBClassifier

xgboost= XGBClassifier(random_state = 1)

#fit on train set
xgboost.fit(x_train_scaled, y_train_label)

In [194]:
#predict on test set
xgboost_pred = xgboost.predict(x_test_scaled)

In [195]:
#model accuracy
xgboost_accuracy = accuracy_score(y_test_label, xgboost_pred)
print('Accuracy: {}'.format(round(xgboost_accuracy*100), 4))

#precision
xgboost_precision = precision_score(y_test_label, xgboost_pred, pos_label= 0)
print('Precision: {}'.format(round(xgboost_precision*100), 4))  

#recall
xgboost_recall = recall_score(y_test_label, xgboost_pred, pos_label= 0)
print('Recall: {}'.format(round(xgboost_recall*100), 4))

#F1 score
xgboost_f1 = f1_score(y_test_label, xgboost_pred, pos_label= 0)
print('F1: {}'.format(round(xgboost_f1*100), 4))

#classification report
print('Classification Report:\n', classification_report(y_test_label,xgboost_pred, digits =4))

#confusion matrix
c_cnf_mat = confusion_matrix(y_test_label, xgboost_pred)
print('Confusion Matrix:\n', c_cnf_mat)

Accuracy: 95
Precision: 94
Recall: 91
F1: 92
Classification Report:
               precision    recall  f1-score   support

           0     0.9351    0.9101    0.9224       712
           1     0.9510    0.9651    0.9580      1288

    accuracy                         0.9455      2000
   macro avg     0.9430    0.9376    0.9402      2000
weighted avg     0.9453    0.9455    0.9453      2000

Confusion Matrix:
 [[ 648   64]
 [  45 1243]]


In [196]:

from lightgbm import LGBMClassifier

lgbm= LGBMClassifier(random_state = 1)

#fit on train set
lgbm.fit(x_train_scaled, y_train)

In [197]:
#predict on test set
lgbm_pred = lgbm.predict(x_test_scaled)

In [198]:
#model accuracy
lgbm_accuracy = accuracy_score(y_test, lgbm_pred)
print('Accuracy: {}'.format(round(lgbm_accuracy*100),digits = 4))

#precision
lgbm_precision = precision_score(y_test, lgbm_pred, pos_label='stable')
print('Precision: {}'.format(round(lgbm_precision*100), 4))  

#recall
lgbm_recall = recall_score(y_test, lgbm_pred, pos_label='stable')
print('Recall: {}'.format(round(lgbm_recall*100), 4))

#F1 score
lgbm_f1 = f1_score(y_test, lgbm_pred, pos_label='stable')
print('F1: {}'.format(round(lgbm_f1*100), 4))

#classification report
print('Classification Report:\n', classification_report(y_test,lgbm_pred, digits =4))

#confusion matrix
lgbm_cnf_mat = confusion_matrix(y_test, lgbm_pred)
print('Confusion Matrix:\n', lgbm_cnf_mat)

Accuracy: 94
Precision: 93
Recall: 90
F1: 91
Classification Report:
               precision    recall  f1-score   support

      stable     0.9276    0.9003    0.9138       712
    unstable     0.9458    0.9612    0.9534      1288

    accuracy                         0.9395      2000
   macro avg     0.9367    0.9307    0.9336      2000
weighted avg     0.9393    0.9395    0.9393      2000

Confusion Matrix:
 [[ 641   71]
 [  50 1238]]


In [199]:
#hyperparameters
n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter = {'n_estimators': n_estimators,

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,

                       'max_features': max_features}


In [200]:
from sklearn.model_selection import RandomizedSearchCV

extra_tree = ExtraTreesClassifier(random_state = 1)



random = RandomizedSearchCV(estimator = extra_tree, 
                              param_distributions = hyperparameter, cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1,
                              random_state = 1)

In [201]:
#fit on the training data
search = random.fit(x_train_scaled, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [202]:
#get best parameters
search.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

In [203]:

extra_tree2 = ExtraTreesClassifier(max_features = None, 
                            min_samples_leaf= 8,
                            min_samples_split= 2,
                            n_estimators= 1000, 
                            random_state = 1)


extra_tree2.fit(x_train_scaled, y_train)

In [204]:
importance = extra_tree2.feature_importances_
for feature,score in enumerate(importance):
	print('Feature: %0d, Score: %.4f' % (feature,score))

Feature: 0, Score: 0.1372
Feature: 1, Score: 0.1405
Feature: 2, Score: 0.1347
Feature: 3, Score: 0.1354
Feature: 4, Score: 0.0037
Feature: 5, Score: 0.0053
Feature: 6, Score: 0.0054
Feature: 7, Score: 0.0050
Feature: 8, Score: 0.1026
Feature: 9, Score: 0.1076
Feature: 10, Score: 0.1131
Feature: 11, Score: 0.1095


In [205]:
#get best score
search.best_score_

0.9241249999999999

In [206]:
#predict on test set
extra_tree2_pred = extra_tree2.predict(x_test_scaled)