<a href="https://colab.research.google.com/github/Bennykillua/Hamoye/blob/master/HamoyeStageThreeLogisticRegressionQuiz.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#importing the necessary libraries
import pandas as pd
import numpy as np

In [2]:
#loading my dataset
url ="https://archive.ics.uci.edu/ml/machine-learning-databases/00471/Data_for_UCI_named.csv"

df = pd.read_csv(url, error_bad_lines=False)

df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [3]:
df.columns

Index(['tau1', 'tau2', 'tau3', 'tau4', 'p1', 'p2', 'p3', 'p4', 'g1', 'g2',
       'g3', 'g4', 'stab', 'stabf'],
      dtype='object')

In [4]:
df['stabf'].value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [5]:
#finding out which column has nan values
df.isna().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [6]:
#drop 'stab' so 'stabf' is the sole dependent variable
df.drop(['stab'], axis=1,inplace=True)

In [7]:
#Preprocessing and defining our x and y.
x = df.drop(columns = 'stabf')
y = df['stabf']

In [8]:
#split our dataset
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
y_train.value_counts()

unstable    5092
stable      2908
Name: stabf, dtype: int64

In [9]:
#standard scaler

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

scaled_train_df = scaler.fit_transform(x_train)
scaled_test_df = scaler.transform(x_test)


scaled_xtrain_df = pd.DataFrame(scaled_train_df, columns=x_train.columns)
scaled_xtest_df = pd.DataFrame(scaled_test_df, columns= x_test.columns)

In [10]:
#Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=1)
rfc.fit(scaled_xtrain_df, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [11]:
rfcpredictions = rfc.predict(scaled_xtest_df)

In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix

In [13]:
print("Accuracy score {}".format(round(accuracy_score(y_test, rfcpredictions), 4)))
print("Precision score for label stable %.3f" % (precision_score(y_test, rfcpredictions, pos_label='stable')))
print("Recall score for label stable {}".format(round(recall_score(y_test, rfcpredictions, pos_label='stable'), 4)))
print("F1 score %.3f" % (f1_score(y_test, rfcpredictions, pos_label='stable')))

Accuracy score 0.929
Precision score for label stable 0.919
Recall score for label stable 0.8778
F1 score 0.898


In [15]:
#confusion Matrix

cnf_mat = confusion_matrix(y_true=y_test, y_pred=rfcpredictions, labels=['stable', 'unstable'])
cnf_mat

array([[ 625,   87],
       [  55, 1233]])

In [14]:
from sklearn.metrics import classification_report

print(classification_report(y_test, rfcpredictions, digits=4))

              precision    recall  f1-score   support

      stable     0.9191    0.8778    0.8980       712
    unstable     0.9341    0.9573    0.9456      1288

    accuracy                         0.9290      2000
   macro avg     0.9266    0.9176    0.9218      2000
weighted avg     0.9288    0.9290    0.9286      2000



In [18]:
#xgboost
from xgboost import XGBClassifier
xg = XGBClassifier(random_state =1)

xg.fit(scaled_xtrain_df, y_train)

xg_pred = xg.predict(scaled_xtest_df)

In [19]:
print("Accuracy score {}".format(round(accuracy_score(y_test, xg_pred), 4)))
print("Precision score for label stable %.3f" % (precision_score(y_test, xg_pred, pos_label='stable')))
print("Recall score for label stable {}".format(round(recall_score(y_test, xg_pred, pos_label='stable'), 4)))
print("F1 score %.3f" % (f1_score(y_test, xg_pred, pos_label='stable')))

Accuracy score 0.9195
Precision score for label stable 0.921
Recall score for label stable 0.8469
F1 score 0.882


In [20]:
#confusion Matrix

cnf_mat = confusion_matrix(y_true=y_test, y_pred=xg_pred, labels=['stable', 'unstable'])
cnf_mat

array([[ 603,  109],
       [  52, 1236]])

In [21]:
#classification report
from sklearn.metrics import classification_report

print(classification_report(y_test, xg_pred, digits=4))

              precision    recall  f1-score   support

      stable     0.9206    0.8469    0.8822       712
    unstable     0.9190    0.9596    0.9389      1288

    accuracy                         0.9195      2000
   macro avg     0.9198    0.9033    0.9105      2000
weighted avg     0.9195    0.9195    0.9187      2000



In [22]:
#lightgbm
from lightgbm import LGBMClassifier

light = LGBMClassifier(random_state=1)

light.fit(scaled_xtrain_df, y_train)

light_pred = light.predict(scaled_xtest_df)

In [23]:
print("Accuracy score {}".format(round(accuracy_score(y_test, light_pred), 4)))
print("Precision score for label stable %.3f" % (precision_score(y_test, light_pred, pos_label='stable')))
print("Recall score for label stable {}".format(round(recall_score(y_test, light_pred, pos_label='stable'), 4)))
print("F1 score %.3f" % (f1_score(y_test, light_pred, pos_label='stable')))

Accuracy score 0.9375
Precision score for label stable 0.930
Recall score for label stable 0.8919
F1 score 0.910


In [24]:
#confusion Matrix

cnf_mat = confusion_matrix(y_true=y_test, y_pred=light_pred, labels=['stable', 'unstable'])
cnf_mat

array([[ 635,   77],
       [  48, 1240]])

In [25]:
from sklearn.metrics import classification_report
print(classification_report(y_test, light_pred, digits=4))

              precision    recall  f1-score   support

      stable     0.9297    0.8919    0.9104       712
    unstable     0.9415    0.9627    0.9520      1288

    accuracy                         0.9375      2000
   macro avg     0.9356    0.9273    0.9312      2000
weighted avg     0.9373    0.9375    0.9372      2000



In [26]:
#extra tree classifier
from sklearn.ensemble import ExtraTreesClassifier

tree = ExtraTreesClassifier(random_state=1)

tree.fit(scaled_xtrain_df, y_train)

tree_pred = tree.predict(scaled_xtest_df)

In [27]:
print("Accuracy score {}".format(round(accuracy_score(y_test, tree_pred), 4)))
print("Precision score for label stable %.3f" % (precision_score(y_test, tree_pred, pos_label='stable')))
print("Recall score for label stable {}".format(round(recall_score(y_test, tree_pred, pos_label='stable'), 4)))
print("F1 score %.3f" % (f1_score(y_test, tree_pred, pos_label='stable')))

Accuracy score 0.928
Precision score for label stable 0.941
Recall score for label stable 0.8511
F1 score 0.894


In [28]:
#confusion Matrix

cnf_mat = confusion_matrix(y_true=y_test, y_pred=tree_pred, labels=['stable', 'unstable'])
cnf_mat

array([[ 606,  106],
       [  38, 1250]])

In [29]:
from sklearn.metrics import classification_report
print(classification_report(y_test, tree_pred, digits=4))

              precision    recall  f1-score   support

      stable     0.9410    0.8511    0.8938       712
    unstable     0.9218    0.9705    0.9455      1288

    accuracy                         0.9280      2000
   macro avg     0.9314    0.9108    0.9197      2000
weighted avg     0.9287    0.9280    0.9271      2000



In [31]:
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None] 
hyperparameter_grid = {'n_estimators': n_estimators,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}

In [32]:
from sklearn.model_selection import RandomizedSearchCV
clf = RandomizedSearchCV(tree, hyperparameter_grid, random_state=1)
search = clf.fit(scaled_xtrain_df, y_train)

In [33]:
#checking for the best parameter for the model
search.best_params_

{'max_features': None,
 'min_samples_leaf': 8,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [35]:
#test the model's performance with the best parameter
tree_param = ExtraTreesClassifier(n_estimators=1000, min_samples_leaf=8, 
                                 min_samples_split=2, max_features=None, random_state=1)

tree_param.fit(scaled_xtrain_df, y_train)

tree_param_pred = tree_param.predict(scaled_xtest_df)

In [36]:
feature_importances = clf.best_estimator_.feature_importances_
print("feature importance: \n", (feature_importances))


feature importance: 
 [0.13723975 0.1405075  0.13468029 0.13541676 0.00368342 0.00533686
 0.00542927 0.00496249 0.10256244 0.10757765 0.11306268 0.10954089]


In [39]:
sorted (zip(feature_importances, x), reverse= True)

[(0.14050750384993677, 'tau2'),
 (0.13723974766109256, 'tau1'),
 (0.1354167630909727, 'tau4'),
 (0.13468028520386593, 'tau3'),
 (0.11306267999167334, 'g3'),
 (0.10954089174337298, 'g4'),
 (0.10757764577478764, 'g2'),
 (0.10256244080927947, 'g1'),
 (0.005429268421191957, 'p3'),
 (0.005336864710946151, 'p2'),
 (0.004962486591192238, 'p4'),
 (0.003683422151688322, 'p1')]