In [1]:
# libraries 
import numpy as np
import pandas as pd

In [2]:
# load dataset
grid_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00471/Data_for_UCI_named.csv')
grid_data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [3]:
#checking the missing values
grid_data.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [4]:
grid_data = grid_data.drop(['stab'], axis = 1)
grid_data.head()
X = grid_data.drop(['stabf'], axis = 1)
y = grid_data['stabf']

In [5]:
from sklearn.model_selection import train_test_split
x_train, x_test,  y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
normalised_xtrain = pd.DataFrame(scaler.fit_transform(x_train), columns = x_train.columns)
normalised_xtest = pd.DataFrame(scaler.transform(x_test), columns = x_test.columns)

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import recall_score, classification_report, accuracy_score, precision_score, f1_score

In [8]:
xgb = XGBClassifier(random_state = 1, learning_rate = 0.1, max_depth = 3)
xgb.fit(normalised_xtrain, y_train)
pred = xgb.predict(normalised_xtest)
round(accuracy_score(y_test, pred), 4)





0.9195

In [9]:
forest = RandomForestClassifier(random_state = 1)
forest.fit(normalised_xtrain, y_train)
pred = forest.predict(normalised_xtest)
round(accuracy_score(y_test, pred), 4)

0.929

In [10]:
lgbm = LGBMClassifier(random_state = 1)
lgbm.fit(normalised_xtrain, y_train)
pred = lgbm.predict(normalised_xtest)
round(accuracy_score(y_test, pred), 4)

0.9395

In [11]:
tree = ExtraTreesClassifier(random_state = 1)
tree.fit(normalised_xtrain, y_train)
pred = tree.predict(normalised_xtest)
round(accuracy_score(y_test, pred), 4)

0.928

In [12]:
tp = 355
fp = 1480
fn = 45
tn = 120
precision = (tp/(tp+fp))
recall = (tp/(tp+fn))
F1 = 2 * (precision * recall) / (precision + recall)
F1

0.3176733780760626

In [13]:
def get_feature_importance(model, feat, col_name):
    #this function returns the importance of every feature
    importance = pd.Series(model.feature_importances_, feat.columns).sort_values()
    importance_df = pd.DataFrame(importance).reset_index()
    importance_df.columns = ['Features', col_name]
    importance_df[col_name].round(3)
    return importance_df
get_feature_importance(tree, normalised_xtrain, 'Feature_Importance')

Unnamed: 0,Features,Feature_Importance
0,p1,0.039507
1,p2,0.040371
2,p4,0.040579
3,p3,0.040706
4,g1,0.089783
5,g2,0.093676
6,g4,0.094019
7,g3,0.096883
8,tau3,0.113169
9,tau4,0.115466


In [14]:
n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter_grid = {'n_estimators': n_estimators,

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,

                       'max_features': max_features}
from sklearn.model_selection import RandomizedSearchCV
rsv = RandomizedSearchCV(tree, hyperparameter_grid, cv=5, n_iter = 10, scoring = 'accuracy', n_jobs = -1, verbose = 1, random_state = 1)
search = rsv.fit(normalised_xtrain, y_train)
search.best_params_
tree = ExtraTreesClassifier(**search.best_params_, random_state = 1)
tree.fit(normalised_xtrain, y_train)
pred = tree.predict(normalised_xtest)
accuracy_score(y_test, pred)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


0.927