## Import the dataset

In [56]:
import numpy as np
import pandas as pd

In [57]:
data = pd.read_csv('dataset.csv')
data.head(3)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable


## Remove the stab column

In [58]:
data.drop('stab', axis=1, inplace=True)

In [59]:
data.head(3)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,unstable


In [60]:
from sklearn.preprocessing import StandardScaler

## Setup training and testing data

In [61]:
from sklearn.model_selection import train_test_split

In [62]:
data.isna().sum

<bound method DataFrame.sum of        tau1   tau2   tau3   tau4     p1     p2     p3     p4     g1     g2  \
0     False  False  False  False  False  False  False  False  False  False   
1     False  False  False  False  False  False  False  False  False  False   
2     False  False  False  False  False  False  False  False  False  False   
3     False  False  False  False  False  False  False  False  False  False   
4     False  False  False  False  False  False  False  False  False  False   
...     ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
9995  False  False  False  False  False  False  False  False  False  False   
9996  False  False  False  False  False  False  False  False  False  False   
9997  False  False  False  False  False  False  False  False  False  False   
9998  False  False  False  False  False  False  False  False  False  False   
9999  False  False  False  False  False  False  False  False  False  False   

         g3     g4  stabf  
0   

In [63]:
data.isnull()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,False,False,False,False,False,False,False,False,False,False,False,False,False
9996,False,False,False,False,False,False,False,False,False,False,False,False,False
9997,False,False,False,False,False,False,False,False,False,False,False,False,False
9998,False,False,False,False,False,False,False,False,False,False,False,False,False


In [64]:
# Encode categorical data
from sklearn.preprocessing import LabelEncoder
encode = LabelEncoder()
data['stabf'] = encode.fit_transform(data['stabf'])

In [66]:
Y = data['stabf'].values.reshape(-1,1)
X = data.drop('stabf', axis=1).values
print((X.ndim, Y.ndim))
scale = StandardScaler()
X = scale.fit_transform(X)

(2, 2)


In [67]:
Y

array([[1],
       [0],
       [1],
       ...,
       [0],
       [1],
       [1]])

In [68]:
X

array([[-0.83537431, -0.79131661,  1.14170354, ...,  1.22001311,
         1.32162751,  1.57902607],
       [ 1.47829663, -0.12670487, -0.80311147, ...,  1.23035426,
         0.13542358,  0.93625569],
       [ 1.35709296,  1.31213982, -0.80349871, ...,  0.88129868,
         1.14659574, -1.51380226],
       ...,
       [-1.05234609, -0.87804866,  1.28587062, ...,  1.68284371,
        -1.37001303, -1.38205402],
       [ 1.59768553, -0.45784646, -0.90902909, ...,  0.2281105 ,
         1.32772953,  1.06982944],
       [ 0.4669346 ,  0.55855544, -0.32829064, ..., -0.0713212 ,
        -0.53325125,  1.52285961]])

In [30]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

## Setup classification

In [69]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import ExtraTreeClassifier
from xgboost import XGBClassifier

In [103]:
random_forest = RandomForestClassifier()
extra_tree = ExtraTreeClassifier()
xgboost = XGBClassifier()

In [71]:
extra_tree.fit(x_train, y_train)

ExtraTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, random_state=None,
                    splitter='random')

In [72]:
random_forest.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [74]:
from lightgbm import LGBMClassifier

In [76]:
lgbm = LGBMClassifier()
lgbm.fit(x_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [78]:
from sklearn.metrics import accuracy_score

In [79]:
pred_rf = random_forest.predict(x_test)

In [81]:
accuracy_score(y_test, pred_rf)

0.9245

In [84]:
xgboost.fit(x_train, y_train)
pred_xgb = xgboost.predict(x_test)
accuracy_score(y_test, pred_xgb)

0.9195

In [85]:
pred_lgbm = lgbm.predict(x_test)
accuracy_score(y_test, pred_lgbm)

0.9375

In [86]:
from sklearn.model_selection import RandomizedSearchCV

In [109]:
n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter_grid = {'n_estimators': [50, 100, 300, 500, 1000],

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,

                       'max_features': max_features}

In [110]:
rsv = RandomizedSearchCV(extra_tree, hyperparameter_grid, cv=5, n_iter=10, scoring='accuracy', n_jobs=-1, verbose=1, random_state=1)

In [111]:
rsv.fit(x_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


ValueError: Invalid parameter n_estimators for estimator ExtraTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, random_state=None,
                    splitter='random'). Check the list of available parameters with `estimator.get_params().keys()`.

In [94]:
rsv_predict = rsv.predict(x_test)

In [95]:
accuracy_score(y_test, rsv_predict)

0.8345

In [97]:
extra_tree.feature_importances_.argmin()

6

In [98]:
extra_tree.feature_importances_.argmax()

0

In [100]:
data.columns[6]

'p3'

In [101]:
data.columns[0]

'tau1'