In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Read the CSV (path is relative to the notebook's working directory)
# and preview the first five rows as the cell output.
data = pd.read_csv('Data_for_UCI_named.csv')
data.head(5)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [3]:
# Separate predictors from the target. 'stabf' is the class label to predict.
# 'stab' is dropped as well -- presumably 'stabf' is derived from it (the
# preview rows are consistent with positive -> unstable), so keeping it would
# leak the label; confirm against the dataset description.
X = data.drop(columns=['stab', 'stabf'])
y = data['stabf']

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [5]:
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits, so no
# statistics from the test split influence the scaling.
sc = StandardScaler()
sc.fit(X_train)
scaled_X_train = sc.transform(X_train)
scaled_X_test = sc.transform(X_test)

In [6]:
# Convert the string class labels into integer codes. The encoder is fitted
# on the training labels and reused for the test labels so both share one
# mapping.
# NOTE: y_train / y_test are overwritten here, so re-running this cell on a
# live kernel would fit the encoder on the already-encoded integers.
encoder = LabelEncoder()
encoder.fit(y_train)
y_train = pd.Series(encoder.transform(y_train))
y_test = pd.Series(encoder.transform(y_test))

In [7]:
#14
# Random forest with default hyper-parameters (seeded), scored by accuracy
# on the held-out test split.
random_forest = RandomForestClassifier(random_state=1).fit(scaled_X_train, y_train)
predictions = random_forest.predict(scaled_X_test)
accuracy_score(y_true=y_test, y_pred=predictions)

0.929

In [8]:
#15
# XGBoost classifier with default hyper-parameters, scored by accuracy on the
# held-out test split.
# random_state=1 matches the seed passed to every other estimator in this
# notebook, so all models are trained under the same reproducibility setting.
xgboost = XGBClassifier(random_state=1)
xgboost.fit(scaled_X_train, y_train)
predictions = xgboost.predict(scaled_X_test)
accuracy_score(y_test, predictions)

0.9455

In [9]:
#16
# LightGBM classifier with default hyper-parameters (seeded), scored by
# accuracy on the held-out test split.
lgbm = LGBMClassifier(random_state=1).fit(scaled_X_train, y_train)
predictions = lgbm.predict(scaled_X_test)
accuracy_score(y_true=y_test, y_pred=predictions)

0.9395

In [10]:
#17
# Randomised hyper-parameter search for an ExtraTreesClassifier.
# Ten parameter combinations are sampled from the lists below and each one is
# scored by 5-fold cross-validated accuracy on the scaled training split.
# The cell output is the best combination found.
extra_trees = ExtraTreesClassifier(random_state=1)
param_distribution = {
    'n_estimators': [100, 300, 500, 1000],
    'min_samples_split': [2, 5, 7],
    'min_samples_leaf': [4, 6, 8],
    # 'sqrt' replaces the former 'auto' alias: for classifiers 'auto' meant
    # sqrt(n_features). The alias was deprecated in scikit-learn 1.1 and
    # removed in 1.3, where it fails parameter validation.
    'max_features': ['sqrt', None, 'log2'],
}
# NOTE: despite the "grid_search" prefix this is a RandomizedSearchCV; the
# name is kept because later cells refer to it.
grid_search_extra_trees = RandomizedSearchCV(
    extra_trees,
    param_distributions=param_distribution,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # run the fits in parallel on all available cores
    verbose=1,
    random_state=1,  # fixes which 10 combinations get sampled
)
grid_search_extra_trees.fit(scaled_X_train, y_train)
grid_search_extra_trees.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   33.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   48.2s finished


{'n_estimators': 1000,
 'min_samples_split': 5,
 'min_samples_leaf': 6,
 'max_features': None}

In [11]:
#18
# Compare an ExtraTreesClassifier with default hyper-parameters against the
# model selected by the randomised search, both on the held-out test split.
extra_trees = ExtraTreesClassifier(random_state=1)
extra_trees.fit(scaled_X_train, y_train)
predictions = extra_trees.predict(scaled_X_test)
# accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the value
# is the same either way, but the order now matches the other model cells.
print(f'Initial Extra Classifier: {accuracy_score(y_test, predictions)}')

# Predicting through the fitted search object uses its refitted best estimator.
predictions = grid_search_extra_trees.predict(scaled_X_test)
print(f'Tuned Extra Classifier: {accuracy_score(y_test, predictions)}')

Initial Extra Classifier: 0.928
Tuned Extra Classifier: 0.932


In [12]:
# Test-set accuracy of the tuned model (the same quantity the previous cell
# prints as "Tuned Extra Classifier"), shown here as the cell output.
predictions = grid_search_extra_trees.predict(scaled_X_test)
# Arguments follow accuracy_score's (y_true, y_pred) signature.
accuracy_score(y_test, predictions)

0.932

In [13]:
#20
# Tabulate the per-feature importances of the fitted `extra_trees` model,
# labelled with the training column names and sorted from least to most
# important.
# NOTE(review): `extra_trees` is the default-parameter model fitted in the
# comparison cell, not the tuned one. If the tuned model's importances are
# wanted, read them from grid_search_extra_trees.best_estimator_ -- confirm
# which was intended.
importance_table = pd.DataFrame(
    data=extra_trees.feature_importances_,
    index=X_train.columns,
    columns=['Feature importance'],
)
importance_table.sort_values(by='Feature importance')

Unnamed: 0,Feature importance
p1,0.039507
p2,0.040371
p4,0.040579
p3,0.040706
g1,0.089783
g2,0.093676
g4,0.094019
g3,0.096883
tau3,0.113169
tau4,0.115466
