In [23]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

In [24]:
Dataset = pd.read_csv('balanced_filled_Dataset-vf.csv')

In [25]:
Dataset.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,Y
0,2892.0,75.0,7.0,95.0,9.0,1889.0,228.0,228.0,133.0,2371.0,0.79,39.0,3,4.2,Apple
1,3208.0,0.0,9.0,124.0,-2.0,5394.0,206.0,222.0,154.0,900.0,0.97,-740.0,1,4.2,Apple
2,3245.0,97.630426,3.0,564.0,66.0,4387.0,220.0,233.0,150.0,2650.0,0.55,4.0,1,4.2,Apple
3,3157.0,307.0,27.0,120.0,35.0,2971.0,138.0,213.0,211.0,2467.0,0.19,637.0,1,4.4,Apple
4,3246.0,18.0,9.0,120.0,11.0,4333.0,213.0,221.0,144.0,972.0,0.93,833.0,1,5.9,Apple


In [26]:
X = Dataset.drop('Y', axis=1)
Y = Dataset['Y']

In [27]:
X.shape, Y.shape

((7774, 14), (7774,))

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.8, random_state=20211008, stratify=Y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.8, random_state=20211008, stratify=y_train)

In [29]:
import itertools
criterion = ["gini", "entropy"] # The function to measure the quality of a split. 
bootstrap = [True, False] # Whether bootstrap samples are used when building trees.
max_depth = [None, 100, 1000] # The maximum depth of the tree.
max_features = [None, "sqrt", "log2"] # The number of features to consider when looking for the best split   
parameters = [criterion,bootstrap,max_depth, max_features]  
parameters_combinations = list(itertools.product(*parameters))
len(parameters_combinations) 


36

In [30]:
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from imblearn.metrics import geometric_mean_score

In [31]:
from sklearn.ensemble import RandomForestClassifier
# training and hyper-parameter tuning
best_acc_params = {"criterion": None, "bootstrap": None, "max_depth": None,
                         "max_features": None, "accuracy": {
                             "average_score": 0,
                             "f1_score_macro": 0,
                             "f1_score_micro": 0,
                             "MCC": 0,
                             "Gmean": 0
                         }}
for params in parameters_combinations:
    
    random_forest_clf = RandomForestClassifier(criterion=params[0], bootstrap=params[1], 
                                          max_depth=params[2], max_features=params[3])
    random_forest_clf = random_forest_clf.fit(X_train,y_train)
    y_pred = random_forest_clf.predict(X_val)
    
    f1_score_macro = f1_score(y_val, y_pred, average='macro')
    f1_score_micro = f1_score(y_val, y_pred, average='micro')
    MCC_score = matthews_corrcoef(y_val, y_pred)
    Gmean_score = geometric_mean_score(y_val,y_pred, average='macro')
    accuracy = (f1_score_macro + f1_score_micro + MCC_score + Gmean_score) / 4.0
    
    if accuracy > 0.50:
        print(f'parameters: \n criterion: {params[0]}, bootstrap: {params[1]}, max depth: {params[2]}, max features: {params[3]}')
        print(f'accuracy: {accuracy}')
        if accuracy > best_acc_params['accuracy']['average_score']:
            best_acc_params.update({"criterion": params[0], "bootstrap": params[1], "max_depth": params[2],
                         "max_features": params[3], "accuracy": {
                             "average_score": accuracy,
                             "f1_score_macro": f1_score_macro,
                             "f1_score_micro": f1_score_micro,
                             "MCC": MCC_score,
                             "Gmean": Gmean_score
                         }})

print(f'best parameters are: {best_acc_params}')


parameters: 
 criterion: gini, bootstrap: True, max depth: None, max features: None
accuracy: 0.9327346731139107
parameters: 
 criterion: gini, bootstrap: True, max depth: None, max features: sqrt
accuracy: 0.9474595115640756
parameters: 
 criterion: gini, bootstrap: True, max depth: None, max features: log2
accuracy: 0.9460847563389269
parameters: 
 criterion: gini, bootstrap: True, max depth: 100, max features: None
accuracy: 0.9327741385671304
parameters: 
 criterion: gini, bootstrap: True, max depth: 100, max features: sqrt
accuracy: 0.9462952057807787
parameters: 
 criterion: gini, bootstrap: True, max depth: 100, max features: log2
accuracy: 0.9445412286335817
parameters: 
 criterion: gini, bootstrap: True, max depth: 1000, max features: None
accuracy: 0.9297244315429509
parameters: 
 criterion: gini, bootstrap: True, max depth: 1000, max features: sqrt
accuracy: 0.9529697435531119
parameters: 
 criterion: gini, bootstrap: True, max depth: 1000, max features: log2
accuracy: 0.943

In [32]:
best_acc_params

{'criterion': 'entropy',
 'bootstrap': False,
 'max_depth': None,
 'max_features': 'log2',
 'accuracy': {'average_score': 0.9585990110247593,
  'f1_score_macro': 0.9573361599261595,
  'f1_score_micro': 0.954983922829582,
  'MCC': 0.9473671108882388,
  'Gmean': 0.9747088504550568}}

In [33]:
des_tree_clf = RandomForestClassifier(criterion=best_acc_params['criterion'], bootstrap=best_acc_params['bootstrap'], 
                                          max_depth=best_acc_params['max_depth'], max_features=best_acc_params['max_features'])
des_tree_clf = des_tree_clf.fit(X_train,y_train)
y_pred = des_tree_clf.predict(X_test)

f1_score_macro = f1_score(y_test, y_pred, average='macro')
f1_score_micro = f1_score(y_test, y_pred, average='micro')
MCC_score = matthews_corrcoef(y_test, y_pred)
Gmean_score = geometric_mean_score(y_test,y_pred, average='macro')


print(f'f1_score (macro): {f1_score_macro}')
print(f'f1_score (micro): {f1_score_micro}')
print(f'MCC: {MCC_score}')
print(f'Gmean: {Gmean_score}')


f1_score (macro): 0.9512518493735119
f1_score (micro): 0.9485530546623794
MCC: 0.9400022092151247
Gmean: 0.9722374348266637
