In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

In [2]:
training_set = pd.read_csv('processed_data/training_set.csv')
testing_set = pd.read_csv('processed_data/testing_set.csv')

In [3]:
training_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6417 entries, 0 to 6416
Data columns (total 49 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0    FC11   6417 non-null   float64
 1    FC12   6417 non-null   float64
 2    FC13   6417 non-null   float64
 3    FC14   6417 non-null   float64
 4    CA21   6417 non-null   float64
 5    CA22   6417 non-null   float64
 6    CA23   6417 non-null   float64
 7    CA24   6417 non-null   float64
 8    CA25   6417 non-null   float64
 9    CA26   6417 non-null   float64
 10   CA30   6417 non-null   float64
 11   CA31   6417 non-null   float64
 12   CA32   6417 non-null   float64
 13   CA33   6417 non-null   float64
 14   CA34   6417 non-null   float64
 15   CA36   6417 non-null   float64
 16   CA37   6417 non-null   float64
 17   CA38   6417 non-null   float64
 18   CA39   6417 non-null   float64
 19   CA40   6417 non-null   float64
 20   CA41   6417 non-null   float64
 21   CA42   6417 non-null   float64
 22  

In [4]:
testing_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 957 entries, 0 to 956
Data columns (total 49 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0    FC11   957 non-null    float64
 1    FC12   957 non-null    float64
 2    FC13   957 non-null    float64
 3    FC14   957 non-null    float64
 4    CA21   957 non-null    float64
 5    CA22   957 non-null    float64
 6    CA23   957 non-null    float64
 7    CA24   957 non-null    float64
 8    CA25   957 non-null    float64
 9    CA26   957 non-null    float64
 10   CA30   957 non-null    float64
 11   CA31   957 non-null    float64
 12   CA32   957 non-null    float64
 13   CA33   957 non-null    float64
 14   CA34   957 non-null    float64
 15   CA36   957 non-null    float64
 16   CA37   957 non-null    float64
 17   CA38   957 non-null    float64
 18   CA39   957 non-null    float64
 19   CA40   957 non-null    float64
 20   CA41   957 non-null    float64
 21   CA42   957 non-null    float64
 22   C

In [5]:
X_train = training_set.drop('Y', axis=1)
y_train = training_set['Y']
X_test = testing_set.drop('Y', axis=1)
y_test = testing_set['Y']

In [6]:
X_train.shape, y_train.shape

((6417, 48), (6417,))

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.8, random_state=20211008, 
                                                  stratify=y_train)

In [8]:
import itertools
criterion = ["gini", "entropy"] # The function to measure the quality of a split. 
bootstrap = [True, False] # Whether bootstrap samples are used when building trees.
max_depth = [None, 100, 1000] # The maximum depth of the tree.
max_features = [None, "sqrt", "log2"] # The number of features to consider when looking for the best split   
parameters = [criterion,bootstrap,max_depth, max_features]  
parameters_combinations = list(itertools.product(*parameters))
len(parameters_combinations) 


36

In [9]:
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from imblearn.metrics import geometric_mean_score

In [10]:
from sklearn.ensemble import RandomForestClassifier
# training and hyper-parameter tuning
best_acc_params = {"criterion": None, "bootstrap": None, "max_depth": None,
                         "max_features": None, "accuracy": {
                             "average_score": 0,
                             "f1_score_macro": 0,
                             "f1_score_micro": 0,
                             "MCC": 0,
                             "Gmean": 0
                         }}
for params in parameters_combinations:
    
    random_forest_clf = RandomForestClassifier(criterion=params[0], bootstrap=params[1], 
                                          max_depth=params[2], max_features=params[3])
    random_forest_clf = random_forest_clf.fit(X_train,y_train)
    y_pred = random_forest_clf.predict(X_val)
    
    f1_score_macro = f1_score(y_val, y_pred, average='macro')
    f1_score_micro = f1_score(y_val, y_pred, average='micro')
    MCC_score = matthews_corrcoef(y_val, y_pred)
    Gmean_score = geometric_mean_score(y_val,y_pred, average='macro')
    accuracy = (f1_score_macro + f1_score_micro + MCC_score + Gmean_score) / 4.0
    
    if accuracy > 0.50:
        print(f'parameters: \n criterion: {params[0]}, bootstrap: {params[1]}, max depth: {params[2]}, max features: {params[3]}')
        print(f'accuracy: {accuracy}')
        if accuracy > best_acc_params['accuracy']['average_score']:
            best_acc_params.update({"criterion": params[0], "bootstrap": params[1], "max_depth": params[2],
                         "max_features": params[3], "accuracy": {
                             "average_score": accuracy,
                             "f1_score_macro": f1_score_macro,
                             "f1_score_micro": f1_score_micro,
                             "MCC": MCC_score,
                             "Gmean": Gmean_score
                         }})

print(f'best parameters are: {best_acc_params}')


parameters: 
 criterion: gini, bootstrap: True, max depth: None, max features: None
accuracy: 0.9382274794975722
parameters: 
 criterion: gini, bootstrap: True, max depth: None, max features: sqrt
accuracy: 0.9462685075256435
parameters: 
 criterion: gini, bootstrap: True, max depth: None, max features: log2
accuracy: 0.9505355389381406
parameters: 
 criterion: gini, bootstrap: True, max depth: 100, max features: None
accuracy: 0.9420437625511453
parameters: 
 criterion: gini, bootstrap: True, max depth: 100, max features: sqrt
accuracy: 0.9408127478824115
parameters: 
 criterion: gini, bootstrap: True, max depth: 100, max features: log2
accuracy: 0.945101132617236
parameters: 
 criterion: gini, bootstrap: True, max depth: 1000, max features: None
accuracy: 0.9407235043885885
parameters: 
 criterion: gini, bootstrap: True, max depth: 1000, max features: sqrt
accuracy: 0.9557087456091368
parameters: 
 criterion: gini, bootstrap: True, max depth: 1000, max features: log2
accuracy: 0.9448

In [11]:
best_acc_params

{'criterion': 'gini',
 'bootstrap': False,
 'max_depth': None,
 'max_features': 'sqrt',
 'accuracy': {'average_score': 0.9581957243207317,
  'f1_score_macro': 0.9556738659245755,
  'f1_score_micro': 0.955607476635514,
  'MCC': 0.9478959288448578,
  'Gmean': 0.9736056258779795}}

In [12]:
des_tree_clf = RandomForestClassifier(criterion=best_acc_params['criterion'], bootstrap=best_acc_params['bootstrap'], 
                                          max_depth=best_acc_params['max_depth'], max_features=best_acc_params['max_features'])
des_tree_clf = des_tree_clf.fit(X_train,y_train)
y_pred = des_tree_clf.predict(X_test)

f1_score_macro = f1_score(y_test, y_pred, average='macro')
f1_score_micro = f1_score(y_test, y_pred, average='micro')
MCC_score = matthews_corrcoef(y_test, y_pred)
Gmean_score = geometric_mean_score(y_test,y_pred, average='macro')


print(f'f1_score (macro): {f1_score_macro}')
print(f'f1_score (micro): {f1_score_micro}')
print(f'MCC: {MCC_score}')
print(f'Gmean: {Gmean_score}')


f1_score (macro): 0.8735455052216039
f1_score (micro): 0.9090909090909091
MCC: 0.8860555416456295
Gmean: 0.9415768097375388
