In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

import pandas as pd
import numpy as np

In [2]:
training_set = pd.read_csv('processed_data/training_set.csv')
testing_set = pd.read_csv('processed_data/testing_set.csv')

In [3]:
training_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6417 entries, 0 to 6416
Data columns (total 49 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0    FC11   6417 non-null   float64
 1    FC12   6417 non-null   float64
 2    FC13   6417 non-null   float64
 3    FC14   6417 non-null   float64
 4    CA21   6417 non-null   float64
 5    CA22   6417 non-null   float64
 6    CA23   6417 non-null   float64
 7    CA24   6417 non-null   float64
 8    CA25   6417 non-null   float64
 9    CA26   6417 non-null   float64
 10   CA30   6417 non-null   float64
 11   CA31   6417 non-null   float64
 12   CA32   6417 non-null   float64
 13   CA33   6417 non-null   float64
 14   CA34   6417 non-null   float64
 15   CA36   6417 non-null   float64
 16   CA37   6417 non-null   float64
 17   CA38   6417 non-null   float64
 18   CA39   6417 non-null   float64
 19   CA40   6417 non-null   float64
 20   CA41   6417 non-null   float64
 21   CA42   6417 non-null   float64
 22  

In [4]:
testing_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 957 entries, 0 to 956
Data columns (total 49 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0    FC11   957 non-null    float64
 1    FC12   957 non-null    float64
 2    FC13   957 non-null    float64
 3    FC14   957 non-null    float64
 4    CA21   957 non-null    float64
 5    CA22   957 non-null    float64
 6    CA23   957 non-null    float64
 7    CA24   957 non-null    float64
 8    CA25   957 non-null    float64
 9    CA26   957 non-null    float64
 10   CA30   957 non-null    float64
 11   CA31   957 non-null    float64
 12   CA32   957 non-null    float64
 13   CA33   957 non-null    float64
 14   CA34   957 non-null    float64
 15   CA36   957 non-null    float64
 16   CA37   957 non-null    float64
 17   CA38   957 non-null    float64
 18   CA39   957 non-null    float64
 19   CA40   957 non-null    float64
 20   CA41   957 non-null    float64
 21   CA42   957 non-null    float64
 22   C

In [5]:
X_train = training_set.drop('Y', axis=1)
y_train = training_set['Y']
X_test = testing_set.drop('Y', axis=1)
y_test = testing_set['Y']

In [6]:
X_train.shape, y_train.shape

((6417, 48), (6417, 48))

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.8, random_state=20211008, 
                                                  stratify=y_train)

In [8]:
import itertools
criterion = ["gini", "entropy"] # The function to measure the quality of a split. 
splitter = ["best", "random"] # The strategy used to choose the split at each node.
max_depth = [None, 100, 1000] # The maximum depth of the tree.
max_features = [None, "auto", "sqrt", "log2"] # The number of features to consider when looking for the best split   
parameters = [criterion,splitter,max_depth, max_features]  
parameters_combinations = list(itertools.product(*parameters))
len(parameters_combinations) 


48

In [9]:
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from imblearn.metrics import geometric_mean_score

In [10]:
from sklearn.ensemble import AdaBoostClassifier
# training and hyper-parameter tuning

####### ADD YOUR CODE ####
n_estimators_array = [10,100]
best_acc_params = {"base_estimator": None, "n_estimators": None,"accuracy": 0}


best_acc_params = {"accuracy": {
                             "average_score": 0,
                             "f1_score_macro": 0,
                             "f1_score_micro": 0,
                             "MCC": 0,
                             "Gmean": 0
                         }}
print('Decision Tree Classifier: ')
for params in parameters_combinations:
    
    des_tree_clf = DecisionTreeClassifier(criterion=params[0], splitter=params[1], 
                                          max_depth=params[2], max_features=params[3])
    for n_estimators in n_estimators_array:
        print(f'n_estimators: {n_estimators}')
        clf = AdaBoostClassifier(estimator=des_tree_clf,n_estimators=n_estimators)
        clf = clf.fit(X_train, y_train)
        y_predict = clf.predict(X_val)

        f1_score_macro = f1_score(y_val, y_predict, average='macro')
        f1_score_micro = f1_score(y_val, y_predict, average='micro')
        MCC_score = matthews_corrcoef(y_val, y_predict)
        Gmean_score = geometric_mean_score(y_val,y_predict, average='macro')
        accuracy = (f1_score_macro + f1_score_micro + MCC_score + Gmean_score) / 4.0
        
        if accuracy > 0.50:
            print(f'parameters: \n criterion: {params[0]}, splitter: {params[1]}, max depth: {params[2]}, max features: {params[3]}')
            print(f'accuracy: {accuracy}')
            if accuracy > best_acc_params['accuracy']['average_score']:
                best_acc_params.update({"base_estimator": {"name": "Decision Tree", "criterion": params[0], "splitter": params[1], "max_depth": params[2],
                         "max_features": params[3]}, "n_estimators": n_estimators,"accuracy": {
                             "average_score": accuracy,
                             "f1_score_macro": f1_score_macro,
                             "f1_score_micro": f1_score_micro,
                             "MCC": MCC_score,
                             "Gmean": Gmean_score
                         }})
    print('--------------------------------------------------------------')
    
lr = LogisticRegression();
bnb = BernoulliNB()
gnb = GaussianNB()
for base_est in [lr, bnb, gnb]:
    for n_estimators in n_estimators_array:
            print(f'n_estimators: {n_estimators}')
            clf = AdaBoostClassifier(estimator=base_est,n_estimators=n_estimators)
            clf = clf.fit(X_train, y_train)
            y_predicts = clf.predict(X_val)

            f1_score_macro = f1_score(y_val, y_predicts, average='macro')
            f1_score_micro = f1_score(y_val, y_predicts, average='micro')
            MCC_score = matthews_corrcoef(y_val, y_predicts)
            Gmean_score = geometric_mean_score(y_val,y_predicts, average='macro')
            accuracy = (f1_score_macro + f1_score_micro + MCC_score + Gmean_score) / 4.0

            if accuracy > 0.50:
                print(f'parameters: \n criterion: {params[0]}, splitter: {params[1]}, max depth: {params[2]}, max features: {params[3]}')
                print(f'accuracy: {accuracy}')
                if accuracy > best_acc_params['accuracy']['average_score']:
                    best_acc_params.update({"base_estimator": {"name": "Decision Tree", "criterion": params[0], "splitter": params[1], "max_depth": params[2],
                             "max_features": params[3]}, "n_estimators": n_estimators,"accuracy": {
                                 "average_score": accuracy,
                                 "f1_score_macro": f1_score_macro,
                                 "f1_score_micro": f1_score_micro,
                                 "MCC": MCC_score,
                                 "Gmean": Gmean_score
                             }})


Decision Tree Classifier: 
n_estimators: 10
parameters: 
 criterion: gini, splitter: best, max depth: None, max features: None
accuracy: 0.915367918710657
n_estimators: 100
parameters: 
 criterion: gini, splitter: best, max depth: None, max features: None
accuracy: 0.912081006928231
--------------------------------------------------------------
n_estimators: 10
parameters: 
 criterion: gini, splitter: best, max depth: None, max features: auto
accuracy: 0.8721869643489037
n_estimators: 100
parameters: 
 criterion: gini, splitter: best, max depth: None, max features: auto
accuracy: 0.8808863515426902
--------------------------------------------------------------
n_estimators: 10
parameters: 
 criterion: gini, splitter: best, max depth: None, max features: sqrt
accuracy: 0.8863786688444311
n_estimators: 100
parameters: 
 criterion: gini, splitter: best, max depth: None, max features: sqrt
accuracy: 0.8841231613446727
--------------------------------------------------------------
n_estimat

accuracy: 0.8591030702248003
--------------------------------------------------------------
n_estimators: 10
parameters: 
 criterion: entropy, splitter: best, max depth: None, max features: log2
accuracy: 0.8480678721275725
n_estimators: 100
parameters: 
 criterion: entropy, splitter: best, max depth: None, max features: log2
accuracy: 0.8561866512003633
--------------------------------------------------------------
n_estimators: 10
parameters: 
 criterion: entropy, splitter: best, max depth: 100, max features: None
accuracy: 0.9122463458153777
n_estimators: 100
parameters: 
 criterion: entropy, splitter: best, max depth: 100, max features: None
accuracy: 0.9129645555128616
--------------------------------------------------------------
n_estimators: 10
parameters: 
 criterion: entropy, splitter: best, max depth: 100, max features: auto
accuracy: 0.8869622890821749
n_estimators: 100
parameters: 
 criterion: entropy, splitter: best, max depth: 100, max features: auto
accuracy: 0.87514305

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

parameters: 
 criterion: entropy, splitter: random, max depth: 1000, max features: log2
accuracy: 0.5081640436919224
n_estimators: 100


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

parameters: 
 criterion: entropy, splitter: random, max depth: 1000, max features: log2
accuracy: 0.505929334354298
n_estimators: 10
n_estimators: 100
parameters: 
 criterion: entropy, splitter: random, max depth: 1000, max features: log2
accuracy: 0.5136565338942272
n_estimators: 10
parameters: 
 criterion: entropy, splitter: random, max depth: 1000, max features: log2
accuracy: 0.6615488846476809
n_estimators: 100
parameters: 
 criterion: entropy, splitter: random, max depth: 1000, max features: log2
accuracy: 0.6154355057098635


In [11]:
best_acc_params

{'accuracy': {'average_score': 0.923234350356124,
  'f1_score_macro': 0.9193915717691917,
  'f1_score_micro': 0.9174454828660437,
  'MCC': 0.9031066969083337,
  'Gmean': 0.9529936498809273},
 'base_estimator': {'name': 'Decision Tree',
  'criterion': 'gini',
  'splitter': 'random',
  'max_depth': 1000,
  'max_features': None},
 'n_estimators': 10}

In [12]:
base_estimator_params = best_acc_params['base_estimator']
base_estimator = DecisionTreeClassifier(criterion=base_estimator_params['criterion'], 
                                                                         splitter=base_estimator_params['splitter'], 
                                                                         max_depth=base_estimator_params['max_depth'],
                                                                         max_features=base_estimator_params['max_features'])
clf = AdaBoostClassifier(estimator=base_estimator,n_estimators=best_acc_params['n_estimators'])
clf = clf.fit(X_train, y_train)
y_predicts = clf.predict(X_test)




f1_score_macro = f1_score(y_test, y_predicts, average='macro')
f1_score_micro = f1_score(y_test, y_predicts, average='micro')
MCC_score = matthews_corrcoef(y_test, y_predicts)
Gmean_score = geometric_mean_score(y_test,y_predicts, average='macro')


print(f'f1_score (macro): {f1_score_macro}')
print(f'f1_score (micro): {f1_score_micro}')
print(f'MCC: {MCC_score}')
print(f'Gmean: {Gmean_score}')

f1_score (macro): 0.7731738994425997
f1_score (micro): 0.831765935214211
MCC: 0.7905928118290567
Gmean: 0.8926110891362069


