In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.model_selection import KFold

from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score

In [3]:
base = pd.read_csv("basePreProcessedAllAbFinal.csv")

y_data = base["status"]
x_data = base.drop(["status"], axis=1)

In [4]:
x_data.info()
y_data.value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3589 entries, 0 to 3588
Columns: 170 entries, Molecular Weight to UFF Energy
dtypes: float64(82), int64(88)
memory usage: 4.7 MB


status
1.0    1914
0.0    1675
Name: count, dtype: int64

In [5]:
for i in range(7):

    random_state = 1
    folds = 10
    rangeThreshold = [0.1, 0.81, 0.1]
    cvResults = []
    countFold = 1

    thresholdList = []
    for thresh in np.arange(start=rangeThreshold[0], stop=rangeThreshold[1], step=rangeThreshold[2]):
        thresholdList.append(thresh)

    match i:
        case 0:
            classifier = KNeighborsClassifier()
            model = "KNN"
        case 1:
            classifier = KNeighborsClassifier(weights='distance')
            model = "DWNN"
        case 2:
            classifier = SVC(random_state=random_state, probability=True)
            model = "SVM"
        case 3:
            classifier = LogisticRegression(random_state=random_state)
            model = "Regressão Logistica"
        case 4:
            classifier = DecisionTreeClassifier(random_state=random_state)
            model = "Decision Tree"
        case 5:
            classifier = RandomForestClassifier(random_state=random_state)
            model = "Random Forest"
        case 6:
            classifier = XGBClassifier(random_state=random_state)
            model = "XGBoost"

    kf = KFold(n_splits=folds, shuffle=True, random_state=random_state) 

    for train_index, test_index in kf.split(base): #10-fold cross-validation
        countFold += 1

        x_train, x_test = x_data.iloc[train_index], x_data.iloc[test_index]
        y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]

        results = {'F1-Macro':[],'ACC-Balanced':[], 'AUC': [], 'MCC': [], 'THRE': []}

        classifier.fit(x_train, y_train)
        y_pred_proba = classifier.predict_proba(x_test)[:, 1]

        for thresh in np.arange(start=rangeThreshold[0], stop=rangeThreshold[1], step=rangeThreshold[2]):
            
            y_pred = (y_pred_proba >= thresh).astype(int)
                    
            balanced_acc = balanced_accuracy_score(y_test, y_pred)
            f1_macro = f1_score(y_test, y_pred, average='macro')
            auc = roc_auc_score(y_test, y_pred_proba) #CORRIGIDO: y_pred -> y_pred_proba
            mcc = matthews_corrcoef(y_test, y_pred)

            results['ACC-Balanced'].append(balanced_acc)
            results['F1-Macro'].append(f1_macro)
            results['AUC'].append(auc)
            results['MCC'].append(mcc)
            results['THRE'].append(thresh)

        cvResults.append(results) #Cada lista terá multiplos valores pois executa sobre varios thresholds testados (teria um valor só se fosse um único threshold). Assim, cvResults é um vetor de dicts

    cvFinalMetrics = {"ACC-Balanced":[], 'F1-Macro':[], 'AUC':[], 'MCC': []}
        
    #Calculate average for each metric and each threshold
    for metric in cvFinalMetrics:
        temp_metric = []

        for i in range(folds): #Pega o vetor de valores por fold da métrica em questão (são vetores pois temos um valor por threshold) 
            temp_metric.append(cvResults[i][metric])
        
        for i in range(len(thresholdList)): #Calcula a média por threshold
            aux = []
            
            for i2 in range(folds):
                aux.append(temp_metric[i2][i])
            
            cvFinalMetrics[metric].append(np.average(aux))

    results_df = {"Model": model, "Threshold": thresholdList}
    results_df.update(cvFinalMetrics)
    results_df = pd.DataFrame(results_df)
    print('-' * 15 + "FDA" + '-' * 15)
    print(results_df)

---------------FDA---------------
  Model  Threshold  ACC-Balanced  F1-Macro      AUC       MCC
0   KNN        0.1      0.533647  0.436175  0.69511  0.138467
1   KNN        0.2      0.533647  0.436175  0.69511  0.138467
2   KNN        0.3      0.602095  0.578589  0.69511  0.244520
3   KNN        0.4      0.602095  0.578589  0.69511  0.244520
4   KNN        0.5      0.645741  0.644531  0.69511  0.292750
5   KNN        0.6      0.645741  0.644531  0.69511  0.292750
6   KNN        0.7      0.634484  0.614365  0.69511  0.283929
7   KNN        0.8      0.634484  0.614365  0.69511  0.283929
---------------FDA---------------
  Model  Threshold  ACC-Balanced  F1-Macro       AUC       MCC
0  DWNN        0.1      0.534216  0.438626  0.710895  0.138030
1  DWNN        0.2      0.584253  0.542928  0.710895  0.226389
2  DWNN        0.3      0.601849  0.578823  0.710895  0.243127
3  DWNN        0.4      0.631232  0.626188  0.710895  0.275977
4  DWNN        0.5      0.651666  0.650476  0.710895  0.304

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

---------------FDA---------------
                 Model  Threshold  ACC-Balanced  F1-Macro      AUC       MCC
0  Regressão Logistica        0.1      0.499645  0.349492  0.68189 -0.007504
1  Regressão Logistica        0.2      0.511987  0.383327  0.68189  0.078379
2  Regressão Logistica        0.3      0.538880  0.464141  0.68189  0.128754
3  Regressão Logistica        0.4      0.592624  0.570040  0.68189  0.218011
4  Regressão Logistica        0.5      0.635129  0.633249  0.68189  0.272543
5  Regressão Logistica        0.6      0.634822  0.618134  0.68189  0.280672
6  Regressão Logistica        0.7      0.577738  0.512252  0.68189  0.203744
7  Regressão Logistica        0.8      0.534835  0.409319  0.68189  0.143650
---------------FDA---------------
           Model  Threshold  ACC-Balanced  F1-Macro       AUC      MCC
0  Decision Tree        0.1      0.616369  0.615366  0.616369  0.23243
1  Decision Tree        0.2      0.616369  0.615366  0.616369  0.23243
2  Decision Tree        0.