In [8]:
import sys
import time
import pickle
import warnings
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
from loguru import logger as log
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from datetime import datetime
from collections import Counter

from ialovecoffe.data import *
from ialovecoffe.models import *
from ialovecoffe.validation import computer_scores, computer_scores_outlier, accuracy_per_class
from sklearn.model_selection import train_test_split

# Silence every warning category for the rest of the session. The two
# commented-out lines are earlier, per-category attempts kept for reference.
#simplefilter("ignore", category=[RuntimeWarning, ConvergenceWarning]) 
#simplefilter("ignore", category=ConvergenceWarning)
warnings.simplefilter("ignore")

In [3]:
# One seed for every random number generator the notebook relies on.
SEED = 10
random.seed(SEED)
# scikit-learn draws from NumPy's global RNG whenever `random_state` is None,
# so seeding only the stdlib `random` module does not make the runs repeatable.
np.random.seed(SEED)

In [4]:
def create_folds(X, Y, FOLDS: int):
    """Split ``X``/``Y`` into ``FOLDS`` stratified, unshuffled folds.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected by position with ``.iloc``.
    Y : pandas.Series or numpy.ndarray
        Labels aligned row-by-row with ``X``.
    FOLDS : int
        Number of splits handed to ``StratifiedKFold``.

    Returns
    -------
    list
        One ``[x_train, y_train, x_test, y_test]`` entry per fold. The ``x``
        parts are NumPy arrays; the ``y`` parts keep the container type of
        ``Y``.
    """
    skf = StratifiedKFold(n_splits=FOLDS, random_state=None, shuffle=False)

    def take_labels(positions):
        # The splitter yields *positions*. Plain ``Y[positions]`` on a pandas
        # Series is a label lookup, which picks the wrong rows (or raises)
        # whenever the index is not 0..n-1, so go through ``.iloc`` when it
        # exists and fall back to ordinary indexing for arrays.
        if hasattr(Y, 'iloc'):
            return Y.iloc[positions]
        return Y[positions]

    folds = []

    for train_index, test_index in skf.split(X, Y):

        x_train = X.iloc[train_index].to_numpy()
        x_test = X.iloc[test_index].to_numpy()

        y_train = take_labels(train_index)
        y_test = take_labels(test_index)

        folds.append([x_train, y_train, x_test, y_test])

    return folds

In [12]:
def run_experiment(x, y, iterations, ts=None) -> pd.DataFrame:
    """Train and score every model on a hold-out split, ``iterations`` times.

    Parameters
    ----------
    x, y
        Features and labels. The pieces returned by ``train_test_split`` must
        expose ``to_numpy()`` (e.g. a pandas DataFrame and Series).
    iterations : int
        How many times the split / train / score cycle is repeated.
    ts : float or int, optional
        Forwarded to ``train_test_split`` as ``test_size``. ``None`` (the
        default) leaves the choice to scikit-learn, so callers that do not
        pass a test size no longer fail with a ``TypeError``.

    Returns
    -------
    pandas.DataFrame
        One row per (iteration, model) with the columns ``model_name``,
        ``iteration``, ``F1``, ``ROC``, ``acc-class-1``, ``acc-class-2``,
        ``SEN``, ``SPE``, ``MCC``, ``TPR``, ``FPR`` and ``THRESH``.
    """
    # (name stored in the results, prefix of the debug line, trainer, outlier detector?)
    # Classifier trainers return (model, y_pred, y_pred_prob) and are scored by
    # `computer_scores`; outlier-detector trainers return (model, y_pred) and
    # are scored by `computer_scores_outlier`.
    experiments = [
        ('RandomForest', 'RF .......', RSRF, False),
        ('DecisionTree', 'DT .......', RSDT, False),
        ('KNN', 'KNN ......', RSNN, False),
        ('OneClassSVM', 'OneClassSVM ......', RSOneClassSVM, True),
        ('LocalOutlierFactor', 'LocalOutlierFactor ......', RSLocalOutlierFactor, True),
        ('IsolationForest', 'IsolationForest ......', RSIsolationForest, True),
        ('Xgboost', 'Xgboost ......', RSXgboost, False),
    ]

    # Key order defines the column order of the returned frame.
    results = {'model_name': [], 'iteration':[], 'F1':[], 
                'ROC':[],'acc-class-1':[],'acc-class-2':[], 'SEN':[], 
                'SPE':[], 'MCC':[], 'TPR': [], 'FPR':[], 'THRESH': []}

    for i in tqdm(range(iterations)):

        # NOTE(review): `random_state` is fixed, so every iteration gets the
        # same train/test rows; only randomness inside the trainers can make
        # iterations differ. Confirm that this is the intended protocol.
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=ts, random_state=42)

        x_train = x_train.to_numpy()
        x_test = x_test.to_numpy()
        y_train = y_train.to_numpy()
        y_test = y_test.to_numpy()

        log.debug('-' * 30)
        log.debug(f'Iteration {i}')
        log.debug('-' * 30)

        for model_name, log_label, fit_predict, is_outlier_detector in experiments:

            if is_outlier_detector:
                _, y_pred = fit_predict(x_train, y_train, x_test, y_test)
                scores = computer_scores_outlier(y_test, y_pred)
            else:
                _, y_pred, y_pred_prob = fit_predict(x_train, y_train, x_test, y_test)
                scores = computer_scores(y_test, y_pred, y_pred_prob)

            # The two unnamed positions are returned by the scorers but are
            # not stored in the results.
            sen, spe, f1, roc, _, _, mcc, fpr, tpr, thresh = scores
            acc1, acc2 = accuracy_per_class(y_test, y_pred)

            row = {
                'model_name': model_name,
                'iteration': i,
                'F1': f1,
                'ROC': roc,
                'acc-class-1': acc1,
                'acc-class-2': acc2,
                'SEN': sen,
                'SPE': spe,
                'MCC': mcc,
                'TPR': tpr,
                'FPR': fpr,
                'THRESH': thresh,
            }
            for column, value in row.items():
                results[column].append(value)

            log.debug(f'{log_label}: {f1}')

        log.debug('\n')


    df_fold = pd.DataFrame(results)
    models = df_fold['model_name'].unique()
    log.info('\n')
    log.info('-' * 30)
    # Summarise the run: mean F1 per model, rounded to two decimals.
    for model in models:

        df_model = df_fold[df_fold['model_name'] == model]
        mean_f1 = float(f'{np.mean(df_model["F1"]):.2f}')

        log.info(f'MODEL {model} with .....: {mean_f1}')

    return df_fold

In [6]:
# Number of repetitions that process_A passes to run_experiment.
NUM_ITER = 50

In [7]:
def process_A(test_size=0.25):
    """Run the thrombosis vs. non-thrombosis experiment and save its results.

    Reads the dataset, encodes the labels as 0/1, runs ``run_experiment`` for
    ``NUM_ITER`` iterations, writes the per-iteration metrics to CSV and logs
    the mean F1 of every model.

    Parameters
    ----------
    test_size : float or int, optional
        Passed to ``run_experiment`` as ``ts``. The original call omitted this
        required argument and raised ``TypeError``.
        NOTE(review): 0.25 is a placeholder; confirm the intended test fraction.
    """
    import os

    x, y = read_A_thrombosis_non_thrombosis_v5()

    # Encode the labels without mutating the object returned by the reader.
    # The explicit cast keeps an integer dtype even if `replace` does not
    # downcast the result by itself.
    y = y.replace(to_replace=["Non_thrombosis", "Thrombosis"], value=[0, 1]).astype(int)

    # run
    df = run_experiment(x, y, NUM_ITER, test_size)

    out_path = 'results/results_v4_v5/A_thrombosis_non_thrombosis_v5.csv'
    # `to_csv` does not create missing parent directories.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    df.to_csv(out_path, index=False)

    # only show mean metrics
    models = df['model_name'].unique()
    for model in models:

        df_model = df[df['model_name'] == model]
        mean_f1 = float(f'{np.mean(df_model["F1"]):.2f}')
        log.info(f'MODEL {model} with .....: {mean_f1}')


In [37]:
# Load the feature table (x) and the class labels (y), then show the table's shape.
x, y = read_A_thrombosis_non_thrombosis_v5()
x.shape

(417, 7)

In [43]:
# NOTE: this adds the label column to `x` in place. The later cells that
# filter on x[att_target] rely on that column being present.
x['target'] = y
x.head()

Unnamed: 0,dg,bt,cl,pr,auth,relSESA,consurf,target
0,8,0.025472,0.260783,0.003314,0.144998,0.084348,1.713,Non_thrombosis
1,4,0.004383,0.231801,0.001794,0.048901,0.831449,3.237,Non_thrombosis
2,7,0.005611,0.238478,0.003224,0.250893,0.193362,1.526,Non_thrombosis
3,3,0.001928,0.209895,0.001452,0.031038,0.640557,1.916,Non_thrombosis
4,4,0.003534,0.200209,0.002539,0.046188,0.626411,0.67,Non_thrombosis


In [38]:
# Shapes of the feature table and the label vector.
x.shape, y.shape

((417, 7), (417,))

In [39]:
# Number of rows per class label.
Counter(y)

Counter({'Non_thrombosis': 283, 'Thrombosis': 134})

In [40]:
# Row count of the smallest class.
size_minority = min(Counter(y).values())
size_minority

134

In [83]:
# Rows to hold out per class: 10% of the smallest class, rounded up.
p = np.ceil(size_minority * 0.1).astype('int')
p

14

In [95]:
def test_balacing(X, Y, percentage, at='target'):
    
    X[at] = Y
    
    p = np.ceil(size_minority * percentage).astype('int')
    train = []
    test = []
    for classe in X[at].unique():
        
        df_class = x[x[at] == classe]
        
        test.append(df_class.iloc[:p])
        train.append(df_class.iloc[p:])
        
    df_train = pd.concat(train)
    df_test = pd.concat(test)
    
    #print(df_train.head())
    
    y_train = df_train[at]
    y_test = df_test[at]
        
    x_train = df_train.drop([at], axis=1)
    x_test = df_test.drop([at], axis=1)   
    
    return x_train, y_train, x_test, y_test

In [98]:
# Hold out 10% of the minority-class size from each class and show the resulting shapes.
x_train, y_train, x_test, y_test = test_balacing(x, y, 0.1)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((389, 7), (389,), (28, 7), (28,))

In [99]:
# First training rows; the label column has been dropped.
x_train.head()

Unnamed: 0,dg,bt,cl,pr,auth,relSESA,consurf
17,6,0.005519,0.221091,0.002968,0.01266,0.697812,-0.329
18,3,8e-05,0.191485,0.001811,0.004993,0.965468,-0.811
19,3,0.001434,0.206157,0.001736,0.003549,0.890655,0.299
21,4,0.015834,0.233831,0.001542,0.010145,0.593749,-0.249
22,4,0.006965,0.225426,0.003006,0.01021,0.610521,-0.691


In [85]:
att_target = 'target'
subset = []
for classe in x[att_target].unique():
    df_class = x[x[att_target] == classe]

    # Undersample: keep the first `size_minority` rows of every class so that
    # all classes end up with the same number of rows.
    subset.append(df_class.iloc[:size_minority])

    test = df_class.iloc[:p]
    train = df_class.iloc[p:]

    log.info(f'CLASS: {classe} - TRAIN: {train.shape} TEST: {test.shape}')
    # TODO: balance the test instances

# Later cells read `data_train_balanced`; the assignment was commented out,
# which left them dependent on stale kernel state.
data_train_balanced = pd.concat(subset)

2022-07-07 22:26:57.017 | INFO     | __main__:<cell line: 3>:11 - CLASS: Non_thrombosis - TRAIN: (269, 8) TEST: (14, 8)
2022-07-07 22:26:57.021 | INFO     | __main__:<cell line: 3>:11 - CLASS: Thrombosis - TRAIN: (120, 8) TEST: (14, 8)


In [86]:
# Sanity check: Non_thrombosis train rows + test rows add up to the class total.
269 + 14

283

In [87]:
# Sanity check: Thrombosis train rows + test rows add up to the class total.
120 + 14

134

In [48]:
# Class counts of the balanced frame; expects `data_train_balanced` to already exist in the kernel.
Counter(data_train_balanced['target'])

Counter({'Non_thrombosis': 134, 'Thrombosis': 134})

In [76]:
# Row labels kept in the balanced frame, in order (prints one entry per row).
data_train_balanced.index.to_list()

[0,
 1,
 2,
 3,
 4,
 5,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 16,
 17,
 18,
 19,
 21,
 22,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 34,
 35,
 36,
 38,
 39,
 45,
 46,
 47,
 50,
 51,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 67,
 73,
 74,
 76,
 78,
 82,
 85,
 88,
 89,
 90,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 102,
 104,
 106,
 107,
 112,
 116,
 117,
 118,
 119,
 121,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 132,
 134,
 135,
 136,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 153,
 154,
 155,
 156,
 157,
 159,
 160,
 162,
 163,
 164,
 165,
 166,
 168,
 169,
 170,
 172,
 173,
 175,
 177,
 178,
 182,
 183,
 184,
 185,
 186,
 189,
 190,
 191,
 192,
 193,
 194,
 195,
 197,
 201,
 205,
 6,
 7,
 15,
 20,
 23,
 31,
 32,
 33,
 37,
 40,
 41,
 42,
 43,
 44,
 48,
 49,
 52,
 65,
 66,
 68,
 69,
 70,
 71,
 72,
 75,
 77,
 79,
 80,
 81,
 83,
 84,
 86,
 87,
 91,
 99,
 100,
 101,
 103,
 105,
 108,
 109,
 110,
 111,
 113,
 114,
 115,
 120,
 122,
 130,
 131,
 133,


In [72]:
# Stratified 85/15 split of the balanced frame. `random_state` is pinned so
# that re-running the cell yields the same rows.
# NOTE(review): the whole frame is passed as X, so X_train / X_test still
# contain the `target` column; drop it before fitting a model on them.
X_train, X_test, y_train, y_test = train_test_split(
    data_train_balanced,
    data_train_balanced[att_target],
    test_size=0.15,
    stratify=data_train_balanced['target'].values,
    random_state=42,
)

In [73]:
# Class counts in the training labels after the stratified split.
Counter(y_train)

Counter({'Thrombosis': 114, 'Non_thrombosis': 113})

In [74]:
# Class counts in the test labels after the stratified split.
Counter(y_test)

Counter({'Non_thrombosis': 21, 'Thrombosis': 20})

In [None]:

# Entry point: run experiment A, logging when it starts and when it finishes.
if __name__ == '__main__':

    log.info('Process A')
    process_A()
    log.info('Finished A')