In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import jaccard_score
from sklearn.metrics.cluster import fowlkes_mallows_score
from sklearn.metrics import matthews_corrcoef

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier

from models import RSRF

from loguru import logger as log

In [46]:
# Load the whitespace-delimited thrombosis table, taking column names from the
# first line. `sep=r"\s+"` is the documented equivalent of the
# `delim_whitespace=True` flag, which pandas deprecated in version 2.2.
dataset = pd.read_csv("../data/thrombosis_non_thrombosis.csv", sep=r"\s+", header=0)
# Per-class row counts of the target column. NOTE: a notebook cell only renders
# its final expression, so this value is computed but not displayed.
dataset["type"].value_counts()
dataset

#index_yes, index_no = create_cv_balanced(dataset['type'], test_size=14)

#print(len(index_no))

Unnamed: 0,node,dg,bt,cl,burts,pr,auth,kcore,areaSAS,areaSES,relSESA,type
0,51,10,0.012553,0.208375,4.990810,0.002631,0.326096,5,57.444680,62.364731,0.585397,Non_thrombosis
1,62,8,0.015998,0.197170,4.723779,0.002392,0.112726,5,39.795853,53.654439,0.592598,Non_thrombosis
2,114,11,0.035159,0.214359,5.946126,0.003065,0.459967,5,4.259958,18.087845,0.199775,Non_thrombosis
3,131,9,0.021794,0.208479,5.877906,0.002434,0.403769,5,6.606401,30.632352,0.338326,Non_thrombosis
4,158,8,0.003376,0.171804,4.116013,0.002391,0.137508,5,41.253536,34.970537,0.328257,Non_thrombosis
...,...,...,...,...,...,...,...,...,...,...,...,...
414,289,6,0.000439,0.182852,2.597751,0.001885,0.046030,5,98.603614,86.802216,0.846273,Thrombosis
415,371,4,0.006493,0.147287,3.400186,0.002114,0.004124,3,57.445468,50.488315,0.826823,Thrombosis
416,379,1,0.000000,0.114458,1.000000,0.000909,0.000072,1,92.780324,51.624957,1.094793,Thrombosis
417,380,7,0.010661,0.140457,4.721689,0.003280,0.003577,4,96.000745,109.140761,0.787480,Thrombosis


In [2]:
def read_dataset(path="../data/thrombosis_non_thrombosis.csv"):
    """Load the whitespace-delimited thrombosis dataset into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the data file. The default is the path this notebook has
        always read, so existing zero-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The parsed table; the first line of the file supplies the column names.
    """
    # sep=r"\s+" replaces delim_whitespace=True (deprecated since pandas 2.2);
    # both split fields on any run of whitespace.
    return pd.read_csv(path, sep=r"\s+", header=0)

In [None]:
# Baseline experiment: cross-validated accuracy of an untuned k-NN classifier.
# Target is the 'type' column; 'type' and 'node' are excluded from the features.
y_train = dataset.loc[:, "type"]
x_train = dataset.drop(columns=['type', 'node'])

# k-NN with the library's default hyper-parameters
knn = KNeighborsClassifier()

# One accuracy value per fold of a 5-fold cross-validation
scores = cross_val_score(estimator=knn, X=x_train, y=y_train, scoring='accuracy', cv=5)

# Mean accuracy together with a two-standard-deviation spread
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), 2 * scores.std()))

In [3]:
def apply_smote(x, y, random_state=None):
    """Oversample the given data with SMOTE and return the resampled pair.

    Parameters
    ----------
    x : array-like
        Feature matrix handed to ``SMOTE.fit_resample``.
    y : array-like
        Class labels aligned with ``x``.
    random_state : int, RandomState instance or None, optional
        Forwarded to the ``SMOTE`` constructor so the synthetic samples can be
        reproduced. The default ``None`` matches the previous behaviour, where
        ``SMOTE()`` was built without a seed.

    Returns
    -------
    tuple
        ``(x_smt, y_smt)`` as returned by ``fit_resample``.
    """
    sampler = SMOTE(random_state=random_state)
    x_smt, y_smt = sampler.fit_resample(x, y)

    return x_smt, y_smt

In [4]:
def create_cv_balanced(data_labels, test_size = 14, reference='Thrombosis', folds = 10):
    """Randomly partition row positions into per-class cross-validation folds.

    Row positions are split into two pools: rows whose label equals
    ``reference`` and all other rows. For each of the first ``folds - 1`` folds,
    ``test_size`` positions are drawn without replacement from each pool using
    the ``random`` module. The final fold receives every position still left in
    each pool, so it can be larger than the others and is not necessarily
    balanced.

    Parameters
    ----------
    data_labels : array-like
        Labels supporting element-wise ``==`` / ``!=`` comparison with
        ``reference`` (e.g. a NumPy array or pandas Series).
    test_size : int, optional
        Number of positions drawn per class for each sampled fold.
    reference : object, optional
        Label value that defines the first pool.
    folds : int, optional
        Total number of folds to produce.

    Returns
    -------
    tuple of (list of list of int, list of list of int)
        ``(cv_indexes_thrombo, cv_indexes_no_thrombo)``: for each fold, the
        positions taken from the reference pool and from the other pool.

    Raises
    ------
    ValueError
        If either pool holds fewer than ``test_size * (folds - 1)`` positions.
    """
    remaining_ref = np.where(data_labels == reference)[0].tolist()
    remaining_other = np.where(data_labels != reference)[0].tolist()

    sampled_folds = np.arange(folds - 1)

    # Fail early with an explicit message instead of letting random.sample
    # raise its generic error part-way through the loop.
    required = test_size * len(sampled_folds)
    for pool_name, pool in (("reference", remaining_ref), ("non-reference", remaining_other)):
        if required > len(pool):
            raise ValueError(
                f"Not enough {pool_name} samples: {len(sampled_folds)} folds of size "
                f"{test_size} need {required}, but only {len(pool)} are available"
            )

    cv_indexes_thrombo = []
    cv_indexes_no_thrombo = []

    for _ in sampled_folds:
        # Same draw order as before (reference pool first), so results under a
        # fixed random.seed are unchanged.
        picked_ref = random.sample(remaining_ref, test_size)
        picked_other = random.sample(remaining_other, test_size)
        cv_indexes_thrombo.append(picked_ref)
        cv_indexes_no_thrombo.append(picked_other)

        # Set membership keeps the filtering linear while preserving the
        # original order of the remaining positions.
        taken_ref, taken_other = set(picked_ref), set(picked_other)
        remaining_ref = [pos for pos in remaining_ref if pos not in taken_ref]
        remaining_other = [pos for pos in remaining_other if pos not in taken_other]

    # Whatever was never sampled becomes the final fold.
    cv_indexes_thrombo.append(remaining_ref)
    cv_indexes_no_thrombo.append(remaining_other)

    return cv_indexes_thrombo, cv_indexes_no_thrombo

In [5]:
def data_splitting(x, y, data_frame, 
                   n_folds = 10, 
                   smote=False, 
                   test_size=14):
    """Build the train/test arrays for every cross-validation fold.

    A single random partition of the row positions is drawn with
    ``create_cv_balanced``. For each fold, the fold's positions form the test
    set and all remaining rows form the training set.

    Parameters
    ----------
    x : numpy.ndarray
        Feature matrix, indexed by row position along axis 0.
    y : numpy.ndarray
        Label vector aligned with ``x``.
    data_frame : mapping or pandas.DataFrame
        Object whose ``'type'`` entry holds the class labels used to build the
        folds.
    n_folds : int, optional
        Number of folds to generate.
    smote : bool, optional
        When true, ``apply_smote`` is applied to the training portion of each
        fold (the test portion is never resampled).
    test_size : int, optional
        Per-class test size forwarded to ``create_cv_balanced``.

    Returns
    -------
    list of [x_train, y_train, x_test, y_test]
        One entry per fold.
    """
    # Draw the partition ONCE. Re-drawing it inside the loop (as was done
    # previously) takes fold k from a different random partition each time, so
    # test folds could overlap and some rows might never be tested.
    index_yes, index_no = create_cv_balanced(data_frame['type'], test_size=test_size, folds=n_folds)

    data_folds = []

    for fold_id in range(n_folds):
        # Explicit integer dtype: concatenating with an empty fold would
        # otherwise yield a float array, which cannot be used as an index.
        list_index = np.asarray(list(index_yes[fold_id]) + list(index_no[fold_id]), dtype=int)

        # test
        x_test = x[list_index].copy()
        y_test = y[list_index].copy()

        # train: everything that is not in this fold's test set
        x_train = np.delete(x, list_index, axis=0)
        y_train = np.delete(y, list_index)

        if smote:
            x_train, y_train = apply_smote(x_train, y_train)

        data_folds.append([x_train, y_train, x_test, y_test])

    return data_folds


In [6]:
def feature_selection(algorithm, k_best, data_train_X, data_train_y):
    """Reduce the feature matrix with the requested selection method.

    Parameters
    ----------
    algorithm : str
        One of ``'rfe'`` (recursive feature elimination around logistic
        regression), ``'kbest'`` (``SelectKBest`` with ``f_classif``),
        ``'extra_tree'`` (``SelectFromModel`` on a fitted
        ``ExtraTreesClassifier``) or ``'linear_svc'`` (``SelectFromModel`` on a
        fitted L1-penalised ``LinearSVC``).
    k_best : int
        Number of features to keep. Only used by ``'rfe'`` and ``'kbest'``; the
        two ``SelectFromModel`` variants do not receive it.
    data_train_X : array-like
        Feature matrix.
    data_train_y : array-like
        Labels used to fit the selector.

    Returns
    -------
    array-like
        ``data_train_X`` restricted to the selected columns.

    Raises
    ------
    ValueError
        If ``algorithm`` is not one of the supported names. Previously this
        case fell through and returned ``None`` silently.
    """
    if algorithm == 'rfe':
        regre = LogisticRegression(solver='lbfgs')
        model = RFE(regre, n_features_to_select=k_best, step=1)
        return model.fit_transform(data_train_X, data_train_y)

    if algorithm == 'kbest':
        kbest = SelectKBest(score_func=f_classif, k=k_best)
        return kbest.fit_transform(data_train_X, data_train_y)

    if algorithm == 'extra_tree':
        clf = ExtraTreesClassifier(n_estimators=100).fit(data_train_X, data_train_y)
        model = SelectFromModel(clf, prefit=True)
        return model.transform(data_train_X)

    if algorithm == 'linear_svc':
        lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(data_train_X, data_train_y)
        model = SelectFromModel(lsvc, prefit=True)
        return model.transform(data_train_X)

    raise ValueError(
        f"Unknown feature-selection algorithm {algorithm!r}; "
        "expected 'rfe', 'kbest', 'extra_tree' or 'linear_svc'"
    )

In [7]:
def pre_processing(df, norm=True, 
                       stand=False, 
                       algorithm=None, 
                       k_best=21, 
                       atts=None):
    """Turn the raw DataFrame into a feature matrix and an encoded label vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; must contain a ``'type'`` column with the class labels.
    norm : bool, optional
        Apply ``MinMaxScaler`` to the features.
    stand : bool, optional
        Apply ``StandardScaler`` to the features (after ``norm`` if both are
        set).
    algorithm : str or None, optional
        Feature-selection method forwarded to ``feature_selection``; skipped
        when falsy.
    k_best : int, optional
        Number of features forwarded to ``feature_selection``.
    atts : list of str or None, optional
        Columns to use as features. When falsy, every column except ``'type'``
        is used.

    Returns
    -------
    tuple
        ``(data_train_X, data_train_y, df)``: the feature array, the label
        array (``'Non_thrombosis'`` -> 1, ``'Thrombosis'`` -> -1) and the input
        DataFrame unchanged.
    """
    # Explicit element-wise mapping instead of Series.replace: replacing
    # strings with ints relies on implicit downcasting that pandas 2.2
    # deprecates (FutureWarning). Labels outside the mapping are passed through
    # unchanged, exactly as replace() did.
    label_codes = {'Non_thrombosis': 1, 'Thrombosis': -1}
    data_train_y = df['type'].map(lambda label: label_codes.get(label, label)).to_numpy()

    if atts:
        data_train_X = df[atts].to_numpy() # set attributes
        # No-op for a list of column names; kept to guarantee a 2-D array.
        data_train_X = data_train_X.reshape(data_train_X.shape[0], len(atts))
    else:
        # NOTE(review): this keeps every non-'type' column, including 'node',
        # which the baseline KNN cell drops as a non-feature - confirm intended.
        data_train_X = df.drop(['type'], axis=1).to_numpy()

    # NOTE(review): scalers and the selector below are fitted on all rows,
    # before any train/test split happens - possible leakage into test folds;
    # confirm this is acceptable for the experiment.
    if norm:
        data_train_X = MinMaxScaler().fit_transform(data_train_X)

    if stand:
        data_train_X = StandardScaler().fit_transform(data_train_X)

    if algorithm:
        data_train_X = feature_selection(algorithm, k_best, data_train_X, data_train_y)

    return data_train_X, data_train_y, df


In [8]:
def computer_scores(y_test, y_pred, names=False):
    """Evaluate predictions with seven classification metrics.

    Sensitivity and specificity come from ``positive_negative_rate``; macro F1,
    weighted ROC AUC, weighted Jaccard, Fowlkes-Mallows and Matthews
    correlation are each rounded to five decimals.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    names : bool, optional
        If true, return a dict keyed by metric name instead of a tuple.

    Returns
    -------
    dict or tuple
        Either ``{'SEN', 'SPE', 'F1', 'ROC', 'IOU', 'FMI', 'MCC'}`` -> value, or
        the same values as a tuple in that order.
    """
    sensitivity, specificity = positive_negative_rate(y_test, y_pred)

    # Insertion order doubles as the order of the tuple form.
    scores = {
        'SEN': sensitivity,
        'SPE': specificity,
        'F1': round(f1_score(y_test, y_pred, average='macro'), 5),
        'ROC': round(roc_auc_score(y_test, y_pred, average='weighted'), 5),
        'IOU': round(jaccard_score(y_test, y_pred, average='weighted'), 5),
        'FMI': round(fowlkes_mallows_score(y_test, y_pred), 5),
        'MCC': round(matthews_corrcoef(y_test, y_pred), 5),
    }

    return scores if names else tuple(scores.values())

In [9]:
def positive_negative_rate(y_test, y_pred):
    """Return ``(sensitivity, specificity)`` rounded to five decimals.

    The four counts are read from the flattened confusion matrix in the order
    tn, fp, fn, tp, which requires the matrix to have exactly four cells.
    """
    # NOTE(review): which label counts as "positive" follows confusion_matrix's
    # own label ordering - confirm it matches the intended class.
    true_neg, false_pos, false_neg, true_pos = confusion_matrix(y_test, y_pred).ravel()

    sensitivity = true_pos / (true_pos + false_neg)
    specificity = true_neg / (true_neg + false_pos)

    return round(sensitivity, 5), round(specificity, 5)


In [13]:
def run_all_pipeline(n_iter=5, n_folds=10, seed=10):
    '''
    Run repeated cross-validation and collect the per-fold metrics.

    The dataset is read and pre-processed once (min-max normalisation, no
    standardisation, no SMOTE). For each iteration a fresh set of folds is
    produced by data_splitting; RSRF is called on every fold and its
    predictions are scored with computer_scores.

    Parameters
    ----------
    n_iter : int, optional
        Number of cross-validation repetitions.
    n_folds : int, optional
        Number of folds requested from data_splitting per repetition.
    seed : int, optional
        Seed for Python's random module, which drives the fold sampling.
        NOTE(review): other sources of randomness (e.g. inside RSRF) are not
        seeded here - confirm whether that matters for reproducibility.

    Returns
    -------
    pandas.DataFrame
        One row per (iteration, fold) with columns model_name, iteration,
        fold, F1, ROC, IOU, FMI, MCC, SEN and SPE. Previously the collected
        results were discarded and the function returned None.
    '''
    random.seed(seed)

    results = {'model_name': [], 'iteration':[], 'fold':[], 'F1':[], 
               'ROC':[],'IOU':[],'FMI':[],'MCC':[], 'SEN':[], 'SPE':[]}

    norm, stand, smote = True, False, False

    df = read_dataset()

    # pre processed data
    x, y, df = pre_processing(df, norm=norm, stand=stand)

    # iterations
    for i in range(n_iter):

        folds = data_splitting(x, y, df, n_folds=n_folds, smote=smote)

        # cross validation over every fold returned by data_splitting
        for j, (x_train, y_train, x_test, y_test) in enumerate(folds):
            # Fold persistence is disabled; the flag is hard-coded so the log
            # line keeps its original format.
            sav = True

            log.info(f'Iter: {i} save fold {j} saved:{sav}')

            # The fitted model is currently unused because saving is disabled.
            model, y_pred = RSRF(x_train, y_train, x_test, y_test)
            sen, spe, f1, roc, jac, fmi, mcc = computer_scores(y_test, y_pred)

            results['model_name'].append('RF')
            results['iteration'].append(i)
            results['fold'].append(j)
            results['F1'].append(f1)
            results['ROC'].append(roc)
            results['IOU'].append(jac)
            results['FMI'].append(fmi)
            results['MCC'].append(mcc)
            results['SEN'].append(sen)
            results['SPE'].append(spe)
            log.info(f'RF .....................: {f1}')

    return pd.DataFrame(results)


In [None]:
# Launch the cross-validation experiment implemented by run_all_pipeline.
run_all_pipeline()