# <font color='Crimson'><b>STATISTICAL EVALUATION</b></font>

In [None]:
#Import packages:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler 
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import make_scorer
#from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import matplotlib.pyplot as plt
from matplotlib import cm
import statistics
import statsmodels.api as sm
from statsmodels.formula.api import ols
import scipy
from scipy import stats
from scipy.sparse import hstack
from scipy.stats import ranksums,wilcoxon
import seaborn as sns
from sklearn.metrics import class_likelihood_ratios
from IPython.display import clear_output

In [None]:
#Define useful functions for classification task:
#TSS score
def my_tss_score(Y_training, probability_prediction,threshold):
    """Return the TSS obtained by thresholding predicted probabilities.

    Observations are treated as positive when ``Y_training > 0`` and
    predictions as positive when ``probability_prediction > threshold``.
    The skill itself is computed by ``metrics_classification`` (printing
    disabled) and only its 'tss' entry is returned.
    """
    observed_positive = Y_training > 0
    predicted_positive = probability_prediction > threshold

    skills = metrics_classification(observed_positive,
                                    predicted_positive,
                                    print_skills=False)
    return skills['tss']

#Classification metrics
def metrics_classification(y_real, y_pred, print_skills=True):
    """Compute the classification skills and return them as a dictionary.

    The skills come from ``classification_skills(y_real, y_pred)``. When
    ``print_skills`` is true they are also printed, one per line. The
    returned dictionary additionally holds 'balance label', i.e.
    ``sum(y_real)`` divided by the number of entries of ``y_real``.
    """
    skills = classification_skills(y_real, y_pred)
    cm, far, pod, acc, hss, tss, fnfp, csi, tpr, tnr = skills

    if print_skills:
        print ('confusion matrix')
        print (cm)
        report = (
            ('false alarm ratio       \t', far),
            ('probability of detection\t', pod),
            ('accuracy                \t', acc),
            ('hss                     \t', hss),
            ('tss                     \t', tss),
            ('balance                 \t', fnfp),
            ('csi                 \t', csi),
            ('tpr                 \t', tpr),
            ('tnr                 \t', tnr),
        )
        for label, value in report:
            print (label, value)

    # Fraction of positive labels among the real labels.
    balance_label = float(sum(y_real)) / y_real.shape[0]

    result = {"cm": cm, "far": far, "pod": pod, "acc": acc, "hss": hss,
              "tss": tss, "fnfp": fnfp}
    result["balance label"] = balance_label
    result["csi"] = csi
    result["tpr"] = tpr
    result["tnr"] = tnr
    return result

def _ratio_or_sentinel(numerator, denominator, empty=0.):
    """Return numerator / denominator, or a sentinel when the denominator is 0.

    With a zero denominator the ratio is undefined: ``empty`` is returned for
    0/0 and -100 (standing in for infinity) for x/0 with x != 0.
    """
    if denominator == 0.:
        return empty if numerator == 0. else -100
    return numerator / denominator


def classification_skills(y_real, y_pred):
    """Compute binary classification skills from real and predicted labels.

    Parameters
    ----------
    y_real : array-like with a ``shape`` attribute
        Real labels (0/1 or boolean).
    y_pred : array-like
        Predicted labels (0/1 or boolean).

    Returns
    -------
    tuple
        ``(cm, far, pod, acc, hss, tss, fnfp, csi, tpr, tnr)`` where ``cm`` is
        the confusion matrix as a nested list. Undefined ratios are reported
        as 0 (0/0) or -100 (x/0).

    Raises
    ------
    ValueError
        If the confusion matrix is neither 2x2 nor a 1x1 matrix produced by
        all-negative or all-positive 0/1 labels. (Previously this case failed
        later with an UnboundLocalError.)
    """
    cm = confusion_matrix(y_real, y_pred)

    if cm.shape[0] == 2:
        TN, FP = float(cm[0, 0]), float(cm[0, 1])
        FN, TP = float(cm[1, 0]), float(cm[1, 1])
    elif cm.shape[0] == 1 and sum(y_real) == 0:
        # A single label is present and the real labels sum to 0:
        # every sample counts as a true negative.
        TP, TN, FP, FN = 0., float(cm[0, 0]), 0., 0.
    elif cm.shape[0] == 1 and sum(y_real) == y_real.shape[0]:
        # A single label is present and the real labels sum to the sample
        # count: every sample counts as a true positive.
        TP, TN, FP, FN = float(cm[0, 0]), 0., 0., 0.
    else:
        raise ValueError(
            "classification_skills expects binary 0/1 labels; got a "
            "confusion matrix of shape " + str(cm.shape))

    acc = _ratio_or_sentinel(TP + TN, TP + FP + FN + TN)

    # True positive rate TP / (TP + FN) and false positive rate FP / (FP + TN).
    tpr = _ratio_or_sentinel(TP, TP + FN)
    fpr = _ratio_or_sentinel(FP, FP + TN)
    tnr = 1 - fpr  # TN / (TN + FP)
    tss = tpr - fpr

    hss = _ratio_or_sentinel(
        2 * (TP * TN - FN * FP),
        (TP + FN) * (FN + TN) + (TP + FP) * (FP + TN))

    # Ratio of false negatives to false positives ("balance" in the printout).
    fnfp = _ratio_or_sentinel(FN, FP)

    # Same ratio as tpr; the 0/0 fallback is the integer 0, as in the original
    # return values.
    pod = _ratio_or_sentinel(TP, TP + FN, empty=0)

    far = _ratio_or_sentinel(FP, TP + FP)

    # if the cm has a row or a column equal to 0, we have bad tss
    if TP + FN == 0 or TN + FP == 0 or TP + FP == 0 or TN + FN == 0:
        tss = 0

    csi = _ratio_or_sentinel(TP, TP + FP + FN, empty=0)

    return cm.tolist(), far, pod, acc, hss, tss, fnfp, csi, tpr, tnr

In [None]:
#Define useful functions for cross-validation:
def build_grid(p):
    """Build the hyperparameter grid for the model named in ``p['model_name']``.

    For "RF" the grid maps 'n_estimators', 'max_features', 'max_depth' and
    'criterion' to the candidate lists stored under the same keys of ``p``.
    Any other model name (including None) yields None.
    """
    if p['model_name'] == "RF":
        rf_keys = ('n_estimators', 'max_features', 'max_depth', 'criterion')
        return {key: p[key] for key in rf_keys}

    #other models can be added here

    return None


def initiate_p(p):
    """Fill ``p`` in place with the search settings for ``p['model_name']``.

    For 'RF' an estimator and the candidate values of each hyperparameter
    are stored. For every model 'cv' is set to 10. The same dictionary is
    returned.
    """
    #Ranges for hyperparameter search:
    if p['model_name'] == 'RF':
        p.update({
            'estimator': RandomForestClassifier(random_state=100),
            'n_estimators': [100,200,300,400,500],
            'max_features': [None, 'sqrt', 'log2'],
            'max_depth': [4,5,6,7,8],
            'criterion': ['gini','entropy'],
        })

    #other models can be added here

    #All:
    p['cv'] = 10
    return p


def GridSearch_(X,y,p):
    """Run a TSS-scored grid search and return the fitted GridSearchCV.

    Parameters
    ----------
    X, y : training features and labels passed to ``GridSearchCV.fit``.
    p : dict
        Settings. Keys read: 'estimator', 'grid', 'cv' (default 10),
        'threshold' (default 0.5, the probability cut-off used by the TSS
        scorer) and 'n_jobs' (default 20). A missing key is treated like a
        value of None, so the documented defaults apply in both cases.

    Returns
    -------
    The fitted ``GridSearchCV`` object, or None when no estimator or no
    grid is available.
    """
    #Parameters initialization:
    estimator = p.get('estimator')
    grid = p.get('grid')
    cv = p.get('cv')
    if cv is None:
        cv = 10
    tau = p.get('threshold')
    if tau is None:
        tau = 0.5
    n_jobs = p.get('n_jobs', 20)

    if estimator is None or grid is None:
        return None

    #1st step: select the best hyperparameters
    # NOTE(review): recent scikit-learn releases replace `needs_proba=True`
    # with `response_method='predict_proba'` - confirm against the installed
    # version before upgrading.
    tss_scorer = make_scorer(my_tss_score,threshold=tau,needs_proba=True) #TSS score
    CV = GridSearchCV(estimator     = estimator,
                        param_grid  = grid,
                        scoring     = tss_scorer,
                        cv          = cv,
                        verbose     = 0,
                        n_jobs      = n_jobs,
                        return_train_score=True)
    CV_H = CV.fit(X,y)

    return CV_H

In [None]:
#Import the first dataset; the first CSV column is used as the row index:
df_complete1 = pd.read_csv('02_1_Dataset.csv', sep=',',index_col=0)
#Preview the first rows:
df_complete1.head()

In [None]:
#Select BDG and PCT columns by position (0-based columns 40 and 42).
#NOTE(review): presumably these are 'B_D_GLUCANO' and 'PROCALCITONINA', the names used
#in later cells - confirm against the CSV header, since positional selection breaks if columns move.
df_complete1_BDG_PCT = df_complete1.iloc[:,[40,42]]

In [None]:
#Import the second dataset; the first CSV column is used as the row index:
df_dataset = pd.read_csv('02_2_Dataset.csv', sep=',',index_col=0)
#Preview the first rows:
df_dataset.head()

In [None]:
#Merge the last imported dataset and BDG and PCT columns:
#First drop five columns of the second dataset, selected by position (0,3,4,5,9).
df_dataset2 = df_dataset.drop(df_dataset.columns[[0,3,4,5,9]], axis=1)
#Join on the row index. how='right' keeps every row of df_dataset2; rows whose index
#is absent from df_complete1_BDG_PCT get NaN in the BDG and PCT columns.
df_complete2_BDG_PCT = df_complete1_BDG_PCT.merge(df_dataset2, how='right',right_index=True,left_index=True)
df_complete2_BDG_PCT.head()

In [None]:
#Keep only the episodes where both PCT and BDG are available:
df_BDG_PCT = df_complete2_BDG_PCT.dropna(subset=['PROCALCITONINA', 'B_D_GLUCANO'])
print('Number of samples in the new dataset:',len(df_BDG_PCT))

In [None]:
#Imputation and Standardization: within stratified 10-fold

In [None]:
#New percentage of candidemias:
#Series.value_counts is used because the top-level pd.value_counts is deprecated.
candidemia = df_BDG_PCT['CANDIDEMIA']
print('Number of candidemias in the new dataset:', candidemia.value_counts()[1],
      '(',round(candidemia.value_counts(normalize=True)[1]*100,2),'%) of',
      candidemia.shape[0], 'samples')

In [None]:
#Four subsets of interest are compared statistically: BDG and PCT only, all features,
#and the best subset chosen with and without BDG and PCT.
#The classifier is trained with StratifiedKFold (for more than one shuffle) so that
#performances on the validation sets are evaluated consistently.

#Define the number of shuffles:
R = 3
#Define the number of folds:
K = 10
#Threshold chosen:
thresholds = np.linspace(0.175,0.175,1)
#Classifier chosen:
classifiers = ['RF'] #LogisticRegL1 #LogisticRegL2

#Subsets to evaluate:
dataset = ['All','BDG_PCT','with_12','without_12']

#The 12 features shared by the 'with_12' and 'without_12' subsets:
_BEST_12 = ['EOSINOFILI', 'LINFOCITI', 'MONOCITI', 'NEUTROFILI',
            'EMATOCRITO', 'EMOGLOBINA', 'GLOBULI_B', 'PIASTRINE',
            'TEMPO_PROTROMB', 'ACIDO_URICO', 'UREA', 'ALBUMINA']


def _select_subset(d):
    """Return the columns of df_BDG_PCT (outcome included) that form subset ``d``."""
    if d == 'BDG_PCT':
        return df_BDG_PCT[['CANDIDEMIA', 'B_D_GLUCANO', 'PROCALCITONINA']]
    if d == 'All':
        return df_BDG_PCT.drop('MISTA',axis=1)
    if d == 'with_12':
        return df_BDG_PCT[['CANDIDEMIA', 'B_D_GLUCANO', 'PROCALCITONINA'] + _BEST_12]
    if d == 'without_12':
        return df_BDG_PCT[['CANDIDEMIA'] + _BEST_12]
    raise ValueError('Unknown subset: ' + str(d))


def _impute_and_standardize(train, validation, d):
    """Impute and standardize one fold; return (X_train, X_validation) arrays.

    The imputer and the scaler are fitted on ``train`` only and then applied
    to ``validation``. For the 'All' subset the columns 'SESSO' and '30gg'
    are not standardized and are placed first in the returned arrays.
    """
    # NOTE(review): both frames still contain the 'CANDIDEMIA' outcome column, so the
    # imputer sees the labels (including the validation labels at transform time) -
    # confirm that this is intended.
    imputer = IterativeImputer(KNeighborsRegressor(n_neighbors=5),
                               sample_posterior=False,
                               max_iter=100,
                               tol=0.05,
                               n_nearest_features=None,
                               initial_strategy='most_frequent',
                               imputation_order='random',
                               random_state=100)
    train_imputed = pd.DataFrame(imputer.fit_transform(train), columns=train.columns)
    validation_imputed = pd.DataFrame(imputer.transform(validation), columns=validation.columns)

    unscaled = ['SESSO','30gg'] if d == 'All' else []
    if d == 'All':
        # The imputed '30gg' values are continuous: bring them back to 0./1.
        for frame in (train_imputed, validation_imputed):
            frame['30gg'] = (frame['30gg'] > 0.5)*1.

    scaler = StandardScaler()
    not_scaled = ['CANDIDEMIA'] + unscaled
    X_train_std = scaler.fit_transform(train_imputed.drop(not_scaled,axis=1))
    X_validation_std = scaler.transform(validation_imputed.drop(not_scaled,axis=1))

    if d == 'All':
        X_train_std = np.concatenate((train_imputed[unscaled], X_train_std), axis=1)
        X_validation_std = np.concatenate((validation_imputed[unscaled], X_validation_std), axis=1)

    return X_train_std, X_validation_std


def _fit_classifier(p, th, X_train_std, y_train):
    """Search the hyperparameters for ``p['model_name']`` and return the fitted model."""
    model_name = p['model_name']

    if model_name in ('LogisticRegL1', 'LogisticRegL2'):
        # Imported here because the file-level import of LogisticRegressionCV is commented out.
        from sklearn.linear_model import LogisticRegressionCV
        score = make_scorer(my_tss_score,threshold=th,needs_proba=True)
        if model_name == 'LogisticRegL1':
            model = LogisticRegressionCV(cv = 10,random_state = 100,penalty='l1',solver='liblinear',
                                         scoring=score)
        else:
            model = LogisticRegressionCV(cv = 10,random_state = 100,penalty='l2',scoring=score)
        return model.fit(X_train_std, y_train)

    if model_name == 'RF':
        p['grid'] = build_grid(p)
        p['GS_RF'] = GridSearch_(X_train_std, y_train, p)
        best = p['GS_RF'].best_params_
        print(best)
        model = RandomForestClassifier(max_depth=best['max_depth'],
                                       n_estimators=best['n_estimators'],
                                       criterion=best['criterion'],
                                       max_features=best['max_features'],
                                       random_state=100)
        return model.fit(X_train_std, y_train)

    #other models can be added here

    raise ValueError('Unknown model name: ' + str(model_name))


def _threshold_metrics(y_true, positive_prob, th, suffix):
    """Binarize ``positive_prob`` at ``th`` and return metrics keyed '<metric>_<suffix>'."""
    y_bin = (positive_prob > th)*1.

    cmat = confusion_matrix(y_true, y_bin)
    print(cmat)
    tn, fp, fn, tp = cmat.ravel()

    specificity = tn / (tn+fp)
    recall = recall_score(y_true, y_bin)

    return {
        'tss_' + suffix: recall+specificity-1,
        'f1score_' + suffix: f1_score(y_true, y_bin,average = 'weighted'),
        'accuracy_' + suffix: accuracy_score(y_true, y_bin),
        'precision_' + suffix: precision_score(y_true, y_bin,average = 'weighted'),
        'recall_' + suffix: recall,
        'specificity_' + suffix: specificity,
        'npv_' + suffix: tn / (tn+fn),
    }


for d in dataset:

    subset_training = _select_subset(d)

    #Splitting features and outcome. These two frames are never reassigned below, so
    #every shuffle starts from the complete subset (the fold loop previously overwrote them).
    X_all = subset_training.drop('CANDIDEMIA',axis=1)
    y_all = subset_training[['CANDIDEMIA']]

    #Shuffle data in order to test stability of the model:
    #In order to guarantee reproducibility of results, the random states are always the numbers 0,1,2.
    for r in range(0,R):
        X,y = shuffle(X_all,y_all,random_state = r)
        #------------------------------------------------------------------------------------------
        # CROSS-VALIDATION: K-Fold
        kf = StratifiedKFold(n_splits=K, shuffle=False)
        #------------------------------------------------------------------------------------------

        #Kfold is the fold index (one value per fold, independent of the number of classifiers):
        for Kfold, (train_index, validation_index) in enumerate(kf.split(X,y)):

            y_fold_train, y_fold_validation = y.iloc[train_index], y.iloc[validation_index]
            train = pd.concat([X.iloc[train_index,:],y_fold_train],axis=1)
            validation = pd.concat([X.iloc[validation_index,:],y_fold_validation],axis=1)

            #Imputation and standardization, fitted on the training part of the fold only:
            X_train_std, X_validation_std = _impute_and_standardize(train, validation, d)

            y_train = np.double(y_fold_train.values.ravel())
            y_validation = np.double(y_fold_validation.values.ravel())

            for cl in classifiers:
                metrics_cl = {}

                p = {}
                p['model_name'] = cl
                p = initiate_p(p)
                print('Model name:', p['model_name'])

                metrics_th = {}

                for th in tqdm(thresholds):
                    p['threshold'] = th
                    print('Threshold:', p['threshold'])

                    #---------------------------------------------
                    #Hyperparameters search and model fit on training set:
                    fit = _fit_classifier(p, th, X_train_std, y_train)

                    #---------------------------------------------
                    #Predict on training set and compute evaluation metrics:
                    PRED_prob_tr = fit.predict_proba(X_train_std)
                    performance = {'y_training': y_train, 'PRED_prob_tr': PRED_prob_tr}
                    performance.update(_threshold_metrics(y_train, PRED_prob_tr[:,1], th, 'tr'))

                    #---------------------------------------------
                    #Predict on validation set and compute evaluation metrics:
                    PRED_prob_ts = fit.predict_proba(X_validation_std)
                    performance['yts'] = y_validation
                    performance['PRED_prob_ts'] = PRED_prob_ts
                    performance.update(_threshold_metrics(y_validation, PRED_prob_ts[:,1], th, 'ts'))
                    #---------------------------------------------
                    metrics_th[th] = performance
                #---------------------------------------------
                #Performances at each threshold:
                metrics_cl[cl] = metrics_th
                #---------------------------------------------
                # Save on file (uncomment and set the path; the aggregation cell reads these files):
                #nome_file = "<insert your path>/" + d + "/" + d +"_R"+ str(r) + "_K" + str(Kfold) + '_' + p['model_name'] + '.npy'
                #np.save(nome_file, metrics_cl)
            #---------------------------------------------
    #------------------------------------------------------------------------------------------

In [None]:
#Create one dictionary with vectors of performances for each subset: Key:Subset , Value: {Key:Measure of performance, Value:[30 values]}
dataset_list = ['BDG_PCT','All','with_12','without_12']

#Number of shuffles and folds saved for each subset, and the threshold used as key in the saved files:
n_shuffles, n_folds = 3, 10
threshold = 0.175

#(name used in metrics_tot, key of the per-fold value saved by the training cell)
fold_measures = [('TSS','tss_ts'), ('F1','f1score_ts'), ('Accuracy','accuracy_ts'),
                 ('Precision','precision_ts'), ('Recall','recall_ts'),
                 ('Specificity','specificity_ts'), ('NPV','npv_ts')]

metrics_tot = {}
for d in dataset_list:
    #insert your path here:
    file_name = '<insert your path>/' + d

    #Load the per-fold results and keep them in memory. The previous version discarded
    #them and re-read an aggregated '_metrics.npy' file that was never written.
    #NOTE: allow_pickle=True unpickles arbitrary objects; only load files produced by the training cell.
    metrics = {}
    for shuffle_id in range(n_shuffles):
        for fold_id in range(n_folds):
            key = d + '_R' + str(shuffle_id) + '_K' + str(fold_id) + '_RF'
            inner_file = file_name + '/' + key + '.npy'
            metrics[key] = np.load(inner_file, allow_pickle = True).item()
    #To persist the aggregated dictionary as well:
    #np.save(file_name + "/" + d + "_metrics.npy", metrics)

    per_fold = {name: [] for name, _ in fold_measures}
    per_fold.update({'LR+': [], 'LR-': [], 'PREV': []})

    for values in metrics.values():
        th = values['RF'][threshold]

        for name, saved_key in fold_measures:
            per_fold[name].append(th[saved_key])

        #Likelihood Ratios:
        PRED_ts_bin = (th['PRED_prob_ts'][:,1] > threshold)*1.
        pos_LR, neg_LR = class_likelihood_ratios(th['yts'],PRED_ts_bin)
        per_fold['LR+'].append(pos_LR)
        per_fold['LR-'].append(neg_LR)

        #Prevalence:
        tn, fp, fn, tp = confusion_matrix(th['yts'],PRED_ts_bin).ravel()
        per_fold['PREV'].append((tp+fn)/(tp+tn+fp+fn))

    #For each measure store the per-fold values ('<name>_F') and their mean ('<name>_M'):
    summary = {}
    for name, fold_values in per_fold.items():
        summary[name + '_F'] = fold_values
        summary[name + '_M'] = np.mean(fold_values)
    metrics_tot[d] = summary
#print(metrics_tot)

In [None]:
#Create lists of metrics to compute boxplots:
#Order in which the subsets are stacked, and the metrics_tot key behind each column:
stack_order = ['BDG_PCT','All','with_12','without_12']
metric_columns = [('TSS','TSS_F'), ('Accuracy','Accuracy_F'), ('Precision','Precision_F'),
                  ('F1','F1_F'), ('Recall','Recall_F'), ('Specificity','Specificity_F'),
                  ('NPV','NPV_F'), ('LR+','LR+_F'), ('LR-','LR-_F')]

#One subset label per stored value; the count is taken from the data instead of being fixed to 30:
rows = [[d for d in stack_order for _ in metrics_tot[d]['TSS_F']]]
for _, nome_metrica in metric_columns:
    rows.append([value for d in stack_order for value in metrics_tot[d][nome_metrica]])

df_metrics = pd.DataFrame(rows).transpose()
df_metrics.columns = ['Dataset'] + [column for column, _ in metric_columns]

In [None]:
#Use boxplots to visualize performances' distribution (not shown):
metr = ['TSS_F','F1_F','Accuracy_F','Precision_F','Recall_F','Specificity_F','NPV_F','LR+_F','LR-_F']

#For every measure: one column per subset, summary statistics, then a quick boxplot.
for m in metr:
    print(m + ':')
    vect = [metrics_tot[d][m] for d in dataset_list]
    vect_df = pd.DataFrame(vect).T
    vect_df.columns = dataset_list
    print(vect_df.describe())
    sns.boxplot(vect_df)
    plt.show()

In [None]:
#Boxplots (Figure3 and S6):
def set_box_color(bp, color):
    """Apply ``color`` to the boxes, whiskers, caps and medians of boxplot ``bp``."""
    for part in ('boxes', 'whiskers', 'caps', 'medians'):
        plt.setp(bp[part], color=color)

colors = ['#bebada','#80b1d3','#fdb462','#b3de69']

group = 'Dataset'
dataset_list = ['All','BDG_PCT','with_12','without_12']
metr_df_metrics = ['TSS','Accuracy','Precision','F1','Recall','Specificity','NPV','LR+','LR-']

#x position and tick label of each subset (BDG-PCT alone is drawn first, at x = 1):
positions = {'All': 5, 'BDG_PCT': 1, 'with_12': 9, 'without_12': 13}
tick_labels = {'All': 'BDG-PCT + All features',
               'BDG_PCT': 'BDG-PCT',
               'with_12': 'BDG-PCT + 12 best features',
               'without_12': '12 best features'}


def _y_axis(m, lowest, highest):
    """Return (ymin, ymax, yticks) for metric ``m`` given its smallest and largest value."""
    if m == 'LR+':
        return lowest-0.5, highest+0.7, np.linspace(1,5,5)
    if m == 'LR-':
        return lowest-0.1, highest+0.08, np.linspace(0,1,5)
    if m == 'TSS':
        return lowest-0.1, 1.1, np.linspace(0,1,6)
    if m == 'Precision':
        return lowest-0.1, 1.05, np.linspace(0.7,1,4)
    if m == 'Recall':
        return lowest-0.1, 1.05, np.linspace(0.2,1,5)
    if m == 'NPV':
        return lowest-0.05, 1.01, np.linspace(0.8,1,5)
    return lowest-0.1, 1.05, np.linspace(0.4,1,4)


for m in metr_df_metrics:
    #Values of the metric for each subset, in the order of dataset_list:
    vals0 = [df_metrics.loc[df_metrics[group] == d, m].tolist() for d in dataset_list]
    positions0 = [positions[d] for d in dataset_list]

    bp0 = plt.boxplot(vals0, positions = positions0, widths=[1.2]*len(vals0))

    #Overlay the individual values on each box; the number of points follows the data:
    for d, val, color in zip(dataset_list, vals0, colors):
        plt.scatter([positions[d]]*len(val), val, color=color, alpha=0.4)
    #-------------------------------------------------------------------------------------------------------

    #Series.min()/max() return scalars, avoiding the deprecated float() of a one-element Series:
    ymin, ymax, yticks = _y_axis(m, float(df_metrics[m].min()), float(df_metrics[m].max()))
    plt.ylim(ymin, ymax)

    plt.xticks(positions0, [tick_labels[d] for d in dataset_list],size=6.5)
    plt.yticks(yticks,size=6.5)
    plt.xlabel('Subset', fontsize=10)
    if m=='F1':
        plt.ylabel('F1-score', fontsize=10)
    else:
        plt.ylabel(m, fontsize=10)
    plt.tight_layout()

    set_box_color(bp0, '#636363') # colors are from http://colorbrewer2.org/

    #Save figure:
    #nome_figura = '<insert your path>/' + 'Boxplot_' + m + '.png'
    #plt.savefig(nome_figura,dpi = 300)
    plt.show()

In [None]:
# Friedman Test: Paired samples
# Non-parametric test used to determine whether there is a statistical difference between the medians of at least two groups 
metr = ['TSS_F','F1_F','Accuracy_F','Precision_F','Recall_F','Specificity_F','NPV_F','LR+_F','LR-_F']

# BDG_PCT vs All vs Best_with vs Best_without:
print('\n')
print('Friedman Test for PAIRED samples - BDG_PCT vs All vs Best_with vs Best_without:\n') #alpha = ' + str(round(0.05/4,3)) +'

#Per-subset metric dictionaries, in the order used by the Friedman test:
compared = (metrics_tot['BDG_PCT'], metrics_tot['All'], metrics_tot['with_12'], metrics_tot['without_12'])

#Message and sample indices (into `compared`) of each pairwise comparison:
pairwise = (('BDG_PCT vs All: p = ', 0, 1),
            ('BDG_PCT vs Best with: p = ', 0, 2),
            ('BDG_PCT vs Best without: p = ', 0, 3),
            ('All vs Best with: p = ', 1, 2),
            ('All vs Best without: p = ', 1, 3),
            ('Best with vs Best without: p = ', 2, 3))

for m in metr:
    samples = [subset[m] for subset in compared]

    friedman_p = stats.friedmanchisquare(*samples)[1]
    print(m + ': p = ', friedman_p)

    #If the Friedman test is rejected, then each couple is tested with Wilcoxon test:
    if friedman_p < 0.05:
        print('Test fails for the couple: ')
        print('The p-value corrected with the Bonferroni correction is: ',0.05/6)
        for message, first, second in pairwise:
            pair_p = stats.wilcoxon(samples[first],samples[second])[1]
            if pair_p < 0.05/6:
                print(message,pair_p)
        print('-------------------------------------------------------------------------------------------------------------')

In [None]:
#No other data set needs to be exported for following analysis