# <font color='Crimson'><b>MACHINE LEARNING ALGORITHMS</b></font>

In [None]:
#Import packages:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import make_scorer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from sklearn.metrics import class_likelihood_ratios
import matplotlib.pyplot as plt
from IPython.display import clear_output

In [None]:
# Load the imputed and standardized training set.
# The first CSV column is used as the DataFrame index.
training_std = pd.read_csv(
    '05_Training_standard.csv',
    sep=',',
    index_col=0,
)
training_std.head()

In [None]:
# Load the imputed and standardized test set.
# The first CSV column is used as the DataFrame index.
test_std = pd.read_csv(
    '05_Test_standard.csv',
    sep=',',
    index_col=0,
)
test_std.head()

In [None]:
# Shuffle the rows of both sets; the fixed seed keeps the order reproducible.
training_std, test_std = (
    shuffle(frame, random_state=100) for frame in (training_std, test_std)
)

In [None]:
# Separate the outcome (first column) from the features (third column onwards)
# and convert the outcome to double precision.
# NOTE(review): the second column is used neither as feature nor as outcome —
# confirm this is intended.
X_training = training_std.iloc[:, 2:].values
y_training = np.double(training_std.iloc[:, 0].values)
X_test = test_std.iloc[:, 2:].values
y_test = np.double(test_std.iloc[:, 0].values)

In [None]:
#Define useful functions for classification task:
#TSS score:
def my_tss_score(Y_training, probability_prediction, threshold):
    """Return the TSS obtained by thresholding predicted probabilities.

    Parameters
    ----------
    Y_training : array-like
        Observed outcome; values strictly greater than 0 count as positive.
    probability_prediction : array-like
        Predicted scores; values strictly greater than ``threshold`` count
        as a positive prediction.
    threshold : float
        Decision threshold applied to ``probability_prediction``.

    Returns
    -------
    The ``'tss'`` entry computed by ``metrics_classification``.
    """
    observed_positive = Y_training > 0
    predicted_positive = probability_prediction > threshold
    skills = metrics_classification(
        observed_positive, predicted_positive, print_skills=False
    )
    return skills['tss']

#Classification metrics:
def metrics_classification(y_real, y_pred, print_skills=True):
    """Collect the skill scores of a binary prediction into a dictionary.

    Parameters
    ----------
    y_real : array
        Observed labels; must expose ``.shape`` and be summable.
    y_pred : array
        Predicted labels.
    print_skills : bool, default True
        When true, print the confusion matrix and every score.

    Returns
    -------
    dict
        The values returned by ``classification_skills`` under the keys
        ``cm, far, pod, acc, hss, tss, fnfp, csi, tpr, tnr`` plus
        ``"balance label"``, the sum of ``y_real`` divided by its length.
    """
    skills = classification_skills(y_real, y_pred)
    cm, far, pod, acc, hss, tss, fnfp, csi, tpr, tnr = skills

    if print_skills:
        print ('confusion matrix')
        print (cm)
        report = (
            ('false alarm ratio       \t', far),
            ('probability of detection\t', pod),
            ('accuracy                \t', acc),
            ('hss                     \t', hss),
            ('tss                     \t', tss),
            ('balance                 \t', fnfp),
            ('csi                 \t', csi),
            ('tpr                 \t', tpr),
            ('tnr                 \t', tnr),
        )
        for label, value in report:
            print (label, value)

    # Fraction of positive labels among the observations.
    positive_fraction = float(sum(y_real)) / y_real.shape[0]

    result = {"cm": cm, "far": far, "pod": pod, "acc": acc,
              "hss": hss, "tss": tss, "fnfp": fnfp}
    result["balance label"] = positive_fraction
    result.update({"csi": csi, "tpr": tpr, "tnr": tnr})
    return result

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0. when the denominator is zero."""
    return numerator / denominator if denominator != 0. else 0.


def classification_skills(y_real, y_pred):
    """Compute skill scores of a binary classification from its confusion matrix.

    Parameters
    ----------
    y_real : array
        Observed binary labels (0/1 or boolean); must expose ``.shape``.
    y_pred : array
        Predicted binary labels.

    Returns
    -------
    tuple
        ``(cm, far, pod, acc, hss, tss, fnfp, csi, tpr, tnr)`` where

        * ``cm``   confusion matrix as a nested list,
        * ``far``  false alarm ratio, FP / (TP + FP),
        * ``pod``  probability of detection, TP / (TP + FN),
        * ``acc``  accuracy,
        * ``hss``  Heidke skill score,
        * ``tss``  true skill statistic, TPR - FPR; forced to 0 when any row
          or column of the confusion matrix is empty,
        * ``fnfp`` FN / FP; 0 when both are zero, -100 when only FP is zero,
        * ``csi``  critical success index, TP / (TP + FP + FN),
        * ``tpr``  true positive rate (same value as ``pod``),
        * ``tnr``  true negative rate, 1 - FPR.

        Any ratio with a zero denominator is reported as 0. (not NaN).

    Raises
    ------
    ValueError
        If the confusion matrix is neither 2x2 nor a 1x1 matrix produced by
        labels that are all 0 or all 1.
    """
    cm = confusion_matrix(y_real, y_pred)
    n_classes = cm.shape[0]

    if n_classes == 2:
        TN, FP = float(cm[0, 0]), float(cm[0, 1])
        FN, TP = float(cm[1, 0]), float(cm[1, 1])
    elif n_classes == 1 and sum(y_real) == 0:
        # Only the negative class occurs: every sample is a true negative.
        TP, FP, FN, TN = 0., 0., 0., float(cm[0, 0])
    elif n_classes == 1 and sum(y_real) == y_real.shape[0]:
        # Only the positive class occurs: every sample is a true positive.
        TP, FP, FN, TN = float(cm[0, 0]), 0., 0., 0.
    else:
        # Previously this fell through and failed later with an unbound-local error.
        raise ValueError(
            "classification_skills expects binary labels; got a confusion "
            "matrix of shape %s" % (cm.shape,)
        )

    acc = _safe_ratio(TP + TN, TP + FP + FN + TN)
    tpr = _safe_ratio(TP, TP + FN)
    fpr = _safe_ratio(FP, FP + TN)
    tnr = 1 - fpr
    pod = tpr
    far = _safe_ratio(FP, TP + FP)
    csi = _safe_ratio(TP, TP + FP + FN)
    hss = _safe_ratio(2 * (TP * TN - FN * FP),
                      (TP + FN) * (FN + TN) + (TP + FP) * (FP + TN))

    # FN/FP balance: -100 is the sentinel for "misses but no false alarms"
    # (the ratio would be infinite).
    if FP != 0.:
        fnfp = FN / FP
    elif FN == 0.:
        fnfp = 0.
    else:
        fnfp = -100

    # If the confusion matrix has an empty row or column the TSS is not
    # informative, so it is reported as 0.
    degenerate = TP+FN == 0 or TN+FP == 0 or TP+FP == 0 or TN+FN == 0
    tss = 0. if degenerate else tpr - fpr

    return cm.tolist(), far, pod, acc, hss, tss, fnfp, csi, tpr, tnr

In [None]:
#Define useful functions for cross-validation:
def build_grid(p):
    """Build the hyperparameter grid for the model named in ``p``.

    Parameters
    ----------
    p : dict
        Configuration dictionary. ``p['model_name']`` selects the model; for
        ``"RF"`` the keys ``'n_estimators'``, ``'max_features'``,
        ``'max_depth'`` and ``'criterion'`` must hold the candidate values.

    Returns
    -------
    dict or None
        The parameter grid for ``"RF"``; ``None`` for any other (or missing)
        model name.
    """
    # .get() so that a configuration without 'model_name' yields None
    # instead of raising KeyError.
    if p.get('model_name') == "RF":
        return {
            'n_estimators': p['n_estimators'],
            'max_features': p['max_features'],
            'max_depth': p['max_depth'],
            'criterion': p['criterion'],
        }

    #other models can be added here

    return None


def initiate_p(p):
    """Fill ``p`` in place with the search settings of its model and return it.

    For ``p['model_name'] == 'RF'`` the base estimator and the candidate
    values of each random-forest hyperparameter are stored. For every model
    the number of cross-validation folds ``p['cv']`` is set to 10.
    """
    if p['model_name'] == 'RF':
        #Ranges for hyperparameter search:
        p.update({
            'estimator': RandomForestClassifier(random_state=100),
            'n_estimators': [100, 200, 300, 400, 500],
            'max_features': [None, 'sqrt', 'log2'],
            'max_depth': [4, 5, 6, 7, 8],
            'criterion': ['gini', 'entropy'],
        })

    #other models can be added here

    #All:
    p.update(cv=10)
    return p


def GridSearch_(X,y,p):
    """Run a TSS-scored grid search for the estimator described by ``p``.

    Parameters
    ----------
    X, y : arrays
        Training features and labels passed to ``GridSearchCV.fit``.
    p : dict
        Configuration with the optional keys
        ``'estimator'`` (model to tune), ``'grid'`` (parameter grid),
        ``'cv'`` (number of folds, default 10) and ``'threshold'``
        (probability threshold used by the TSS scorer, default 0.5).
        A missing key is treated like a ``None`` value.

    Returns
    -------
    The fitted ``GridSearchCV`` object, or ``None`` when no estimator or no
    grid is configured.
    """
    #Parameters initialization:
    estimator = p.get('estimator')
    grid = p.get('grid')
    if estimator is None or grid is None:
        return None

    # Explicit None checks: a threshold of 0.0 is a valid value and must not
    # be replaced by the default.
    cv = p.get('cv')
    if cv is None:
        cv = 10
    tau = p.get('threshold')
    if tau is None:
        tau = 0.5

    #1st step: select the best hyperparameters
    # NOTE(review): `needs_proba` is deprecated in recent scikit-learn
    # releases in favour of `response_method` — confirm against the
    # installed version.
    CV = GridSearchCV(estimator     = estimator,
                        param_grid  = grid,
                        scoring     =  make_scorer(my_tss_score,threshold=tau,needs_proba=True), #TSS score
                        #refit       = 'roc_auc',
                        cv          = cv,
                        verbose     = 0,
                        n_jobs      = 20,
                        return_train_score=True)
    CV_H = CV.fit(X,y)

    return CV_H

In [None]:
#Train the classifiers and evaluate performances both on training and test sets:
# For every classifier and every probability threshold a model is tuned and
# fitted on the training set, then scored on both the training and test sets.
# Results are collected as metrics_cl[classifier][threshold] -> performance dict.

#Range of threshold to evaluate:
thresholds = np.linspace(0.0,0.5,11)

#Classifiers to train:
classifiers = ['LogisticRegL1','LogisticRegL2','RF'] 
metrics_cl = {}

for cl in classifiers:
    # Per-classifier configuration (search ranges, number of CV folds).
    p = {}
    p['model_name'] = cl
    p = initiate_p(p)
    print('Model name:', p['model_name'])
    
    # Performance dictionaries of this classifier, keyed by threshold.
    metrics_th = {}
    
    for th in tqdm(thresholds):
        # The same threshold is used by the TSS scorer during model selection
        # and to binarize the predicted probabilities below.
        p['threshold'] = th
        print('Threshold:', p['threshold'])
        
        performance = {}
        #---------------------------------------------
        #Hyperparameters search and model fit on training set:    
        if p['model_name'] == 'LogisticRegL1':
            score = make_scorer(my_tss_score,threshold=th,needs_proba=True)
            fit = LogisticRegressionCV(cv = 10,random_state = 100,penalty='l1',solver='liblinear',scoring=score).fit(X_training,y_training) 
        
        elif p['model_name'] == 'LogisticRegL2':
            score = make_scorer(my_tss_score,threshold=th,needs_proba=True)
            fit = LogisticRegressionCV(cv = 10,random_state = 100,penalty='l2',scoring=score).fit(X_training,y_training) 
               
        elif p['model_name'] == 'RF':
            # Grid search selects the hyperparameters; a fresh forest with the
            # selected values is then fitted on the whole training set.
            p['grid'] = build_grid(p)
            p['GS_RF'] = GridSearch_(X_training, y_training, p)
            max_depth = p['GS_RF'].best_params_['max_depth']
            n_estimators = p['GS_RF'].best_params_['n_estimators']
            criterion = p['GS_RF'].best_params_['criterion']
            max_features = p['GS_RF'].best_params_['max_features']
            print(p['GS_RF'].best_params_)
            performance['best_hyper'] = p['GS_RF'].best_params_
            fit = RandomForestClassifier(max_depth=max_depth,n_estimators=n_estimators,criterion=criterion,
                                         max_features=max_features,random_state=100).fit(X_training, y_training) 
            
        #other models can be added here
            
        #---------------------------------------------
        #Predict on training set:
        PRED_prob_tr = fit.predict_proba(X_training)
        # Second column of predict_proba: probability of the positive class.
        PRED_tr_yes = PRED_prob_tr[:,1] 

        # Positive when the probability is strictly above the threshold;
        # multiplying by 1. turns the boolean array into floats.
        PRED_tr_bin = PRED_tr_yes > th
        PRED_tr_bin = PRED_tr_bin*1.
        
        performance['ytr'] = y_training
        performance['PRED_prob_tr'] = PRED_prob_tr

        #Compute evaluation metrics on training set:
        # Note: F1 and precision are class-weighted averages, while recall,
        # specificity and NPV refer to the binary (positive-class) problem.
        print(confusion_matrix(y_training,PRED_tr_bin))
        tn_tr, fp_tr, fn_tr, tp_tr = confusion_matrix(y_training, PRED_tr_bin).ravel()
        spec_tr = tn_tr / (tn_tr+fp_tr)
        f1_tr = f1_score(y_training, PRED_tr_bin,average = 'weighted')
        acc_tr = accuracy_score(y_training, PRED_tr_bin)
        prec_tr = precision_score(y_training, PRED_tr_bin,average = 'weighted')
        recall_tr = recall_score(y_training, PRED_tr_bin) 
        npv_tr = tn_tr / (tn_tr+fn_tr)

        # TSS = sensitivity + specificity - 1
        performance['tss_tr'] = recall_tr+spec_tr-1
        performance['f1score_tr'] = f1_tr
        performance['accuracy_tr'] = acc_tr
        performance['precision_tr'] = prec_tr
        performance['recall_tr'] = recall_tr
        performance['specificity_tr'] = spec_tr 
        performance['npv_tr'] = npv_tr 
        
        #---------------------------------------------
        #Predict on test set:
        PRED_prob_ts = fit.predict_proba(X_test)
        PRED_ts_yes = PRED_prob_ts[:,1]

        PRED_ts_bin = PRED_ts_yes>th
        PRED_ts_bin = PRED_ts_bin*1.
        
        performance['yts'] = y_test
        performance['PRED_prob_ts'] = PRED_prob_ts

        #Compute evaluation metrics on test set:
        print(confusion_matrix(y_test,PRED_ts_bin))
        tn_ts, fp_ts, fn_ts, tp_ts = confusion_matrix(y_test,PRED_ts_bin).ravel()
        spec_ts = tn_ts / (tn_ts+fp_ts)
        f1_ts = f1_score(y_test, PRED_ts_bin,average = 'weighted')
        acc_ts = accuracy_score(y_test, PRED_ts_bin)
        prec_ts = precision_score(y_test, PRED_ts_bin,average = 'weighted')
        recall_ts = recall_score(y_test, PRED_ts_bin)  
        npv_ts = tn_ts / (tn_ts+fn_ts)

        # TSS = sensitivity + specificity - 1
        performance['tss_ts'] = recall_ts+spec_ts-1
        performance['f1score_ts'] = f1_ts
        performance['accuracy_ts'] = acc_ts
        performance['precision_ts'] = prec_ts
        performance['recall_ts'] = recall_ts
        performance['specificity_ts'] = spec_ts
        performance['npv_ts'] = npv_ts
        #---------------------------------------------
        metrics_th[th] = performance
    #---------------------------------------------
    #Performances at each threshold:
    metrics_cl[cl] = metrics_th
    #Save on file:
    #nome_file = '<insert_your_path>/' + cl + '_Performances.npy'
    #np.save(nome_file, metrics_th)
#Save on file:    
#nome_file = '<insert_your_path>/Complete_Performances.npy'
#np.save(nome_file, metrics_cl)

In [None]:
#Likelihood Ratios:
# Positive and negative likelihood ratios of every classifier at every
# threshold, on the training and on the test set.
# The results are read from `metrics_cl` (filled by the training cell), and the
# binary predictions are rebuilt from the stored class probabilities with the
# same rule used there (probability of the positive class strictly above the
# threshold).
# (printed header, key in metrics_cl)
lr_models = [('LR L1:', 'LogisticRegL1'),
             ('LR L2:', 'LogisticRegL2'),
             ('RF:', 'RF')]

for position, (header, model_key) in enumerate(lr_models):
    if position > 0:
        print(' ')
    print(header)

    for th in thresholds:
        print(th)
        perf = metrics_cl[model_key][th]

        pred_bin_tr = (perf['PRED_prob_tr'][:, 1] > th) * 1.
        pos_LR_tr, neg_LR_tr = class_likelihood_ratios(perf['ytr'], pred_bin_tr)
        print(f"training set LR+: {pos_LR_tr:.3f}")
        print(f"training set LR-: {neg_LR_tr:.3f}")

        pred_bin_ts = (perf['PRED_prob_ts'][:, 1] > th) * 1.
        pos_LR_ts, neg_LR_ts = class_likelihood_ratios(perf['yts'], pred_bin_ts)
        print(f"Test set LR+: {pos_LR_ts:.3f}")
        print(f"Test set LR-: {neg_LR_ts:.3f}")

In [None]:
#Fagan's nomograms (Figure S2):
# Probability range covered by the outer axes of the nomogram and the
# corresponding log-odds limits, rounded outwards to whole numbers.
Pmin, Pmax = .001, .999
Omin = np.floor(np.log(Pmin/(1-Pmin)))
Omax = np.ceil(np.log(Pmax/(1-Pmax)))

# Probabilities at which the outer axes get a tick: a coarse grid in the
# middle and finer grids towards both ends of the range.
Pticks = np.sort(
    np.concatenate(
        [
            np.arange(.1, .999, .1),
            np.arange(.08, 0, -.02),
            np.arange(.01, 0, -.002),
            np.arange(.92, .999, +.02),
            np.arange(.99, .999, +.002),
            [.001, .999],
        ]
    )
)

# The same ticks expressed as odds.
Oticks = Pticks / (1 - Pticks)

# Likelihood-ratio ticks of the central axis: powers of ten from 1e-4 to 1e4.
Lticks = [10 ** exponent for exponent in range(-4, 5)]


def Fagan(LR='pos', draw=(), cl='RF', th=0.1):
    """Draw a Fagan nomogram and show it.

    The left axis carries the pre-test probability, the central axis the
    likelihood ratio and the right axis the post-test probability. All three
    are drawn on a log-odds scale, using the module-level tick definitions
    ``Pticks``, ``Oticks``, ``Lticks`` and the limits ``Omin``/``Omax``.

    Parameters
    ----------
    LR : str, default 'pos'
        ``'neg'`` labels the central axis "Negative Likelihood Ratio"; any
        other value labels it "Positive Likelihood Ratio".
    draw : iterable, default ()
        Lines to draw. Each entry is a sequence
        ``[pre-test probability, likelihood ratio, color, line style]``.
        For each entry the pre-test probability, the likelihood ratio and the
        resulting post-test probability are printed.
    cl : str, default 'RF'
        Classifier name appended to the figure title.
    th : float, default 0.1
        Threshold; only used by the optional (commented-out) file name of the
        saved figure.

    Returns
    -------
    None
    """
    fig, ax = plt.subplots(figsize=(5,10), dpi=100)
    ax.set_xlim(-1.2, 1.2)
    ax.set_ylim(Omin, Omax)
    ax.axis('off')

    n_ticks = len(Pticks)

    # Left axis: pre-test probability, placed at its log-odds.
    ax.axvline(-1, c="k")
    ax.scatter([-1] * n_ticks, np.log(Oticks), marker="_", c="k", s=100)
    for Otick, Ptick in zip(Oticks, Pticks):
        ax.text(
            -1.05, np.log(Otick),
            f"${Ptick:.0%}$" if Ptick<=.99 and Ptick>=.01 else f"${Ptick:.1%}$",
            fontsize=7, va="center", ha="right"
        )

    # Right axis: post-test probability. The axis runs in the opposite
    # direction, so the tick at log-odds(P) is labelled with 1 - P.
    ax.axvline(+1, c="k")
    ax.scatter([+1] * n_ticks, np.log(1/Oticks), marker="_", c="k", s=100)
    for Otick, Ptick in zip(Oticks, Pticks):
        ax.text(
            +1.05, np.log(Otick),
            f"${1-Ptick:.0%}$" if Ptick<=.99 and Ptick>=.01 else f"${1-Ptick:.1%}$",
            fontsize=7, va="center", ha="left"
        )

    # Central axis: likelihood ratio. A ratio L sits at height -log(L)/2,
    # i.e. halfway along a straight line joining the two outer axes.
    lr_heights = []
    for Ltick in Lticks:
        Op = 1 / (np.sqrt(Ltick) + 1)
        height = np.log(Op/(1-Op))
        lr_heights.append(height)
        ax.scatter(0, height, marker="_", c="k", s=100)
        ax.text(+.05, height, f"${Ltick}$", fontsize=7, c="k", va="center", ha="left")
        ax.text(-.05, height, f"${Ltick}$", fontsize=7, c="k", va="center", ha="right")
    ax.plot([0,0], [lr_heights[0], lr_heights[-1]], c="k", ls="-")

    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title("Fagan's nomogram - " + cl , fontsize=12)
    ax.text(-1.3, 0, "Pre-test Probability (%)", ha="center", va="center", fontsize=12, rotation=90)
    ax.text(+1.35, 0, "Post-test Probability (%)", ha="center", va="center", fontsize=12, rotation=90)
    if LR=='neg':
        lr_axis_label = "Negative Likelihood Ratio"
    else:
        lr_axis_label = "Positive Likelihood Ratio"
    ax.text(0, lr_heights[0]+.5, lr_axis_label, ha="center", va="center", fontsize=12)

    for line in draw:
        pre_prob, ratio, color, linestyle = line[:4]
        # Post-test odds = likelihood ratio * pre-test odds.
        Opo = ratio * (pre_prob/(1-pre_prob))
        Ppo = Opo/(1+Opo)
        print(f"Pr:{pre_prob:>7.2%}  LR:{ratio:>7.2f}  Po:{Ppo:>7.2%}")
        ax.plot(
            [-1, 1],
            [np.log(pre_prob/(1-pre_prob)), np.log((1-pre_prob)/(pre_prob*ratio))],
            color=color, linestyle=linestyle
        )

    # Optional: save the figure to file.
    #nome_figura = '<insert_your_path>/' + cl + '_Fagan_' + LR + '_th_' +str(th) +'.png'
    #plt.savefig(nome_figura,dpi=300)

    plt.show()


#Figure S2: for each classifier, a positive-LR and a negative-LR nomogram at threshold=0.1.
# Each row: (classifier label, LR kind, LR on training set, LR on test set).
# The prevalence (pre-test probability) is .102 for both the training and the test set.
for cl_label, lr_kind, lr_training, lr_test in [
    ('LR L1', 'pos', 1.884, 1.834),  #Figure S2, A: PLR L1
    ('LR L1', 'neg', 0.558, 0.581),
    ('LR L2', 'pos', 1.804, 1.764),  #Figure S2, B: PLR L2
    ('LR L2', 'neg', 0.503, 0.528),
    ('RF', 'pos', 2.774, 1.702),     #Figure S2, C: RF
    ('RF', 'neg', 0.033, 0.462),
]:
    Fagan(
        LR=lr_kind,
        draw=[
            [.102, lr_training, '#80b1d3', '-'],  #prevalence training set, LR, color, line style
            [.102, lr_test, '#fb8072', '--'],     #prevalence test set, LR, color, line style
        ],
        cl=cl_label,
        th=0.1,
    )

In [None]:
#Train the chosen classifier again with StratifiedKFold, repeated over several
#shuffles of the training set, and evaluate performances on each validation fold.
# The folds are stored in fold-local variables so that the full training set
# (X_training, y_training) is left untouched and every shuffle starts from
# all training samples.
#Define the number of shuffles:
R = 3
#Define the number of folds:
K = 10

#Range of threshold to evaluate:
thresholds = np.linspace(0.0,0.5,11)
#Best classifier:
classifiers = ['RF'] #LogisticRegL1 #LogisticRegL2

#Shuffle data in order to test stability of the model:
#In order to guarantee reproducibility of results, the random states are always the numbers 0,1,2.
for r in range(0,R):
    X,y = shuffle(X_training,y_training,random_state = r)
    #------------------------------------------------------------------------------------------
    # CROSS-VALIDATION: K-Fold
    kf = StratifiedKFold(n_splits=K, shuffle=False)
    #------------------------------------------------------------------------------------------
    # Index of the current fold (used in the name of the optional output file).
    Kfold = 0

    for train_index, validation_index in kf.split(X,y):

        X_fold_training, X_validation = X[train_index], X[validation_index]
        y_fold_training, y_validation = y[train_index], y[validation_index]

        for cl in classifiers:
            # Results of this (shuffle, fold, classifier): {classifier: {threshold: performance}}
            metrics_cl = {}

            #metrics_cl['train_index'] = train_index
            #metrics_cl['validation_index'] = validation_index

            p = {}
            p['model_name'] = cl
            p = initiate_p(p)
            print('Model name:', p['model_name'])

            metrics_th = {}

            for th in tqdm(thresholds):
                # The same threshold is used by the TSS scorer during model
                # selection and to binarize the predicted probabilities.
                p['threshold'] = th
                print('Threshold:', p['threshold'])

                performance = {}
                #---------------------------------------------
                #Hyperparameters search and model fit on the training part of the fold:
                if p['model_name'] == 'LogisticRegL1':
                    score = make_scorer(my_tss_score,threshold=th,needs_proba=True)
                    fit = LogisticRegressionCV(cv = 10,random_state = 100,penalty='l1',solver='liblinear',
                                               scoring=score).fit(X_fold_training, y_fold_training)

                elif p['model_name'] == 'LogisticRegL2':
                    score = make_scorer(my_tss_score,threshold=th,needs_proba=True)
                    fit = LogisticRegressionCV(cv = 10,random_state = 100,penalty='l2',
                                               scoring=score).fit(X_fold_training, y_fold_training)

                elif p['model_name'] == 'RF':
                    p['grid'] = build_grid(p)
                    p['GS_RF'] = GridSearch_(X_fold_training, y_fold_training, p)
                    max_depth = p['GS_RF'].best_params_['max_depth']
                    n_estimators = p['GS_RF'].best_params_['n_estimators']
                    criterion = p['GS_RF'].best_params_['criterion']
                    max_features = p['GS_RF'].best_params_['max_features']
                    print(p['GS_RF'].best_params_)
                    performance['best_hyper'] = p['GS_RF'].best_params_
                    fit = RandomForestClassifier(max_depth=max_depth,n_estimators=n_estimators,criterion=criterion,
                                                 max_features=max_features,random_state=100).fit(X_fold_training, y_fold_training)

                #other models can be added here

                #---------------------------------------------
                #Predict on the training part of the fold:
                PRED_prob_tr = fit.predict_proba(X_fold_training)
                PRED_tr_yes = PRED_prob_tr[:,1]

                # Positive when the probability is strictly above the threshold.
                PRED_tr_bin = PRED_tr_yes > th
                PRED_tr_bin = PRED_tr_bin*1.

                performance['y_training'] = y_fold_training
                performance['PRED_prob_tr'] = PRED_prob_tr

                #Compute evaluation metrics on the training part of the fold:
                cm_tr = confusion_matrix(y_fold_training, PRED_tr_bin)
                print(cm_tr)
                tn_tr, fp_tr, fn_tr, tp_tr = cm_tr.ravel()
                spec_tr = tn_tr / (tn_tr+fp_tr)
                f1_tr = f1_score(y_fold_training, PRED_tr_bin,average = 'weighted')
                acc_tr = accuracy_score(y_fold_training, PRED_tr_bin)
                prec_tr = precision_score(y_fold_training, PRED_tr_bin,average = 'weighted')
                recall_tr = recall_score(y_fold_training, PRED_tr_bin)
                npv_tr = tn_tr / (tn_tr+fn_tr)

                # TSS = sensitivity + specificity - 1
                performance['tss_tr'] = recall_tr+spec_tr-1
                performance['f1score_tr'] = f1_tr
                performance['accuracy_tr'] = acc_tr
                performance['precision_tr'] = prec_tr
                performance['recall_tr'] = recall_tr
                performance['specificity_tr'] = spec_tr
                performance['npv_tr'] = npv_tr

                #---------------------------------------------
                #Predict on validation set:
                PRED_prob_ts = fit.predict_proba(X_validation)
                PRED_ts_yes = PRED_prob_ts[:,1]

                PRED_ts_bin = PRED_ts_yes>th
                PRED_ts_bin = PRED_ts_bin*1.

                performance['yts'] = y_validation
                performance['PRED_prob_ts'] = PRED_prob_ts

                #Compute evaluation metrics on validation set:
                cm_ts = confusion_matrix(y_validation, PRED_ts_bin)
                print(cm_ts)
                tn_ts, fp_ts, fn_ts, tp_ts = cm_ts.ravel()
                spec_ts = tn_ts / (tn_ts+fp_ts)
                f1_ts = f1_score(y_validation, PRED_ts_bin,average = 'weighted')
                acc_ts = accuracy_score(y_validation, PRED_ts_bin)
                prec_ts = precision_score(y_validation, PRED_ts_bin,average = 'weighted')
                recall_ts = recall_score(y_validation, PRED_ts_bin)
                npv_ts = tn_ts / (tn_ts+fn_ts)

                # TSS = sensitivity + specificity - 1
                performance['tss_ts'] = recall_ts+spec_ts-1
                performance['f1score_ts'] = f1_ts
                performance['accuracy_ts'] = acc_ts
                performance['precision_ts'] = prec_ts
                performance['recall_ts'] = recall_ts
                performance['specificity_ts'] = spec_ts
                performance['npv_ts'] = npv_ts
                #---------------------------------------------
                metrics_th[th] = performance
            #---------------------------------------------
            #Performances at each threshold:
            metrics_cl[cl] = metrics_th
            #---------------------------------------------
            # Save on file:
            #nome_file = "<insert your path>/R"+ str(r) + "_K" + str(Kfold) + '_' + p['model_name'] + '.npy'
            #np.save(nome_file, metrics_cl)
        #---------------------------------------------
        # Advance the fold index once per fold, independently of how many
        # classifiers are evaluated.
        Kfold += 1
    p_shuffle = {}
    p_shuffle['R'] = r
    #------------------------------------------------------------------------------------------

In [None]:
#No data set needs to be exported for following analysis