In [None]:
# Testing different models:

In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.model_selection import train_test_split

import imblearn as mb
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

In [17]:
from sklearn.ensemble import GradientBoostingClassifier
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import sys
import math
from hyperopt import space_eval

In [4]:
from sklearn.preprocessing import StandardScaler

# Pacotes para Redes Neurais Artificiais
import tensorflow as tf 
from tensorflow import keras
from tensorflow.keras import layers
from scikeras.wrappers import KerasRegressor

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from scipy import stats
from scipy.stats import ks_2samp
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [6]:
df = pd.read_csv('Loan_default.csv')
df

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255342,8C6S86ESGC,19,37979,210682,541,109,4,14.11,12,0.85,Bachelor's,Full-time,Married,No,No,Other,No,0
255343,98R4KDHNND,32,51953,189899,511,14,2,11.55,24,0.21,High School,Part-time,Divorced,No,No,Home,No,1
255344,XQK1UUUNGP,56,84820,208294,597,70,3,5.29,60,0.50,High School,Self-employed,Married,Yes,Yes,Auto,Yes,0
255345,JAO28CPL4H,42,85109,60575,809,40,1,20.90,48,0.44,High School,Part-time,Single,Yes,Yes,Other,No,0


In [7]:
df['NumCreditLines'] = df['NumCreditLines'].astype(object)
df['LoanTerm'] = df['LoanTerm'].astype(object)

In [95]:
y = df['Default']

x_feat_name = ['Age','InterestRate','Income','MonthsEmployed','LoanAmount']

x = df[x_feat_name]

In [150]:
# Division into training and testing:    
X_train, X_test, y_train, y_test = train_test_split(x, y, 
                                                    test_size=0.30,
                                                    random_state=42) 

In [97]:
# Division into train and validation:    
x_train2, x_valid, y_train2, y_valid = train_test_split(x_train, y_train, 
                                                        test_size=0.20,
                                                        random_state=42)

In [116]:
# Applying tomek, under e oversampling to increase the minority class proportion.
# Note: The use of both under and oversampling aimed to avoid obtaining a very large base.

under_tomek = TomekLinks(sampling_strategy='majority',n_jobs=4)

x_train_tomek, y_train_tomek = under_tomek.fit_resample(x_train2, y_train2)

rnd_under = RandomUnderSampler(sampling_strategy=0.35,
                                        random_state=42)
x_train_under, y_train_under = rnd_under.fit_resample(x_train_tomek, y_train_tomek)

smote = SMOTE(sampling_strategy= 1, random_state=42)

x_train_over, y_train_over = smote.fit_resample(x_train_under, y_train_under)

In [117]:
y_train_over.mean()

0.5

In [None]:
# Note: Different proportions of minority class were tested.
#       The best proportion was 50 % of the nimority class (target).

In [None]:
# Gradient Boosting Classifier:

In [100]:
# Defining the function structure of the prediction model:

def train_oversm_boosting (estimators, lean_rate, sample, max_dp): 
    

    model_gbc = GradientBoostingClassifier(n_estimators = estimators, 
                                           learning_rate = lean_rate,
                                           min_samples_leaf = 100,
                                           subsample = sample,           
                                           max_depth = max_dp,
                                           validation_fraction=0.4,
                                           random_state = 42)
                                            
    return model_gbc

In [101]:
# Definition of hyperparametric space:

est = np.arange(50, 110, 10) 

leanrt = np.arange(0.30, 1.0, 0.10).round(decimals=2) 

samp = np.arange(0.30, 1.0, 0.20).round(decimals=2) 
 
maxdp = np.arange(3, 20, 1) 

# Defining the dictionary of hyperparameters: 
param_grid = dict(estimators=est,
                  lean_rate=leanrt,
                  sample =samp,
                  max_dp = maxdp)

In [102]:
bo_space = {'estimators': hp.choice('estimators', param_grid['estimators']),
            'lean_rate': hp.choice('lean_rate', param_grid['lean_rate']),
            'sample': hp.choice('sample', param_grid['sample']),
            'max_dp': hp.choice('max_dp', param_grid['max_dp'])}

In [None]:
# Defining the function that will be optimized:
   
# The metrics used to find the best hyperparameters were the Precision and the Recall.
# The objective was to obtain the highest possible values for both metrics and at the same time
# obtain a minimum variation between training and test performances. 
# Hypothetical points were considered in a 2D space, where x would be the value of the metric and y 
# would be the difference between trainig and testing performance values. 
# Considering this, the ideal point in this 2D space would be the point (1,0).
# Therefore, the distance between the real point and the ideal point should be as small as possible.
# The distance between 2 points in a 2D space is called The Euclidean distance.
# The sum of the Euclidean distances of the precision and recall metrics was used as the loss of the optimization.

In [152]:
def bo_gpc(param):   

    print ('tested_hyperparameters: ', param)
    model_bo= train_oversm_boosting(estimators=param['estimators'], 
                                    lean_rate=param['lean_rate'],
                                    sample =param['sample'],
                                    max_dp=param['max_dp'])   
    
    model_bo.fit(x_train_over, y_train_over)    
      
    ypred_train = model_bo.predict(x_train_over)
    
    ypred_valid = model_bo.predict(x_valid)
    
    Prec_train = precision_score(y_train_over, ypred_train)    
    Prec_valid = precision_score(y_valid, ypred_valid)    
    Prec_Diference = (Prec_train - Prec_valid)/Prec_train
        
    Recl_train = recall_score(y_train_over, ypred_train)    
    Recl_valid = recall_score(y_valid, ypred_valid)    
    Recl_Diference = (Recl_train - Recl_valid)/Recl_train
    
    D_p = math.sqrt((Prec_valid - 1)**2 + (Prec_Diference - 0)**2)
    D_r = math.sqrt((Recl_valid - 1)**2 + (Recl_Diference - 0)**2)
    D_t = D_p + D_r
    
    print('Recl_valid :', Recl_valid, 'Recl_train :', Recl_train,
         'Prec_valid :', Prec_valid, 'Prec_train :',Prec_train)
    
    return {'loss': D_t,'status': STATUS_OK}     

In [153]:
trials = Trials()                  
bo_search = fmin(fn= bo_gpc,       
                 space= bo_space,  
                 algo=tpe.suggest, 
                 max_evals=10,     
                 trials=trials)

tested_hyperparameters:                                                                                                
{'estimators': 70, 'lean_rate': 0.7, 'max_dp': 6, 'sample': 0.9}                                                       
Prec_valid :                                                                                                           
0.19725072099577495                                                                                                    
Diference_Prec :                                                                                                       
0.7325079436942346                                                                                                     
Recl_valid :                                                                                                           
0.6104492070779631                                                                                                     
Diference_Recl :                        

KeyboardInterrupt: 

In [105]:
# Obtaining the best hyperparameters:

bo_better_hp = space_eval(bo_space, bo_search)
print('Better hyperparameters: ', bo_better_hp)

Better hyperparameters:  {'estimators': 100, 'lean_rate': 0.5, 'max_dp': 3, 'sample': 0.3}


In [None]:
# Fitting the chosen models (best hyperparameters):

In [148]:
model_gbc = train_oversm_boosting (estimators=100, lean_rate=0.5, sample=0.3, max_dp=3) 

model_gbc.fit(x_train_over, y_train_over)

In [149]:
per_gbc = performances (model_gbc, x_train_over, y_train_over, X_test, y_test, 0.50) 
per_gbc

Unnamed: 0,Train,Test,Variation
Recall,0.74,0.65,-0.12
AUROC,0.78,0.72,-0.08
KS,0.42,0.32,-0.24
Accuracy,0.71,0.67,-0.06


In [None]:
# Bagging of Logistic regression:

In [18]:
def train_bagg_lr (estimators, samples, features): 
    
    model_base = LogisticRegression(max_iter=1000)   

    num_estimators = estimators          
  

    model_bagg_lr = BaggingClassifier(estimator = model_base, 
                                       n_estimators = num_estimators, 
                                       max_samples= samples,    
                                       max_features= features,   
                                       random_state = 42) 
    return model_bagg_lr

In [30]:
# Definition of the hyperparametric space:

est = np.arange(50, 210, 50)

samp = np.arange(0.30, 1, 0.20) 

feat = np.arange(0.60, 1.00, 0.39).round(decimals=2) 

param_grid = dict(estimators=est,
                 samples = samp,
                 features = feat)

In [31]:
bo_space2 = {'estimators': hp.choice('estimators', param_grid['estimators']),
            'samples': hp.choice('samples', param_grid['samples']),
            'features': hp.choice('features', param_grid['features'])}

In [None]:
# Defining the function that will be optimized:

# The metrics used to find the best hyperparameters were the Precision and the Recall.
# The objective was to obtain the highest possible values for both metrics and at the same time
# obtain a minimum variation between training and test performances. 
# Hypothetical points were considered in a 2D space, where x would be the value of the metric and y 
# would be the difference between trainig and testing performance values. 
# Considering this, the ideal point in this 2D space would be the point (1,0).
# Therefore, the distance between the real point and the ideal point should be as small as possible.
# The distance between 2 points in a 2D space is called The Euclidean distance.
# The sum of the Euclidean distances of the precision and recall metrics was used as the loss of the optimization.

In [32]:
def bo_bagg(param):   

    print ('tested_hyperparameters: ', param)
    model_bo = train_bagg_lr (estimators=param['estimators'],
                              samples=param['samples'], 
                              features=param['features'])
    
    model_bo.fit(x_train_over, y_train_over)    
      
    ypred_train = model_bo.predict(x_train_over)
    
    ypred_valid = model_bo.predict(x_valid)
    
    Prec_train = precision_score(y_train_over, ypred_train)    
    Prec_valid = precision_score(y_valid, ypred_valid)    
    Prec_Diference = (Prec_train - Prec_valid)/Prec_train
        
    Recl_train = recall_score(y_train_over, ypred_train)    
    Recl_valid = recall_score(y_valid, ypred_valid)    
    Recl_Diference = (Recl_train - Recl_valid)/Recl_train
    
    D_p = math.sqrt((Prec_valid - 1)**2 + (Prec_Diference - 0)**2)
    D_r = math.sqrt((Recl_valid - 1)**2 + (Recl_Diference - 0)**2)
    D_t = D_p + D_r
    
    print('Recl_valid :', Recl_valid, 'Recl_train :', Recl_train,
         'Prec_valid :', Prec_valid, 'Prec_train :',Prec_train)
    
    return {'loss': D_t,'status': STATUS_OK}    

In [None]:
# Defining the parameters of the "fmin" optimization function:

In [33]:
trials = Trials()                  
bo_search2 = fmin(fn= bo_bagg,       
                 space= bo_space2,  
                 algo=tpe.suggest, 
                 max_evals=15,     
                 trials=trials)

tested_hyperparameters:                                                                                                
{'estimators': 150, 'features': 0.6, 'samples': 0.5}                                                                   
Recl_valid :                                                                                                           
0.5964233929434509                                                                                                     
Recl_train :                                                                                                           
0.6015779924897684                                                                                                     
Prec_valid :                                                                                                           
0.21013197105151127                                                                                                    
Prec_train :                            

In [34]:
# Obtaining the best hyperparameters:

bo_better_hp2 = space_eval(bo_space2, bo_search2)
print('Better hyperparameters: ', bo_better_hp2)

Better hyperparameters:  {'estimators': 100, 'features': 0.99, 'samples': 0.7}


In [None]:
# Fitting the chosen models (best hyperparameters):

In [35]:
model_bagg = train_bagg_lr (estimators=100,
                              samples= 0.7, 
                              features= 1)

model_bagg.fit(x_train_over, y_train_over)

In [72]:
# Function to obtain the performance metrics with the testing and training base:

def performances (model, x_train, y_train, x_test, y_test, threshold):  

    if isinstance(model, tf.keras.Model):
        y_pred_train = (model.predict(x_train) > threshold).astype("int32")
        y_proba_train = model.predict(x_train)[:,0]
        
    else:
        try:
            y_proba_train = model.predict_proba(x_train)[:,1]
            y_pred_train = np.where(y_proba_train > threshold, 1, 0)
            
        except Exception as e:
            print(f'Model not supported: {e}')
            return None
   
    acc_train = accuracy_score(y_train, y_pred_train).round(decimals=2)
    roc_train = roc_auc_score(y_train, y_proba_train).round(decimals=2)
    recl_train = recall_score(y_train, y_pred_train).round(decimals=2)

    df_train = x_train.copy()
    df_train['Default'] = y_train
    df_train['Prob(Y=1)'] = y_proba_train
    
    train_prob_y_real_0 = df_train.loc[df_train.Default==0,"Prob(Y=1)"]
    train_prob_y_real_1 = df_train.loc[df_train.Default==1,"Prob(Y=1)"]

    KS_train = (stats.ks_2samp(train_prob_y_real_0, train_prob_y_real_1).statistic).round(decimals=2)

    if isinstance(model, tf.keras.Model):
       
        y_pred_test = (model.predict(x_test) > threshold).astype("int32")
        y_proba_test = model.predict(x_test)[:,0]
    
    else:
        try:   
            y_proba_test = model.predict_proba(x_test)[:,1]
            y_pred_test = np.where(y_proba_test > threshold, 1, 0)
            
        except Exception as e:
            print(f'Model not supported: {e}')
            return None
            
    acc_test = accuracy_score(y_test, y_pred_test).round(decimals=2)
    roc_test = roc_auc_score(y_test, y_proba_test).round(decimals=2)
    recl_test = recall_score(y_test, y_pred_test).round(decimals=2)

    df_test = x_test.copy()
    df_test['Default'] = y_test
    df_test['Prob(Y=1)'] = y_proba_test
    
    test_prob_y_real_0 = df_test.loc[df_test.Default==0,"Prob(Y=1)"]
    test_prob_y_real_1 = df_test.loc[df_test.Default==1,"Prob(Y=1)"]   
    
    KS_test = (stats.ks_2samp(test_prob_y_real_0, test_prob_y_real_1).statistic).round(decimals=2)

                  
    df_performance = pd.DataFrame({'Train':[recl_train, roc_train, KS_train, acc_train],
                              'Test':[recl_test, roc_test, KS_test, acc_test]},
                            index=['Recall','AUROC','KS','Accuracy'])
    
    df_performance['Variation'] = round(df_performance['Test'] / df_performance['Train'] - 1, 2)

    return df_performance 

In [121]:
# Obtaining the performances:

per_bagg = performances (model_bagg, x_train_over, y_train_over, X_test, y_test, 0.50) 
per_bagg

Unnamed: 0,Train,Test,Variation
Recall,0.67,0.65,-0.03
AUROC,0.72,0.71,-0.01
KS,0.34,0.31,-0.09
Accuracy,0.67,0.66,-0.01


In [None]:
# Random Forest model:

In [37]:
def train_RF (estimators, min_leaf, max_dp): 
    
    model_rf = RandomForestClassifier(n_estimators= estimators,               
                                   min_samples_leaf= min_leaf, 
                                   max_depth= max_dp,
                                   random_state = 42)
    return model_rf

In [38]:
# Definition of the hyperparametric space:

est = np.arange(50, 210, 50)

minlf = np.arange(10, 1500, 500)

maxdp = np.arange(3, 15, 1) 

param_grid = dict(estimators=est,
                  min_leaf= minlf,
                  max_dp = maxdp)

In [39]:
bo_space3 = {'estimators': hp.choice('estimators', param_grid['estimators']),
            'min_leaf': hp.choice('min_leaf', param_grid['min_leaf']),
            'max_dp': hp.choice('max_dp', param_grid['max_dp'])}

In [40]:
def bo_randomfor(param):   

    print ('tested_hyperparameters: ', param)
    model_bo = train_RF(estimators=param['estimators'], 
                                    min_leaf =param['min_leaf'],   
                                    max_dp=param['max_dp'])   
    
    model_bo.fit(x_train_over, y_train_over)    
      
    ypred_train = model_bo.predict(x_train_over)
    
    ypred_valid = model_bo.predict(x_valid)
    
    Prec_train = precision_score(y_train_over, ypred_train)    
    Prec_valid = precision_score(y_valid, ypred_valid)    
    Prec_Diference = (Prec_train - Prec_valid)/Prec_train
        
    Recl_train = recall_score(y_train_over, ypred_train)    
    Recl_valid = recall_score(y_valid, ypred_valid)    
    Recl_Diference = (Recl_train - Recl_valid)/Recl_train
    
    D_p = math.sqrt((Prec_valid - 1)**2 + (Prec_Diference - 0)**2)
    D_r = math.sqrt((Recl_valid - 1)**2 + (Recl_Diference - 0)**2)
    D_t = D_p + D_r
    
    print('Recl_valid :', Recl_valid, 'Recl_train :', Recl_train,
         'Prec_valid :', Prec_valid, 'Prec_train :',Prec_train)
    
    return {'loss': D_t,'status': STATUS_OK}    

In [41]:
trials = Trials()                  
bo_search3 = fmin(fn= bo_randomfor,       
                 space= bo_space3,  
                 algo=tpe.suggest, 
                 max_evals=20,     
                 trials=trials)

tested_hyperparameters:                                                                                                
{'estimators': 200, 'max_dp': 8, 'min_leaf': 10}                                                                       
Recl_valid :                                                                                                           
0.6694055099081682                                                                                                     
Recl_train :                                                                                                           
0.7623096071895701                                                                                                     
Prec_valid :                                                                                                           
0.20303452319870996                                                                                                    
Prec_train :                            

In [42]:
# Obtaining the best hyperparameters:

bo_better_hp3 = space_eval(bo_space3, bo_search3)
print('Better hyperparameters: ', bo_better_hp3)

Better hyperparameters:  {'estimators': 200, 'max_dp': 8, 'min_leaf': 10}


In [47]:
# Fitting the chosen models (best hyperparameters):

model_randomfor =  train_RF (200, 10, 8)

model_randomfor.fit(x_train_over, y_train_over)

In [122]:
# Obtaining the performances:

per_rand = performances (model_randomfor, x_train_over, y_train_over, X_test, y_test, 0.50) 
per_rand

Unnamed: 0,Train,Test,Variation
Recall,0.76,0.68,-0.11
AUROC,0.79,0.73,-0.08
KS,0.43,0.33,-0.23
Accuracy,0.71,0.66,-0.07


In [None]:
# Artificial Neural Networks:

In [49]:
# Standardizing the scale of variables:

x_scaler = StandardScaler()

x_scaler.fit(x_train_over)
x_train_sc = x_scaler.transform(x_train_over)   
x_valid_sc = x_scaler.transform(x_valid)
x_test_sc = x_scaler.transform(X_test)

In [50]:
x_train_sc.shape[1]

5

In [51]:
# Building Tensorflow model encapsulated as Scikit-Learn:

def model_ANN (num_layers=1, num_neurons=8, fn_activ_hl='tanh', drop = 0.1):
    
    model = keras.Sequential()
    
    # First hidden layer:
    model.add(layers.Dense(num_neurons, 
                           activation=str(fn_activ_hl),
                           kernel_regularizer=keras.regularizers.l1_l2(),
                           input_shape=(x_train_sc.shape[1],)))
    model.add(layers.Dropout(drop))
    
    # Add more hidden layers:
    if num_layers > 1:
        for i in np.arange(2, num_layers, 1):
            model.add(layers.Dense(num_neurons, 
                                   activation=str(fn_activ_hl),
                                   kernel_regularizer=keras.regularizers.l1_l2()))
            model.add(layers.Dropout(drop))
    
    # Output layer:
    model.add(layers.Dense(1, activation='sigmoid'))
     
   
    model.compile(loss='binary_crossentropy', 
                  optimizer=keras.optimizers.Adam(),
                  metrics= [tf.keras.metrics.Recall(name='recl')])
    return model

In [78]:
# Definition of hyperparametric space:

ehp_layers = np.arange(2, 6, 1)

ehp_neurons = np.power(2, np.arange(3, 6, 1))

ehp_fnactiv_hl = ['relu','tanh','linear']

drop = np.arange(0, 0.15, 0.05).round(decimals=2)

param_grid = dict(num_layers=ehp_layers,
                  num_neurons=ehp_neurons,
                  fn_activ_hl=ehp_fnactiv_hl,
                  drop = drop)

In [62]:
bo_space4 = {'num_layers': hp.choice('num_layers', param_grid['num_layers']),
            'num_neurons': hp.choice('num_neurons', param_grid['num_neurons']),
            'fn_activ_hl': hp.choice('fn_activ_hl', param_grid['fn_activ_hl']),
             'drop': hp.choice('drop', param_grid['drop'])}

In [None]:
# Defining the function that will be optimized:

In [None]:
# Early Stopping (callbacks):

rna_es = keras.callbacks.EarlyStopping(
monitor="val_recl",    
min_delta=0.005,
patience = 20,
verbose=1,
mode='max',
restore_best_weights=True)

In [59]:
def bo_naa(param):   

    print ('Hiperparametros testados: ', param)
    model = model_ANN (num_layers=param['num_layers'], 
                          num_neurons=param['num_neurons'],
                          fn_activ_hl=param['fn_activ_hl'],
                            drop = param['drop']) 

    model.fit(x_train_sc, 
            y_train_over,
            epochs=100,        
            batch_size=32,       
            validation_split= 0.2,  
            callbacks=[rna_es],        
            verbose=0) 

    ypred_train = (model.predict(x_train_sc) > 0.5).astype("int32") 
        
    ypred_valid = (model.predict(x_valid_sc) > 0.5).astype("int32")
                                                                 
    Prec_train = precision_score(y_train_over, ypred_train)    
    Prec_valid = precision_score(y_valid, ypred_valid)    
    Prec_Diference = (Prec_train - Prec_valid)/Prec_train
        
    Recl_train = recall_score(y_train_over, ypred_train)    
    Recl_valid = recall_score(y_valid, ypred_valid)    
    Recl_Diference = (Recl_train - Recl_valid)/Recl_train
    
    D_p = math.sqrt((Prec_valid - 1)**2 + (Prec_Diference - 0)**2)
    D_r = math.sqrt((Recl_valid - 1)**2 + (Recl_Diference - 0)**2)
    D_t = D_p + D_r
    
    print('Recl_valid :', Recl_valid, 'Recl_train :', Recl_train,
         'Prec_valid :', Prec_valid, 'Prec_train :',Prec_train)
    
    return {'loss': D_t,'status': STATUS_OK}    

In [60]:
trials = Trials()                  
bo_search4 = fmin(fn= bo_naa,       
                 space= bo_space4,  
                 algo=tpe.suggest, 
                 max_evals=20,    
                 trials=trials)

Hiperparametros testados:                                                                                              
{'drop': 0.15, 'fn_activ_hl': 'relu', 'num_layers': 4, 'num_neurons': 16}                                              
Restoring model weights from the end of the best epoch.                                                                
Epoch 00021: early stopping                                                                                            
Recl_valid :                                                                                                           
0.0                                                                                                                    
Recl_train :                                                                                                           
0.0                                                                                                                    
Prec_valid :                            

  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))

  Prec_Diference = (Prec_train - Prec_valid)/Prec_train

  Recl_Diference = (Recl_train - Recl_valid)/Recl_train



Restoring model weights from the end of the best epoch.                                                                
Epoch 00032: early stopping                                                                                            
Recl_valid :                                                                                                           
0.5106331561140648                                                                                                     
Recl_train :                                                                                                           
0.5189654444960128                                                                                                     
Prec_valid :                                                                                                           
0.2535397168226542                                                                                                     
Prec_train :                            

  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))

  Prec_Diference = (Prec_train - Prec_valid)/Prec_train

  Recl_Diference = (Recl_train - Recl_valid)/Recl_train



Restoring model weights from the end of the best epoch.                                                                
Epoch 00045: early stopping                                                                                            
Recl_valid :                                                                                                           
0.5328661188980184                                                                                                     
Recl_train :                                                                                                           
0.5435002742500317                                                                                                     
Prec_valid :                                                                                                           
0.24639624539054644                                                                                                    
Prec_train :                            

  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))

  Prec_Diference = (Prec_train - Prec_valid)/Prec_train

  Recl_Diference = (Recl_train - Recl_valid)/Recl_train



Restoring model weights from the end of the best epoch.                                                                
Epoch 00052: early stopping                                                                                            
Recl_valid :                                                                                                           
0.5260995650072499                                                                                                     
Recl_train :                                                                                                           
0.5615163917134298                                                                                                     
Prec_valid :                                                                                                           
0.2503162009888467                                                                                                     
Prec_train :                            

  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))

  Prec_Diference = (Prec_train - Prec_valid)/Prec_train

  Recl_Diference = (Recl_train - Recl_valid)/Recl_train



Restoring model weights from the end of the best epoch.                                                                
Epoch 00029: early stopping                                                                                            
Recl_valid :                                                                                                           
0.5176413726437893                                                                                                     
Recl_train :                                                                                                           
0.5348297540188178                                                                                                     
Prec_valid :                                                                                                           
0.25692695214105793                                                                                                    
Prec_train :                            

  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))

  Prec_Diference = (Prec_train - Prec_valid)/Prec_train

  Recl_Diference = (Recl_train - Recl_valid)/Recl_train



Restoring model weights from the end of the best epoch.                                                                
Epoch 00028: early stopping                                                                                            
Recl_valid :                                                                                                           
0.49951667472208794                                                                                                    
Recl_train :                                                                                                           
0.5093455972321843                                                                                                     
Prec_valid :                                                                                                           
0.25321572951120913                                                                                                    
Prec_train :                            

  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))

  Prec_Diference = (Prec_train - Prec_valid)/Prec_train

  Recl_Diference = (Recl_train - Recl_valid)/Recl_train



In [63]:
# Obtaining the best hyperparameters:

bo_better_hp4 = space_eval(bo_space4, bo_search4)
print('Better hyperparameters: ', bo_better_hp4)

Better hyperparameters:  {'drop': 0.0, 'fn_activ_hl': 'relu', 'num_layers': 3, 'num_neurons': 16}


In [80]:
# Fitting the chosen models (best hyperparameters):

model_ann = model_ANN(num_layers = 3,
                      num_neurons= 16,
                      fn_activ_hl= 'relu',
                         drop=0)

model_ann.fit(x_train_sc, 
            y_train_over,
            epochs=100,
            batch_size=32,
            validation_split=0.2,
            callbacks=rna_es,
            verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Restoring model weights from the end of the best epoch.
Epoch 00051: early stopping


<tensorflow.python.keras.callbacks.History at 0x1911900ce50>

In [82]:
x_train_2 = pd.DataFrame(x_train_sc, columns = x_feat_name)
x_test_2 = pd.DataFrame(x_test_sc, columns = x_feat_name)

In [129]:
per_ann = performances (model_ann, x_train_2, y_train_over, x_test_2, y_test, 0.5)
per_ann

Unnamed: 0,Train,Test,Variation
Recall,0.54,0.54,0.0
AUROC,0.75,0.74,-0.01
KS,0.37,0.02,-0.95
Accuracy,0.67,0.76,0.13


In [None]:
# Putting the results into a dataframe:

In [1]:
Recall_Test = []
AUROC_Test = []
KS_Test = []
Accur_Test = []
Recall_Var = []
AUROC_Var = []
KS_Var = []
Accur_Var = []

models = [per_gbc_1, per_bagg_1, per_rand_1, per_ann_1, per_gbc_2, per_bagg_2, per_rand_2, per_ann_2,
          perf_gbc_3, per_bagg_3, per_rand_3, per_ann_3, per_gbc_4, per_bagg_4, per_rand_4, per_ann_4]

for i in np.arange(0, 4):
    for model in models:
        
        result_1 = model['Test'][i]
    
        result_2 = model['Variation'][i]

        if i == 0:
            Recall_Test.append(result_1)  
            Recall_Var.append(result_2)             
        if i == 1:
            AUROC_Test.append(result_1)  
            AUROC_Var.append(result_2) 
        if i == 2:
            KS_Test.append(result_1)  
            KS_Var.append(result_2)             
        if i == 3:
            Accur_Test.append(result_1)  
            Accur_Var.append(result_2)          

NameError: name 'per_gbc_1' is not defined

In [None]:
models = ['Grad_1', 'Bagg_1', 'Rand_1', 'ANN_1', 'Grad_2', 'Bagg_2', 'Rand_2', 'ANN_2'
          'Grad_3', 'Bagg_3', 'Rand_3', 'ANN_3', 'Grad_4', 'Bagg_4', 'Rand_4', 'ANN_4']

resume = pd.DataFrame({'Recall_Test': Recall_Test,'Recall_Var': Recall_Var,
                       'AUROC_Test': AUROC_Test ,'AUROC_Var': AUROC_Var,
                       'KS_Test': KS_Test,'KS_Var': KS_Var,
                       'Accur_Test': Accur_Test,'Accur_Var': Accur_Var}, index = models) 

In [142]:
Recall_Test = per_gbc['Test'][0], per_bagg['Test'][0], per_rand['Test'][0], per_ann['Test'][0]
AUROC_Test = per_gbc['Test'][1], per_bagg['Test'][1], per_rand['Test'][1], per_ann['Test'][1]
KS_Test = per_gbc['Test'][2], per_bagg['Test'][2], per_rand['Test'][2], per_ann['Test'][2]
Accur_Test = per_gbc['Test'][3], per_bagg['Test'][3], per_rand['Test'][3], per_ann['Test'][3]
Recall_Var = per_gbc['Variation'][0], per_bagg['Variation'][0], per_rand['Variation'][0], per_ann['Variation'][0]
AUROC_Var = per_gbc['Variation'][1], per_bagg['Variation'][1], per_rand['Variation'][1], per_ann['Variation'][1]
KS_Var = per_gbc['Variation'][2], per_bagg['Variation'][2], per_rand['Variation'][2], per_ann['Variation'][2]
Accur_Var = per_gbc['Variation'][3], per_bagg['Variation'][3], per_rand['Variation'][3], per_ann['Variation'][3]

index = ['Gradient_Boost', 'Logist_Regres', 'Random_For','Art_Neural_Net']

In [143]:
resume = pd.DataFrame({'Recall_Test': Recall_Test,'Recall_Var': Recall_Var,
                       'AUROC_Test': AUROC_Test ,'AUROC_Var': AUROC_Var,
                       'KS_Test': KS_Test,'KS_Var': KS_Var,
                       'Accur_Test': Accur_Test,'Accur_Var': Accur_Var}, index = index)  

In [145]:
resume.sort_values(by=['Recall_Test','Recall_Var','AUROC_Test', 'AUROC_Var','KS_Test', 'KS_Var'],
                   ascending=[False, False, False, False,False, False])

Unnamed: 0,Recall_Test,Recall_Var,AUROC_Test,AUROC_Var,KS_Test,KS_Var,Accur_Test,Accur_Var
Random_For,0.68,-0.11,0.73,-0.08,0.33,-0.23,0.66,-0.07
Logist_Regres,0.65,-0.03,0.71,-0.01,0.31,-0.09,0.66,-0.01
Gradient_Boost,0.65,-0.12,0.72,-0.08,0.32,-0.24,0.67,-0.06
Art_Neural_Net,0.54,0.0,0.74,-0.01,0.02,-0.95,0.76,0.13


In [None]:
# Sobre os hiperparãmetros do GradientBoosting:

# O argumento learning_rate na função GradientBoostingClassifier é um hiperparâmetro que controla a contribuição de cada árvore no modelo durante o treinamento.
# Em algoritmos de boosting, como o Gradient Boosting, várias árvores (estimadores) são treinadas sequencialmente, e cada árvore tenta corrigir os erros das árvores anteriores. 
# O learning_rate é um valor que multiplica a contribuição de cada árvore ao modelo. 
# Um valor mais baixo de learning_rate geralmente resulta em um modelo mais robusto, pois reduz a influência de cada árvore individual, tornando o treinamento mais lento, mas potencialmente mais preciso.
# Por outro lado, um learning_rate mais alto pode levar a um treinamento mais rápido, mas também pode tornar o modelo mais propenso a overfitting, especialmente se o número de árvores (n_estimators) for alto.
# Em resumo, o learning_rate é um parâmetro de ajuste fino que equilibra a contribuição de cada árvore no ensemble, afetando a taxa de aprendizado do modelo.
# Ao ajustar esse parâmetro, você pode controlar o equilíbrio entre a velocidade de treinamento e a precisão do modelo.

# O argumento subsample controla a fração dos exemplos de treinamento que são amostrados aleatoriamente e usados para ajustar cada árvore de decisão em um estágio específico do algoritmo de Gradient Boosting.

In [None]:
# Abaixo código para fazer tunning de hiperparâmetro usando K-Fold estratificado:

In [None]:
def bo_gpc(param, x_train, y_train, x_valid, y_valid):   

    print ('tested_hyperparameters: ', param)
    model_bo= train_oversm_boosting(estimators=param['estimators'], 
                                    lean_rate=param['lean_rate'],
                                    sample =param['sample'],
                                    max_dp=param['max_dp'])   
    
   
    Prec_train = []
    Prec_valid = []    
    Recl_train = []
    Recl_valid = []
    
    partitions = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42) 
    
    for train_index, valid_index in partitions.split(x_train, y_train):
        
        xtrain_train, xtrain_valid = x_train.iloc[train_index], x_train.iloc[valid_index]
        ytrain_train, ytrain_valid = y_train.iloc[train_index], y_train.iloc[valid_index] 
        
        under_tomek = TomekLinks(sampling_strategy='majority',n_jobs=4)

        x_train_tomek, y_train_tomek = under_tomek.fit_resample(xtrain_train, ytrain_train)

        rnd_under = RandomUnderSampler(sampling_strategy=0.35,
                                        random_state=42)
        x_train_under, y_train_under = rnd_under.fit_resample(x_train_tomek, y_train_tomek)
        
        smote = SMOTE(sampling_strategy= 1, random_state=42)

        x_train_over, y_train_over = smote.fit_resample(x_train_under, y_train_under)
            
        model_bo.fit(x_train_over, y_train_over)    
        
        ypred_train = model_bo.predict(x_train_over)
        y_proba_train = model_bo.predict_proba(x_train_over)[:,1]
    
        ypred_valid = model_bo.predict(xtrain_valid)
        y_proba_valid = model_bo.predict_proba(xtrain_valid)[:,1]
        
        Recl_train.append(recall_score(y_train_over, ypred_train))
        Recl_valid.append(recall_score(ytrain_valid, ypred_valid))   
        
        Prec_train.append(precision_score(y_train_over, ypred_train))    
        Prec_valid.append(precision_score(ytrain_valid, ypred_valid))  
        
    Prec_train_mean = np.mean(Prec_train)
    Prec_valid_mean = np.mean(Prec_valid)
    Prec_Diference = (Prec_train_mean - Prec_valid_mean)/Prec_train_mean
   
    Recl_train_mean = np.mean(Recl_train)
    Recl_valid_mean = np.mean(Recl_valid)
    Recl_Diference = (Recl_train_mean - Recl_valid_mean)/Recl_train_mean
        
    D_p = math.sqrt((Prec_valid_mean - 1)**2 + (Prec_Diference - 0)**2)
    D_r = math.sqrt((Recl_valid_mean - 1)**2 + (Recl_Diference - 0)**2)
        
    D_t = D_p + D_r
        
    print('Prec_valid :', Prec_valid_mean, 'Diference_Prec :', Prec_Diference,
          'Recl_valid :', Recl_valid_mean, 'Diference_Recl :', Recl_Diference)
    
    return {'loss': D_t,'status': STATUS_OK}                 