# **Grid Search**



In [27]:
# Importamos librerías
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd 
import multiprocessing
import random
from sklearn.metrics import roc_auc_score as metric




from sklearn.preprocessing import scale
from sklearn.preprocessing import OneHotEncoder
import timeit

### **Dataset**

Leamos el dataset a utilizar.

In [28]:
X = pd.read_csv('/content/total_merged_train.csv')
X = X.drop(columns=['customerid'] + ['systemloanid'])
y = X['good_bad_flag']
X.head()

Unnamed: 0,loannumber,loanamount,termdays,interest,loan_interval,interest_interval,good_bad_flag,avg_loanamount,avg_paym_span_hrs,avg_active_span_days,avg_interest,max_loannumber,bank_name_clients,age,age_interval,state
0,0-4,10000.0,30,3000.0,10000,3000,0,10000.0,0.0,33.0,3000.0,1.0,Multinational Banks,36.0,36-40,Lagos
1,0-4,10000.0,30,3000.0,10000,3000,1,10000.0,13.0,3.0,3000.0,1.0,GT Bank,41.0,41-61,Lagos
2,+4,20000.0,30,3000.0,mas de 10000,3000,1,15000.0,15.166667,23.333333,3133.333333,6.0,Multinational Banks,34.0,26-35,Other
3,0-4,20000.0,30,4500.0,mas de 10000,mas de 3000,1,10000.0,13.666667,25.333333,3000.0,3.0,GT Bank,47.0,41-61,Other
4,+4,20000.0,30,4500.0,mas de 10000,mas de 3000,1,12727.272727,11.636364,14.0,1559.090909,11.0,Multinational Banks,27.0,26-35,Other


# **Tipología de variables**

In [None]:
X.info()

In [30]:
categorical_vars = set(X.columns[X.dtypes == object])
numerical_vars = set(X.columns) - categorical_vars
categorical_vars = list(categorical_vars)
numerical_vars = list(numerical_vars)
print(categorical_vars)
print(numerical_vars)

['loannumber', 'termdays', 'state', 'loan_interval', 'bank_name_clients', 'interest_interval', 'age_interval']
['age', 'interest', 'avg_active_span_days', 'avg_interest', 'avg_paym_span_hrs', 'loanamount', 'good_bad_flag', 'max_loannumber', 'avg_loanamount']


# **Transformación en dummies**

In [31]:
#Generamos dummies de las variables categóricas:
dummy_int = pd.get_dummies(X[['interest_interval']])
dummy_bank_name = pd.get_dummies(X[['bank_name_clients']], drop_first=True)
dummy_td = pd.get_dummies(X[['termdays']], drop_first=True)
dummy_age = pd.get_dummies(X[['age_interval']])
dummy_state = pd.get_dummies(X[['state']], drop_first=True)
dummy_loan = pd.get_dummies(X[['loan_interval']], drop_first=True)
dummy_loannum = pd.get_dummies(X[['loannumber']], drop_first=True)

#Añadimos las variables dummies y borramos las categóricas originales:
X = pd.concat([X, dummy_int, dummy_bank_name, dummy_td, dummy_age, dummy_state, dummy_loan, dummy_loannum], axis=1)
X = X.drop(['interest_interval','bank_name_clients', 'termdays', 'age_interval','state','loan_interval','loannumber','good_bad_flag'], 1)

X.head()

  if sys.path[0] == '':


Unnamed: 0,loanamount,interest,avg_loanamount,avg_paym_span_hrs,avg_active_span_days,avg_interest,max_loannumber,age,interest_interval_3000,interest_interval_mas de 3000,interest_interval_menos de 3000,bank_name_clients_Multinational Banks,termdays_Other,age_interval_26-35,age_interval_36-40,age_interval_41-61,state_Other,loan_interval_mas de 10000,loannumber_0-4
0,10000.0,3000.0,10000.0,0.0,33.0,3000.0,1.0,36.0,1,0,0,1,0,0,1,0,0,0,1
1,10000.0,3000.0,10000.0,13.0,3.0,3000.0,1.0,41.0,1,0,0,0,0,0,0,1,0,0,1
2,20000.0,3000.0,15000.0,15.166667,23.333333,3133.333333,6.0,34.0,1,0,0,1,0,1,0,0,1,1,0
3,20000.0,4500.0,10000.0,13.666667,25.333333,3000.0,3.0,47.0,0,1,0,0,0,0,0,1,1,1,1
4,20000.0,4500.0,12727.272727,11.636364,14.0,1559.090909,11.0,27.0,0,1,0,1,0,1,0,0,1,1,0


In [32]:
duplicate_columns = X.columns[X.columns.duplicated()]
print(duplicate_columns)

Index([], dtype='object')


### **Split en Train/Validación/Test**

Utilizaremos los siguientes ratios:

• Train: 70%.

• Validación: 15%.

• Test: 15%.


In [33]:
perc_values = [0.7, 0.15, 0.15];

Creamos los conjuntos de train, validacion y test con el tamaño seleccionado pero respetando el eje temporal.

In [34]:
# dimensiones de los conjuntos de train y test
n_train = int(X.shape[0] * perc_values[0])
n_val = int(X.shape[0] * perc_values[1])
n_test = int(X.shape[0] * perc_values[2])

# selección del conjunto de train
X_train = X.iloc[:n_train]
y_train = y.iloc[:n_train]

# selección del conjunto de validación
X_val = X.iloc[(n_train):(n_train+n_val)]
y_val = y.iloc[(n_train):(n_train+n_val)]

# selección del conjunto de test
X_test = X.iloc[(n_train+n_val):]
y_test = y.iloc[(n_train+n_val):]

Visualizamos el tamaño de los 3 subdatasets

In [35]:
print('Train data size = ' + str(X_train.shape))
print('Train target size = ' + str(y_train.shape))
print('Validation data size = ' + str(X_val.shape))
print('Validation target size = ' + str(y_val.shape))
print('Test data size = ' + str(X_test.shape))
print('Test target size = ' + str(y_test.shape))

Train data size = (2352, 19)
Train target size = (2352,)
Validation data size = (504, 19)
Validation target size = (504,)
Test data size = (505, 19)
Test target size = (505,)


# **Grid Search**

Vamos a proceder a realizar un grid search que compare todos los modelos de clasificación

Importamos todos los modelos que vamos a usar.

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

Importamos la métrica, en este caso AUC.

In [37]:
from sklearn.metrics import roc_auc_score as metric;

Definimos algunos parámetros generales.

In [38]:
random_state = 1;
nthread = multiprocessing.cpu_count() - 1;
scale_pos_weight = sum(y_train == 0) / sum(y_train == 1);

Vamos a calcular los puntos recomendados por Cherkassky para la SVM.

In [39]:
n = X_train.shape[0];
d = X_train.shape[1];
m = np.mean(y_train);
s = np.std(y_train);
C_cherk = np.max([np.abs(m + 3*s),np.abs(m - 3*s)]);
gamma_cherk = np.power(0.2, 1/d)

Definimos el grid a llevar a cabo

In [40]:
# Regresion Logística
regularization_values = ['l1', 'l2', 'none'];
penalty_values = [1, 10, 100];

# SVM
C_values = [C_cherk, 5e-03, 4.5e-03, 4e-03];
gamma_kernel_values = [gamma_cherk, 3.26e-09, 3.255e-09, 3.25e-09];

# Arbol de Decision
max_depth_values = [None, 6, 20];
min_samples_split_values = [2, 5, 20];
min_samples_leaf_values = [1, 5, 20];
max_features_values = [None, 1, 2];

# Random Forest
ntrees_values = [10, 100, 1000];

# Xgboost
nrounds_values = [10, 100]
eta_values = [0.3, 0.99]
gamma_values = [0, 1]
max_depth_values = [6, 20]
min_child_weight_values = [1, 20]
subsample_values = [0.1, 1]
colsample_bytree_values = [0.1, 1]
num_parallel_tree_values = [1, 20]
lambda_values = [0, 1]
alpha_values = [0, 1]

In [41]:
params_values = [{'model': 'logistic regression',
                  'regularization': regularization_values,
                 'penalty': penalty_values},
                 {'model': 'svm',
                  'C': C_values,
                 'gamma_kernel': gamma_kernel_values},
                 {'model': 'decision tree',
                 'max_depth': max_depth_values,
                 'min_samples_split': min_samples_split_values,
                 'min_samples_leaf': min_samples_leaf_values,
                 'max_features': max_features_values},
                 {'model': 'random forest',
                  'n_trees': ntrees_values,
                 'max_depth': max_depth_values,
                 'min_samples_split': min_samples_split_values,
                 'min_samples_leaf': min_samples_leaf_values,
                 'max_features': max_features_values},
                 {'model': 'xgboost',
                  'nrounds': nrounds_values,
                  'eta': eta_values,
                 'gamma': gamma_values,
                 'max_depth': max_depth_values,
                 'min_child_weight': min_child_weight_values,
                 'subsample': subsample_values,
                 'colsample_bytree': colsample_bytree_values,
                 'num_parallel_tree': num_parallel_tree_values,
                 'lambda': lambda_values,
                 'alpha': alpha_values}]

In [42]:
total_iteraciones = 0
for params in params_values:
    if params['model'] == 'logistic regression':
        n = len(params['regularization'])*len(params['penalty'])
    elif params['model'] == 'svm':
        n = len(params['C'])*len(params['gamma_kernel'])
    elif params['model'] == 'decision tree':
        n = len(params['max_depth'])*len(params['min_samples_split'])*len(params['min_samples_leaf'])*len(params['max_features'])
    elif params['model'] == 'random forest':
        n = len(params['n_trees'])*len(params['max_depth'])*len(params['min_samples_split'])*len(params['min_samples_leaf'])*len(params['max_features'])
    elif params['model'] == 'xgboost':
        n = len(params['nrounds'])*len(params['eta'])*len(params['gamma'])*len(params['max_depth'])*len(params['min_child_weight'])*len(params['subsample'])*len(params['colsample_bytree'])*len(params['num_parallel_tree'])*len(params['lambda'])*len(params['alpha'])
    total_iteraciones = total_iteraciones + n;
    print(str(n)+ ' iteraciones de ' + str(params['model']))
print(str(total_iteraciones)+ ' iteraciones en total')        

9 iteraciones de logistic regression
16 iteraciones de svm
54 iteraciones de decision tree
162 iteraciones de random forest
1024 iteraciones de xgboost
1265 iteraciones en total


In [43]:
grid_results = pd.DataFrame();
num_iter = 0
for params in params_values:
    
    
    # Logistic Regression
    if params['model'] == 'logistic regression':
        for regularization in params['regularization']:
            for penalty in params['penalty']:  
                start = timeit.default_timer()
                
                # Actualizar contador
                num_iter += 1; 
                
                # print control iteracion modelo
                print('Inicio de iteracion ' + str(num_iter) + 
                      '. Regularizacion = ' + str(regularization) + 
                      ', Lambda = '  + str(penalty) +
                      '\n')
                
                # Entrenar modelo
                if regularization == 'l1':
                    model = LogisticRegression(penalty = regularization, solver = 'liblinear', C = penalty, random_state = random_state)
                else:
                    model = LogisticRegression(penalty = regularization,solver = 'lbfgs', C = penalty, random_state = random_state)
               
                model.fit(X_train, np.array(y_train))

                # Generar predicciones
                pred_train_p = model.predict_proba(X_train)
                pred_val_p = model.predict_proba(X_val)

                # Calcular métricas de evaluación
                auc_train = metric(y_train, pred_train_p[:, 1])
                auc_val = metric(y_val, pred_val_p[:, 1])
                time = timeit.default_timer() - start

                print('Fin de iteracion ' + str(num_iter) + 
                     '. Regularizacion = ' + str(regularization) + 
                      ', Lambda = '  + str(penalty) +
                      '. AUC train = '  + str(auc_train) + 
                      ' -  AUC val = '  + str(auc_val)  + 
                      ' -  time = '  + str(time)  +
                      '\n')
                grid_results = grid_results.append(pd.DataFrame(data={'model':'Logistic Regression',
                                                                      'params': [{'regularization':[regularization],
                                                                                  'penalty':[penalty]}],
                                                                      'auc_train':[auc_train],
                                                                      'auc_val':[auc_val],
                                                                      'time':[time]},
                                                               columns=['model','params', 'auc_train', 'auc_val', 'time']), 
                                                   ignore_index=True)
                
     
    # SVM
    if params['model'] == 'svm':
        for C in params['C']:
            for gamma_kernel in params['gamma_kernel']:  
                start = timeit.default_timer()
                
                # Actualizar contador
                num_iter += 1; 
                
                # print control iteracion modelo
                print('Inicio de iteracion ' + str(num_iter) + 
                      '. C = ' + str(C) + 
                      ', gamma = '  + str(gamma_kernel) +
                      '\n')
                
                # Entrenar modelo               
                model = SVC(C = C, gamma = gamma_kernel, probability = True, random_state = random_state)
               
                model.fit(X_train, np.array(y_train))

                # Generar predicciones
                pred_train_p = model.predict_proba(X_train)
                pred_val_p = model.predict_proba(X_val)

                # Calcular métricas de evaluación
                auc_train = metric(y_train, pred_train_p[:, 1])
                auc_val = metric(y_val, pred_val_p[:, 1])
                time = timeit.default_timer() - start

                print('Fin de iteracion ' + str(num_iter) + 
                     '. C = ' + str(C) + 
                      ', gamma = '  + str(gamma_kernel) +
                      '. AUC train = '  + str(auc_train) + 
                      ' -  AUC val = '  + str(auc_val)  + 
                      ' -  time = '  + str(time)  +
                      '\n')
                grid_results = grid_results.append(pd.DataFrame(data={'model':'SVM',
                                                                     'params': [{'C':[C],
                                                                              'gamma_kernel':[gamma_kernel]}],
                                                                      'auc_train':[auc_train],
                                                                      'auc_val':[auc_val],
                                                                      'time':[time]},
                                                               columns=['model','params', 'auc_train', 'auc_val', 'time']), 
                                                   ignore_index=True)
                
    # Decision Tree
    if params['model'] == 'decision tree':
        for max_depth in params['max_depth']:
            for min_samples_split in params['min_samples_split']:  
                for min_samples_leaf in params['min_samples_leaf']:  
                    for max_features in params['max_features']:  
                        start = timeit.default_timer()
                
                        # Actualizar contador
                        num_iter += 1; 

                        # print control iteracion modelo
                        print('Inicio de iteracion ' + str(num_iter) + 
                              '. max_depth = ' + str(max_depth) + 
                              ', min_samples_split = '  + str(min_samples_split) +
                              ', min_samples_leaf = '  + str(min_samples_leaf) +
                              ', max_features = '  + str(max_features) +
                              '\n')

                        # Entrenar modelo               
                        model = DecisionTreeClassifier(max_depth = max_depth,
                                                      min_samples_split = min_samples_split,
                                                      min_samples_leaf = min_samples_leaf,
                                                      max_features = max_features, random_state = random_state)

                        model.fit(X_train, np.array(y_train))

                        # Generar predicciones
                        pred_train_p = model.predict_proba(X_train)
                        pred_val_p = model.predict_proba(X_val)

                        # Calcular métricas de evaluación
                        auc_train = metric(y_train, pred_train_p[:, 1])
                        auc_val = metric(y_val, pred_val_p[:, 1])
                        time = timeit.default_timer() - start

                        print('Fin de iteracion ' + str(num_iter) + 
                             '. max_depth = ' + str(max_depth) + 
                              ', min_samples_split = '  + str(min_samples_split) +
                              ', min_samples_leaf = '  + str(min_samples_leaf) +
                              ', max_features = '  + str(max_features) +
                              '. AUC train = '  + str(auc_train) + 
                              ' -  AUC val = '  + str(auc_val)  + 
                              ' -  time = '  + str(time)  +
                              '\n')
                        grid_results = grid_results.append(pd.DataFrame(data={'model':'decision tree',
                                                                              'params': [{'max_depth':[max_depth],
                                                                                          'min_samples_split':[min_samples_split],
                                                                                          'min_samples_leaf':[min_samples_leaf],
                                                                                          'max_features':[max_features]}],
                                                                      'auc_train':[auc_train],
                                                                      'auc_val':[auc_val],
                                                                      'time':[time]},
                                                                       columns=['model','params', 'auc_train', 'auc_val', 'time']), 
                                                           ignore_index=True)  
                        
    
    # Random Forest
    if params['model'] == 'random forest':
        for n_trees in params['n_trees']:
            for max_depth in params['max_depth']:
                for min_samples_split in params['min_samples_split']:  
                    for min_samples_leaf in params['min_samples_leaf']:  
                        for max_features in params['max_features']:  
                            start = timeit.default_timer()
                
                            # Actualizar contador
                            num_iter += 1; 

                            # print control iteracion modelo
                            print('Inicio de iteracion ' + str(num_iter) + 
                                  '. n_trees = ' + str(n_trees) + 
                                  ', max_depth = ' + str(max_depth) + 
                                  ', min_samples_split = '  + str(min_samples_split) +
                                  ', min_samples_leaf = '  + str(min_samples_leaf) +
                                  ', max_features = '  + str(max_features) +
                                  '\n')

                            # Entrenar modelo               
                            model = RandomForestClassifier(n_estimators = n_trees,
                                                          max_depth = max_depth,
                                                          min_samples_split = min_samples_split,
                                                          min_samples_leaf = min_samples_leaf,
                                                          max_features = max_features, random_state = random_state)

                            model.fit(X_train, np.array(y_train))

                            # Generar predicciones
                            pred_train_p = model.predict_proba(X_train)
                            pred_val_p = model.predict_proba(X_val)

                            # Calcular métricas de evaluación
                            auc_train = metric(y_train, pred_train_p[:, 1])
                            auc_val = metric(y_val, pred_val_p[:, 1])  
                            time = timeit.default_timer() - start

                            print('Fin de iteracion ' + str(num_iter) + 
                                 '. n_trees = ' + str(n_trees) + 
                                  ', max_depth = ' + str(max_depth) + 
                                  ', min_samples_split = '  + str(min_samples_split) +
                                  ', min_samples_leaf = '  + str(min_samples_leaf) +
                                  ', max_features = '  + str(max_features) +
                                  '. AUC train = '  + str(auc_train) + 
                                  ' -  AUC val = '  + str(auc_val)  + 
                                  ' -  time = '  + str(time)  +
                                  '\n')
                            grid_results = grid_results.append(pd.DataFrame(data={'model':'random forest',
                                                                                  'params': [{'n_trees':[n_trees],
                                                                                              'max_depth':[max_depth],
                                                                                              'min_samples_split':[min_samples_split],
                                                                                              'min_samples_leaf':[min_samples_leaf],
                                                                                              'max_features':[max_features]}],
                                                                          'auc_train':[auc_train],
                                                                          'auc_val':[auc_val],
                                                                          'time':[time]},
                                                                           columns=['model','params', 'auc_train', 'auc_val', 'time']), 
                                                               ignore_index=True)  
    
    # XGBOOST
    if params['model'] == 'xgboost':
         for nrounds in params['nrounds']:
            for eta in params['eta']:
                for gamma in params['gamma']:
                    for max_depth in params['max_depth']:
                        for min_child_weight in params['min_child_weight']:
                            for subsample in params['subsample']:
                                for colsample_bytree in params['colsample_bytree']:
                                    for num_parallel_tree in params['num_parallel_tree']:
                                        for lamda in params['lambda']:
                                            for alpha in params['alpha']:
                                                start = timeit.default_timer()
                                            
                                                # Actualizar contador
                                                num_iter += 1; 

                                                # print control iteracion modelo
                                                print('Inicio de iteracion ' + str(num_iter) + 
                                                      '. Parametro nrounds = ' + str(nrounds) + 
                                                      ', parametro eta = '  + str(eta) +
                                                      ', parametro gamma = '  + str(gamma) +
                                                      ', parametro max_depth = '  + str(max_depth) +
                                                      ', parametro min_child_weight = '  + str(min_child_weight) +
                                                      ', parametro subsample = '  + str(subsample) +
                                                      ', parametro colsample_bytree = '  + str(colsample_bytree) +
                                                      ', parametro num_parallel_tree = '  + str(num_parallel_tree) +
                                                      ', parametro lambda = '  + str(lamda) +
                                                      ', parametro alpha = '  + str(alpha) + 
                                                      '\n')
                                                # Entrenar modelo
                                                model = XGBClassifier(nthread = nthread, 
                                                                      scale_pos_weight = scale_pos_weight,
                                                                      random_state = random_state,
                                                                      n_estimators = nrounds,
                                                                      learning_rate = eta, 
                                                                      gamma = gamma,
                                                                      max_depth = max_depth,
                                                                      min_child_weight = min_child_weight ,
                                                                      subsample = subsample,
                                                                      colsample_bytree = colsample_bytree,
                                                                      num_parallel_tree = num_parallel_tree,
                                                                      reg_lambda = lamda,
                                                                      reg_alpha = alpha)
                                                model.fit(X_train, np.array(y_train))

                                                # Generar predicciones
                                                pred_train_p = model.predict_proba(X_train)
                                                pred_val_p = model.predict_proba(X_val)

                                                # Calcular métricas de evaluación
                                                auc_train = metric(y_train, pred_train_p[:, 1])
                                                auc_val = metric(y_val, pred_val_p[:, 1])    
                                                time = timeit.default_timer() - start

                                                print('Fin de iteracion ' + str(num_iter) + 
                                                      '. Parametro nrounds = ' + str(nrounds) + 
                                                      ', parametro eta = ' + str(eta) + 
                                                      ', parametro gamma = '  + str(gamma) +
                                                      ', parametro max_depth = '  + str(max_depth) +
                                                      ', parametro min_child_weight = '  + str(min_child_weight) +
                                                      ', parametro subsample = '  + str(subsample) +
                                                      ', parametro colsample_bytree = '  + str(colsample_bytree) +
                                                      ', parametro num_parallel_tree = '  + str(num_parallel_tree) +
                                                      ', parametro lambda = '  + str(lamda) +
                                                      ', parametro alpha = '  + str(alpha) + 
                                                      '. AUC train = '  + str(auc_train) + 
                                                      ' -  AUC val = '  + str(auc_val)  + 
                                                      ' -  time = '  + str(time)  +
                                                      '\n')
                                                grid_results = grid_results.append(pd.DataFrame(data={'model':'xgboost',
                                                                                              'params': [{'nrounds':[nrounds],
                                                                                              'eta':[eta],
                                                                                              'gamma':[gamma],
                                                                                              'max_depth':[max_depth],
                                                                                              'min_child_weight':[min_child_weight],
                                                                                              'subsample':[subsample],
                                                                                              'colsample_bytree':[colsample_bytree],
                                                                                              'num_parallel_tree':[num_parallel_tree],
                                                                                              'lamda':[lamda],
                                                                                              'alpha':[alpha]}],
                                                                                              'auc_train':[auc_train],
                                                                                              'auc_val':[auc_val],
                                                                                              'time':[time]},
                                                                                               columns=['model', 'params', 'auc_train', 'auc_val', 'time']), 
                                                                                   ignore_index=True)
                               


Inicio de iteracion 1. Regularizacion = l1, Lambda = 1





Fin de iteracion 1. Regularizacion = l1, Lambda = 1. AUC train = 0.7009371352979149 -  AUC val = 0.6583231974052024 -  time = 0.4133869979996234

Inicio de iteracion 2. Regularizacion = l1, Lambda = 10





Fin de iteracion 2. Regularizacion = l1, Lambda = 10. AUC train = 0.701674886872792 -  AUC val = 0.6603717218274545 -  time = 4.568972716000189

Inicio de iteracion 3. Regularizacion = l1, Lambda = 100



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fin de iteracion 3. Regularizacion = l1, Lambda = 100. AUC train = 0.7016971466185856 -  AUC val = 0.6602863666431941 -  time = 4.562960999000097

Inicio de iteracion 4. Regularizacion = l2, Lambda = 1

Fin de iteracion 4. Regularizacion = l2, Lambda = 1. AUC train = 0.6870226742010607 -  AUC val = 0.6452851930094105 -  time = 0.08771272500007399

Inicio de iteracion 5. Regularizacion = l2, Lambda = 10

Fin de iteracion 5. Regularizacion = l2, Lambda = 10. AUC train = 0.6870110143342163 -  AUC val = 0.6452851930094105 -  time = 0.08199125300006926

Inicio de iteracion 6. Regularizacion = l2, Lambda = 100



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

Fin de iteracion 6. Regularizacion = l2, Lambda = 100. AUC train = 0.6870290341284303 -  AUC val = 0.6452851930094105 -  time = 0.12267476599981819

Inicio de iteracion 7. Regularizacion = none, Lambda = 1

Fin de iteracion 7. Regularizacion = none, Lambda = 1. AUC train = 0.6870322140921151 -  AUC val = 0.6453278706015406 -  time = 0.09259564300009515

Inicio de iteracion 8. Regularizacion = none, Lambda = 10

Fin de iteracion 8. Regularizacion = none, Lambda = 10. AUC train = 0.6870322140921151 -  AUC val = 0.6453278706015406 -  time = 0.0844012960001237

Inicio de iteracion 9. Regularizacion = none, Lambda = 100



  "Setting penalty='none' will ignore the C and l1_ratio parameters"
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
Inicio de iteracion 16. C = 0.005, gamma = 3.255e-09

Fin de iteracion 16. C = 0.005, gamma = 3.255e-09. AUC train = 0.5502640959840239 -  AUC val = 0.49038687237266076 -  time = 1.0339679469998373

Inicio de iteracion 17. C = 0.005, gamma = 3.25e-09

Fin de iteracion 17. C = 0.005, gamma = 3.25e-09. AUC train = 0.49790175396196973 -  AUC val = 0.4575891428205621 -  time = 1.0366939790001197

Inicio de iteracion 18. C = 0.0045, gamma = 0.9187812103811747

Fin de iteracion 18. C = 0.0045, gamma = 0.9187812103811747. AUC train = 0.9999968200363152 -  AUC val = 0.5500394767727206 -  time = 1.9619848720003574

Inicio de iteracion 19. C = 0.0045, gamma = 3.26e-09

Fin de iteracion 19. C = 0.0045, gamma = 3.26e-09. AUC train = 0.40453802017580964 -  AUC val = 0.41860316240957685 -  time = 1.0183033719999912

Inicio de iteracion 20. C = 0.0045, gamma = 3.255e-09

Fin de iteracion 20. C = 0.0045, gamma = 3.255e-09. AUC 

In [44]:
grid_results.groupby(['model'], sort=False)['time'].mean().sort_values()

model
decision tree          0.013889
xgboost                0.763153
Logistic Regression    1.124700
SVM                    1.343909
random forest          1.621816
Name: time, dtype: float64

Veamos los resultados.

In [45]:
grid_results

Unnamed: 0,model,params,auc_train,auc_val,time
0,Logistic Regression,"{'regularization': ['l1'], 'penalty': [1]}",0.700937,0.658323,0.413387
1,Logistic Regression,"{'regularization': ['l1'], 'penalty': [10]}",0.701675,0.660372,4.568973
2,Logistic Regression,"{'regularization': ['l1'], 'penalty': [100]}",0.701697,0.660286,4.562961
3,Logistic Regression,"{'regularization': ['l2'], 'penalty': [1]}",0.687023,0.645285,0.087713
4,Logistic Regression,"{'regularization': ['l2'], 'penalty': [10]}",0.687011,0.645285,0.081991
...,...,...,...,...,...
1260,xgboost,"{'nrounds': [100], 'eta': [0.99], 'gamma': [1]...",0.777152,0.668673,0.291001
1261,xgboost,"{'nrounds': [100], 'eta': [0.99], 'gamma': [1]...",0.810733,0.670838,4.539366
1262,xgboost,"{'nrounds': [100], 'eta': [0.99], 'gamma': [1]...",0.800147,0.674861,4.548049
1263,xgboost,"{'nrounds': [100], 'eta': [0.99], 'gamma': [1]...",0.821380,0.679289,4.074053


Vamos a analizar el mejor resultado para cada familia de modelos.

In [46]:
grid_results.groupby(['model'], sort=False)['auc_val'].max().sort_values()

model
SVM                    0.560261
Logistic Regression    0.660372
decision tree          0.668182
random forest          0.668224
xgboost                0.702174
Name: auc_val, dtype: float64

Nos quedamos con la mejor combinación de familia de modelos + hiperparámetros.

In [47]:
best_params = grid_results.iloc[grid_results['auc_val'].idxmax()]

Juntamos validación y train para entrenar el modelo final.

In [48]:
print('Train data size = ' + str(X_train.shape))
print('Train target size = ' + str(y_train.shape))
print('Validation data size = ' + str(X_val.shape))
print('Validation target size = ' + str(y_val.shape))

# Combinar train y validación
X_train = pd.concat((X_train,X_val), axis = 0)
y_train = np.concatenate((y_train, y_val), axis = 0)

del X_val, y_val

print('Train data size = ' + str(X_train.shape))
print('Train target size = ' + str(y_train.shape))

Train data size = (2352, 19)
Train target size = (2352,)
Validation data size = (504, 19)
Validation target size = (504,)
Train data size = (2856, 19)
Train target size = (2856,)


Y ahora entrenamos el modelo final.

In [49]:
# Logistic Regression
if best_params['model'] == 'logistic regression':       

    # Entrenar modelo
    if best_params['params']['regularization'] == 'l1':
        model = LogisticRegression(penalty = best_params['params']['regularization'][0], solver = 'liblinear', C = best_params['params']['penalty'][0], random_state = random_state)
    else:
        model = LogisticRegression(penalty = best_params['params']['regularization'][0],solver = 'lbfgs', C = best_params['params']['penalty'][0], random_state = random_state)


# SVM
elif best_params['model'] == 'SVM':

    model = SVC(C = best_params['params']['C'][0], gamma = best_params['params']['gamma_kernel'][0], probability = True, 
                random_state = random_state)             


# Decision Tree
elif best_params['model'] == 'decision tree':
    model = DecisionTreeClassifier(max_depth = int(best_params['params']['max_depth'][0]),
                                                  min_samples_split = int(best_params['params']['min_samples_split'][0]),
                                                  min_samples_leaf = int(best_params['params']['min_samples_leaf'][0]),
                                                  max_features = int(best_params['params']['max_features'][0]), 
                                   random_state = random_state)


# Random Forest
elif best_params['model'] == 'random forest':
    model = RandomForestClassifier(n_estimators = int(best_params['params']['n_trees'][0]),
                                                      max_depth = int(best_params['params']['max_depth'][0]),
                                                      min_samples_split = int(best_params['params']['min_samples_split'][0]),
                                                      min_samples_leaf = int(best_params['params']['min_samples_leaf'][0]),
                                                      max_features = int(best_params['params']['max_features'][0]), 
                                                      random_state = random_state)

# XGBOOST
elif best_params['model'] == 'xgboost':
    model = XGBClassifier(nthread = nthread, 
                                                              scale_pos_weight = scale_pos_weight,
                                                              random_state = random_state,
                                                              n_estimators = int(best_params['params']['nrounds'][0]), 
                                                              learning_rate = best_params['params']['eta'][0], 
                                                              gamma = best_params['params']['gamma'][0],
                                                              max_depth = int(best_params['params']['max_depth'][0]),
                                                              min_child_weight = best_params['params']['min_child_weight'][0],
                                                              subsample = best_params['params']['subsample'][0],
                                                              colsample_bytree = best_params['params']['colsample_bytree'][0],
                                                              num_parallel_tree  = int(best_params['params']['num_parallel_tree'][0]),
                                                              reg_lambda = best_params['params']['lamda'][0],
                                                              reg_alpha = best_params['params']['alpha'][0])

# Entrenar modelo
model.fit(X_train, np.array(y_train))

# Generar predicciones
pred_train_p = model.predict_proba(X_train)
pred_test_p = model.predict_proba(X_test)

# Calcular métricas de evaluación
auc_train = metric(y_train, pred_train_p[:, 1])
auc_test = metric(y_test, pred_test_p[:, 1]) 

results = pd.DataFrame()
results = results.append(pd.DataFrame(data={'model':best_params['model'],'auc_train':[auc_train],'auc_test':[auc_test]}, columns=['model',  'auc_train', 'auc_test']), ignore_index=True)

In [50]:
results

Unnamed: 0,model,auc_train,auc_test
0,xgboost,0.799196,0.711595
