In [1]:
import itertools
import multiprocessing
import timeit

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve
from sklearn.preprocessing import scale
%matplotlib inline
import matplotlib.pyplot as plt
#import aux_functions
#from aux_functions import plot_with_err

# Import data

## Train

In [2]:
# Read the training table and separate the target column 'Y' from the features.
data = pd.read_csv('/content/PCA3_train.csv')
X = data.drop(columns=['Y'])
y = data['Y']
X.head()

Unnamed: 0,PC1,PC2,PC3
0,-1.74325,1.853444,0.503015
1,-1.977215,0.877564,1.604216
2,0.674778,0.925818,-1.858416
3,0.917958,0.801224,1.298191
4,1.984788,-0.838934,-1.138878


In [3]:
# Size of the training set: every row of the training table is used.
n_train = len(X)

# Positional slice covering the first n_train rows of features and target.
train_rows = slice(None, n_train)
X_train = X.iloc[train_rows]
y_train = y.iloc[train_rows]

## Validation y Test

In [4]:
# Read the held-out table (later split into validation and test) and separate
# the target column 'Y' from the features.
data_t = pd.read_csv('/content/PCA3_test.csv')
X_t = data_t.drop(columns=['Y'])
y_t = data_t['Y']
X_t.head()

Unnamed: 0,PC1,PC2,PC3
0,-2.033963,0.112226,1.800107
1,1.691348,2.347953,0.030756
2,1.692131,3.287265,-0.897788
3,2.295393,0.749977,1.688313
4,-2.083365,-0.037694,1.908369


Dividimos el dataset de test en validation y test, es decir, 50-50. Representarán el 10% del dataset total cada uno, ya que el train supone un 80% del total de los datos.

In [5]:
# Fractions of the held-out table assigned to validation and test, in that order.
perc_values = [0.5, 0.5]
# The two fractions must cover the whole table, otherwise rows would be lost silently.
assert abs(sum(perc_values) - 1.0) < 1e-9, 'perc_values must sum to 1'

# Sizes of the validation and test sets. The test set takes whatever is left
# after validation, so rounding never drops a row.
n_val = int(X_t.shape[0] * perc_values[0])
n_test = X_t.shape[0] - n_val

# Validation set: the first n_val rows.
X_val = X_t.iloc[:n_val]
y_val = y_t.iloc[:n_val]

# Test set: the rows that follow the validation block. The slice has to start
# at n_val (not n_test): starting at n_test is only correct for an exact 50/50
# split and otherwise makes the two sets overlap or leaves rows unused.
X_test = X_t.iloc[n_val:]
y_test = y_t.iloc[n_val:]

In [6]:
# Report the shape of the features and of the target for each of the three splits.
partitions = [('Train', X_train, y_train),
              ('Validation', X_val, y_val),
              ('Test', X_test, y_test)]

for split_name, features, target in partitions:
    print(split_name + ' data size = ' + str(features.shape))
    print(split_name + ' target size = ' + str(target.shape))

Train data size = (3361, 3)
Train target size = (3361,)
Validation data size = (420, 3)
Validation target size = (420,)
Test data size = (420, 3)
Test target size = (420,)


# Search Grid

In [7]:
#Hemos decidido probar los siguientes modelos:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [8]:
#Importamos la métrica que evaluará nuestros modelos
from sklearn.metrics import roc_auc_score as metric

## Parámetros generales

In [9]:
# Seed handed to every estimator so that runs are repeatable.
random_state = 42

# Thread count later passed to XGBClassifier: all logical CPUs but one.
nthread = multiprocessing.cpu_count() - 1

# Ratio of class-0 rows to class-1 rows in the training target.
n_negative = sum(y_train == 0)
n_positive = sum(y_train == 1)
scale_pos_weight = n_negative / n_positive

In [10]:
# Starting values for the SVM grid (attributed to Cherkassky in the original notes).

n, d = X_train.shape      # number of training rows, number of features
m = np.mean(y_train)      # mean of the training target
s = np.std(y_train)       # standard deviation of the training target

# C: the larger of |m + 3s| and |m - 3s|.
upper_bound = np.abs(m + 3*s)
lower_bound = np.abs(m - 3*s)
C_cherk = np.max([upper_bound, lower_bound])

# gamma: 0.2 raised to the power 1/d.
gamma_cherk = np.power(0.2, 1/d)


## Definición del grid

In [11]:
# Regresion Logística
regularization_values = ['l1', 'l2', 'none']   #reduce overfitting
penalty_values = [1, 10, 100]   #penaliza por tener demasiadas variables

# SVM
C_values = [C_cherk, 5e-03, 4.5e-03, 4e-03]    #parámetro de regularización 
gamma_kernel_values = [gamma_cherk, 3.26e-09, 3.255e-09, 3.25e-09]    #influencia del train

# Random Forest
ntrees_values = [10, 100, 1000]
max_depth_values = [None, 6, 20];
min_samples_split_values = [2, 5, 20];
min_samples_leaf_values = [1, 5, 20];
max_features_values = [None, 1, 2];

# Xgboost
nrounds_values = [10, 100]
eta_values = [0.3, 0.99]
gamma_values = [0, 1]
max_depth_values = [6, 20]
min_child_weight_values = [1, 20]
subsample_values = [0.1, 1]
colsample_bytree_values = [0.1, 1]
num_parallel_tree_values = [1, 20]
lambda_values = [0, 1]
alpha_values = [0, 1]

In [12]:
params_values = [{'model': 'logistic regression',
                  'regularization': regularization_values,
                 'penalty': penalty_values},

                 {'model': 'svm',
                  'C': C_values,
                 'gamma_kernel': gamma_kernel_values},

                 {'model': 'random forest',
                  'n_trees': ntrees_values,
                 'max_depth': max_depth_values,
                 'min_samples_split': min_samples_split_values,
                 'min_samples_leaf': min_samples_leaf_values,
                 'max_features': max_features_values},

                 {'model': 'xgboost',
                  'nrounds': nrounds_values,
                  'eta': eta_values,
                 'gamma': gamma_values,
                 'max_depth': max_depth_values,
                 'min_child_weight': min_child_weight_values,
                 'subsample': subsample_values,
                 'colsample_bytree': colsample_bytree_values,
                 'num_parallel_tree': num_parallel_tree_values,
                 'lambda': lambda_values,
                 'alpha': alpha_values}]

In [13]:
# Count how many hyperparameter combinations each model contributes to the grid.
# Every key other than 'model' holds a list of candidate values, so the number
# of combinations is the product of the list lengths. Computing it generically
# means a new model or hyperparameter is counted without touching this cell,
# and no stale count from a previous model can be reused by accident.
total_iteraciones = 0
for params in params_values:
    n_combinations = 1
    for param_name, candidates in params.items():
        if param_name != 'model':
            n_combinations *= len(candidates)
    total_iteraciones += n_combinations
    print(str(n_combinations) + ' iteraciones de ' + str(params['model']))
print(str(total_iteraciones) + ' iteraciones en total')

9 iteraciones de logistic regression
16 iteraciones de svm
162 iteraciones de random forest
1024 iteraciones de xgboost
1211 iteraciones en total


## Aplicamos modelos

In [14]:
def _evaluate_candidate(iteration, estimator_cls, estimator_kwargs, description):
    """Build, fit and score one candidate model of the grid.

    The estimator is fitted on the notebook-level ``X_train`` / ``y_train`` and
    scored with ``metric`` on the training and validation splits, using column 1
    of ``predict_proba`` as the score. A start and an end message are printed.

    Parameters
    ----------
    iteration : int
        Running number of the candidate; only used in the progress messages.
    estimator_cls : type
        Estimator class to instantiate.
    estimator_kwargs : dict
        Keyword arguments passed to ``estimator_cls``.
    description : str
        Text listing the hyperparameters, echoed in both progress messages.

    Returns
    -------
    tuple
        ``(auc_train, auc_val, elapsed)``; ``elapsed`` is measured with
        ``timeit.default_timer`` and includes construction, fitting and scoring.
    """
    start = timeit.default_timer()
    print('Inicio de iteracion ' + str(iteration) + '. ' + description + '\n')

    # Train the model
    model = estimator_cls(**estimator_kwargs)
    model.fit(X_train, np.array(y_train))

    # Generate predictions
    pred_train_p = model.predict_proba(X_train)
    pred_val_p = model.predict_proba(X_val)

    # Compute the evaluation metrics
    auc_train = metric(y_train, pred_train_p[:, 1])
    auc_val = metric(y_val, pred_val_p[:, 1])
    elapsed = timeit.default_timer() - start

    print('Fin de iteracion ' + str(iteration) + '. ' + description +
          '. AUC train = ' + str(auc_train) + ' -  AUC val = ' + str(auc_val) +
          ' -  time = ' + str(elapsed) + '\n')
    return auc_train, auc_val, elapsed


# One record per evaluated candidate. The records are turned into a DataFrame
# once at the end: appending to a DataFrame inside the loop copies it on every
# iteration, and DataFrame.append no longer exists in pandas >= 2.0.
grid_records = []
num_iter = 0
for params in params_values:

    # Logistic regression
    if params['model'] == 'logistic regression':
        for regularization, penalty in itertools.product(params['regularization'], params['penalty']):
            num_iter += 1
            # 'l1' is fitted with liblinear; every other penalty goes through lbfgs.
            solver = 'liblinear' if regularization == 'l1' else 'lbfgs'
            auc_train, auc_val, elapsed = _evaluate_candidate(
                num_iter, LogisticRegression,
                {'penalty': regularization, 'solver': solver, 'C': penalty, 'random_state': random_state},
                'Regularizacion = ' + str(regularization) + ', Lambda = ' + str(penalty))
            # Hyperparameter values are stored wrapped in one-element lists;
            # the final-model cell reads them back with [0].
            grid_records.append({'model': 'Logistic Regression',
                                 'params': {'regularization': [regularization], 'penalty': [penalty]},
                                 'auc_train': auc_train, 'auc_val': auc_val, 'time': elapsed})

    # SVM
    elif params['model'] == 'svm':
        for C, gamma_kernel in itertools.product(params['C'], params['gamma_kernel']):
            num_iter += 1
            auc_train, auc_val, elapsed = _evaluate_candidate(
                num_iter, SVC,
                {'C': C, 'gamma': gamma_kernel, 'probability': True, 'random_state': random_state},
                'C = ' + str(C) + ', gamma = ' + str(gamma_kernel))
            grid_records.append({'model': 'SVM',
                                 'params': {'C': [C], 'gamma_kernel': [gamma_kernel]},
                                 'auc_train': auc_train, 'auc_val': auc_val, 'time': elapsed})

    # Random Forest
    elif params['model'] == 'random forest':
        rf_grid = itertools.product(params['n_trees'], params['max_depth'], params['min_samples_split'],
                                    params['min_samples_leaf'], params['max_features'])
        for n_trees, max_depth, min_samples_split, min_samples_leaf, max_features in rf_grid:
            num_iter += 1
            auc_train, auc_val, elapsed = _evaluate_candidate(
                num_iter, RandomForestClassifier,
                {'n_estimators': n_trees, 'max_depth': max_depth,
                 'min_samples_split': min_samples_split,
                 'min_samples_leaf': min_samples_leaf,
                 'max_features': max_features, 'random_state': random_state},
                'n_trees = ' + str(n_trees) + ', max_depth = ' + str(max_depth) +
                ', min_samples_split = ' + str(min_samples_split) +
                ', min_samples_leaf = ' + str(min_samples_leaf) +
                ', max_features = ' + str(max_features))
            grid_records.append({'model': 'random forest',
                                 'params': {'n_trees': [n_trees], 'max_depth': [max_depth],
                                            'min_samples_split': [min_samples_split],
                                            'min_samples_leaf': [min_samples_leaf],
                                            'max_features': [max_features]},
                                 'auc_train': auc_train, 'auc_val': auc_val, 'time': elapsed})

    # XGBoost
    elif params['model'] == 'xgboost':
        xgb_grid = itertools.product(params['nrounds'], params['eta'], params['gamma'], params['max_depth'],
                                     params['min_child_weight'], params['subsample'],
                                     params['colsample_bytree'], params['num_parallel_tree'],
                                     params['lambda'], params['alpha'])
        for (nrounds, eta, gamma, max_depth, min_child_weight, subsample,
             colsample_bytree, num_parallel_tree, lamda, alpha) in xgb_grid:
            num_iter += 1
            auc_train, auc_val, elapsed = _evaluate_candidate(
                num_iter, XGBClassifier,
                {'nthread': nthread, 'scale_pos_weight': scale_pos_weight,
                 'random_state': random_state, 'n_estimators': nrounds,
                 'learning_rate': eta, 'gamma': gamma,
                 'max_depth': max_depth, 'min_child_weight': min_child_weight,
                 'subsample': subsample, 'colsample_bytree': colsample_bytree,
                 'num_parallel_tree': num_parallel_tree,
                 'reg_lambda': lamda, 'reg_alpha': alpha},
                'Parametro nrounds = ' + str(nrounds) +
                ', parametro eta = ' + str(eta) + ', parametro gamma = ' + str(gamma) +
                ', parametro max_depth = ' + str(max_depth) +
                ', parametro min_child_weight = ' + str(min_child_weight) +
                ', parametro subsample = ' + str(subsample) +
                ', parametro colsample_bytree = ' + str(colsample_bytree) +
                ', parametro num_parallel_tree = ' + str(num_parallel_tree) +
                ', parametro lambda = ' + str(lamda) +
                ', parametro alpha = ' + str(alpha))
            # The key is spelled 'lamda' on purpose: the final-model cell reads it back under that name.
            grid_records.append({'model': 'xgboost',
                                 'params': {'nrounds': [nrounds], 'eta': [eta],
                                            'gamma': [gamma], 'max_depth': [max_depth],
                                            'min_child_weight': [min_child_weight],
                                            'subsample': [subsample],
                                            'colsample_bytree': [colsample_bytree],
                                            'num_parallel_tree': [num_parallel_tree],
                                            'lamda': [lamda], 'alpha': [alpha]},
                                 'auc_train': auc_train, 'auc_val': auc_val, 'time': elapsed})

# Rows are in evaluation order with a default 0..N-1 index.
grid_results = pd.DataFrame(grid_records, columns=['model', 'params', 'auc_train', 'auc_val', 'time'])

Inicio de iteracion 1. Regularizacion = l1, Lambda = 1

Fin de iteracion 1. Regularizacion = l1, Lambda = 1. AUC train = 0.6287336073595616 -  AUC val = 0.47192293066294844 -  time = 0.014510347999987516

Inicio de iteracion 2. Regularizacion = l1, Lambda = 10

Fin de iteracion 2. Regularizacion = l1, Lambda = 10. AUC train = 0.6287408185760937 -  AUC val = 0.47217644821903915 -  time = 0.0154497120000201

Inicio de iteracion 3. Regularizacion = l1, Lambda = 100

Fin de iteracion 3. Regularizacion = l1, Lambda = 100. AUC train = 0.6287315470119809 -  AUC val = 0.47217644821903915 -  time = 0.013342587000011008

Inicio de iteracion 4. Regularizacion = l2, Lambda = 1

Fin de iteracion 4. Regularizacion = l2, Lambda = 1. AUC train = 0.6287428789236744 -  AUC val = 0.4721447585245278 -  time = 0.01956900999999789

Inicio de iteracion 5. Regularizacion = l2, Lambda = 10

Fin de iteracion 5. Regularizacion = l2, Lambda = 10. AUC train = 0.6287423638367793 -  AUC val = 0.47217644821903915 -  

  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"


Fin de iteracion 10. C = 2.0232209027962953, gamma = 0.5848035476425733. AUC train = 0.6775867663874896 -  AUC val = 0.4928064393459247 -  time = 3.2362476619999825

Inicio de iteracion 11. C = 2.0232209027962953, gamma = 3.26e-09

Fin de iteracion 11. C = 2.0232209027962953, gamma = 3.26e-09. AUC train = 0.5951195516683665 -  AUC val = 0.4084167828622132 -  time = 1.505303313000013

Inicio de iteracion 12. C = 2.0232209027962953, gamma = 3.255e-09

Fin de iteracion 12. C = 2.0232209027962953, gamma = 3.255e-09. AUC train = 0.6228914917946657 -  AUC val = 0.42245531753073906 -  time = 1.5125190349999684

Inicio de iteracion 13. C = 2.0232209027962953, gamma = 3.25e-09

Fin de iteracion 13. C = 2.0232209027962953, gamma = 3.25e-09. AUC train = 0.6189433507432704 -  AUC val = 0.4328495373304601 -  time = 1.4952931590000276

Inicio de iteracion 14. C = 0.005, gamma = 0.5848035476425733

Fin de iteracion 14. C = 0.005, gamma = 0.5848035476425733. AUC train = 0.6264208672002967 -  AUC val =

## Resultados

In [15]:
# Average time per model, fastest first.
mean_time_by_model = grid_results.groupby(['model'], sort=False)['time'].mean()
mean_time_by_model.sort_values()

model
Logistic Regression    0.017475
xgboost                0.885127
SVM                    1.668917
random forest          2.316723
Name: time, dtype: float64

In [16]:
# Best validation AUC reached by each model, lowest first.
best_auc_by_model = grid_results.groupby(['model'], sort=False)['auc_val'].max()
best_auc_by_model.sort_values()

model
Logistic Regression    0.472176
SVM                    0.509190
random forest          0.612736
xgboost                0.678175
Name: auc_val, dtype: float64

In [17]:
# Pick the best model + hyperparameter combination: the row with the highest validation AUC.
# idxmax() returns an index *label*, so the row is fetched with .loc; .iloc
# would only give the right row while labels happen to equal positions.
best_params = grid_results.loc[grid_results['auc_val'].idxmax()]

# Entrenamiento del modelo final

In [18]:
# Shapes before merging.
print(f'Train data size = {X_train.shape}')
print(f'Train target size = {y_train.shape}')
print(f'Validation data size = {X_val.shape}')
print(f'Validation target size = {y_val.shape}')

# Merge train and validation into a single training set.
# NOTE: X_train / y_train are overwritten and X_val / y_val are deleted, so this
# cell can only be run once per kernel session.
X_train = pd.concat([X_train, X_val], axis=0)
y_train = np.concatenate([y_train, y_val], axis=0)

del X_val, y_val

# Shapes after merging.
print(f'Train data size = {X_train.shape}')
print(f'Train target size = {y_train.shape}')

Train data size = (3361, 3)
Train target size = (3361,)
Validation data size = (420, 3)
Validation target size = (420,)
Train data size = (3781, 3)
Train target size = (3781,)


In [19]:
def _int_or_none(value):
    """Cast a stored hyperparameter to int, letting None pass through unchanged."""
    return None if value is None else int(value)


# Model label of the winning row. It is compared case-insensitively because the
# grid stores it as 'Logistic Regression' / 'SVM' / 'random forest' / 'xgboost'.
best_model_key = str(best_params['model']).lower()

# Every hyperparameter was stored wrapped in a one-element list; unwrap them once.
best_hyperparams = {name: values[0] for name, values in best_params['params'].items()}

# Logistic regression
if best_model_key == 'logistic regression':
    # 'l1' is fitted with liblinear; every other penalty goes through lbfgs.
    solver = 'liblinear' if best_hyperparams['regularization'] == 'l1' else 'lbfgs'
    model = LogisticRegression(penalty=best_hyperparams['regularization'], solver=solver,
                               C=best_hyperparams['penalty'], random_state=random_state)

# SVM
elif best_model_key == 'svm':
    model = SVC(C=best_hyperparams['C'], gamma=best_hyperparams['gamma_kernel'],
                probability=True, random_state=random_state)

# Random Forest
elif best_model_key == 'random forest':
    # max_depth and max_features may be None in the grid, so they must not go through a bare int().
    model = RandomForestClassifier(n_estimators=int(best_hyperparams['n_trees']),
                                   max_depth=_int_or_none(best_hyperparams['max_depth']),
                                   min_samples_split=int(best_hyperparams['min_samples_split']),
                                   min_samples_leaf=int(best_hyperparams['min_samples_leaf']),
                                   max_features=_int_or_none(best_hyperparams['max_features']),
                                   random_state=random_state)

# XGBoost
elif best_model_key == 'xgboost':
    model = XGBClassifier(nthread=nthread, scale_pos_weight=scale_pos_weight, random_state=random_state,
                          n_estimators=int(best_hyperparams['nrounds']),
                          learning_rate=best_hyperparams['eta'],
                          gamma=best_hyperparams['gamma'],
                          max_depth=int(best_hyperparams['max_depth']),
                          min_child_weight=best_hyperparams['min_child_weight'],
                          subsample=best_hyperparams['subsample'],
                          colsample_bytree=best_hyperparams['colsample_bytree'],
                          num_parallel_tree=int(best_hyperparams['num_parallel_tree']),
                          reg_lambda=best_hyperparams['lamda'],   # stored under the key 'lamda'
                          reg_alpha=best_hyperparams['alpha'])

else:
    # Fail loudly rather than fitting whatever `model` an earlier cell left behind.
    raise ValueError('Unknown model in best_params: ' + str(best_params['model']))

# Train the model
model.fit(X_train, np.array(y_train))

# Generate predictions
pred_train_p = model.predict_proba(X_train)
pred_test_p = model.predict_proba(X_test)

# Compute the evaluation metrics
auc_train = metric(y_train, pred_train_p[:, 1])
auc_test = metric(y_test, pred_test_p[:, 1])

# Single-row summary, built directly (DataFrame.append no longer exists in pandas >= 2.0).
results = pd.DataFrame(data={'model': [best_params['model']],
                             'auc_train': [auc_train],
                             'auc_test': [auc_test]},
                       columns=['model', 'auc_train', 'auc_test'])

In [20]:
# Display the train and test AUC of the final model.
results

Unnamed: 0,model,auc_train,auc_test
0,xgboost,0.70035,0.604755
