In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import datetime

In [2]:
data_encoded = pd.read_csv('data_encoded.csv')
data_encoded.head()

Unnamed: 0.1,Unnamed: 0,id,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,...,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction,Arrival Delay in Minutes Media,Arrival Delay in Minutes Mediana,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Eco,Class_Eco Plus
0,-2.450978,-1.105346,0.853462,-1.769073,5,4,3,4,3,4,...,1.76323,1.639646,satisfied,1.636619,1.636619,False,False,False,True,False
1,-2.450978,0.695632,-0.052817,1.419539,1,1,3,1,5,4,...,-0.773881,-0.785357,satisfied,-0.788543,-0.788543,False,False,False,False,False
2,-2.450978,-1.646493,-1.481168,-1.676214,2,0,2,4,2,2,...,-0.773881,-0.785357,neutral or dissatisfied,-0.788543,-0.788543,True,True,False,True,False
3,-2.450978,0.525766,0.440817,1.604222,0,0,0,2,3,4,...,-0.773881,0.454269,satisfied,0.451164,0.451164,True,False,False,False,False
4,-2.450978,-0.357263,0.706518,0.404756,2,3,4,3,4,1,...,-0.773881,1.154131,satisfied,1.151072,1.151072,False,False,False,True,False


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.impute import SimpleImputer

In [4]:
# Variable a Predecir Satisfaction
# Dividir el dataset
X = data_encoded.drop('satisfaction', axis=1)
y = data_encoded['satisfaction']

In [5]:
# Dividir los datos en train y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2025, shuffle=True)

In [6]:
# Imprimir los valores únicos de y_train antes de la transformación
print("Valores únicos de y_train antes de la transformación:")
print(y_train.unique())

Valores únicos de y_train antes de la transformación:
['satisfied' 'neutral or dissatisfied']


In [7]:
# Crear instancia de LabelEncoder
label_encoder = LabelEncoder()

# Codificar las etiquetas de entrenamiento
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Imprimir los valores únicos de y_train después de la codificación
print("Valores únicos de y_train después de la codificación:")
print(np.unique(y_train_encoded))
print(np.unique(y_test_encoded))

Valores únicos de y_train después de la codificación:
[0 1]
[0 1]


In [8]:
# Escalado y ajuste 
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
print("Dimensiones de X_train_scaled:", X_train_scaled.shape)
print("Dimensiones de y_train_encoded:", y_train_encoded.shape)

# Verificar tipos de datos
print("Tipo de datos de X_train_scaled:", X_train_scaled.dtype)
print("Tipo de datos de y_train_encoded:", y_train_encoded.dtype)

# Verificar valores únicos en y_train_encoded
print("Valores únicos en y_train_encoded:", np.unique(y_train_encoded))

Dimensiones de X_train_scaled: (18183, 27)
Dimensiones de y_train_encoded: (18183,)
Tipo de datos de X_train_scaled: float64
Tipo de datos de y_train_encoded: int64
Valores únicos en y_train_encoded: [0 1]


In [10]:
# Imputación de valores faltantes
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_scaled)
X_test_imputed = imputer.transform(X_test_scaled)

### Regresión Logistica

In [11]:
# Hiper-parametros
param_grid_logit = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs']
}

In [22]:
# Regresion Logistica
inicio_entrenamiento_logit = datetime.datetime.now()

logit = LogisticRegression()
grid_logit = GridSearchCV(estimator=logit, param_grid=param_grid_logit, cv=5, scoring='roc_auc')
grid_logit.fit(X_train_imputed, y_train_encoded)

# Predicciones y evaluación
logit_best = grid_logit.best_estimator_
logit_predicts = logit_best.predict(X_test_imputed)
roc_auc = roc_auc_score(y_test_encoded, logit_predicts)

fin_entrenamiento_logit = datetime.datetime.now()
tiempo_entrenamiento_logit = fin_entrenamiento_logit - inicio_entrenamiento_logit

print("Mejores hiperparámetros para regresión logística:", grid_logit.best_params_)
print("Roc_Auc Score: ", roc_auc)

Mejores hiperparámetros para regresión logística: {'C': 1, 'solver': 'liblinear'}
Roc_Auc Score:  0.8671793394182287


In [23]:
logit_results = pd.DataFrame(grid_logit.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
logit_results['algoritmo'] = 'Regresión Logística'
logit_results['tiempo_entrenamiento'] = tiempo_entrenamiento_logit
logit_results['fecha_hora_fin_entrenamiento'] = fin_entrenamiento_logit

logit_results = logit_results[['algoritmo', 'rank_test_score', 'params', 'mean_test_score', 'std_test_score', 'tiempo_entrenamiento', 'fecha_hora_fin_entrenamiento']]
print("Resultados de la búsqueda de hiperparámetros para Regresión Logística:")
logit_results

Resultados de la búsqueda de hiperparámetros para Regresión Logística:


Unnamed: 0,algoritmo,rank_test_score,params,mean_test_score,std_test_score,tiempo_entrenamiento,fecha_hora_fin_entrenamiento
4,Regresión Logística,1,"{'C': 1, 'solver': 'liblinear'}",0.928543,0.002918,0 days 00:00:02.795814,2024-06-30 12:40:34.189073
5,Regresión Logística,2,"{'C': 1, 'solver': 'lbfgs'}",0.928514,0.002886,0 days 00:00:02.795814,2024-06-30 12:40:34.189073
9,Regresión Logística,3,"{'C': 100, 'solver': 'lbfgs'}",0.928513,0.00289,0 days 00:00:02.795814,2024-06-30 12:40:34.189073
7,Regresión Logística,4,"{'C': 10, 'solver': 'lbfgs'}",0.928511,0.002879,0 days 00:00:02.795814,2024-06-30 12:40:34.189073
6,Regresión Logística,5,"{'C': 10, 'solver': 'liblinear'}",0.928495,0.00287,0 days 00:00:02.795814,2024-06-30 12:40:34.189073
8,Regresión Logística,6,"{'C': 100, 'solver': 'liblinear'}",0.928491,0.002862,0 days 00:00:02.795814,2024-06-30 12:40:34.189073
3,Regresión Logística,7,"{'C': 0.1, 'solver': 'lbfgs'}",0.928346,0.002902,0 days 00:00:02.795814,2024-06-30 12:40:34.189073
2,Regresión Logística,8,"{'C': 0.1, 'solver': 'liblinear'}",0.928255,0.003221,0 days 00:00:02.795814,2024-06-30 12:40:34.189073
1,Regresión Logística,9,"{'C': 0.01, 'solver': 'lbfgs'}",0.924621,0.003001,0 days 00:00:02.795814,2024-06-30 12:40:34.189073
0,Regresión Logística,10,"{'C': 0.01, 'solver': 'liblinear'}",0.923047,0.004079,0 days 00:00:02.795814,2024-06-30 12:40:34.189073


### Naive Bayes

In [12]:
# Hiper-parametros
param_grid_nb = {
    'var_smoothing': np.logspace(-9, 0, 10)
}

In [24]:
inicio_entrenamiento_nb = datetime.datetime.now()

nb = GaussianNB()
grid_nb = GridSearchCV(estimator=nb, param_grid=param_grid_nb, cv=5, scoring='roc_auc')
grid_nb.fit(X_train_imputed, y_train_encoded)

fin_entrenamiento_nb = datetime.datetime.now()
tiempo_entrenamiento_nb = fin_entrenamiento_nb - inicio_entrenamiento_nb

nb_best = grid_nb.best_estimator_
nb_predicts = nb_best.predict(X_test_imputed)

roc_auc_nb = roc_auc_score(y_test_encoded, nb_predicts)
print("Mejores hiperparámetros para Gaussian Naive Bayes:", grid_nb.best_params_)
print("ROC AUC Score para Gaussian Naive Bayes: ", roc_auc_nb)


Mejores hiperparámetros para Gaussian Naive Bayes: {'var_smoothing': np.float64(1e-09)}
ROC AUC Score para Gaussian Naive Bayes:  0.8508951819707242


In [25]:
nb_results = pd.DataFrame(grid_nb.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
nb_results['algoritmo'] = 'Gaussian Naive Bayes'
nb_results['tiempo_entrenamiento'] = tiempo_entrenamiento_nb
nb_results['fecha_hora_fin_entrenamiento'] = fin_entrenamiento_nb

nb_results = nb_results[['algoritmo', 'rank_test_score', 'params', 'mean_test_score', 'std_test_score', 'tiempo_entrenamiento', 'fecha_hora_fin_entrenamiento']]
print("Resultados de la búsqueda de hiperparámetros para Gaussian Naive Bayes:")
nb_results

Resultados de la búsqueda de hiperparámetros para Gaussian Naive Bayes:


Unnamed: 0,algoritmo,rank_test_score,params,mean_test_score,std_test_score,tiempo_entrenamiento,fecha_hora_fin_entrenamiento
0,Gaussian Naive Bayes,1,{'var_smoothing': 1e-09},0.923003,0.003072,0 days 00:00:00.577031,2024-06-30 12:40:44.375987
1,Gaussian Naive Bayes,1,{'var_smoothing': 1e-08},0.923003,0.003072,0 days 00:00:00.577031,2024-06-30 12:40:44.375987
2,Gaussian Naive Bayes,1,{'var_smoothing': 1e-07},0.923003,0.003072,0 days 00:00:00.577031,2024-06-30 12:40:44.375987
3,Gaussian Naive Bayes,4,{'var_smoothing': 1e-06},0.923003,0.003072,0 days 00:00:00.577031,2024-06-30 12:40:44.375987
4,Gaussian Naive Bayes,5,{'var_smoothing': 1e-05},0.923003,0.003072,0 days 00:00:00.577031,2024-06-30 12:40:44.375987
5,Gaussian Naive Bayes,6,{'var_smoothing': 0.0001},0.923002,0.003071,0 days 00:00:00.577031,2024-06-30 12:40:44.375987
6,Gaussian Naive Bayes,7,{'var_smoothing': 0.001},0.922987,0.003064,0 days 00:00:00.577031,2024-06-30 12:40:44.375987
7,Gaussian Naive Bayes,8,{'var_smoothing': 0.01},0.922815,0.003039,0 days 00:00:00.577031,2024-06-30 12:40:44.375987
8,Gaussian Naive Bayes,9,{'var_smoothing': 0.1},0.921007,0.002905,0 days 00:00:00.577031,2024-06-30 12:40:44.375987
9,Gaussian Naive Bayes,10,{'var_smoothing': 1.0},0.912034,0.003115,0 days 00:00:00.577031,2024-06-30 12:40:44.375987


### Random Forest

In [13]:
# Hiper-parametros
param_grid_rf = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 20, 30],
    'criterion': ['gini', 'entropy']
}

In [26]:
inicio_entrenamiento_rf = datetime.datetime.now()

rf = RandomForestClassifier()
grid_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, scoring='roc_auc')
grid_rf.fit(X_train_imputed, y_train_encoded)

fin_entrenamiento_rf = datetime.datetime.now()
tiempo_entrenamiento_rf = fin_entrenamiento_rf - inicio_entrenamiento_rf

# Predicciones y evaluación
rf_best = grid_rf.best_estimator_
rf_predicts = rf_best.predict(X_test_imputed)
roc_auc_rf = roc_auc_score(y_test_encoded, rf_predicts)

print("Mejores hiperparámetros para Random Forest:", grid_rf.best_params_)
print("Roc_Auc Score (Random Forest): ", roc_auc_rf)

Mejores hiperparámetros para Random Forest: {'criterion': 'entropy', 'max_depth': 20, 'n_estimators': 500}
Roc_Auc Score (Random Forest):  0.9475398441793553


In [27]:
rf_results = pd.DataFrame(grid_rf.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
rf_results['algoritmo'] = 'Random Forest'
rf_results['tiempo_entrenamiento'] = tiempo_entrenamiento_rf
rf_results['fecha_hora_fin_entrenamiento'] = fin_entrenamiento_rf

rf_results = rf_results[['algoritmo', 'rank_test_score', 'params', 'mean_test_score', 'std_test_score', 'tiempo_entrenamiento', 'fecha_hora_fin_entrenamiento']]
print("Resultados de la búsqueda de hiperparámetros para Random Forest:")
rf_results

Resultados de la búsqueda de hiperparámetros para Random Forest:


Unnamed: 0,algoritmo,rank_test_score,params,mean_test_score,std_test_score,tiempo_entrenamiento,fecha_hora_fin_entrenamiento
14,Random Forest,1,"{'criterion': 'entropy', 'max_depth': 20, 'n_e...",0.991301,0.001159,0 days 00:07:09.644799,2024-06-30 12:48:01.607354
17,Random Forest,2,"{'criterion': 'entropy', 'max_depth': 30, 'n_e...",0.99117,0.001196,0 days 00:07:09.644799,2024-06-30 12:48:01.607354
16,Random Forest,3,"{'criterion': 'entropy', 'max_depth': 30, 'n_e...",0.991156,0.001202,0 days 00:07:09.644799,2024-06-30 12:48:01.607354
13,Random Forest,4,"{'criterion': 'entropy', 'max_depth': 20, 'n_e...",0.991037,0.001228,0 days 00:07:09.644799,2024-06-30 12:48:01.607354
8,Random Forest,5,"{'criterion': 'gini', 'max_depth': 30, 'n_esti...",0.99098,0.00118,0 days 00:07:09.644799,2024-06-30 12:48:01.607354
12,Random Forest,6,"{'criterion': 'entropy', 'max_depth': 20, 'n_e...",0.990899,0.001152,0 days 00:07:09.644799,2024-06-30 12:48:01.607354
5,Random Forest,7,"{'criterion': 'gini', 'max_depth': 20, 'n_esti...",0.990885,0.001219,0 days 00:07:09.644799,2024-06-30 12:48:01.607354
7,Random Forest,8,"{'criterion': 'gini', 'max_depth': 30, 'n_esti...",0.990786,0.001362,0 days 00:07:09.644799,2024-06-30 12:48:01.607354
15,Random Forest,9,"{'criterion': 'entropy', 'max_depth': 30, 'n_e...",0.990711,0.001004,0 days 00:07:09.644799,2024-06-30 12:48:01.607354
4,Random Forest,10,"{'criterion': 'gini', 'max_depth': 20, 'n_esti...",0.990659,0.001173,0 days 00:07:09.644799,2024-06-30 12:48:01.607354


### SVM

In [14]:
# Hiper-parametros
param_grid_svm = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly']
}

In [28]:
inicio_entrenamiento_svm = datetime.datetime.now()

svm = SVC()
grid_svm = GridSearchCV(estimator=svm, param_grid=param_grid_svm, cv=5, scoring='roc_auc')
grid_svm.fit(X_train_imputed, y_train_encoded)

fin_entrenamiento_svm = datetime.datetime.now()
tiempo_entrenamiento_svm = fin_entrenamiento_svm - inicio_entrenamiento_svm

# Predicciones y evaluación
svm_best = grid_svm.best_estimator_
svm_predicts = svm_best.predict(X_test_imputed)
roc_auc_svm = roc_auc_score(y_test_encoded, svm_predicts)

print("Mejores hiperparámetros para SVM:", grid_svm.best_params_)
print("Roc_Auc Score (SVM): ", roc_auc_svm)

Mejores hiperparámetros para SVM: {'C': 10, 'kernel': 'rbf'}
Roc_Auc Score (SVM):  0.9382397278658613


In [29]:
svm_results = pd.DataFrame(grid_svm.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
svm_results['algoritmo'] = 'SVM'
svm_results['tiempo_entrenamiento'] = tiempo_entrenamiento_svm
svm_results['fecha_hora_fin_entrenamiento'] = fin_entrenamiento_svm

svm_results = svm_results[['algoritmo', 'rank_test_score', 'params', 'mean_test_score', 'std_test_score', 'tiempo_entrenamiento', 'fecha_hora_fin_entrenamiento']]
print("Resultados de la búsqueda de hiperparámetros para SVM:")
svm_results

Resultados de la búsqueda de hiperparámetros para SVM:


Unnamed: 0,algoritmo,rank_test_score,params,mean_test_score,std_test_score,tiempo_entrenamiento,fecha_hora_fin_entrenamiento
7,SVM,1,"{'C': 10, 'kernel': 'rbf'}",0.984714,0.002398,0 days 00:11:09.105028,2024-06-30 12:59:18.126277
8,SVM,2,"{'C': 10, 'kernel': 'poly'}",0.982594,0.002557,0 days 00:11:09.105028,2024-06-30 12:59:18.126277
5,SVM,3,"{'C': 1, 'kernel': 'poly'}",0.981931,0.002042,0 days 00:11:09.105028,2024-06-30 12:59:18.126277
4,SVM,4,"{'C': 1, 'kernel': 'rbf'}",0.981606,0.002415,0 days 00:11:09.105028,2024-06-30 12:59:18.126277
10,SVM,5,"{'C': 100, 'kernel': 'rbf'}",0.980276,0.002931,0 days 00:11:09.105028,2024-06-30 12:59:18.126277
11,SVM,6,"{'C': 100, 'kernel': 'poly'}",0.977115,0.001811,0 days 00:11:09.105028,2024-06-30 12:59:18.126277
2,SVM,7,"{'C': 0.1, 'kernel': 'poly'}",0.977005,0.001986,0 days 00:11:09.105028,2024-06-30 12:59:18.126277
1,SVM,8,"{'C': 0.1, 'kernel': 'rbf'}",0.967799,0.002058,0 days 00:11:09.105028,2024-06-30 12:59:18.126277
0,SVM,9,"{'C': 0.1, 'kernel': 'linear'}",0.928607,0.00294,0 days 00:11:09.105028,2024-06-30 12:59:18.126277
3,SVM,10,"{'C': 1, 'kernel': 'linear'}",0.928551,0.002948,0 days 00:11:09.105028,2024-06-30 12:59:18.126277


### LDA

In [15]:
# Hiper-parametros 
param_grid_lda = {
    'solver': ['svd', 'lsqr', 'eigen']
}

In [30]:
inicio_entrenamiento_lda = datetime.datetime.now()

lda = LDA()
grid_lda = GridSearchCV(estimator=lda, param_grid=param_grid_lda, cv=5, scoring='roc_auc')
grid_lda.fit(X_train_imputed, y_train_encoded)

fin_entrenamiento_lda = datetime.datetime.now()
tiempo_entrenamiento_lda = fin_entrenamiento_lda - inicio_entrenamiento_lda

# Prediccion y evaluacion
lda_best = grid_lda.best_estimator_
lda_predicts = lda_best.predict(X_test_imputed)
roc_auc_lda = roc_auc_score(y_test_encoded, lda_predicts)

print("Mejores hiperparámetros para LDA:", grid_lda.best_params_)
print("ROC AUC Score para LDA: ", roc_auc_lda)

Mejores hiperparámetros para LDA: {'solver': 'svd'}
ROC AUC Score para LDA:  0.8668158513655468


4 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/chrismerida/Documents/U/Statistical Learning /Proyecto Statistical/Proyecto-Statistical/proyectovenv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/chrismerida/Documents/U/Statistical Learning /Proyecto Statistical/Proyecto-Statistical/proyectovenv/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/chrismerida/Documents/U/Statistical Learning /Proyecto Statistical/Proyect

In [31]:
lda_results = pd.DataFrame(grid_lda.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
lda_results['algoritmo'] = 'LDA'
lda_results['tiempo_entrenamiento'] = tiempo_entrenamiento_lda
lda_results['fecha_hora_fin_entrenamiento'] = fin_entrenamiento_lda

lda_results = lda_results[['algoritmo', 'rank_test_score', 'params', 'mean_test_score', 'std_test_score', 'tiempo_entrenamiento', 'fecha_hora_fin_entrenamiento']]
print("Resultados de la búsqueda de hiperparámetros para LDA:")
lda_results

Resultados de la búsqueda de hiperparámetros para LDA:


Unnamed: 0,algoritmo,rank_test_score,params,mean_test_score,std_test_score,tiempo_entrenamiento,fecha_hora_fin_entrenamiento
0,LDA,1,{'solver': 'svd'},0.92659,0.003063,0 days 00:00:00.347090,2024-06-30 12:59:47.701452
1,LDA,1,{'solver': 'lsqr'},0.92659,0.003063,0 days 00:00:00.347090,2024-06-30 12:59:47.701452
2,LDA,3,{'solver': 'eigen'},,,0 days 00:00:00.347090,2024-06-30 12:59:47.701452


### Arboles de decision

In [16]:
#Hiper-parametros
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [32]:
inicio_entrenamiento_dt = datetime.datetime.now()

dt = DecisionTreeClassifier(random_state=2025)
grid_dt = GridSearchCV(estimator=dt, param_grid=param_grid_dt, cv=5, scoring='roc_auc')
grid_dt.fit(X_train_imputed, y_train_encoded)

fin_entrenamiento_dt = datetime.datetime.now()
tiempo_entrenamiento_dt = fin_entrenamiento_dt - inicio_entrenamiento_dt

# Prediccion y evaluacion
dt_best = grid_dt.best_estimator_
dt_predicts = dt_best.predict(X_test_imputed)
roc_auc_dt = roc_auc_score(y_test_encoded, dt_predicts)

print("Mejores hiperparámetros para Árbol de Decisión:", grid_dt.best_params_)
print("ROC AUC Score para Árbol de Decisión: ", roc_auc_dt)

Mejores hiperparámetros para Árbol de Decisión: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
ROC AUC Score para Árbol de Decisión:  0.9393521911163103


In [33]:
dt_results = pd.DataFrame(grid_dt.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
dt_results['algoritmo'] = 'Decision Tree'
dt_results['tiempo_entrenamiento'] = tiempo_entrenamiento_dt
dt_results['fecha_hora_fin_entrenamiento'] = fin_entrenamiento_dt

dt_results = dt_results[['algoritmo', 'rank_test_score', 'params', 'mean_test_score', 'std_test_score', 'tiempo_entrenamiento', 'fecha_hora_fin_entrenamiento']]
print("Resultados de la búsqueda de hiperparámetros para Decision Tree:")
dt_results

Resultados de la búsqueda de hiperparámetros para Decision Tree:


Unnamed: 0,algoritmo,rank_test_score,params,mean_test_score,std_test_score,tiempo_entrenamiento,fecha_hora_fin_entrenamiento
62,Decision Tree,1,"{'criterion': 'entropy', 'max_depth': 10, 'min...",0.980284,0.001510,0 days 00:00:44.663784,2024-06-30 13:00:38.369718
59,Decision Tree,2,"{'criterion': 'entropy', 'max_depth': 10, 'min...",0.979643,0.001479,0 days 00:00:44.663784,2024-06-30 13:00:38.369718
61,Decision Tree,3,"{'criterion': 'entropy', 'max_depth': 10, 'min...",0.979390,0.001272,0 days 00:00:44.663784,2024-06-30 13:00:38.369718
60,Decision Tree,3,"{'criterion': 'entropy', 'max_depth': 10, 'min...",0.979390,0.001272,0 days 00:00:44.663784,2024-06-30 13:00:38.369718
56,Decision Tree,5,"{'criterion': 'entropy', 'max_depth': 10, 'min...",0.979334,0.001936,0 days 00:00:44.663784,2024-06-30 13:00:38.369718
...,...,...,...,...,...,...,...
19,Decision Tree,86,"{'criterion': 'gini', 'max_depth': 20, 'min_sa...",0.935151,0.004403,0 days 00:00:44.663784,2024-06-30 13:00:38.369718
27,Decision Tree,87,"{'criterion': 'gini', 'max_depth': 30, 'min_sa...",0.926120,0.005095,0 days 00:00:44.663784,2024-06-30 13:00:38.369718
0,Decision Tree,87,"{'criterion': 'gini', 'max_depth': None, 'min_...",0.926120,0.005095,0 days 00:00:44.663784,2024-06-30 13:00:38.369718
36,Decision Tree,87,"{'criterion': 'gini', 'max_depth': 50, 'min_sa...",0.926120,0.005095,0 days 00:00:44.663784,2024-06-30 13:00:38.369718


### Analisis de discriminante cuadratico

In [17]:
# Hiper-parametros
param_grid_qda = {
    'reg_param': [0.0, 0.1, 0.5, 1.0]
}

In [34]:
inicio_entrenamiento_qda = datetime.datetime.now()

qda = QuadraticDiscriminantAnalysis()
grid_qda = GridSearchCV(estimator=qda, param_grid=param_grid_qda, cv=5, scoring='roc_auc')
grid_qda.fit(X_train_imputed, y_train_encoded)

fin_entrenamiento_qda = datetime.datetime.now()
tiempo_entrenamiento_qda = fin_entrenamiento_qda - inicio_entrenamiento_qda

# Prediccion y evaluacion
qda_best = grid_qda.best_estimator_
qda_predicts = qda_best.predict(X_test_imputed)
roc_auc_qda = roc_auc_score(y_test_encoded, qda_predicts)

print("Mejores hiperparámetros para Análisis de Discriminante Cuadrático (QDA):", grid_qda.best_params_)
print("ROC AUC Score para QDA: ", roc_auc_qda)



Mejores hiperparámetros para Análisis de Discriminante Cuadrático (QDA): {'reg_param': 0.1}
ROC AUC Score para QDA:  0.8527003483502625




In [35]:
qda_results = pd.DataFrame(grid_qda.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
qda_results['algoritmo'] = 'QDA'
qda_results['tiempo_entrenamiento'] = tiempo_entrenamiento_qda
qda_results['fecha_hora_fin_entrenamiento'] = fin_entrenamiento_qda

qda_results = qda_results[['algoritmo', 'rank_test_score', 'params', 'mean_test_score', 'std_test_score', 'tiempo_entrenamiento', 'fecha_hora_fin_entrenamiento']]
print("Resultados de la búsqueda de hiperparámetros para QDA:")
qda_results

Resultados de la búsqueda de hiperparámetros para QDA:


Unnamed: 0,algoritmo,rank_test_score,params,mean_test_score,std_test_score,tiempo_entrenamiento,fecha_hora_fin_entrenamiento
1,QDA,1,{'reg_param': 0.1},0.930756,0.003379,0 days 00:00:00.411806,2024-06-30 13:02:29.257823
2,QDA,2,{'reg_param': 0.5},0.914442,0.003481,0 days 00:00:00.411806,2024-06-30 13:02:29.257823
0,QDA,3,{'reg_param': 0.0},0.912297,0.009306,0 days 00:00:00.411806,2024-06-30 13:02:29.257823
3,QDA,4,{'reg_param': 1.0},0.90385,0.003707,0 days 00:00:00.411806,2024-06-30 13:02:29.257823


### AdaBoost

In [18]:
# Hiper-parametros
param_grid_ada = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1]
}

In [36]:
inicio_entrenamiento_ada = datetime.datetime.now()

ada = AdaBoostClassifier()
grid_ada = GridSearchCV(estimator=ada, param_grid=param_grid_ada, cv=5, scoring='roc_auc')
grid_ada.fit(X_train_imputed, y_train_encoded)

fin_entrenamiento_ada = datetime.datetime.now()
tiempo_entrenamiento_ada = fin_entrenamiento_ada - inicio_entrenamiento_ada

# Predicciones y evaluación
ada_best = grid_ada.best_estimator_
ada_predicts = ada_best.predict(X_test_imputed)
roc_auc = roc_auc_score(y_test_encoded, ada_predicts)

print("Mejores hiperparámetros para AdaBoost:", grid_ada.best_params_)
print("Roc_Auc Score: ", roc_auc)



Mejores hiperparámetros para AdaBoost: {'learning_rate': 1, 'n_estimators': 200}
Roc_Auc Score:  0.9278081338424781


In [37]:
ada_results = pd.DataFrame(grid_ada.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
ada_results['algoritmo'] = 'AdaBoost'
ada_results['tiempo_entrenamiento'] = tiempo_entrenamiento_ada
ada_results['fecha_hora_fin_entrenamiento'] = fin_entrenamiento_ada

ada_results = ada_results[['algoritmo', 'rank_test_score', 'params', 'mean_test_score', 'std_test_score', 'tiempo_entrenamiento', 'fecha_hora_fin_entrenamiento']]
print("Resultados de la búsqueda de hiperparámetros para AdaBoost:")
ada_results

Resultados de la búsqueda de hiperparámetros para AdaBoost:


Unnamed: 0,algoritmo,rank_test_score,params,mean_test_score,std_test_score,tiempo_entrenamiento,fecha_hora_fin_entrenamiento
8,AdaBoost,1,"{'learning_rate': 1, 'n_estimators': 200}",0.979274,0.002604,0 days 00:01:23.896445,2024-06-30 13:04:00.259054
7,AdaBoost,2,"{'learning_rate': 1, 'n_estimators': 100}",0.978835,0.002737,0 days 00:01:23.896445,2024-06-30 13:04:00.259054
6,AdaBoost,3,"{'learning_rate': 1, 'n_estimators': 50}",0.977448,0.002905,0 days 00:01:23.896445,2024-06-30 13:04:00.259054
5,AdaBoost,4,"{'learning_rate': 0.1, 'n_estimators': 200}",0.974271,0.002318,0 days 00:01:23.896445,2024-06-30 13:04:00.259054
4,AdaBoost,5,"{'learning_rate': 0.1, 'n_estimators': 100}",0.969352,0.002699,0 days 00:01:23.896445,2024-06-30 13:04:00.259054
3,AdaBoost,6,"{'learning_rate': 0.1, 'n_estimators': 50}",0.958284,0.003312,0 days 00:01:23.896445,2024-06-30 13:04:00.259054
2,AdaBoost,7,"{'learning_rate': 0.01, 'n_estimators': 200}",0.929731,0.003964,0 days 00:01:23.896445,2024-06-30 13:04:00.259054
1,AdaBoost,8,"{'learning_rate': 0.01, 'n_estimators': 100}",0.909354,0.005294,0 days 00:01:23.896445,2024-06-30 13:04:00.259054
0,AdaBoost,9,"{'learning_rate': 0.01, 'n_estimators': 50}",0.862776,0.004732,0 days 00:01:23.896445,2024-06-30 13:04:00.259054


### Gradient Boosting

In [19]:
# Hiper-parametros
param_grid_gb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 7]
}

In [38]:
inicio_entrenamiento_gb = datetime.datetime.now()

gb = GradientBoostingClassifier()
grid_gb = GridSearchCV(estimator=gb, param_grid=param_grid_gb, cv=5, scoring='roc_auc')
grid_gb.fit(X_train_imputed, y_train_encoded)

fin_entrenamiento_gb = datetime.datetime.now()
tiempo_entrenamiento_gb = fin_entrenamiento_gb - inicio_entrenamiento_gb

# Predicciones y evaluación
gb_best = grid_gb.best_estimator_
gb_predicts = gb_best.predict(X_test_imputed)
roc_auc = roc_auc_score(y_test_encoded, gb_predicts)

print("Mejores hiperparámetros para Gradient Boosting:", grid_gb.best_params_)
print("Roc_Auc Score: ", roc_auc)

Mejores hiperparámetros para Gradient Boosting: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
Roc_Auc Score:  0.9541974871939124


In [39]:
gb_results = pd.DataFrame(grid_gb.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
gb_results['algoritmo'] = 'Gradient Boosting'
gb_results['tiempo_entrenamiento'] = tiempo_entrenamiento_gb
gb_results['fecha_hora_fin_entrenamiento'] = fin_entrenamiento_gb

gb_results = gb_results[['algoritmo', 'rank_test_score', 'params', 'mean_test_score', 'std_test_score', 'tiempo_entrenamiento', 'fecha_hora_fin_entrenamiento']]
print("Resultados de la búsqueda de hiperparámetros para Gradient Boosting:")
gb_results

Resultados de la búsqueda de hiperparámetros para Gradient Boosting:


Unnamed: 0,algoritmo,rank_test_score,params,mean_test_score,std_test_score,tiempo_entrenamiento,fecha_hora_fin_entrenamiento
17,Gradient Boosting,1,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.993781,0.000848,0 days 00:14:29.094706,2024-06-30 13:18:43.738492
16,Gradient Boosting,2,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.993458,0.000901,0 days 00:14:29.094706,2024-06-30 13:18:43.738492
14,Gradient Boosting,3,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",0.993439,0.000933,0 days 00:14:29.094706,2024-06-30 13:18:43.738492
26,Gradient Boosting,4,"{'learning_rate': 0.5, 'max_depth': 7, 'n_esti...",0.993399,0.00089,0 days 00:14:29.094706,2024-06-30 13:18:43.738492
25,Gradient Boosting,5,"{'learning_rate': 0.5, 'max_depth': 7, 'n_esti...",0.99295,0.001085,0 days 00:14:29.094706,2024-06-30 13:18:43.738492
23,Gradient Boosting,6,"{'learning_rate': 0.5, 'max_depth': 5, 'n_esti...",0.992551,0.000897,0 days 00:14:29.094706,2024-06-30 13:18:43.738492
13,Gradient Boosting,7,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",0.992498,0.001117,0 days 00:14:29.094706,2024-06-30 13:18:43.738492
15,Gradient Boosting,8,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.992378,0.001,0 days 00:14:29.094706,2024-06-30 13:18:43.738492
24,Gradient Boosting,9,"{'learning_rate': 0.5, 'max_depth': 7, 'n_esti...",0.992329,0.001259,0 days 00:14:29.094706,2024-06-30 13:18:43.738492
22,Gradient Boosting,10,"{'learning_rate': 0.5, 'max_depth': 5, 'n_esti...",0.992186,0.001088,0 days 00:14:29.094706,2024-06-30 13:18:43.738492


### XGBoost

In [20]:
# Hiper-parametros
param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7]
}

In [40]:
inicio_entrenamiento_xgb = datetime.datetime.now()

xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
grid_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid_xgb, cv=5, scoring='roc_auc')
grid_xgb.fit(X_train_imputed, y_train_encoded)

fin_entrenamiento_xgb = datetime.datetime.now()
tiempo_entrenamiento_xgb = fin_entrenamiento_xgb - inicio_entrenamiento_xgb

# Predicciones y evaluación
xgb_best = grid_xgb.best_estimator_
xgb_predicts = xgb_best.predict(X_test_imputed)
roc_auc = roc_auc_score(y_test_encoded, xgb_predicts)

print("Mejores hiperparámetros para XGBoost:", grid_xgb.best_params_)
print("Roc_Auc Score: ", roc_auc)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Mejores hiperparámetros para XGBoost: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
Roc_Auc Score:  0.9549895215908348


In [41]:
xgb_results = pd.DataFrame(grid_xgb.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
xgb_results['algoritmo'] = 'XGBoost'
xgb_results['tiempo_entrenamiento'] = tiempo_entrenamiento_xgb
xgb_results['fecha_hora_fin_entrenamiento'] = fin_entrenamiento_xgb

xgb_results = xgb_results[['algoritmo', 'rank_test_score', 'params', 'mean_test_score', 'std_test_score', 'tiempo_entrenamiento', 'fecha_hora_fin_entrenamiento']]
print("Resultados de la búsqueda de hiperparámetros para XGBoost:")
xgb_results

Resultados de la búsqueda de hiperparámetros para XGBoost:


Unnamed: 0,algoritmo,rank_test_score,params,mean_test_score,std_test_score,tiempo_entrenamiento,fecha_hora_fin_entrenamiento
17,XGBoost,1,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.993937,0.00075,0 days 00:00:22.984750,2024-06-30 13:21:02.578448
25,XGBoost,2,"{'learning_rate': 0.3, 'max_depth': 7, 'n_esti...",0.9936,0.000865,0 days 00:00:22.984750,2024-06-30 13:21:02.578448
16,XGBoost,3,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.993503,0.000711,0 days 00:00:22.984750,2024-06-30 13:21:02.578448
14,XGBoost,4,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",0.993452,0.000878,0 days 00:00:22.984750,2024-06-30 13:21:02.578448
26,XGBoost,5,"{'learning_rate': 0.3, 'max_depth': 7, 'n_esti...",0.993411,0.000911,0 days 00:00:22.984750,2024-06-30 13:21:02.578448
24,XGBoost,6,"{'learning_rate': 0.3, 'max_depth': 7, 'n_esti...",0.993398,0.000895,0 days 00:00:22.984750,2024-06-30 13:21:02.578448
23,XGBoost,7,"{'learning_rate': 0.3, 'max_depth': 5, 'n_esti...",0.993277,0.000802,0 days 00:00:22.984750,2024-06-30 13:21:02.578448
22,XGBoost,8,"{'learning_rate': 0.3, 'max_depth': 5, 'n_esti...",0.993222,0.000805,0 days 00:00:22.984750,2024-06-30 13:21:02.578448
21,XGBoost,9,"{'learning_rate': 0.3, 'max_depth': 5, 'n_esti...",0.992832,0.00085,0 days 00:00:22.984750,2024-06-30 13:21:02.578448
20,XGBoost,10,"{'learning_rate': 0.3, 'max_depth': 3, 'n_esti...",0.99246,0.00087,0 days 00:00:22.984750,2024-06-30 13:21:02.578448


### LGBM

In [21]:
# Hiper-parametros
param_grid_lgb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7]
}

In [42]:
inicio_entrenamiento_lgb = datetime.datetime.now()

lgb_model = lgb.LGBMClassifier()
grid_lgb = GridSearchCV(estimator=lgb_model, param_grid=param_grid_lgb, cv=5, scoring='roc_auc')
grid_lgb.fit(X_train_imputed, y_train_encoded)

fin_entrenamiento_lgb = datetime.datetime.now()
tiempo_entrenamiento_lgb = fin_entrenamiento_lgb - inicio_entrenamiento_lgb

# Predicciones y evaluación
lgb_best = grid_lgb.best_estimator_
lgb_predicts = lgb_best.predict(X_test_imputed)
roc_auc = roc_auc_score(y_test_encoded, lgb_predicts)

print("Mejores hiperparámetros para LightGBM:", grid_lgb.best_params_)
print("Roc_Auc Score: ", roc_auc)

[LightGBM] [Info] Number of positive: 6423, number of negative: 8123
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000533 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1210
[LightGBM] [Info] Number of data points in the train set: 14546, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.441565 -> initscore=-0.234814
[LightGBM] [Info] Start training from score -0.234814
[LightGBM] [Info] Number of positive: 6423, number of negative: 8123
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000574 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1210
[LightGBM] [Info] Number of data points in the train set: 14546, number of used features: 27
[LightGBM] [Info] [bin

In [43]:
lgb_results = pd.DataFrame(grid_lgb.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
lgb_results['algoritmo'] = 'LightGBM'
lgb_results['tiempo_entrenamiento'] = tiempo_entrenamiento_lgb
lgb_results['fecha_hora_fin_entrenamiento'] = fin_entrenamiento_lgb

lgb_results = lgb_results[['algoritmo', 'rank_test_score', 'params', 'mean_test_score', 'std_test_score', 'tiempo_entrenamiento', 'fecha_hora_fin_entrenamiento']]
print("Resultados de la búsqueda de hiperparámetros para LightGBM:")
lgb_results

Resultados de la búsqueda de hiperparámetros para LightGBM:


Unnamed: 0,algoritmo,rank_test_score,params,mean_test_score,std_test_score,tiempo_entrenamiento,fecha_hora_fin_entrenamiento
17,LightGBM,1,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.993766,0.000921,0 days 00:00:16.224095,2024-06-30 13:23:10.012921
24,LightGBM,2,"{'learning_rate': 0.3, 'max_depth': 7, 'n_esti...",0.993562,0.000997,0 days 00:00:16.224095,2024-06-30 13:23:10.012921
25,LightGBM,3,"{'learning_rate': 0.3, 'max_depth': 7, 'n_esti...",0.99354,0.001089,0 days 00:00:16.224095,2024-06-30 13:23:10.012921
16,LightGBM,4,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.993502,0.00096,0 days 00:00:16.224095,2024-06-30 13:23:10.012921
26,LightGBM,5,"{'learning_rate': 0.3, 'max_depth': 7, 'n_esti...",0.993476,0.000866,0 days 00:00:16.224095,2024-06-30 13:23:10.012921
14,LightGBM,6,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",0.993446,0.000815,0 days 00:00:16.224095,2024-06-30 13:23:10.012921
23,LightGBM,7,"{'learning_rate': 0.3, 'max_depth': 5, 'n_esti...",0.993228,0.001032,0 days 00:00:16.224095,2024-06-30 13:23:10.012921
22,LightGBM,8,"{'learning_rate': 0.3, 'max_depth': 5, 'n_esti...",0.993105,0.001018,0 days 00:00:16.224095,2024-06-30 13:23:10.012921
21,LightGBM,9,"{'learning_rate': 0.3, 'max_depth': 5, 'n_esti...",0.992967,0.000839,0 days 00:00:16.224095,2024-06-30 13:23:10.012921
13,LightGBM,10,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",0.992529,0.000843,0 days 00:00:16.224095,2024-06-30 13:23:10.012921


In [52]:
# Resultados unificados
all_results = pd.concat([
    logit_results, 
    nb_results, 
    rf_results, 
    svm_results, 
    lda_results, 
    dt_results, 
    qda_results, 
    ada_results, 
    gb_results, 
    xgb_results, 
    lgb_results
], ignore_index=True)

# Guardar como archivo csv
all_results.to_csv('model_results.csv', index=False)
print("Resultados guardados en 'model_results.csv'")


Resultados guardados en 'model_results.csv'


### Optimizacion de hiper-parametros

In [45]:
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

hyper_params_grid = {
    "n_estimators":[10, 50, 100, 500, 1000],
    "criterion":["gini", "entropy"],
    "max_depth":[None, 10, 25]
}

grid_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid_xgb, cv=5, scoring='roc_auc')
grid_xgb.fit(X_train_imputed, y_train_encoded)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

In [46]:
# Mejor calificacion
grid_xgb.best_score_

np.float64(0.9939369380670889)

In [47]:
# Mejores hyper-parametros
grid_xgb.best_params_

{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}

In [48]:
xgb_results = xgb_results[['algoritmo', 'rank_test_score', 'params', 'mean_test_score', 'std_test_score', 'tiempo_entrenamiento', 'fecha_hora_fin_entrenamiento']]
print("Resultados de la búsqueda de hiperparámetros para XGBoost:")
xgb_results

Resultados de la búsqueda de hiperparámetros para XGBoost:


Unnamed: 0,algoritmo,rank_test_score,params,mean_test_score,std_test_score,tiempo_entrenamiento,fecha_hora_fin_entrenamiento
17,XGBoost,1,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.993937,0.00075,0 days 00:00:22.984750,2024-06-30 13:21:02.578448
25,XGBoost,2,"{'learning_rate': 0.3, 'max_depth': 7, 'n_esti...",0.9936,0.000865,0 days 00:00:22.984750,2024-06-30 13:21:02.578448
16,XGBoost,3,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.993503,0.000711,0 days 00:00:22.984750,2024-06-30 13:21:02.578448
14,XGBoost,4,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",0.993452,0.000878,0 days 00:00:22.984750,2024-06-30 13:21:02.578448
26,XGBoost,5,"{'learning_rate': 0.3, 'max_depth': 7, 'n_esti...",0.993411,0.000911,0 days 00:00:22.984750,2024-06-30 13:21:02.578448
24,XGBoost,6,"{'learning_rate': 0.3, 'max_depth': 7, 'n_esti...",0.993398,0.000895,0 days 00:00:22.984750,2024-06-30 13:21:02.578448
23,XGBoost,7,"{'learning_rate': 0.3, 'max_depth': 5, 'n_esti...",0.993277,0.000802,0 days 00:00:22.984750,2024-06-30 13:21:02.578448
22,XGBoost,8,"{'learning_rate': 0.3, 'max_depth': 5, 'n_esti...",0.993222,0.000805,0 days 00:00:22.984750,2024-06-30 13:21:02.578448
21,XGBoost,9,"{'learning_rate': 0.3, 'max_depth': 5, 'n_esti...",0.992832,0.00085,0 days 00:00:22.984750,2024-06-30 13:21:02.578448
20,XGBoost,10,"{'learning_rate': 0.3, 'max_depth': 3, 'n_esti...",0.99246,0.00087,0 days 00:00:22.984750,2024-06-30 13:21:02.578448


### Registro de Modelos

In [49]:
# Registro de modelos
xgb_model_register = xgb.XGBClassifier(learning_rate=0.1, max_depth=7, n_estimators=200)
xgb_model_register.fit(X_train_imputed, y_train_encoded)
xgb_predicts = xgb_model_register.predict(X_test_imputed)

roc_auc_score(xgb_predicts, y_test_encoded)

np.float64(0.9576783659440705)

### Modelo Final

In [50]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

modelo_final = xgb.XGBClassifier(learning_rate=0.1, max_depth=7, n_estimators=200)
X_scaled = scaler.transform(X)

modelo_final.fit(X_scaled, y_encoded)

### Predicciones

In [51]:
# Realizar predicciones con el modelo final
predicciones = modelo_final.predict(X_scaled)

# Crear un DataFrame con las predicciones
df_predicciones = pd.DataFrame({
    'id': data_encoded['id'],  # Asumiendo que tienes una columna 'id' en tu DataFrame original
    'Prediccion': predicciones
})

# Si deseas convertir las predicciones a sus etiquetas originales
df_predicciones['Prediccion'] = label_encoder.inverse_transform(df_predicciones['Prediccion'])

# Guardar las predicciones en un archivo CSV
df_predicciones.to_csv('predicciones_xgboost.csv', index=False)

print("Archivo csv con predicciones creado")

Archivo csv con predicciones creado
