In [18]:
import pandas as pd
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from scipy.stats import randint
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE


Importamos los datos:

In [19]:
X_test = pd.read_csv("../data/processed/X_test.csv")
X_train = pd.read_csv("../data/processed/X_train.csv")
y_train = pd.read_csv("../data/processed/y_train.csv")
y_test = pd.read_csv("../data/processed/y_test.csv")

In [20]:
y_train = y_train.squeeze()
y_test = y_test.squeeze()

Primera versión (con parámetros en default):

In [21]:
# Entrenamiento del modelo ----------------------------------------------------------------------------

model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

model.fit(X_train, y_train)

# Evaluación -------------------------------------------------------------------------------------------

y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print("MATRIZ DE CONFUSIÓN:")
print(confusion_matrix(y_test, y_pred))
print("CLASIFICACIÓN:")
print(classification_report(y_test, y_pred))
print("AUC-ROC:", round(roc_auc_score(y_test, y_proba),4))


MATRIZ DE CONFUSIÓN:
[[1536   57]
 [ 209  198]]
CLASIFICACIÓN:
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1593
           1       0.78      0.49      0.60       407

    accuracy                           0.87      2000
   macro avg       0.83      0.73      0.76      2000
weighted avg       0.86      0.87      0.85      2000

AUC-ROC: 0.8591


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Segunda versión (Con balanceo de clases con scale_pos_weight}):

In [22]:
# Segundo intento XGBoost intentando balancear las clases usando scale_pos_weight

from collections import Counter

# Calcular el peso según la proporción de clases
counter = Counter(y_train)
scale = counter[0] / counter[1]
print(scale)

# Entrenamiento del modelo ----------------------------------------------------------------------------

model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale,
    random_state=42,
    eval_metric='logloss'
)

model.fit(X_train, y_train)

# Evaluación -------------------------------------------------------------------------------------------

y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print("MATRIZ DE CONFUSIÓN:")
print(confusion_matrix(y_test, y_pred))
print("CLASIFICACIÓN:")
print(classification_report(y_test, y_pred))
print("AUC-ROC:", round(roc_auc_score(y_test, y_proba),4))

3.9079754601226995
MATRIZ DE CONFUSIÓN:
[[1296  297]
 [ 104  303]]
CLASIFICACIÓN:
              precision    recall  f1-score   support

           0       0.93      0.81      0.87      1593
           1       0.51      0.74      0.60       407

    accuracy                           0.80      2000
   macro avg       0.72      0.78      0.73      2000
weighted avg       0.84      0.80      0.81      2000

AUC-ROC: 0.86


Tercera Versión (Balanceando usando SMOTE):

In [23]:
#Tercer intento usando SMOTE

from imblearn.over_sampling import SMOTE

# Usando SMOTE
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print("Antes del balanceo:", Counter(y_train))
print("Después del balanceo:", Counter(y_train_res))

# Entrenamiento del modelo ----------------------------------------------------------------------------

model.fit(X_train_res, y_train_res)

# Evaluación -------------------------------------------------------------------------------------------

y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print("MATRIZ DE CONFUSIÓN:")
print(confusion_matrix(y_test, y_pred))
print("CLASIFICACIÓN:")
print(classification_report(y_test, y_pred))
print("AUC-ROC:", round(roc_auc_score(y_test, y_proba),4))

Antes del balanceo: Counter({0: 6370, 1: 1630})
Después del balanceo: Counter({1: 6370, 0: 6370})
MATRIZ DE CONFUSIÓN:
[[1099  494]
 [  68  339]]
CLASIFICACIÓN:
              precision    recall  f1-score   support

           0       0.94      0.69      0.80      1593
           1       0.41      0.83      0.55       407

    accuracy                           0.72      2000
   macro avg       0.67      0.76      0.67      2000
weighted avg       0.83      0.72      0.75      2000

AUC-ROC: 0.8617


Cuarta Versión (Cross-Validation): 

In [24]:
xgb = XGBClassifier(
    random_state=42, eval_metric='logloss', scale_pos_weight=scale
)

param_dist_xgb = {
    'n_estimators': randint(200, 600),
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': randint(3, 8),
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
}

# Correr Cross Validation
search_xgb = RandomizedSearchCV(
    xgb, param_distributions=param_dist_xgb, n_iter=30,
    scoring='roc_auc', cv=5, random_state=42, n_jobs=-1, verbose=1
)

search_xgb.fit(X_train_res, y_train_res)

# Imprimir los parámetros con mejores resultados

print("Mejor XGBoost:")
print(search_xgb.best_params_)
print("AUC en CV:", search_xgb.best_score_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Mejor XGBoost:
{'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 430, 'subsample': 0.8}
AUC en CV: 0.964531297363771


In [25]:
# Entrenar el mejor modelo con todos los datos balanceados
best_xgb = search_xgb.best_estimator_
best_xgb.fit(X_train_res, y_train_res)

# Evaluación -------------------------------------------------------------------------------------------
y_pred = best_xgb.predict(X_test)
y_proba = best_xgb.predict_proba(X_test)[:, 1]

print("MATRIZ DE CONFUSIÓN:")
print(confusion_matrix(y_test, y_pred))
print("CLASIFICACIÓN:")
print(classification_report(y_test, y_pred))
print("AUC-ROC:", round(roc_auc_score(y_test, y_proba),4))

MATRIZ DE CONFUSIÓN:
[[1394  199]
 [ 164  243]]
CLASIFICACIÓN:
              precision    recall  f1-score   support

           0       0.89      0.88      0.88      1593
           1       0.55      0.60      0.57       407

    accuracy                           0.82      2000
   macro avg       0.72      0.74      0.73      2000
weighted avg       0.82      0.82      0.82      2000

AUC-ROC: 0.826
