In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
import shap

In [39]:
df = pd.read_parquet("../data/interim/train_final_advanced_features.parquet")

In [40]:
class_0 = df[df['TARGET'] == 0]
class_1 = df[df['TARGET'] == 1]

# Submuestrear la clase mayoritaria
clase_mayoritaria_submuestreada = resample(class_0,
                                            replace=False,
                                            n_samples=len(class_1),
                                            random_state=42)

# Combinar
df = pd.concat([clase_mayoritaria_submuestreada, class_1])

In [41]:
y = df['TARGET']
X = df.drop(columns=['TARGET', 'SK_ID_CURR']) #adde sk_id_curr

In [42]:
cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.select_dtypes(exclude=["object"]).columns

print(f"Categóricas: {len(cat_cols)}")
print(f"Numéricas: {len(num_cols)}")

Categóricas: 12
Numéricas: 204


In [43]:
for col in X.select_dtypes("object"):
    X[col] = X[col].astype("category")

In [44]:
cat_cols = X.select_dtypes(include=["category"]).columns
num_cols = X.select_dtypes(exclude=["category"]).columns

print(f"Categóricas: {len(cat_cols)}")
print(f"Numéricas: {len(num_cols)}")

Categóricas: 12
Numéricas: 204


In [45]:
print(X.isna().sum().sum())  # must be 0
print(X.shape)

0
(49650, 216)


In [46]:
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42)

In [47]:
X_train.shape


(39720, 216)

In [48]:
neg_count = (y_train == 0).sum()
pos_count = (y_train == 1).sum()
optimal_scale = neg_count / pos_count

In [49]:
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.01,
    'n_estimators': 6000,

    'max_depth': 5,
    'min_child_weight': 15,
    'max_leaves': 31,

    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'colsample_bylevel': 0.8,
    'colsample_bynode': 0.85,

    'scale_pos_weight': optimal_scale,

    'reg_alpha': 0.5,
    'reg_lambda': 2.5,
    'gamma': 1.5,
    'max_delta_step': 2,
}


xgb_model = XGBClassifier(**params, n_jobs=-1, random_state=42,  early_stopping_rounds=300, enable_categorical=True) #added enable_categorical

In [50]:
xgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=True)

[0]	validation_0-auc:0.67807
[1]	validation_0-auc:0.69613
[2]	validation_0-auc:0.71825
[3]	validation_0-auc:0.73127
[4]	validation_0-auc:0.73473
[5]	validation_0-auc:0.73484
[6]	validation_0-auc:0.73534
[7]	validation_0-auc:0.73647
[8]	validation_0-auc:0.73671
[9]	validation_0-auc:0.73802
[10]	validation_0-auc:0.73806
[11]	validation_0-auc:0.73975
[12]	validation_0-auc:0.74023
[13]	validation_0-auc:0.74045
[14]	validation_0-auc:0.74006
[15]	validation_0-auc:0.74016
[16]	validation_0-auc:0.74087
[17]	validation_0-auc:0.74053
[18]	validation_0-auc:0.74028
[19]	validation_0-auc:0.74034
[20]	validation_0-auc:0.73997
[21]	validation_0-auc:0.74024
[22]	validation_0-auc:0.74095
[23]	validation_0-auc:0.74074
[24]	validation_0-auc:0.74057
[25]	validation_0-auc:0.74115
[26]	validation_0-auc:0.74085
[27]	validation_0-auc:0.74085
[28]	validation_0-auc:0.74118
[29]	validation_0-auc:0.74101
[30]	validation_0-auc:0.74075
[31]	validation_0-auc:0.74082
[32]	validation_0-auc:0.74116
[33]	validation_0-au

In [51]:

# ==========================================
# CONFIGURACIÓN 1: ULTRA CONSERVADORA
# Mayor generalización, menos overfit
# ==========================================
params_ultra_conservative = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'tree_method': 'hist',
    'device': 'cuda',

    # Learning muy bajo para aprendizaje gradual
    'learning_rate': 0.005,
    'n_estimators': 10000,

    # Árboles muy simples
    'max_depth': 3,
    'min_child_weight': 30,
    'max_leaves': 15,

    # Sampling muy agresivo para diversidad
    'subsample': 0.6,
    'colsample_bytree': 0.6,
    'colsample_bylevel': 0.6,
    'colsample_bynode': 0.6,

    # Balance de clases
    'scale_pos_weight': optimal_scale,

    # Regularización muy fuerte
    'reg_alpha': 2.0,      # L1
    'reg_lambda': 5.0,     # L2
    'gamma': 3.0,          # Min loss reduction
    'max_delta_step': 1,
}

# ==========================================
# CONFIGURACIÓN 2: CONSERVADORA BALANCEADA
# Buen equilibrio generalización/performance
# ==========================================
params_conservative_balanced = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.008,
    'n_estimators': 7500,

    'max_depth': 4,
    'min_child_weight': 20,
    'max_leaves': 20,

    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'colsample_bylevel': 0.75,
    'colsample_bynode': 0.8,

    'scale_pos_weight': optimal_scale,

    'reg_alpha': 1.0,
    'reg_lambda': 3.0,
    'gamma': 2.0,
    'max_delta_step': 1,
}

# ==========================================
# CONFIGURACIÓN 3: MODERADA
# Tu configuración actual mejorada
# ==========================================
params_moderate = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.01,
    'n_estimators': 6000,

    'max_depth': 5,
    'min_child_weight': 15,
    'max_leaves': 31,

    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'colsample_bylevel': 0.8,
    'colsample_bynode': 0.85,

    'scale_pos_weight': optimal_scale,

    'reg_alpha': 0.5,
    'reg_lambda': 2.5,
    'gamma': 1.5,
    'max_delta_step': 2,
}

# ==========================================
# CONFIGURACIÓN 4: AGRESIVA PARA RECALL
# Maximiza detección de defaults
# ==========================================
params_high_recall = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.015,
    'n_estimators': 5000,

    # Árboles más profundos para capturar patrones complejos
    'max_depth': 6,
    'min_child_weight': 8,
    'max_leaves': 50,

    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'colsample_bylevel': 0.85,
    'colsample_bynode': 0.9,

    # Más peso a clase positiva
    'scale_pos_weight': optimal_scale * 1.3,

    # Regularización más suave
    'reg_alpha': 0.3,
    'reg_lambda': 1.5,
    'gamma': 0.5,
    'max_delta_step': 3,
}

# ==========================================
# CONFIGURACIÓN 5: AGRESIVA PARA PRECISION
# Minimiza falsos positivos
# ==========================================
params_high_precision = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.012,
    'n_estimators': 4309,

    'max_depth': 5,
    'min_child_weight': 25,
    'max_leaves': 25,

    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'colsample_bylevel': 0.8,
    'colsample_bynode': 0.85,

    # Menos peso a positivos para ser más selectivo
    'scale_pos_weight': optimal_scale * 0.8,

    'reg_alpha': 0.8,
    'reg_lambda': 2.5,
    'gamma': 2.0,
    'max_delta_step': 1,
}

# ==========================================
# CONFIGURACIÓN 6: DART (Dropout Trees)
# Previene overfit con dropout en árboles
# ==========================================
params_dart = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'tree_method': 'hist',
    'device': 'cuda',
    'booster': 'dart',  # Usa DART en lugar de gbtree

    'learning_rate': 0.01,
    'n_estimators': 6000,

    'max_depth': 5,
    'min_child_weight': 15,

    'subsample': 0.8,
    'colsample_bytree': 0.8,

    'scale_pos_weight': optimal_scale,

    # DART específicos
    'sample_type': 'uniform',
    'normalize_type': 'tree',
    'rate_drop': 0.1,
    'skip_drop': 0.5,

    'reg_alpha': 0.5,
    'reg_lambda': 2.0,
    'gamma': 1.0,
}

# ==========================================
# CONFIGURACIÓN 7: FAST & LIGHT
# Rápido para iteraciones, menor memoria
# ==========================================
params_fast = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'tree_method': 'hist',
    'device': 'cuda',

    # Más rápido pero efectivo
    'learning_rate': 0.02,
    'n_estimators': 3000,

    'max_depth': 4,
    'min_child_weight': 20,
    'max_leaves': 15,
    'max_bin': 128,  # Reduce bins para velocidad

    'subsample': 0.7,
    'colsample_bytree': 0.7,

    'scale_pos_weight': optimal_scale,

    'reg_alpha': 1.0,
    'reg_lambda': 2.0,
    'gamma': 1.5,
}

# ==========================================
# CONFIGURACIÓN 8: DEEP TREES
# Árboles profundos con fuerte regularización
# ==========================================
params_deep = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.008,
    'n_estimators': 7000,

    # Muy profundos
    'max_depth': 8,
    'min_child_weight': 10,
    'max_leaves': 100,

    # Sampling fuerte para compensar profundidad
    'subsample': 0.6,
    'colsample_bytree': 0.6,
    'colsample_bylevel': 0.6,
    'colsample_bynode': 0.6,

    'scale_pos_weight': optimal_scale,

    # Regularización muy fuerte
    'reg_alpha': 3.0,
    'reg_lambda': 5.0,
    'gamma': 4.0,
    'max_delta_step': 1,
}

# ==========================================
# CONFIGURACIÓN 9: MONOTONE CONSTRAINTS
# Fuerza relaciones lógicas en features
# ==========================================
params_monotone = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.01,
    'n_estimators': 6000,

    'max_depth': 5,
    'min_child_weight': 15,

    'subsample': 0.8,
    'colsample_bytree': 0.8,

    'scale_pos_weight': optimal_scale,

    # Monotone constraints (ajustar según tus features)
    # +1: a mayor valor, mayor probabilidad de default
    # -1: a mayor valor, menor probabilidad de default
    # 0: sin restricción
    # Ejemplo para primeras 10 features (ajustar según tu caso)
    'monotone_constraints': '(0,0,0,0,0,1,1,-1,1,0)',  # Ejemplo

    'reg_alpha': 0.5,
    'reg_lambda': 2.0,
    'gamma': 1.0,
}

# ==========================================
# CONFIGURACIÓN 10: INTERACTION CONSTRAINTS
# Limita qué features pueden interactuar
# ==========================================
params_interaction = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.01,
    'n_estimators': 6000,

    'max_depth': 6,
    'min_child_weight': 15,

    'subsample': 0.8,
    'colsample_bytree': 0.8,

    'scale_pos_weight': optimal_scale,

    # Permite interacciones solo dentro de grupos
    # Ejemplo: [[0,1,2], [3,4,5]] significa que features 0,1,2 pueden interactuar entre sí
    # y 3,4,5 entre sí, pero no entre grupos
    # 'interaction_constraints': [[0,1,2,3], [4,5,6,7], [8,9,10]],  # Ajustar a tus features

    'reg_alpha': 0.5,
    'reg_lambda': 2.0,
    'gamma': 1.0,
}

# ==========================================
# CONFIGURACIÓN 11: TWO-STAGE ENSEMBLE
# Entrena dos modelos complementarios
# ==========================================
params_ensemble_stage1 = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'tree_method': 'hist',
    'device': 'cuda',

    # Enfocado en generalización
    'learning_rate': 0.008,
    'n_estimators': 6000,

    'max_depth': 4,
    'min_child_weight': 25,

    'subsample': 0.7,
    'colsample_bytree': 0.7,

    'scale_pos_weight': optimal_scale,

    'reg_alpha': 1.5,
    'reg_lambda': 3.0,
    'gamma': 2.0,
}

params_ensemble_stage2 = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'tree_method': 'hist',
    'device': 'cuda',

    # Enfocado en capturar residuos
    'learning_rate': 0.012,
    'n_estimators': 4000,

    'max_depth': 6,
    'min_child_weight': 10,

    'subsample': 0.8,
    'colsample_bytree': 0.8,

    'scale_pos_weight': optimal_scale * 1.2,

    'reg_alpha': 0.3,
    'reg_lambda': 1.5,
    'gamma': 0.8,
}

# =========================================
# BASADO EN EL MEJOR AUC 5
# =========================================

params_best_auc = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.010,  # Ligeramente menor que 0.012
    'n_estimators': 5000,

    'max_depth': 5,
    'min_child_weight': 22,  # Entre 20 y 25
    'max_leaves': 28,

    'subsample': 0.75,
    'colsample_bytree': 0.76,
    'colsample_bylevel': 0.80,
    'colsample_bynode': 0.85,

    'scale_pos_weight': optimal_scale * 0.85,  # Entre 0.8 y 1.0

    'reg_alpha': 0.7,
    'reg_lambda': 2.5,
    'gamma': 1.8,
    'max_delta_step': 1,
}

# ==========================================
# RESUMEN DE CONFIGURACIONES
# ==========================================
configs = {
    '1_ultra_conservative': params_ultra_conservative,
    '2_conservative_balanced': params_conservative_balanced,
    '3_moderate': params_moderate,
    '4_high_recall': params_high_recall,
    '5_high_precision': params_high_precision,
    #'6_dart': params_dart,         #Tarda como 3 horas en entrenar con gpu
    '7_fast': params_fast,
    '8_deep': params_deep,
    '9_monotone': params_monotone,
    '10_interaction': params_interaction,
    '11_ensemble_s1': params_ensemble_stage1,
    '11_ensemble_s2': params_ensemble_stage2,
    '12_best_auc': params_best_auc,
}

# ==========================================
# FUNCIÓN PARA ENTRENAR Y COMPARAR
# ==========================================
def train_and_evaluate(config_name, params, X_train, y_train, X_val, y_val):
    """
    Entrena un modelo con la configuración dada y evalúa
    """
    print(f"\n{'='*70}")
    print(f"Entrenando: {config_name}")
    print(f"{'='*70}")

    # Preparar parámetros
    train_params = params.copy()

    # Extraer n_estimators y early_stopping_rounds
    n_estimators = train_params.pop('n_estimators', 5000)
    early_stopping = min(500, n_estimators // 10)

    # Crear modelo
    model = XGBClassifier(
        **train_params,
        n_estimators=n_estimators,
        n_jobs=-1,
        random_state=42,
        enable_categorical=True, #added here too
        early_stopping_rounds=early_stopping
    )

    # Entrenar
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=200
    )

    # Evaluar
    from sklearn.metrics import roc_auc_score, precision_score, recall_score, confusion_matrix

    y_proba = model.predict_proba(X_val)[:, 1]
    y_pred = (y_proba >= 0.5).astype(int)

    auc = roc_auc_score(y_val, y_proba)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)

    tn, fp, fn, tp = confusion_matrix(y_val, y_pred).ravel()
    fpr = fp / (fp + tn)

    print(f"\n{'='*70}")
    print(f"RESULTADOS: {config_name}")
    print(f"{'='*70}")
    print(f"AUC:       {auc:.5f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"FPR:       {fpr:.4f}")
    print(f"Best iteration: {model.best_iteration}")

    return {
        'config': config_name,
        'model': model,
        'auc': auc,
        'precision': precision,
        'recall': recall,
        'fpr': fpr,
        'best_iteration': model.best_iteration
    }

In [52]:
results = {}

for i, j in configs.items():
    results[i] = train_and_evaluate(i, j,  X_train, y_train, X_val, y_val)


Entrenando: 1_ultra_conservative
[0]	validation_0-auc:0.61589
[200]	validation_0-auc:0.73983
[400]	validation_0-auc:0.74802
[600]	validation_0-auc:0.75349
[800]	validation_0-auc:0.75730
[1000]	validation_0-auc:0.76038
[1200]	validation_0-auc:0.76286
[1400]	validation_0-auc:0.76510
[1600]	validation_0-auc:0.76687
[1800]	validation_0-auc:0.76839
[2000]	validation_0-auc:0.76979
[2200]	validation_0-auc:0.77090
[2400]	validation_0-auc:0.77200
[2600]	validation_0-auc:0.77294
[2800]	validation_0-auc:0.77379
[3000]	validation_0-auc:0.77452
[3200]	validation_0-auc:0.77523
[3400]	validation_0-auc:0.77590
[3600]	validation_0-auc:0.77647
[3800]	validation_0-auc:0.77694
[4000]	validation_0-auc:0.77748
[4200]	validation_0-auc:0.77794
[4400]	validation_0-auc:0.77828
[4600]	validation_0-auc:0.77870
[4800]	validation_0-auc:0.77897
[5000]	validation_0-auc:0.77936
[5200]	validation_0-auc:0.77963
[5400]	validation_0-auc:0.77989
[5600]	validation_0-auc:0.78014
[5800]	validation_0-auc:0.78042
[6000]	valida

In [53]:
for key, d in results.items():
    print(key)
    print({k: d[k] for k in ['auc', 'precision', 'recall', 'fpr', 'best_iteration']})

1_ultra_conservative
{'auc': 0.7833441542965918, 'precision': 0.7160024033647105, 'recall': 0.7200402819738168, 'fpr': 0.28559919436052367, 'best_iteration': 9999}
2_conservative_balanced
{'auc': 0.7850748593629728, 'precision': 0.7196450181524808, 'recall': 0.7186304128902317, 'fpr': 0.2799597180261833, 'best_iteration': 6939}
3_moderate
{'auc': 0.7847285682557357, 'precision': 0.717753259779338, 'recall': 0.7206445115810675, 'fpr': 0.28338368580060425, 'best_iteration': 2994}
4_high_recall
{'auc': 0.7831524802519956, 'precision': 0.6884315117104329, 'recall': 0.7814702920443102, 'fpr': 0.3536757301107754, 'best_iteration': 1658}
5_high_precision
{'auc': 0.7843344499107042, 'precision': 0.7437902483900644, 'recall': 0.6513595166163142, 'fpr': 0.22437059415911378, 'best_iteration': 2554}
7_fast
{'auc': 0.7839690116819752, 'precision': 0.7185051235684147, 'recall': 0.7202416918429003, 'fpr': 0.2821752265861027, 'best_iteration': 2230}
8_deep
{'auc': 0.7843230103169314, 'precision': 0.71

In [54]:
# ==========================================
# MICRO-OPTIMIZACIÓN DE 5_HIGH_PRECISION
# Exploración fina alrededor del mejor modelo
# ==========================================

# BASE DE REFERENCIA (5_high_precision - AUC: 0.78817)
# 'learning_rate': 0.012
# 'scale_pos_weight': optimal_scale * 0.80
# 'min_child_weight': 25
# 'max_leaves': 25
# 'reg_alpha': 0.8, 'reg_lambda': 2.5, 'gamma': 2.0

# ==========================================
# GRUPO 1: AJUSTE DE SCALE_POS_WEIGHT
# El hiperparámetro MÁS crítico según resultados
# ==========================================

# Config 13: Scale más bajo (más conservador)
params_13_scale_low = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.012,
    'n_estimators': 6000,

    'max_depth': 5,
    'min_child_weight': 25,
    'max_leaves': 25,

    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'colsample_bylevel': 0.80,
    'colsample_bynode': 0.85,

    'scale_pos_weight': optimal_scale * 0.76,  # Entre 0.76-0.80

    'reg_alpha': 0.8,
    'reg_lambda': 2.5,
    'gamma': 2.0,
    'max_delta_step': 1,
}

# Config 14: Scale ligeramente más alto
params_14_scale_med = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.012,
    'n_estimators': 6000,

    'max_depth': 5,
    'min_child_weight': 25,
    'max_leaves': 25,

    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'colsample_bylevel': 0.80,
    'colsample_bynode': 0.85,

    'scale_pos_weight': optimal_scale * 0.82,  # Entre 0.80-0.85

    'reg_alpha': 0.8,
    'reg_lambda': 2.5,
    'gamma': 2.0,
    'max_delta_step': 1,
}

# ==========================================
# GRUPO 2: AJUSTE DE MIN_CHILD_WEIGHT
# Segundo parámetro más sensible
# ==========================================

# Config 15: Más restrictivo (menos overfit)
params_15_mcw_high = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.012,
    'n_estimators': 6000,

    'max_depth': 5,
    'min_child_weight': 28,  # Más alto que 25
    'max_leaves': 25,

    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'colsample_bylevel': 0.80,
    'colsample_bynode': 0.85,

    'scale_pos_weight': optimal_scale * 0.80,

    'reg_alpha': 0.8,
    'reg_lambda': 2.5,
    'gamma': 2.0,
    'max_delta_step': 1,
}

# Config 16: Menos restrictivo (más capacidad)
params_16_mcw_low = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.012,
    'n_estimators': 6000,

    'max_depth': 5,
    'min_child_weight': 22,  # Más bajo que 25
    'max_leaves': 25,

    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'colsample_bylevel': 0.80,
    'colsample_bynode': 0.85,

    'scale_pos_weight': optimal_scale * 0.80,

    'reg_alpha': 0.8,
    'reg_lambda': 2.5,
    'gamma': 2.0,
    'max_delta_step': 1,
}

# ==========================================
# GRUPO 3: AJUSTE DE REGULARIZACIÓN (L1, L2, GAMMA)
# Balance entre los 3 tipos de regularización
# ==========================================

# Config 17: Regularización más suave
params_17_reg_soft = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.012,
    'n_estimators': 6000,

    'max_depth': 5,
    'min_child_weight': 25,
    'max_leaves': 25,

    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'colsample_bylevel': 0.80,
    'colsample_bynode': 0.85,

    'scale_pos_weight': optimal_scale * 0.80,

    'reg_alpha': 0.6,   # Menos L1
    'reg_lambda': 2.2,  # Menos L2
    'gamma': 1.7,       # Menos gamma
    'max_delta_step': 1,
}

# Config 18: Regularización más fuerte
params_18_reg_strong = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.012,
    'n_estimators': 6000,

    'max_depth': 5,
    'min_child_weight': 25,
    'max_leaves': 25,

    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'colsample_bylevel': 0.80,
    'colsample_bynode': 0.85,

    'scale_pos_weight': optimal_scale * 0.80,

    'reg_alpha': 1.0,   # Más L1
    'reg_lambda': 2.8,  # Más L2
    'gamma': 2.3,       # Más gamma
    'max_delta_step': 1,
}

# Config 19: Balance L1 vs L2 diferente
params_19_reg_balance = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.012,
    'n_estimators': 6000,

    'max_depth': 5,
    'min_child_weight': 25,
    'max_leaves': 25,

    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'colsample_bylevel': 0.80,
    'colsample_bynode': 0.85,

    'scale_pos_weight': optimal_scale * 0.80,

    'reg_alpha': 1.0,   # Más L1 (feature selection)
    'reg_lambda': 2.0,  # Menos L2 (suavidad)
    'gamma': 2.2,
    'max_delta_step': 1,
}

# ==========================================
# GRUPO 4: LEARNING RATE + MAX_LEAVES
# Combinaciones ajustadas
# ==========================================

# Config 20: Learning rate más lento, más árboles simples
params_20_slow_simple = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.010,  # Más lento
    'n_estimators': 7200,    # Más iteraciones

    'max_depth': 5,
    'min_child_weight': 25,
    'max_leaves': 22,  # Más simple

    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'colsample_bylevel': 0.80,
    'colsample_bynode': 0.85,

    'scale_pos_weight': optimal_scale * 0.80,

    'reg_alpha': 0.8,
    'reg_lambda': 2.5,
    'gamma': 2.0,
    'max_delta_step': 1,
}

# Config 21: Learning rate más rápido, árboles ligeramente más complejos
params_21_fast_complex = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.013,  # Más rápido
    'n_estimators': 5500,    # Menos iteraciones

    'max_depth': 5,
    'min_child_weight': 25,
    'max_leaves': 28,  # Más complejo

    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'colsample_bylevel': 0.80,
    'colsample_bynode': 0.85,

    'scale_pos_weight': optimal_scale * 0.80,

    'reg_alpha': 0.8,
    'reg_lambda': 2.5,
    'gamma': 2.0,
    'max_delta_step': 1,
}

# ==========================================
# GRUPO 5: SAMPLING VARIATIONS
# Ajustes en subsample y colsample_*
# ==========================================

# Config 22: Más sampling (más diversidad)
params_22_high_sample = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.012,
    'n_estimators': 6000,

    'max_depth': 5,
    'min_child_weight': 25,
    'max_leaves': 25,

    'subsample': 0.78,           # Más alto
    'colsample_bytree': 0.78,    # Más alto
    'colsample_bylevel': 0.82,   # Más alto
    'colsample_bynode': 0.87,    # Más alto

    'scale_pos_weight': optimal_scale * 0.80,

    'reg_alpha': 0.8,
    'reg_lambda': 2.5,
    'gamma': 2.0,
    'max_delta_step': 1,
}

# Config 23: Menos sampling (más conservador)
params_23_low_sample = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.012,
    'n_estimators': 6000,

    'max_depth': 5,
    'min_child_weight': 25,
    'max_leaves': 25,

    'subsample': 0.72,           # Más bajo
    'colsample_bytree': 0.72,    # Más bajo
    'colsample_bylevel': 0.78,   # Más bajo
    'colsample_bynode': 0.83,    # Más bajo

    'scale_pos_weight': optimal_scale * 0.80,

    'reg_alpha': 0.8,
    'reg_lambda': 2.5,
    'gamma': 2.0,
    'max_delta_step': 1,
}

# ==========================================
# GRUPO 6: COMBINACIONES ÓPTIMAS
# Mejores combinaciones basadas en intuición
# ==========================================

# Config 24: Combinación conservadora óptima
params_24_ultra_optimal = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.011,
    'n_estimators': 6500,

    'max_depth': 5,
    'min_child_weight': 26,  # Ligeramente más restrictivo
    'max_leaves': 24,        # Ligeramente más simple

    'subsample': 0.76,
    'colsample_bytree': 0.76,
    'colsample_bylevel': 0.81,
    'colsample_bynode': 0.86,

    'scale_pos_weight': optimal_scale * 0.78,  # Entre 0.76 y 0.80

    'reg_alpha': 0.85,  # Entre 0.8 y 0.9
    'reg_lambda': 2.6,  # Entre 2.5 y 2.7
    'gamma': 2.1,       # Entre 2.0 y 2.2
    'max_delta_step': 1,
}

# Config 25: Combinación agresiva óptima
params_25_balanced_optimal = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.0125,  # Entre 0.012 y 0.013
    'n_estimators': 5800,

    'max_depth': 5,
    'min_child_weight': 24,  # Ligeramente menos restrictivo
    'max_leaves': 26,        # Ligeramente más complejo

    'subsample': 0.77,
    'colsample_bytree': 0.77,
    'colsample_bylevel': 0.81,
    'colsample_bynode': 0.86,

    'scale_pos_weight': optimal_scale * 0.81,  # Entre 0.80 y 0.82

    'reg_alpha': 0.75,  # Menos L1
    'reg_lambda': 2.4,  # Menos L2
    'gamma': 1.9,       # Menos gamma
    'max_delta_step': 1,
}

# ==========================================
# DICCIONARIO DE TODAS LAS CONFIGS
# ==========================================
all_fine_tuned_configs = {
    '13_scale_low': params_13_scale_low,
    '14_scale_med': params_14_scale_med,
    '15_mcw_high': params_15_mcw_high,
    '16_mcw_low': params_16_mcw_low,
    '17_reg_soft': params_17_reg_soft,
    '18_reg_strong': params_18_reg_strong,
    '19_reg_balance': params_19_reg_balance,
    '20_slow_simple': params_20_slow_simple,
    '21_fast_complex': params_21_fast_complex,
    '22_high_sample': params_22_high_sample,
    '23_low_sample': params_23_low_sample,
    '24_ultra_optimal': params_24_ultra_optimal,
    '25_balanced_optimal': params_25_balanced_optimal,
}



In [55]:
results_tuned = {}

for i, j in all_fine_tuned_configs.items():
    results_tuned[i] = train_and_evaluate(i, j,  X_train, y_train, X_val, y_val)


Entrenando: 13_scale_low
[0]	validation_0-auc:0.67899
[200]	validation_0-auc:0.75936
[400]	validation_0-auc:0.76959
[600]	validation_0-auc:0.77460
[800]	validation_0-auc:0.77764
[1000]	validation_0-auc:0.77981
[1200]	validation_0-auc:0.78132
[1400]	validation_0-auc:0.78227
[1600]	validation_0-auc:0.78294
[1800]	validation_0-auc:0.78345
[2000]	validation_0-auc:0.78388
[2200]	validation_0-auc:0.78414
[2400]	validation_0-auc:0.78446
[2600]	validation_0-auc:0.78466
[2800]	validation_0-auc:0.78468
[3000]	validation_0-auc:0.78471
[3200]	validation_0-auc:0.78471
[3400]	validation_0-auc:0.78449
[3544]	validation_0-auc:0.78449

RESULTADOS: 13_scale_low
AUC:       0.78476
Precision: 0.7501
Recall:    0.6377
FPR:       0.2125
Best iteration: 3044

Entrenando: 14_scale_med
[0]	validation_0-auc:0.67795
[200]	validation_0-auc:0.75962
[400]	validation_0-auc:0.76994
[600]	validation_0-auc:0.77498
[800]	validation_0-auc:0.77785
[1000]	validation_0-auc:0.78001
[1200]	validation_0-auc:0.78145
[1400]	val

In [56]:
for i, j in results_tuned.items():
    print(i)
    print({k: j[k] for k in ['auc', 'precision', 'recall', 'fpr', 'best_iteration']})

13_scale_low
{'auc': 0.7847627044903448, 'precision': 0.750059227671168, 'recall': 0.6376636455186304, 'fpr': 0.21248741188318226, 'best_iteration': 3044}
14_scale_med
{'auc': 0.7850272349548553, 'precision': 0.7379667116509222, 'recall': 0.6608257804632427, 'fpr': 0.23464249748237664, 'best_iteration': 3110}
15_mcw_high
{'auc': 0.7852069420485188, 'precision': 0.7444038373686614, 'recall': 0.6563947633434039, 'fpr': 0.2253776435045317, 'best_iteration': 3595}
16_mcw_low
{'auc': 0.7846859740236033, 'precision': 0.7411791486455725, 'recall': 0.655790533736153, 'fpr': 0.22900302114803625, 'best_iteration': 3322}
17_reg_soft
{'auc': 0.7849658384116814, 'precision': 0.7439890084726357, 'recall': 0.654380664652568, 'fpr': 0.22517623363544814, 'best_iteration': 2826}
18_reg_strong
{'auc': 0.7849289842594029, 'precision': 0.7437013284470911, 'recall': 0.6539778449144008, 'fpr': 0.2253776435045317, 'best_iteration': 2829}
19_reg_balance
{'auc': 0.7850286750455606, 'precision': 0.74216426447037

In [57]:
xgb_model = XGBClassifier(**params_best_auc, n_jobs=-1, random_state=42, enable_categorical=True)
xgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=True)

[0]	validation_0-auc:0.67480
[1]	validation_0-auc:0.69248
[2]	validation_0-auc:0.71820
[3]	validation_0-auc:0.73120
[4]	validation_0-auc:0.73268
[5]	validation_0-auc:0.73310
[6]	validation_0-auc:0.73299
[7]	validation_0-auc:0.73395
[8]	validation_0-auc:0.73417
[9]	validation_0-auc:0.73608
[10]	validation_0-auc:0.73687
[11]	validation_0-auc:0.73837
[12]	validation_0-auc:0.73836
[13]	validation_0-auc:0.73798
[14]	validation_0-auc:0.73767
[15]	validation_0-auc:0.73808
[16]	validation_0-auc:0.73877
[17]	validation_0-auc:0.73847
[18]	validation_0-auc:0.73837
[19]	validation_0-auc:0.73915
[20]	validation_0-auc:0.73897
[21]	validation_0-auc:0.73950
[22]	validation_0-auc:0.73925
[23]	validation_0-auc:0.73904
[24]	validation_0-auc:0.73929
[25]	validation_0-auc:0.74002
[26]	validation_0-auc:0.74070
[27]	validation_0-auc:0.74093
[28]	validation_0-auc:0.74074
[29]	validation_0-auc:0.74058
[30]	validation_0-auc:0.74072
[31]	validation_0-auc:0.74055
[32]	validation_0-auc:0.74133
[33]	validation_0-au

In [58]:
#optimal f1 threshold
from sklearn.metrics import f1_score
import numpy as np

y_val_proba = xgb_model.predict_proba(X_val)[:, 1]

thresholds = np.linspace(0.05, 0.6, 100)
f1_scores = [f1_score(y_val, y_val_proba >= t) for t in thresholds]

optimal_threshold = thresholds[np.argmax(f1_scores)]

print(f"Optimal Threshold (F1): {optimal_threshold:.3f}")

Optimal Threshold (F1): 0.367


In [59]:
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, roc_curve

y_val_pred = xgb_model.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, y_val_pred)

adj_threshold = optimal_threshold

y_val_pred = (y_val_pred >= adj_threshold).astype(int)

report = classification_report(y_val, y_val_pred)

#Confusion matrix manual
TP = np.sum((y_val == 1) & (y_val_pred == 1))
FP = np.sum((y_val == 0) & (y_val_pred == 1))
TN = np.sum((y_val == 0) & (y_val_pred == 0))
FN = np.sum((y_val == 1) & (y_val_pred == 0))

FPR = FP / (FP + TN)
Precision = TP / (TP + FP)

print(f"FPR: {FPR:.4f}")
print(f"Precision (PPV): {Precision:.4f}")
print(f"AUC validación: {auc:.5f}")
print(report)

FPR: 0.4169
Precision (PPV): 0.6640
AUC validación: 0.78419
              precision    recall  f1-score   support

           0       0.77      0.58      0.66      4965
           1       0.66      0.82      0.74      4965

    accuracy                           0.70      9930
   macro avg       0.72      0.70      0.70      9930
weighted avg       0.72      0.70      0.70      9930



In [60]:
# ==========================================
# CONFIGURACIONES XGBOOST PARA 363 FEATURES
# Optimizadas para dataset con feature engineering avanzado
# ==========================================

# IMPORTANTE: Con 363 features (vs ~120 originales), necesitas:
# 1. Más regularización (para evitar overfit con tantas features)
# 2. Más colsample (sampling de features más agresivo)
# 3. Árboles potencialmente más profundos (para capturar interacciones)

# ==========================================
# CONFIG 26: HIGH PRECISION CON MUCHAS FEATURES
# Adaptación del mejor modelo (5_high_precision) para 363 features
# ==========================================
params_26_hp_many_features = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.012,
    'n_estimators': 6000,

    'max_depth': 6,  # +1 vs original (más features = más profundidad OK)
    'min_child_weight': 30,  # +5 (más conservador con muchas features)
    'max_leaves': 30,  # +5

    # ⚡ CRÍTICO: Sampling más agresivo con 363 features
    'subsample': 0.70,  # -0.05
    'colsample_bytree': 0.65,  # -0.10 (muy importante!)
    'colsample_bylevel': 0.70,  # -0.10
    'colsample_bynode': 0.75,  # -0.10

    'scale_pos_weight': optimal_scale * 0.80,

    # Regularización más fuerte
    'reg_alpha': 1.2,   # +0.4
    'reg_lambda': 3.5,  # +1.0
    'gamma': 2.5,       # +0.5
    'max_delta_step': 1,
}

# ==========================================
# CONFIG 27: ULTRA REGULARIZADO PARA MUCHAS FEATURES
# Previene overfit agresivamente
# ==========================================
params_27_ultra_reg = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.010,
    'n_estimators': 7000,

    'max_depth': 5,
    'min_child_weight': 35,
    'max_leaves': 25,

    # Feature sampling MUY agresivo
    'subsample': 0.65,
    'colsample_bytree': 0.60,
    'colsample_bylevel': 0.65,
    'colsample_bynode': 0.70,

    'scale_pos_weight': optimal_scale * 0.80,

    # Regularización extrema
    'reg_alpha': 2.0,
    'reg_lambda': 5.0,
    'gamma': 3.0,
    'max_delta_step': 1,
}

# ==========================================
# CONFIG 28: DEEP TREES CON FEATURE SELECTION
# Árboles profundos + sampling fuerte = explora interacciones
# ==========================================
params_28_deep_selection = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.008,
    'n_estimators': 8000,

    # Profundo para capturar interacciones complejas
    'max_depth': 7,
    'min_child_weight': 25,
    'max_leaves': 50,

    # Sampling agresivo compensa profundidad
    'subsample': 0.65,
    'colsample_bytree': 0.55,  # Muy bajo: cada árbol ve ~200 features
    'colsample_bylevel': 0.60,
    'colsample_bynode': 0.65,

    'scale_pos_weight': optimal_scale * 0.80,

    'reg_alpha': 2.5,  # L1 alto para feature selection
    'reg_lambda': 4.0,
    'gamma': 3.5,
    'max_delta_step': 1,
}

# ==========================================
# CONFIG 29: BALANCED DIVERSITY
# Balance entre exploración y explotación
# ==========================================
params_29_balanced_diverse = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.011,
    'n_estimators': 6500,

    'max_depth': 6,
    'min_child_weight': 28,
    'max_leaves': 35,

    'subsample': 0.68,
    'colsample_bytree': 0.62,
    'colsample_bylevel': 0.68,
    'colsample_bynode': 0.73,

    'scale_pos_weight': optimal_scale * 0.80,

    'reg_alpha': 1.5,
    'reg_lambda': 3.8,
    'gamma': 2.7,
    'max_delta_step': 1,
}

# ==========================================
# CONFIG 30: FEATURE SAMPLING EXTREMO
# Cada árbol ve solo ~30% de features (ensemble diverso)
# ==========================================
params_30_extreme_sampling = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.013,
    'n_estimators': 6000,

    'max_depth': 5,
    'min_child_weight': 25,
    'max_leaves': 28,

    # SAMPLING EXTREMO
    'subsample': 0.60,
    'colsample_bytree': 0.50,  # Solo 180 features por árbol
    'colsample_bylevel': 0.55,
    'colsample_bynode': 0.60,

    'scale_pos_weight': optimal_scale * 0.80,

    'reg_alpha': 1.8,
    'reg_lambda': 3.0,
    'gamma': 2.2,
    'max_delta_step': 1,
}

# ==========================================
# CONFIG 31: L1 DOMINANT (Feature Selection)
# Alta regularización L1 para selección automática de features
# ==========================================
params_31_l1_dominant = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.012,
    'n_estimators': 6000,

    'max_depth': 6,
    'min_child_weight': 30,
    'max_leaves': 32,

    'subsample': 0.70,
    'colsample_bytree': 0.65,
    'colsample_bylevel': 0.70,
    'colsample_bynode': 0.75,

    'scale_pos_weight': optimal_scale * 0.80,

    # L1 muy alto para feature selection
    'reg_alpha': 3.0,  # ⚡ Muy alto
    'reg_lambda': 2.0,  # L2 más bajo
    'gamma': 2.0,
    'max_delta_step': 1,
}

# ==========================================
# CONFIG 32: ADAPTIVE LEARNING
# Learning rate más alto con más regularización
# ==========================================
params_32_adaptive = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.015,  # Más rápido
    'n_estimators': 5000,

    'max_depth': 5,
    'min_child_weight': 32,
    'max_leaves': 27,

    'subsample': 0.68,
    'colsample_bytree': 0.63,
    'colsample_bylevel': 0.68,
    'colsample_bynode': 0.73,

    'scale_pos_weight': optimal_scale * 0.80,

    'reg_alpha': 1.3,
    'reg_lambda': 3.3,
    'gamma': 2.5,
    'max_delta_step': 1,
}

# ==========================================
# CONFIG 33: MODERATE DEPTH + HIGH REG
# Balance clásico pero adaptado
# ==========================================
params_33_moderate_highreg = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.011,
    'n_estimators': 6500,

    'max_depth': 6,
    'min_child_weight': 27,
    'max_leaves': 33,

    'subsample': 0.72,
    'colsample_bytree': 0.67,
    'colsample_bylevel': 0.72,
    'colsample_bynode': 0.77,

    'scale_pos_weight': optimal_scale * 0.80,

    'reg_alpha': 1.4,
    'reg_lambda': 3.6,
    'gamma': 2.6,
    'max_delta_step': 1,
}

# ==========================================
# CONFIG 34: CONSERVATIVE MANY FEATURES
# Muy conservador específicamente para 363 features
# ==========================================
params_34_conservative_363 = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.009,
    'n_estimators': 7500,

    'max_depth': 4,  # Shallow
    'min_child_weight': 40,  # Muy restrictivo
    'max_leaves': 20,

    'subsample': 0.65,
    'colsample_bytree': 0.58,
    'colsample_bylevel': 0.63,
    'colsample_bynode': 0.68,

    'scale_pos_weight': optimal_scale * 0.80,

    'reg_alpha': 2.2,
    'reg_lambda': 4.5,
    'gamma': 3.2,
    'max_delta_step': 1,
}

# ==========================================
# CONFIG 35: FOCUS ON NEW FEATURES
# Optimizado para aprovechar las nuevas features avanzadas
# ==========================================
params_35_new_features_focus = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',

    'learning_rate': 0.012,
    'n_estimators': 6000,

    'max_depth': 6,
    'min_child_weight': 26,
    'max_leaves': 35,

    # Permite más exploración de features
    'subsample': 0.72,
    'colsample_bytree': 0.68,
    'colsample_bylevel': 0.73,
    'colsample_bynode': 0.78,

    'scale_pos_weight': optimal_scale * 0.78,  # Ligeramente ajustado

    'reg_alpha': 1.3,
    'reg_lambda': 3.2,
    'gamma': 2.3,
    'max_delta_step': 1,
}

# ==========================================
# DICCIONARIO COMPLETO
# ==========================================
all_advanced_configs = {
    '26_hp_many_features': params_26_hp_many_features,
    '27_ultra_reg': params_27_ultra_reg,
    '28_deep_selection': params_28_deep_selection,
    '29_balanced_diverse': params_29_balanced_diverse,
    '30_extreme_sampling': params_30_extreme_sampling,
    '31_l1_dominant': params_31_l1_dominant,
    '32_adaptive': params_32_adaptive,
    '33_moderate_highreg': params_33_moderate_highreg,
    '34_conservative_363': params_34_conservative_363,
    '35_new_features_focus': params_35_new_features_focus,
}


In [61]:
results_advanced = {}

for i, j in all_advanced_configs.items():
    results_advanced[i] = train_and_evaluate(i, j,  X_train, y_train, X_val, y_val)


Entrenando: 26_hp_many_features
[0]	validation_0-auc:0.67999
[200]	validation_0-auc:0.76147
[400]	validation_0-auc:0.77052
[600]	validation_0-auc:0.77575
[800]	validation_0-auc:0.77909
[1000]	validation_0-auc:0.78116
[1200]	validation_0-auc:0.78227
[1400]	validation_0-auc:0.78313
[1600]	validation_0-auc:0.78390
[1800]	validation_0-auc:0.78418
[2000]	validation_0-auc:0.78448
[2200]	validation_0-auc:0.78462
[2400]	validation_0-auc:0.78467
[2600]	validation_0-auc:0.78450
[2797]	validation_0-auc:0.78444

RESULTADOS: 26_hp_many_features
AUC:       0.78471
Precision: 0.7419
Recall:    0.6536
FPR:       0.2274
Best iteration: 2297

Entrenando: 27_ultra_reg
[0]	validation_0-auc:0.66247
[200]	validation_0-auc:0.75955
[400]	validation_0-auc:0.76705
[600]	validation_0-auc:0.77188
[800]	validation_0-auc:0.77537
[1000]	validation_0-auc:0.77766
[1200]	validation_0-auc:0.77921
[1400]	validation_0-auc:0.78068
[1600]	validation_0-auc:0.78154
[1800]	validation_0-auc:0.78227
[2000]	validation_0-auc:0.78

In [62]:
for i, j in results_advanced.items():
    print(i)
    print({k: j[k] for k in ['auc', 'precision', 'recall', 'fpr', 'best_iteration']})

26_hp_many_features
{'auc': 0.7847060338786409, 'precision': 0.7418838591678097, 'recall': 0.6535750251762337, 'fpr': 0.22739174219536756, 'best_iteration': 2297}
27_ultra_reg
{'auc': 0.7849868312832324, 'precision': 0.7422609493235497, 'recall': 0.6519637462235649, 'fpr': 0.22638469284994964, 'best_iteration': 4293}
28_deep_selection
{'auc': 0.7846739056578323, 'precision': 0.7428899082568807, 'recall': 0.6523665659617321, 'fpr': 0.2257804632426989, 'best_iteration': 5839}
29_balanced_diverse
{'auc': 0.7849700978348946, 'precision': 0.7462103812586127, 'recall': 0.654380664652568, 'fpr': 0.22255790533736153, 'best_iteration': 2769}
30_extreme_sampling
{'auc': 0.784541214483256, 'precision': 0.7442765567765568, 'recall': 0.6547834843907352, 'fpr': 0.22497482376636455, 'best_iteration': 3051}
31_l1_dominant
{'auc': 0.7854069118268971, 'precision': 0.740036231884058, 'recall': 0.6582074521651561, 'fpr': 0.2312185297079557, 'best_iteration': 3165}
32_adaptive
{'auc': 0.7843517107162017, '

In [63]:
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np

cat_features = [
    col for col in X.columns
    if X[col].dtype == 'category'
]

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
aucs = []

for fold, (tr, va) in enumerate(skf.split(X, y), 1):

    model = CatBoostClassifier(
        iterations=4000,
        learning_rate=0.03,
        depth=6,
        l2_leaf_reg=3,
        loss_function='Logloss',
        eval_metric='AUC',
        random_seed=42,
        early_stopping_rounds=300,
        verbose=200,
    )

    model.fit(
        X.iloc[tr], y.iloc[tr],
        eval_set=(X.iloc[va], y.iloc[va]),
        cat_features=cat_features
    )

    preds = model.predict_proba(X.iloc[va])[:, 1]
    auc = roc_auc_score(y.iloc[va], preds)
    aucs.append(auc)

    print(f'Fold {fold} AUC: {auc:.5f}')

print(f'\nAUC medio: {np.mean(aucs):.5f} ± {np.std(aucs):.5f}')



0:	test: 0.7005340	best: 0.7005340 (0)	total: 130ms	remaining: 8m 38s
200:	test: 0.7632579	best: 0.7632579 (200)	total: 19.8s	remaining: 6m 14s
400:	test: 0.7710791	best: 0.7710791 (400)	total: 39.9s	remaining: 5m 58s
600:	test: 0.7757237	best: 0.7757503 (598)	total: 1m 5s	remaining: 6m 9s
800:	test: 0.7777791	best: 0.7777791 (800)	total: 1m 25s	remaining: 5m 42s
1000:	test: 0.7790533	best: 0.7790533 (1000)	total: 1m 47s	remaining: 5m 20s
1200:	test: 0.7796677	best: 0.7797708 (1180)	total: 2m 5s	remaining: 4m 53s
1400:	test: 0.7800769	best: 0.7801706 (1392)	total: 2m 24s	remaining: 4m 28s
1600:	test: 0.7803644	best: 0.7804940 (1569)	total: 2m 44s	remaining: 4m 7s
1800:	test: 0.7805668	best: 0.7805668 (1800)	total: 3m 8s	remaining: 3m 50s
2000:	test: 0.7806668	best: 0.7806756 (1997)	total: 3m 31s	remaining: 3m 30s
2200:	test: 0.7806079	best: 0.7808560 (2104)	total: 3m 49s	remaining: 3m 7s
2400:	test: 0.7806564	best: 0.7808560 (2104)	total: 4m 11s	remaining: 2m 47s
Stopped by overfitting

In [64]:
print(aucs)

[0.7808560020850891, 0.7800159221296304, 0.7742958007157859, 0.7913833896692761, 0.7817551054765026]


[0.7816351982003422, 0.7900056715882002, 0.7815013529893292, 0.789682262278456, 0.7824631521772024]

In [73]:
from sklearn.metrics import roc_auc_score, classification_report
import numpy as np

# Probabilidades (idéntico conceptualmente)
y_val_proba = model.predict_proba(X_val)[:, 1]

# AUC (esto es lo importante)
auc = roc_auc_score(y_val, y_val_proba)

#Threshold (solo para métricas de clasificación)
adj_threshold = 0.6

y_val_pred = (y_val_proba >= adj_threshold).astype(int)

# Classification report
report = classification_report(y_val, y_val_pred)

#Confusion matrix manual
TP = np.sum((y_val == 1) & (y_val_pred == 1))
FP = np.sum((y_val == 0) & (y_val_pred == 1))
TN = np.sum((y_val == 0) & (y_val_pred == 0))
FN = np.sum((y_val == 1) & (y_val_pred == 0))

FPR = FP / (FP + TN)
Precision = TP / (TP + FP)

print(f"FPR: {FPR:.4f}")
print(f"Precision (PPV): {Precision:.4f}")
print(f"AUC validación: {auc:.5f}")
print(report)

cm = confusion_matrix(y_val, y_val_pred)
print(f"\n📊 Matriz de Confusión:")
print(f"                 Predicho")
print(f"               No Def  Default")
print(f"Real No Def    {cm[0,0]:6d}   {cm[0,1]:6d}")
print(f"Real Default   {cm[1,0]:6d}   {cm[1,1]:6d}")

FPR: 0.1243
Precision (PPV): 0.8371
AUC validación: 0.85639
              precision    recall  f1-score   support

           0       0.71      0.88      0.78      4965
           1       0.84      0.64      0.72      4965

    accuracy                           0.76      9930
   macro avg       0.77      0.76      0.75      9930
weighted avg       0.77      0.76      0.75      9930


📊 Matriz de Confusión:
                 Predicho
               No Def  Default
Real No Def      4348      617
Real Default     1795     3170


In [66]:

# Lista de tus tres diccionarios
all_test = [results, results_advanced, results_tuned]

# Consolidamos todos los experimentos en una sola lista de items
all_items = []
for d in all_test:
    all_items.extend(d.items())

# Buscamos el item que tiene el valor máximo en la clave 'auc'
best_exp, best_info = max(all_items, key=lambda x: x[1]['auc'])

print(f"Best Experiment: {best_exp}")
print(f"Best AUC: {best_info['auc']}")

Best Experiment: 24_ultra_optimal
Best AUC: 0.7856974653389436


In [70]:
xgb_model = XGBClassifier(**params_24_ultra_optimal, n_jobs=-1, random_state=42, enable_categorical=True)
xgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=True)

[0]	validation_0-auc:0.67530
[1]	validation_0-auc:0.68956
[2]	validation_0-auc:0.71396
[3]	validation_0-auc:0.72744
[4]	validation_0-auc:0.73033
[5]	validation_0-auc:0.73102
[6]	validation_0-auc:0.73152
[7]	validation_0-auc:0.73387
[8]	validation_0-auc:0.73380
[9]	validation_0-auc:0.73451
[10]	validation_0-auc:0.73497
[11]	validation_0-auc:0.73503
[12]	validation_0-auc:0.73519
[13]	validation_0-auc:0.73650
[14]	validation_0-auc:0.73659
[15]	validation_0-auc:0.73682
[16]	validation_0-auc:0.73677
[17]	validation_0-auc:0.73754
[18]	validation_0-auc:0.73732
[19]	validation_0-auc:0.73846
[20]	validation_0-auc:0.73815
[21]	validation_0-auc:0.73843
[22]	validation_0-auc:0.73867
[23]	validation_0-auc:0.73818
[24]	validation_0-auc:0.73880
[25]	validation_0-auc:0.73863
[26]	validation_0-auc:0.73864
[27]	validation_0-auc:0.73854
[28]	validation_0-auc:0.73931
[29]	validation_0-auc:0.74013
[30]	validation_0-auc:0.74027
[31]	validation_0-auc:0.74027
[32]	validation_0-auc:0.74049
[33]	validation_0-au

In [72]:
# Probabilidades (idéntico conceptualmente)
y_val_proba = xgb_model.predict_proba(X_val)[:, 1]

# AUC (esto es lo importante)
auc = roc_auc_score(y_val, y_val_proba)

#Threshold (solo para métricas de clasificación)
adj_threshold = 0.6

y_val_pred = (y_val_proba >= adj_threshold).astype(int)

# Classification report
report = classification_report(y_val, y_val_pred)

#Confusion matrix manual
TP = np.sum((y_val == 1) & (y_val_pred == 1))
FP = np.sum((y_val == 0) & (y_val_pred == 1))
TN = np.sum((y_val == 0) & (y_val_pred == 0))
FN = np.sum((y_val == 1) & (y_val_pred == 0))

FPR = FP / (FP + TN)
Precision = TP / (TP + FP)

print(f"FPR: {FPR:.4f}")
print(f"Precision (PPV): {Precision:.4f}")
print(f"AUC validación: {auc:.5f}")
print(report)

cm = confusion_matrix(y_val, y_val_pred)
print(f"\n📊 Matriz de Confusión:")
print(f"                 Predicho")
print(f"               No Def  Default")
print(f"Real No Def    {cm[0,0]:6d}   {cm[0,1]:6d}")
print(f"Real Default   {cm[1,0]:6d}   {cm[1,1]:6d}")

FPR: 0.1426
Precision (PPV): 0.7830
AUC validación: 0.78423
              precision    recall  f1-score   support

           0       0.64      0.86      0.73      4965
           1       0.78      0.51      0.62      4965

    accuracy                           0.69      9930
   macro avg       0.71      0.69      0.68      9930
weighted avg       0.71      0.69      0.68      9930


📊 Matriz de Confusión:
                 Predicho
               No Def  Default
Real No Def      4257      708
Real Default     2410     2555


In [75]:
import optuna
from sklearn.metrics import roc_auc_score

def objective(trial):

    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'tree_method': 'hist',
        'device': 'cuda',
        'booster': 'gbtree',

        # Ranges
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.03, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 3000, 8000),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'min_child_weight': trial.suggest_int('min_child_weight', 10, 40),
        'subsample': trial.suggest_float('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.9),
        'max_delta_step': trial.suggest_float('max_delta_step', 0, 10),

        'scale_pos_weight': trial.suggest_float('scale_pos_weight', optimal_scale * 0.7, optimal_scale * 1.3),

        'reg_alpha': trial.suggest_float('reg_alpha', 0.1, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 5.0),
        'gamma': trial.suggest_float('gamma', 0, 5.0),
    }

    # Training
    model = XGBClassifier(**params, n_jobs=-1, random_state=42, early_stopping_rounds=100, enable_categorical=True)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

    # AUC
    preds = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, preds)

# Progress callback
def callback(study, trial):
    if trial.number % 10 == 0:
        print(f"Trial {trial.number}/150: Current={trial.value:.5f}, Best={study.best_value:.5f}")

# 50 tests
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=150, callbacks=[callback] ,show_progress_bar=True)

best_params = study.best_params
print("Bests params:", best_params)

[I 2026-01-07 20:16:01,089] A new study created in memory with name: no-name-9e337f2a-90f8-489e-bbb8-78277c7f6386


  0%|          | 0/150 [00:00<?, ?it/s]

[I 2026-01-07 20:16:11,007] Trial 0 finished with value: 0.7836996944370918 and parameters: {'learning_rate': 0.023046275539473863, 'n_estimators': 7564, 'max_depth': 5, 'min_child_weight': 15, 'subsample': 0.8753187460976934, 'colsample_bytree': 0.7271439374261758, 'max_delta_step': 7.518994606636095, 'scale_pos_weight': 1.07257647712358, 'reg_alpha': 0.6491702485517222, 'reg_lambda': 0.823952780785607, 'gamma': 1.5387555878379255}. Best is trial 0 with value: 0.7836996944370918.
Trial 0/150: Current=0.78370, Best=0.78370
[I 2026-01-07 20:16:45,708] Trial 1 finished with value: 0.7848769178813628 and parameters: {'learning_rate': 0.005612670098835614, 'n_estimators': 5850, 'max_depth': 8, 'min_child_weight': 28, 'subsample': 0.7697016646381237, 'colsample_bytree': 0.8209926733653403, 'max_delta_step': 4.549984293072395, 'scale_pos_weight': 0.7355123686838704, 'reg_alpha': 0.4318346000651022, 'reg_lambda': 4.312905028284966, 'gamma': 3.7757520612550803}. Best is trial 1 with value: 0.7

KeyboardInterrupt: 