In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
import optuna
from optuna.samplers import TPESampler
import warnings
warnings.filterwarnings('ignore')

In [69]:

def get_oof_predictions(model, X, y, cat_features=None, n_folds=5):
    """
    Genera predicciones out-of-fold para stacking
    """
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(X))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train_fold = X.iloc[train_idx]
        y_train_fold = y.iloc[train_idx]
        X_val_fold = X.iloc[val_idx]

        if isinstance(model, CatBoostClassifier):
            fold_model = CatBoostClassifier(**model.get_params())
            fold_model.fit(
                X_train_fold, y_train_fold,
                eval_set=(X_val_fold, y.iloc[val_idx]),
                cat_features=cat_features,
                verbose=0
            )
            oof_preds[val_idx] = fold_model.predict_proba(X_val_fold)[:, 1]

        elif isinstance(model, (XGBClassifier)):

            params = model.get_params()

            params.pop("verbosity", None)
            params.pop("device", None)

            fold_model = XGBClassifier(**params)
            fold_model.fit(
                X_train_fold, y_train_fold,
                #eval_set=[(X_val_fold, y.iloc[val_idx])],
                verbose=False,
            )
            oof_preds[val_idx] = fold_model.predict_proba(X_val_fold)[:, 1]

        elif isinstance(model, LGBMClassifier):
            # NUEVO: Usar LGBMClassifier para LightGBM
            fold_model = LGBMClassifier(**model.get_params())
            fold_model.fit(
                X_train_fold, y_train_fold,
                eval_set=[(X_val_fold, y.iloc[val_idx])],
            )
            oof_preds[val_idx] = fold_model.predict_proba(X_val_fold)[:, 1]

    return oof_preds


In [3]:

df = pd.read_csv("../data/processed/home_credit_train_ready.csv")

In [4]:
"""
# Rebalanceo del dataset
class_0 = df[df['TARGET'] == 0]
class_1 = df[df['TARGET'] == 1]
sub_class = resample(class_0, replace=False, n_samples=len(class_1), random_state=42)
df = pd.concat([sub_class, class_1])
"""

"\n# Rebalanceo del dataset\nclass_0 = df[df['TARGET'] == 0]\nclass_1 = df[df['TARGET'] == 1]\nsub_class = resample(class_0, replace=False, n_samples=len(class_1), random_state=42)\ndf = pd.concat([sub_class, class_1])\n"

In [5]:

# Separar X e y
y = df['TARGET']
X = df.drop(columns=['TARGET', 'SK_ID_CURR'])

In [6]:

# Manejo de valores infinitos
X.replace([np.inf, -np.inf], np.nan, inplace=True)

In [7]:

# Imputación de valores
cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.select_dtypes(exclude=["object"]).columns

num_imputer = SimpleImputer(fill_value="median")
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")

X[num_cols] = num_imputer.fit_transform(X[num_cols])
X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])

In [8]:

# Convertir a categorías
for col in X.select_dtypes("object"):
    X[col] = X[col].astype("category")

In [9]:

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [10]:

# Identificar features categóricas
cat_features = X_train.select_dtypes(include=['category', 'object']).columns.tolist()

# Calcular scale_pos_weight
neg_samples = (y_train == 0).sum()
pos_samples = (y_train == 1).sum()
scale_pos_weight = neg_samples / pos_samples

In [79]:

def objective_catboost(trial):
    """
    Función objetivo para optimizar CatBoost
    """
    # Decidir si usar GPU o CPU basado en si necesitamos rsm
    #use_gpu = trial.suggest_categorical('use_gpu', [True, False])
    use_gpu = True

    params = {
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'auto_class_weights': 'Balanced',  # IMPORTANTE para desbalanceo

        # Boosting - Más iteraciones
        'iterations': trial.suggest_int('iterations', 3000, 8000, step=500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'depth': trial.suggest_int('depth', 6, 10),

        # Regularización
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 15.0),
        'random_strength': trial.suggest_float('random_strength', 0.5, 5.0),

        # Bootstrap
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli']),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 2.0) if trial.params.get('bootstrap_type') == 'Bayesian' else None,
        'subsample': trial.suggest_float('subsample', 0.5, 0.9) if trial.params.get('bootstrap_type') == 'Bernoulli' else None,

        # RSM solo si usamos CPU
        'rsm': trial.suggest_float('rsm', 0.5, 1.0) if not use_gpu else None,

        # Growing
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),

        # Categóricas
        'cat_features': cat_features,
        'one_hot_max_size': trial.suggest_int('one_hot_max_size', 2, 20),

        # GPU/CPU
        'task_type': 'GPU' if use_gpu else 'CPU',
        'devices': '0' if use_gpu else None,
        'thread_count': -1 if not use_gpu else None,

        'random_seed': 42,
        'verbose': 0,
    }

    # Limpiar None values
    params = {k: v for k, v in params.items() if v is not None}

    # Obtener predicciones OOF
    model = CatBoostClassifier(**params)
    oof_preds = get_oof_predictions(model, X_train, y_train, cat_features=cat_features, n_folds=5)

    # Calcular AUC
    auc = roc_auc_score(y_train, oof_preds)

    return auc

# OPTIMIZACIÓN CON OPTUNA - XGBOOST

def objective_xgboost(trial):
    """
    Función objetivo para optimizar XGBoost
    """
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'scale_pos_weight': scale_pos_weight,

        # Boosting - Más agresivo
        'n_estimators': trial.suggest_int('n_estimators', 3000, 8000, step=500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 6, 12),

        # Regularización
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'gamma': trial.suggest_float('gamma', 0.0, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0),

        # L1/L2
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),

        # Otros
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 5),
        'tree_method': 'hist',
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),

        # Control
        'random_state': 42,
        'n_jobs': -1,
        'device': 'cuda',
        'verbosity': 0,
        'enable_categorical': True,
    }

    # Obtener predicciones OOF
    model = XGBClassifier(**params)
    oof_preds = get_oof_predictions(model, X_train, y_train, n_folds=5)

    # Calcular AUC
    auc = roc_auc_score(y_train, oof_preds)

    return auc


# OPTIMIZACIÓN CON OPTUNA - LIGHTGBM

def objective_lightgbm(trial):
    """
    Función objetivo para optimizar LightGBM
    """
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
        'scale_pos_weight': scale_pos_weight,

        # Boosting
        'n_estimators': trial.suggest_int('n_estimators', 3000, 8000, step=500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 31, 255),
        'max_depth': trial.suggest_int('max_depth', 6, 12),

        # Regularización
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.001, 10.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),

        # L1/L2
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),

        # DART specific
        'drop_rate': trial.suggest_float('drop_rate', 0.0, 0.5) if trial.params.get('boosting_type') == 'dart' else None,

        # Control
        'random_state': 42,
        'n_jobs': -1,
        'device': 'gpu',
        'verbosity': -1,
    }

    # Limpiar None values
    params = {k: v for k, v in params.items() if v is not None}

    # Obtener predicciones OOF
    model = LGBMClassifier(**params)
    oof_preds = get_oof_predictions(model, X_train, y_train, n_folds=5)

    # Calcular AUC
    auc = roc_auc_score(y_train, oof_preds)

    return auc

# ============================================================================
# OPTIMIZACIÓN CON OPTUNA - META-MODELO
# ============================================================================

def objective_meta_model(trial, cat_oof, xgb_oof, lgb_oof):
    """
    Función objetivo para optimizar el meta-modelo con 3 modelos
    """
    w_cat = trial.suggest_float("cat_weight", 0.0, 1.0)
    w_xgb = trial.suggest_float("xgb_weight", 0.0, 1.0)
    w_lgb = trial.suggest_float("lgb_weight", 0.0, 1.0)

    if w_lgb < 0:
        return 0.0

    preds = (
        w_cat * cat_oof +
        w_xgb * xgb_oof +
        w_lgb * lgb_oof
    )

    penalty = abs(w_cat - 0.33) + abs(w_xgb - 0.33) + abs(w_lgb - 0.33)
    auc = roc_auc_score(y_train, preds) - 0.001 * penalty


    return auc


In [12]:
# Optimizar CatBoost
study_catboost = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=42),
    study_name='catboost_optimization'
)
study_catboost.optimize(objective_catboost, n_trials=30, show_progress_bar=True)

print(f"\nMejor AUC CatBoost: {study_catboost.best_value:.5f}")
print(f"Mejores parámetros CatBoost:")
for key, value in study_catboost.best_params.items():
    print(f"  - {key}: {value}")

[I 2026-01-20 00:13:56,848] A new study created in memory with name: catboost_optimization


  0%|          | 0/30 [00:00<?, ?it/s]

Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 00:25:22,128] Trial 0 finished with value: 0.7657149618128862 and parameters: {'iterations': 5000, 'learning_rate': 0.08927180304353628, 'depth': 9, 'l2_leaf_reg': 9.381218778758512, 'random_strength': 1.2020838819909643, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.7323522915498704, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 98, 'one_hot_max_size': 17}. Best is trial 0 with value: 0.7657149618128862.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 00:31:15,718] Trial 1 finished with value: 0.7839039409939824 and parameters: {'iterations': 4000, 'learning_rate': 0.015199348301309814, 'depth': 6, 'l2_leaf_reg': 5.259391401433528, 'random_strength': 2.86140394234507, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.223705789444759, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 51, 'one_hot_max_size': 16}. Best is trial 1 with value: 0.7839039409939824.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 00:42:08,876] Trial 2 finished with value: 0.7781575053613004 and parameters: {'iterations': 4000, 'learning_rate': 0.032676417657817626, 'depth': 8, 'l2_leaf_reg': 1.6503057780799681, 'random_strength': 3.2339518335564725, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.8977710745066665, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 18, 'one_hot_max_size': 15}. Best is trial 1 with value: 0.7839039409939824.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 00:51:11,407] Trial 3 finished with value: 0.7668937283408778 and parameters: {'iterations': 5000, 'learning_rate': 0.01324458134009936, 'depth': 8, 'l2_leaf_reg': 1.4814392956130575, 'random_strength': 4.59194180935452, 'bootstrap_type': 'Bernoulli', 'subsample': 0.6246844304357644, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 98, 'one_hot_max_size': 16}. Best is trial 1 with value: 0.7839039409939824.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 01:03:24,195] Trial 4 finished with value: 0.7805005581996203 and parameters: {'iterations': 8000, 'learning_rate': 0.07849235338159358, 'depth': 8, 'l2_leaf_reg': 13.906239290323636, 'random_strength': 0.8982162592336378, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.6506606615265287, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 42, 'one_hot_max_size': 7}. Best is trial 1 with value: 0.7839039409939824.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 01:47:01,507] Trial 5 finished with value: 0.7738929849360359 and parameters: {'iterations': 5500, 'learning_rate': 0.013833249975219963, 'depth': 10, 'l2_leaf_reg': 2.0437090115167917, 'random_strength': 4.940991214702327, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.011044234247204798, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 80, 'one_hot_max_size': 3}. Best is trial 1 with value: 0.7839039409939824.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 01:54:17,005] Trial 6 finished with value: 0.7821782381693896 and parameters: {'iterations': 4500, 'learning_rate': 0.013057771348997228, 'depth': 10, 'l2_leaf_reg': 9.72617377558581, 'random_strength': 1.9890411118369213, 'bootstrap_type': 'Bernoulli', 'subsample': 0.6300733288106988, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 52, 'one_hot_max_size': 4}. Best is trial 1 with value: 0.7839039409939824.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 02:04:12,669] Trial 7 finished with value: 0.7810569957213985 and parameters: {'iterations': 6500, 'learning_rate': 0.057648106701146694, 'depth': 8, 'l2_leaf_reg': 11.793540519363853, 'random_strength': 2.7220801836397586, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.05083825348819038, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 38, 'one_hot_max_size': 11}. Best is trial 1 with value: 0.7839039409939824.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 02:24:21,999] Trial 8 finished with value: 0.7820969116218157 and parameters: {'iterations': 7500, 'learning_rate': 0.017753837036522245, 'depth': 8, 'l2_leaf_reg': 11.577715939602681, 'random_strength': 1.529591744712301, 'bootstrap_type': 'Bernoulli', 'subsample': 0.5644885149016018, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 89, 'one_hot_max_size': 17}. Best is trial 1 with value: 0.7839039409939824.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 02:30:22,193] Trial 9 finished with value: 0.7797007401051097 and parameters: {'iterations': 4000, 'learning_rate': 0.07808345085542412, 'depth': 8, 'l2_leaf_reg': 12.304162172296875, 'random_strength': 4.53241084965572, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.45587032508388337, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 10, 'one_hot_max_size': 11}. Best is trial 1 with value: 0.7839039409939824.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 02:34:36,884] Trial 10 finished with value: 0.7806589068774623 and parameters: {'iterations': 3000, 'learning_rate': 0.024854382110833333, 'depth': 6, 'l2_leaf_reg': 5.147316060606622, 'random_strength': 3.433399478501803, 'bootstrap_type': 'Bernoulli', 'subsample': 0.8839082983781679, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 71, 'one_hot_max_size': 20}. Best is trial 1 with value: 0.7839039409939824.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 02:40:23,526] Trial 11 finished with value: 0.7813366396039202 and parameters: {'iterations': 4000, 'learning_rate': 0.010224204739011923, 'depth': 6, 'l2_leaf_reg': 6.5912200769488605, 'random_strength': 2.2490066145845318, 'bootstrap_type': 'Bernoulli', 'subsample': 0.7667963710144704, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 57, 'one_hot_max_size': 2}. Best is trial 1 with value: 0.7839039409939824.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 02:45:54,289] Trial 12 finished with value: 0.7815268076605395 and parameters: {'iterations': 3500, 'learning_rate': 0.02338595250107362, 'depth': 10, 'l2_leaf_reg': 8.769787884323392, 'random_strength': 2.0643504372346215, 'bootstrap_type': 'Bernoulli', 'subsample': 0.5042397419513477, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 56, 'one_hot_max_size': 7}. Best is trial 1 with value: 0.7839039409939824.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 02:52:27,568] Trial 13 finished with value: 0.7809503171408787 and parameters: {'iterations': 4500, 'learning_rate': 0.038433824191375664, 'depth': 7, 'l2_leaf_reg': 4.381769061724758, 'random_strength': 3.696887919029021, 'bootstrap_type': 'Bernoulli', 'subsample': 0.7184200828646883, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 38, 'one_hot_max_size': 6}. Best is trial 1 with value: 0.7839039409939824.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 03:02:03,710] Trial 14 finished with value: 0.784051439917729 and parameters: {'iterations': 6500, 'learning_rate': 0.018081659456430418, 'depth': 7, 'l2_leaf_reg': 7.509157356683852, 'random_strength': 1.943825803789177, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.289231140858435, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 67, 'one_hot_max_size': 13}. Best is trial 14 with value: 0.784051439917729.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 03:12:02,621] Trial 15 finished with value: 0.7791976772957796 and parameters: {'iterations': 6500, 'learning_rate': 0.019754564808404824, 'depth': 7, 'l2_leaf_reg': 6.886058924470436, 'random_strength': 0.5213305384389031, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.363425441834631, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 70, 'one_hot_max_size': 13}. Best is trial 14 with value: 0.784051439917729.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 03:21:29,937] Trial 16 finished with value: 0.7821908838905701 and parameters: {'iterations': 6500, 'learning_rate': 0.04258179059731556, 'depth': 7, 'l2_leaf_reg': 3.9859666988357123, 'random_strength': 2.746870447686365, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.1727807202093594, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 68, 'one_hot_max_size': 13}. Best is trial 14 with value: 0.784051439917729.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 03:30:10,350] Trial 17 finished with value: 0.7841019115643509 and parameters: {'iterations': 6000, 'learning_rate': 0.017257341966192278, 'depth': 6, 'l2_leaf_reg': 7.071611279317964, 'random_strength': 3.8295096956577956, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.4504528425710896, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 27, 'one_hot_max_size': 20}. Best is trial 17 with value: 0.7841019115643509.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 03:41:06,594] Trial 18 finished with value: 0.7824515386837384 and parameters: {'iterations': 6000, 'learning_rate': 0.025355477090709816, 'depth': 7, 'l2_leaf_reg': 6.613527327131494, 'random_strength': 3.9232482403266205, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.493169176478417, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 25, 'one_hot_max_size': 20}. Best is trial 17 with value: 0.7841019115643509.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 03:49:54,297] Trial 19 finished with value: 0.7805378369454707 and parameters: {'iterations': 7000, 'learning_rate': 0.010086576565234193, 'depth': 6, 'l2_leaf_reg': 7.736381669103048, 'random_strength': 3.9797421846483223, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.8635862748877983, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 30, 'one_hot_max_size': 9}. Best is trial 17 with value: 0.7841019115643509.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 03:58:53,441] Trial 20 finished with value: 0.784615804241636 and parameters: {'iterations': 6000, 'learning_rate': 0.020264468308303578, 'depth': 7, 'l2_leaf_reg': 10.032658552941697, 'random_strength': 1.6991555670756433, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.551704656420381, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 64, 'one_hot_max_size': 19}. Best is trial 20 with value: 0.784615804241636.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 04:07:55,470] Trial 21 finished with value: 0.7850065143951975 and parameters: {'iterations': 6000, 'learning_rate': 0.01912693199266489, 'depth': 7, 'l2_leaf_reg': 10.331505281793332, 'random_strength': 1.5831849249961483, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.566326286968378, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 61, 'one_hot_max_size': 19}. Best is trial 21 with value: 0.7850065143951975.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 04:15:52,514] Trial 22 finished with value: 0.784374637555665 and parameters: {'iterations': 5500, 'learning_rate': 0.02123121205685313, 'depth': 6, 'l2_leaf_reg': 11.152261695791793, 'random_strength': 1.205785296836125, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.6006342044207873, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 61, 'one_hot_max_size': 19}. Best is trial 21 with value: 0.7850065143951975.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 04:24:09,636] Trial 23 finished with value: 0.785044390035991 and parameters: {'iterations': 5500, 'learning_rate': 0.029283777377060543, 'depth': 7, 'l2_leaf_reg': 10.437238653240708, 'random_strength': 1.5238135355014362, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.696188943505986, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 80, 'one_hot_max_size': 18}. Best is trial 23 with value: 0.785044390035991.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 04:33:15,896] Trial 24 finished with value: 0.7851042613020398 and parameters: {'iterations': 6000, 'learning_rate': 0.030065067238315705, 'depth': 7, 'l2_leaf_reg': 10.232217881271547, 'random_strength': 1.5630042938677005, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.934037357498665, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 80, 'one_hot_max_size': 18}. Best is trial 24 with value: 0.7851042613020398.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 04:41:11,756] Trial 25 finished with value: 0.7850673112989532 and parameters: {'iterations': 5000, 'learning_rate': 0.031127365059342837, 'depth': 9, 'l2_leaf_reg': 13.393357504340827, 'random_strength': 2.4200992555488696, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.8451024584048932, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 80, 'one_hot_max_size': 18}. Best is trial 24 with value: 0.7851042613020398.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 04:49:10,719] Trial 26 finished with value: 0.7851736421827313 and parameters: {'iterations': 5000, 'learning_rate': 0.02951564502586622, 'depth': 9, 'l2_leaf_reg': 14.36532868162059, 'random_strength': 2.3762458022633144, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.946782208042912, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 80, 'one_hot_max_size': 18}. Best is trial 26 with value: 0.7851736421827313.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 04:57:09,153] Trial 27 finished with value: 0.7846412917489158 and parameters: {'iterations': 5000, 'learning_rate': 0.04540566939900886, 'depth': 9, 'l2_leaf_reg': 14.365469490610426, 'random_strength': 2.469384561014156, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.9749513272743762, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 78, 'one_hot_max_size': 15}. Best is trial 26 with value: 0.7851736421827313.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 05:08:29,469] Trial 28 finished with value: 0.7744811324628176 and parameters: {'iterations': 4500, 'learning_rate': 0.03206591430543421, 'depth': 9, 'l2_leaf_reg': 13.204062104065414, 'random_strength': 2.4078604308564526, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.7956562676996013, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 88, 'one_hot_max_size': 14}. Best is trial 26 with value: 0.7851736421827313.


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2026-01-20 05:28:39,644] Trial 29 finished with value: 0.7772202103365309 and parameters: {'iterations': 5000, 'learning_rate': 0.05413632365222081, 'depth': 9, 'l2_leaf_reg': 14.761802095361311, 'random_strength': 3.0259156012574224, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.9835815187841797, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 100, 'one_hot_max_size': 17}. Best is trial 26 with value: 0.7851736421827313.

Mejor AUC CatBoost: 0.78517
Mejores parámetros CatBoost:
  - iterations: 5000
  - learning_rate: 0.02951564502586622
  - depth: 9
  - l2_leaf_reg: 14.36532868162059
  - random_strength: 2.3762458022633144
  - bootstrap_type: Bayesian
  - bagging_temperature: 1.946782208042912
  - grow_policy: Lossguide
  - min_data_in_leaf: 80
  - one_hot_max_size: 18


In [13]:

# Optimizar XGBoost
study_xgboost = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=42),
    study_name='xgboost_optimization'
)
study_xgboost.optimize(objective_xgboost, n_trials=30, show_progress_bar=True)

print(f"\nMejor AUC XGBoost: {study_xgboost.best_value:.5f}")
print(f"Mejores parámetros XGBoost:")
for key, value in study_xgboost.best_params.items():
    print(f"  - {key}: {value}")


[I 2026-01-20 05:28:39,730] A new study created in memory with name: xgboost_optimization


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2026-01-20 05:47:52,909] Trial 0 finished with value: 0.7544174852412241 and parameters: {'n_estimators': 5000, 'learning_rate': 0.08927180304353628, 'max_depth': 11, 'min_child_weight': 12, 'gamma': 0.15601864044243652, 'subsample': 0.5779972601681014, 'colsample_bytree': 0.5290418060840998, 'colsample_bylevel': 0.9330880728874675, 'colsample_bynode': 0.8005575058716043, 'reg_alpha': 3.540362888980227, 'reg_lambda': 0.20584494295802447, 'max_delta_step': 5, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.7544174852412241.
[I 2026-01-20 06:06:56,254] Trial 1 finished with value: 0.772312861711888 and parameters: {'n_estimators': 4000, 'learning_rate': 0.015254729458052608, 'max_depth': 8, 'min_child_weight': 11, 'gamma': 0.43194501864211576, 'subsample': 0.645614570099021, 'colsample_bytree': 0.8059264473611898, 'colsample_bylevel': 0.569746930326021, 'colsample_bynode': 0.6460723242676091, 'reg_alpha': 1.8318092164684585, 'reg_lambda': 4.56069984217036, 'max_delta_step':

In [25]:
# Optimizar LightGBM
study_lightgbm = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=42),
    study_name='lightgbm_optimization'
)
study_lightgbm.optimize(objective_lightgbm, n_trials=30, show_progress_bar=True)

print(f"\nMejor AUC LightGBM: {study_lightgbm.best_value:.5f}")

[I 2026-01-20 13:50:13,263] A new study created in memory with name: lightgbm_optimization


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2026-01-20 14:52:28,542] Trial 0 finished with value: 0.7611518255053991 and parameters: {'boosting_type': 'dart', 'n_estimators': 7000, 'learning_rate': 0.03968793330444373, 'num_leaves': 66, 'max_depth': 7, 'min_child_samples': 15, 'min_child_weight': 2.9154431891537547, 'subsample': 0.8005575058716043, 'colsample_bytree': 0.8540362888980227, 'reg_alpha': 0.10292247147901223, 'reg_lambda': 9.699098521619943, 'drop_rate': 0.41622132040021087}. Best is trial 0 with value: 0.7611518255053991.
[I 2026-01-20 15:11:11,817] Trial 1 finished with value: 0.7619415989237929 and parameters: {'boosting_type': 'gbdt', 'n_estimators': 4000, 'learning_rate': 0.02014847788415866, 'num_leaves': 149, 'max_depth': 9, 'min_child_samples': 36, 'min_child_weight': 0.2801635158716261, 'subsample': 0.569746930326021, 'colsample_bytree': 0.6460723242676091, 'reg_alpha': 1.8318092164684585, 'reg_lambda': 4.56069984217036}. Best is trial 1 with value: 0.7619415989237929.
[I 2026-01-20 15:21:34,794] Trial 2 

In [26]:

best_catboost_params = study_catboost.best_params.copy()

best_catboost_params.update({
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'auto_class_weights': 'Balanced',
    'cat_features': cat_features,
    'task_type': 'GPU',
    'random_seed': 42,
    'verbose': 0,
})


In [83]:

# Preparar parámetros XGBoost
best_xgboost_params = study_xgboost.best_params.copy()

best_xgboost_params.update({
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'scale_pos_weight': scale_pos_weight,
    'tree_method': 'hist',
    'random_state': 42,
    'n_jobs': -1,
    'verbose': None,
    'enable_categorical': True,
})



In [28]:
# Preparar parámetros LightGBM
best_lightgbm_params = study_lightgbm.best_params.copy()
best_lightgbm_params.update({
    'objective': 'binary',
    'metric': 'auc',
    'scale_pos_weight': scale_pos_weight,
    'random_state': 42,
    'n_jobs': -1,
    'device': 'gpu',
    'verbosity': -1,
})

In [46]:
# Crear modelos
catboost_best = CatBoostClassifier(**best_catboost_params)
xgboost_best = XGBClassifier(**best_xgboost_params)
lightgbm_best = LGBMClassifier(**best_lightgbm_params)

In [34]:
# Obtener predicciones OOF para el meta-modelo
cat_oof = get_oof_predictions(catboost_best, X_train, y_train, cat_features=cat_features, n_folds=5)

Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


In [71]:
import xgboost as xgb

_original = xgb.config.set_config

def patched_set_config(**kwargs):
    if "verbosity" in kwargs and kwargs["verbosity"] == -1:
        kwargs["verbosity"] = 0
    return _original(**kwargs)

xgb.config.set_config = patched_set_config


In [72]:
xgb_oof = get_oof_predictions(xgboost_best, X_train, y_train, n_folds=5)

In [43]:
lgb_oof = get_oof_predictions(lightgbm_best, X_train, y_train, n_folds=5)

In [73]:
print(f"\nAUC individual en OOF:")
print(f"  CatBoost:  {roc_auc_score(y_train, cat_oof):.5f}")
print(f"  XGBoost:   {roc_auc_score(y_train, xgb_oof):.5f}")
print(f"  LightGBM:  {roc_auc_score(y_train, lgb_oof):.5f}")


AUC individual en OOF:
  CatBoost:  0.78512
  XGBoost:   0.78025
  LightGBM:  0.78110


In [80]:

# Optimizar pesos del meta-modelo
study_meta = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=42),
    study_name='meta_model_optimization'
)
study_meta.optimize(
    lambda trial: objective_meta_model(trial, cat_oof, xgb_oof, lgb_oof),
    n_trials=10000,
    show_progress_bar=True
)

# Obtener pesos óptimos y normalizar
best_cat_weight = study_meta.best_params['cat_weight']
best_xgb_weight = study_meta.best_params['xgb_weight']
best_lgb_weight = study_meta.best_params['lgb_weight']
total_weight = best_cat_weight + best_xgb_weight + best_lgb_weight
best_cat_weight /= total_weight
best_xgb_weight /= total_weight
best_lgb_weight /= total_weight

print(f"\n✓ Mejor AUC Meta-modelo: {study_meta.best_value:.5f}")
print(f"Pesos óptimos:")
print(f"  - CatBoost: {best_cat_weight:.4f}")
print(f"  - XGBoost:  {best_xgb_weight:.4f}")
print(f"  - LightGBM: {best_lgb_weight:.4f}")

[I 2026-01-21 15:41:02,027] A new study created in memory with name: meta_model_optimization


  0%|          | 0/10000 [00:00<?, ?it/s]

[I 2026-01-21 15:41:02,092] Trial 0 finished with value: 0.7824601824207192 and parameters: {'cat_weight': 0.3745401188473625, 'xgb_weight': 0.9507143064099162, 'lgb_weight': 0.7319939418114051}. Best is trial 0 with value: 0.7824601824207192.
[I 2026-01-21 15:41:02,143] Trial 1 finished with value: 0.7849349742898282 and parameters: {'cat_weight': 0.5986584841970366, 'xgb_weight': 0.15601864044243652, 'lgb_weight': 0.15599452033620265}. Best is trial 1 with value: 0.7849349742898282.
[I 2026-01-21 15:41:02,198] Trial 2 finished with value: 0.781141551232299 and parameters: {'cat_weight': 0.05808361216819946, 'xgb_weight': 0.8661761457749352, 'lgb_weight': 0.6011150117432088}. Best is trial 1 with value: 0.7849349742898282.
[I 2026-01-21 15:41:02,248] Trial 3 finished with value: 0.7835191958444124 and parameters: {'cat_weight': 0.7080725777960455, 'xgb_weight': 0.020584494295802447, 'lgb_weight': 0.9699098521619943}. Best is trial 1 with value: 0.7849349742898282.
[I 2026-01-21 15:41:

In [96]:
#params with better auc
#xgb
xgboost_params = {
    # Objective y métrica
    'objective': 'binary:logistic',
    'eval_metric': 'aucpr',  # AUCPR es mejor para desbalanceo


    # Parámetros de boosting - MÁS AGRESIVOS
    'n_estimators': 3000,  # Reducir de 5000 (early stopping lo maneja)
    'learning_rate': 0.01,  # Más bajo = más refinado
    'max_depth': 6,  # Aumentar de 5 (más capacidad)

    # Regularización - BALANCEADA
    'min_child_weight': 5,  # Reducir de 7 (menos restrictivo)
    'gamma': 0.2,  # Reducir de 0.3
    'subsample': 0.8,  # Aumentar de 0.7
    'colsample_bytree': 0.8,  # Aumentar de 0.7
    'colsample_bylevel': 0.8,  # Aumentar de 0.7
    'colsample_bynode': 0.8,  # NUEVO: muestreo por nodo

    # Regularización L1/L2 - MÁS SUAVE
    'reg_alpha': 0.5,  # Reducir de 1.0
    'reg_lambda': 2.0,  # Reducir de 3.0

    # Parámetros adicionales IMPORTANTES
    'max_delta_step': 1,  # Ayuda con desbalanceo extremo
    'tree_method': 'hist',
    'grow_policy': 'depthwise',  # vs lossguide

    # Sampling method para desbalanceo
    'sampling_method': 'gradient_based',  # Alternativa: 'uniform'

    # Control
    'random_state': 42,
    'n_jobs': -1,
    'device': 'cuda',
      # Reducir de 300 (más rápido)
    'verbosity': 1,
    'enable_categorical': True,
}

catboost_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'PRAUC',

    'scale_pos_weight': scale_pos_weight * 0.9,

    # Boosting
    'iterations': 4000,
    'learning_rate': 0.025,
    'depth': 6,

    # Regularización (compensa falta de rsm)
    'l2_leaf_reg': 8.0,
    'random_strength': 2,

    # Bootstrap
    'bootstrap_type': 'Bayesian',
    'bagging_temperature': 0.5,

    # Growing
    'grow_policy': 'SymmetricTree',
    'min_data_in_leaf': 30,

    # Categóricas
    'cat_features': cat_features,
    'one_hot_max_size': 10,

    # GPU
    'task_type': 'GPU',
    'devices': '0',

    # Control
    'random_seed': 42,

    'verbose': 100,}

In [95]:

# Entrenar modelos finales
catboost_final = CatBoostClassifier(**catboost_params)
catboost_final.fit(X_train, y_train, cat_features=cat_features, verbose=0)

Default metric period is 5 because PRAUC is/are not implemented for GPU
Metric PRAUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


<catboost.core.CatBoostClassifier at 0x203d82b85f0>

In [97]:

xgboost_final = XGBClassifier(**xgboost_params)
xgboost_final.fit(X_train, y_train, verbose=0)

In [85]:

lightgbm_final = LGBMClassifier(**best_lightgbm_params)
lightgbm_final.fit(X_train, y_train)

In [98]:

# Predicciones en test
test_cat_preds = catboost_final.predict_proba(X_test)[:, 1]
test_xgb_preds = xgboost_final.predict_proba(X_test)[:, 1]
test_lgb_preds = lightgbm_final.predict_proba(X_test)[:, 1]

# Meta-predicción con pesos optimizados
final_predictions = (best_cat_weight * test_cat_preds +
                     best_xgb_weight * test_xgb_preds +
                     best_lgb_weight * test_lgb_preds)

In [101]:

# Métricas
print("\n" + "="*80)
print("RESULTADOS EN TEST SET")
print("="*80)
print(f"\nAUC CatBoost:  {roc_auc_score(y_test, test_cat_preds):.5f}")
print(f"AUC XGBoost:   {roc_auc_score(y_test, test_xgb_preds):.5f}")
print(f"AUC LGBM:      {roc_auc_score(y_test, test_lgb_preds):.5f}")
print(f"AUC STACKING:  {roc_auc_score(y_test, final_predictions):.5f}")


# ============================================================================
# OPTIMIZACIÓN DE THRESHOLD
# ============================================================================

print("\n" + "="*80)
print("OPTIMIZACIÓN DE THRESHOLD")
print("="*80)

cost_FN = 10000  # Costo de no detectar un default
cost_FP = 1000   # Costo de rechazar un buen cliente

thresholds = np.linspace(0, 1, 101)
losses = []

for t in thresholds:
    y_pred = (final_predictions >= t).astype(int)
    FP = np.sum((y_test == 0) & (y_pred == 1))
    FN = np.sum((y_test == 1) & (y_pred == 0))
    loss = FP * cost_FP + FN * cost_FN
    losses.append(loss)

#optimal_threshold = thresholds[np.argmin(losses)]
optimal_threshold = thresholds[np.argmin(losses)]
min_loss = min(losses)

print(f"\n💰 Análisis de Costos:")
print(f"Threshold óptimo: {optimal_threshold:.4f}")
print(f"Pérdida mínima: ${min_loss:,.0f}")

# Predicciones con threshold optimizado
y_test_pred = (final_predictions >= optimal_threshold).astype(int)

# Confusion matrix
TP = np.sum((y_test == 1) & (y_test_pred == 1))
FP = np.sum((y_test == 0) & (y_test_pred == 1))
TN = np.sum((y_test == 0) & (y_test_pred == 0))
FN = np.sum((y_test == 1) & (y_test_pred == 0))

FPR = FP / (FP + TN) if (FP + TN) > 0 else 0
TPR = TP / (TP + FN) if (TP + FN) > 0 else 0
Precision = TP / (TP + FP) if (TP + FP) > 0 else 0

print(f"\n📈 Métricas de Clasificación (threshold={optimal_threshold:.4f}):")
print(f"FPR (False Positive Rate): {FPR:.4f}")
print(f"TPR (Recall): {TPR:.4f}")
print(f"Precision: {Precision:.4f}")

print(f"\n📋 Classification Report:")
print(classification_report(y_test, y_test_pred, target_names=['No Default', 'Default']))

cm = confusion_matrix(y_test, y_test_pred)
print(f"\n📊 Matriz de Confusión:")
print(f"                 Predicho")
print(f"               No Def  Default")
print(f"Real No Def    {cm[0, 0]:6d}   {cm[0, 1]:6d}")
print(f"Real Default   {cm[1, 0]:6d}   {cm[1, 1]:6d}")

# Análisis financiero
total_cost_optimal = FP * cost_FP + FN * cost_FN
total_cost_default = np.sum(y_test == 1) * cost_FN
savings = total_cost_default - total_cost_optimal

print(f"\n💵 Análisis Financiero:")
print(f"Costo total (threshold óptimo): ${total_cost_optimal:,.0f}")
print(f"Costo si rechazamos a todos: ${total_cost_default:,.0f}")
print(f"Ahorro con el modelo: ${savings:,.0f} ({(savings/total_cost_default)*100:.1f}%)")
print(f"\nFalsos Positivos: {FP} (costo: ${FP * cost_FP:,.0f})")
print(f"Falsos Negativos: {FN} (costo: ${FN * cost_FN:,.0f})")

print("\n" + "="*80)
print("OPTIMIZACIÓN COMPLETADA")
print("="*80)


RESULTADOS EN TEST SET

AUC CatBoost:  0.78369
AUC XGBoost:   0.78523
AUC LGBM:      0.78533
AUC STACKING:  0.78644

OPTIMIZACIÓN DE THRESHOLD

💰 Análisis de Costos:
Threshold óptimo: 0.0600
Pérdida mínima: $25,725,000

📈 Métricas de Clasificación (threshold=0.0600):
FPR (False Positive Rate): 0.3481
TPR (Recall): 0.7781
Precision: 0.1749

📋 Classification Report:
              precision    recall  f1-score   support

  No Default       0.97      0.65      0.78     46061
     Default       0.17      0.78      0.29      4367

    accuracy                           0.66     50428
   macro avg       0.57      0.71      0.53     50428
weighted avg       0.90      0.66      0.74     50428


📊 Matriz de Confusión:
                 Predicho
               No Def  Default
Real No Def     30026    16035
Real Default      969     3398

💵 Análisis Financiero:
Costo total (threshold óptimo): $25,725,000
Costo si rechazamos a todos: $43,670,000
Ahorro con el modelo: $17,945,000 (41.1%)

Falsos Pos

In [100]:
#try a logistic regression
# Crea un DataFrame con las predicciones OOF
X_train_meta = pd.DataFrame({
    'cat': cat_oof,
    'xgb': xgb_oof,
    'lgb': lgb_oof
})

# Predicciones para el conjunto de TEST (usando los modelos ya entrenados)
test_cat = catboost_final.predict_proba(X_test)[:, 1]
test_xgb = xgboost_final.predict_proba(X_test)[:, 1]
test_lgb = lightgbm_final.predict_proba(X_test)[:, 1]

X_test_meta = pd.DataFrame({
    'cat': test_cat,
    'xgb': test_xgb,
    'lgb': test_lgb
})

# Entrena el Meta-Modelo (Logistic Regression es ideal aquí)
meta_model = LogisticRegression()
meta_model.fit(X_train_meta, y_train)

# Predicción final Stacked
final_predictions = meta_model.predict_proba(X_test_meta)[:, 1]

print(f"AUC FINAL: {roc_auc_score(y_test, final_predictions):.5f}")


AUC FINAL: 0.78644
