In [86]:
import pandas as pd
from sklearn.utils import resample
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
import numpy as np
from sklearn.impute import SimpleImputer
import joblib
from xgboost import XGBClassifier
import optuna
from optuna.samplers import TPESampler


In [87]:
#load dataset
df = pd.read_csv("../data/processed/home_credit_train_ready.csv")

In [88]:
#rebalance dataset
class_0 = df[df['TARGET'] == 0]
class_1 = df[df['TARGET'] == 1]

# balance mayority class with minority
sub_class = resample(class_0, replace=False, n_samples=len(class_1), random_state=42)

# combine
df = pd.concat([sub_class, class_1])

In [89]:
y = df['TARGET']
X = df.drop(columns=['TARGET', 'SK_ID_CURR'])  #adde sk_id_curr

In [90]:
#manages divided by 0 values
X.replace([np.inf, -np.inf], np.nan, inplace=True)

In [91]:
#divide categorical and numerical to impute the values
cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.select_dtypes(exclude=["object"]).columns

num_imputer = SimpleImputer(strategy="median")  #change median for mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")

X[num_cols] = num_imputer.fit_transform(X[num_cols])
X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])

In [92]:
#change object to category to use in the threelike model
for col in X.select_dtypes("object"):
    X[col] = X[col].astype("category")

In [93]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, stratify=y ,random_state=42)

In [94]:
#create params for optuna study
xgb_params = {
    # Objective y métrica
    'objective': 'binary:logistic',
    'eval_metric': ['aucpr', 'auc'],  # AUCPR es mejor para desbalanceo


    # Parámetros de boosting - MÁS AGRESIVOS
    'n_estimators': 3000,  # Reducir de 5000 (early stopping lo maneja)
    'learning_rate': 0.01,  # Más bajo = más refinado
    'max_depth': 6,  # Aumentar de 5 (más capacidad)

    # Regularización - BALANCEADA
    'min_child_weight': 5,  # Reducir de 7 (menos restrictivo)
    'gamma': 0.2,  # Reducir de 0.3
    'subsample': 0.8,  # Aumentar de 0.7
    'colsample_bytree': 0.8,  # Aumentar de 0.7
    'colsample_bylevel': 0.8,  # Aumentar de 0.7
    'colsample_bynode': 0.8,  # NUEVO: muestreo por nodo

    # Regularización L1/L2 - MÁS SUAVE
    'reg_alpha': 0.5,  # Reducir de 1.0
    'reg_lambda': 2.0,  # Reducir de 3.0

    # Parámetros adicionales IMPORTANTES
    'max_delta_step': 1,  # Ayuda con desbalanceo extremo
    'tree_method': 'hist',
    'grow_policy': 'depthwise',  # vs lossguide

    # Sampling method para desbalanceo
    'sampling_method': 'gradient_based',  # Alternativa: 'uniform'

    # Control
    'random_state': 42,
    'n_jobs': -1,
    'device': 'cuda',
    'early_stopping_rounds': 200,  # Reducir de 300 (más rápido)
    'verbosity': 1,
    'enable_categorical': True,
}

In [95]:
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

[0]	validation_0-aucpr:0.70516	validation_0-auc:0.72221
[1]	validation_0-aucpr:0.72143	validation_0-auc:0.73423
[2]	validation_0-aucpr:0.72325	validation_0-auc:0.73564
[3]	validation_0-aucpr:0.72622	validation_0-auc:0.73961
[4]	validation_0-aucpr:0.72443	validation_0-auc:0.74004
[5]	validation_0-aucpr:0.72380	validation_0-auc:0.73975
[6]	validation_0-aucpr:0.72342	validation_0-auc:0.74024
[7]	validation_0-aucpr:0.72398	validation_0-auc:0.74106
[8]	validation_0-aucpr:0.72438	validation_0-auc:0.74136
[9]	validation_0-aucpr:0.72400	validation_0-auc:0.74091
[10]	validation_0-aucpr:0.72470	validation_0-auc:0.74135
[11]	validation_0-aucpr:0.72474	validation_0-auc:0.74148
[12]	validation_0-aucpr:0.72461	validation_0-auc:0.74154
[13]	validation_0-aucpr:0.72508	validation_0-auc:0.74212
[14]	validation_0-aucpr:0.72520	validation_0-auc:0.74224
[15]	validation_0-aucpr:0.72556	validation_0-auc:0.74265
[16]	validation_0-aucpr:0.72601	validation_0-auc:0.74315
[17]	validation_0-aucpr:0.72717	validatio

In [96]:

# Probabilidades (idéntico conceptualmente)
y_val_proba = model.predict_proba(X_val)[:, 1]

# AUC (esto es lo importante)
auc = roc_auc_score(y_val, y_val_proba)

#optimal threshold
thresholds = np.linspace(0.05, 0.6, 100)
f1_scores = [f1_score(y_val, y_val_proba >= t) for t in thresholds]


optimal_threshold = thresholds[np.argmax(f1_scores)]
print(f'optimal threshold: {optimal_threshold}')

#Threshold (solo para métricas de clasificación)
adj_threshold = optimal_threshold

y_val_pred = (y_val_proba >= adj_threshold).astype(int)

# Classification report
report = classification_report(y_val, y_val_pred)

#Confusion matrix manual
TP = np.sum((y_val == 1) & (y_val_pred == 1))
FP = np.sum((y_val == 0) & (y_val_pred == 1))
TN = np.sum((y_val == 0) & (y_val_pred == 0))
FN = np.sum((y_val == 1) & (y_val_pred == 0))

FPR = FP / (FP + TN)
Precision = TP / (TP + FP)

print(f"FPR: {FPR:.4f}")
print(f"Precision (PPV): {Precision:.4f}")
print(f"AUC validación: {auc:.5f}")
print(report)

cm = confusion_matrix(y_val, y_val_pred)
print(f"\n📊 Matriz de Confusión:")
print(f"                 Predicho")
print(f"               No Def  Default")
print(f"Real No Def    {cm[0, 0]:6d}   {cm[0, 1]:6d}")
print(f"Real Default   {cm[1, 0]:6d}   {cm[1, 1]:6d}")

optimal threshold: 0.38888888888888884
FPR: 0.4382
Precision (PPV): 0.6574
AUC validación: 0.78164
              precision    recall  f1-score   support

           0       0.78      0.56      0.65      2184
           1       0.66      0.84      0.74      2183

    accuracy                           0.70      4367
   macro avg       0.72      0.70      0.70      4367
weighted avg       0.72      0.70      0.70      4367


📊 Matriz de Confusión:
                 Predicho
               No Def  Default
Real No Def      1227      957
Real Default      347     1836


In [97]:
df = pd.read_csv("../data/processed/home_credit_train_ready.csv")

y = df['TARGET']
X = df.drop(columns = ['TARGET', 'SK_ID_CURR'])

X.replace([np.inf, -np.inf], np.nan, inplace=True)

cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.select_dtypes(exclude=["object"]).columns

num_imputer = SimpleImputer(strategy="median")  #change median for mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")

X[num_cols] = num_imputer.fit_transform(X[num_cols])
X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])

for col in X.select_dtypes("object"):
    X[col] = X[col].astype("category")

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.6, stratify=y ,random_state=42)

# Probabilidades (idéntico conceptualmente)
y_val_proba = model.predict_proba(X_val)[:, 1]

# AUC (esto es lo importante)
auc = roc_auc_score(y_val, y_val_proba)

#optimal threshold based on expected loss
cost_FN = 10000   # default
cost_FP = 1000    # lost good client

losses = []

for t in thresholds:
    y_pred = (y_val_proba >= t).astype(int)

    FP = np.sum((y_val == 0) & (y_pred == 1))
    FN = np.sum((y_val == 1) & (y_pred == 0))

    loss = FP * cost_FP + FN * cost_FN
    losses.append(loss)

optimal_threshold = thresholds[np.argmin(losses)]


#Threshold (solo para métricas de clasificación)
adj_threshold = optimal_threshold

y_val_pred = (y_val_proba >= adj_threshold).astype(int)

# Classification report
report = classification_report(y_val, y_val_pred)

#Confusion matrix manual
TP = np.sum((y_val == 1) & (y_val_pred == 1))
FP = np.sum((y_val == 0) & (y_val_pred == 1))
TN = np.sum((y_val == 0) & (y_val_pred == 0))
FN = np.sum((y_val == 1) & (y_val_pred == 0))

FPR = FP / (FP + TN)
Precision = TP / (TP + FP)

print(f'used threshold: {adj_threshold}')
print(f"FPR: {FPR:.4f}")
print(f"Precision (PPV): {Precision:.4f}")
print(f"AUC validación: {auc:.5f}")
print(report)

cm = confusion_matrix(y_val, y_val_pred)
print(f"\n📊 Matriz de Confusión:")
print(f"                 Predicho")
print(f"               No Def  Default")
print(f"Real No Def    {cm[0, 0]:6d}   {cm[0, 1]:6d}")
print(f"Real Default   {cm[1, 0]:6d}   {cm[1, 1]:6d}")


used threshold: 0.5166666666666666
FPR: 0.2518
Precision (PPV): 0.2259
AUC validación: 0.84252
              precision    recall  f1-score   support

           0       0.97      0.75      0.85    138182
           1       0.23      0.78      0.35     13101

    accuracy                           0.75    151283
   macro avg       0.60      0.76      0.60    151283
weighted avg       0.91      0.75      0.80    151283


📊 Matriz de Confusión:
                 Predicho
               No Def  Default
Real No Def    103388    34794
Real Default     2947    10154


In [100]:
joblib.dump(model, '../models/xgb_new_dataset.pkl')

['../models/xgb_new_dataset.pkl']