In [1]:
import pandas as pd
from sklearn.utils import resample
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
import numpy as np
from sklearn.impute import SimpleImputer
import joblib
from xgboost import XGBClassifier
import optuna
from optuna.samplers import TPESampler


In [2]:
#load dataset
df = pd.read_csv("../data/processed/home_credit_train_ready.csv")

In [3]:
#rebalance dataset
class_0 = df[df['TARGET'] == 0]
class_1 = df[df['TARGET'] == 1]

# balance mayority class with minority
sub_class = resample(class_0, replace=False, n_samples=len(class_1), random_state=42)

# combine
df = pd.concat([sub_class, class_1])

In [4]:
y = df['TARGET']
X = df.drop(columns=['TARGET', 'SK_ID_CURR'])  #adde sk_id_curr

In [5]:
#manages divided by 0 values
X.replace([np.inf, -np.inf], np.nan, inplace=True)

In [6]:
#divide categorical and numerical to impute the values
cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.select_dtypes(exclude=["object"]).columns

num_imputer = SimpleImputer(strategy="constant", fill_value= 0)  #fill nan's as 0
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")

X[num_cols] = num_imputer.fit_transform(X[num_cols])
X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])

In [7]:
#change object to category to use in the threelike model
for col in X.select_dtypes("object"):
    X[col] = X[col].astype("category")

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y ,random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train ,random_state=42)


In [9]:
#create params for optuna study
xgb_params = {
    # Objective y métrica
    'objective': 'binary:logistic',
    'eval_metric': ['aucpr', 'auc'],  # AUCPR es mejor para desbalanceo


    # Parámetros de boosting - MÁS AGRESIVOS
    'n_estimators': 3000,  # Reducir de 5000 (early stopping lo maneja)
    'learning_rate': 0.01,  # Más bajo = más refinado
    'max_depth': 6,  # Aumentar de 5 (más capacidad)

    # Regularización - BALANCEADA
    'min_child_weight': 5,  # Reducir de 7 (menos restrictivo)
    'gamma': 0.2,  # Reducir de 0.3
    'subsample': 0.8,  # Aumentar de 0.7
    'colsample_bytree': 0.8,  # Aumentar de 0.7
    'colsample_bylevel': 0.8,  # Aumentar de 0.7
    'colsample_bynode': 0.8,  # NUEVO: muestreo por nodo

    # Regularización L1/L2 - MÁS SUAVE
    'reg_alpha': 0.5,  # Reducir de 1.0
    'reg_lambda': 2.0,  # Reducir de 3.0

    # Parámetros adicionales IMPORTANTES
    'max_delta_step': 1,  # Ayuda con desbalanceo extremo
    'tree_method': 'hist',
    'grow_policy': 'depthwise',  # vs lossguide

    # Sampling method para desbalanceo
    'sampling_method': 'gradient_based',  # Alternativa: 'uniform'

    # Control
    'random_state': 42,
    'n_jobs': -1,
    'device': 'cuda',
    'early_stopping_rounds': 200,  # Reducir de 300 (más rápido)
    'verbosity': 1,
    'enable_categorical': True,
}

In [10]:
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

[0]	validation_0-aucpr:0.70933	validation_0-auc:0.72336
[1]	validation_0-aucpr:0.71924	validation_0-auc:0.73238
[2]	validation_0-aucpr:0.72451	validation_0-auc:0.73664
[3]	validation_0-aucpr:0.72609	validation_0-auc:0.73744
[4]	validation_0-aucpr:0.72634	validation_0-auc:0.73772
[5]	validation_0-aucpr:0.72764	validation_0-auc:0.73876
[6]	validation_0-aucpr:0.72970	validation_0-auc:0.74092
[7]	validation_0-aucpr:0.72965	validation_0-auc:0.74139
[8]	validation_0-aucpr:0.72952	validation_0-auc:0.74157
[9]	validation_0-aucpr:0.73008	validation_0-auc:0.74162
[10]	validation_0-aucpr:0.73051	validation_0-auc:0.74232
[11]	validation_0-aucpr:0.73090	validation_0-auc:0.74256
[12]	validation_0-aucpr:0.73235	validation_0-auc:0.74360
[13]	validation_0-aucpr:0.73202	validation_0-auc:0.74343
[14]	validation_0-aucpr:0.73216	validation_0-auc:0.74342
[15]	validation_0-aucpr:0.73274	validation_0-auc:0.74406
[16]	validation_0-aucpr:0.73295	validation_0-auc:0.74414
[17]	validation_0-aucpr:0.73325	validatio

In [11]:

# Probabilidades (idéntico conceptualmente)
y_test_proba = model.predict_proba(X_test)[:, 1]

# AUC (esto es lo importante)
auc = roc_auc_score(y_test, y_test_proba)

#optimal threshold based on expected loss
cost_FN = 10000   # default
cost_FP = 1000    # lost good client

thresholds = np.linspace(0, 1, 101)
losses = []

for t in thresholds:
    y_pred = (y_test_proba >= t).astype(int)

    FP = np.sum((y_test == 0) & (y_pred == 1))
    FN = np.sum((y_test == 1) & (y_pred == 0))

    loss = FP * cost_FP + FN * cost_FN
    losses.append(loss)

optimal_threshold = thresholds[np.argmin(losses)]

#Threshold (solo para métricas de clasificación)
adj_threshold = optimal_threshold

y_test_pred = (y_test_proba >= adj_threshold).astype(int)

# Classification report
report = classification_report(y_test, y_test_pred)

#Confusion matrix manual
TP = np.sum((y_test == 1) & (y_test_pred == 1))
FP = np.sum((y_test == 0) & (y_test_pred == 1))
TN = np.sum((y_test == 0) & (y_test_pred == 0))
FN = np.sum((y_test == 1) & (y_test_pred == 0))

FPR = FP / (FP + TN)
Precision = TP / (TP + FP)

print(f"Threshold: {optimal_threshold}")
print(f"FPR: {FPR:.4f}")
print(f"Precision (PPV): {Precision:.4f}")
print(f"AUC validación: {auc:.5f}")
print(report)

cm = confusion_matrix(y_test, y_test_pred)
print(f"\n📊 Matriz de Confusión:")
print(f"                 Predicho")
print(f"               No Def  Default")
print(f"Real No Def    {cm[0, 0]:6d}   {cm[0, 1]:6d}")
print(f"Real Default   {cm[1, 0]:6d}   {cm[1, 1]:6d}")

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


Threshold: 0.1
FPR: 0.9276
Precision (PPV): 0.5175
AUC validación: 0.78085
              precision    recall  f1-score   support

           0       0.93      0.07      0.13      4367
           1       0.52      0.99      0.68      4367

    accuracy                           0.53      8734
   macro avg       0.73      0.53      0.41      8734
weighted avg       0.73      0.53      0.41      8734


📊 Matriz de Confusión:
                 Predicho
               No Def  Default
Real No Def       316     4051
Real Default       22     4345


In [14]:
df = pd.read_csv("../data/processed/home_credit_train_ready.csv")

y = df['TARGET']
X = df.drop(columns = ['TARGET', 'SK_ID_CURR'])

X.replace([np.inf, -np.inf], np.nan, inplace=True)

cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.select_dtypes(exclude=["object"]).columns

num_imputer = SimpleImputer(strategy="constant", fill_value= 0)  #change median for mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")

X[num_cols] = num_imputer.fit_transform(X[num_cols])
X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])

for col in X.select_dtypes("object"):
    X[col] = X[col].astype("category")

# Probabilidades (idéntico conceptualmente)
y_test_proba = model.predict_proba(X)[:, 1]

# AUC (esto es lo importante)
auc = roc_auc_score(y, y_test_proba)

#optimal threshold based on expected loss
cost_FN = 10000   # default
cost_FP = 1000    # lost good client

thresholds = np.linspace(0, 1, 101)
losses = []

for t in thresholds:
    y_pred = (y_test_proba >= t).astype(int)

    FP = np.sum((y == 0) & (y_pred == 1))
    FN = np.sum((y == 1) & (y_pred == 0))

    loss = FP * cost_FP + FN * cost_FN
    losses.append(loss)

optimal_threshold = thresholds[np.argmin(losses)]


#Threshold (solo para métricas de clasificación)
adj_threshold = optimal_threshold

y_test_pred = (y_test_proba >= adj_threshold).astype(int)

# Classification report
report = classification_report(y, y_test_pred)

#Confusion matrix manual
TP = np.sum((y == 1) & (y_test_pred == 1))
FP = np.sum((y == 0) & (y_test_pred == 1))
TN = np.sum((y == 0) & (y_test_pred == 0))
FN = np.sum((y == 1) & (y_test_pred == 0))

FPR = FP / (FP + TN)
Precision = TP / (TP + FP)

print(f'used threshold: {adj_threshold}')
print(f"FPR: {FPR:.4f}")
print(f"Precision (PPV): {Precision:.4f}")
print(f"AUC validación: {auc:.5f}")
print(report)

cm = confusion_matrix(y, y_test_pred)
print(f"\n📊 Matriz de Confusión:")
print(f"                 Predicho")
print(f"               No Def  Default")
print(f"Real No Def    {cm[0, 0]:6d}   {cm[0, 1]:6d}")
print(f"Real Default   {cm[1, 0]:6d}   {cm[1, 1]:6d}")


used threshold: 0.53
FPR: 0.2541
Precision (PPV): 0.2218
AUC validación: 0.83239
              precision    recall  f1-score   support

           0       0.97      0.75      0.84    230302
           1       0.22      0.76      0.34     21835

    accuracy                           0.75    252137
   macro avg       0.60      0.75      0.59    252137
weighted avg       0.91      0.75      0.80    252137


📊 Matriz de Confusión:
                 Predicho
               No Def  Default
Real No Def    171778    58524
Real Default     5155    16680


In [13]:
joblib.dump(model, '../models/xgb_new_dataset.pkl')

['../models/xgb_new_dataset.pkl']

In [15]:
58524/171778

0.3406955489061463