In [509]:
# 1) imports y configuración general

import gzip
import json
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    precision_score,
    balanced_accuracy_score,
    recall_score,
    f1_score,
    confusion_matrix,
)


In [510]:
# 2) Load the train and test datasets from their zipped CSV files
train_path = Path("../files/input/train_data.csv.zip")
test_path = Path("../files/input/test_data.csv.zip")

# Both archives are read the same way, so load them in a single pass.
df_train_raw, df_test_raw = (
    pd.read_csv(csv_zip, compression="zip") for csv_zip in (train_path, test_path)
)

# Preview the first rows of the raw training data.
df_train_raw.head()


Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,10748,310000,1,3,1,32,0,0,0,0,...,84373,57779,14163,8295,6000,4000,3000,1000,2000,0
1,12574,10000,2,3,1,49,-1,-1,-2,-1,...,1690,1138,930,0,0,2828,0,182,0,1
2,29677,50000,1,2,1,28,-1,-1,-1,0,...,45975,1300,43987,0,46257,2200,1300,43987,1386,0
3,8857,80000,2,3,1,52,2,2,3,3,...,40748,39816,40607,3700,1600,1600,0,1600,1600,1
4,21099,270000,1,1,2,34,1,2,0,0,...,22448,15490,17343,0,4000,2000,0,2000,2000,0


In [511]:
# 3) función de limpieza y aplicación a train y test

def clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a raw dataset; the input frame is not modified.

    The steps, in order, are:

    1. Rename ``"default payment next month"`` to ``"default"``.
    2. Drop the ``"ID"`` column when it is present.
    3. Drop every row that contains at least one NaN.
    4. Cap ``"EDUCATION"`` at 4 (when the column exists), so any larger
       code is folded into 4 ("others").

    Parameters
    ----------
    df : pd.DataFrame
        Raw dataset.

    Returns
    -------
    pd.DataFrame
        New frame with the transformations above applied.
    """
    cleaned = (
        df.rename(columns={"default payment next month": "default"})
        .drop(columns=["ID"], errors="ignore")
        .dropna()
    )

    # NOTE(review): only NaN rows are removed; if the data encodes "not
    # available" with sentinel codes (e.g. 0), those rows are kept — confirm.
    if "EDUCATION" in cleaned.columns:
        # Values above 4 become 4; everything else is left as is.
        cleaned = cleaned.assign(EDUCATION=cleaned["EDUCATION"].clip(upper=4))

    return cleaned


# Run the same cleaning routine over both raw splits.
df_train, df_test = (clean_dataset(raw) for raw in (df_train_raw, df_test_raw))

# Preview the cleaned training data.
df_train.head()


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,310000,1,3,1,32,0,0,0,0,0,...,84373,57779,14163,8295,6000,4000,3000,1000,2000,0
1,10000,2,3,1,49,-1,-1,-2,-1,2,...,1690,1138,930,0,0,2828,0,182,0,1
2,50000,1,2,1,28,-1,-1,-1,0,-1,...,45975,1300,43987,0,46257,2200,1300,43987,1386,0
3,80000,2,3,1,52,2,2,3,3,3,...,40748,39816,40607,3700,1600,1600,0,1600,1600,1
4,270000,1,1,2,34,1,2,0,0,2,...,22448,15490,17343,0,4000,2000,0,2000,2000,0


In [512]:
# 4) Split each cleaned frame into a feature matrix and a target vector

target_col = "default"

X_train, y_train = df_train.drop(columns=target_col), df_train[target_col]
X_test, y_test = df_test.drop(columns=target_col), df_test[target_col]

# Show the (rows, columns) shape of both feature matrices.
X_train.shape, X_test.shape


((21000, 23), (9000, 23))

In [513]:
# 5) Categorical / numeric column lists and the ColumnTransformer

# Columns that get one-hot encoded: SEX, EDUCATION, MARRIAGE and the PAY_*
# columns (their numbering goes PAY_0, PAY_2, ..., PAY_6).
categorical_features = ["SEX", "EDUCATION", "MARRIAGE", "PAY_0"] + [
    f"PAY_{month}" for month in range(2, 7)
]

# Every remaining feature column is scaled; the original column order is kept.
numeric_features = [
    column for column in X_train.columns if column not in categorical_features
]

# Categories unseen during fit are ignored at transform time instead of raising.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", MinMaxScaler(), numeric_features),
    ]
)

preprocessor


In [514]:
# 6) Build the pipeline and the hyper-parameter grid

log_reg = LogisticRegression(solver="liblinear", max_iter=1000)

# preprocess -> univariate feature selection (f_classif) -> logistic regression
pipeline = Pipeline(
    [
        ("preprocess", preprocessor),
        ("selectkbest", SelectKBest(score_func=f_classif)),
        ("logreg", log_reg),
    ]
)

# Keys follow the "<step name>__<parameter>" convention of Pipeline.
param_grid = dict(
    selectkbest__k=[10, 15, 20, "all"],
    logreg__C=[0.01, 0.1, 1.0],
    logreg__class_weight=[None, "balanced"],
)

# Exhaustive search over the grid, scored with balanced accuracy on 10 folds;
# refit=True retrains the best configuration on the full training set.
model = GridSearchCV(
    pipeline,
    param_grid,
    scoring="balanced_accuracy",
    cv=10,
    n_jobs=-1,
    refit=True,
)

model


In [515]:
# 7) Fit the grid search with cross-validation on the training data

model.fit(X_train, y_train)

print("Mejores hiperparámetros:", model.best_params_)
print("Mejor balanced_accuracy CV:", model.best_score_)
# GridSearchCV.score() evaluates with the metric given as `scoring`
# (balanced accuracy for this search), not plain accuracy, so the labels
# name that metric to avoid misreading the numbers.
print("Balanced accuracy train:", model.score(X_train, y_train))
print("Balanced accuracy test:", model.score(X_test, y_test))


Mejores hiperparámetros: {'logreg__C': 0.01, 'logreg__class_weight': 'balanced', 'selectkbest__k': 'all'}
Mejor balanced_accuracy CV: 0.7043533846548613
Accuracy train: 0.705611803604177
Accuracy test: 0.708339738750579


In [516]:
# 8) Persist the whole fitted GridSearchCV object as a gzip-compressed pickle

model_path = Path("../files/models/model.pkl.gz")
# Make sure the destination folder exists before writing.
model_path.parent.mkdir(parents=True, exist_ok=True)

with gzip.open(model_path, mode="wb") as gz_file:
    pickle.dump(model, gz_file)

# Show the path and confirm that the file was created.
model_path, model_path.exists()


(WindowsPath('../files/models/model.pkl.gz'), True)

In [None]:
# 9) elegir automáticamente un umbral y calcular métricas reales

from sklearn.metrics import (
    precision_score,
    balanced_accuracy_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

# Predicted probability of the positive class (default = 1) for each split
y_train_proba, y_test_proba = (
    model.predict_proba(features)[:, 1] for features in (X_train, X_test)
)


def metrics_from_proba(y_true, proba, threshold: float, dataset_name: str):
    """Compute classification metrics and the confusion matrix at a threshold.

    Parameters
    ----------
    y_true : array-like
        True binary labels (0 / 1).
    proba : array-like
        Predicted probability of the positive class, aligned with ``y_true``.
    threshold : float
        A sample is predicted as 1 when its probability is ``>= threshold``.
    dataset_name : str
        Value stored under the ``"dataset"`` key of both returned records.

    Returns
    -------
    tuple[dict, dict]
        ``(metrics_dict, cm_dict)``: the first holds precision, balanced
        accuracy, recall and F1; the second holds the confusion-matrix counts
        as ``true_<label> -> predicted_<label>``.
    """
    # np.asarray lets plain lists be thresholded as well as arrays/Series.
    y_pred = (np.asarray(proba) >= threshold).astype(int)

    # zero_division=0 reports 0 instead of warning when a metric is undefined.
    metrics_dict = {
        "type": "metrics",
        "dataset": dataset_name,
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1_score": f1_score(y_true, y_pred, zero_division=0),
    }

    # Pin the label order so the matrix is always 2x2, even when only one
    # class shows up in y_true / y_pred; without `labels` the matrix would
    # shrink to 1x1 and the indexing below would raise IndexError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    # cm = [[tn, fp],
    #       [fn, tp]]
    cm_dict = {
        "type": "cm_matrix",
        "dataset": dataset_name,
        "true_0": {
            "predicted_0": int(cm[0, 0]),
            "predicted_1": int(cm[0, 1]),
        },
        "true_1": {
            "predicted_0": int(cm[1, 0]),
            "predicted_1": int(cm[1, 1]),
        },
    }

    return metrics_dict, cm_dict


# -------------------------------------------------------------------------
# Threshold search: favour precision without collapsing recall.
# Objective: maximise (balanced_accuracy + precision) on TRAIN, subject to
# TRAIN recall being at least `min_train_recall`.
# NOTE(review): the threshold is tuned on probabilities predicted for the
# same data the model was fitted on; out-of-fold predictions would give a
# less optimistic choice — consider switching.
# -------------------------------------------------------------------------

thresholds = np.linspace(0.3, 0.8, 101)  # from 0.30 to 0.80 in steps of 0.005
min_train_recall = 0.30  # thresholds that detect fewer defaults are rejected

best_threshold = 0.5  # fallback, used only if no candidate meets the recall floor
best_score = -np.inf

for thr in thresholds:
    y_pred_train = (y_train_proba >= thr).astype(int)

    # Check the constraint first so rejected thresholds skip the other metrics.
    r_train = recall_score(y_train, y_pred_train, zero_division=0)
    if r_train < min_train_recall:
        continue

    p_train = precision_score(y_train, y_pred_train, zero_division=0)
    bacc_train = balanced_accuracy_score(y_train, y_pred_train)

    # Objective: reward precision while keeping balanced accuracy in the sum.
    score = bacc_train + p_train

    # Strict comparison keeps the lowest threshold among tied scores.
    if score > best_score:
        best_score = score
        best_threshold = thr

if np.isfinite(best_score):
    print(f"Umbral elegido: {best_threshold:.3f}  (score objetivo = {best_score:.4f})")
else:
    # No candidate satisfied the constraint: say so instead of printing -inf.
    print(
        f"Ningún umbral alcanzó recall >= {min_train_recall:.2f} en train; "
        f"se usa el umbral por defecto {best_threshold:.3f}"
    )

# -------------------------------------------------------------------------
# Using the selected threshold, compute the final TRAIN and TEST metrics
# and store them in files/output/metrics.json (one JSON record per line)
# -------------------------------------------------------------------------

train_metrics, train_cm = metrics_from_proba(
    y_train, y_train_proba, best_threshold, "train"
)
test_metrics, test_cm = metrics_from_proba(
    y_test, y_test_proba, best_threshold, "test"
)

# NOTE(review): this path has no "../" prefix, unlike the input and model
# paths used earlier in this notebook — confirm the intended output folder.
output_path = Path("files/output/metrics.json")
output_path.parent.mkdir(parents=True, exist_ok=True)

# Record order: both metrics records first, then both confusion matrices.
records = (train_metrics, test_metrics, train_cm, test_cm)
with output_path.open("w", encoding="utf-8") as out_file:
    out_file.writelines(json.dumps(record) + "\n" for record in records)

# Echo the file back to check what was written.
with output_path.open("r", encoding="utf-8") as in_file:
    for json_line in in_file:
        print(json_line.strip())


Umbral elegido: 0.710  (score objetivo = 1.3143)
{"type": "metrics", "dataset": "train", "precision": 0.650692225772098, "balanced_accuracy": 0.6636520555872418, "recall": 0.38777237148297017, "f1_score": 0.485949098621421}
{"type": "metrics", "dataset": "test", "precision": 0.6412776412776413, "balanced_accuracy": 0.6741969749095036, "recall": 0.41016238868517546, "f1_score": 0.5003194888178913}
{"type": "cm_matrix", "dataset": "train", "true_0": {"predicted_0": 15289, "predicted_1": 984}, "true_1": {"predicted_0": 2894, "predicted_1": 1833}}
{"type": "cm_matrix", "dataset": "test", "true_0": {"predicted_0": 6653, "predicted_1": 438}, "true_1": {"predicted_0": 1126, "predicted_1": 783}}


In [518]:
# 10) Print the contents of metrics.json, one record per line

metrics_file = Path("files/output/metrics.json")
with metrics_file.open("r", encoding="utf-8") as in_file:
    for json_line in in_file:
        print(json_line.strip())


{"type": "metrics", "dataset": "train", "precision": 0.650692225772098, "balanced_accuracy": 0.6636520555872418, "recall": 0.38777237148297017, "f1_score": 0.485949098621421}
{"type": "metrics", "dataset": "test", "precision": 0.6412776412776413, "balanced_accuracy": 0.6741969749095036, "recall": 0.41016238868517546, "f1_score": 0.5003194888178913}
{"type": "cm_matrix", "dataset": "train", "true_0": {"predicted_0": 15289, "predicted_1": 984}, "true_1": {"predicted_0": 2894, "predicted_1": 1833}}
{"type": "cm_matrix", "dataset": "test", "true_0": {"predicted_0": 6653, "predicted_1": 438}, "true_1": {"predicted_0": 1126, "predicted_1": 783}}
