In [50]:
import pandas as pd

def cargar_y_limpiar(ruta: str) -> pd.DataFrame:
    """Load a CSV dataset and apply the cleaning steps used by this notebook.

    Steps, in order:
      1. Rename the target column "default payment next month" to "default".
      2. Drop the "ID" column when it is present.
      3. Drop every row that contains a missing value.
      4. Recode EDUCATION values outside the range 0..4 to 4 ("others").

    Missing values are removed *before* the EDUCATION recoding on purpose:
    `Series.between` evaluates to False for NaN, so recoding first would
    silently turn a missing EDUCATION into category 4 and that row would
    then survive `dropna`.

    NOTE(review): only NaN is treated as "not available" here. If the dataset
    encodes unknown categories with a sentinel such as 0, those rows are kept;
    confirm against the data dictionary.

    Args:
        ruta: Path (or any other source accepted by `pandas.read_csv`) of the
            CSV file to load.

    Returns:
        A new, cleaned DataFrame; the row index of the surviving rows is kept
        as read from the file.
    """
    df = pd.read_csv(ruta)

    # Shorter target name; a no-op when the long name is absent.
    df = df.rename(columns={"default payment next month": "default"})

    # The identifier carries no predictive information.
    df = df.drop(columns=["ID"], errors="ignore")

    # Remove incomplete records first so NaN cannot be recoded below.
    df = df.dropna()

    # `assign` builds a new frame, avoiding chained-assignment warnings on the
    # result of `dropna`.
    education = df["EDUCATION"]
    return df.assign(EDUCATION=education.where(education.between(0, 4), 4))

# Load and clean both splits with the same routine.
train, test = (
    cargar_y_limpiar(ruta)
    for ruta in (
        "../files/input/train_data.csv.zip",
        "../files/input/test_data.csv.zip",
    )
)



In [51]:
# Step 2: split each dataset into its feature matrix and target vector.
# Work on a copy so `train` / `test` keep their "default" column.

x_train = train.copy()
y_train = x_train.pop("default")

x_test = test.copy()
y_test = x_test.pop("default")

In [52]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Step 3: pipeline definition (the solver is allowed many iterations).
# Columns that are one-hot encoded.
categorical_cols = ["SEX", "EDUCATION", "MARRIAGE"] + [
    "PAY_0",
    "PAY_2",
    "PAY_3",
    "PAY_4",
    "PAY_5",
    "PAY_6",
]

# Every remaining feature is scaled as numeric; the column order is kept.
numeric_cols = x_train.columns[~x_train.columns.isin(categorical_cols)].tolist()

# One-hot encode the categorical columns (categories unseen during fit are
# ignored at transform time) and scale the numeric ones with MinMaxScaler.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", MinMaxScaler(), numeric_cols),
    ]
)

# preprocess -> univariate feature selection (ANOVA F-test) -> logistic regression
pipeline = Pipeline(
    [
        ("preprocess", preprocessor),
        ("select", SelectKBest(f_classif)),
        ("logreg", LogisticRegression(max_iter=10000)),
    ]
)

# Step 4: hyper-parameter search with 10-fold cross-validation, scored by
# balanced accuracy and run on all available cores.
param_grid = {
    "select__k": [20, 40, "all"],
    "logreg__C": [0.01, 0.1, 1, 10],
    "logreg__class_weight": [None, "balanced"],
}

# `fit` returns the estimator itself, so the search can be built and fitted
# in one expression.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring="balanced_accuracy",
    cv=10,
    n_jobs=-1,
).fit(x_train, y_train)

# Pipeline refitted on the whole training set with the best parameters found.
best_model = grid_search.best_estimator_


In [53]:
import gzip
import pickle
from pathlib import Path

# Step 5: persist the fitted GridSearchCV object as a gzip-compressed pickle.
MODEL_PATH = Path("../files/models/model.pkl.gz")

# Create the destination folder (and any missing parents) if necessary.
MODEL_PATH.parent.mkdir(exist_ok=True, parents=True)

with gzip.open(MODEL_PATH, mode="wb") as salida:
    pickle.dump(grid_search, salida)


In [54]:
# ===== Paso 6 y 7: métricas + matrices de confusión =====
import json
from pathlib import Path
from sklearn.metrics import (
    precision_score,
    balanced_accuracy_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

# Take the best pipeline found by the grid search ...
best_model = grid_search.best_estimator_

# ... and predict both splits with the model as-is (no custom threshold).
y_train_pred, y_test_pred = (
    best_model.predict(datos) for datos in (x_train, x_test)
)

def calcular_metricas(y_true, y_pred, dataset_name: str):
    """Build the metrics record for one dataset split.

    Precision, recall and F1 are computed with class 0 as the positive label;
    balanced accuracy does not depend on that choice.

    NOTE(review): scikit-learn's default positive label is 1. Reporting the
    scores for class 0 is what the original code does; confirm it is intended.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        dataset_name: Value stored under the "dataset" key (e.g. "train").

    Returns:
        A dict with the keys "type", "dataset", "precision",
        "balanced_accuracy", "recall" and "f1_score", in that order.
    """
    resultado = {"type": "metrics", "dataset": dataset_name}
    resultado["precision"] = precision_score(y_true, y_pred, pos_label=0)
    resultado["balanced_accuracy"] = balanced_accuracy_score(y_true, y_pred)
    resultado["recall"] = recall_score(y_true, y_pred, pos_label=0)
    resultado["f1_score"] = f1_score(y_true, y_pred, pos_label=0)
    return resultado

def _cm_a_dict(cm, dataset_name):
    """Convert a 2x2 confusion matrix (rows = true class, columns = predicted
    class, both ordered 0 then 1) into the JSON-serialisable record format."""
    (c00, c01), (c10, c11) = cm.tolist()
    return {
        "type": "cm_matrix",
        "dataset": dataset_name,
        "true_0": {"predicted_0": c00, "predicted_1": c01},
        "true_1": {"predicted_0": c10, "predicted_1": c11},
    }


# Metric records for both splits.
metrics_train = calcular_metricas(y_train, y_train_pred, "train")
metrics_test = calcular_metricas(y_test, y_test_pred, "test")

# Confusion matrices with the classes fixed in the order 0, 1.
cm_train = confusion_matrix(y_train, y_train_pred, labels=[0, 1])
cm_test = confusion_matrix(y_test, y_test_pred, labels=[0, 1])

cm_train_dict = _cm_a_dict(cm_train, "train")
cm_test_dict = _cm_a_dict(cm_test, "test")

OUTPUT_PATH = Path("../files/output/metrics.json")
OUTPUT_PATH.parent.mkdir(exist_ok=True, parents=True)

# One JSON document per line, in this order: train metrics, test metrics,
# train confusion matrix, test confusion matrix.
with open(OUTPUT_PATH, "w") as salida:
    salida.writelines(
        json.dumps(registro) + "\n"
        for registro in (metrics_train, metrics_test, cm_train_dict, cm_test_dict)
    )


In [55]:
# Paso 6.
# Calcular métricas para train y test y guardarlas en files/output/metrics.json

import json
from pathlib import Path
from sklearn.metrics import (
    precision_score,
    balanced_accuracy_score,
    recall_score,
    f1_score,
)

# Best pipeline selected by the grid search.
best_model = grid_search.best_estimator_

# Predicted labels for the training and test splits.
y_train_pred, y_test_pred = (
    best_model.predict(datos) for datos in (x_train, x_test)
)

def calcular_metricas(y_true, y_pred, dataset_name: str):
    """Build the metrics record for one dataset split.

    Precision, recall and F1 are computed with class 0 (no default) as the
    positive label; balanced accuracy does not take a positive label.

    NOTE(review): scikit-learn's default positive label is 1. Reporting the
    scores for class 0 is what the original code does; confirm it is intended.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        dataset_name: Value stored under the "dataset" key (e.g. "test").

    Returns:
        A dict with the keys "type", "dataset", "precision",
        "balanced_accuracy", "recall" and "f1_score", in that order.
    """
    resultado = {"type": "metrics", "dataset": dataset_name}
    resultado["precision"] = precision_score(y_true, y_pred, pos_label=0)
    resultado["balanced_accuracy"] = balanced_accuracy_score(y_true, y_pred)
    resultado["recall"] = recall_score(y_true, y_pred, pos_label=0)
    resultado["f1_score"] = f1_score(y_true, y_pred, pos_label=0)
    return resultado

# Metric records for both splits.
metrics_train = calcular_metricas(y_train, y_train_pred, "train")
metrics_test = calcular_metricas(y_test, y_test_pred, "test")

OUTPUT_PATH = Path("../files/output/metrics.json")
OUTPUT_PATH.parent.mkdir(exist_ok=True, parents=True)

# (Re)create the file with one JSON document per line: train first, then test.
OUTPUT_PATH.write_text(
    "".join(json.dumps(registro) + "\n" for registro in (metrics_train, metrics_test))
)


In [56]:
# Paso 7.
# Calcular y guardar matrices de confusión en files/output/metrics.json

import json
from pathlib import Path

# Imported here so this cell also runs when the combined step 6/7 cell above
# has not been executed in the current kernel.
from sklearn.metrics import confusion_matrix

OUTPUT_PATH = Path("../files/output/metrics.json")


def _registro_cm(y_true, y_pred, dataset_name: str) -> dict:
    """Build the JSON record holding the confusion matrix of one split.

    The matrix is computed from the actual predictions, with rows = true class
    and columns = predicted class, both in the fixed order 0, 1.

    Args:
        y_true: True labels.
        y_pred: Labels predicted by the model for the same rows.
        dataset_name: Value stored under the "dataset" key.

    Returns:
        A dict with the keys "type", "dataset", "true_0" and "true_1".
    """
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    return {
        "type": "cm_matrix",
        "dataset": dataset_name,
        "true_0": {
            "predicted_0": int(cm[0, 0]),
            "predicted_1": int(cm[0, 1]),
        },
        "true_1": {
            "predicted_0": int(cm[1, 0]),
            "predicted_1": int(cm[1, 1]),
        },
    }


# Real confusion matrices. The previous version of this cell wrote the class
# counts on the diagonal (a "perfect" matrix), which did not reflect the
# model's predictions and contradicted the metrics saved in step 6.
cm_train_dict = _registro_cm(y_train, y_train_pred, "train")
cm_test_dict = _registro_cm(y_test, y_test_pred, "test")

# Class counts, used as a sanity check: each matrix row must add up to the
# number of samples of that true class.
n0_train = int((y_train == 0).sum())
n1_train = int((y_train == 1).sum())
n0_test = int((y_test == 0).sum())
n1_test = int((y_test == 1).sum())

assert sum(cm_train_dict["true_0"].values()) == n0_train
assert sum(cm_train_dict["true_1"].values()) == n1_train
assert sum(cm_test_dict["true_0"].values()) == n0_test
assert sum(cm_test_dict["true_1"].values()) == n1_test

# Keep whatever step 6 wrote, but discard confusion-matrix records left by a
# previous run of this cell, so that re-running it does not duplicate lines
# (plain append mode did).
registros_previos = []
if OUTPUT_PATH.exists():
    with open(OUTPUT_PATH) as f:
        registros_previos = [
            linea if linea.endswith("\n") else linea + "\n"
            for linea in f
            if linea.strip() and json.loads(linea).get("type") != "cm_matrix"
        ]

with open(OUTPUT_PATH, "w") as f:
    f.writelines(registros_previos)
    f.write(json.dumps(cm_train_dict) + "\n")
    f.write(json.dumps(cm_test_dict) + "\n")
