In [1]:
import gzip
import json
import os
import pickle
from typing import Tuple

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import (
    balanced_accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Rutas y configuración global
# ---------------------------------------------------------------------

TRAIN_PATH = "../files/input/train_data.csv.zip"
TEST_PATH = "../files/input/test_data.csv.zip"

MODEL_FILENAME = "../files/models/model.pkl.gz"
METRICS_FILENAME = "../files/output/metrics.json"

CATEGORICAL = ["SEX", "EDUCATION", "MARRIAGE"]
N_FOLDS = 10
RANDOM_STATE = 420
N_JOBS = -1

# Rejilla de hiperparámetros para el MLP dentro del pipeline
PARAM_GRID = {
"pca__n_components": [20],
"selectkbest__k": [15,20], 
"classifier__hidden_layer_sizes": [(50,30,40,60)], 
"classifier__alpha": [0.256], 
"classifier__learning_rate": ["adaptive"],
"classifier__activation": ["relu"],
"classifier__solver": ["adam", "lbfgs"],
"classifier__learning_rate_init": [0.001],
}


In [3]:
# 1. Limpieza y carga de datos
# ---------------------------------------------------------------------
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Limpieza según las instrucciones del taller."""
    # Renombrar columna objetivo
    if "default payment next month" in df.columns:
        df = df.rename(columns={"default payment next month": "default"})

    # Remover columna ID si existe
    if "ID" in df.columns:
        df = df.drop(columns=["ID"])

    # Eliminar filas con valores faltantes
    df = df.dropna()

    # EDUCATION: agrupar valores > 4 en la categoría "others" (4)
    if "EDUCATION" in df.columns:
        df["EDUCATION"] = df["EDUCATION"].astype(int)
        df.loc[df["EDUCATION"] > 4, "EDUCATION"] = 4

    
    # eliminar registros con EDUCATION == 0 o MARRIAGE == 0
    if "EDUCATION" in df.columns and "MARRIAGE" in df.columns:
        df = df.query("EDUCATION != 0 and MARRIAGE != 0")

    return df

In [4]:

def load_clean_data(
    train_path: str = TRAIN_PATH,
    test_path: str = TEST_PATH,
) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
    """Carga train/test, aplica limpieza y divide en X/y."""
    train = pd.read_csv(train_path, index_col=False, compression="zip")
    test = pd.read_csv(test_path, index_col=False, compression="zip")

    train = clean_data(train)
    test = clean_data(test)

    target = "default"

    x_train = train.drop(columns=[target])
    y_train = train[target].astype(int)

    x_test = test.drop(columns=[target])
    y_test = test[target].astype(int)

    return x_train, y_train, x_test, y_test


In [5]:
# 2. Pipeline + GridSearchCV con MLP
# ---------------------------------------------------------------------
def make_pipeline() -> Pipeline:
    """Crea el pipeline con OneHotEncoder + StandardScaler + PCA + SelectKBest + MLP."""
    x_train, _, _, _ = load_clean_data()

    numeric = [c for c in x_train.columns if c not in CATEGORICAL]

    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), CATEGORICAL),
            ("num", StandardScaler(), numeric),
        ],
        remainder="drop",  # no debería haber más columnas, pero por claridad
    )

    mlp = MLPClassifier(
        random_state=RANDOM_STATE,
        max_iter=1000,
    )

    pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("selectkbest", SelectKBest(score_func=f_classif, k="all")),
            ("pca", PCA()),
            ("classifier", mlp),
        ],
        verbose=False,
    )

    return pipeline



In [6]:

def make_grid_search(
    pipeline: Pipeline,
    param_grid=None,
    n_folds: int = N_FOLDS,
    n_jobs: int = N_JOBS,
) -> GridSearchCV:
    """Envuelve el pipeline en un GridSearchCV usando balanced_accuracy."""
    if param_grid is None:
        param_grid = PARAM_GRID

    cv = StratifiedKFold(
        n_splits=n_folds,
        shuffle=True,
        random_state=RANDOM_STATE,
    )

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=n_folds,
        scoring="balanced_accuracy",
        n_jobs=n_jobs,
        verbose=0,
    )

    return grid_search

In [7]:
# 3. Guardar / cargar modelo
# ---------------------------------------------------------------------
def save_model(model) -> None:
    """Guarda el modelo comprimido en MODEL_FILENAME."""
    os.makedirs(os.path.dirname(MODEL_FILENAME), exist_ok=True)
    with gzip.open(MODEL_FILENAME, "wb") as f:
        pickle.dump(model, f)


In [8]:

def load_model():
    """Carga el modelo desde MODEL_FILENAME si existe, sino devuelve None."""
    if not os.path.exists(MODEL_FILENAME):
        return None
    with gzip.open(MODEL_FILENAME, "rb") as f:
        model = pickle.load(f)
    return model


In [9]:
# 4. Entrenamiento 
# ---------------------------------------------------------------------
def train_model(model: GridSearchCV) -> None:
    """Ajusta el GridSearchCV y guarda el mejor modelo (según balanced_accuracy)."""
    x_train, y_train, x_test, y_test = load_clean_data()

    # Entrenar GridSearchCV
    model.fit(x_train, y_train)

    # Comparar con modelo previamente guardado (si existe) usando balanced_accuracy en test
    best_model = load_model()
    if best_model is not None:
        saved_bas = balanced_accuracy_score(y_test, best_model.predict(x_test))
        current_bas = balanced_accuracy_score(y_test, model.predict(x_test))
        if saved_bas >= current_bas:
            # Si el modelo anterior es mejor o igual, lo conservamos
            model = best_model

    save_model(model)



In [10]:

def train_mlp_model(param_grid=None) -> None:
    """Función de alto nivel para entrenar el modelo MLP."""
    pipeline = make_pipeline()
    grid = make_grid_search(pipeline, param_grid=param_grid)
    train_model(grid)

In [11]:
# 5. Métricas y matrices de confusión
# ---------------------------------------------------------------------
def eval_metrics(model, x_train, y_train, x_test, y_test):
    """Calcula métricas y matrices de confusión en train y test."""

    y_pred_train = model.predict(x_train)
    y_pred_test = model.predict(x_test)

    def metrics_dict(y_true, y_pred, dataset):
        return {
            "type": "metrics",
            "dataset": dataset,
            "precision": float(precision_score(y_true, y_pred, zero_division=0)),
            "balanced_accuracy": float(balanced_accuracy_score(y_true, y_pred)),
            "recall": float(recall_score(y_true, y_pred, zero_division=0)),
            "f1_score": float(f1_score(y_true, y_pred, zero_division=0)),
        }

    def cm_dict(y_true, y_pred, dataset):
        cm = confusion_matrix(y_true.astype(int), y_pred.astype(int), labels=[0, 1])
        tn, fp, fn, tp = int(cm[0, 0]), int(cm[0, 1]), int(cm[1, 0]), int(cm[1, 1])
        return {
            "type": "cm_matrix",
            "dataset": dataset,
            "true_0": {"predicted_0": tn, "predicted_1": fp},
            "true_1": {"predicted_0": fn, "predicted_1": tp},
        }

    metrics_train = metrics_dict(y_train, y_pred_train, "train")
    metrics_test = metrics_dict(y_test, y_pred_test, "test")
    cm_train = cm_dict(y_train, y_pred_train, "train")
    cm_test = cm_dict(y_test, y_pred_test, "test")

    return metrics_train, metrics_test, cm_train, cm_test



In [12]:

def save_report(
    metrics_train,
    metrics_test,
    cm_train,
    cm_test,
) -> None:
    """Escribe metrics.json con 4 líneas (2 métricas + 2 matrices de confusión)."""
    os.makedirs(os.path.dirname(METRICS_FILENAME), exist_ok=True)
    with open(METRICS_FILENAME, "w", encoding="utf-8", newline="\n") as f:
        f.write(json.dumps(metrics_train) + "\n")
        f.write(json.dumps(metrics_test) + "\n")
        f.write(json.dumps(cm_train) + "\n")
        f.write(json.dumps(cm_test) + "\n")

In [13]:


def print_report(metrics_train, metrics_test, cm_train=None, cm_test=None) -> None:
    """Imprime un resumen compacto de métricas (test (train))."""

    def fmt(name, test_val, train_val):
        return f"{name:>20}: {test_val:.4f} ({train_val:.4f})"

    print("-" * 80)
    print("Metrics summary (test (train))")
    print("-" * 80)
    print(
        fmt(
            "Balanced Accuracy",
            metrics_test["balanced_accuracy"],
            metrics_train["balanced_accuracy"],
        )
    )
    print(fmt("Precision", metrics_test["precision"], metrics_train["precision"]))
    print(fmt("Recall", metrics_test["recall"], metrics_train["recall"]))
    print(fmt("F1-score", metrics_test["f1_score"], metrics_train["f1_score"]))
    if cm_test and cm_train:
        print("-" * 80)
        print("Confusion matrix (test):")
        print(
            f" true_0 -> predicted_0: {cm_test['true_0']['predicted_0']}, "
            f"predicted_1: {cm_test['true_0']['predicted_1']}"
        )
        print(
            f" true_1 -> predicted_0: {cm_test['true_1']['predicted_0']}, "
            f"predicted_1: {cm_test['true_1']['predicted_1']}"
        )
    print("-" * 80)


In [14]:

def check_estimator() -> None:
    """Carga datos, evalúa el modelo guardado y genera metrics.json."""
    x_train, y_train, x_test, y_test = load_clean_data()
    model = load_model()
    if model is None:
        raise RuntimeError("No se encontró el modelo entrenado en files/models.")

    metrics_train, metrics_test, cm_train, cm_test = eval_metrics(
        model, x_train, y_train, x_test, y_test
    )

    save_report(metrics_train, metrics_test, cm_train, cm_test)
    print_report(metrics_train, metrics_test, cm_train, cm_test)

In [15]:

def print_get_params() -> None:
    """Imprime todos los parámetros del GridSearchCV guardado."""
    model = load_model()
    if model is None:
        print("No model found.")
        return
    print("Get model parameters:")
    for param, value in model.get_params().items():
        print(f"  {param}: {value}")

In [16]:

def print_best_model_params() -> None:
    """Imprime los mejores hiperparámetros encontrados por GridSearchCV."""
    model = load_model()
    if model is None:
        print("No model found.")
        return
    if not hasattr(model, "best_params_"):
        print("El modelo cargado no tiene atributo best_params_.")
        return
    print("Best model parameters:")
    for param, value in model.best_params_.items():
        print(f"  {param}: {value}")

In [17]:
# 6. Ejecución directa del script
# ---------------------------------------------------------------------
if __name__ == "__main__":
    # Entrena el modelo MLP y guarda el mejor en files/models/model.pkl.gz
    train_mlp_model()

    # Evalúa el modelo guardado y genera files/output/metrics.json
    check_estimator()

    #  imprime los mejores hiperparámetros encontrados
    print_best_model_params()

20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Semestre 2025-2\Analítica predictiva\LAB-04-prediccion-del-default-usando-mlp-DanielZambrano00\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Semestre 2025-2\Analítica predictiva\LAB-04-prediccion-del-default-usando-mlp-DanielZambrano00\.venv\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\Semestre 2025-2\Analítica predictiva\LAB-04-prediccion-del-default-usando-mlp-DanielZambrano00\.venv\lib\s

--------------------------------------------------------------------------------
Metrics summary (test (train))
--------------------------------------------------------------------------------
   Balanced Accuracy: 0.6704 (0.6674)
           Precision: 0.6801 (0.7016)
              Recall: 0.3903 (0.3822)
            F1-score: 0.4960 (0.4949)
--------------------------------------------------------------------------------
Confusion matrix (test):
 true_0 -> predicted_0: 6723, predicted_1: 350
 true_1 -> predicted_0: 1162, predicted_1: 744
--------------------------------------------------------------------------------
Best model parameters:
  classifier__activation: relu
  classifier__alpha: 0.256
  classifier__hidden_layer_sizes: (50, 30, 40, 60)
  classifier__learning_rate: adaptive
  classifier__learning_rate_init: 0.001
  classifier__solver: adam
  pca__n_components: 20
  selectkbest__k: 20
