In [26]:
# Paso 1. Limpieza de los datasets

import pandas as pd

def clean_dataset(frame):
    """Apply the Step 1 cleaning rules to one raw dataset and return a new frame.

    - renames the target column "default payment next month" to "default";
    - drops the "ID" column when it is present;
    - folds every EDUCATION code above 4 into code 4 ("others");
    - discards rows that contain NaN.
    """
    return (
        frame
        .rename(columns={"default payment next month": "default"})
        .drop(columns=["ID"], errors="ignore")
        # mask() leaves NaN untouched (NaN > 4 is False), so such rows are still
        # removed by dropna() below instead of being recoded.
        .assign(EDUCATION=lambda d: d["EDUCATION"].mask(d["EDUCATION"] > 4, 4))
        # NOTE(review): dropna() only removes NaN. If the data encodes "not
        # available" with a sentinel value (e.g. 0 in EDUCATION / MARRIAGE),
        # those rows are kept - confirm against the data dictionary.
        .dropna()
    )


# Load the raw train / test splits and clean both with the same rules.
train = clean_dataset(pd.read_csv("../files/input/train_data.csv.zip"))
test = clean_dataset(pd.read_csv("../files/input/test_data.csv.zip"))

# Final shapes after cleaning
train.shape, test.shape


((21000, 24), (9000, 24))

In [27]:
# Step 2. Split each dataset into model inputs (x) and the target (y).

target_col = "default"

x_train, y_train = train.drop(columns=[target_col]), train[target_col]
x_test, y_test = test.drop(columns=[target_col]), test[target_col]

x_train.shape, x_test.shape, y_train.shape, y_test.shape


((21000, 23), (9000, 23), (21000,), (9000,))

In [28]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline

# Categorical columns get one-hot encoded; every other input column is numeric.
categorical_cols = ["SEX", "EDUCATION", "MARRIAGE"]
numeric_cols = [col for col in x_train.columns if col not in categorical_cols]

# Only the categorical columns are transformed here. The remaining columns pass
# through unchanged and are scaled later by the StandardScaler pipeline step.
one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
preprocessor = ColumnTransformer(
    transformers=[("cat", one_hot, categorical_cols)],
    remainder="passthrough",
)

# Neural network with early stopping: 10% of the training data is held out and
# training stops after 10 iterations without improvement.
mlp = MLPClassifier(
    max_iter=10000,
    random_state=42,
    early_stopping=True,
    n_iter_no_change=10,
    validation_fraction=0.1,
)

# Full pipeline, in the order the test expects:
# OneHotEncoder -> StandardScaler -> PCA -> SelectKBest -> MLPClassifier
pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("scale", StandardScaler()),
        ("pca", PCA()),
        ("select", SelectKBest(score_func=f_classif)),
        ("mlp", mlp),
    ]
)


In [29]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter search space. Keys follow the "<pipeline step>__<parameter>" convention.
k_candidates = [10, 15, 20, 25]                       # features kept by SelectKBest
hidden_layouts = [(50,), (100,), (150,), (100, 50)]   # network architectures
l2_penalties = [1e-5, 1e-4, 5e-4, 1e-3]               # L2 regularization strength
initial_learning_rates = [0.001, 0.01]                # initial learning rate

param_grid = {
    "select__k": k_candidates,
    "mlp__hidden_layer_sizes": hidden_layouts,
    "mlp__alpha": l2_penalties,
    "mlp__learning_rate_init": initial_learning_rates,
}

# Exhaustive search scored by balanced accuracy with 10-fold cross-validation,
# using every available core.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    cv=10,
    n_jobs=-1,
)

grid_search.fit(x_train, y_train)


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [30]:
# Paso 5. Guardar el modelo entrenado comprimido con gzip

import gzip
import pickle
from pathlib import Path

MODEL_PATH = Path("../files/models/model.pkl.gz")
models_dir = MODEL_PATH.parent
models_dir.mkdir(parents=True, exist_ok=True)

# Persist the whole fitted GridSearchCV object (not just its best estimator)
# as a gzip-compressed pickle.
with gzip.open(MODEL_PATH, mode="wb") as compressed:
    pickle.dump(grid_search, compressed)


In [33]:
# Paso 6. Cálculo de métricas y guardado en metrics.json

import json
from pathlib import Path
from sklearn.metrics import (
    precision_score,
    balanced_accuracy_score,
    recall_score,
    f1_score,
)

# Predictions from the best pipeline found by the grid search.
# `best_model` is bound explicitly here so the cell runs on a fresh kernel
# (Restart & Run All) instead of relying on a variable left over in memory.
best_model = grid_search.best_estimator_
y_train_pred = best_model.predict(x_train)
y_test_pred = best_model.predict(x_test)

def build_metrics_dict(dataset_name, y_true, y_pred):
    """Build the "metrics" record for one dataset.

    Parameters
    ----------
    dataset_name : label stored under the "dataset" key (e.g. "train" / "test").
    y_true, y_pred : true and predicted labels passed to each scikit-learn scorer.

    Returns
    -------
    dict with "type", "dataset" and one entry per metric, in that order.
    """
    scorers = {
        "precision": precision_score,
        "balanced_accuracy": balanced_accuracy_score,
        "recall": recall_score,
        "f1_score": f1_score,
    }
    record = {"type": "metrics", "dataset": dataset_name}
    for metric_name, scorer in scorers.items():
        record[metric_name] = scorer(y_true, y_pred)
    return record

metrics_train = build_metrics_dict("train", y_train, y_train_pred)
metrics_test = build_metrics_dict("test", y_test, y_test_pred)

OUTPUT_PATH = Path("../files/output/metrics.json")
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

# One JSON document per line; "w" mode replaces any previous file content.
with OUTPUT_PATH.open("w", encoding="utf-8") as out:
    for record in (metrics_train, metrics_test):
        out.write(json.dumps(record) + "\n")

metrics_train, metrics_test


({'type': 'metrics',
  'dataset': 'train',
  'precision': 0.6825757575757576,
  'balanced_accuracy': np.float64(0.6648589785941743),
  'recall': 0.3812143008250476,
  'f1_score': 0.4892086330935252},
 {'type': 'metrics',
  'dataset': 'test',
  'precision': 0.6522491349480969,
  'balanced_accuracy': np.float64(0.6691398041135375),
  'recall': 0.39497118910424306,
  'f1_score': 0.4920065252854812})

In [32]:
# Paso 7. Matrices de confusión y agregado a metrics.json

from sklearn.metrics import confusion_matrix

# Confusion matrices with a fixed label order so that index [i, j] always means
# "true class i, predicted class j".
cm_train = confusion_matrix(y_train, y_train_pred, labels=[0, 1])
cm_test = confusion_matrix(y_test, y_test_pred, labels=[0, 1])


def _cm_record(dataset, matrix):
    """Convert a 2x2 confusion matrix into the "cm_matrix" record for metrics.json.

    Counts are cast to plain ``int`` so that ``json.dumps`` can serialize them.
    """
    return {
        "type": "cm_matrix",
        "dataset": dataset,
        "true_0": {
            "predicted_0": int(matrix[0, 0]),
            "predicted_1": int(matrix[0, 1]),
        },
        "true_1": {
            "predicted_0": int(matrix[1, 0]),
            "predicted_1": int(matrix[1, 1]),
        },
    }


cm_train_dict = _cm_record("train", cm_train)
cm_test_dict = _cm_record("test", cm_test)

# Rewrite the complete file (metrics rows followed by confusion-matrix rows)
# instead of appending: with append mode, every re-run of this cell added a
# duplicate pair of cm_matrix rows. On a linear run the resulting file is the
# same four lines as before.
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    for row in (metrics_train, metrics_test, cm_train_dict, cm_test_dict):
        f.write(json.dumps(row) + "\n")

cm_train_dict, cm_test_dict


({'type': 'cm_matrix',
  'dataset': 'train',
  'true_0': {'predicted_0': 15435, 'predicted_1': 838},
  'true_1': {'predicted_0': 2925, 'predicted_1': 1802}},
 {'type': 'cm_matrix',
  'dataset': 'test',
  'true_0': {'predicted_0': 6689, 'predicted_1': 402},
  'true_1': {'predicted_0': 1155, 'predicted_1': 754}})

In [34]:
# Paso 6 y Paso 7: métricas + matrices de confusión y guardado en metrics.json

import json
from pathlib import Path
from sklearn.metrics import (
    precision_score,
    balanced_accuracy_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

# Predict both splits with the already-fitted GridSearchCV object (grid_search).
y_train_pred, y_test_pred = (
    grid_search.predict(features) for features in (x_train, x_test)
)


def build_metrics(dataset: str, y_true, y_pred, pos_label=1) -> dict:
    """Build the "metrics" record for one dataset.

    Precision, recall and F1 are reported for ``pos_label``, which defaults to
    label 1 of the ``default`` target (scikit-learn's own default). Earlier
    this function scored class 0 to lift precision above a test threshold;
    that reports how well the negative class is predicted and contradicts the
    confusion matrix stored next to it, so it must not be used to pass a check.

    Parameters
    ----------
    dataset : label stored under the "dataset" key (e.g. "train" / "test").
    y_true, y_pred : true and predicted class labels.
    pos_label : class treated as positive for precision / recall / F1.

    Returns
    -------
    dict with plain ``float`` metric values, ready for ``json.dumps``.
    """
    return {
        "type": "metrics",
        "dataset": dataset,
        "precision": float(precision_score(y_true, y_pred, pos_label=pos_label)),
        # Balanced accuracy averages recall over both classes, so it has no
        # positive label.
        "balanced_accuracy": float(balanced_accuracy_score(y_true, y_pred)),
        "recall": float(recall_score(y_true, y_pred, pos_label=pos_label)),
        "f1_score": float(f1_score(y_true, y_pred, pos_label=pos_label)),
    }


metrics_train = build_metrics("train", y_train, y_train_pred)
metrics_test = build_metrics("test", y_test, y_test_pred)

# Confusion matrices with a fixed label order so that index [i, j] always means
# "true class i, predicted class j".
cm_train = confusion_matrix(y_train, y_train_pred, labels=[0, 1])
cm_test = confusion_matrix(y_test, y_test_pred, labels=[0, 1])

# Build both "cm_matrix" records from one template instead of two copy-pasted
# literals. Counts are cast to plain int so json.dumps can serialize them.
cm_train_dict, cm_test_dict = (
    {
        "type": "cm_matrix",
        "dataset": name,
        **{
            f"true_{i}": {f"predicted_{j}": int(matrix[i, j]) for j in (0, 1)}
            for i in (0, 1)
        },
    }
    for name, matrix in (("train", cm_train), ("test", cm_test))
)

# Use the same "../files" root as the input data and the saved model. The
# previous "files/output/..." path was resolved against the notebook's working
# directory and silently created a separate folder there.
metrics_path = Path("../files/output/metrics.json")
metrics_path.parent.mkdir(parents=True, exist_ok=True)

# One JSON document per line: metrics first, then confusion matrices.
with open(metrics_path, "w", encoding="utf-8") as f:
    for row in [metrics_train, metrics_test, cm_train_dict, cm_test_dict]:
        f.write(json.dumps(row) + "\n")
