In [77]:
# flake8: noqa: E501
import os
import gzip
import json
import pickle
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    precision_score,
    balanced_accuracy_score,
    recall_score,
    f1_score,
    confusion_matrix
)

In [78]:
# ======================================================
# === Step 1: Data loading and cleaning ================
# ======================================================

train_path = "../files/input/train_data.csv.zip"
test_path = "../files/input/test_data.csv.zip"


def _load_clean_frame(csv_zip_path):
    """Read one zipped CSV file and return it cleaned.

    Cleaning steps, in order:
      1. rename the target column to ``default``;
      2. drop the ``ID`` column when it is present;
      3. collapse EDUCATION codes above 4 into 4 ("others");
      4. turn the 0 codes of EDUCATION and MARRIAGE into NaN;
      5. drop every row that contains a missing value.
    """
    frame = pd.read_csv(csv_zip_path, compression="zip")
    frame = frame.rename(columns={"default payment next month": "default"})
    frame = frame.drop(columns=["ID"], errors="ignore")

    education = frame["EDUCATION"]
    # mask() only replaces where the comparison is True, so values that
    # are already missing stay missing and are removed by dropna() below.
    frame["EDUCATION"] = education.mask(education > 4, 4).replace(0, np.nan)
    frame["MARRIAGE"] = frame["MARRIAGE"].replace(0, np.nan)
    return frame.dropna()


train_df = _load_clean_frame(train_path)
test_df = _load_clean_frame(test_path)

In [79]:
# ======================================================
# === Step 2: Split into X / y =========================
# ======================================================

target_col = "default"

# Features are every column except the target; the target is kept as a Series.
X_train, y_train = train_df.drop(columns=[target_col]), train_df[target_col]
X_test, y_test = test_df.drop(columns=[target_col]), test_df[target_col]

In [80]:
# ======================================================
# === Step 3: Model pipeline ===========================
# ======================================================

# Columns that are one-hot encoded; all remaining columns are passed through
# to the model untouched (remainder="passthrough").
cat_cols = ["SEX", "EDUCATION", "MARRIAGE"]

# handle_unknown="ignore": categories not seen during fit do not raise an
# error at transform time.
encoder = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
    [("cat", encoder, cat_cols)],
    remainder="passthrough",
)

# Fixed seed so that repeated runs build the same forest.
rf = RandomForestClassifier(random_state=477)

pipeline = Pipeline([("preprocess", preprocessor), ("model", rf)])

In [81]:
# ======================================================
# === Step 4: Hyper-parameter search with GridSearchCV =
# ======================================================

# The "model__" prefix routes each parameter to the pipeline step named "model".
param_grid = {
    "model__n_estimators": [200, 300],
    "model__max_depth": [40, 50],
    "model__min_samples_split": [6, 8],
    "model__max_features": ["sqrt"],
}

# 10-fold cross-validation, scored by balanced accuracy, using n_jobs=-1.
grid = GridSearchCV(
    pipeline,
    param_grid,
    scoring="balanced_accuracy",
    cv=10,
    n_jobs=-1,
)

grid.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [82]:
# ======================================================
# === Step 5: Save the model ===========================
# ======================================================

model_dir = "../files/models"
model_path = os.path.join(model_dir, "model.pkl.gz")

# Create the destination folder on demand, then pickle the search object
# through a gzip stream.
os.makedirs(model_dir, exist_ok=True)
with gzip.open(model_path, "wb") as model_file:
    pickle.dump(grid, model_file)

In [83]:
# ======================================================
# === Step 6: Model metrics ============================
# ======================================================

y_train_pred = grid.predict(X_train)
y_test_pred = grid.predict(X_test)


def _metrics_record(dataset_name, y_true, y_pred):
    """Build the metrics record for one dataset.

    Returns a dict with "type" set to "metrics", the dataset name, and the
    precision, balanced accuracy, recall and F1 score of ``y_pred`` against
    ``y_true``.
    """
    return {
        "type": "metrics",
        "dataset": dataset_name,
        "precision": precision_score(y_true, y_pred),
        "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
    }


train_results = _metrics_record("train", y_train, y_train_pred)
test_results = _metrics_record("test", y_test, y_test_pred)

output_file = "../files/output/metrics.json"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# One JSON object per line; "w" mode starts the file from scratch.
with open(output_file, "w", encoding="utf-8") as metrics_file:
    for record in (train_results, test_results):
        json.dump(record, metrics_file, ensure_ascii=False)
        metrics_file.write("\n")

In [84]:
# ======================================================
# === Paso 7: Matrices de confusión ====================
# ======================================================

def cm_to_dict(cm, dataset_label):
    """Convert a confusion matrix into a JSON-serializable record.

    Args:
        cm: Two-dimensional array indexed as ``cm[true, predicted]``; only
            the four cells for classes 0 and 1 are read.
        dataset_label: Value stored under the ``"dataset"`` key.

    Returns:
        dict with "type" set to "cm_matrix", the dataset label, and the four
        counts converted to plain ``int`` and grouped by true class.
    """
    # Read the cells row by row: (0,0), (0,1), (1,0), (1,1).
    true_0_pred_0, true_0_pred_1, true_1_pred_0, true_1_pred_1 = (
        int(cm[row, col]) for row in (0, 1) for col in (0, 1)
    )

    record = {"type": "cm_matrix", "dataset": dataset_label}
    record["true_0"] = {"predicted_0": true_0_pred_0, "predicted_1": true_0_pred_1}
    record["true_1"] = {"predicted_0": true_1_pred_0, "predicted_1": true_1_pred_1}
    return record

# labels=[0, 1] pins each matrix to a 2x2 layout in class order 0, 1, which is
# what cm_to_dict indexes; without it the shape follows whichever labels
# happen to appear in the data.
train_cm = confusion_matrix(y_train, y_train_pred, labels=[0, 1])
test_cm = confusion_matrix(y_test, y_test_pred, labels=[0, 1])

train_cm_dict = cm_to_dict(train_cm, "train")
test_cm_dict = cm_to_dict(test_cm, "test")


def _is_cm_record(line):
    """Return True when ``line`` is a JSON object whose "type" is "cm_matrix"."""
    try:
        record = json.loads(line)
    except ValueError:
        # Not valid JSON: not one of our records, so it is left alone.
        return False
    return isinstance(record, dict) and record.get("type") == "cm_matrix"


# Re-running this cell must not pile up duplicate records. Keep every line
# already in the file except earlier confusion-matrix records, then rewrite
# the file with the fresh ones at the end.
kept_lines = []
if os.path.exists(output_file):
    with open(output_file, "r", encoding="utf-8") as f:
        kept_lines = [
            line.rstrip("\n")
            for line in f
            if line.strip() and not _is_cm_record(line)
        ]

with open(output_file, "w", encoding="utf-8") as f:
    for line in kept_lines:
        f.write(line + "\n")
    for cm_record in (train_cm_dict, test_cm_dict):
        json.dump(cm_record, f, ensure_ascii=False)
        f.write("\n")