In [1]:
# ===============================================
# Importación de librerías necesarias
# ===============================================
import os
import gzip
import json
import pickle
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    precision_score,
    balanced_accuracy_score,
    recall_score,
    f1_score,
    confusion_matrix
)


ModuleNotFoundError: No module named 'pandas'

In [None]:
# ===============================================
# Funciones de carga y limpieza de datos
# ===============================================

def cargar_datos(ruta: str) -> pd.DataFrame:
    """Read a zip-compressed CSV file into a DataFrame.

    Args:
        ruta: Path to the ``.zip`` archive that holds the CSV file.

    Returns:
        The parsed table. ``index_col=False`` is passed so pandas does not
        promote the first column to the index.
    """
    opciones_lectura = {"compression": "zip", "index_col": False}
    tabla = pd.read_csv(ruta, **opciones_lectura)
    return tabla


def limpiar_datos(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw credit-default table.

    Steps, in order:
      1. Rename ``"default payment next month"`` to ``"default"``.
      2. Drop the ``"ID"`` column.
      3. Remove rows whose ``MARRIAGE`` or ``EDUCATION`` value is 0.
      4. Collapse every ``EDUCATION`` value that is not below 4 into 4.

    The input frame is never modified; the original row index labels of the
    surviving rows are kept.

    Args:
        df: Raw table containing at least the ``ID``, ``MARRIAGE`` and
            ``EDUCATION`` columns.

    Returns:
        A new DataFrame with the transformations above applied.

    Raises:
        KeyError: If ``ID``, ``MARRIAGE`` or ``EDUCATION`` is missing.
    """
    renombrado = (
        df.rename(columns={"default payment next month": "default"})
        .drop(columns=["ID"])
    )

    filas_validas = (renombrado["MARRIAGE"] != 0) & (renombrado["EDUCATION"] != 0)
    filtrado = renombrado[filas_validas]

    # `assign` builds a new frame instead of writing into the filtered slice,
    # which avoids pandas' SettingWithCopyWarning. `where` keeps values that are
    # < 4 and replaces everything else with 4, matching the previous row-wise
    # lambda but vectorized.
    educacion = filtrado["EDUCATION"]
    return filtrado.assign(EDUCATION=educacion.where(educacion < 4, 4))


In [None]:
# ===============================================
# Construcción del modelo y búsqueda de parámetros
# ===============================================

def crear_pipeline() -> Pipeline:
    """Build the unfitted preprocessing + random-forest pipeline.

    The ``SEX``, ``EDUCATION`` and ``MARRIAGE`` columns go through a
    ``OneHotEncoder`` configured with ``handle_unknown="ignore"``; every other
    column is passed through untouched. The classifier is a
    ``RandomForestClassifier`` seeded with ``random_state=42``.

    Returns:
        A ``Pipeline`` with the steps ``"preprocesamiento"`` and
        ``"clasificador"``, in that order.
    """
    codificador = OneHotEncoder(handle_unknown="ignore")
    transformador_columnas = ColumnTransformer(
        transformers=[
            ("categoricas", codificador, ["SEX", "EDUCATION", "MARRIAGE"]),
        ],
        remainder="passthrough",
    )

    bosque = RandomForestClassifier(random_state=42)

    pasos = [
        ("preprocesamiento", transformador_columnas),
        ("clasificador", bosque),
    ]
    return Pipeline(pasos)


def configurar_gridsearch(pipeline: Pipeline) -> GridSearchCV:
    """Wrap ``pipeline`` in an exhaustive hyperparameter search.

    The grid covers ``n_estimators``, ``max_depth``, ``min_samples_split`` and
    ``min_samples_leaf`` of the ``"clasificador"`` step (3 * 4 * 3 * 3 = 108
    candidates). Candidates are scored with balanced accuracy under 10-fold
    cross-validation, and the best one is refit on the full training data.

    Args:
        pipeline: Pipeline whose classifier step is named ``"clasificador"``.

    Returns:
        An unfitted ``GridSearchCV`` using all available cores (``n_jobs=-1``).
    """
    espacio_busqueda = {
        "clasificador__n_estimators": [50, 100, 200],
        "clasificador__max_depth": [None, 5, 10, 20],
        "clasificador__min_samples_split": [2, 5, 10],
        "clasificador__min_samples_leaf": [1, 2, 4],
    }

    buscador = GridSearchCV(
        pipeline,
        espacio_busqueda,
        scoring="balanced_accuracy",
        n_jobs=-1,
        refit=True,
        cv=10,
        verbose=2,
    )
    return buscador


In [None]:
# ===============================================
# Función para guardar el modelo entrenado
# ===============================================

def guardar_modelo(ruta: str, modelo):
    """Pickle ``modelo`` into a gzip-compressed file at ``ruta``.

    Missing parent directories are created first. A bare file name (no
    directory component) is written to the current working directory.

    Args:
        ruta: Destination path of the ``.pkl.gz`` file.
        modelo: Any picklable object (e.g. a fitted estimator).
    """
    directorio = os.path.dirname(ruta)
    # os.path.dirname returns "" for a bare file name, and os.makedirs("")
    # raises FileNotFoundError, so only create the directory when there is one.
    if directorio:
        os.makedirs(directorio, exist_ok=True)

    with gzip.open(ruta, "wb") as f:
        pickle.dump(modelo, f)


In [None]:
# ===============================================
# Cálculo de métricas y matriz de confusión
# ===============================================

def calcular_metricas(dataset: str, y_true, y_pred) -> dict:
    """Compute the classification metrics record for one dataset split.

    Args:
        dataset: Label stored under the ``"dataset"`` key (e.g. ``"train"``).
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        A dict with ``"type": "metrics"``, the dataset label, and the
        precision, balanced accuracy, recall and F1 scores. Precision, recall
        and F1 are computed with ``zero_division=0``.
    """
    resumen = {"type": "metrics", "dataset": dataset}
    resumen["precision"] = precision_score(y_true, y_pred, zero_division=0)
    resumen["balanced_accuracy"] = balanced_accuracy_score(y_true, y_pred)
    resumen["recall"] = recall_score(y_true, y_pred, zero_division=0)
    resumen["f1_score"] = f1_score(y_true, y_pred, zero_division=0)
    return resumen


def matriz_confusion_json(dataset: str, y_true, y_pred) -> dict:
    """Build a JSON-serializable confusion-matrix record for one dataset split.

    Args:
        dataset: Label stored under the ``"dataset"`` key (e.g. ``"train"``).
        y_true: Ground-truth labels; expected to be encoded as 0 / 1, as the
            output keys imply.
        y_pred: Predicted labels, same encoding.

    Returns:
        A dict with ``"type": "cm_matrix"``, the dataset label, and the counts
        nested as ``true_<i>`` -> ``predicted_<j>`` (plain Python ints).
    """
    # Pin the label set so the matrix is always 2x2. Without `labels`, a split
    # in which only one class appears yields a 1x1 matrix and the indexing
    # below raises IndexError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    (verd_0_pred_0, verd_0_pred_1), (verd_1_pred_0, verd_1_pred_1) = cm.tolist()

    return {
        "type": "cm_matrix",
        "dataset": dataset,
        "true_0": {
            "predicted_0": int(verd_0_pred_0),
            "predicted_1": int(verd_0_pred_1),
        },
        "true_1": {
            "predicted_0": int(verd_1_pred_0),
            "predicted_1": int(verd_1_pred_1),
        },
    }


In [None]:
# ===============================================
# Carga, limpieza y separación de los datasets
# ===============================================

# Load each split and clean it right away.
train_df = limpiar_datos(
    cargar_datos(os.path.join("../files/input/", "train_data.csv.zip"))
)
test_df = limpiar_datos(
    cargar_datos(os.path.join("../files/input/", "test_data.csv.zip"))
)

# Separate the "default" target column from the feature columns.
y_train = train_df["default"]
X_train = train_df.drop(columns=["default"])

y_test = test_df["default"]
X_test = test_df.drop(columns=["default"])


In [None]:
# ===============================================
# Entrenamiento y búsqueda de hiperparámetros
# ===============================================

# Build the pipeline, wrap it in the grid search and fit on the training split.
# `fit` returns the search object itself, so the call can be chained.
pipeline = crear_pipeline()
grid_search = configurar_gridsearch(pipeline).fit(X_train, y_train)

# Persist the whole fitted search object, not only the best estimator.
guardar_modelo(
    os.path.join("../files/models/", "model.pkl.gz"),
    grid_search,
)


In [None]:
# ===============================================
# Evaluación del modelo y exportación de métricas
# ===============================================

# Training split: predictions, metrics and confusion matrix.
y_pred_train = grid_search.predict(X_train)
metricas_train = calcular_metricas("train", y_train, y_pred_train)
cm_train = matriz_confusion_json("train", y_train, y_pred_train)

# Test split: same three steps.
y_pred_test = grid_search.predict(X_test)
metricas_test = calcular_metricas("test", y_test, y_pred_test)
cm_test = matriz_confusion_json("test", y_test, y_pred_test)

os.makedirs("../files/output/", exist_ok=True)

# One JSON object per line: both metrics records first, then both matrices.
with open(os.path.join("../files/output/", "metrics.json"), "w") as f:
    f.writelines(
        json.dumps(resultado) + "\n"
        for resultado in (metricas_train, metricas_test, cm_train, cm_test)
    )
