In [None]:
import os
import gzip
import json
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    precision_score,
    balanced_accuracy_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

In [None]:
# Helper functions: output folders, data loading, model persistence and data cleaning

def verificar_carpetas():
    """Create the model and output folders when they are missing.

    Both paths are relative to the current working directory. Folders that
    already exist are left untouched.
    """
    for carpeta in ("../files/models", "../files/output"):
        os.makedirs(carpeta, exist_ok=True)

def cargar_datos_fuente():
    """Read the training and test CSV archives into DataFrames.

    Returns:
        tuple: ``(train_df, test_df)`` read, in that order, from
        ``../files/input/train_data.csv.zip`` and
        ``../files/input/test_data.csv.zip`` (paths relative to the current
        working directory).
    """
    rutas = (
        "../files/input/train_data.csv.zip",
        "../files/input/test_data.csv.zip",
    )
    train_df, test_df = (pd.read_csv(ruta) for ruta in rutas)
    return train_df, test_df

def recuperar_modelo(path="../files/models/model.pkl.gz"):
    """Load a gzip-compressed pickled estimator from ``path``.

    Args:
        path: location of the ``.pkl.gz`` file.

    Returns:
        The unpickled object, or ``None`` when ``path`` does not exist.
    """
    modelo = None
    if os.path.exists(path):
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # files produced by this project.
        with gzip.open(path, "rb") as archivo:
            modelo = pickle.load(archivo)
    return modelo

def guardar_modelo_comprimido(estimator, path="../files/models/model.pkl.gz"):
    """Pickle ``estimator`` into a gzip-compressed file at ``path``.

    Args:
        estimator: any picklable object (normally the fitted search/model).
        path: destination file; an existing file is overwritten.

    The default project folders are created first, and so is the parent
    directory of ``path``, so a caller-supplied location outside the default
    folders no longer fails with ``FileNotFoundError``.
    """
    verificar_carpetas()
    # verificar_carpetas only covers the default folders; make sure the
    # directory that will actually receive the file exists as well.
    carpeta_destino = os.path.dirname(path)
    if carpeta_destino:
        os.makedirs(carpeta_destino, exist_ok=True)
    with gzip.open(path, "wb") as f:
        pickle.dump(estimator, f)


def preparar_df(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Steps, each applied only when the column involved is present:

    * rename ``default payment next month`` to ``default``;
    * drop the ``ID`` column;
    * coerce ``EDUCATION`` to integers: non-numeric or missing entries
      become 4, and values greater than 4 or equal to 0 are collapsed to 4;
    * remove rows whose ``MARRIAGE`` is 0;
    * remove every row that still contains a missing value.

    Args:
        df: raw pandas DataFrame.

    Returns:
        A new DataFrame with the transformations above applied.
    """
    # rename/drop both return new frames, so the caller's frame is untouched.
    df = df.rename(columns={"default payment next month": "default"})
    df = df.drop(columns=["ID"], errors="ignore")

    if "EDUCATION" in df.columns:
        education = (
            pd.to_numeric(df["EDUCATION"], errors="coerce").fillna(4).astype(int)
        )
        df["EDUCATION"] = education.mask((education > 4) | (education == 0), 4)

    # Filter with a boolean mask instead of dropping by index label: label
    # based dropping also removes valid rows that share a label when the
    # index is not unique. The guard mirrors the other optional columns.
    if "MARRIAGE" in df.columns:
        df = df[df["MARRIAGE"] != 0]

    return df.dropna()

def obtener_splits(train_df, test_df):
    """Clean both frames and separate features from the ``default`` target.

    Args:
        train_df: raw training DataFrame.
        test_df: raw test DataFrame.

    Returns:
        tuple: ``(x_train, y_train, x_test, y_test)`` where each ``x_*`` is
        the cleaned frame without the ``default`` column and each ``y_*`` is
        that column.
    """
    # Clean both frames first, then split each one into (features, target).
    limpios = [preparar_df(crudo) for crudo in (train_df, test_df)]
    (x_train, y_train), (x_test, y_test) = (
        (limpio.drop(columns=["default"]), limpio["default"]) for limpio in limpios
    )
    return x_train, y_train, x_test, y_test


In [None]:
# Model components: pipeline construction and grid-search setup

def construir_pipeline(feature_columns):
    """Build the unfitted preprocessing + PCA + SelectKBest + SVC pipeline.

    Args:
        feature_columns: iterable of feature column names. ``SEX``,
            ``EDUCATION`` and ``MARRIAGE`` are handled as categorical; every
            other name is handled as numeric.

    Returns:
        sklearn ``Pipeline`` whose steps are named ``preprocessor``, ``pca``,
        ``selector`` and ``classifier`` (the names used by parameter grids).
    """
    categoricas = ["SEX", "EDUCATION", "MARRIAGE"]
    numericas = [col for col in feature_columns if col not in categoricas]

    # Categorical branch: fill gaps with the mode, then one-hot encode.
    rama_categorica = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ])

    # Numeric branch: fill gaps with the median, then standardise.
    rama_numerica = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])

    preprocesador = ColumnTransformer(
        transformers=[
            ("cat", rama_categorica, categoricas),
            ("num", rama_numerica, numericas),
        ]
    )

    pasos = [
        ("preprocessor", preprocesador),
        ("pca", PCA(n_components=None)),
        ("selector", SelectKBest(score_func=f_classif)),
        ("classifier", SVC(random_state=123)),
    ]
    return Pipeline(steps=pasos, verbose=False)

def crear_busqueda_grid(estimator, param_grid, cv=10):
    """Wrap ``estimator`` in an unfitted ``GridSearchCV``.

    Args:
        estimator: estimator or pipeline to tune.
        param_grid: parameter grid forwarded to ``GridSearchCV``.
        cv: cross-validation setting forwarded to ``GridSearchCV``.

    Returns:
        ``GridSearchCV`` scored by balanced accuracy, with ``n_jobs=-1``,
        ``verbose=1`` and training scores recorded.
    """
    opciones = {
        "estimator": estimator,
        "param_grid": param_grid,
        "cv": cv,
        "scoring": "balanced_accuracy",
        "n_jobs": -1,
        "verbose": 1,
        "return_train_score": True,
    }
    return GridSearchCV(**opciones)


In [None]:

# Funciones de Ejecución y Validación
def entrenar_y_comparar(grid_search):
    """Fit ``grid_search`` and persist it unless the stored model scores higher.

    The data is loaded and split inside this function. Both the freshly
    fitted search and the previously saved model (if any) are scored with
    balanced accuracy on the test split; the new one is saved when its score
    is greater than or equal to the stored one.

    Args:
        grid_search: unfitted object exposing ``fit`` and ``predict``
            (normally a ``GridSearchCV``). It is fitted in place.
    """
    train_df, test_df = cargar_datos_fuente()
    x_train, y_train, x_test, y_test = obtener_splits(train_df, test_df)

    grid_search.fit(x_train, y_train)
    current_score = balanced_accuracy_score(y_test, grid_search.predict(x_test))

    # -1.0 is below any balanced accuracy, so a missing or unusable stored
    # model always loses the comparison.
    saved_score = -1.0
    saved = recuperar_modelo()
    if saved is not None:
        try:
            saved_score = balanced_accuracy_score(y_test, saved.predict(x_test))
        except Exception as exc:
            # Best effort: an unusable stored object (e.g. no ``predict``) is
            # replaced, but the reason is reported instead of being hidden.
            print(f"No se pudo evaluar el modelo guardado ({exc!r}); será reemplazado.")

    if current_score >= saved_score:
        guardar_modelo_comprimido(grid_search)

def ejecutar_entrenamiento_principal():
    """Assemble the pipeline and grid search, then train and maybe save it.

    The data is loaded here only to obtain the feature column names for the
    pipeline; the actual fitting happens in ``entrenar_y_comparar``.
    """
    # Only the training features are needed at this point.
    x_train = obtener_splits(*cargar_datos_fuente())[0]
    columnas = x_train.columns.tolist()

    rejilla = {
        'selector__k': [15, 17, 20, 'all'],
        'classifier__gamma': [0.01, 0.1, 1],
    }

    busqueda = crear_busqueda_grid(
        estimator=construir_pipeline(feature_columns=columnas),
        param_grid=rejilla,
        cv=10,
    )
    entrenar_y_comparar(busqueda)


def _resumen_metricas(dataset, y_true, y_pred):
    """Build the scalar-metrics record for one split (``dataset`` is its label)."""
    return {
        "type": "metrics",
        "dataset": dataset,
        "precision": float(precision_score(y_true, y_pred, zero_division=0)),
        "balanced_accuracy": float(balanced_accuracy_score(y_true, y_pred)),
        "recall": float(recall_score(y_true, y_pred, zero_division=0)),
        "f1_score": float(f1_score(y_true, y_pred, zero_division=0)),
    }


def _resumen_confusion(dataset, y_true, y_pred):
    """Build the confusion-matrix record for one split (``dataset`` is its label)."""
    # Fix the label order so the matrix is always 2x2, even when a split
    # contains a single class. Assumes the target is coded 0/1, as the
    # true_0/true_1 output keys imply.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    return {
        "type": "cm_matrix",
        "dataset": dataset,
        "true_0": {"predicted_0": int(cm[0, 0]), "predicted_1": int(cm[0, 1])},
        "true_1": {"predicted_0": int(cm[1, 0]), "predicted_1": int(cm[1, 1])},
    }


def validar_modelo_y_metricas():
    """Evaluate the stored model and write its metrics, one JSON object per line.

    Loads the data and the saved estimator, predicts on the train and test
    splits, and writes four records to ``../files/output/metrics.json`` in
    this order: train metrics, test metrics, train confusion matrix, test
    confusion matrix.

    Raises:
        FileNotFoundError: when no saved model is found.
    """
    verificar_carpetas()
    train_df, test_df = cargar_datos_fuente()
    x_train, y_train, x_test, y_test = obtener_splits(train_df, test_df)

    estimator = recuperar_modelo()
    if estimator is None:
        raise FileNotFoundError("No se encontró modelo en files/models/model.pkl.gz")

    particiones = [
        ("train", y_train, estimator.predict(x_train)),
        ("test", y_test, estimator.predict(x_test)),
    ]

    # Keep the original record order: both metric rows first, then both
    # confusion matrices.
    metrics = [_resumen_metricas(*particion) for particion in particiones]
    metrics += [_resumen_confusion(*particion) for particion in particiones]

    out_path = "../files/output/metrics.json"
    with open(out_path, "w", encoding="utf-8") as f:
        for m in metrics:
            f.write(json.dumps(m) + "\n")

    print(f"Métricas guardadas en {out_path}")


In [None]:

if __name__ == "__main__":
    # Full workflow, in order: make sure the folders exist, train (saving the
    # model when it is at least as good as the stored one), then write metrics.
    for paso in (
        verificar_carpetas,
        ejecutar_entrenamiento_principal,
        validar_modelo_y_metricas,
    ):
        paso()