# Notebook: Entrenamiento paso a paso del modelo (homework)


- Lectura de datos
- Limpieza y transformaciones
- Separación en entrenamiento/prueba
- Construcción del pipeline (OneHotEncoder + PCA + StandardScaler + SelectKBest + SVM)
- Búsqueda de hiperparámetros (GridSearchCV)
- Persistencia del modelo y exportación de métricas


In [19]:
# Imports y configuración básica
import gzip
import json
import os
import pickle
from typing import List, Tuple

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    balanced_accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV


In [20]:
# Output artifacts: the gzip-compressed pickled model and the metrics file
# (one JSON record per line).
MODEL_FILEPATH = "../files/models/model.pkl.gz"
METRICS_FILEPATH = "../files/output/metrics.json"

# Zipped CSV inputs; both live in the same ../files/input directory.
TRAIN_FILE, TEST_FILE = (
    os.path.join("..", "files", "input", archive_name)
    for archive_name in ("train_data.csv.zip", "test_data.csv.zip")
)

In [21]:
def read_raw_files() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the zipped train and test CSV files into DataFrames.

    Returns:
        A ``(train, test)`` tuple read from ``TRAIN_FILE`` and ``TEST_FILE``.

    Raises:
        FileNotFoundError: If either input file is missing. Both paths are
            checked (train first) before anything is read.
    """
    for required_path in (TRAIN_FILE, TEST_FILE):
        if not os.path.exists(required_path):
            raise FileNotFoundError(f"No encuentro {required_path}")

    # No explicit compression argument: pandas infers it from the ".zip" suffix.
    return pd.read_csv(TRAIN_FILE), pd.read_csv(TEST_FILE)

In [22]:
def sanitize_df(data: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a raw dataset, leaving the input untouched.

    Each column-specific step is skipped when the column is absent:

    * rename ``"default payment next month"`` to ``"default"``;
    * drop the ``"ID"`` identifier column;
    * cap ``EDUCATION`` values above 4 at 4 (the "others" category).

    Finally, every row containing a missing value is removed and the index
    is reset to a fresh 0..n-1 range.

    Args:
        data: Raw DataFrame as read from the input CSV.

    Returns:
        A new, cleaned DataFrame. ``data`` itself is never modified.
    """
    if "default payment next month" in data.columns:
        data = data.rename(columns={"default payment next month": "default"})

    if "ID" in data.columns:
        data = data.drop(columns=["ID"])

    if "EDUCATION" in data.columns:
        # assign() builds a new frame. An in-place ``.loc`` write here would
        # modify the caller's DataFrame whenever neither the rename nor the
        # drop above had already produced a fresh object.
        data = data.assign(EDUCATION=data["EDUCATION"].clip(upper=4))

    return data.dropna(axis=0, how="any").reset_index(drop=True)

In [23]:
def split_features_target(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """Split a cleaned dataset into predictors and the integer target.

    Args:
        df: DataFrame that must contain a ``"default"`` column.

    Returns:
        ``(features, target)``: every column except ``"default"``, and the
        ``"default"`` column cast to ``int``.

    Raises:
        RuntimeError: If the ``"default"`` column is missing.
    """
    target_name = "default"

    if target_name in df.columns:
        labels = df[target_name].astype(int)
        predictors = df.drop(target_name, axis=1)
        return predictors, labels

    raise RuntimeError('La columna objetivo "default" no existe en el dataset.')

In [24]:
RANDOM_STATE = 42  # seed passed to the SVC; reuse the project-wide seed if one already exists


def make_model_search(X_train: pd.DataFrame) -> GridSearchCV:
    """Build the unfitted hyper-parameter search over the SVM pipeline.

    Pipeline steps, in order:

    1. One-hot encode the categorical columns; pass the other columns through.
    2. PCA with its default ``n_components`` (no component limit requested).
    3. ``StandardScaler``.
    4. ``SelectKBest`` scored with ``f_classif``.
    5. ``SVC`` with an RBF kernel.

    Args:
        X_train: Training features. Only the column names are inspected here,
            to decide which columns are categorical; nothing is fitted.

    Returns:
        A ``GridSearchCV`` (10 folds, ``balanced_accuracy`` scoring, refit on
        the best combination) over ``feature_selection__k`` and
        ``classifier__C``.
    """
    # Demographic codes plus the payment-status columns are treated as
    # discrete categories; names missing from X_train are skipped.
    categorical_candidates = (
        "SEX",
        "EDUCATION",
        "MARRIAGE",
        "PAY_0",
        "PAY_2",
        "PAY_3",
        "PAY_4",
        "PAY_5",
        "PAY_6",
    )
    cat_cols = [col for col in categorical_candidates if col in X_train.columns]

    # Everything that is not categorical is passed through unchanged.
    num_cols = [col for col in X_train.columns if col not in cat_cols]

    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
            ("num", "passthrough", num_cols),
        ],
        # Always hand PCA a dense array. With the default threshold the
        # stacked output can be a scipy sparse matrix, and PCA's support for
        # sparse input differs between scikit-learn versions.
        sparse_threshold=0,
    )

    # Required order: OHE -> PCA -> StandardScaler -> SelectKBest -> SVM.
    pipe = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("pca", PCA()),
            ("scaler", StandardScaler()),
            ("feature_selection", SelectKBest(score_func=f_classif, k=20)),
            # Kernel pinned explicitly; gamma is left at its default.
            ("classifier", SVC(kernel="rbf", random_state=RANDOM_STATE)),
        ]
    )

    # 3 x 3 grid; the k set in the pipeline above is overridden by the search.
    param_grid = {
        "feature_selection__k": [10, 20, 30],
        "classifier__C": [1.0, 5.0, 10.0],
    }

    return GridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        cv=10,
        scoring="balanced_accuracy",
        n_jobs=-1,
        refit=True,
    )


In [25]:
# 3. MÉTRICAS
# ---------------------------------------------------------------------
def build_classification_metrics(y_true, y_pred, dataset_name: str) -> dict:
    """Assemble the metrics record for one dataset split.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        dataset_name: Value stored under the ``"dataset"`` key.

    Returns:
        A dict with ``"type": "metrics"``, the dataset name, and precision,
        balanced accuracy, recall and F1 as plain floats. Precision, recall
        and F1 are computed with ``zero_division=0``.
    """
    record = {"type": "metrics", "dataset": dataset_name}
    record["precision"] = float(precision_score(y_true, y_pred, zero_division=0))
    record["balanced_accuracy"] = float(balanced_accuracy_score(y_true, y_pred))
    record["recall"] = float(recall_score(y_true, y_pred, zero_division=0))
    record["f1_score"] = float(f1_score(y_true, y_pred, zero_division=0))
    return record


def build_confusion_matrix_record(y_true, y_pred, dataset_name: str) -> dict:
    """Assemble the confusion-matrix record for one dataset split.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        dataset_name: Value stored under the ``"dataset"`` key.

    Returns:
        A dict with ``"type": "cm_matrix"``, the dataset name, and one nested
        dict per true class (``"true_0"``, ``"true_1"``) holding the counts
        for each predicted class as plain ints.
    """
    # labels=[0, 1] fixes the matrix to 2x2 with rows = true class and
    # columns = predicted class.
    matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
    (tn, fp), (fn, tp) = matrix.tolist()

    true_negative_row = {"predicted_0": int(tn), "predicted_1": int(fp)}
    true_positive_row = {"predicted_0": int(fn), "predicted_1": int(tp)}

    return {
        "type": "cm_matrix",
        "dataset": dataset_name,
        "true_0": true_negative_row,
        "true_1": true_positive_row,
    }

In [26]:
# 4. GUARDAR MODELO Y MÉTRICAS
# ---------------------------------------------------------------------
def persist_model(model) -> None:
    """Pickle ``model`` into the gzip file at ``MODEL_FILEPATH``.

    The parent directory is created if it does not exist; an existing file
    at that path is overwritten.

    Args:
        model: Any picklable object (the fitted search in this workflow).
    """
    target_dir = os.path.dirname(MODEL_FILEPATH)
    os.makedirs(target_dir, exist_ok=True)

    with gzip.open(MODEL_FILEPATH, "wb") as archive:
        pickle.dump(model, archive)


def persist_metrics(records) -> None:
    """Write ``records`` to ``METRICS_FILEPATH``, one JSON object per line.

    The parent directory is created if needed and any existing file is
    overwritten.

    Args:
        records: Iterable of JSON-serializable dicts. Per the original
            author's note, the test expects this order:
            0 -> train metrics, 1 -> test metrics,
            2 -> train confusion matrix, 3 -> test confusion matrix.
    """
    output_dir = os.path.dirname(METRICS_FILEPATH)
    os.makedirs(output_dir, exist_ok=True)

    with open(METRICS_FILEPATH, "w", encoding="utf-8") as sink:
        sink.writelines(json.dumps(entry) + "\n" for entry in records)

In [27]:
# 5. MAIN
# ---------------------------------------------------------------------
def run_workflow() -> None:
    """Run the full training workflow end to end.

    Loads and cleans both datasets, fits the hyper-parameter search on the
    training split, saves the fitted search object, then evaluates it on
    train and test and writes the four metric records to disk.
    """
    # Load and clean (train first, then test).
    raw_train, raw_test = read_raw_files()
    cleaned_train = sanitize_df(raw_train)
    cleaned_test = sanitize_df(raw_test)

    # Separate predictors from the target.
    x_train, y_train = split_features_target(cleaned_train)
    x_test, y_test = split_features_target(cleaned_test)

    # Build the pipeline + grid search and fit it on the training split.
    search = make_model_search(x_train)
    search.fit(x_train, y_train)

    # The whole fitted search object is what gets persisted.
    persist_model(search)

    # Predict once per split and reuse the predictions for both record types.
    splits = (("train", x_train, y_train), ("test", x_test, y_test))
    predictions = {name: search.predict(features) for name, features, _ in splits}

    metric_records = [
        build_classification_metrics(labels, predictions[name], name)
        for name, _, labels in splits
    ]
    matrix_records = [
        build_confusion_matrix_record(labels, predictions[name], name)
        for name, _, labels in splits
    ]

    # Output order: train metrics, test metrics, train matrix, test matrix.
    persist_metrics(metric_records + matrix_records)


if __name__ == "__main__":
    run_workflow()