In [21]:
import gzip
import json
import os
import pickle

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)



In [22]:
# ---------- Configuration ----------
# Input data: zip-compressed CSV files for the train and test splits.
TRAIN_PATH = "../files/input/train_data.csv.zip"
TEST_PATH = "../files/input/test_data.csv.zip"

# Output directories.
MODEL_DIR = "../files/models"
OUTPUT_DIR = "../files/output"

# Output files, each derived from its directory above.
MODEL_FILE = os.path.join(MODEL_DIR, "model.pkl.gz")
METRICS_FILE = os.path.join(OUTPUT_DIR, "metrics.json")

# Categorical columns that the pipeline one-hot encodes.
CATEGORICAL_COLS = ["SEX", "EDUCATION", "MARRIAGE"]

In [23]:
# ---------- Funciones de preparación ----------
def load_raw_data(train_path: str, test_path: str):
    """Read the zip-compressed training and test CSV files.

    Parameters
    ----------
    train_path : str
        Path of the zipped CSV holding the training split.
    test_path : str
        Path of the zipped CSV holding the test split.

    Returns
    -------
    tuple
        ``(train, test)``: two pandas DataFrames, in that order.
    """
    # The generator is consumed left to right, so the training file is
    # read first and the test file second.
    train, test = (
        pd.read_csv(csv_path, compression="zip")
        for csv_path in (train_path, test_path)
    )
    return train, test


def clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Cleaning steps (each one is skipped when its column is absent):

    - rename ``'default payment next month'`` to ``'default'``;
    - drop the ``'ID'`` column;
    - treat code ``0`` in ``EDUCATION`` and ``MARRIAGE`` as missing;
    - collapse ``EDUCATION`` codes above 4 into 4 ("others");
    - drop every row containing a missing value and renumber the index;
    - cast ``SEX``, ``EDUCATION`` and ``MARRIAGE`` to ``int``.
    """
    # rename() and drop(errors="ignore") both return new frames and silently
    # skip labels that are not present, which replaces the explicit checks.
    cleaned = df.rename(columns={"default payment next month": "default"}).drop(
        columns=["ID"], errors="ignore"
    )

    # Code 0 is considered invalid: flag it as NaN so dropna() removes the row.
    for code_col in ("EDUCATION", "MARRIAGE"):
        if code_col in cleaned.columns:
            cleaned[code_col] = cleaned[code_col].replace({0: np.nan})

    if "EDUCATION" in cleaned.columns:
        education = cleaned["EDUCATION"]
        # NaN > 4 evaluates to False, so missing entries stay missing here.
        cleaned["EDUCATION"] = education.mask(education > 4, 4)

    cleaned = cleaned.dropna(axis=0).reset_index(drop=True)

    # The NaN round-trip may have turned the code columns into floats.
    int_casts = {
        name: int
        for name in ("SEX", "EDUCATION", "MARRIAGE")
        if name in cleaned.columns
    }
    return cleaned.astype(int_casts)


def split_xy(df: pd.DataFrame):
    """Split a frame into features and the ``'default'`` target.

    Returns
    -------
    tuple
        ``(X, y)`` where ``y`` is the ``'default'`` column and ``X`` holds
        every other column. Both are copies, so ``df`` is not modified.
    """
    features = df.copy()
    # pop() removes the column from the copy and hands it back as a Series.
    target = features.pop("default")
    return features, target


In [24]:
def build_pipeline():
    """Assemble the unfitted preprocessing + SVC pipeline.

    Steps, in execution order:

    1. ``transformer`` - one-hot encodes ``CATEGORICAL_COLS`` (categories
       unseen during fit are ignored) and passes all other columns through.
    2. ``scaler`` - ``StandardScaler`` with default settings.
    3. ``pca`` - ``PCA`` with ``n_components=None`` (no explicit limit).
    4. ``selector`` - ``SelectKBest`` scored with ``f_classif`` and its
       default ``k``.
    5. ``svc`` - ``SVC`` with default settings.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    preprocessing = ColumnTransformer(
        transformers=[("cat", encoder, CATEGORICAL_COLS)],
        remainder="passthrough",
    )

    stages = [
        ("transformer", preprocessing),
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=None)),
        ("selector", SelectKBest(score_func=f_classif)),
        ("svc", SVC()),
    ]
    return Pipeline(steps=stages)


def run_grid_search(pipeline, X, y):
    """Tune the SVC hyperparameters of ``pipeline`` with a grid search.

    The search uses 10-fold stratified cross-validation, scores candidates
    with ``balanced_accuracy`` and runs on all available cores.

    Parameters
    ----------
    pipeline
        Estimator whose final step is named ``svc`` (the grid addresses
        ``svc__C``, ``svc__gamma`` and ``svc__kernel``).
    X, y
        Training features and target, forwarded to ``GridSearchCV.fit``.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    param_grid = {
        "svc__C": [0.1, 1, 2, 5, 10],
        # NOTE(review): this list used to contain 0.1 twice, which only added
        # redundant fits. If a different value (e.g. 0.01) was intended,
        # add it here explicitly.
        "svc__gamma": [0.1, 0.5, 1],
        "svc__kernel": ["rbf"],
    }

    # Unshuffled stratified folds are deterministic. Per the scikit-learn
    # docs, this is also what an integer ``cv=10`` resolves to for a
    # classifier with a binary/multiclass target, so fold assignment is
    # unchanged from passing the bare integer.
    cv_strategy = StratifiedKFold(n_splits=10)

    gs = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring="balanced_accuracy",
        cv=cv_strategy,
        n_jobs=-1,
        verbose=0,
    )

    gs.fit(X, y)
    return gs


# ---------- Métricas y guardado ----------
def create_metrics_record(y_true, y_pred, dataset_name):
    """Build the ``"metrics"`` record for one dataset split.

    Parameters
    ----------
    y_true, y_pred
        Ground-truth and predicted labels, forwarded to the scikit-learn
        metric functions.
    dataset_name
        Value stored under the ``"dataset"`` key (``main`` passes
        ``"train"`` or ``"test"``).

    Returns
    -------
    dict
        Keys ``type``, ``dataset``, ``precision``, ``balanced_accuracy``,
        ``recall`` and ``f1_score``; every score is a plain ``float`` so the
        record can be serialised with ``json.dumps``.
    """
    return {
        "type": "metrics",
        "dataset": dataset_name,
        # Must be precision_score: this field was previously filled with
        # accuracy_score, i.e. accuracy reported under the name "precision".
        "precision": float(precision_score(y_true=y_true, y_pred=y_pred)),
        "balanced_accuracy": float(balanced_accuracy_score(y_true=y_true, y_pred=y_pred)),
        "recall": float(recall_score(y_true=y_true, y_pred=y_pred)),
        "f1_score": float(f1_score(y_true=y_true, y_pred=y_pred)),
    }


def create_cm_record(y_true, y_pred, dataset_name):
    """Build the ``"cm_matrix"`` record (binary confusion matrix) for a split.

    Parameters
    ----------
    y_true, y_pred
        Ground-truth and predicted labels; classes are expected to be 0 and 1.
    dataset_name
        Value stored under the ``"dataset"`` key.

    Returns
    -------
    dict
        ``{"type": "cm_matrix", "dataset": ...,
        "true_0": {"predicted_0": int, "predicted_1": int},
        "true_1": {"predicted_0": int, "predicted_1": int}}``.
    """
    # Pinning labels keeps the matrix 2x2 (rows = true class, columns =
    # predicted class) even when one class is absent from both inputs,
    # which would otherwise make mat[1, 1] raise IndexError.
    mat = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1])
    return {
        "type": "cm_matrix",
        "dataset": dataset_name,
        # All four cells are reported; the off-diagonal ones (the
        # misclassifications) used to be hard-coded to None.
        "true_0": {"predicted_0": int(mat[0, 0]), "predicted_1": int(mat[0, 1])},
        "true_1": {"predicted_0": int(mat[1, 0]), "predicted_1": int(mat[1, 1])},
    }




In [25]:
def save_model_gzip(obj, path):
    """Pickle ``obj`` into a gzip-compressed file at ``path``.

    Missing parent directories are created first. A ``path`` without a
    directory component (a bare filename) is written to the current working
    directory.

    Parameters
    ----------
    obj
        Any picklable object (``main`` passes the fitted grid search).
    path
        Destination file path.
    """
    parent_dir = os.path.dirname(path)
    # os.makedirs("") raises FileNotFoundError, so skip it for bare filenames.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with gzip.open(path, "wb") as f:
        pickle.dump(obj, f)


def save_metrics_lines(metrics_list, path):
    """Write records to ``path`` as JSON Lines (one JSON object per line).

    Missing parent directories are created first. A ``path`` without a
    directory component (a bare filename) is written to the current working
    directory. An existing file is overwritten.

    Parameters
    ----------
    metrics_list
        Iterable of JSON-serialisable records (dicts in this notebook).
    path
        Destination file path.
    """
    parent_dir = os.path.dirname(path)
    # os.makedirs("") raises FileNotFoundError, so skip it for bare filenames.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        for rec in metrics_list:
            f.write(json.dumps(rec) + "\n")



In [26]:
def main():
    """Run the whole workflow: load, clean, tune, persist and evaluate.

    Reads the datasets from ``TRAIN_PATH`` / ``TEST_PATH``, fits the grid
    search on the training split, stores the fitted search in ``MODEL_FILE``
    and writes four records to ``METRICS_FILE`` in this order: train
    metrics, test metrics, train confusion matrix, test confusion matrix.
    """
    raw_train, raw_test = load_raw_data(TRAIN_PATH, TEST_PATH)

    clean_train = clean_dataset(raw_train)
    clean_test = clean_dataset(raw_test)

    X_train, y_train = split_xy(clean_train)
    X_test, y_test = split_xy(clean_test)

    # Tune on the training split only; the test split is used just for scoring.
    search = run_grid_search(build_pipeline(), X_train, y_train)

    # Persist the fitted search before evaluating it.
    save_model_gzip(search, MODEL_FILE)

    # Insertion order (train, then test) fixes the order of the records below.
    splits = {
        "train": (y_train, search.predict(X_train)),
        "test": (y_test, search.predict(X_test)),
    }

    records = [
        create_metrics_record(truth, predicted, split_name)
        for split_name, (truth, predicted) in splits.items()
    ]
    records.extend(
        create_cm_record(truth, predicted, split_name)
        for split_name, (truth, predicted) in splits.items()
    )

    # One JSON object per line.
    save_metrics_lines(records, METRICS_FILE)


In [None]:
# Run the full workflow defined in main().
main()