In [1]:
# flake8: noqa: E501

import os
import gzip
import json
import pickle

import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.metrics import (
    balanced_accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix
)
from sklearn.compose import ColumnTransformer

# Input datasets (zipped CSV files)
TRAIN_INPUT_PATH = "../files/input/train_data.csv.zip"
TEST_INPUT_PATH = "../files/input/test_data.csv.zip"

# Artifacts written by this notebook: the fitted model and its metrics
MODEL_OUTPUT_PATH = "../files/models/model.pkl.gz"
METRICS_OUTPUT_PATH = "../files/output/metrics.json"

# Pickle files that receive the train/test feature and target splits
GRADING_TRAIN_X = "../files/grading/x_train.pkl"
GRADING_TRAIN_Y = "../files/grading/y_train.pkl"
GRADING_TEST_X = "../files/grading/x_test.pkl"
GRADING_TEST_Y = "../files/grading/y_test.pkl"

# Create the output folders up front; each folder is taken from the path
# constant that lives in it, so the two cannot drift apart.
os.makedirs(os.path.dirname(MODEL_OUTPUT_PATH), exist_ok=True)
os.makedirs(os.path.dirname(METRICS_OUTPUT_PATH), exist_ok=True)
os.makedirs(os.path.dirname(GRADING_TRAIN_X), exist_ok=True)


In [2]:
def read_raw_datasets():
    """Read the training and test CSV files into DataFrames.

    Both files are read with their first column as the row index.

    Returns:
        tuple: ``(frame_train, frame_test)`` loaded from ``TRAIN_INPUT_PATH``
        and ``TEST_INPUT_PATH`` respectively.
    """
    source_paths = (TRAIN_INPUT_PATH, TEST_INPUT_PATH)
    return tuple(pd.read_csv(path, index_col=0) for path in source_paths)


In [3]:
def refine_dataset(dataframe):
    """Return a cleaned copy of ``dataframe`` without modifying the input.

    Cleaning steps:
      * rename ``"default payment next month"`` to ``"default"`` (if present);
      * drop the ``"ID"`` column (if present);
      * discard rows whose ``EDUCATION`` or ``MARRIAGE`` value is 0, which is
        treated as an invalid category;
      * merge ``EDUCATION`` levels 5 and 6 into level 4.

    The original row index is preserved for the rows that are kept.

    Args:
        dataframe: DataFrame containing at least the ``EDUCATION`` and
            ``MARRIAGE`` columns.

    Returns:
        A new DataFrame with the steps above applied.

    Raises:
        KeyError: if ``EDUCATION`` or ``MARRIAGE`` is missing.
    """
    # rename() ignores labels that are absent and always hands back a new
    # frame, so none of the following steps can write into the caller's object.
    cleaned = dataframe.rename(columns={"default payment next month": "default"})

    if "ID" in cleaned.columns:
        cleaned = cleaned.drop(columns=["ID"])

    # A row is valid only when none of the flagged categorical columns is 0.
    flagged_columns = ["EDUCATION", "MARRIAGE"]
    valid_rows = (cleaned[flagged_columns] != 0).all(axis=1)

    # Explicit copy: the column assignment below must act on an independent
    # frame, never on a view of the data that was passed in.
    cleaned = cleaned.loc[valid_rows].copy()

    # Group the higher EDUCATION levels (5 and 6) into level 4.
    cleaned["EDUCATION"] = cleaned["EDUCATION"].replace([5, 6], 4)

    return cleaned


def run_cleaning_step():
    """Load the raw train/test datasets and clean each of them.

    Returns:
        tuple: ``(cleaned_train, cleaned_test)``, i.e. the outputs of
        ``refine_dataset`` for the frames returned by ``read_raw_datasets``.
    """
    train_raw, test_raw = read_raw_datasets()
    return refine_dataset(train_raw), refine_dataset(test_raw)


In [4]:
def split_features_and_target(clean_train, clean_test):
    """Separate the ``"default"`` target from the features and persist the splits.

    Each of the four resulting objects is pickled to its ``GRADING_*`` path.

    Args:
        clean_train: cleaned training DataFrame containing a ``"default"`` column.
        clean_test: cleaned test DataFrame containing a ``"default"`` column.

    Returns:
        tuple: ``(train_X, train_y, test_X, test_y)``.
    """
    label_column = "default"

    train_X = clean_train.drop(columns=[label_column])
    train_y = clean_train[label_column]

    test_X = clean_test.drop(columns=[label_column])
    test_y = clean_test[label_column]

    # Persist every split for the autograder, in a fixed order.
    persisted_splits = (
        (GRADING_TRAIN_X, train_X),
        (GRADING_TRAIN_Y, train_y),
        (GRADING_TEST_X, test_X),
        (GRADING_TEST_Y, test_y),
    )
    for destination, payload in persisted_splits:
        with open(destination, "wb") as sink:
            pickle.dump(payload, sink)

    return train_X, train_y, test_X, test_y


In [5]:
def assemble_training_pipeline(train_X):
    """Build the (unfitted) preprocessing + classification pipeline.

    Steps, in order:
      1. ``data_prep``: one-hot encode ``SEX``, ``EDUCATION`` and ``MARRIAGE``;
         standard-scale every other column of ``train_X``.
      2. ``dim_red``: PCA.
      3. ``feature_filter``: ``SelectKBest`` scored with ``f_classif``.
      4. ``svc_clf``: support vector classifier.

    Args:
        train_X: training feature DataFrame; only its column names are used
            here, to decide which columns are scaled.

    Returns:
        sklearn.pipeline.Pipeline: the assembled, unfitted pipeline.
    """
    categorical = ["SEX", "EDUCATION", "MARRIAGE"]
    # Everything that is not categorical is treated as numeric.
    numerical = train_X.columns.drop(categorical).tolist()

    encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    scaler = StandardScaler()

    preprocessor = ColumnTransformer(
        transformers=[
            ("cat_ohe", encoder, categorical),
            ("num_scale", scaler, numerical),
        ],
        remainder="passthrough",
        verbose_feature_names_out=False,
    )
    # Ask the transformer to emit pandas DataFrames instead of NumPy arrays.
    preprocessor = preprocessor.set_output(transform="pandas")

    stages = [
        ("data_prep", preprocessor),
        ("dim_red", PCA(random_state=42)),
        ("feature_filter", SelectKBest(score_func=f_classif)),
        ("svc_clf", SVC(random_state=42)),
    ]
    return Pipeline(steps=stages)


In [6]:
def optimize_hyperparams(pipeline, train_X, train_y):
    """Grid-search the pipeline's hyperparameters with stratified 10-fold CV.

    Candidates are ranked by balanced accuracy.

    Args:
        pipeline: estimator exposing the ``dim_red``, ``feature_filter`` and
            ``svc_clf`` steps referenced by the grid.
        train_X: training features.
        train_y: training target.

    Returns:
        GridSearchCV: the search object after fitting on the training data.
    """
    cv_splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    # Keys follow the "<step name>__<parameter>" convention of the pipeline.
    grid = {
        "dim_red__n_components": [20, 21],
        "feature_filter__k": [12],
        "svc_clf__C": [1.0],
        "svc_clf__gamma": [0.099],
    }

    search = GridSearchCV(
        estimator=pipeline,
        param_grid=grid,
        scoring="balanced_accuracy",
        cv=cv_splitter,
        n_jobs=-1,  # use every available core
        verbose=1,
    )
    search.fit(train_X, train_y)

    return search


In [7]:
def store_trained_model(model_object):
    """Pickle ``model_object`` into the gzip-compressed file at ``MODEL_OUTPUT_PATH``.

    Args:
        model_object: any picklable object (here, the fitted search/model).
    """
    with gzip.open(MODEL_OUTPUT_PATH, mode="wb") as compressed_sink:
        pickle.dump(model_object, compressed_sink)


In [8]:
def build_metrics_report(best_model, train_X, train_y, test_X, test_y):
    """Evaluate ``best_model`` on both splits and write the report as JSON lines.

    For each of the ``"train"`` and ``"test"`` splits the function computes
    precision, balanced accuracy, recall and F1 (rounded to 3 decimals) plus
    the confusion-matrix counts. The file at ``METRICS_OUTPUT_PATH`` is
    overwritten with one JSON object per line, in this order:
    train metrics, test metrics, train confusion matrix, test confusion matrix.

    Args:
        best_model: fitted estimator exposing ``predict``.
        train_X, train_y: training features and true labels.
        test_X, test_y: test features and true labels.

    Note:
        Labels are expected to be the integers 0 and 1, as the
        ``true_0``/``true_1`` keys of the confusion-matrix records imply.
    """
    metric_records = []
    matrix_records = []

    splits = (
        ("train", train_X, train_y),
        ("test", test_X, test_y),
    )

    for label, features, y_true in splits:
        y_pred = best_model.predict(features)

        metric_records.append({
            "type": "metrics",
            "dataset": label,
            "precision": round(precision_score(y_true, y_pred, zero_division=0), 3),
            "balanced_accuracy": round(balanced_accuracy_score(y_true, y_pred), 3),
            "recall": round(recall_score(y_true, y_pred, zero_division=0), 3),
            "f1_score": round(f1_score(y_true, y_pred, zero_division=0), 3),
        })

        # Pin the label order so the matrix is always 2x2. Without ``labels``
        # a split where only one class shows up yields a 1x1 matrix and the
        # four-way unpacking below fails.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

        matrix_records.append({
            "type": "cm_matrix",
            "dataset": label,
            "true_0": {"predicted_0": int(tn), "predicted_1": int(fp)},
            "true_1": {"predicted_0": int(fn), "predicted_1": int(tp)},
        })

    # All metric records first, then all confusion matrices (train before test).
    with open(METRICS_OUTPUT_PATH, "w", encoding="utf-8") as file:
        for record in metric_records + matrix_records:
            json.dump(record, file)
            file.write("\n")


In [9]:
def execute_full_pipeline():
    """Run the whole workflow: clean, split, build, tune, save and evaluate.

    Progress messages are printed after the tuning, saving and reporting steps.
    """
    # Step 1: load and clean both datasets
    train_frame, test_frame = run_cleaning_step()

    # Step 2: separate features from target (also persists the splits)
    X_train, y_train, X_test, y_test = split_features_and_target(train_frame, test_frame)

    # Step 3: assemble the unfitted pipeline
    untrained_pipeline = assemble_training_pipeline(X_train)

    # Step 4: hyperparameter search
    fitted_search = optimize_hyperparams(untrained_pipeline, X_train, y_train)
    print("Hiperparámetros optimizados.")

    # Step 5: persist the fitted search object
    store_trained_model(fitted_search)
    print("Modelo guardado.")

    # Steps 6 and 7: metrics and confusion matrices
    build_metrics_report(fitted_search, X_train, y_train, X_test, y_test)
    print("Reporte de métricas generado.")

    print("Proceso completado.")


In [10]:
# Notebook entry point: run every step of the workflow in order.
execute_full_pipeline()


Fitting 10 folds for each of 2 candidates, totalling 20 fits
Hiperparámetros optimizados.
Modelo guardado.
Reporte de métricas generado.
Proceso completado.
