In [1]:
import pandas as pd
import numpy as np
import os
import json
import pickle
import gzip
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, median_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression


# ---------------------------- 1. Funciones de Utilidad (IO/Directorio) ----------------------------

def crear_directorios_base():
    """Create the model and output directories if they are missing.

    Both paths are relative to the current working directory. Directories
    that already exist are left as they are.
    """
    for carpeta in ("../files/models", "../files/output"):
        os.makedirs(carpeta, exist_ok=True)

def cargar_datos_fuente():
    """Read the zipped training and test CSV files into DataFrames.

    The files are looked up relative to the current working directory.

    Returns:
        tuple: ``(train_df, test_df)``, in that order.
    """
    rutas = (
        "../files/input/train_data.csv.zip",
        "../files/input/test_data.csv.zip",
    )
    train_df, test_df = [pd.read_csv(ruta, compression="zip") for ruta in rutas]
    return train_df, test_df

def guardar_modelo_comprimido(estimator, path="../files/models/model.pkl.gz"):
    """Pickle ``estimator`` into a gzip-compressed file at ``path``.

    Args:
        estimator: Any picklable object (in this script, the fitted search).
        path: Destination file. Its parent directory is created when it does
            not exist yet.
    """
    crear_directorios_base()
    # The base directories only cover the default location; a custom path
    # needs its own parent folder, otherwise gzip.open raises
    # FileNotFoundError.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with gzip.open(path, "wb") as f:
        pickle.dump(estimator, f)

def recuperar_modelo(path="../files/models/model.pkl.gz"):
    """Load a gzip-compressed pickled object from ``path``.

    Args:
        path: Location of the ``.pkl.gz`` file.

    Returns:
        The unpickled object, or ``None`` when ``path`` does not exist.
    """
    if os.path.exists(path):
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with gzip.open(path, "rb") as archivo:
            modelo = pickle.load(archivo)
        return modelo
    return None


# ---------------------------- 2. Funciones de Procesamiento de Datos ----------------------------

def limpiar_dataframe(df, base_year=2021):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Steps, in order:
      1. add an ``Age`` column computed as ``base_year - Year``;
      2. drop the ``Year`` and ``Car_Name`` columns (``Car_Name`` may be
         absent);
      3. drop every row that contains a missing value.

    Args:
        df: DataFrame that contains at least a ``Year`` column.
        base_year: Reference year used to derive ``Age``. Defaults to 2021,
            the value that was previously hard-coded.

    Returns:
        pandas.DataFrame: the cleaned frame.

    Raises:
        KeyError: if ``df`` has no ``Year`` column.
    """
    # assign() works on a copy, so the caller's frame stays untouched.
    cleaned = df.assign(Age=base_year - df["Year"])
    cleaned = cleaned.drop(columns=["Year", "Car_Name"], errors="ignore")
    return cleaned.dropna()

def obtener_splits_entrenamiento_prueba(train_df, test_df):
    """Clean both datasets and split each one into features and target.

    ``Present_Price`` is the target; every other column that remains after
    ``limpiar_dataframe`` is kept as a feature.

    Args:
        train_df: Raw training DataFrame.
        test_df: Raw test DataFrame.

    Returns:
        tuple: ``(x_train, y_train, x_test, y_test)``.
    """
    objetivo = "Present_Price"

    # Clean both frames first, then split them in train/test order.
    limpios = [limpiar_dataframe(tabla) for tabla in (train_df, test_df)]

    partes = []
    for limpio in limpios:
        partes.append(limpio.drop(objetivo, axis=1))
        partes.append(limpio[objetivo])

    return tuple(partes)


# ---------------------------- 3. Funciones de Componentes de ML (Pipeline / Grid Search) ----------------------------

def construir_pipeline_completo(feature_columns):
    """Build the unfitted preprocessing, selection and regression pipeline.

    Args:
        feature_columns: Iterable with the names of the feature columns.
            Every name other than ``Fuel_Type``, ``Selling_type`` and
            ``Transmission`` is treated as numeric.

    Returns:
        Pipeline: steps named ``preprocessor``, ``feature_selection`` and
        ``model``, in that order.
    """
    categoricas = ["Fuel_Type", "Selling_type", "Transmission"]
    numericas = [nombre for nombre in feature_columns if nombre not in categoricas]

    # Numeric columns are min-max scaled; categorical ones are one-hot
    # encoded, ignoring categories not seen during fit.
    escalado = ("num", MinMaxScaler(), numericas)
    codificado = ("cat", OneHotEncoder(handle_unknown="ignore"), categoricas)

    # Any column not listed in either group is min-max scaled as well.
    preprocesador = ColumnTransformer(
        transformers=[escalado, codificado],
        remainder=MinMaxScaler(),
    )

    pasos = [
        ("preprocessor", preprocesador),
        # Univariate selection scored with the regression F-test; ``k`` is
        # left at the constructor default here.
        ("feature_selection", SelectKBest(score_func=f_regression)),
        ("model", LinearRegression()),
    ]
    return Pipeline(steps=pasos)

def configurar_busqueda_grid(estimator, param_grid, cv=10):
    """Create an unfitted ``GridSearchCV`` around ``estimator``.

    Args:
        estimator: Estimator or pipeline to tune.
        param_grid: Parameter grid passed straight to ``GridSearchCV``.
        cv: Cross-validation setting passed straight to ``GridSearchCV``.

    Returns:
        GridSearchCV: configured with ``neg_mean_absolute_error`` scoring,
        ``n_jobs=-1``, ``verbose=1`` and ``refit=True``.
    """
    opciones = {
        "estimator": estimator,
        "param_grid": param_grid,
        "cv": cv,
        "scoring": "neg_mean_absolute_error",
        "n_jobs": -1,
        "verbose": 1,
        "refit": True,
    }
    return GridSearchCV(**opciones)


# ---------------------------- 4. Funciones de Entrenamiento y Validación ----------------------------

def entrenar_y_comparar_modelos(grid_search):
    """Fit ``grid_search`` on the training split and save it to disk.

    The source data is loaded and split inside this function; only the
    training part is used.

    Args:
        grid_search: Object exposing ``fit(X, y)``; it is fitted in place.
    """
    datos = cargar_datos_fuente()
    x_train, y_train, _, _ = obtener_splits_entrenamiento_prueba(*datos)

    grid_search.fit(x_train, y_train)

    # The whole fitted search object is saved, not only its best estimator.
    guardar_modelo_comprimido(grid_search)


def ejecutar_entrenamiento_mlp():
    """Build the pipeline and grid search, then train and save the model.

    The training columns are read here only to configure the pipeline; the
    fit itself is delegated to ``entrenar_y_comparar_modelos``.
    """
    x_train = obtener_splits_entrenamiento_prueba(*cargar_datos_fuente())[0]
    columnas = x_train.columns.tolist()

    modelo = construir_pipeline_completo(feature_columns=columnas)

    # Candidate numbers of selected features: 1 through 19.
    candidatos_k = list(range(1, 20))

    busqueda = configurar_busqueda_grid(
        estimator=modelo,
        param_grid={"feature_selection__k": candidatos_k},
        cv=10,
    )
    entrenar_y_comparar_modelos(busqueda)


def validar_y_generar_metricas():
    """Evaluate the saved model on both splits and write the metrics file.

    For the train and the test split it computes R2, mean squared error and
    median absolute error (stored under the key ``mad``). The records are
    written to ``../files/output/metrics.json`` as one JSON object per line.

    Raises:
        FileNotFoundError: if no saved model is found.
    """
    crear_directorios_base()
    train_df, test_df = cargar_datos_fuente()
    x_train, y_train, x_test, y_test = obtener_splits_entrenamiento_prueba(train_df, test_df)

    # Load the gzip-compressed model.
    estimator = recuperar_modelo()
    if estimator is None:
        raise FileNotFoundError("No se encontró modelo en files/models/model.pkl.gz")

    # Predict both splits up front (train first), paired with the true values.
    predicciones = {
        "train": (y_train, estimator.predict(x_train)),
        "test": (y_test, estimator.predict(x_test)),
    }

    registros = [
        {
            "type": "metrics",
            "dataset": nombre,
            "r2": float(r2_score(reales, estimados)),
            "mse": float(mean_squared_error(reales, estimados)),
            "mad": float(median_absolute_error(reales, estimados)),
        }
        for nombre, (reales, estimados) in predicciones.items()
    ]

    # One JSON document per line, despite the .json extension.
    out_path = "../files/output/metrics.json"
    with open(out_path, "w") as f:
        f.writelines(json.dumps(registro) + "\n" for registro in registros)

    print(f"Métricas guardadas en {out_path}")


if __name__ == "__main__":
    # When run as a script: prepare the folders, train, then evaluate.
    for paso in (
        crear_directorios_base,
        ejecutar_entrenamiento_mlp,
        validar_y_generar_metricas,
    ):
        paso()

Fitting 10 folds for each of 19 candidates, totalling 190 fits
Métricas guardadas en ../files/output/metrics.json
