In [1]:

import gzip
import json
import os
import pickle
from typing import Tuple

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    r2_score,
    mean_squared_error,
    median_absolute_error,
    mean_absolute_error,
)
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [2]:
# Paths and global configuration
# ------------------------------------------------------------------------------

# Zipped CSV inputs; the paths are relative, so they resolve against the
# current working directory.
TRAIN_PATH = "../files/input/train_data.csv.zip"
TEST_PATH = "../files/input/test_data.csv.zip"

# Output artefacts: the gzip-compressed pickled model and the metrics report.
MODEL_FILENAME = "../files/models/model.pkl.gz"
METRICS_FILENAME = "../files/output/metrics.json"

# Categorical feature columns, as specified by the assignment statement.
# Every other feature column is treated as numeric when the pipeline is built.
CATEGORICAL = ["Fuel_Type", "Selling_type", "Transmission"]

# Defaults forwarded to GridSearchCV as `cv` and `n_jobs` respectively.
N_FOLDS = 10
N_JOBS = -1

# Hyperparameter grid (extend it to tune the model further).
PARAM_GRID = {
    "selectkbest__k": [3, 4, 5, 6, "all"],
    # Could also be explored:
    # "regressor__fit_intercept": [True, False],
}

In [3]:
# 1. Limpieza y carga de datos
# ------------------------------------------------------------------------------

def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Step 1: return a cleaned copy of ``df`` (the input is never mutated).

    - Derive ``Age`` as ``2021 - Year`` when ``Year`` is present and ``Age``
      does not exist yet.
    - Remove the ``Year`` and ``Car_Name`` columns when present.
    - Drop every row that contains a missing value.
    """
    cleaned = df.copy()

    needs_age = "Age" not in cleaned.columns
    if needs_age and "Year" in cleaned.columns:
        cleaned["Age"] = 2021 - cleaned["Year"]

    # Only ask pandas to drop the unwanted columns that actually exist.
    unwanted = [name for name in ("Year", "Car_Name") if name in cleaned.columns]
    if unwanted:
        cleaned = cleaned.drop(columns=unwanted)

    # Safety net: discard incomplete records.
    return cleaned.dropna()


In [4]:
def load_clean_data(
    train_path: str = TRAIN_PATH,
    test_path: str = TEST_PATH,
    target: str = "Present_Price",
) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
    """
    Read the zipped train/test CSV files, clean them and split off the target.

    Parameters
    ----------
    train_path, test_path:
        Locations of the zip-compressed CSV files.
    target:
        Name of the column to predict. Defaults to ``"Present_Price"``; per
        the original author's note, the accompanying test uses that column
        as the target variable.

    Returns
    -------
    (x_train, y_train, x_test, y_test)
        Feature frames without the target column, and the target columns
        cast to float.

    Raises
    ------
    KeyError
        If ``target`` is missing from either cleaned dataset.
    """

    def _load_split(path: str, label: str):
        """Load one CSV, clean it and return ``(features, target_values)``."""
        frame = clean_data(pd.read_csv(path, index_col=False, compression="zip"))
        if target not in frame.columns:
            raise KeyError(
                f"Target column {target!r} not found in {label} data ({path})."
            )
        return frame.drop(columns=[target]), frame[target].astype(float)

    x_train, y_train = _load_split(train_path, "train")
    x_test, y_test = _load_split(test_path, "test")

    return x_train, y_train, x_test, y_test


In [5]:
# 3. Pipeline + GridSearchCV con regresión lineal
# ------------------------------------------------------------------------------

def make_pipeline(x_train=None) -> Pipeline:
    """
    Step 3: build the (unfitted) modelling pipeline.

    Stages, in order:
    - ``preprocessor``: OneHotEncoder on the CATEGORICAL columns and
      MinMaxScaler on every other column of ``x_train``.
    - ``selectkbest``: SelectKBest scored with ``f_regression``.
    - ``regressor``: LinearRegression.

    Parameters
    ----------
    x_train:
        Optional feature frame used only to discover the column names.
        When omitted, the training data is read from disk through
        ``load_clean_data()``, which is what this function always did before
        the parameter existed. Passing it in avoids that extra file I/O.

    Returns
    -------
    Pipeline
        A pipeline ready to be wrapped in a grid search or fitted directly.
    """
    if x_train is None:
        # Backward-compatible fallback: load the data just for its columns.
        x_train, _, _, _ = load_clean_data()

    # Everything that is not declared categorical is scaled as numeric.
    numeric = [column for column in x_train.columns if column not in CATEGORICAL]

    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), CATEGORICAL),
            ("num", MinMaxScaler(), numeric),
        ],
        remainder="drop",
    )

    pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            # k="all" is only the starting value; a grid search may override it
            # through the "selectkbest__k" parameter.
            ("selectkbest", SelectKBest(score_func=f_regression, k="all")),
            ("regressor", LinearRegression()),
        ],
        verbose=False,
    )

    return pipeline

In [6]:
def make_grid_search(
    pipeline: Pipeline,
    param_grid=None,
    n_folds: int = N_FOLDS,
    n_jobs: int = N_JOBS,
) -> GridSearchCV:
    """
    Step 4: wrap ``pipeline`` in a GridSearchCV.

    The search is scored with ``'neg_mean_absolute_error'`` (MAE with its
    sign flipped). When ``param_grid`` is ``None`` the module-level
    PARAM_GRID is used.
    """
    search_space = PARAM_GRID if param_grid is None else param_grid

    return GridSearchCV(
        estimator=pipeline,
        param_grid=search_space,
        cv=n_folds,
        scoring="neg_mean_absolute_error",
        n_jobs=n_jobs,
        verbose=0,
    )


In [7]:
# 5. Guardar / cargar modelo
# ------------------------------------------------------------------------------

def save_model(model) -> None:
    """
    Pickle ``model`` into the gzip-compressed file MODEL_FILENAME.

    The parent directory is created when it does not exist yet.
    """
    target_dir = os.path.dirname(MODEL_FILENAME)
    # dirname() is "" for a bare filename, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    with gzip.open(MODEL_FILENAME, "wb") as f:
        pickle.dump(model, f)

In [8]:

def load_model():
    """
    Return the object pickled in MODEL_FILENAME, or ``None`` if the file
    does not exist.

    Note: unpickling can execute arbitrary code, so this should only be
    pointed at model files produced by a trusted source.
    """
    if os.path.exists(MODEL_FILENAME):
        with gzip.open(MODEL_FILENAME, "rb") as stream:
            return pickle.load(stream)
    return None

In [9]:
# Entrenamiento
# ------------------------------------------------------------------------------

def train_model(model: GridSearchCV) -> None:
    """
    Fit ``model`` on the training split and persist it.

    The whole fitted search object is handed to ``save_model`` (not only its
    best estimator).
    """
    features, target, *_ = load_clean_data()
    model.fit(features, target)
    save_model(model)


In [10]:


def train_lr_model(param_grid=None) -> None:
    """
    High-level entry point: build the pipeline, wrap it in a grid search
    (using ``param_grid`` when given) and train it.
    """
    search = make_grid_search(make_pipeline(), param_grid=param_grid)
    train_model(search)

In [11]:
# Métricas y reporte
# ------------------------------------------------------------------------------

def eval_metrics(model, x_train, y_train, x_test, y_test):
    """
    Score ``model`` on both splits.

    Returns a pair of dictionaries ``(metrics_train, metrics_test)``, each
    with the keys ``type``, ``dataset``, ``r2``, ``mse`` and ``mad``.
    ``mad`` is the *median* absolute error (not the mean).
    """
    # Predict first (train, then test), then score each split.
    splits = {
        "train": (y_train, model.predict(x_train)),
        "test": (y_test, model.predict(x_test)),
    }

    reports = []
    for dataset, (y_true, y_pred) in splits.items():
        reports.append(
            {
                "type": "metrics",
                "dataset": dataset,
                "r2": float(r2_score(y_true, y_pred)),
                "mse": float(mean_squared_error(y_true, y_pred)),
                "mad": float(median_absolute_error(y_true, y_pred)),
            }
        )

    metrics_train, metrics_test = reports
    return metrics_train, metrics_test


In [12]:

def save_report(
    metrics_train,
    metrics_test,
) -> None:
    """
    Step 6: write METRICS_FILENAME as two JSON lines (train first, then test).

    The parent directory is created when it does not exist yet.
    """
    target_dir = os.path.dirname(METRICS_FILENAME)
    # dirname() is "" for a bare filename, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # newline="\n" keeps line endings identical across platforms.
    with open(METRICS_FILENAME, "w", encoding="utf-8", newline="\n") as f:
        for record in (metrics_train, metrics_test):
            f.write(json.dumps(record) + "\n")

In [13]:

def print_report(metrics_train, metrics_test) -> None:
    """Print a compact metrics summary, one line per metric: test (train)."""
    rule = "-" * 60

    print(rule)
    print("Metrics summary (test (train))")
    print(rule)
    for label, key in (("R2", "r2"), ("MSE", "mse"), ("MAD", "mad")):
        test_value = metrics_test[key]
        train_value = metrics_train[key]
        print(f"{label:>10}: {test_value:.4f} ({train_value:.4f})")
    print(rule)

In [14]:

def check_estimator() -> None:
    """
    Load the data, evaluate the saved model, write metrics.json and print
    the summary. Handy as a sanity check that everything was produced.

    Raises RuntimeError when no saved model is found.
    """
    datasets = load_clean_data()  # (x_train, y_train, x_test, y_test)

    estimator = load_model()
    if estimator is None:
        raise RuntimeError("No se encontró el modelo entrenado en files/models.")

    train_report, test_report = eval_metrics(estimator, *datasets)

    save_report(train_report, test_report)
    print_report(train_report, test_report)

In [15]:

def print_get_params() -> None:
    """Print every parameter reported by the saved model's ``get_params()``."""
    estimator = load_model()
    if estimator is not None:
        print("Get model parameters:")
        for name, setting in estimator.get_params().items():
            print(f"  {name}: {setting}")
    else:
        print("No model found.")

In [16]:


def print_best_model_params() -> None:
    """Print the best hyperparameters stored on the saved model."""
    estimator = load_model()

    # Work out whether there is anything to report before printing.
    if estimator is None:
        problem = "No model found."
    elif not hasattr(estimator, "best_params_"):
        problem = "El modelo cargado no tiene atributo best_params_."
    else:
        problem = None

    if problem is not None:
        print(problem)
        return

    print("Best model parameters:")
    for name, setting in estimator.best_params_.items():
        print(f"  {name}: {setting}")

In [17]:
# Ejecución directa del script
# ------------------------------------------------------------------------------

if __name__ == "__main__":
    # Train the model and save it to files/models/model.pkl.gz
    train_lr_model()

    # Evaluate the saved model and generate files/output/metrics.json
    check_estimator()

    # Print the best hyperparameters that were found
    print_best_model_params()

------------------------------------------------------------
Metrics summary (test (train))
------------------------------------------------------------
        R2: 0.7326 (0.8917)
       MSE: 32.5667 (5.8746)
       MAD: 1.5034 (1.0929)
------------------------------------------------------------
Best model parameters:
  selectkbest__k: all
