In [1]:
import os
import json
import gzip
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    r2_score,
    mean_squared_error,
    median_absolute_error,
)

# Base directory that holds the zipped input CSV files.
DATA_DIR = Path("../files/input")


def _leer_csv_zip(nombre_archivo: str) -> pd.DataFrame:
    """Read one zip-compressed CSV file located under DATA_DIR."""
    return pd.read_csv(DATA_DIR / nombre_archivo, compression="zip")


# Load the training and test sets.
df_train = _leer_csv_zip("train_data.csv.zip")
df_test = _leer_csv_zip("test_data.csv.zip")

In [2]:

# Step 1.
# Initial preprocessing:
# - Derive the 'Age' column from 'Year' (reference year: 2021).
# - Remove the 'Year' and 'Car_Name' columns.
#
# This cell rebinds df_train / df_test to the processed frames, so running it
# a second time without reloading the data fails: 'Year' is already gone.

YEAR_REF = 2021
cols_a_borrar = ["Year", "Car_Name"]


def _agregar_edad(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with 'Age' added and the columns in cols_a_borrar removed.

    The input frame is left untouched; assign/drop both return new objects.
    """
    return df.assign(Age=YEAR_REF - df["Year"]).drop(columns=cols_a_borrar)


df_train = _agregar_edad(df_train)
df_test = _agregar_edad(df_test)

In [3]:

# Step 2.
# Split each frame into features (X_*) and target (y_*).

col_objetivo = "Present_Price"


def _separar_xy(df):
    """Return (features, target) for one frame.

    Features are every column except col_objetivo; the target is the
    col_objetivo column itself.
    """
    return df.drop(columns=[col_objetivo]), df[col_objetivo]


X_train, y_train = _separar_xy(df_train)
X_test, y_test = _separar_xy(df_test)

In [4]:
# Step 3.
# Build the regression pipeline:
# - One-hot encode the categorical variables.
# - Min-Max scale the remaining variables (MinMaxScaler with its defaults).
# - Keep the K best features, scored with f_regression.
# - Fit a linear regression as the final model.

# Categorical columns are listed explicitly.
vars_cat = ["Fuel_Type", "Selling_type", "Transmission"]

# Every other feature column is treated as numeric; the original column
# order of X_train is kept.
vars_num = X_train.columns[~X_train.columns.isin(vars_cat)].tolist()

escalado_num = ("num", MinMaxScaler(), vars_num)
# Categories unseen during fit are ignored instead of raising at transform time.
codificado_cat = ("cat", OneHotEncoder(handle_unknown="ignore"), vars_cat)

preprocesador = ColumnTransformer(
    transformers=[escalado_num, codificado_cat],
    remainder="drop",  # nothing should be left over; stated explicitly anyway
)

# Step names matter: the hyperparameter grid addresses them by prefix
# (e.g. "selector__k").
pipeline = Pipeline(
    steps=[
        ("preprocesamiento", preprocesador),
        ("selector", SelectKBest(score_func=f_regression)),
        ("regresor", LinearRegression()),
    ]
)

In [5]:

# Step 4.
# Tune the pipeline's hyperparameters with cross-validation:
# - 10 folds (cv=10)
# - Scoring: negated mean absolute error (MAE)
#
# NOTE(review): the grid tries k from 1 to 19; confirm the upper bound does not
# exceed the number of columns produced by the preprocessing step.

param_grid = {"selector__k": range(1, 20)}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=10,
    scoring="neg_mean_absolute_error",
    n_jobs=-1,  # use every available core
)

# Last expression of the cell: fits the search and displays the fitted estimator.
grid_search.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...egression())])
,param_grid,"{'selector__k': range(1, 20)}"
,scoring,'neg_mean_absolute_error'
,n_jobs,-1
,refit,True
,cv,10
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,score_func,<function f_r...001D16C93D760>
,k,11

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [6]:
# Step 5.
# Persist the fitted model (the whole GridSearchCV object), gzip-compressed,
# at: ../files/models/model.pkl.gz

ruta_modelos = "../files/models"
ruta_modelo = os.path.join(ruta_modelos, "model.pkl.gz")

# Make sure the target folder exists before opening the file for writing.
os.makedirs(ruta_modelos, exist_ok=True)

with gzip.open(ruta_modelo, "wb") as archivo:
    pickle.dump(grid_search, archivo)

In [7]:

# Step 6.
# Compute the r2, mse and mad metrics for the train and test sets and write
# them to ../files/output/metrics.json in JSON Lines format (one JSON object
# per line).


def _calcular_metricas(dataset, y_true, y_pred):
    """Build the metrics record for one dataset.

    Args:
        dataset: Label stored under the "dataset" key (e.g. "train" or "test").
        y_true: Observed target values.
        y_pred: Predicted target values.

    Returns:
        dict with the keys "type", "dataset", "r2", "mse" and "mad". Metric
        values are converted to plain Python floats before being stored.
    """
    return {
        "type": "metrics",
        "dataset": dataset,
        "r2": float(r2_score(y_true, y_pred)),
        "mse": float(mean_squared_error(y_true, y_pred)),
        # "mad" is computed as the median absolute error.
        "mad": float(median_absolute_error(y_true, y_pred)),
    }


# Predictions with the best model found by the grid search.
y_pred_train = grid_search.predict(X_train)
y_pred_test = grid_search.predict(X_test)

# A single helper builds both records so train and test cannot drift apart.
metrics_train = _calcular_metricas("train", y_train, y_pred_train)
metrics_test = _calcular_metricas("test", y_test, y_pred_test)

output_dir = "../files/output"
os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, "metrics.json")
with open(output_path, "w", encoding="utf-8") as f:
    # Train record first, then test; each on its own line.
    for registro in (metrics_train, metrics_test):
        f.write(json.dumps(registro) + "\n")