In [1]:
# Importamos las librerías
import gzip
import json
import os
import pickle

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, median_absolute_error
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [2]:
# ------------------------------------------------------------------------------
# Step 1: Read the data and preprocess it
# ------------------------------------------------------------------------------

def _load_dataset(path):
    """Read a zipped CSV and swap the 'Year' / 'Car_Name' columns for 'Age'."""
    frame = pd.read_csv(path, index_col=False, compression="zip")
    # Age is computed against 2021, which this notebook treats as the current year.
    frame["Age"] = 2021 - frame["Year"]
    # 'Year' is now redundant with 'Age'; 'Car_Name' is not used as a predictor.
    return frame.drop(columns=["Year", "Car_Name"])


train_data = _load_dataset("../files/input/train_data.csv.zip")
test_data = _load_dataset("../files/input/test_data.csv.zip")

train_data.head()

Unnamed: 0,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner,Age
0,7.4,8.5,15059,Petrol,Dealer,Automatic,0,5
1,4.0,4.6,30000,Petrol,Dealer,Manual,0,8
2,0.5,0.826,6000,Petrol,Individual,Manual,0,10
3,3.15,4.43,15000,Petrol,Dealer,Manual,0,5
4,1.25,1.5,15000,Petrol,Individual,Manual,0,8


In [3]:
# ------------------------------------------------------------------------------
# Step 2: Split into x_train, y_train, x_test, y_test
# ------------------------------------------------------------------------------
# 'Present_Price' is the regression target; every other column is a predictor.
x_train, y_train = train_data.drop(columns="Present_Price"), train_data["Present_Price"]
x_test, y_test = test_data.drop(columns="Present_Price"), test_data["Present_Price"]

In [4]:
# ------------------------------------------------------------------------------
# Step 3: Build the pipeline
# ------------------------------------------------------------------------------
# Split the predictors into the categorical columns (one-hot encoded) and the
# remaining columns, all of which are treated as numeric and min-max scaled.
categorical_features = ['Fuel_Type','Selling_type','Transmission']
numeric_features = [col for col in x_train.columns if col not in categorical_features]


# Column-wise preprocessing for the categorical and numeric columns.
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": a category that was not present when the
        # encoder was fitted (possible in a cross-validation fold or in the
        # test set) is encoded as all zeros instead of raising at transform time.
        ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ('scaler', MinMaxScaler(), numeric_features),
    ],
)

# Full pipeline: preprocessing -> univariate feature selection -> linear model.
# SelectKBest's `k` is left at its default here; it is tuned by the grid search.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("selectk", SelectKBest(score_func=f_regression)),
    ("regressor", LinearRegression())
])

In [5]:
# ------------------------------------------------------------------------------
# Step 4: Hyperparameter optimization with cross-validation
# ------------------------------------------------------------------------------
# The grid explores the number of features kept by SelectKBest together with
# the intercept / positivity options of the linear regressor.
# NOTE(review): the upper bound of `k` (14) is hard-coded; confirm it does not
# exceed the number of columns produced by the preprocessor after one-hot encoding.
param_grid = dict(
    selectk__k=range(1, 15),
    regressor__fit_intercept=[True, False],
    regressor__positive=[True, False],
)

# 10-fold cross-validation, scored by negated mean absolute error, on all cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=10,
    scoring="neg_mean_absolute_error",
    n_jobs=-1,
)

grid_search.fit(x_train, y_train)

In [6]:
# ------------------------------------------------------------------------------
# Step 5: Save the model (pickle compressed with gzip)
# ------------------------------------------------------------------------------
MODEL_PATH = "../files/models/model.pkl.gz"

# Create the target directory first so that gzip.open does not fail on a fresh checkout.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# The whole fitted GridSearchCV object is stored, not only its best estimator.
with gzip.open(MODEL_PATH, "wb") as model_file:
    pickle.dump(grid_search, model_file)

In [7]:
# ------------------------------------------------------------------------------
# Step 6: Compute the metrics and save them as JSON
# ------------------------------------------------------------------------------
# Helper that evaluates a fitted model on one dataset and returns a dict.
def compute_metrics(model, X, y, dataset_name="train"):
    """Evaluate `model` on (X, y) and return the metrics as a plain dict.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X : features passed to ``model.predict``.
    y : true target values compared against the predictions.
    dataset_name : label stored under the ``"dataset"`` key (default ``"train"``).

    Returns
    -------
    dict
        Keys ``"type"`` (always ``"metrics"``), ``"dataset"``, ``"r2"``,
        ``"mse"`` and ``"mad"``. Note that ``"mad"`` holds the *median*
        absolute error. All metric values are converted to built-in floats.
    """
    y_pred = model.predict(X)
    scorers = {
        "r2": r2_score,
        "mse": mean_squared_error,
        "mad": median_absolute_error,
    }

    report = {"type": "metrics", "dataset": dataset_name}
    for key, scorer in scorers.items():
        # float() turns the scorer's result into a built-in float.
        report[key] = float(scorer(y, y_pred))
    return report

# Evaluate the fitted grid search on both splits, in (train, test) order.
metrics_train, metrics_test = (
    compute_metrics(grid_search, features, target, dataset_name=split)
    for split, features, target in (
        ("train", x_train, y_train),
        ("test", x_test, y_test),
    )
)

In [8]:
# Save the metrics to a JSON file.
# Each line holds one JSON object with the metrics of one dataset (train first, then test).
os.makedirs("../files/output", exist_ok=True)
with open("../files/output/metrics.json", "w") as outfile:
    for record in (metrics_train, metrics_test):
        outfile.write(json.dumps(record) + "\n")