In [1]:
# Locations of the zipped CSV inputs, relative to the notebook's working directory.
PATH_INPUT_TRAIN, PATH_INPUT_TEST = (
    "../files/input/train_data.csv.zip",
    "../files/input/test_data.csv.zip",
)


In [2]:
import pandas as pd

# Load the training split straight from the zipped CSV.
# index_col=False stops pandas from using the first column as the index.
# Note: the cleaning cell further down reads this same file again.
df_train = pd.read_csv(PATH_INPUT_TRAIN, compression="zip", index_col=False)

In [None]:
def limpiar_dataset(df: pd.DataFrame, reference_year: int = 2021) -> pd.DataFrame:
    """Return a cleaned copy of ``df`` without modifying the input frame.

    An ``Age`` column is derived as ``reference_year - Year``, after which the
    ``Year`` and ``Car_Name`` columns are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Year`` column. ``Car_Name`` is dropped when
        present and silently skipped otherwise.
    reference_year : int, default 2021
        Year that ``Year`` is subtracted from to obtain ``Age``.

    Returns
    -------
    pandas.DataFrame
        New frame with ``Age`` added and ``Year`` / ``Car_Name`` removed.

    Raises
    ------
    KeyError
        If ``df`` has no ``Year`` column.
    """
    # assign() builds a new frame, so the caller's DataFrame never gains an
    # "Age" column as a side effect of calling this function.
    with_age = df.assign(Age=reference_year - df["Year"])
    return with_age.drop(columns=["Year", "Car_Name"], errors="ignore")

# Read the train and test splits with identical parser settings
# (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path, compression="zip", index_col=False)
    for csv_path in (PATH_INPUT_TRAIN, PATH_INPUT_TEST)
)

# Apply the cleaning step to each split.
df_train_clean, df_test_clean = (
    limpiar_dataset(raw_frame) for raw_frame in (df_train, df_test)
)

In [4]:
# Separate the "Present_Price" target from the remaining feature columns.
X_train, y_train = (
    df_train_clean.drop(columns=["Present_Price"]),
    df_train_clean["Present_Price"],
)

X_test, y_test = (
    df_test_clean.drop(columns=["Present_Price"]),
    df_test_clean["Present_Price"],
)

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression

# Column groups: three categorical columns are listed explicitly; every other
# column of X_train is treated as numerical.
categorical_features = ['Fuel_Type', 'Selling_type', 'Transmission']
numerical_features = [
    name for name in X_train.columns if name not in categorical_features
]

# One-hot encode the categorical columns and min-max scale the rest.
# Every X_train column falls into one of the two lists above, so the
# passthrough remainder has nothing left to forward at fit time.
preprocessor = ColumnTransformer(
    [
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', MinMaxScaler(), numerical_features),
    ],
    remainder='passthrough',
)

# Preprocess -> univariate feature selection -> linear regression.
# The step names are referenced by the hyper-parameter grid ("<step>__<param>").
pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('selectk', SelectKBest(f_regression)),
        ('LinearRegression', LinearRegression()),
    ]
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameters to search, addressed as "<pipeline step>__<parameter>".
# NOTE(review): SelectKBest can keep at most as many features as the
# preprocessor emits; confirm 15 and 20 do not exceed that count for this
# dataset (handling of an oversized k appears to vary between scikit-learn
# versions - verify against the installed one).
param_grid = {
    'selectk__k': [5, 10, 15, 20],
    'LinearRegression__fit_intercept': [True, False],
}

# Exhaustive search with cv=10, scored by negative mean absolute error and
# run with n_jobs=-1.
grid = GridSearchCV(
    pipeline,
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=10,
    n_jobs=-1,
)

grid.fit(X_train, y_train)

In [7]:
import gzip
import os
import pickle

# Make sure the target folder exists, then store the whole fitted
# GridSearchCV object as a gzip-compressed pickle.
os.makedirs("../files/models", exist_ok=True)
with gzip.open("../files/models/model.pkl.gz", "wb") as model_file:
    pickle.dump(grid, model_file)

In [8]:
from sklearn.metrics import (
    r2_score,
    mean_squared_error,
    median_absolute_error
)

# Predict on both the train and test splits.
y_pred_train = grid.predict(X_train)
y_pred_test = grid.predict(X_test)

# Compute the metrics: one record per split, with identical keys in a fixed
# order. "mad" holds the median absolute error.
metrics = [
    {
        "type": "metrics",
        "dataset": split_name,
        "r2": r2_score(y_true, y_pred),
        "mse": mean_squared_error(y_true, y_pred),
        "mad": median_absolute_error(y_true, y_pred),
    }
    for split_name, y_true, y_pred in (
        ("train", y_train, y_pred_train),
        ("test", y_test, y_pred_test),
    )
]

In [9]:
import json

# Create the output directory if it does not exist yet.
os.makedirs("../files/output", exist_ok=True)

# Write the metrics as one JSON object per line.
with open("../files/output/metrics.json", "w") as metrics_file:
    metrics_file.writelines(json.dumps(record) + "\n" for record in metrics)