In [1]:
import pandas as pd
import numpy as np
import os
import gzip
import pickle
import json

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, median_absolute_error

In [2]:
# Paso 0: Cargar la data
def load_data(input_dir="../files/input"):
    """Load the train and test datasets from zip-compressed CSV files.

    Parameters
    ----------
    input_dir : str, optional
        Directory that contains ``train_data.csv.zip`` and
        ``test_data.csv.zip``. The default is relative to the working
        directory of the kernel, so pass an explicit path when the notebook
        is executed from somewhere else.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_train, df_test)``, in that order.
    """

    def _read(name):
        # index_col=False keeps the first CSV column as data instead of
        # promoting it to the index.
        return pd.read_csv(
            os.path.join(input_dir, name),
            index_col=False,
            compression="zip",
        )

    df_train = _read("train_data.csv.zip")
    df_test = _read("test_data.csv.zip")

    return df_train, df_test

# NOTE: load_data() returns (df_train, df_test), so despite the names
# `X` is the full training frame and `y` is the full test frame; neither
# has been split into features/target yet.
X, y = load_data()

In [3]:
# Paso 1: Procesar la data
def process_dataset(df, current_year=2021):
    """Return a cleaned copy of ``df`` ready for modelling.

    The input frame is never modified. When a ``Year`` column is present it
    is replaced by an ``Age`` column (``current_year - Year``) appended as the
    last column; when a ``Car_Name`` column is present it is dropped. Frames
    lacking either column are handled without error.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset.
    current_year : int, optional
        Reference year used to compute ``Age``. Defaults to 2021.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations applied.
    """
    processed = df.copy()

    # Derive 'Age' from 'Year', then discard 'Year'.
    if "Year" in processed.columns:
        processed["Age"] = current_year - processed["Year"]
        processed = processed.drop(columns=["Year"])

    # 'Car_Name' is not used as a feature.
    if "Car_Name" in processed.columns:
        processed = processed.drop(columns=["Car_Name"])

    return processed

# Clean both frames with the same routine. The names are rebound, but
# re-running this cell is harmless: process_dataset skips the 'Year' and
# 'Car_Name' steps when those columns are already gone.
X = process_dataset(X)
y = process_dataset(y)

In [4]:
# Paso 2: Divida los datasets en x_train, y_train, x_test y y_test
def make_train_test_split(X, y):
    """Separate the ``Present_Price`` target from the remaining columns.

    The pair extracted from ``X`` is returned as the train split and the
    pair extracted from ``y`` as the test split.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    target_column = "Present_Price"

    def _features_and_target(frame):
        # Features are every column except the target; the target is the
        # single 'Present_Price' column.
        return frame.drop(columns=[target_column]), frame[target_column]

    x_train, y_train = _features_and_target(X)
    x_test, y_test = _features_and_target(y)

    return x_train, x_test, y_train, y_test

# X is the processed training frame and y the processed test frame; each is
# split into features and the 'Present_Price' target.
x_train, x_test, y_train, y_test = make_train_test_split(X, y)
# Preview the training features as the cell output.
x_train.head()

Unnamed: 0,Selling_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner,Age
0,7.4,15059,Petrol,Dealer,Automatic,0,5
1,4.0,30000,Petrol,Dealer,Manual,0,8
2,0.5,6000,Petrol,Individual,Manual,0,10
3,3.15,15000,Petrol,Dealer,Manual,0,5
4,1.25,15000,Petrol,Individual,Manual,0,8


In [5]:
# Paso 3: Pipeline que transforma las variables categóricas con one-hot encoding, escala las numéricas, selecciona las k mejores características y hace el ajuste usando regresión lineal
def make_pipeline(X, y):
    """Build the unfitted modelling pipeline.

    Steps, in order:

    1. ``preprocessor`` - one-hot encodes the categorical columns (unknown
       categories are ignored at transform time) and min-max scales the
       numerical columns; any other column is dropped.
    2. ``selectkbest`` - ``SelectKBest`` scored with ``f_regression``.
    3. ``regressor`` - ``LinearRegression``.

    Parameters
    ----------
    X, y
        Accepted for interface compatibility; this function does not read
        them (the column lists below are fixed).

    Returns
    -------
    sklearn.pipeline.Pipeline
        The assembled, not yet fitted, pipeline.
    """
    # Columns routed to each transformer.
    onehot_columns = ["Fuel_Type", "Selling_type", "Transmission"]
    scaled_columns = ["Selling_Price", "Driven_kms", "Owner", "Age"]

    column_preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), onehot_columns),
            ("num", MinMaxScaler(), scaled_columns),
        ],
        remainder="drop",
    )

    return Pipeline(
        steps=[
            ("preprocessor", column_preprocessor),
            ("selectkbest", SelectKBest(score_func=f_regression)),
            ("regressor", LinearRegression()),
        ]
    )

# Build the pipeline and fit it once with its default hyperparameters; the
# fitted pipeline's repr is rendered as the cell output.
pipeline = make_pipeline(x_train, y_train)
pipeline.fit(x_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('selectkbest', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,score_func,<function f_r...x7fef801d8180>
,k,10

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [6]:
# Paso 4: Optimizar hiperparámetros del modelo usando GridSearchCV
def optimize_hyperparams(pipeline, x=None, y=None):
    """Tune the pipeline with an exhaustive grid search and return the search.

    The grid covers ``selectkbest__k`` from 1 to 11 and both values of
    ``regressor__fit_intercept`` and ``regressor__positive`` (44 candidates),
    evaluated with 10-fold cross-validation and scored by negative mean
    absolute error. The best configuration is refit on the full data
    (``refit=True``) and the best parameters are printed.

    Parameters
    ----------
    pipeline : sklearn.pipeline.Pipeline
        Pipeline exposing ``selectkbest`` and ``regressor`` steps.
    x, y : optional
        Training features and target. When omitted, the notebook-level
        ``x_train`` / ``y_train`` variables are used, which keeps the
        original ``optimize_hyperparams(pipeline)`` call working.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search object.
    """
    # Prefer explicit data; fall back to the notebook globals for
    # backward compatibility with the one-argument call.
    if x is None:
        x = x_train
    if y is None:
        y = y_train

    # Parameter space explored by the grid search.
    param_grid = {
        'selectkbest__k': range(1, 12),
        'regressor__fit_intercept': [True, False],
        'regressor__positive': [True, False],
    }

    # Grid search with 10-fold cross-validation, using all available cores.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=10,
        scoring="neg_mean_absolute_error",
        n_jobs=-1,
        refit=True,
        verbose=1,
    )

    # Fit every candidate and report the winning configuration.
    grid_search.fit(x, y)
    print("Mejores hiperparámetros encontrados:", grid_search.best_params_)

    return grid_search

# Run the grid search; the returned object is the fitted GridSearchCV
# (created with refit=True), which the later cells save and evaluate.
grid_search = optimize_hyperparams(pipeline)

Fitting 10 folds for each of 44 candidates, totalling 440 fits


Mejores hiperparámetros encontrados: {'regressor__fit_intercept': True, 'regressor__positive': True, 'selectkbest__k': 11}


In [7]:
# Paso 5: Guardar el modelo (comprimido con gzip) como "files/models/model.pkl.gz".
def save_model(model, filename):
    """Pickle ``model`` into a gzip-compressed file at ``filename``.

    Missing parent directories are created. A ``filename`` without a
    directory component is written to the current working directory.

    Parameters
    ----------
    model : object
        Any picklable object.
    filename : str
        Destination path of the compressed pickle.
    """
    # os.path.dirname() returns "" for a bare file name, and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    # exist_ok=True avoids a check-then-create race.
    dirname = os.path.dirname(filename)
    if dirname:
        os.makedirs(dirname, exist_ok=True)

    with gzip.open(filename, 'wb') as f:
        pickle.dump(model, f)

    print(f"Modelo guardado exitosamente en: {filename}")

# Persist the whole fitted GridSearchCV object (not only its best estimator).
save_model(grid_search, "../files/models/model.pkl.gz")

Modelo guardado exitosamente en: ../files/models/model.pkl.gz


In [8]:
# Paso 6: Calcular métricas y guardar
def calculate_metrics(model, x_train, y_train, x_test, y_test, output_filename):
    """Evaluate ``model`` on the train and test splits and save the metrics.

    Two JSON objects are written to ``output_filename``, one per line (train
    first, then test). Each has the keys ``type``, ``dataset``, ``r2``,
    ``mse`` and ``mad``. Missing parent directories are created; a bare file
    name is written to the current working directory.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    x_train, y_train, x_test, y_test
        Features and targets of each split.
    output_filename : str
        Destination path of the JSON-lines file.
    """

    def get_metrics(x, y, dataset_name):
        """Build the metrics record for a single split."""
        y_pred = model.predict(x)
        return {
            'type': 'metrics',
            'dataset': dataset_name,
            'r2': float(r2_score(y, y_pred)),
            'mse': float(mean_squared_error(y, y_pred)),
            # 'mad' is the *median* absolute error.
            # NOTE(review): confirm the median (rather than the mean
            # absolute error) is what consumers of this file expect.
            'mad': float(median_absolute_error(y, y_pred)),
        }

    # Compute both records before touching the filesystem.
    records = [
        get_metrics(x_train, y_train, 'train'),
        get_metrics(x_test, y_test, 'test'),
    ]

    # os.path.dirname() returns "" for a bare file name, and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    dirname = os.path.dirname(output_filename)
    if dirname:
        os.makedirs(dirname, exist_ok=True)

    with open(output_filename, 'w') as f:
        for record in records:
            f.write(json.dumps(record) + "\n")

    print(f"Métricas guardadas en {output_filename}")

# Run step 6: evaluate the fitted grid search on both splits and write the
# metrics file.
calculate_metrics(
    grid_search, 
    x_train, y_train, 
    x_test, y_test, 
    "../files/output/metrics.json"
)

Métricas guardadas en ../files/output/metrics.json
