In [61]:
import zipfile
import pandas as pd

def clean_dataset(path):
    """Carga y limpia los datasets."""
    with zipfile.ZipFile(path, "r") as z:
        csv_file = z.namelist()[0]
        with z.open(csv_file) as f:
            df = pd.read_csv(f)

    df.dropna(inplace=True)
    df["Age"] = 2021 - df["Year"]
    df.drop_duplicates(inplace=True)
    df.drop(columns=["Year", "Car_Name"], inplace=True)

    return df


In [62]:
#
# Paso 2.
# Divida los datasets en x_train, y_train, x_test, y_test.
#

# Cargar datasets
df_test = clean_dataset("../files/input/test_data.csv.zip")
df_train = clean_dataset("../files/input/train_data.csv.zip")

# Separar variables
x_train = df_train.drop(columns=["Present_Price"])
y_train = df_train["Present_Price"]

x_test = df_test.drop(columns=["Present_Price"])
y_test = df_test["Present_Price"]

In [63]:
df_test

Unnamed: 0,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner,Age
0,4.75,9.54,43000,Diesel,Dealer,Manual,0,8
1,7.25,9.85,6900,Petrol,Dealer,Manual,0,4
2,2.85,4.15,5200,Petrol,Dealer,Manual,0,10
3,6.75,8.12,18796,Petrol,Dealer,Manual,0,6
4,6.50,8.61,33429,Diesel,Dealer,Manual,0,6
...,...,...,...,...,...,...,...,...
85,9.70,13.60,21780,Petrol,Dealer,Manual,0,6
86,6.25,13.60,40126,Petrol,Dealer,Manual,0,7
87,2.10,7.60,50456,Petrol,Dealer,Manual,0,15
88,6.40,8.40,12000,Petrol,Dealer,Manual,0,5


In [64]:
# Paso 3.
# Cree un pipeline para el modelo de clasificación. Este pipeline debe
# contener las siguientes capas:
# - Transforma las variables categoricas usando el método
#   one-hot-encoding.
# - Escala las variables numéricas al intervalo [0, 1].
# - Selecciona las K mejores entradas.
# - Ajusta un modelo de regresion lineal.
#

from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


def build_pipeline():
    """Construye un pipeline con preprocesamiento, PCA, selección de características y MLP."""
    # Definir columnas categóricas y numéricas
    categorical_features = ["Fuel_Type", "Selling_type", "Transmission"]
    numeric_features = [
        col for col in x_train.columns if col not in categorical_features
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(), categorical_features),
            ("num", StandardScaler(), numeric_features),
        ]
    )

    # Ajustamos el preprocesador para conocer el número de características resultantes
    preprocessor.fit(x_train)
    x_train_transformed = preprocessor.transform(x_train)
    num_features_after_preprocessing = x_train_transformed.shape[1]
    print(
        f"Número de características después del preprocesamiento: {num_features_after_preprocessing}"
    )

    # Configuración inicial de SelectKBest (el valor k se sobreescribirá en GridSearchCV)
    k_best = SelectKBest(f_classif, k=min(10, num_features_after_preprocessing))

    # Regresión lineal
    model = LinearRegression()

    pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("k_best", k_best),
            ("classifier", model),
        ]
    )

    return pipeline

In [65]:
from sklearn.model_selection import GridSearchCV


def optimize_pipeline(pipeline, x_train, y_train):
    """Optimiza el pipeline usando GridSearchCV con 10-fold cross-validation."""
    param_grid = {
        "k_best__k": [5, 6, 7, 8],
        "classifier__fit_intercept": [True,False],
        "classifier__positive": [True,False]
    }
        
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=10,
        scoring="neg_mean_absolute_error",
        n_jobs=-1,
        refit=True,
        verbose=2,
    )

    print("Optimizando hiperparámetros con GridSearchCV...")
    grid_search.fit(x_train, y_train)
    print("Optimización finalizada.")
    print("Mejores parámetros:", grid_search.best_params_)
    print("Mejor balanced_accuracy:", grid_search.best_score_)

    return grid_search

In [66]:
import os
import pickle
import gzip

def save_model(model, file_path="../files/models/model.pkl.gz"):
    """Guarda el modelo entrenado en un archivo comprimido."""
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with gzip.open(file_path, "wb") as f:
        pickle.dump(model, f)
    print(f"Modelo guardado en {file_path}")


In [67]:
import os
import json
from sklearn.metrics import r2_score, mean_squared_error, median_absolute_error

def evaluate_model(
    model, x_train, y_train, x_test, y_test, file_path="../files/output/metrics.json"
):
    """Evalúa el modelo en los conjuntos de entrenamiento y prueba y guarda las métricas en un archivo JSON línea por línea."""
    
    # Crear carpeta si no existe
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    # Abrir el archivo en modo escritura
    with open(file_path, "w") as f:
        for dataset, (x, y) in zip(["train", "test"], [(x_train, y_train), (x_test, y_test)]):
            y_pred = model.predict(x)
            metrics = {
                "type": "metrics",
                "dataset": dataset,
                "r2": float(r2_score(y, y_pred)),
                "mse": float(mean_squared_error(y, y_pred)),
                "mad": float(median_absolute_error(y, y_pred)),
            }
            f.write(json.dumps(metrics) + "\n")  # Escribir cada métrica en una línea separada

    print(f"Métricas guardadas en {file_path}")


In [68]:
# Flujo principal
print("Construcción del pipeline...")
pipeline = build_pipeline()

print("Optimización del modelo...")
best_pipeline = optimize_pipeline(pipeline, x_train, y_train)

print("Guardando el modelo...")
save_model(best_pipeline)

print("Evaluando el modelo y guardando métricas...")
evaluate_model(best_pipeline, x_train, y_train, x_test, y_test)

print("¡Proceso completado con éxito!")


Construcción del pipeline...
Número de características después del preprocesamiento: 11
Optimización del modelo...
Optimizando hiperparámetros con GridSearchCV...
Fitting 10 folds for each of 16 candidates, totalling 160 fits
[CV] END classifier__fit_intercept=True, classifier__positive=True, k_best__k=5; total time=   0.0s
[CV] END classifier__fit_intercept=True, classifier__positive=True, k_best__k=5; total time=   0.0s
[CV] END classifier__fit_intercept=True, classifier__positive=True, k_best__k=5; total time=   0.0s
[CV] END classifier__fit_intercept=True, classifier__positive=True, k_best__k=5; total time=   0.0s
[CV] END classifier__fit_intercept=True, classifier__positive=True, k_best__k=5; total time=   0.0s
[CV] END classifier__fit_intercept=True, classifier__positive=True, k_best__k=5; total time=   0.0s
[CV] END classifier__fit_intercept=True, classifier__positive=True, k_best__k=5; total time=   0.0s
[CV] END classifier__fit_intercept=True, classifier__positive=True, k_best

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = ms

[CV] END classifier__fit_intercept=True, classifier__positive=False, k_best__k=7; total time=   0.0s
[CV] END classifier__fit_intercept=True, classifier__positive=False, k_best__k=7; total time=   0.0s
[CV] END classifier__fit_intercept=True, classifier__positive=False, k_best__k=7; total time=   0.0s
[CV] END classifier__fit_intercept=True, classifier__positive=False, k_best__k=8; total time=   0.0s
[CV] END classifier__fit_intercept=True, classifier__positive=False, k_best__k=6; total time=   0.0s
[CV] END classifier__fit_intercept=False, classifier__positive=True, k_best__k=5; total time=   0.0s
[CV] END classifier__fit_intercept=True, classifier__positive=False, k_best__k=8; total time=   0.0s
[CV] END classifier__fit_intercept=True, classifier__positive=False, k_best__k=6; total time=   0.0s
[CV] END classifier__fit_intercept=True, classifier__positive=False, k_best__k=7; total time=   0.0s
[CV] END classifier__fit_intercept=True, classifier__positive=False, k_best__k=8; total tim

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
