In [1]:
import os
import gzip
import pickle
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, median_absolute_error

def data(filepath, current_year=2021):
    """Load a zip-compressed CSV and swap its 'Year' column for an 'Age' column.

    Args:
        filepath: Path to a zip-compressed CSV file containing at least the
            'Year' and 'Car_Name' columns.
        current_year: Reference year; 'Age' is computed as
            ``current_year - Year``.

    Returns:
        The loaded DataFrame with 'Age' added and the 'Year' and 'Car_Name'
        columns removed.
    """
    frame = pd.read_csv(filepath, compression="zip", index_col=False)
    with_age = frame.assign(Age=current_year - frame['Year'])
    return with_age.drop(columns=['Year', 'Car_Name'])

# --- Data -------------------------------------------------------------------
# Both splits go through the same loader; 'Present_Price' is the target and
# every other column is a predictor.
train_data = data("../files/input/train_data.csv.zip")
test_data = data("../files/input/test_data.csv.zip")

y_train = train_data['Present_Price']
x_train = train_data.drop(columns=['Present_Price'])

y_test = test_data['Present_Price']
x_test = test_data.drop(columns=['Present_Price'])

# --- Model definition -------------------------------------------------------
colc = ["Fuel_Type", "Selling_type", "Transmission"]  # one-hot encoded
numc = ['Selling_Price', 'Driven_kms', 'Owner', 'Age']  # min-max scaled

scaled_numeric = ('num', MinMaxScaler(), numc)
encoded_categorical = ('cat', OneHotEncoder(handle_unknown="ignore"), colc)

# Any column that is in neither list is min-max scaled as well.
preprocessor = ColumnTransformer(
    transformers=[scaled_numeric, encoded_categorical],
    remainder=MinMaxScaler(),
)

# preprocessing -> univariate feature selection -> linear regression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('Skbest', SelectKBest(score_func=f_regression)),
    ('model', LinearRegression()),
])

# --- Baseline ---------------------------------------------------------------
# Fit the untuned pipeline once and report its score on the test split.
baseline_score = pipeline.fit(x_train, y_train).score(x_test, y_test)
print("Precisión", baseline_score)

# --- Hyper-parameter search -------------------------------------------------
# Only the number of selected features is tuned.
# NOTE(review): the upper bound (19) may exceed the number of columns the
# preprocessor produces; confirm how the installed scikit-learn handles
# k > n_features.
param_grid = {'Skbest__k': range(1, 20)}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=10,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)

# --- Persistence ------------------------------------------------------------
# The whole fitted search object (not just the best pipeline) is pickled.
model_path = "../files/models/model.pkl.gz"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with gzip.open(model_path, 'wb') as model_file:
    pickle.dump(grid_search, model_file)


def Metrica(y_true, y_pred, dataset_name):
    """Build the metrics record for one dataset split.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values.
        dataset_name: Label stored under the "dataset" key (e.g. "train").

    Returns:
        A dict with the keys "type" (always "metrics"), "dataset", "r2",
        "mse" and "mad"; "mad" holds the median absolute error. All metric
        values are plain Python floats.
    """
    scorers = (
        ("r2", r2_score),
        ("mse", mean_squared_error),
        ("mad", median_absolute_error),
    )
    record = {"type": "metrics", "dataset": dataset_name}
    for key, scorer in scorers:
        # float() turns NumPy scalars into plain Python floats.
        record[key] = float(scorer(y_true, y_pred))
    return record

def save(metrics, filepath):
    """Write metric records to a file as JSON Lines (one record per line).

    Args:
        metrics: Iterable of JSON-serializable records (e.g. dicts).
        filepath: Destination path. Missing parent directories are created;
            a bare filename (no directory part) is written relative to the
            current working directory.

    Raises:
        TypeError: If a record is not JSON-serializable.
        OSError: If the directory or file cannot be created.
    """
    parent = os.path.dirname(filepath)
    # os.makedirs("") raises FileNotFoundError, so only create the parent
    # directory when the path actually has one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    # json.dumps escapes non-ASCII characters by default; the explicit
    # encoding keeps the file independent of the platform locale.
    with open(filepath, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(metric) + "\n" for metric in metrics)

# Predict both splits with the best pipeline found by the grid search
best_model = grid_search.best_estimator_
y_train_pred = best_model.predict(x_train)
y_test_pred = best_model.predict(x_test)

# Compute the metrics record for each split
metrics_train = Metrica(y_train, y_train_pred, "train")
metrics_test = Metrica(y_test, y_test_pred, "test")

# Save the records, train first and then test
output_path = "../files/output/metrics.json"
save([metrics_train, metrics_test], output_path)




Precisión 0.6663619343263639
