In [149]:
import pandas as pd

# Read both zipped CSV splits with identical options (test first, then train).
# index_col=False tells pandas not to use the first column as the row index.
test_data, train_data = (
    pd.read_csv(csv_path, index_col=False, compression="zip")
    for csv_path in (
        "../files/input/test_data.csv.zip",
        "../files/input/train_data.csv.zip",
    )
)

In [150]:
# Step 1: derive an 'Age' column from 'Year', then drop 'Year' and 'Car_Name'.
current_year = 2021  # reference year the age is measured against
columns_to_drop = ['Year', 'Car_Name']

# assign() adds 'Age' (as the last column when it is new); drop() then removes
# the listed columns. Both splits go through exactly the same transformation.
train_data = (
    train_data
    .assign(Age=lambda frame: current_year - frame['Year'])
    .drop(columns=columns_to_drop)
)
test_data = (
    test_data
    .assign(Age=lambda frame: current_year - frame['Year'])
    .drop(columns=columns_to_drop)
)



In [151]:
# Step 2: split each dataset into predictors (x_*) and the target (y_*).
# The target is the 'Present_Price' column; every other column is a predictor.
x_train, y_train = (
    train_data.drop(columns="Present_Price"),
    train_data["Present_Price"],
)
x_test, y_test = (
    test_data.drop(columns="Present_Price"),
    test_data["Present_Price"],
)

In [152]:
# Step 3: build the preprocessing + feature-selection + regression pipeline.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression

# Columns treated as categorical; every other column of x_train is treated as numeric.
categorical_features = ['Fuel_Type', 'Selling_type', 'Transmission']
numeric_features = [col for col in x_train.columns if col not in categorical_features]

# One-hot encode the categorical columns and min-max scale the numeric ones.
# handle_unknown='ignore' makes the encoder emit all zeros for a category that
# was absent at fit time (e.g. a rare level missing from a cross-validation
# training fold, or appearing only in the test set) instead of raising an
# error during transform/predict. Known categories are encoded as before.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', MinMaxScaler(), numeric_features),
    ]
)

# Full pipeline: preprocess -> keep the K best features -> linear regression.
# k='all' is only the starting value; it can be overridden through the
# 'feature_selector__k' parameter (e.g. by a hyperparameter search).
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),  # column-wise transformations
        ('feature_selector', SelectKBest(score_func=f_regression, k='all')),  # univariate selection
        ('regressor', LinearRegression()),  # linear regression model
    ]
)

In [153]:
# Step 4: tune the pipeline hyperparameters with cross-validation.
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: number of selected features and whether to fit an intercept.
# NOTE(review): the upper bound of k (14) presumably should not exceed the number
# of columns produced by the preprocessor — confirm against the data.
param_grid = dict(
    feature_selector__k=range(1, 15),
    regressor__fit_intercept=[True, False],
)

# Exhaustive search over the grid with cv=10, scored by negative mean absolute
# error; n_jobs=-1 lets scikit-learn use all available processors.
model = GridSearchCV(
    pipeline,
    param_grid,
    scoring="neg_mean_absolute_error",
    cv=10,
    n_jobs=-1,
)
model.fit(x_train, y_train)

# Pipeline corresponding to the best parameter combination found by the search.
best_model = model.best_estimator_

In [154]:
# Step 5: persist the fitted search object as a gzip-compressed pickle.
import pickle
import os
import gzip

models_dir = '../files/models'
os.makedirs(models_dir, exist_ok=True)

# Build the file path from models_dir so the directory that is created and the
# directory that is written to cannot drift apart if models_dir is changed.
# Note that the whole `model` object is stored, not only its best estimator.
with gzip.open(os.path.join(models_dir, "model.pkl.gz"), "wb") as file:
    pickle.dump(model, file)

In [155]:
import json
import os
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error,median_absolute_error


def _metrics_record(dataset, y_true, y_pred):
    """Build the metrics dictionary for one dataset split.

    Args:
        dataset: Label stored under the 'dataset' key (e.g. 'train' or 'test').
        y_true: Observed target values.
        y_pred: Values predicted for the same rows.

    Returns:
        dict with the keys 'type', 'dataset', 'r2', 'mse' and 'mad', where
        'mad' holds the median absolute error.
    """
    return {
        'type': 'metrics',
        'dataset': dataset,
        'r2': r2_score(y_true, y_pred),
        'mse': mean_squared_error(y_true, y_pred),
        'mad': median_absolute_error(y_true, y_pred),
    }


# Predictions and metrics for the training set.
y_train_pred = best_model.predict(x_train)
train_metrics = _metrics_record('train', y_train, y_train_pred)

# Predictions and metrics for the test set.
y_test_pred = best_model.predict(x_test)
test_metrics = _metrics_record('test', y_test, y_test_pred)

output_path = "../files/output/metrics.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# One JSON object per line: training metrics first, then test metrics.
with open(output_path, 'w') as f:
    f.writelines(
        json.dumps(record) + '\n' for record in (train_metrics, test_metrics)
    )

print(f"Métricas guardadas en: {output_path}")

Métricas guardadas en: ../files/output/metrics.json
