In [3]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, r2_score, mean_squared_error, mean_absolute_error, median_absolute_error
import pickle
import os
import gzip
import json

In [None]:
# Read the test and training splits from their zipped CSV files.
test_data = pd.read_csv("../files/input/test_data.csv.zip")
train_data = pd.read_csv("../files/input/train_data.csv.zip")

# Step 1: add an "Age" column computed as 2021 minus "Year", then drop the
# "Year" and "Car_Name" columns from each split.
test_data = test_data.assign(Age=2021 - test_data["Year"]).drop(
    columns=["Year", "Car_Name"]
)

train_data = train_data.assign(Age=2021 - train_data["Year"]).drop(
    columns=["Year", "Car_Name"]
)

In [None]:
# Step 2: separate the "Present_Price" target from the predictor columns
# in both the training and the test split.
y_train = train_data["Present_Price"]
X_train = train_data.drop(columns="Present_Price")

y_test = test_data["Present_Price"]
X_test = test_data.drop(columns="Present_Price")

In [None]:
# Step 3: build the preprocessing -> feature selection -> regression pipeline.
categorical_features = ["Fuel_Type", "Selling_type", "Transmission"]
# Every predictor column that is not listed as categorical is scaled as numeric.
numerical_features = [
    col for col in X_train.columns if col not in categorical_features
]

preprocessor = ColumnTransformer(
    transformers=[
        # Rescale the numeric columns with MinMaxScaler.
        ("scaler", MinMaxScaler(), numerical_features),
        # handle_unknown="ignore": a category that is absent from the data the
        # encoder was fitted on (e.g. a rare level missing from one
        # cross-validation training fold, or present only in the test split)
        # is encoded as all zeros instead of raising at transform time.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ],
)

# Linear regression on the k best features ranked by f_regression.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("feature_selection", SelectKBest(f_regression)),
        # The step is named "classifier" although it holds a regressor; the
        # name is kept because hyper-parameter grids address steps by name.
        ("classifier", LinearRegression()),
    ]
)


In [None]:
# Step 4: tune the pipeline with a 10-fold cross-validated grid search.

# SelectKBest cannot keep more features than the preprocessor emits; asking for
# more makes the fit fail or emit a warning, depending on the scikit-learn
# version. Count the preprocessed columns once and cap k accordingly. The
# original upper bound of 14 is preserved, so the grid is never larger than
# before. Fitting the preprocessor here does not affect the search, because
# GridSearchCV works on clones of the pipeline.
n_features = pipeline.named_steps["preprocessor"].fit_transform(X_train).shape[1]
max_k = min(14, n_features)

param_grid = {
    "feature_selection__k": range(1, max_k + 1),
    "classifier__fit_intercept": [True, False],
    "classifier__positive": [True, False],
}

model = GridSearchCV(
    pipeline,
    param_grid,
    cv=10,
    # Candidates are ranked by (negated) mean absolute error.
    scoring="neg_mean_absolute_error",
    # Use every available CPU core.
    n_jobs=-1,
)

model.fit(X_train, y_train)

In [None]:
# Step 5: persist the fitted grid search as a gzip-compressed pickle.
models_dir = '../files/models'
os.makedirs(models_dir, exist_ok=True)

# Derive the file path from models_dir so the directory that is created and the
# directory that is written to cannot drift apart.
model_path = f"{models_dir}/model.pkl.gz"

with gzip.open(model_path, "wb") as file:
    pickle.dump(model, file)

In [2]:
# Step 6: score the tuned model on both splits and write the metrics to disk.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# One record per split. "mad" holds the median absolute error.
metrics = [
    {
        "type": "metrics",
        "dataset": dataset_name,
        "r2": float(r2_score(y_true, y_pred)),
        "mse": float(mean_squared_error(y_true, y_pred)),
        "mad": float(median_absolute_error(y_true, y_pred)),
    }
    for dataset_name, y_true, y_pred in (
        ("train", y_train, y_train_pred),
        ("test", y_test, y_test_pred),
    )
]

output_file = "../files/output/metrics.json"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Write one JSON object per line. json.dumps replaces the previous
# str(item).replace("'", '"') trick, which produces invalid JSON as soon as a
# value is None/True/False or a string contains a quote character.
with open(output_file, "w", encoding="utf-8") as f:
    for item in metrics:
        f.write(json.dumps(item) + "\n")
        