In [1]:
#========================================
# 1. Librerías utilizadas
#=========================================
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import pickle
import gzip
import os
import json
from sklearn.metrics import r2_score,mean_squared_error,median_absolute_error
from sklearn.svm import SVC
import zipfile

In [2]:
#========================================
# 2. Carga y preprocesamiento de datos
#=========================================
# Load the raw train and test splits from their zipped CSV files (train first).
train_csv, test_csv = (
    pd.read_csv(path)
    for path in (
        "../files/input/train_data.csv.zip",
        "../files/input/test_data.csv.zip",
    )
)

def pre_data(df, current_year=2021):
    """Derive the ``Age`` feature and drop the columns that are no longer needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``Year`` and ``Car_Name`` columns.
        It is not modified.
    current_year : int, default 2021
        Reference year; ``Age`` is computed as ``current_year - Year``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``Age`` added and ``Year`` / ``Car_Name`` removed.

    Raises
    ------
    KeyError
        If ``Year`` or ``Car_Name`` is missing from ``df``.
    """
    # assign() builds a new frame, so the caller's raw data never gains an
    # ``Age`` column as a side effect of preprocessing.
    with_age = df.assign(Age=current_year - df["Year"])

    # Year is now encoded by Age; Car_Name is dropped as not needed.
    return with_age.drop(columns=["Year", "Car_Name"])


# Run the shared preprocessing on each raw split.
df_train, df_test = pre_data(train_csv), pre_data(test_csv)

# Separate the target column (Present_Price) from the predictors.
x_train, y_train = df_train.drop(columns="Present_Price"), df_train["Present_Price"]
x_test, y_test = df_test.drop(columns="Present_Price"), df_test["Present_Price"]


In [3]:
#==========================================
# 3. Pipeline de preprocesamiento y modelo
#===========================================

# Columns that get one-hot encoded; every other predictor column is scaled.
categorical = ["Fuel_Type", "Selling_type", "Transmission"]
numeric = list(filter(lambda name: name not in categorical, x_train.columns))

# Categorical preprocessing: one-hot encoding with handle_unknown="ignore".
categorical_trans = OneHotEncoder(handle_unknown="ignore")

# Numeric preprocessing: min-max scaling.
numerical_trans = MinMaxScaler()

preprocessor = ColumnTransformer(
    [
        ("cat", categorical_trans, categorical),
        ("scaler", numerical_trans, numeric),
    ]
)

# The last step is named "classifier" even though it is a LinearRegression;
# the parameter-grid keys below depend on that step name.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("feature_selection", SelectKBest(f_regression)),
        ("classifier", LinearRegression()),
    ]
)

# 14 values of k x 2 x 2 = 56 candidate configurations.
# NOTE(review): k runs up to 14 — confirm the preprocessor emits at least
# that many columns for this dataset.
param = {
    "feature_selection__k": range(1, 15),
    "classifier__fit_intercept": [True, False],
    "classifier__positive": [True, False],
}

# 10-fold cross-validated search scored by negative mean absolute error.
grid_search = GridSearchCV(
    pipeline,
    param,
    scoring="neg_mean_absolute_error",
    cv=10,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(x_train, y_train)

# Persist the whole fitted search object as a gzip-compressed pickle.
os.makedirs("../files/models/", exist_ok=True)
with gzip.open("../files/models/model.pkl.gz", "wb") as model_file:
    pickle.dump(grid_search, model_file)

Fitting 10 folds for each of 56 candidates, totalling 560 fits


In [4]:
#==========================================
# 4. Evaluación del modelo
#===========================================

def calculate_metrics(y_true, y_pred, name):
    """Build the metrics record for one dataset split.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values.
    name : str
        Label stored under the ``"dataset"`` key (e.g. ``"train"``).

    Returns
    -------
    dict
        ``{"type": "metrics", "dataset": name, "r2": ..., "mse": ..., "mad": ...}``
        with each score converted to a plain ``float``. ``"mad"`` holds the
        median absolute error.
    """
    scorers = (
        ("r2", r2_score),
        ("mse", mean_squared_error),
        ("mad", median_absolute_error),
    )

    report = {"type": "metrics", "dataset": name}
    for key, scorer in scorers:
        # float() turns the score into a built-in float for JSON output.
        report[key] = float(scorer(y_true, y_pred))
    return report


# Predict on both splits with the fitted search object.
y_train_pred = grid_search.predict(x_train)
y_test_pred = grid_search.predict(x_test)

# One metrics record per split, train first.
train_metrics = calculate_metrics(y_train, y_train_pred, "train")
test_metrics = calculate_metrics(y_test, y_test_pred, "test")
metrics = [train_metrics, test_metrics]

os.makedirs("../files/output", exist_ok=True)

# Save the metrics as JSON Lines: one JSON object per line.
with open("../files/output/metrics.json", "w") as file:
    for metric in metrics:
        file.write(json.dumps(metric, ensure_ascii=False) + "\n")