In [13]:
import pandas as pd
import numpy as np
import pickle

import os
import sys

# Make the project's sibling `src` directory importable from this notebook
# (the local module `encoding_func` is imported right after this).
notebook_path = os.path.abspath(".")
src_path = os.path.abspath(os.path.join(notebook_path, '..', 'src'))

# Guarded append: re-running this cell must not keep stacking duplicate
# entries onto sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)

import encoding_func

In [14]:
# Load the modelling dataset from the project's bin directory.
# The rendered output shows the columns kilometraje, cambio_automatico,
# potencia, marca_sola, anio_matricula and precio.
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary
# code; only load artefacts produced by this project.
df_modelo = pd.read_pickle('../bin/dataframe_ml.pickle')
# Bare last expression so the notebook renders the frame.
df_modelo

Unnamed: 0,kilometraje,cambio_automatico,potencia,marca_sola,anio_matricula,precio
0,11.652696,1.0,5.252273,VOLVO,2019,10.196194
1,9.615872,1.0,5.802118,MERCEDES-BENZ,2023,11.156251
2,10.512655,1.0,5.198497,CITROEN,2022,10.038936
3,10.238387,0.0,4.615121,JEEP,2023,10.081676
4,12.388398,1.0,5.484797,JAGUAR,2010,8.916104
...,...,...,...,...,...,...
14518,11.643962,1.0,4.418841,SMART,2018,9.158099
14519,11.341128,1.0,4.812184,TOYOTA,2023,10.105653
14520,10.072217,0.0,4.262680,FIAT,2024,9.391912
14521,10.308653,1.0,5.283204,TOYOTA,2024,10.542469


In [15]:
# Load the pickled preprocessing objects used below to transform the features:
# an encoder for the `marca_sola` column and a scaler for the feature matrix.
# Presumably both were fitted in an earlier step of the project; verify.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artefacts produced by this project, never untrusted files.
with open("../bin/marca_sola_precio_encoder.pickle", "rb") as file:
    marca_sola_precio_encoder = pickle.load(file)

with open("../bin/min_max_scaler.pickle", "rb") as file:
    min_max_scaler = pickle.load(file)

In [16]:
# Name of the column the models will predict.
TARGET = "precio"

# Split into train/test features and targets using the project helper.
# Presumably a 20 % hold-out with a fixed seed; verify against
# encoding_func.dividir_dataframe.
X_train, X_test, y_train, y_test = encoding_func.dividir_dataframe(df_modelo, TARGET, test_size=0.2, random_state=42)

# Replace the brand column with the output of the pre-loaded encoder.
# Only `transform` is called here; the encoder is not refitted on either split.
# NOTE(review): if the helper returns slices of df_modelo, these assignments
# may raise SettingWithCopyWarning; confirm it returns independent frames.
X_train["marca_sola"] = marca_sola_precio_encoder.transform(X_train["marca_sola"])
X_test["marca_sola"] = marca_sola_precio_encoder.transform(X_test["marca_sola"])

# Apply the pre-loaded scaler to both splits (again `transform` only).
# NOTE(review): a later cell reads `X_train.columns`, so this scaler must
# return a DataFrame rather than a NumPy array; confirm how it was configured.
X_train = min_max_scaler.transform(X_train)
X_test = min_max_scaler.transform(X_test)


In [17]:
import time
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

In [18]:
# Candidate regressors to compare, keyed by the display name used in the
# results table. Every estimator with a stochastic component gets a fixed
# seed so the comparison is reproducible across kernel restarts.
modelos = {
    "Regresión Lineal": LinearRegression(),
    # Seeded like the ensembles below: without a seed, repeated runs of the
    # decision tree can yield different metrics.
    "Árbol de Decisión": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "K-Vecinos": KNeighborsRegressor(n_neighbors=5)
}

In [19]:
# Fit every candidate on the training split and score it on the test split.
# One record per model is collected and the frame is built once at the end.
# NOTE(review): the displayed `precio` values lie around 9-11, which suggests a
# transformed (e.g. log) target; MAE/MSE/RMSE are in those units. Confirm.
resultados = []

for nombre, modelo in modelos.items():
    modelo.fit(X_train, y_train)
    y_pred = modelo.predict(X_test)

    # Compute the MSE once and derive the RMSE from it instead of
    # re-evaluating the metric a second time.
    mse = mean_squared_error(y_test, y_pred)

    resultados.append({
        "Modelo": nombre,
        "MAE": mean_absolute_error(y_test, y_pred),
        "MSE": mse,
        "RMSE": np.sqrt(mse),
        "R2": r2_score(y_test, y_pred)
    })


# Rank the models from best to worst R2; the original positional index is kept.
df_resultados = pd.DataFrame(resultados).sort_values(by="R2", ascending=False)
df_resultados

Unnamed: 0,Modelo,MAE,MSE,RMSE,R2
2,Random Forest,0.122849,0.033581,0.183251,0.892045
3,Gradient Boosting,0.134596,0.037238,0.19297,0.88029
0,Regresión Lineal,0.153245,0.050768,0.225317,0.836794
1,Árbol de Decisión,0.158104,0.056629,0.237968,0.817952
4,K-Vecinos,0.209618,0.084589,0.290841,0.728068


In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest: 4 * 2 * 3 * 3 = 72 candidates.
parametros_rf = dict(
    n_estimators=[100, 150, 200, 250],
    max_depth=[None, 5],
    min_samples_split=[8, 10, 12],
    min_samples_leaf=[4, 6, 8],
)

# Base estimator with a fixed seed so every candidate is comparable.
modelo_rf = RandomForestRegressor(random_state=42)

# Exhaustive search scored by R2 with 5-fold cross-validation on the
# training split; n_jobs=-1 runs the fits in parallel.
grid_search_rf = GridSearchCV(
    estimator=modelo_rf,
    param_grid=parametros_rf,
    scoring="r2",
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_rf.fit(X_train, y_train)

# Keep a handle on the winning estimator for the cells that follow.
modelo_final_rf = grid_search_rf.best_estimator_

print("Mejores hiperparámetros:", grid_search_rf.best_params_)
print("Mejor R2 obtenido:", grid_search_rf.best_score_)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Mejores hiperparámetros: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 250}
Mejor R2 obtenido: 0.8922091150167357


In [21]:
# Human-readable labels for the feature columns shown in the importance table.
etiquetas = {
    'potencia': 'Potencia',
    'anio_matricula': 'Año de Matrícula',
    'kilometraje': 'Kilometraje',
    'marca_sola': 'Marca',
    'cambio_automatico': 'Cambio Automático',
}

# Feature importances of the tuned random forest, one value per input column.
importancias = modelo_final_rf.feature_importances_

# NOTE(review): relies on X_train still being a DataFrame at this point, i.e.
# on the scaler's `transform` returning a frame rather than an array. Confirm.
df_importancias = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importancias
})

df_importancias = df_importancias.sort_values(by='Importance', ascending=False).reset_index(drop=True)

# Fall back to the raw column name when a feature has no label. Mapping with
# the dict directly would turn any unlisted feature into NaN.
df_importancias['Feature'] = df_importancias['Feature'].map(
    lambda nombre: etiquetas.get(nombre, nombre)
)

df_importancias

Unnamed: 0,Feature,Importance
0,Potencia,0.572469
1,Año de Matrícula,0.23274
2,Kilometraje,0.110989
3,Marca,0.078598
4,Cambio Automático,0.005203


In [22]:
import pickle

# Persist the best estimator found by the grid search to the project's bin
# directory, opened in binary write mode, which overwrites any existing file.
with open("../bin/mejor_modelo.pickle", "wb") as archivo:
    pickle.dump(grid_search_rf.best_estimator_, archivo)