# Modelos de regresion (lineal y RandomForest)

In [None]:
import json
import pandas as pd
import numpy as np
import joblib
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

# Location of the train/test CSVs, relative to the working directory.
base_path = Path("../../01_preprocessing_results/preprocessing")
objetivo = "Total_Amount"  # name of the target column

train = pd.read_csv(base_path / "T_train_final_objetivo.csv")
test = pd.read_csv(base_path / "T_test_final_objetivo.csv")

# Split each frame into features (every column except the target) and target.
X_train, y_train = train.drop(columns=[objetivo]), train[objetivo]
X_test, y_test = test.drop(columns=[objetivo]), test[objetivo]


In [None]:
# Train models
# ``fit`` returns the estimator itself, so construction and fitting are chained.
lin_reg = LinearRegression().fit(X_train, y_train)

# Random forest: 200 trees, unrestricted depth, fixed seed, all CPU cores.
rf_reg = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)


In [None]:
# Evaluacion

def evaluar(nombre, modelo, X=None, y=None):
    """Score a fitted regressor and collect its metrics and predictions.

    Args:
        nombre: Label stored under the ``"modelo"`` key of the result.
        modelo: Fitted estimator exposing a ``predict`` method.
        X: Features to predict on. When omitted (together with ``y``) the
            module-level ``X_test`` is used.
        y: True target values for ``X``. When omitted (together with ``X``)
            the module-level ``y_test`` is used.

    Returns:
        dict with the keys ``"modelo"``, ``"mse"``, ``"rmse"``, ``"mae"``,
        ``"r2"`` and ``"preds"`` (the raw output of ``modelo.predict``).

    Raises:
        ValueError: If only one of ``X`` / ``y`` is supplied; scoring
            predictions against an unrelated target would be meaningless.
    """
    if (X is None) != (y is None):
        raise ValueError("X e y deben proporcionarse juntos")
    if X is None:
        # Backward-compatible default: evaluate on the held-out test split.
        X, y = X_test, y_test

    preds = modelo.predict(X)
    mse = mean_squared_error(y, preds)
    return {
        "modelo": nombre,
        "mse": mse,
        "rmse": np.sqrt(mse),  # same units as the target
        "mae": mean_absolute_error(y, preds),
        "r2": r2_score(y, preds),
        "preds": preds,
    }

# Evaluate both fitted models with the shared helper.
res_lin = evaluar("LinearRegression", lin_reg)
res_rf = evaluar("RandomForestRegressor", rf_reg)

# One row per model; the raw prediction arrays are left out of the table.
resultados = pd.DataFrame([
    {clave: valor for clave, valor in res.items() if clave != "preds"}
    for res in (res_lin, res_rf)
])
print(resultados)


In [None]:
# Plots for the best model (highest R^2)
# Strict ">" keeps the linear model on ties, matching max()'s first-wins rule.
mejor = res_rf if res_rf["r2"] > res_lin["r2"] else res_lin
mejor_preds = mejor["preds"]

# Predicted-vs-actual scatter with the identity line as reference.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, mejor_preds, alpha=0.6)
ax.set_xlabel("Valor real")
ax.set_ylabel("Prediccion")
ax.set_title(f"Pred vs real - {mejor['modelo']}")
limites = [y_test.min(), y_test.max()]
ax.plot(limites, limites, "r--")
ax.grid(True)
fig.tight_layout()
fig.savefig("scatter_pred_vs_real.png", dpi=150)
plt.close(fig)

# Histogram of the residuals (actual minus predicted).
residuos = y_test - mejor_preds
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(residuos, bins=20, edgecolor="black")
ax.set_xlabel("Error")
ax.set_ylabel("Frecuencia")
ax.set_title(f"Distribucion de errores - {mejor['modelo']}")
fig.tight_layout()
fig.savefig("residuos_mejor_modelo.png", dpi=150)
plt.close(fig)


In [None]:
# Save artifacts
# Fitted estimators, keyed by the file each one is written to.
modelos_por_archivo = {
    "modelo_reg_lineal.pkl": lin_reg,
    "modelo_random_forest.pkl": rf_reg,
}
for ruta, estimador in modelos_por_archivo.items():
    joblib.dump(estimador, ruta)

# Metrics comparison table.
resultados.to_csv("comparacion_modelos.csv", index=False)

# Feature column order and target name used for training.
esquema = {"columns": X_train.columns.tolist(), "target": objetivo}
with open("expected_columns.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(esquema, ensure_ascii=False, indent=2))

# Test-set targets next to each model's predictions.
preds_df = pd.DataFrame({"y_real": y_test})
preds_df["y_pred_lineal"] = res_lin["preds"]
preds_df["y_pred_rf"] = res_rf["preds"]
preds_df.to_csv("predicciones_test.csv", index=False)
print("Artefactos guardados")
