# In [None]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from xgboost import XGBRegressor

# 📌 Load data
data_path = "../data/final/sri_autos_features.csv"
# The file is semicolon-separated and Latin-1 encoded; low_memory=False makes
# pandas read it in one pass instead of inferring dtypes chunk by chunk.
df = pd.read_csv(
    data_path,
    sep=';',
    encoding="latin-1",
    low_memory=False,
)

# 📌 Target (y) is the "avaluo" column; every other column is a predictor (X)
y = df["avaluo"]
X = df.drop("avaluo", axis=1)

# 📌 Hold out 20% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 📌 Helper to evaluate models
def evaluar_modelo(model, X_test, y_test, nombre):
    """Print MAE, RMSE and R² of a fitted regressor on a test set and return them.

    Args:
        model: Fitted estimator; only its ``predict`` method is used.
        X_test: Features passed to ``model.predict``.
        y_test: True target values the predictions are compared against.
        nombre: Display name of the model, shown in the printed header.

    Returns:
        dict: ``{"mae": ..., "rmse": ..., "r2": ...}`` so callers can compare
        models programmatically instead of parsing the printed report.
    """
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    # RMSE is derived by hand from the MSE instead of passing `squared=False`,
    # which newer scikit-learn releases no longer accept.
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    r2 = r2_score(y_test, y_pred)

    print(f"\n📊 Evaluación del Modelo {nombre}:")
    print(f"✅ MAE: {mae:.2f}")
    print(f"✅ RMSE: {rmse:.2f}")
    print(f"✅ R² Score: {r2:.4f}")

    return {"mae": mae, "rmse": rmse, "r2": r2}

# 📌 Train the initial Random Forest model
# Baseline Random Forest: 500 trees, depth capped at 25 to limit overfitting.
# min_samples_split=2 is the least restrictive setting (any node with two
# samples may still be split); n_jobs=-1 trains on all available cores.
rf_model = RandomForestRegressor(
    n_estimators=500,
    max_depth=25,
    min_samples_split=2,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# 📌 Train the initial XGBoost model
xgb_model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=9,
    random_state=42,
).fit(X_train, y_train)

# 📌 Evaluate both baseline models on the hold-out test set
evaluar_modelo(model=rf_model, X_test=X_test, y_test=y_test, nombre="Random Forest Inicial")
evaluar_modelo(model=xgb_model, X_test=X_test, y_test=y_test, nombre="XGBoost Inicial")

# 📌 Hyperparameter optimization

# Hyperparameter search space for Random Forest
param_grid_rf = dict(
    n_estimators=[100, 200, 500],
    max_depth=[10, 20, 30, None],  # None lets trees grow without a depth limit
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", "log2"],
)

# Hyperparameter search space for XGBoost
param_grid_xgb = dict(
    n_estimators=[100, 200, 500],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 6, 9],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
    gamma=[0, 0.1, 0.2],
)

# Exhaustive grid search for Random Forest: every combination in
# param_grid_rf is scored with 5-fold cross-validation on the training set.
grid_search_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid_rf,
    scoring='neg_mean_absolute_error',  # 'neg_mean_squared_error' or 'r2' also work
    cv=5,  # 5-fold cross-validation
    n_jobs=-1
)
grid_search_rf.fit(X_train, y_train)

# Randomized search for XGBoost: only n_iter random combinations are tried.
# random_state seeds the sampling so the same candidates (and therefore the
# same best parameters) are drawn on every run, like the rest of the script.
random_search_xgb = RandomizedSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_distributions=param_grid_xgb,
    n_iter=50,  # number of random combinations to try
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    random_state=42
)
random_search_xgb.fit(X_train, y_train)

# Show the best parameters found by each search
print("\nMejores parámetros para Random Forest:", grid_search_rf.best_params_)
print("Mejores parámetros para XGBoost:", random_search_xgb.best_params_)

# 📌 Final models using the best parameters found
# Both searches run with the default refit=True, so after fitting they already
# hold a copy of their estimator (random_state=42) configured with
# best_params_ and trained on the full X_train / y_train. Reusing it yields
# the same model as rebuilding and refitting by hand, without paying for two
# more training runs.
best_rf_model = grid_search_rf.best_estimator_
best_xgb_model = random_search_xgb.best_estimator_

# 📌 Evaluate the optimized models on the hold-out test set
evaluar_modelo(best_rf_model, X_test, y_test, "Random Forest Optimizado")
evaluar_modelo(best_xgb_model, X_test, y_test, "XGBoost Optimizado")

# 📌 Results visualization

# Random Forest: max_depth vs MAE, measured on the hold-out test set (not cross-validated)
max_depths = [10, 20, 30, None]  # None = unlimited depth
mae_scores = []

# Fit one forest per depth and record its MAE on the hold-out test set.
for depth in max_depths:
    # n_jobs=-1 only parallelises training; results are unchanged.
    model = RandomForestRegressor(max_depth=depth, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae_scores.append(mean_absolute_error(y_test, y_pred))

# None cannot be positioned on a numeric axis, so the unlimited-depth point
# would be lost. Plot the depths as category labels instead, which keeps all
# four points visible and evenly spaced.
depth_labels = [str(depth) for depth in max_depths]
plt.plot(depth_labels, mae_scores, marker='o')
plt.xlabel('max_depth')
plt.ylabel('MAE')
plt.title('Random Forest: max_depth vs MAE')
plt.show()

# Feature-importance bar chart for Random Forest
importances = best_rf_model.feature_importances_
feature_names = X.columns
# Order features from most to least important.
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(10, 6))
plt.title("Importancia de características (Random Forest)")
plt.bar(range(X.shape[1]), importances[indices], align="center")
plt.xticks(range(X.shape[1]), feature_names[indices], rotation=90)
plt.xlabel("Características")
plt.ylabel("Importancia")
# Vertical tick labels can extend past the figure edge; tight_layout makes
# room for them so feature names are not cut off.
plt.tight_layout()
plt.show()

# 📌 Save optimized models
# Create the output directory first: joblib.dump does not create missing
# parent directories, and failing here would throw away the whole training run.
models_dir = Path("../models")
models_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(best_rf_model, models_dir / "random_forest_optimizado.pkl")
joblib.dump(best_xgb_model, models_dir / "xgboost_optimizado.pkl")