In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import xgboost as xgb

In [5]:
# =========================
# 1. CARGA DE DATOS
# =========================
df = pd.read_csv("../data/diamantesLimpios.csv")

# One-Hot Encoding para variables categóricas
# drop_first=True evita multicolinealidad
df_encoded = pd.get_dummies(df, drop_first=True)

# Separar variables independientes y target
X = df_encoded.drop("price", axis=1)
y = df_encoded["price"]

# =========================
# 2. TRAIN / TEST SPLIT
# =========================
# Usamos 80/20, más habitual
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ⚠️ IMPORTANTE:
# No usamos StandardScaler porque estos modelos son de árboles
# (no dependen de distancias)

# =========================
# 3. MODELOS BASE
# =========================
models = {
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(
        n_estimators=200,
        random_state=42,
        n_jobs=-1
    ),
    "Gradient Boosting": GradientBoostingRegressor(
        n_estimators=200,
        learning_rate=0.1,
        random_state=42
    ),
    "XGBoost": xgb.XGBRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        objective="reg:squarederror",
        random_state=42
    )
}

# =========================
# 4. ENTRENAMIENTO + EVALUACIÓN
# =========================
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    mae = mean_absolute_error(y_test, preds)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)

    print(f"\n-- {name} --")
    print(f"R²:   {r2:.3f}")
    print(f"MAE:  {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")

# =========================
# 5. VALIDACIÓN CRUZADA (ejemplo con Random Forest)
# =========================
rf = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

# 5-fold Cross Validation con R²
cv_scores = cross_val_score(rf, X, y, cv=5, scoring="r2")

print("\n-- Random Forest CV --")
print(f"R² medio: {cv_scores.mean():.3f}")
print(f"Desviación: {cv_scores.std():.3f}")

# =========================
# 6. GRID SEARCH (tuning sencillo)
# =========================
param_grid = {
    "n_estimators": [200, 400],
    "max_depth": [None, 10, 20],
    "min_samples_leaf": [1, 5, 10]
}

grid = GridSearchCV(
    RandomForestRegressor(random_state=42, n_jobs=-1),
    param_grid,
    cv=3,
    scoring="r2",
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("\n-- Mejor Random Forest --")
print("Mejores parámetros:", grid.best_params_)
print("R² CV:", grid.best_score_)

# Evaluación final del mejor modelo
best_rf = grid.best_estimator_
preds = best_rf.predict(X_test)

print("\n-- Random Forest Optimizado (Test) --")
print(f"R²:   {r2_score(y_test, preds):.3f}")
print(f"MAE:  {mean_absolute_error(y_test, preds):.2f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, preds)):.2f}")


-- Decision Tree --
R²:   0.598
MAE:  407.40
RMSE: 6307.55

-- Random Forest --
R²:   0.896
MAE:  299.31
RMSE: 3211.41

-- Gradient Boosting --
R²:   0.894
MAE:  520.04
RMSE: 3239.75

-- XGBoost --
R²:   0.667
MAE:  449.79
RMSE: 5740.30

-- Random Forest CV --
R² medio: -3.599
Desviación: 3.282

-- Mejor Random Forest --
Mejores parámetros: {'max_depth': 20, 'min_samples_leaf': 1, 'n_estimators': 200}
R² CV: 0.7966530048606301

-- Random Forest Optimizado (Test) --
R²:   0.897
MAE:  306.61
RMSE: 3194.48
