In [20]:
import time
import re
from typing import Dict, Tuple, Optional
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost.callback import EarlyStopping

import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

In [2]:
# Configuration
RANDOM_STATE = 42  # seed shared by the data splits, the Optuna sampler and XGBoost
TEST_SIZE = 0.2  # fraction of the full dataset held out as the test set
VAL_SIZE = 0.2 # Fraction of the training portion (what remains after the test split) held out for validation

N_TRIALS_XGB = 60  # number of Optuna trials run per study
EARLY_STOP_ROUNDS = 200  # passed to xgb.train as early_stopping_rounds

### Hiperparámetros para el primer modelo XGBoost --> Control: sin VCR ni coordenadas

In [3]:
# Basic dataset setup: load the compact VCR dataset and build the log target.
# Rows are kept only when 0 < monto < 56000:
#   - the upper bound is the notebook's original hard-coded cut-off;
#   - the lower bound protects np.log below, which yields -inf for 0 and NaN
#     for negative values and would silently corrupt `log_monto`.
df_vcr_c = (
    pd.read_csv('dataset_vcr_compact.csv')
    .loc[lambda d: (d['monto'] > 0) & (d['monto'] < 56000)]
    .copy()
)
df_vcr_c['log_monto'] = np.log(df_vcr_c['monto'])
# Summary of the target, displayed as the cell output.
df_vcr_c['log_monto'].describe()

count    25211.000000
mean         8.395828
std          0.830310
min          5.950643
25%          7.740664
50%          8.242756
75%          8.984694
max         10.915088
Name: log_monto, dtype: float64

In [4]:
# Model-specific dataset (control model): drop every object-dtype column,
# the row id and both coordinate columns.
obj_cols = df_vcr_c.select_dtypes(include=["object"]).columns
cols_to_drop = [*obj_cols, "id", "latitud", "longitud"]
df_base = df_vcr_c.copy().drop(columns=cols_to_drop)
df_base.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 25211 entries, 0 to 25214
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   monto                 25211 non-null  int64  
 1   superficie_t          25211 non-null  float64
 2   dormitorios           25211 non-null  int64  
 3   dormitorios_faltante  25211 non-null  int64  
 4   banos                 25211 non-null  int64  
 5   banos_faltante        25211 non-null  int64  
 6   antiguedad            25211 non-null  int64  
 7   antiguedad_faltante   25211 non-null  int64  
 8   Or_N                  25211 non-null  int64  
 9   Or_S                  25211 non-null  int64  
 10  Or_E                  25211 non-null  int64  
 11  Or_O                  25211 non-null  int64  
 12  Or_Faltante           25211 non-null  int64  
 13  terraza               25211 non-null  float64
 14  estacionamiento       25211 non-null  int64  
 15  bodegas               25

In [5]:
# Split: hold out TEST_SIZE of the data for testing, then VAL_SIZE of the
# remaining rows for validation. Both splits use the same RANDOM_STATE.
DF = df_base
y = DF["log_monto"].values
X = DF.drop(columns=["monto", "log_monto"]).copy()  # features only: raw and log target removed

X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=VAL_SIZE,
    random_state=RANDOM_STATE,
)
print(f"Shapes -> train: {X_train.shape}, val: {X_val.shape}, test: {X_test.shape}")

Shapes -> train: (16134, 21), val: (4034, 21), test: (5043, 21)


In [6]:
# Wrap the train/validation splits as XGBoost DMatrix objects.
# objective_xgb reads `dtrain` and `dval` as module-level names, so this cell
# must be re-run whenever the split above changes.
dtrain = xgb.DMatrix(X_train, label=y_train)
dval   = xgb.DMatrix(X_val,   label=y_val)

In [None]:
def objective_xgb(trial):
    """Optuna objective: train one XGBoost booster and return its validation RMSE.

    The search space is expressed with sklearn-style parameter names so the
    winning values can later be handed to XGBRegressor unchanged; they are
    renamed to the native `xgb.train` names only for training.

    Reads the module-level `dtrain`, `dval` and `y_val` (the log-transformed
    target), plus `RANDOM_STATE` and `EARLY_STOP_ROUNDS`; `dval` and `y_val`
    must come from the same split.

    Side effects on `trial`: stores `best_n_estimators` (best early-stopping
    iteration + 1) and every sampled sklearn-style parameter as user attributes.

    Returns:
        float: RMSE on the validation set, in log space, using only the trees
        up to the best iteration.
    """
    # Sample the search space (sklearn naming). The order of the suggest_*
    # calls is kept fixed.
    learning_rate = trial.suggest_float("learning_rate", 1e-3, 0.3, log=True)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 12)
    subsample = trial.suggest_float("subsample", 0.6, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.6, 1.0)
    gamma = trial.suggest_float("gamma", 0.0, 5.0)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True)
    reg_lambda = trial.suggest_float("reg_lambda", 1e-2, 10.0, log=True)
    # Upper bound on boosting rounds; early stopping may end training sooner.
    max_rounds = trial.suggest_int("num_boost_round", 800, 3000, step=100)

    sklearn_space = {
        "learning_rate": learning_rate,
        "max_depth": max_depth,
        "min_child_weight": min_child_weight,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "gamma": gamma,
        "reg_alpha": reg_alpha,
        "reg_lambda": reg_lambda,
    }

    # Translate sklearn names to the native xgb.train names where they differ.
    native_alias = {"learning_rate": "eta", "reg_alpha": "alpha", "reg_lambda": "lambda"}
    train_params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",   # measured on the log target
        "device": "cuda",        # GPU is mandatory for this notebook
        "seed": RANDOM_STATE,
    }
    for sk_name, sk_value in sklearn_space.items():
        train_params[native_alias.get(sk_name, sk_name)] = sk_value

    booster = xgb.train(
        params=train_params,
        dtrain=dtrain,
        num_boost_round=max_rounds,
        evals=[(dval, "val")],
        early_stopping_rounds=EARLY_STOP_ROUNDS,
        verbose_eval=False,
    )

    # Score with the trees up to (and including) the best early-stopping iteration.
    best_rounds = booster.best_iteration + 1
    val_pred_log = booster.predict(dval, iteration_range=(0, best_rounds))
    val_rmse_log = float(np.sqrt(np.mean((y_val - val_pred_log) ** 2)))

    # Record the optimal tree count and the sklearn-style parameters on the
    # trial so the study cell can read them back from user_attrs.
    trial.set_user_attr("best_n_estimators", best_rounds)
    for sk_name, sk_value in sklearn_space.items():
        trial.set_user_attr(sk_name, sk_value)
    return val_rmse_log


In [9]:
# Hyperparameter search for the control model: minimise the validation RMSE
# returned by objective_xgb with a seeded TPE sampler.
# NOTE(review): objective_xgb never calls trial.report()/trial.should_prune(),
# so the MedianPruner configured here has no effect on the search.
study = optuna.create_study(
    direction="minimize",
    pruner=MedianPruner(n_startup_trials=5),
    sampler=TPESampler(seed=RANDOM_STATE),
)
print("Optuna XGB: iniciando búsqueda...")
study.optimize(objective_xgb, n_trials=N_TRIALS_XGB, show_progress_bar=True)

# Rebuild an sklearn-style parameter dict from the user attributes that the
# objective stored on the best trial. Key order is kept because the dict is
# later serialised to JSON.
t = study.best_trial
best_n_estimators = t.user_attrs["best_n_estimators"]  # best early-stopping iteration + 1
best_params_sklearn = {"n_estimators": best_n_estimators}
best_params_sklearn.update(
    {
        key: t.user_attrs[key]
        for key in (
            "learning_rate",
            "max_depth",
            "min_child_weight",
            "subsample",
            "colsample_bytree",
            "gamma",
            "reg_alpha",
            "reg_lambda",
        )
    }
)
best_params_sklearn.update(objective="reg:squarederror", random_state=RANDOM_STATE)
print("\nMejores HP XGB (val RMSE_log ↓):")
print(best_params_sklearn)


[I 2025-10-08 17:03:50,932] A new study created in memory with name: no-name-9e470bad-8c01-43ef-a25a-a33a6aa1c489


Optuna XGB: iniciando búsqueda...


  0%|          | 0/60 [00:00<?, ?it/s]

[I 2025-10-08 17:04:08,631] Trial 0 finished with value: 0.25438285482034523 and parameters: {'learning_rate': 0.008468008575248327, 'max_depth': 10, 'min_child_weight': 9, 'subsample': 0.8394633936788146, 'colsample_bytree': 0.6624074561769746, 'gamma': 0.7799726016810132, 'reg_alpha': 2.9152036385288193e-08, 'reg_lambda': 3.9676050770529883, 'num_boost_round': 2100}. Best is trial 0 with value: 0.25438285482034523.
[I 2025-10-08 17:04:25,028] Trial 1 finished with value: 0.25940937156558697 and parameters: {'learning_rate': 0.05675206026988748, 'max_depth': 3, 'min_child_weight': 12, 'subsample': 0.9329770563201687, 'colsample_bytree': 0.6849356442713105, 'gamma': 0.9091248360355031, 'reg_alpha': 2.9324868872723725e-07, 'reg_lambda': 0.08179499475211674, 'num_boost_round': 2000}. Best is trial 0 with value: 0.25438285482034523.
[I 2025-10-08 17:04:34,971] Trial 2 finished with value: 0.25944024927980674 and parameters: {'learning_rate': 0.01174843954800703, 'max_depth': 5, 'min_child

In [10]:
# === Export hyperparameters (adjust the version tag) ===
VERSION_TAG = "v1"
save_path = os.path.join("models_XGB", f"best_params_XGB_{VERSION_TAG}.json")

# Create the output folder if needed, then write the parameter dict as JSON.
os.makedirs("models_XGB", exist_ok=True)
with open(save_path, mode="w") as f:
    json.dump(best_params_sklearn, f, indent=2)

print(f"Hiperparámetros guardados en: {save_path}")

Hiperparámetros guardados en: models_XGB\best_params_XGB_v1.json


### Hiperparámetros para el segundo modelo XGBoost --> Con coordenadas y sin VCR

In [12]:
# Model-specific dataset (with coordinates): drop every object-dtype column
# and the row id; nothing else is removed.
obj_cols = df_vcr_c.select_dtypes(include=["object"]).columns
cols_to_drop = [*obj_cols, "id"]
df_coord = df_vcr_c.copy().drop(columns=cols_to_drop)
df_coord.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 25211 entries, 0 to 25214
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   monto                 25211 non-null  int64  
 1   superficie_t          25211 non-null  float64
 2   dormitorios           25211 non-null  int64  
 3   dormitorios_faltante  25211 non-null  int64  
 4   banos                 25211 non-null  int64  
 5   banos_faltante        25211 non-null  int64  
 6   antiguedad            25211 non-null  int64  
 7   antiguedad_faltante   25211 non-null  int64  
 8   Or_N                  25211 non-null  int64  
 9   Or_S                  25211 non-null  int64  
 10  Or_E                  25211 non-null  int64  
 11  Or_O                  25211 non-null  int64  
 12  Or_Faltante           25211 non-null  int64  
 13  terraza               25211 non-null  float64
 14  estacionamiento       25211 non-null  int64  
 15  bodegas               25

In [13]:
# Split: hold out TEST_SIZE of the data for testing, then VAL_SIZE of the
# remaining rows for validation. Both splits use the same RANDOM_STATE.
DF = df_coord
y = DF["log_monto"].values
X = DF.drop(columns=["monto", "log_monto"]).copy()  # features only: raw and log target removed

X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=VAL_SIZE,
    random_state=RANDOM_STATE,
)
print(f"Shapes -> train: {X_train.shape}, val: {X_val.shape}, test: {X_test.shape}")

Shapes -> train: (16134, 23), val: (4034, 23), test: (5043, 23)


In [14]:
# Rebuild the DMatrix objects from the coordinates split.
# This rebinds the module-level `dtrain`/`dval` that objective_xgb reads.
dtrain = xgb.DMatrix(X_train, label=y_train)
dval   = xgb.DMatrix(X_val,   label=y_val)

In [15]:
def objective_xgb(trial):
    """Optuna objective: train one XGBoost booster and return its validation RMSE.

    The search space is expressed with sklearn-style parameter names so the
    winning values can later be handed to XGBRegressor unchanged; they are
    renamed to the native `xgb.train` names only for training.

    Reads the module-level `dtrain`, `dval` and `y_val` (the log-transformed
    target), plus `RANDOM_STATE` and `EARLY_STOP_ROUNDS`; `dval` and `y_val`
    must come from the same split.

    Side effects on `trial`: stores `best_n_estimators` (best early-stopping
    iteration + 1) and every sampled sklearn-style parameter as user attributes.

    Returns:
        float: RMSE on the validation set, in log space, using only the trees
        up to the best iteration.
    """
    # Sample the search space (sklearn naming). The order of the suggest_*
    # calls is kept fixed.
    learning_rate = trial.suggest_float("learning_rate", 1e-3, 0.3, log=True)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 12)
    subsample = trial.suggest_float("subsample", 0.6, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.6, 1.0)
    gamma = trial.suggest_float("gamma", 0.0, 5.0)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True)
    reg_lambda = trial.suggest_float("reg_lambda", 1e-2, 10.0, log=True)
    # Upper bound on boosting rounds; early stopping may end training sooner.
    max_rounds = trial.suggest_int("num_boost_round", 800, 3000, step=100)

    sklearn_space = {
        "learning_rate": learning_rate,
        "max_depth": max_depth,
        "min_child_weight": min_child_weight,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "gamma": gamma,
        "reg_alpha": reg_alpha,
        "reg_lambda": reg_lambda,
    }

    # Translate sklearn names to the native xgb.train names where they differ.
    native_alias = {"learning_rate": "eta", "reg_alpha": "alpha", "reg_lambda": "lambda"}
    train_params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",   # measured on the log target
        "device": "cuda",        # GPU is mandatory for this notebook
        "seed": RANDOM_STATE,
    }
    for sk_name, sk_value in sklearn_space.items():
        train_params[native_alias.get(sk_name, sk_name)] = sk_value

    booster = xgb.train(
        params=train_params,
        dtrain=dtrain,
        num_boost_round=max_rounds,
        evals=[(dval, "val")],
        early_stopping_rounds=EARLY_STOP_ROUNDS,
        verbose_eval=False,
    )

    # Score with the trees up to (and including) the best early-stopping iteration.
    best_rounds = booster.best_iteration + 1
    val_pred_log = booster.predict(dval, iteration_range=(0, best_rounds))
    val_rmse_log = float(np.sqrt(np.mean((y_val - val_pred_log) ** 2)))

    # Record the optimal tree count and the sklearn-style parameters on the
    # trial so the study cell can read them back from user_attrs.
    trial.set_user_attr("best_n_estimators", best_rounds)
    for sk_name, sk_value in sklearn_space.items():
        trial.set_user_attr(sk_name, sk_value)
    return val_rmse_log


In [16]:
# Hyperparameter search for the model with coordinates: minimise the
# validation RMSE returned by objective_xgb with a seeded TPE sampler.
# NOTE(review): objective_xgb never calls trial.report()/trial.should_prune(),
# so the MedianPruner configured here has no effect on the search.
study = optuna.create_study(
    direction="minimize",
    pruner=MedianPruner(n_startup_trials=5),
    sampler=TPESampler(seed=RANDOM_STATE),
)
print("Optuna XGB: iniciando búsqueda...")
study.optimize(objective_xgb, n_trials=N_TRIALS_XGB, show_progress_bar=True)

# Rebuild an sklearn-style parameter dict from the user attributes that the
# objective stored on the best trial. Key order is kept because the dict is
# later serialised to JSON.
t = study.best_trial
best_n_estimators = t.user_attrs["best_n_estimators"]  # best early-stopping iteration + 1
best_params_sklearn = {"n_estimators": best_n_estimators}
best_params_sklearn.update(
    {
        key: t.user_attrs[key]
        for key in (
            "learning_rate",
            "max_depth",
            "min_child_weight",
            "subsample",
            "colsample_bytree",
            "gamma",
            "reg_alpha",
            "reg_lambda",
        )
    }
)
best_params_sklearn.update(objective="reg:squarederror", random_state=RANDOM_STATE)
print("\nMejores HP XGB con coordenadas(val RMSE_log ↓):")
print(best_params_sklearn)


[I 2025-10-08 17:36:55,243] A new study created in memory with name: no-name-752f5777-f9b2-4917-8cb7-807c2ea1734f


Optuna XGB: iniciando búsqueda...


  0%|          | 0/60 [00:00<?, ?it/s]

[I 2025-10-08 17:37:10,941] Trial 0 finished with value: 0.18239654774617994 and parameters: {'learning_rate': 0.008468008575248327, 'max_depth': 10, 'min_child_weight': 9, 'subsample': 0.8394633936788146, 'colsample_bytree': 0.6624074561769746, 'gamma': 0.7799726016810132, 'reg_alpha': 2.9152036385288193e-08, 'reg_lambda': 3.9676050770529883, 'num_boost_round': 2100}. Best is trial 0 with value: 0.18239654774617994.
[I 2025-10-08 17:37:15,635] Trial 1 finished with value: 0.19185196276566827 and parameters: {'learning_rate': 0.05675206026988748, 'max_depth': 3, 'min_child_weight': 12, 'subsample': 0.9329770563201687, 'colsample_bytree': 0.6849356442713105, 'gamma': 0.9091248360355031, 'reg_alpha': 2.9324868872723725e-07, 'reg_lambda': 0.08179499475211674, 'num_boost_round': 2000}. Best is trial 0 with value: 0.18239654774617994.
[I 2025-10-08 17:37:22,384] Trial 2 finished with value: 0.1905545362853053 and parameters: {'learning_rate': 0.01174843954800703, 'max_depth': 5, 'min_child_

In [17]:
# === Export hyperparameters (adjust the version tag) ===
VERSION_TAG = "v2"
save_path = os.path.join("models_XGB", f"best_params_XGB_{VERSION_TAG}.json")

# Create the output folder if needed, then write the parameter dict as JSON.
os.makedirs("models_XGB", exist_ok=True)
with open(save_path, mode="w") as f:
    json.dump(best_params_sklearn, f, indent=2)

print(f"Hiperparámetros guardados en: {save_path}")

Hiperparámetros guardados en: models_XGB\best_params_XGB_v2.json


### Hiperparámetros para el tercer modelo XGBoost --> Con coordenadas y VCR

In [18]:
# Load the expanded VCR dataset and build the log target.
# Rows are kept only when 0 < monto < 56000:
#   - the upper bound is the notebook's original hard-coded cut-off;
#   - the lower bound protects np.log below, which yields -inf for 0 and NaN
#     for negative values and would silently corrupt `log_monto`.
df_vcr_e = (
    pd.read_csv('dataset_vcr_expanded.csv')
    .loc[lambda d: (d['monto'] > 0) & (d['monto'] < 56000)]
    .copy()
)
df_vcr_e['log_monto'] = np.log(df_vcr_e['monto'])
# Summary of the target, displayed as the cell output.
df_vcr_e['log_monto'].describe()

count    25211.000000
mean         8.395828
std          0.830310
min          5.950643
25%          7.740664
50%          8.242756
75%          8.984694
max         10.915088
Name: log_monto, dtype: float64

In [19]:
# Model-specific dataset (coordinates + VCR): drop every object-dtype column
# and the row id; nothing else is removed.
obj_cols = df_vcr_e.select_dtypes(include=["object"]).columns
cols_to_drop = [*obj_cols, "id"]
df_vcr = df_vcr_e.copy().drop(columns=cols_to_drop)
df_vcr.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 25211 entries, 0 to 25214
Data columns (total 181 columns):
 #    Column                        Dtype  
---   ------                        -----  
 0    monto                         int64  
 1    superficie_t                  float64
 2    dormitorios                   int64  
 3    dormitorios_faltante          int64  
 4    banos                         int64  
 5    banos_faltante                int64  
 6    antiguedad                    int64  
 7    antiguedad_faltante           int64  
 8    Or_N                          int64  
 9    Or_S                          int64  
 10   Or_E                          int64  
 11   Or_O                          int64  
 12   Or_Faltante                   int64  
 13   terraza                       float64
 14   estacionamiento               int64  
 15   bodegas                       int64  
 16   flag_Departamento             int64  
 17   flag_Multinivel               int64  
 18   flag_Semi

In [21]:
# %% Configuration
# VCR dimensions (1..12): dimension index -> name of the statistic.
# NOTE(review): DIMS_MAP is not referenced by the functions visible in this
# notebook; it appears to serve as documentation of the '_dimNN' suffixes.
DIMS_MAP = {
    1: "count_pois",
    2: "mean_distance",
    3: "min_distance",
    4: "max_distance",
    5: "median_distance",
    6: "std_distance",
    7: "mean_inverse_distance",
    8: "max_inverse_distance",
    9: "sum_inverse_distance",
    10: "ratio_within_near_radius",
    11: "ratio_within_mid_radius",
    12: "ratio_within_far_radius",
}

# Role of each dimension, used to choose the semantic imputation value
# (the trailing '->' notes show what a missing value is filled with).
DIM_ROLE = {
    1: "count",                # -> 0
    2: "distance",             # -> R3
    3: "distance",             # -> R3
    4: "distance",             # -> R3
    5: "distance",             # -> R3
    6: "std",                  # -> 0
    7: "inverse",              # -> 0
    8: "inverse",              # -> 0
    9: "inverse",              # -> 0
    10: "ratio",               # -> 0
    11: "ratio",               # -> 0
    12: "ratio",               # -> 0
}

# R3 per class type: the value used to fill missing distance dimensions.
R3_DEFAULT = 2400.0  # general classes (fallback)
R3_METRO = 1600.0  # classes whose name contains "metro"
R3_BUS = 800.0  # classes whose name contains "bus"

# %% Funciones
def _class_and_dim(col: str) -> Optional[Tuple[str, int]]:
    """Extrae (clase, índice de dimensión) de columnas tipo '<clase>_dimXX'."""
    m = re.match(r"^(?P<klass>.+)_dim(?P<idx>\d{1,2})$", col)
    if not m:
        return None
    return m.group("klass"), int(m.group("idx"))


def _r3_for_class(klass: str) -> float:
    """Return the R3 fill value for a POI class, chosen by substring match.

    The class name is lower-cased; "metro" is checked before "bus", and any
    other class falls back to R3_DEFAULT.
    """
    lowered = klass.lower()
    for token, radius in (("metro", R3_METRO), ("bus", R3_BUS)):
        if token in lowered:
            return radius
    return R3_DEFAULT


def impute_vcr_semantic(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing VCR values according to what "absent" means per dimension.

    Columns named '<class>_dimNN' are grouped by class. Missing values are
    filled with the class's R3 value for distance dimensions and with 0.0 for
    every other dimension (count, std, inverse, ratio, or an unknown index).
    A 0/1 `has_<class>` column is added per class, computed before filling:
    1 when at least one of that class's dimension columns was non-null.

    Works on a copy, so the input frame is not modified. If no '<class>_dimNN'
    column exists, the copy is returned unchanged and nothing is printed;
    otherwise the NaN counts before and after imputation are printed.
    """
    result = df.copy()

    # class -> {dimension index -> column name}, in column order.
    by_class: Dict[str, Dict[int, str]] = {}
    vcr_cols = []
    for name in result.columns:
        parsed = _class_and_dim(name)
        if parsed is not None:
            klass, idx = parsed
            by_class.setdefault(klass, {})[idx] = name
            vcr_cols.append(name)

    if not by_class:
        # Nothing to impute.
        return result

    # Presence flags must be derived before filling, otherwise every row would
    # look populated; they distinguish "no POIs at all" from "POIs far away".
    for klass, dim_cols in by_class.items():
        present = result[list(dim_cols.values())].notna().any(axis=1)
        result[f"has_{klass}"] = present.astype("int64")

    nans_before = int(result[vcr_cols].isna().sum().sum())

    # Fill each column: distances get the class R3, everything else gets 0.0
    # (unknown dimension indices are deliberately treated as 0.0 too).
    for klass, dim_cols in by_class.items():
        radius = _r3_for_class(klass)
        for idx, col in dim_cols.items():
            fill_value = radius if DIM_ROLE.get(idx) == "distance" else 0.0
            result[col] = result[col].fillna(fill_value)

    nans_after = int(result[vcr_cols].isna().sum().sum())
    print(f"Imputación VCR completada. NaNs antes: {nans_before:,d} -> después: {nans_after:,d}")

    return result


# Apply the semantic imputation; impute_vcr_semantic works on a copy, so
# df_vcr keeps its original NaNs and df_vcr_imp holds the filled frame plus
# the has_<class> flags.
df_vcr_imp = impute_vcr_semantic(df_vcr)
df_vcr_imp.info() 

Imputación VCR completada. NaNs antes: 246,228 -> después: 0
<class 'pandas.core.frame.DataFrame'>
Index: 25211 entries, 0 to 25214
Columns: 194 entries, monto to has_bus
dtypes: float64(161), int64(33)
memory usage: 37.5 MB


In [22]:
# Split: hold out TEST_SIZE of the data for testing, then VAL_SIZE of the
# remaining rows for validation. Both splits use the same RANDOM_STATE.
DF = df_vcr_imp
y = DF["log_monto"].values
X = DF.drop(columns=["monto", "log_monto"]).copy()  # features only: raw and log target removed

X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=VAL_SIZE,
    random_state=RANDOM_STATE,
)
print(f"Shapes -> train: {X_train.shape}, val: {X_val.shape}, test: {X_test.shape}")

Shapes -> train: (16134, 192), val: (4034, 192), test: (5043, 192)


In [23]:
# Rebuild the DMatrix objects from the imputed VCR split.
# This rebinds the module-level `dtrain`/`dval` that objective_xgb reads.
dtrain = xgb.DMatrix(X_train, label=y_train)
dval   = xgb.DMatrix(X_val,   label=y_val)

In [24]:
def objective_xgb(trial):
    """Optuna objective: train one XGBoost booster and return its validation RMSE.

    The search space is expressed with sklearn-style parameter names so the
    winning values can later be handed to XGBRegressor unchanged; they are
    renamed to the native `xgb.train` names only for training.

    Reads the module-level `dtrain`, `dval` and `y_val` (the log-transformed
    target), plus `RANDOM_STATE` and `EARLY_STOP_ROUNDS`; `dval` and `y_val`
    must come from the same split.

    Side effects on `trial`: stores `best_n_estimators` (best early-stopping
    iteration + 1) and every sampled sklearn-style parameter as user attributes.

    Returns:
        float: RMSE on the validation set, in log space, using only the trees
        up to the best iteration.
    """
    # Sample the search space (sklearn naming). The order of the suggest_*
    # calls is kept fixed.
    learning_rate = trial.suggest_float("learning_rate", 1e-3, 0.3, log=True)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 12)
    subsample = trial.suggest_float("subsample", 0.6, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.6, 1.0)
    gamma = trial.suggest_float("gamma", 0.0, 5.0)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True)
    reg_lambda = trial.suggest_float("reg_lambda", 1e-2, 10.0, log=True)
    # Upper bound on boosting rounds; early stopping may end training sooner.
    max_rounds = trial.suggest_int("num_boost_round", 800, 3000, step=100)

    sklearn_space = {
        "learning_rate": learning_rate,
        "max_depth": max_depth,
        "min_child_weight": min_child_weight,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "gamma": gamma,
        "reg_alpha": reg_alpha,
        "reg_lambda": reg_lambda,
    }

    # Translate sklearn names to the native xgb.train names where they differ.
    native_alias = {"learning_rate": "eta", "reg_alpha": "alpha", "reg_lambda": "lambda"}
    train_params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",   # measured on the log target
        "device": "cuda",        # GPU is mandatory for this notebook
        "seed": RANDOM_STATE,
    }
    for sk_name, sk_value in sklearn_space.items():
        train_params[native_alias.get(sk_name, sk_name)] = sk_value

    booster = xgb.train(
        params=train_params,
        dtrain=dtrain,
        num_boost_round=max_rounds,
        evals=[(dval, "val")],
        early_stopping_rounds=EARLY_STOP_ROUNDS,
        verbose_eval=False,
    )

    # Score with the trees up to (and including) the best early-stopping iteration.
    best_rounds = booster.best_iteration + 1
    val_pred_log = booster.predict(dval, iteration_range=(0, best_rounds))
    val_rmse_log = float(np.sqrt(np.mean((y_val - val_pred_log) ** 2)))

    # Record the optimal tree count and the sklearn-style parameters on the
    # trial so the study cell can read them back from user_attrs.
    trial.set_user_attr("best_n_estimators", best_rounds)
    for sk_name, sk_value in sklearn_space.items():
        trial.set_user_attr(sk_name, sk_value)
    return val_rmse_log


In [25]:
# Hyperparameter search for the full model (coordinates + VCR): minimise the
# validation RMSE returned by objective_xgb with a seeded TPE sampler.
# NOTE(review): objective_xgb never calls trial.report()/trial.should_prune(),
# so the MedianPruner configured here has no effect on the search.
study = optuna.create_study(
    direction="minimize",
    pruner=MedianPruner(n_startup_trials=5),
    sampler=TPESampler(seed=RANDOM_STATE),
)
print("Optuna XGB: iniciando búsqueda...")
study.optimize(objective_xgb, n_trials=N_TRIALS_XGB, show_progress_bar=True)

# Rebuild an sklearn-style parameter dict from the user attributes that the
# objective stored on the best trial. Key order is kept because the dict is
# later serialised to JSON.
t = study.best_trial
best_n_estimators = t.user_attrs["best_n_estimators"]  # best early-stopping iteration + 1
best_params_sklearn = {"n_estimators": best_n_estimators}
best_params_sklearn.update(
    {
        key: t.user_attrs[key]
        for key in (
            "learning_rate",
            "max_depth",
            "min_child_weight",
            "subsample",
            "colsample_bytree",
            "gamma",
            "reg_alpha",
            "reg_lambda",
        )
    }
)
best_params_sklearn.update(objective="reg:squarederror", random_state=RANDOM_STATE)
print("\nMejores HP XGB completo (val RMSE_log ↓):")
print(best_params_sklearn)


[I 2025-10-08 17:51:35,987] A new study created in memory with name: no-name-8f947f13-5979-456a-9ff4-35be181bea9a


Optuna XGB: iniciando búsqueda...


  0%|          | 0/60 [00:00<?, ?it/s]

[I 2025-10-08 17:51:56,009] Trial 0 finished with value: 0.17462495238637127 and parameters: {'learning_rate': 0.008468008575248327, 'max_depth': 10, 'min_child_weight': 9, 'subsample': 0.8394633936788146, 'colsample_bytree': 0.6624074561769746, 'gamma': 0.7799726016810132, 'reg_alpha': 2.9152036385288193e-08, 'reg_lambda': 3.9676050770529883, 'num_boost_round': 2100}. Best is trial 0 with value: 0.17462495238637127.
[I 2025-10-08 17:52:00,442] Trial 1 finished with value: 0.1850335463882574 and parameters: {'learning_rate': 0.05675206026988748, 'max_depth': 3, 'min_child_weight': 12, 'subsample': 0.9329770563201687, 'colsample_bytree': 0.6849356442713105, 'gamma': 0.9091248360355031, 'reg_alpha': 2.9324868872723725e-07, 'reg_lambda': 0.08179499475211674, 'num_boost_round': 2000}. Best is trial 0 with value: 0.17462495238637127.
[I 2025-10-08 17:52:08,873] Trial 2 finished with value: 0.18525487304533986 and parameters: {'learning_rate': 0.01174843954800703, 'max_depth': 5, 'min_child_

In [26]:
# === Export hyperparameters ===
VERSION_TAG = "v3"
save_path = os.path.join("models_XGB", f"best_params_XGB_{VERSION_TAG}.json")

# Create the output folder if needed, then write the parameter dict as JSON.
os.makedirs("models_XGB", exist_ok=True)
with open(save_path, mode="w") as f:
    json.dump(best_params_sklearn, f, indent=2)

print(f"Hiperparámetros guardados en: {save_path}")

Hiperparámetros guardados en: models_XGB\best_params_XGB_v3.json
