In [1]:
import pandas as pd
import numpy as np

In [2]:
df_vcr_e = pd.read_csv('dataset_vcr_expanded.csv')
df_vcr_e = df_vcr_e[df_vcr_e['monto'] < 56000].copy()
df_vcr_e['log_monto']=np.log(df_vcr_e['monto'])
df_vcr_e['log_monto'].describe()

count    25211.000000
mean         8.395828
std          0.830310
min          5.950643
25%          7.740664
50%          8.242756
75%          8.984694
max         10.915088
Name: log_monto, dtype: float64

In [3]:
df_control_vcr =df_vcr_e.copy()
obj_cols = df_control_vcr.select_dtypes(include=["object"]).columns
cols_to_drop = list(obj_cols)
cols_to_drop.append("id")
df_control_vcr = df_control_vcr.drop(columns=cols_to_drop)
df_control_vcr.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 25211 entries, 0 to 25214
Data columns (total 181 columns):
 #    Column                        Dtype  
---   ------                        -----  
 0    monto                         int64  
 1    superficie_t                  float64
 2    dormitorios                   int64  
 3    dormitorios_faltante          int64  
 4    banos                         int64  
 5    banos_faltante                int64  
 6    antiguedad                    int64  
 7    antiguedad_faltante           int64  
 8    Or_N                          int64  
 9    Or_S                          int64  
 10   Or_E                          int64  
 11   Or_O                          int64  
 12   Or_Faltante                   int64  
 13   terraza                       float64
 14   estacionamiento               int64  
 15   bodegas                       int64  
 16   flag_Departamento             int64  
 17   flag_Multinivel               int64  
 18   flag_Semi

In [4]:
import re
import numpy as np
import pandas as pd
from typing import Dict, Tuple, Optional

# %% Configuración (ajusta si tu mapeo de dimensiones difiere)
# Dimensiones (1..12) -> nombre lógico (solo para claridad)
DIMS_MAP = {
    1: "count_pois",
    2: "mean_distance",
    3: "min_distance",
    4: "max_distance",
    5: "median_distance",
    6: "std_distance",
    7: "mean_inverse_distance",
    8: "max_inverse_distance",
    9: "sum_inverse_distance",
    10: "ratio_within_near_radius",
    11: "ratio_within_mid_radius",
    12: "ratio_within_far_radius",
}

# Rol por dimensión (para decidir la imputación semántica)
DIM_ROLE = {
    1: "count",                # -> 0
    2: "distance",             # -> R3
    3: "distance",             # -> R3
    4: "distance",             # -> R3
    5: "distance",             # -> R3
    6: "std",                  # -> 0
    7: "inverse",              # -> 0
    8: "inverse",              # -> 0
    9: "inverse",              # -> 0
    10: "ratio",               # -> 0
    11: "ratio",               # -> 0
    12: "ratio",               # -> 0
}

# R3 por tipo de clase
R3_DEFAULT = 2400.0  # clases generales
R3_METRO = 1600.0
R3_BUS = 800.0

# %% Funciones
def _class_and_dim(col: str) -> Optional[Tuple[str, int]]:
    """Extrae (clase, índice de dimensión) de columnas tipo '<clase>_dimXX'."""
    m = re.match(r"^(?P<klass>.+)_dim(?P<idx>\d{1,2})$", col)
    if not m:
        return None
    return m.group("klass"), int(m.group("idx"))


def _r3_for_class(klass: str) -> float:
    k = klass.lower()
    if "metro" in k:
        return R3_METRO
    if "bus" in k:
        return R3_BUS
    return R3_DEFAULT


def impute_vcr_semantic(df: pd.DataFrame) -> pd.DataFrame:
    """Imputa VCR por semántica de ausencia: distancias=R3, inversas/ratios=0, count=0, std=0.
    Además agrega flags `has_<clase>` indicando presencia de POIs por clase.
    """
    out = df.copy()

    # Agrupar columnas por clase
    groups: Dict[str, Dict[int, str]] = {}
    vcr_cols = []
    for c in out.columns:
        parsed = _class_and_dim(c)
        if parsed is None:
            continue
        klass, idx = parsed
        groups.setdefault(klass, {})[idx] = c
        vcr_cols.append(c)

    if not groups:
        # Nada que imputar
        return out

    # Flags de presencia por clase (antes de imputar)
    for klass, dim_map in groups.items():
        cols = list(dim_map.values())
        has_series = out[cols].notna().any(axis=1).astype("int64")
        out[f"has_{klass}"] = has_series  # por qué: distingue ausencia real vs lejanía

    # Imputación por clase/dim
    n_total_nans = int(out[vcr_cols].isna().sum().sum())
    for klass, dim_map in groups.items():
        r3 = _r3_for_class(klass)
        for idx, col in dim_map.items():
            role = DIM_ROLE.get(idx)
            if role == "distance":
                fill_value = r3
            elif role in {"inverse", "ratio", "std", "count"}:
                fill_value = 0.0
            else:
                # Si hay una dimensión desconocida, ser conservador con 0.0
                fill_value = 0.0
            out[col] = out[col].fillna(fill_value)

    n_after_nans = int(out[vcr_cols].isna().sum().sum())
    print(f"Imputación VCR completada. NaNs antes: {n_total_nans:,d} -> después: {n_after_nans:,d}")

    return out


# %% Uso mínimo
# Asume que `df_control_vcr` existe, sólo numérico, con columnas tipo '<clase>_dimXX',
# e incluye 'monto' y 'log_monto' que NO se tocan aquí.
df_control_vcr_imp = impute_vcr_semantic(df_control_vcr)
df_control_vcr_imp.info()  # verificación rápida


Imputación VCR completada. NaNs antes: 246,228 -> después: 0
<class 'pandas.core.frame.DataFrame'>
Index: 25211 entries, 0 to 25214
Columns: 194 entries, monto to has_bus
dtypes: float64(161), int64(33)
memory usage: 37.5 MB


### Primer modelo XGBoost 
 - Dataset completo
 - Se imputan los datos faltantes en las columnas de VCR expandidas. Ver lógica en documentación.

In [22]:
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost.callback import EarlyStopping

In [40]:
# %% Configuración
RANDOM_STATE = 42
TEST_SIZE = 0.2
VAL_SIZE = 0.2 # fracción del train para validación temprana

PARAMS = dict(
n_estimators=5000,
learning_rate=0.03,
max_depth=6,
min_child_weight=5,
subsample=0.8,
colsample_bytree=0.8,
reg_alpha=0.0,
reg_lambda=1.0,
gamma=0.0,
objective="reg:squarederror",
random_state=RANDOM_STATE,
)

In [41]:
# Split
DF = df_control_vcr_imp
X = DF.drop(columns=["monto", "log_monto"]).copy()
y = DF["log_monto"].values

X_train_full, X_test, y_train_full, y_test = train_test_split(
X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
X_train, X_val, y_train, y_val = train_test_split(
X_train_full, y_train_full, test_size=VAL_SIZE, random_state=RANDOM_STATE
)
print(f"Shapes -> train: {X_train.shape}, val: {X_val.shape}, test: {X_test.shape}")

Shapes -> train: (16134, 192), val: (4034, 192), test: (5043, 192)


In [42]:
# Model
GPU_PARAMS = {"device": "cuda"}
model = XGBRegressor(eval_metric="rmse", **PARAMS, **GPU_PARAMS)

In [44]:
# Construcción del modelo 

model = XGBRegressor(
    **PARAMS,
    eval_metric="rmse",
    device="cuda",
    early_stopping_rounds=100,
)

start = time.perf_counter()
model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=False,
)
elapsed = time.perf_counter() - start

best_iter = getattr(model, "best_iteration", None)
print(f"Tiempo entrenamiento: {elapsed:.2f} s | Best iter: {best_iter}")


Tiempo entrenamiento: 10.54 s | Best iter: 775


In [45]:
# Evaluación en test
y_pred_log = model.predict(X_test)
y_pred_price = np.exp(y_pred_log)
y_true_price = np.asarray(DF.loc[X_test.index, "monto"], dtype=float)


r2_log = r2_score(y_test, y_pred_log)
rmse = mean_squared_error(y_true_price, y_pred_price, squared=False)
mae = mean_absolute_error(y_true_price, y_pred_price)
mape = np.mean(np.abs((y_true_price - y_pred_price) / np.clip(y_true_price, 1e-9, None))) * 100


print(f"R^2 (log): {r2_log:.4f}")
print(f"RMSE ($): {rmse:,.2f}")
print(f"MAE ($): {mae:,.2f}")
print(f"MAPE (%): {mape:.2f}")


# %% Importancias (gain)
booster = model.get_booster()
score = booster.get_score(importance_type="gain")
if score:
    imp = (
        pd.Series(score)
        .rename("gain")
        .sort_values(ascending=False)
        .to_frame()
        )
print("\nTop 20 features por gain:")
print(imp.head(20).to_string())

R^2 (log): 0.9633
RMSE ($): 1,533.91
MAE ($): 722.69
MAPE (%): 11.86

Top 20 features por gain:
                              gain
superficie_t             44.170135
longitud                 35.259632
banos                    27.243916
latitud                   8.570590
estacionamiento           6.125370
sport_and_leisure_dim00   3.303275
education_sup_dim10       2.335922
education_sup_dim00       2.252254
sport_and_leisure_dim03   1.741680
dormitorios               1.641414
terraza                   1.635981
security_dim00            1.296028
education_sup_dim08       1.175765
veterinary_dim00          1.062885
bodegas                   1.039361
education_prim_dim00      0.948555
education_prim_dim09      0.897919
food_and_drink_dim00      0.833105
sport_and_leisure_dim10   0.827551
education_sup_dim02       0.775305




In [46]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

def eval_split(model, X, y, label):
    y_pred_log = model.predict(X)
    r2 = r2_score(y, y_pred_log)
    # Si tus métricas en $ se calculan exponenciando:
    y_true_price = np.exp(y) if y.ndim else np.exp(y)  # si y está en log
    y_pred_price = np.exp(y_pred_log)
    rmse = mean_squared_error(y_true_price, y_pred_price, squared=False)
    mae  = mean_absolute_error(y_true_price, y_pred_price)
    mape = (np.abs((y_true_price - y_pred_price) / np.clip(y_true_price, 1e-9, None))).mean() * 100
    print(f"{label:>5} | R2(log)={r2:.4f} | RMSE=${rmse:,.2f} | MAE=${mae:,.2f} | MAPE={mape:.2f}%")

best_iter = getattr(model, "best_iteration", None)
print("best_iteration:", best_iter)

eval_split(model, X_train, y_train, "train")
eval_split(model, X_val,   y_val,   "val")
eval_split(model, X_test,  y_test,  "test")


best_iteration: 775




train | R2(log)=0.9848 | RMSE=$910.41 | MAE=$459.46 | MAPE=7.61%
  val | R2(log)=0.9613 | RMSE=$1,498.16 | MAE=$731.39 | MAPE=11.95%
 test | R2(log)=0.9633 | RMSE=$1,533.91 | MAE=$722.69 | MAPE=11.86%




Antes del Earlystopping las metricas eran ...

- Probar el XGBoost con el dataset sin nada georreferencial.
- probar validacion cruzada
- hacer modelos de aprendizaje profundo