In [65]:

import math
import time
from dataclasses import dataclass
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from copy import deepcopy

### Configuración común a los modelos:

In [92]:
# Shared configuration for all three models in this notebook
SEED = 42                # passed to set_seed and used as random_state for both splits
TEST_SIZE = 0.2          # fraction of all rows held out as the test set
VAL_SIZE = 0.2           # fraction of the remaining (non-test) rows used for validation
BATCH_SIZE = 512
EPOCHS = 1000  # upper bound on epochs; early stopping may end training sooner
LR = 2e-3                # initial AdamW learning rate
D_TOKEN = 128            # token / model width
N_HEAD = 8               # attention heads per encoder layer
N_LAYERS = 2             # number of encoder layers
FF_MULT = 2  # dim_feedforward = D_TOKEN * FF_MULT
DROPOUT = 0.1
WEIGHT_DECAY = 1e-4

# Early stopping, VERY AGGRESSIVE variant (disabled; kept for reference)
#EARLY_STOP_PATIENCE = 50     # epochs without improvement before stopping
#MIN_DELTA_R2 = 1e-3          # minimum gain in R^2(log) that resets the patience counter
#LR_PATIENCE = 10             # epochs without improvement before reducing the LR
#LR_FACTOR = 0.5              # LR multiplier applied when there is no improvement
#MIN_LR = 1e-6                # lower bound for the LR

# Early stopping (active settings)
EARLY_STOP_PATIENCE = 150     # epochs without improvement before stopping
MIN_DELTA_R2 = 5e-4          # minimum gain in R^2(log) that resets the patience counter
LR_PATIENCE = 25             # epochs without improvement before reducing the LR
LR_FACTOR = 0.5              # LR multiplier applied when there is no improvement
MIN_LR = 1e-5                # lower bound for the LR

# Reproducibilidad
def set_seed(seed: int) -> None:
    """Seed every random number generator this notebook can reach.

    Seeds the standard-library ``random`` module, NumPy's legacy global RNG
    and PyTorch (CPU and all CUDA devices), and switches cuDNN to
    deterministic, non-benchmarking mode.

    Args:
        seed: integer seed applied to all generators.
    """
    # Local import: the notebook's import cell does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe to call without a GPU; it only takes effect when CUDA is used.
    torch.cuda.manual_seed_all(seed)
    # Trade some speed for run-to-run repeatability of cuDNN kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once, before any split is made or any model is created.
set_seed(SEED)

### Primer modelo FT-Transformer --> Sin VCR ni coordenadas

In [93]:
# Basic dataset setup
# Load the compact VCR dataset and keep only rows whose `monto` is below 56000.
df_vcr_c = pd.read_csv('dataset_vcr_compact.csv')
df_vcr_c = df_vcr_c[df_vcr_c['monto'] < 56000].copy()
# Regression target: natural log of `monto`.
# NOTE(review): np.log needs monto > 0; the recorded describe() output shows a finite minimum.
df_vcr_c['log_monto']=np.log(df_vcr_c['monto'])
df_vcr_c['log_monto'].describe()

count    25211.000000
mean         8.395828
std          0.830310
min          5.950643
25%          7.740664
50%          8.242756
75%          8.984694
max         10.915088
Name: log_monto, dtype: float64

In [94]:
# Model-specific setup: first model, no coordinates.
# Work on a copy so df_vcr_c stays untouched for the later models.
df_base =df_vcr_c.copy()
# Drop every object-dtype column plus the identifier and both coordinates.
obj_cols = df_base.select_dtypes(include=["object"]).columns
cols_to_drop = list(obj_cols) + ["id", "latitud", "longitud"]
df_base = df_base.drop(columns=cols_to_drop)
df_base.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 25211 entries, 0 to 25214
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   monto                 25211 non-null  int64  
 1   superficie_t          25211 non-null  float64
 2   dormitorios           25211 non-null  int64  
 3   dormitorios_faltante  25211 non-null  int64  
 4   banos                 25211 non-null  int64  
 5   banos_faltante        25211 non-null  int64  
 6   antiguedad            25211 non-null  int64  
 7   antiguedad_faltante   25211 non-null  int64  
 8   Or_N                  25211 non-null  int64  
 9   Or_S                  25211 non-null  int64  
 10  Or_E                  25211 non-null  int64  
 11  Or_O                  25211 non-null  int64  
 12  Or_Faltante           25211 non-null  int64  
 13  terraza               25211 non-null  float64
 14  estacionamiento       25211 non-null  int64  
 15  bodegas               25

In [95]:
# Split
# Features: every remaining column except the raw price and its log. Target: log_monto.
X_df = df_base.drop(columns=["monto", "log_monto"]).copy()
y = df_base["log_monto"].values.astype(np.float32)

# Two-stage split with the same seed: TEST_SIZE of all rows go to test, then
# VAL_SIZE of the remainder goes to validation (0.2 * 0.8 = 16% of all rows).
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X_df.values, y, test_size=TEST_SIZE, random_state=SEED
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=VAL_SIZE, random_state=SEED
)

# Fit the scaler on the training rows only, then apply it to val/test.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train).astype(np.float32)
X_val = scaler.transform(X_val).astype(np.float32)
X_test = scaler.transform(X_test).astype(np.float32)

# Number of input columns; used later to size the model's tokenizer.
n_features = X_train.shape[1]
print(f"n_features: {n_features}")

n_features: 21


In [96]:
# Dataset & Loader
class NpDataset(Dataset):
    """Map-style dataset over a feature matrix ``X`` and a target vector ``y``.

    The arrays are wrapped as tensors once at construction instead of on every
    ``__getitem__`` call. ``X`` keeps its dtype; ``y`` is exposed as float32.

    Args:
        X: array with one row per sample.
        y: array with one target per sample; must have the same length as ``X``.

    Raises:
        ValueError: if ``X`` and ``y`` have different lengths.
    """
    def __init__(self, X: np.ndarray, y: np.ndarray):
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of rows, got {len(X)} and {len(y)}"
            )
        # The original arrays stay available under their public names.
        self.X = X
        self.y = y
        # One-time conversion; per-item access then only indexes these tensors.
        self._X_t = torch.as_tensor(X)
        self._y_t = torch.as_tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Returns (features of sample idx, float32 target of sample idx).
        return self._X_t[idx], self._y_t[idx]

# Only the training loader shuffles; val/test keep a fixed order.
# Pinned host memory is requested only when CUDA is present: it exists to speed
# up host-to-GPU copies and has nothing to do on a CPU-only machine.
train_dl = DataLoader(NpDataset(X_train, y_train), batch_size=BATCH_SIZE, shuffle=True, drop_last=False, pin_memory=torch.cuda.is_available())
val_dl = DataLoader(NpDataset(X_val, y_val), batch_size=BATCH_SIZE, shuffle=False, drop_last=False, pin_memory=torch.cuda.is_available())
test_dl = DataLoader(NpDataset(X_test, y_test), batch_size=BATCH_SIZE, shuffle=False, drop_last=False, pin_memory=torch.cuda.is_available())

# Modelo
class NumericTokenizer(nn.Module):
    """Map numeric features to tokens: ``token_i = x_i * W_i + b_i``.

    Each feature has its own weight row and bias row of width ``d_token``.

    Args:
        n_features: number of input features.
        d_token: width of each produced token.
    """
    def __init__(self, n_features: int, d_token: int):
        super().__init__()
        self.weight = nn.Parameter(torch.empty(n_features, d_token))
        self.bias = nn.Parameter(torch.zeros(n_features, d_token))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Tokenize ``x`` of shape (B, F) into (B, F, D).

        Raises:
            ValueError: if the last dimension of ``x`` is not ``n_features``.
        """
        n_features = self.weight.shape[0]
        # Without this check a (B, 1) input would be silently broadcast
        # across every feature instead of failing.
        if x.dim() == 0 or x.shape[-1] != n_features:
            raise ValueError(
                f"expected input with {n_features} features in the last dimension, "
                f"got shape {tuple(x.shape)}"
            )
        # x: (B, F) -> (B, F, 1), broadcast against (F, D) -> (B, F, D)
        return x.unsqueeze(-1) * self.weight + self.bias

class FTTransformer(nn.Module):
    """Transformer regressor over per-feature tokens of a numeric table.

    Features are tokenized by ``NumericTokenizer``, a learnable [CLS] token is
    prepended, the sequence passes through a pre-norm ``nn.TransformerEncoder``
    and the encoded [CLS] position is mapped to one scalar per row by an MLP head.

    Args:
        n_features: number of numeric input columns.
        d_token: token / model width.
        n_layers: number of encoder layers.
        n_head: attention heads per layer.
        ff_mult: feed-forward width as a multiple of ``d_token``.
        dropout: dropout probability for the encoder layers and the head.
    """
    def __init__(self, n_features: int, d_token: int, n_layers: int, n_head: int, ff_mult: int, dropout: float):
        super().__init__()
        self.tokenizer = NumericTokenizer(n_features, d_token)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, d_token))
        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_token,
            nhead=n_head,
            dim_feedforward=d_token * ff_mult,
            dropout=dropout,
            batch_first=True,
            activation="gelu",
            norm_first=True,
        )
        # The nested-tensor fast path is unavailable with norm_first=True, so
        # leaving it enabled only produces a UserWarning on construction.
        self.encoder = nn.TransformerEncoder(
            enc_layer, num_layers=n_layers, enable_nested_tensor=False
        )
        self.head = nn.Sequential(
            nn.LayerNorm(d_token),
            nn.Linear(d_token, d_token),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_token, 1),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Predict one value per row of ``x`` (shape (B, F)); returns shape (B,)."""
        B = x.size(0)
        tokens = self.tokenizer(x)                  # (B, F, D)
        cls = self.cls_token.expand(B, -1, -1)     # (B, 1, D)
        x_tok = torch.cat([cls, tokens], dim=1)    # (B, 1+F, D)
        x_enc = self.encoder(x_tok)                # (B, 1+F, D)
        cls_out = x_enc[:, 0, :]                   # (B, D)
        y = self.head(cls_out).squeeze(-1)         # (B,)
        return y

In [97]:

# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = FTTransformer(
    n_features=n_features,
    d_token=D_TOKEN,
    n_layers=N_LAYERS,
    n_head=N_HEAD,
    ff_mult=FF_MULT,
    dropout=DROPOUT,
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
criterion = nn.MSELoss()  # squared error on the log target

# Multiply the LR by LR_FACTOR after LR_PATIENCE epochs without a better
# validation R^2, never going below MIN_LR.
# `verbose` is deliberately not passed: the argument is deprecated in
# torch.optim.lr_scheduler. The current LR can be read from
# optimizer.param_groups[0]["lr"].
# NOTE(review): the scheduler keeps its default improvement threshold, while
# early stopping uses MIN_DELTA_R2, so the two patience counters may disagree
# on what counts as an improvement. Confirm this is intended.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="max",               # validation R^2 is maximised
    factor=LR_FACTOR,
    patience=LR_PATIENCE,
    min_lr=MIN_LR,
)

# Early-stopping state
best_r2 = -float("inf")
best_state = None
best_epoch = 0
no_improve = 0

def evaluate(dl: DataLoader) -> tuple[float, float, float, float]:
    """Score the notebook-global ``model`` on every batch of ``dl``.

    The model is switched to eval mode and left there; gradients are disabled.

    Returns:
        (r2_log, rmse, mae, mape): R^2 on the log target, then RMSE, MAE and
        MAPE (in percent) computed after exponentiating targets and predictions.
    """
    model.eval()
    targets, outputs = [], []
    with torch.no_grad():
        for features, target in dl:
            out = model(features.to(device, non_blocking=True))
            # Targets are only collected, so they never need to visit the device.
            targets.append(target.cpu().numpy())
            outputs.append(out.cpu().numpy())
    log_true = np.concatenate(targets)
    log_pred = np.concatenate(outputs)

    # Metric on the log scale
    r2_log = r2_score(log_true, log_pred)

    # Metrics on the original price scale
    price_true, price_pred = np.exp(log_true), np.exp(log_pred)
    rmse = root_mean_squared_error(price_true, price_pred)
    mae = mean_absolute_error(price_true, price_pred)
    # The clip keeps the denominator away from zero.
    rel_err = np.abs((price_true - price_pred) / np.clip(price_true, 1e-9, None))
    mape = np.mean(rel_err) * 100
    return r2_log, rmse, mae, mape

# Training loop: one pass over train_dl per epoch, then validation-driven
# checkpointing, LR scheduling and early stopping on R^2(log).
start = time.perf_counter()
for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0
    for xb, yb in train_dl:
        xb = xb.to(device, non_blocking=True)
        yb = yb.to(device, non_blocking=True)
        optimizer.zero_grad(set_to_none=True)
        pred = model(xb)
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch value is a per-sample mean even
        # when the last batch is smaller.
        total_loss += loss.item() * xb.size(0)
    train_loss = total_loss / len(train_dl.dataset)

    # A NaN/inf loss means the run diverged: stop and fall through to the
    # best-checkpoint restore instead of scoring non-finite predictions.
    if not math.isfinite(train_loss):
        print(f"Non-finite train loss at epoch {epoch}; stopping training.")
        break

    r2_log_val, rmse_val, mae_val, mape_val = evaluate(val_dl)
    print(
        f"Epoch {epoch:02d}/{EPOCHS} | train MSE: {train_loss:.5f} | "
        f"val R2_log: {r2_log_val:.4f} | val RMSE: {rmse_val:,.2f} | val MAE: {mae_val:,.2f} | val MAPE: {mape_val:.2f}%"
    )
    # Early-stopping bookkeeping (monitor: validation R2_log)
    improved = (r2_log_val - best_r2) > MIN_DELTA_R2
    if improved:
        best_r2 = r2_log_val
        best_state = deepcopy(model.state_dict())   # snapshot of the best weights so far
        best_epoch = epoch
        no_improve = 0
    else:
        no_improve += 1

    # Step the plateau scheduler and report any LR change explicitly.
    lr_before = optimizer.param_groups[0]["lr"]
    scheduler.step(r2_log_val)
    lr_after = optimizer.param_groups[0]["lr"]
    if lr_after != lr_before:
        print(f"  LR reduced: {lr_before:.2e} -> {lr_after:.2e}")

    if no_improve >= EARLY_STOP_PATIENCE:
        print(f"Early stopping at epoch {epoch} (best @ {best_epoch} with R2_log={best_r2:.4f}).")
        break

elapsed = time.perf_counter() - start
print(f"\nEntrenamiento terminado en {elapsed:.2f} s (device={device}).")

# Restore the best weights seen on validation
if best_state is not None:
    model.load_state_dict(best_state)
    print(f"Restored best model from epoch {best_epoch} (R2_log={best_r2:.4f}).")

# Final evaluation on the held-out test set
r2_log_test, rmse_test, mae_test, mape_test = evaluate(test_dl)
print("\n=== FT-Transformer No coords No VCR - Test ===")
print(f"R^2 (log): {r2_log_test:.4f}")
print(f"RMSE ($): {rmse_test:,.2f}")
print(f"MAE  ($): {mae_test:,.2f}")
print(f"MAPE (%): {mape_test:.2f}")

Epoch 01/1000 | train MSE: 8.71393 | val R2_log: -0.0527 | val RMSE: 6,543.30 | val MAE: 4,395.52 | val MAPE: 103.97%
Epoch 02/1000 | train MSE: 0.76933 | val R2_log: 0.5966 | val RMSE: 4,792.93 | val MAE: 2,741.22 | val MAPE: 54.46%
Epoch 03/1000 | train MSE: 0.38030 | val R2_log: 0.7546 | val RMSE: 3,600.20 | val MAE: 2,156.32 | val MAPE: 41.28%
Epoch 04/1000 | train MSE: 0.33068 | val R2_log: 0.8038 | val RMSE: 3,113.88 | val MAE: 1,896.76 | val MAPE: 36.11%
Epoch 05/1000 | train MSE: 0.29379 | val R2_log: 0.8360 | val RMSE: 2,947.72 | val MAE: 1,748.18 | val MAPE: 32.10%
Epoch 06/1000 | train MSE: 0.26657 | val R2_log: 0.8587 | val RMSE: 2,596.58 | val MAE: 1,548.82 | val MAPE: 28.93%
Epoch 07/1000 | train MSE: 0.25177 | val R2_log: 0.8675 | val RMSE: 2,360.86 | val MAE: 1,412.64 | val MAPE: 27.57%
Epoch 08/1000 | train MSE: 0.24070 | val R2_log: 0.8584 | val RMSE: 2,428.89 | val MAE: 1,484.85 | val MAPE: 29.21%
Epoch 09/1000 | train MSE: 0.23408 | val R2_log: 0.8573 | val RMSE: 2,

### Segundo modelo FT-Transformer --> Con coordenadas, sin VCR

In [98]:
# Model-specific setup: second model, coordinates are kept.
# Work on a copy so df_vcr_c stays untouched.
df_coord =df_vcr_c.copy()
# Drop every object-dtype column plus the identifier; latitud/longitud remain.
obj_cols = df_coord.select_dtypes(include=["object"]).columns
cols_to_drop = list(obj_cols)
cols_to_drop.append("id")
df_coord = df_coord.drop(columns=cols_to_drop)
df_coord.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 25211 entries, 0 to 25214
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   monto                 25211 non-null  int64  
 1   superficie_t          25211 non-null  float64
 2   dormitorios           25211 non-null  int64  
 3   dormitorios_faltante  25211 non-null  int64  
 4   banos                 25211 non-null  int64  
 5   banos_faltante        25211 non-null  int64  
 6   antiguedad            25211 non-null  int64  
 7   antiguedad_faltante   25211 non-null  int64  
 8   Or_N                  25211 non-null  int64  
 9   Or_S                  25211 non-null  int64  
 10  Or_E                  25211 non-null  int64  
 11  Or_O                  25211 non-null  int64  
 12  Or_Faltante           25211 non-null  int64  
 13  terraza               25211 non-null  float64
 14  estacionamiento       25211 non-null  int64  
 15  bodegas               25

In [99]:
# Split
# NOTE: X_df, y, the split arrays, scaler and n_features overwrite the
# same-named variables created for the first model.
X_df = df_coord.drop(columns=["monto", "log_monto"]).copy()
y = df_coord["log_monto"].values.astype(np.float32)

# Same two-stage split and seed as the first model: test first, then validation
# out of the remainder.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X_df.values, y, test_size=TEST_SIZE, random_state=SEED
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=VAL_SIZE, random_state=SEED
)

# Fit the scaler on the training rows only, then apply it to val/test.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train).astype(np.float32)
X_val = scaler.transform(X_val).astype(np.float32)
X_test = scaler.transform(X_test).astype(np.float32)

# Number of input columns; used later to size the model's tokenizer.
n_features = X_train.shape[1]
print(f"n_features: {n_features}")

n_features: 23


In [100]:
# Dataset & Loader
class NpDataset(Dataset):
    """Map-style dataset over a feature matrix ``X`` and a target vector ``y``.

    The arrays are wrapped as tensors once at construction instead of on every
    ``__getitem__`` call. ``X`` keeps its dtype; ``y`` is exposed as float32.

    Args:
        X: array with one row per sample.
        y: array with one target per sample; must have the same length as ``X``.

    Raises:
        ValueError: if ``X`` and ``y`` have different lengths.
    """
    def __init__(self, X: np.ndarray, y: np.ndarray):
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of rows, got {len(X)} and {len(y)}"
            )
        # The original arrays stay available under their public names.
        self.X = X
        self.y = y
        # One-time conversion; per-item access then only indexes these tensors.
        self._X_t = torch.as_tensor(X)
        self._y_t = torch.as_tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Returns (features of sample idx, float32 target of sample idx).
        return self._X_t[idx], self._y_t[idx]

# Only the training loader shuffles; val/test keep a fixed order.
# Pinned host memory is requested only when CUDA is present: it exists to speed
# up host-to-GPU copies and has nothing to do on a CPU-only machine.
train_dl = DataLoader(NpDataset(X_train, y_train), batch_size=BATCH_SIZE, shuffle=True, drop_last=False, pin_memory=torch.cuda.is_available())
val_dl = DataLoader(NpDataset(X_val, y_val), batch_size=BATCH_SIZE, shuffle=False, drop_last=False, pin_memory=torch.cuda.is_available())
test_dl = DataLoader(NpDataset(X_test, y_test), batch_size=BATCH_SIZE, shuffle=False, drop_last=False, pin_memory=torch.cuda.is_available())

# Modelo
class NumericTokenizer(nn.Module):
    """Map numeric features to tokens: ``token_i = x_i * W_i + b_i``.

    Each feature has its own weight row and bias row of width ``d_token``.

    Args:
        n_features: number of input features.
        d_token: width of each produced token.
    """
    def __init__(self, n_features: int, d_token: int):
        super().__init__()
        self.weight = nn.Parameter(torch.empty(n_features, d_token))
        self.bias = nn.Parameter(torch.zeros(n_features, d_token))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Tokenize ``x`` of shape (B, F) into (B, F, D).

        Raises:
            ValueError: if the last dimension of ``x`` is not ``n_features``.
        """
        n_features = self.weight.shape[0]
        # Without this check a (B, 1) input would be silently broadcast
        # across every feature instead of failing.
        if x.dim() == 0 or x.shape[-1] != n_features:
            raise ValueError(
                f"expected input with {n_features} features in the last dimension, "
                f"got shape {tuple(x.shape)}"
            )
        # x: (B, F) -> (B, F, 1), broadcast against (F, D) -> (B, F, D)
        return x.unsqueeze(-1) * self.weight + self.bias

class FTTransformer(nn.Module):
    """Transformer regressor over per-feature tokens of a numeric table.

    Features are tokenized by ``NumericTokenizer``, a learnable [CLS] token is
    prepended, the sequence passes through a pre-norm ``nn.TransformerEncoder``
    and the encoded [CLS] position is mapped to one scalar per row by an MLP head.

    Args:
        n_features: number of numeric input columns.
        d_token: token / model width.
        n_layers: number of encoder layers.
        n_head: attention heads per layer.
        ff_mult: feed-forward width as a multiple of ``d_token``.
        dropout: dropout probability for the encoder layers and the head.
    """
    def __init__(self, n_features: int, d_token: int, n_layers: int, n_head: int, ff_mult: int, dropout: float):
        super().__init__()
        self.tokenizer = NumericTokenizer(n_features, d_token)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, d_token))
        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_token,
            nhead=n_head,
            dim_feedforward=d_token * ff_mult,
            dropout=dropout,
            batch_first=True,
            activation="gelu",
            norm_first=True,
        )
        # The nested-tensor fast path is unavailable with norm_first=True, so
        # leaving it enabled only produces a UserWarning on construction.
        self.encoder = nn.TransformerEncoder(
            enc_layer, num_layers=n_layers, enable_nested_tensor=False
        )
        self.head = nn.Sequential(
            nn.LayerNorm(d_token),
            nn.Linear(d_token, d_token),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_token, 1),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Predict one value per row of ``x`` (shape (B, F)); returns shape (B,)."""
        B = x.size(0)
        tokens = self.tokenizer(x)                  # (B, F, D)
        cls = self.cls_token.expand(B, -1, -1)     # (B, 1, D)
        x_tok = torch.cat([cls, tokens], dim=1)    # (B, 1+F, D)
        x_enc = self.encoder(x_tok)                # (B, 1+F, D)
        cls_out = x_enc[:, 0, :]                   # (B, D)
        y = self.head(cls_out).squeeze(-1)         # (B,)
        return y

In [101]:

# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = FTTransformer(
    n_features=n_features,
    d_token=D_TOKEN,
    n_layers=N_LAYERS,
    n_head=N_HEAD,
    ff_mult=FF_MULT,
    dropout=DROPOUT,
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
criterion = nn.MSELoss()  # squared error on the log target

# Multiply the LR by LR_FACTOR after LR_PATIENCE epochs without a better
# validation R^2, never going below MIN_LR.
# `verbose` is deliberately not passed: the argument is deprecated in
# torch.optim.lr_scheduler. The current LR can be read from
# optimizer.param_groups[0]["lr"].
# NOTE(review): the scheduler keeps its default improvement threshold, while
# early stopping uses MIN_DELTA_R2, so the two patience counters may disagree
# on what counts as an improvement. Confirm this is intended.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="max",               # validation R^2 is maximised
    factor=LR_FACTOR,
    patience=LR_PATIENCE,
    min_lr=MIN_LR,
)

# Early-stopping state
best_r2 = -float("inf")
best_state = None
best_epoch = 0
no_improve = 0

def evaluate(dl: DataLoader) -> tuple[float, float, float, float]:
    """Score the notebook-global ``model`` on every batch of ``dl``.

    The model is switched to eval mode and left there; gradients are disabled.

    Returns:
        (r2_log, rmse, mae, mape): R^2 on the log target, then RMSE, MAE and
        MAPE (in percent) computed after exponentiating targets and predictions.
    """
    model.eval()
    targets, outputs = [], []
    with torch.no_grad():
        for features, target in dl:
            out = model(features.to(device, non_blocking=True))
            # Targets are only collected, so they never need to visit the device.
            targets.append(target.cpu().numpy())
            outputs.append(out.cpu().numpy())
    log_true = np.concatenate(targets)
    log_pred = np.concatenate(outputs)

    # Metric on the log scale
    r2_log = r2_score(log_true, log_pred)

    # Metrics on the original price scale
    price_true, price_pred = np.exp(log_true), np.exp(log_pred)
    rmse = root_mean_squared_error(price_true, price_pred)
    mae = mean_absolute_error(price_true, price_pred)
    # The clip keeps the denominator away from zero.
    rel_err = np.abs((price_true - price_pred) / np.clip(price_true, 1e-9, None))
    mape = np.mean(rel_err) * 100
    return r2_log, rmse, mae, mape

# Training loop: one pass over train_dl per epoch, then validation-driven
# checkpointing, LR scheduling and early stopping on R^2(log).
start = time.perf_counter()
for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0
    for xb, yb in train_dl:
        xb = xb.to(device, non_blocking=True)
        yb = yb.to(device, non_blocking=True)
        optimizer.zero_grad(set_to_none=True)
        pred = model(xb)
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch value is a per-sample mean even
        # when the last batch is smaller.
        total_loss += loss.item() * xb.size(0)
    train_loss = total_loss / len(train_dl.dataset)

    # A NaN/inf loss means the run diverged: stop and fall through to the
    # best-checkpoint restore instead of scoring non-finite predictions.
    if not math.isfinite(train_loss):
        print(f"Non-finite train loss at epoch {epoch}; stopping training.")
        break

    r2_log_val, rmse_val, mae_val, mape_val = evaluate(val_dl)
    print(
        f"Epoch {epoch:02d}/{EPOCHS} | train MSE: {train_loss:.5f} | "
        f"val R2_log: {r2_log_val:.4f} | val RMSE: {rmse_val:,.2f} | val MAE: {mae_val:,.2f} | val MAPE: {mape_val:.2f}%"
    )
    # Early-stopping bookkeeping (monitor: validation R2_log)
    improved = (r2_log_val - best_r2) > MIN_DELTA_R2
    if improved:
        best_r2 = r2_log_val
        best_state = deepcopy(model.state_dict())   # snapshot of the best weights so far
        best_epoch = epoch
        no_improve = 0
    else:
        no_improve += 1

    # Step the plateau scheduler and report any LR change explicitly.
    lr_before = optimizer.param_groups[0]["lr"]
    scheduler.step(r2_log_val)
    lr_after = optimizer.param_groups[0]["lr"]
    if lr_after != lr_before:
        print(f"  LR reduced: {lr_before:.2e} -> {lr_after:.2e}")

    if no_improve >= EARLY_STOP_PATIENCE:
        print(f"Early stopping at epoch {epoch} (best @ {best_epoch} with R2_log={best_r2:.4f}).")
        break

elapsed = time.perf_counter() - start
print(f"\nEntrenamiento terminado en {elapsed:.2f} s (device={device}).")

# Restore the best weights seen on validation
if best_state is not None:
    model.load_state_dict(best_state)
    print(f"Restored best model from epoch {best_epoch} (R2_log={best_r2:.4f}).")

# Final evaluation on the held-out test set
r2_log_test, rmse_test, mae_test, mape_test = evaluate(test_dl)
print("\n=== FT-Transformer con coords No VCR - Test ===")
print(f"R^2 (log): {r2_log_test:.4f}")
print(f"RMSE ($): {rmse_test:,.2f}")
print(f"MAE  ($): {mae_test:,.2f}")
print(f"MAPE (%): {mape_test:.2f}")

Epoch 01/1000 | train MSE: 8.07576 | val R2_log: 0.1173 | val RMSE: 6,961.54 | val MAE: 3,953.49 | val MAPE: 57.47%
Epoch 02/1000 | train MSE: 0.50069 | val R2_log: 0.7127 | val RMSE: 3,866.47 | val MAE: 2,406.59 | val MAPE: 47.96%
Epoch 03/1000 | train MSE: 0.30102 | val R2_log: 0.8099 | val RMSE: 3,220.38 | val MAE: 1,904.70 | val MAPE: 36.27%
Epoch 04/1000 | train MSE: 0.26536 | val R2_log: 0.8601 | val RMSE: 3,057.80 | val MAE: 1,808.42 | val MAPE: 30.00%
Epoch 05/1000 | train MSE: 0.23735 | val R2_log: 0.8675 | val RMSE: 2,432.61 | val MAE: 1,487.85 | val MAPE: 29.13%
Epoch 06/1000 | train MSE: 0.22839 | val R2_log: 0.8624 | val RMSE: 2,569.84 | val MAE: 1,606.06 | val MAPE: 30.42%
Epoch 07/1000 | train MSE: 0.21992 | val R2_log: 0.9058 | val RMSE: 2,303.92 | val MAE: 1,323.78 | val MAPE: 22.94%
Epoch 08/1000 | train MSE: 0.20885 | val R2_log: 0.8948 | val RMSE: 2,446.28 | val MAE: 1,416.91 | val MAPE: 25.06%
Epoch 09/1000 | train MSE: 0.19538 | val R2_log: 0.8858 | val RMSE: 2,49

### Tercer modelo FT-Transformer --> Con coordenadas y VCR

In [102]:
# Load the expanded VCR dataset with the same `monto` filter and log target
# as the compact dataset used by the first two models.
df_vcr_e = pd.read_csv('dataset_vcr_expanded.csv')
df_vcr_e = df_vcr_e[df_vcr_e['monto'] < 56000].copy()
# Regression target: natural log of `monto`.
df_vcr_e['log_monto']=np.log(df_vcr_e['monto'])
df_vcr_e['log_monto'].describe()

count    25211.000000
mean         8.395828
std          0.830310
min          5.950643
25%          7.740664
50%          8.242756
75%          8.984694
max         10.915088
Name: log_monto, dtype: float64

In [103]:
# Model-specific setup: third model, coordinates and VCR columns are kept.
# Work on a copy so df_vcr_e stays untouched.
df_vcr =df_vcr_e.copy()
# Drop every object-dtype column plus the identifier.
obj_cols = df_vcr.select_dtypes(include=["object"]).columns
cols_to_drop = list(obj_cols)
cols_to_drop.append("id")
df_vcr = df_vcr.drop(columns=cols_to_drop)
df_vcr.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 25211 entries, 0 to 25214
Data columns (total 181 columns):
 #    Column                        Dtype  
---   ------                        -----  
 0    monto                         int64  
 1    superficie_t                  float64
 2    dormitorios                   int64  
 3    dormitorios_faltante          int64  
 4    banos                         int64  
 5    banos_faltante                int64  
 6    antiguedad                    int64  
 7    antiguedad_faltante           int64  
 8    Or_N                          int64  
 9    Or_S                          int64  
 10   Or_E                          int64  
 11   Or_O                          int64  
 12   Or_Faltante                   int64  
 13   terraza                       float64
 14   estacionamiento               int64  
 15   bodegas                       int64  
 16   flag_Departamento             int64  
 17   flag_Multinivel               int64  
 18   flag_Semi

In [104]:
# Imputación datos faltantes en VCR
import re
from typing import Dict, Tuple, Optional

# Configuration
# Dimensions (1..12): index -> name of the per-class statistic.
# Not referenced by the functions visible in this cell; it serves as a legend.
DIMS_MAP = {
    1: "count_pois",
    2: "mean_distance",
    3: "min_distance",
    4: "max_distance",
    5: "median_distance",
    6: "std_distance",
    7: "mean_inverse_distance",
    8: "max_inverse_distance",
    9: "sum_inverse_distance",
    10: "ratio_within_near_radius",
    11: "ratio_within_mid_radius",
    12: "ratio_within_far_radius",
}

# Role of each dimension (decides the semantic fill value for missing entries)
DIM_ROLE = {
    1: "count",                # -> 0
    2: "distance",             # -> R3
    3: "distance",             # -> R3
    4: "distance",             # -> R3
    5: "distance",             # -> R3
    6: "std",                  # -> 0
    7: "inverse",              # -> 0
    8: "inverse",              # -> 0
    9: "inverse",              # -> 0
    10: "ratio",               # -> 0
    11: "ratio",               # -> 0
    12: "ratio",               # -> 0
}

# R3 per class type: the value used to fill missing distances
R3_DEFAULT = 2400.0  # general classes
R3_METRO = 1600.0
R3_BUS = 800.0

# Funciones
def _class_and_dim(col: str) -> Optional[Tuple[str, int]]:
    """Extrae (clase, índice de dimensión) de columnas tipo '<clase>_dimXX'."""
    m = re.match(r"^(?P<klass>.+)_dim(?P<idx>\d{1,2})$", col)
    if not m:
        return None
    return m.group("klass"), int(m.group("idx"))


def _r3_for_class(klass: str) -> float:
    """Return the R3 fill radius for a class name (case-insensitive).

    Names containing "metro" get R3_METRO, names containing "bus" get R3_BUS,
    anything else gets R3_DEFAULT. "metro" is checked first.
    """
    lowered = klass.lower()
    for needle, radius in (("metro", R3_METRO), ("bus", R3_BUS)):
        # Substring test, so e.g. any name that merely contains "bus" matches.
        if needle in lowered:
            return radius
    return R3_DEFAULT


def impute_vcr_semantic(df: pd.DataFrame) -> pd.DataFrame:
    """Impute missing VCR values according to what absence means per dimension.

    Columns named ``<class>_dimXX`` are grouped by class. Missing distances are
    filled with the class radius from ``_r3_for_class``; every other dimension
    (count, std, inverse distances, ratios, and any index not in ``DIM_ROLE``)
    is filled with 0.0. For each class an int64 flag ``has_<class>`` is added:
    1 when the row had at least one non-missing value for that class.

    The input frame is not modified. An existing ``has_<class>`` column is kept
    as is, so calling the function again on its own output does not turn every
    flag into 1.

    Args:
        df: frame that may contain ``<class>_dimXX`` columns.

    Returns:
        A copy of ``df`` with VCR columns imputed and presence flags added.
        If no VCR column is found, the plain copy is returned.
    """
    out = df.copy()

    # Group VCR columns by class: {class: {dim index: column name}}
    groups: Dict[str, Dict[int, str]] = {}
    vcr_cols = []
    for c in out.columns:
        # Non-string labels cannot be VCR columns and would make re.match raise.
        parsed = _class_and_dim(c) if isinstance(c, str) else None
        if parsed is None:
            continue
        klass, idx = parsed
        groups.setdefault(klass, {})[idx] = c
        vcr_cols.append(c)

    if not groups:
        # Nothing to impute
        return out

    # Presence flags must be computed before imputing, while NaNs still mark
    # absence; this separates "no POI of this class" from "POI far away".
    for klass, dim_map in groups.items():
        flag = f"has_{klass}"
        if flag in out.columns:
            # Already flagged by an earlier pass over the un-imputed data.
            continue
        out[flag] = out[list(dim_map.values())].notna().any(axis=1).astype("int64")

    # Imputation per class / dimension
    n_total_nans = int(out[vcr_cols].isna().sum().sum())
    for klass, dim_map in groups.items():
        r3 = _r3_for_class(klass)
        for idx, col in dim_map.items():
            # Distances get the class radius; all other roles, including
            # unknown dimension indices, conservatively get 0.0.
            fill_value = r3 if DIM_ROLE.get(idx) == "distance" else 0.0
            out[col] = out[col].fillna(fill_value)

    n_after_nans = int(out[vcr_cols].isna().sum().sum())
    print(f"Imputación VCR completada. NaNs antes: {n_total_nans:,d} -> después: {n_after_nans:,d}")

    return out



# Impute the VCR frame; df_vcr itself is left unchanged because the function
# works on a copy.
df_vcr_imp = impute_vcr_semantic(df_vcr)
df_vcr_imp.info()  


Imputación VCR completada. NaNs antes: 246,228 -> después: 0
<class 'pandas.core.frame.DataFrame'>
Index: 25211 entries, 0 to 25214
Columns: 194 entries, monto to has_bus
dtypes: float64(161), int64(33)
memory usage: 37.5 MB


In [105]:
# Split
# NOTE: X_df, y, the split arrays, scaler and n_features overwrite the
# same-named variables created for the previous models.
X_df = df_vcr_imp.drop(columns=["monto", "log_monto"]).copy()
y = df_vcr_imp["log_monto"].values.astype(np.float32)

# Same two-stage split and seed as the other models: test first, then
# validation out of the remainder.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X_df.values, y, test_size=TEST_SIZE, random_state=SEED
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=VAL_SIZE, random_state=SEED
)

# Fit the scaler on the training rows only, then apply it to val/test.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train).astype(np.float32)
X_val = scaler.transform(X_val).astype(np.float32)
X_test = scaler.transform(X_test).astype(np.float32)

# Number of input columns; used later to size the model's tokenizer.
n_features = X_train.shape[1]
print(f"n_features: {n_features}")

n_features: 192


In [106]:
# Dataset & Loader
class NpDataset(Dataset):
    """Map-style dataset over a feature matrix ``X`` and a target vector ``y``.

    The arrays are wrapped as tensors once at construction instead of on every
    ``__getitem__`` call. ``X`` keeps its dtype; ``y`` is exposed as float32.

    Args:
        X: array with one row per sample.
        y: array with one target per sample; must have the same length as ``X``.

    Raises:
        ValueError: if ``X`` and ``y`` have different lengths.
    """
    def __init__(self, X: np.ndarray, y: np.ndarray):
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of rows, got {len(X)} and {len(y)}"
            )
        # The original arrays stay available under their public names.
        self.X = X
        self.y = y
        # One-time conversion; per-item access then only indexes these tensors.
        self._X_t = torch.as_tensor(X)
        self._y_t = torch.as_tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Returns (features of sample idx, float32 target of sample idx).
        return self._X_t[idx], self._y_t[idx]

# Only the training loader shuffles; val/test keep a fixed order.
# Pinned host memory is requested only when CUDA is present: it exists to speed
# up host-to-GPU copies and has nothing to do on a CPU-only machine.
train_dl = DataLoader(NpDataset(X_train, y_train), batch_size=BATCH_SIZE, shuffle=True, drop_last=False, pin_memory=torch.cuda.is_available())
val_dl = DataLoader(NpDataset(X_val, y_val), batch_size=BATCH_SIZE, shuffle=False, drop_last=False, pin_memory=torch.cuda.is_available())
test_dl = DataLoader(NpDataset(X_test, y_test), batch_size=BATCH_SIZE, shuffle=False, drop_last=False, pin_memory=torch.cuda.is_available())

# Modelo
class NumericTokenizer(nn.Module):
    """Map numeric features to tokens: ``token_i = x_i * W_i + b_i``.

    Each feature has its own weight row and bias row of width ``d_token``.

    Args:
        n_features: number of input features.
        d_token: width of each produced token.
    """
    def __init__(self, n_features: int, d_token: int):
        super().__init__()
        self.weight = nn.Parameter(torch.empty(n_features, d_token))
        self.bias = nn.Parameter(torch.zeros(n_features, d_token))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Tokenize ``x`` of shape (B, F) into (B, F, D).

        Raises:
            ValueError: if the last dimension of ``x`` is not ``n_features``.
        """
        n_features = self.weight.shape[0]
        # Without this check a (B, 1) input would be silently broadcast
        # across every feature instead of failing.
        if x.dim() == 0 or x.shape[-1] != n_features:
            raise ValueError(
                f"expected input with {n_features} features in the last dimension, "
                f"got shape {tuple(x.shape)}"
            )
        # x: (B, F) -> (B, F, 1), broadcast against (F, D) -> (B, F, D)
        return x.unsqueeze(-1) * self.weight + self.bias

class FTTransformer(nn.Module):
    """Transformer regressor over per-feature tokens of a numeric table.

    Features are tokenized by ``NumericTokenizer``, a learnable [CLS] token is
    prepended, the sequence passes through a pre-norm ``nn.TransformerEncoder``
    and the encoded [CLS] position is mapped to one scalar per row by an MLP head.

    Args:
        n_features: number of numeric input columns.
        d_token: token / model width.
        n_layers: number of encoder layers.
        n_head: attention heads per layer.
        ff_mult: feed-forward width as a multiple of ``d_token``.
        dropout: dropout probability for the encoder layers and the head.
    """
    def __init__(self, n_features: int, d_token: int, n_layers: int, n_head: int, ff_mult: int, dropout: float):
        super().__init__()
        self.tokenizer = NumericTokenizer(n_features, d_token)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, d_token))
        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_token,
            nhead=n_head,
            dim_feedforward=d_token * ff_mult,
            dropout=dropout,
            batch_first=True,
            activation="gelu",
            norm_first=True,
        )
        # The nested-tensor fast path is unavailable with norm_first=True, so
        # leaving it enabled only produces a UserWarning on construction.
        self.encoder = nn.TransformerEncoder(
            enc_layer, num_layers=n_layers, enable_nested_tensor=False
        )
        self.head = nn.Sequential(
            nn.LayerNorm(d_token),
            nn.Linear(d_token, d_token),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_token, 1),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Predict one value per row of ``x`` (shape (B, F)); returns shape (B,)."""
        B = x.size(0)
        tokens = self.tokenizer(x)                  # (B, F, D)
        cls = self.cls_token.expand(B, -1, -1)     # (B, 1, D)
        x_tok = torch.cat([cls, tokens], dim=1)    # (B, 1+F, D)
        x_enc = self.encoder(x_tok)                # (B, 1+F, D)
        cls_out = x_enc[:, 0, :]                   # (B, D)
        y = self.head(cls_out).squeeze(-1)         # (B,)
        return y

In [107]:

# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = FTTransformer(
    n_features=n_features,
    d_token=D_TOKEN,
    n_layers=N_LAYERS,
    n_head=N_HEAD,
    ff_mult=FF_MULT,
    dropout=DROPOUT,
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
criterion = nn.MSELoss()  # squared error on the log target

# Multiply the LR by LR_FACTOR after LR_PATIENCE epochs without a better
# validation R^2, never going below MIN_LR.
# `verbose` is deliberately not passed: the argument is deprecated in
# torch.optim.lr_scheduler. The current LR can be read from
# optimizer.param_groups[0]["lr"].
# NOTE(review): the scheduler keeps its default improvement threshold, while
# early stopping uses MIN_DELTA_R2, so the two patience counters may disagree
# on what counts as an improvement. Confirm this is intended.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="max",               # validation R^2 is maximised
    factor=LR_FACTOR,
    patience=LR_PATIENCE,
    min_lr=MIN_LR,
)

# Early-stopping state
best_r2 = -float("inf")
best_state = None
best_epoch = 0
no_improve = 0

def evaluate(dl: DataLoader) -> tuple[float, float, float, float]:
    """Score the notebook-global ``model`` on every batch of ``dl``.

    The model is switched to eval mode and left there; gradients are disabled.

    Returns:
        (r2_log, rmse, mae, mape): R^2 on the log target, then RMSE, MAE and
        MAPE (in percent) computed after exponentiating targets and predictions.
    """
    model.eval()
    targets, outputs = [], []
    with torch.no_grad():
        for features, target in dl:
            out = model(features.to(device, non_blocking=True))
            # Targets are only collected, so they never need to visit the device.
            targets.append(target.cpu().numpy())
            outputs.append(out.cpu().numpy())
    log_true = np.concatenate(targets)
    log_pred = np.concatenate(outputs)

    # Metric on the log scale
    r2_log = r2_score(log_true, log_pred)

    # Metrics on the original price scale
    price_true, price_pred = np.exp(log_true), np.exp(log_pred)
    rmse = root_mean_squared_error(price_true, price_pred)
    mae = mean_absolute_error(price_true, price_pred)
    # The clip keeps the denominator away from zero.
    rel_err = np.abs((price_true - price_pred) / np.clip(price_true, 1e-9, None))
    mape = np.mean(rel_err) * 100
    return r2_log, rmse, mae, mape

# Training loop: one pass over train_dl per epoch, then validation-driven
# checkpointing, LR scheduling and early stopping on R^2(log).
start = time.perf_counter()
for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0
    for xb, yb in train_dl:
        xb = xb.to(device, non_blocking=True)
        yb = yb.to(device, non_blocking=True)
        optimizer.zero_grad(set_to_none=True)
        pred = model(xb)
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch value is a per-sample mean even
        # when the last batch is smaller.
        total_loss += loss.item() * xb.size(0)
    train_loss = total_loss / len(train_dl.dataset)

    # A NaN/inf loss means the run diverged: stop and fall through to the
    # best-checkpoint restore instead of scoring non-finite predictions.
    if not math.isfinite(train_loss):
        print(f"Non-finite train loss at epoch {epoch}; stopping training.")
        break

    r2_log_val, rmse_val, mae_val, mape_val = evaluate(val_dl)
    print(
        f"Epoch {epoch:02d}/{EPOCHS} | train MSE: {train_loss:.5f} | "
        f"val R2_log: {r2_log_val:.4f} | val RMSE: {rmse_val:,.2f} | val MAE: {mae_val:,.2f} | val MAPE: {mape_val:.2f}%"
    )
    # Early-stopping bookkeeping (monitor: validation R2_log)
    improved = (r2_log_val - best_r2) > MIN_DELTA_R2
    if improved:
        best_r2 = r2_log_val
        best_state = deepcopy(model.state_dict())   # snapshot of the best weights so far
        best_epoch = epoch
        no_improve = 0
    else:
        no_improve += 1

    # Step the plateau scheduler and report any LR change explicitly.
    lr_before = optimizer.param_groups[0]["lr"]
    scheduler.step(r2_log_val)
    lr_after = optimizer.param_groups[0]["lr"]
    if lr_after != lr_before:
        print(f"  LR reduced: {lr_before:.2e} -> {lr_after:.2e}")

    if no_improve >= EARLY_STOP_PATIENCE:
        print(f"Early stopping at epoch {epoch} (best @ {best_epoch} with R2_log={best_r2:.4f}).")
        break

elapsed = time.perf_counter() - start
print(f"\nEntrenamiento terminado en {elapsed:.2f} s (device={device}).")

# Restore the best weights seen on validation
if best_state is not None:
    model.load_state_dict(best_state)
    print(f"Restored best model from epoch {best_epoch} (R2_log={best_r2:.4f}).")

# Final evaluation on the held-out test set
r2_log_test, rmse_test, mae_test, mape_test = evaluate(test_dl)
print("\n=== FT-Transformer completo - Test ===")
print(f"R^2 (log): {r2_log_test:.4f}")
print(f"RMSE ($): {rmse_test:,.2f}")
print(f"MAE  ($): {mae_test:,.2f}")
print(f"MAPE (%): {mape_test:.2f}")

Epoch 01/1000 | train MSE: 8.19166 | val R2_log: 0.0356 | val RMSE: 6,841.29 | val MAE: 4,176.12 | val MAPE: 77.25%
Epoch 02/1000 | train MSE: 0.75642 | val R2_log: 0.4980 | val RMSE: 4,982.91 | val MAE: 3,042.15 | val MAPE: 62.75%
Epoch 03/1000 | train MSE: 0.46935 | val R2_log: 0.6709 | val RMSE: 4,121.70 | val MAE: 2,426.80 | val MAPE: 47.03%
Epoch 04/1000 | train MSE: 0.37539 | val R2_log: 0.7673 | val RMSE: 3,496.25 | val MAE: 2,055.10 | val MAPE: 38.14%
Epoch 05/1000 | train MSE: 0.31988 | val R2_log: 0.8133 | val RMSE: 3,481.24 | val MAE: 2,023.16 | val MAPE: 33.87%
Epoch 06/1000 | train MSE: 0.27397 | val R2_log: 0.7997 | val RMSE: 3,202.98 | val MAE: 1,986.70 | val MAPE: 38.04%
Epoch 07/1000 | train MSE: 0.25222 | val R2_log: 0.8394 | val RMSE: 2,795.18 | val MAE: 1,685.75 | val MAPE: 32.52%
Epoch 08/1000 | train MSE: 0.23553 | val R2_log: 0.8416 | val RMSE: 3,036.90 | val MAE: 1,803.72 | val MAPE: 32.52%
Epoch 09/1000 | train MSE: 0.21835 | val R2_log: 0.8730 | val RMSE: 2,49

##### 1000 epochs -> 155 min

In [6]:
# NOTE(review): this cell's execution count (In [6]) is out of sequence with the
# cells above, so its recorded output likely comes from a different kernel
# session. It repeats the test evaluation the training cell already prints,
# using whichever `model`, `evaluate` and `test_dl` are currently defined.
# The recorded numbers equal the "VCR completos" row of the 1000-epoch table,
# so the "(simple)" label may be stale. Confirm, or remove this cell.
r2_log_test, rmse_test, mae_test, mape_test = evaluate(test_dl)
print("\n=== FT-Transformer (simple) - Test ===")
print(f"R^2 (log): {r2_log_test:.4f}")
print(f"RMSE ($): {rmse_test:,.2f}")
print(f"MAE  ($): {mae_test:,.2f}")
print(f"MAPE (%): {mape_test:.2f}")


=== FT-Transformer (simple) - Test ===
R^2 (log): 0.9541
RMSE ($): 1,753.83
MAE  ($): 834.79
MAPE (%): 13.37




#### Recuento modelos (100 epochs)
| Modelo                                 | R² (log) | RMSE ($) | MAE ($) | MAPE (%) |
|----------------------------------------|:--------:|---------:|--------:|---------:|
| 1) FT-Transformer (solo estructural)   |  0.8829  |  2,660.34| 1,463.98|    25.63 |
| 2) FT-Transformer + latitud/longitud   |  0.9213  |  1,901.99| 1,074.23|    20.88 |
| 3) FT-Transformer con VCR completos    |  0.9381  |  1,934.98| 1,083.63|    18.22 |

#### Recuento modelos (1000 epochs)
| Modelo                                 | R² (log) | RMSE ($) | MAE ($) | MAPE (%) |
|----------------------------------------|:--------:|---------:|--------:|---------:|
| 1) FT-Transformer (solo estructural)   |    0.8997|  2,058.75| 1,131.58|     20.72|
| 2) FT-Transformer + latitud/longitud   |    0.9505|  1,698.56|   857.12|     13.74|
| 3) FT-Transformer con VCR completos    |  0.9541  |  1,753.83|   834.79|    13.37 |

#### Recuento modelos (1000 epochs) con EarlyStopping (muy agresivo al parecer)
| Modelo                                    | R² (log) | RMSE ($) | MAE ($) | MAPE (%) |
|-------------------------------------------|:--------:|---------:|--------:|---------:|
| 1) FT-Transformer (solo estructural) epoch 52 |    0.9048|  2,057.48| 1,141.61|     20.74|
| 2) FT-Transformer + latitud/longitud epoch 65 |    0.9464|  1,900.77|   978.96|     15.75|
| 3) FT-Transformer con VCR completos epoch 41  |    0.9481|  1,794.88|   927.01|   14.40|

#### Recuento modelos (1000 epochs) con EarlyStopping (más suave)
| Modelo                                         | R² (log) | RMSE ($) | MAE ($) | MAPE (%) |
|------------------------------------------------|:--------:|---------:|--------:|---------:|
| 1) FT-Transformer (solo estructural) epoch 175 |    0.9086|  2,110.31| 1,129.95|     19.75|
| 2) FT-Transformer + latitud/longitud epoch 167 |    0.9555|  1,686.77|   834.62|     13.56|
| 3) FT-Transformer con VCR completos epoch 124  |    0.9573|  1,676.25|   837.69|     13.16|