In [1]:
import os, json, time, math, random, re
from typing import Dict, Tuple, Optional
from dataclasses import dataclass
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from copy import deepcopy
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

In [2]:
# Hyperparameter configuration
SEED = 42                  # single seed reused for splits, the Optuna sampler and set_seed()
TEST_SIZE = 0.2            # fraction of all rows held out as the test split
VAL_SIZE = 0.2             # fraction of the remaining (non-test) rows held out for validation
BATCH_SIZE = 512           # NOTE(review): not referenced by the visible tuning code - confirm it is still needed

N_TRIALS_FTT = 50          # search budget (number of Optuna trials)
EPOCHS_TUNER = 20          # max epochs per trial (kept short for speed)
ES_PATIENCE_TUNER = 6      # early-stopping patience per trial, in epochs
MIN_DELTA_RMSE = 1e-4      # minimum improvement in validation RMSE_log that counts
SAVE_DIR = "models_FTT"    # directory where the best-params JSON files are written
os.makedirs(SAVE_DIR, exist_ok=True)

def set_seed(seed: int):
    """Seed every random number generator used here and pin cuDNN behaviour.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (default
    generator plus all CUDA devices), then turns cuDNN benchmarking off and
    requests deterministic cuDNN algorithms.

    Args:
        seed: Value passed unchanged to each seeding call.
    """
    # The generators are independent of each other, so seeding order is irrelevant.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [3]:
# ---- Minimal dataset ----
class NpDataset(Dataset):
    """Map-style dataset over a pair of in-memory NumPy arrays.

    ``X`` is indexed per sample and ``y`` holds the matching targets; item
    ``idx`` is returned as a ``(features, target)`` pair of tensors.
    """

    def __init__(self, X: np.ndarray, y: np.ndarray):
        self.X = X
        self.y = y

    def __len__(self):
        # Number of samples is the length of the feature array.
        return len(self.X)

    def __getitem__(self, idx):
        # from_numpy keeps the dtype of X; the target is always built as float32.
        features = torch.from_numpy(self.X[idx])
        target = torch.tensor(self.y[idx], dtype=torch.float32)
        return features, target

# ---- Model ----
class NumericTokenizer(nn.Module):
    """Turn each scalar feature into a ``d_token``-dimensional token.

    Feature ``j`` owns one weight vector and one bias vector; its token is
    ``x[..., j] * weight[j] + bias[j]``. Weights start Xavier-uniform, biases
    start at zero.
    """

    def __init__(self, n_features: int, d_token: int):
        super().__init__()
        shape = (n_features, d_token)
        # Registered as weight first, then bias, so parameter order is unchanged.
        self.weight = nn.Parameter(nn.init.xavier_uniform_(torch.empty(*shape)))
        self.bias = nn.Parameter(torch.zeros(*shape))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # A trailing axis on x lets it broadcast against (n_features, d_token).
        scaled = x[..., None] * self.weight
        return scaled + self.bias

class FTTransformer(nn.Module):
    """Transformer encoder over per-feature tokens with a learnable CLS token.

    Each input feature is tokenised by ``NumericTokenizer``, a CLS token is
    prepended, the sequence goes through a pre-norm (``norm_first=True``) GELU
    ``nn.TransformerEncoder``, and an MLP head maps the CLS position to one
    scalar per sample.

    Args:
        n_features: Number of input features (tokens before the CLS token).
        d_token: Token / model width.
        n_layers: Number of encoder layers.
        n_head: Attention heads per layer.
        ff_mult: Feed-forward width as a multiple of ``d_token``.
        dropout: Dropout rate used in the encoder layers and in the head.
    """

    def __init__(self, n_features: int, d_token: int, n_layers: int, n_head: int, ff_mult: int, dropout: float):
        super().__init__()
        # Sub-modules are created in the same order as before so that seeded
        # initialisation yields the same weights.
        self.tokenizer = NumericTokenizer(n_features, d_token)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, d_token))

        layer_cfg = dict(
            d_model=d_token,
            nhead=n_head,
            dim_feedforward=d_token * ff_mult,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(**layer_cfg),
            num_layers=n_layers,
        )

        head_layers = [
            nn.LayerNorm(d_token),
            nn.Linear(d_token, d_token),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_token, 1),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        batch = x.shape[0]
        feature_tokens = self.tokenizer(x)
        # One CLS token per sample, placed at sequence position 0.
        cls_tokens = self.cls_token.expand(batch, -1, -1)
        sequence = torch.cat((cls_tokens, feature_tokens), dim=1)
        encoded = self.encoder(sequence)
        # Only the CLS position feeds the head; drop the trailing size-1 axis.
        return self.head(encoded[:, 0]).squeeze(-1)



In [4]:
# ---- Optuna objective (minimizes validation RMSE_log) ----

def objective_ftt(trial: optuna.Trial) -> float:
    """Train one FTTransformer configuration and return its best validation RMSE.

    Reads the module-level arrays ``X_train``, ``y_train``, ``X_val`` and
    ``y_val`` (whatever the most recently executed split cell produced) and the
    tuning constants ``SEED``, ``EPOCHS_TUNER``, ``ES_PATIENCE_TUNER`` and
    ``MIN_DELTA_RMSE``.

    Args:
        trial: Optuna trial used to sample hyperparameters and to report the
            per-epoch validation RMSE so the study's pruner can act on it.

    Returns:
        The lowest validation RMSE recorded, where an epoch only replaces the
        current best if it improves on it by more than ``MIN_DELTA_RMSE``.

    Raises:
        AssertionError: if CUDA is not available.
        optuna.TrialPruned: if ``trial.should_prune()`` returns True.
    """
    assert torch.cuda.is_available(), "Se requiere GPU (CUDA)."
    device = torch.device("cuda")
    # Re-seed at the start of every trial so each one starts from the same RNG state.
    set_seed(SEED)

    n_features = X_train.shape[1]

    # --- Architecture search space ---
    d_token = trial.suggest_categorical("d_token", [64, 96, 128, 192, 256])
    # n_head must divide d_token
    # NOTE(review): every d_token candidate above is divisible by both 4 and 8,
    # so valid_heads is always [4, 8]. Optuna is understood to require a fixed
    # choice list per parameter name - confirm before adding candidates that
    # would make this list vary between trials.
    valid_heads = [h for h in [4, 8] if d_token % h == 0]
    n_head = trial.suggest_categorical("n_head", valid_heads)
    n_layers = trial.suggest_int("n_layers", 1, 4)
    ff_mult  = trial.suggest_categorical("ff_mult", [2, 4])
    dropout  = trial.suggest_float("dropout", 0.0, 0.3, step=0.1)

    # --- Optimisation search space ---
    lr           = trial.suggest_float("lr", 1e-4, 3e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)
    batch_size   = trial.suggest_categorical("batch_size", [256, 512])  # 1024 removed due to hardware limitations

    train_dl = DataLoader(NpDataset(X_train, y_train), batch_size=batch_size, shuffle=True, pin_memory=True)
    val_dl   = DataLoader(NpDataset(X_val,   y_val),   batch_size=batch_size, shuffle=False, pin_memory=True)

    model = FTTransformer(n_features, d_token, n_layers, n_head, ff_mult, dropout).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    loss_fn = nn.MSELoss()
    # Gradient scaler for the mixed-precision (autocast) training steps below.
    scaler = torch.cuda.amp.GradScaler()

    best_rmse = float("inf")
    no_improve = 0  # consecutive epochs without a sufficient improvement

    for epoch in range(1, EPOCHS_TUNER + 1):
        model.train()
        for xb, yb in train_dl:
            xb, yb = xb.to(device, non_blocking=True), yb.to(device, non_blocking=True)
            opt.zero_grad(set_to_none=True)
            with torch.cuda.amp.autocast():
                pred = model(xb)
                loss = loss_fn(pred, yb)
            scaler.scale(loss).backward()
            scaler.step(opt)
            scaler.update()

        # Validation (RMSE_log). The forward pass here runs outside autocast,
        # and the targets stay on the CPU because only predictions are moved back.
        model.eval()
        y_true, y_pred = [], []
        with torch.no_grad():
            for xb, yb in val_dl:
                xb = xb.to(device, non_blocking=True)
                pred = model(xb)
                y_true.append(yb.numpy())
                y_pred.append(pred.detach().cpu().numpy())
        y_true = np.concatenate(y_true)
        y_pred = np.concatenate(y_pred)
        rmse_log = float(np.sqrt(np.mean((y_true - y_pred) ** 2)))

        # Report first so the pruner sees this epoch, then update the local
        # early-stopping bookkeeping.
        trial.report(rmse_log, epoch)
        if rmse_log + MIN_DELTA_RMSE < best_rmse:
            best_rmse = rmse_log
            no_improve = 0
        else:
            no_improve += 1

        if trial.should_prune():
            raise optuna.TrialPruned()
        if no_improve >= ES_PATIENCE_TUNER:
            break

    return best_rmse




### Hiperparámetros para modelo FTT - Base

In [5]:
VERSION_TAG = "v1"         # suffix of the exported best-params JSON file; change to v2/v3 depending on the dataset

In [6]:
# Basic dataset setup: read the compact VCR table, keep the rows whose
# 'monto' is below 56000 and add the natural log of 'monto' as 'log_monto'.
df_vcr_c = (
    pd.read_csv('dataset_vcr_compact.csv')
    .loc[lambda frame: frame['monto'] < 56000]
    .copy()
)
df_vcr_c = df_vcr_c.assign(log_monto=np.log(df_vcr_c['monto']))
df_vcr_c['log_monto'].describe()

count    25211.000000
mean         8.395828
std          0.830310
min          5.950643
25%          7.740664
50%          8.242756
75%          8.984694
max         10.915088
Name: log_monto, dtype: float64

In [7]:
# Model-specific setup (base model): work on a copy of the compact frame and
# drop every object-dtype column plus the id and the raw coordinates.
df_base = df_vcr_c.copy()
obj_cols = df_base.select_dtypes(include=["object"]).columns
cols_to_drop = [*obj_cols, "id", "latitud", "longitud"]
df_base = df_base.drop(columns=cols_to_drop)
df_base.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 25211 entries, 0 to 25214
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   monto                 25211 non-null  int64  
 1   superficie_t          25211 non-null  float64
 2   dormitorios           25211 non-null  int64  
 3   dormitorios_faltante  25211 non-null  int64  
 4   banos                 25211 non-null  int64  
 5   banos_faltante        25211 non-null  int64  
 6   antiguedad            25211 non-null  int64  
 7   antiguedad_faltante   25211 non-null  int64  
 8   Or_N                  25211 non-null  int64  
 9   Or_S                  25211 non-null  int64  
 10  Or_E                  25211 non-null  int64  
 11  Or_O                  25211 non-null  int64  
 12  Or_Faltante           25211 non-null  int64  
 13  terraza               25211 non-null  float64
 14  estacionamiento       25211 non-null  int64  
 15  bodegas               25

In [8]:
# Split: TEST_SIZE of all rows becomes the test set, then VAL_SIZE of the
# remaining rows becomes the validation set (same seed for both splits).
X_df = df_base.drop(columns=["monto", "log_monto"]).copy()
y = df_base["log_monto"].to_numpy().astype(np.float32)

X_train_full, X_test, y_train_full, y_test = train_test_split(
    X_df.to_numpy(), y, test_size=TEST_SIZE, random_state=SEED
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=VAL_SIZE, random_state=SEED
)

# Standardise as (x - mean) / std, with statistics fitted on the training split only.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(part).astype(np.float32) for part in (X_train, X_val, X_test)
)

n_features = X_train.shape[1]
print(f"n_features: {n_features}")

n_features: 21


In [9]:
# ---- Run the study and export the best hyperparameters ----
study = optuna.create_study(
    direction="minimize",
    sampler=TPESampler(seed=SEED),
    pruner=MedianPruner(n_startup_trials=5),
)
print("Optuna FTT: iniciando búsqueda...")
study.optimize(objective_ftt, n_trials=N_TRIALS_FTT, show_progress_bar=True)

# Copy the tuned values out of the best trial in a fixed key order.
best = study.best_trial
best_params_FTT = {
    name: best.params[name]
    for name in (
        "d_token",
        "n_head",
        "n_layers",
        "ff_mult",
        "dropout",
        "lr",
        "weight_decay",
        "batch_size",
    )
}
print("\nMejores HP FTT Base (val RMSE_log ↓):\n", json.dumps(best_params_FTT, indent=2))

# Persist them under SAVE_DIR, tagged with the current VERSION_TAG.
hp_path = os.path.join(SAVE_DIR, f"best_params_FTT_{VERSION_TAG}.json")
with open(hp_path, "w") as f:
    json.dump(best_params_FTT, f, indent=2)
print(f"Guardado: {hp_path}")

[I 2025-10-08 20:51:28,517] A new study created in memory with name: no-name-358f83c3-47e3-40f8-85cc-20fb1069cacf


Optuna FTT: iniciando búsqueda...


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-10-08 20:52:05,482] Trial 0 finished with value: 0.26771169900894165 and parameters: {'d_token': 96, 'n_head': 4, 'n_layers': 4, 'ff_mult': 4, 'dropout': 0.0, 'lr': 0.002708160864249967, 'weight_decay': 0.0003142880890840109, 'batch_size': 256}. Best is trial 0 with value: 0.26771169900894165.
[I 2025-10-08 20:52:28,702] Trial 1 finished with value: 0.3959355652332306 and parameters: {'d_token': 128, 'n_head': 4, 'n_layers': 2, 'ff_mult': 4, 'dropout': 0.3, 'lr': 0.00019721610970574026, 'weight_decay': 3.489018845491386e-05, 'batch_size': 256}. Best is trial 0 with value: 0.26771169900894165.
[I 2025-10-08 20:52:52,818] Trial 2 finished with value: 0.27249932289123535 and parameters: {'d_token': 256, 'n_head': 4, 'n_layers': 1, 'ff_mult': 2, 'dropout': 0.0, 'lr': 0.0005388108577817234, 'weight_decay': 1.2681352169084602e-06, 'batch_size': 256}. Best is trial 0 with value: 0.26771169900894165.
[I 2025-10-08 20:53:20,566] Trial 3 finished with value: 0.41972628235816956 and param

### Hiperparámetros para modelo FTT - Coordenadas

In [10]:
VERSION_TAG = "v2"         # suffix of the exported best-params JSON file; set per dataset (v1/v2/v3)

In [11]:
# Model-specific setup (coordinates model): work on a copy of the compact frame
# and drop every object-dtype column plus the id; latitud/longitud are kept.
df_coord = df_vcr_c.copy()
obj_cols = df_coord.select_dtypes(include=["object"]).columns
cols_to_drop = [*obj_cols, "id"]
df_coord = df_coord.drop(columns=cols_to_drop)
df_coord.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 25211 entries, 0 to 25214
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   monto                 25211 non-null  int64  
 1   superficie_t          25211 non-null  float64
 2   dormitorios           25211 non-null  int64  
 3   dormitorios_faltante  25211 non-null  int64  
 4   banos                 25211 non-null  int64  
 5   banos_faltante        25211 non-null  int64  
 6   antiguedad            25211 non-null  int64  
 7   antiguedad_faltante   25211 non-null  int64  
 8   Or_N                  25211 non-null  int64  
 9   Or_S                  25211 non-null  int64  
 10  Or_E                  25211 non-null  int64  
 11  Or_O                  25211 non-null  int64  
 12  Or_Faltante           25211 non-null  int64  
 13  terraza               25211 non-null  float64
 14  estacionamiento       25211 non-null  int64  
 15  bodegas               25

In [12]:
# Split: TEST_SIZE of all rows becomes the test set, then VAL_SIZE of the
# remaining rows becomes the validation set (same seed for both splits).
X_df = df_coord.drop(columns=["monto", "log_monto"]).copy()
y = df_coord["log_monto"].to_numpy().astype(np.float32)

X_train_full, X_test, y_train_full, y_test = train_test_split(
    X_df.to_numpy(), y, test_size=TEST_SIZE, random_state=SEED
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=VAL_SIZE, random_state=SEED
)

# Standardise as (x - mean) / std, with statistics fitted on the training split only.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(part).astype(np.float32) for part in (X_train, X_val, X_test)
)

n_features = X_train.shape[1]
print(f"n_features: {n_features}")

n_features: 23


In [13]:
# ---- Run the study and export the best hyperparameters ----
study = optuna.create_study(
    direction="minimize",
    sampler=TPESampler(seed=SEED),
    pruner=MedianPruner(n_startup_trials=5),
)
print("Optuna FTT: iniciando búsqueda...")
study.optimize(objective_ftt, n_trials=N_TRIALS_FTT, show_progress_bar=True)

# Copy the tuned values out of the best trial in a fixed key order.
best = study.best_trial
best_params_FTT = {
    name: best.params[name]
    for name in (
        "d_token",
        "n_head",
        "n_layers",
        "ff_mult",
        "dropout",
        "lr",
        "weight_decay",
        "batch_size",
    )
}
print("\nMejores HP FTT Coordenadas (val RMSE_log ↓):\n", json.dumps(best_params_FTT, indent=2))

# Persist them under SAVE_DIR, tagged with the current VERSION_TAG.
hp_path = os.path.join(SAVE_DIR, f"best_params_FTT_{VERSION_TAG}.json")
with open(hp_path, "w") as f:
    json.dump(best_params_FTT, f, indent=2)
print(f"Guardado: {hp_path}")

[I 2025-10-08 21:02:31,147] A new study created in memory with name: no-name-8b6a1155-ad00-4f1e-b399-b2d302fb4a3d


Optuna FTT: iniciando búsqueda...


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-10-08 21:03:07,992] Trial 0 finished with value: 0.19666080176830292 and parameters: {'d_token': 96, 'n_head': 4, 'n_layers': 4, 'ff_mult': 4, 'dropout': 0.0, 'lr': 0.002708160864249967, 'weight_decay': 0.0003142880890840109, 'batch_size': 256}. Best is trial 0 with value: 0.19666080176830292.
[I 2025-10-08 21:03:32,549] Trial 1 finished with value: 0.43681782484054565 and parameters: {'d_token': 128, 'n_head': 4, 'n_layers': 2, 'ff_mult': 4, 'dropout': 0.3, 'lr': 0.00019721610970574026, 'weight_decay': 3.489018845491386e-05, 'batch_size': 256}. Best is trial 0 with value: 0.19666080176830292.
[I 2025-10-08 21:03:57,465] Trial 2 finished with value: 0.20989729464054108 and parameters: {'d_token': 256, 'n_head': 4, 'n_layers': 1, 'ff_mult': 2, 'dropout': 0.0, 'lr': 0.0005388108577817234, 'weight_decay': 1.2681352169084602e-06, 'batch_size': 256}. Best is trial 0 with value: 0.19666080176830292.
[I 2025-10-08 21:04:27,109] Trial 3 finished with value: 0.34386202692985535 and para

### Hiperparámetros para modelo FTT - Coordenadas y VCR

In [5]:
VERSION_TAG = "v3"         # suffix of the exported best-params JSON file; set per dataset (v1/v2/v3)

In [6]:
# Read the expanded VCR table, keep the rows whose 'monto' is below 56000
# and add the natural log of 'monto' as 'log_monto'.
df_vcr_e = (
    pd.read_csv('dataset_vcr_expanded.csv')
    .loc[lambda frame: frame['monto'] < 56000]
    .copy()
)
df_vcr_e = df_vcr_e.assign(log_monto=np.log(df_vcr_e['monto']))
df_vcr_e['log_monto'].describe()

count    25211.000000
mean         8.395828
std          0.830310
min          5.950643
25%          7.740664
50%          8.242756
75%          8.984694
max         10.915088
Name: log_monto, dtype: float64

In [7]:
# Model-specific setup (coordinates + VCR model): work on a copy of the
# expanded frame and drop every object-dtype column plus the id.
df_vcr = df_vcr_e.copy()
obj_cols = df_vcr.select_dtypes(include=["object"]).columns
cols_to_drop = [*obj_cols, "id"]
df_vcr = df_vcr.drop(columns=cols_to_drop)
df_vcr.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 25211 entries, 0 to 25214
Data columns (total 181 columns):
 #    Column                        Dtype  
---   ------                        -----  
 0    monto                         int64  
 1    superficie_t                  float64
 2    dormitorios                   int64  
 3    dormitorios_faltante          int64  
 4    banos                         int64  
 5    banos_faltante                int64  
 6    antiguedad                    int64  
 7    antiguedad_faltante           int64  
 8    Or_N                          int64  
 9    Or_S                          int64  
 10   Or_E                          int64  
 11   Or_O                          int64  
 12   Or_Faltante                   int64  
 13   terraza                       float64
 14   estacionamiento               int64  
 15   bodegas                       int64  
 16   flag_Departamento             int64  
 17   flag_Multinivel               int64  
 18   flag_Semi

In [8]:
# Missing-data imputation for the VCR columns
# Dimension index (1..12) -> name of the statistic stored in that dimension
DIMS_MAP = dict(
    enumerate(
        (
            "count_pois",
            "mean_distance",
            "min_distance",
            "max_distance",
            "median_distance",
            "std_distance",
            "mean_inverse_distance",
            "max_inverse_distance",
            "sum_inverse_distance",
            "ratio_within_near_radius",
            "ratio_within_mid_radius",
            "ratio_within_far_radius",
        ),
        start=1,
    )
)

# Role of each dimension, which decides the semantic fill value:
#   "distance"                            -> R3 of the class
#   "count", "std", "inverse", "ratio"    -> 0
DIM_ROLE = {
    1: "count",
    **dict.fromkeys((2, 3, 4, 5), "distance"),
    6: "std",
    **dict.fromkeys((7, 8, 9), "inverse"),
    **dict.fromkeys((10, 11, 12), "ratio"),
}

# R3 per class type (general classes, metro, bus).
# NOTE(review): presumably the far search radius of each class - confirm meaning and units.
R3_DEFAULT, R3_METRO, R3_BUS = 2400.0, 1600.0, 800.0

# Funciones
def _class_and_dim(col: str) -> Optional[Tuple[str, int]]:
    """Extrae (clase, índice de dimensión) de columnas tipo '<clase>_dimXX'."""
    m = re.match(r"^(?P<klass>.+)_dim(?P<idx>\d{1,2})$", col)
    if not m:
        return None
    return m.group("klass"), int(m.group("idx"))


def _r3_for_class(klass: str) -> float:
    k = klass.lower()
    if "metro" in k:
        return R3_METRO
    if "bus" in k:
        return R3_BUS
    return R3_DEFAULT


def impute_vcr_semantic(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing VCR values according to what "missing" means per dimension.

    Columns named '<class>_dimNN' are grouped by class. Dimensions whose role in
    ``DIM_ROLE`` is "distance" are filled with the class's R3 value; every other
    dimension (count, std, inverse, ratio, or an index not in ``DIM_ROLE``) is
    filled with 0.0. Before filling, one ``has_<class>`` int64 flag per class is
    appended: 1 where the row had at least one non-null value for that class.

    Args:
        df: Input frame; it is copied, not modified.

    Returns:
        The imputed copy. If no column matches the VCR naming pattern the copy
        is returned as is, without flags and without printing anything.
    """
    result = df.copy()

    # Index every VCR column by class and by dimension number.
    by_class: Dict[str, Dict[int, str]] = {}
    matched_cols = []
    for name in result.columns:
        hit = _class_and_dim(name)
        if hit is not None:
            klass, dim_idx = hit
            by_class.setdefault(klass, {})[dim_idx] = name
            matched_cols.append(name)

    # Nothing to impute.
    if len(by_class) == 0:
        return result

    # Presence flags must be taken before filling, otherwise "no POI at all"
    # becomes indistinguishable from "POIs far away".
    for klass, dims in by_class.items():
        present = result[list(dims.values())].notna().any(axis=1)
        result[f"has_{klass}"] = present.astype("int64")

    nans_before = int(result[matched_cols].isna().sum().sum())

    # Per class and dimension: distances get the class R3, everything else 0.0
    # (unknown dimension indices also fall back to 0.0).
    for klass, dims in by_class.items():
        class_r3 = _r3_for_class(klass)
        for dim_idx, name in dims.items():
            fill = class_r3 if DIM_ROLE.get(dim_idx) == "distance" else 0.0
            result[name] = result[name].fillna(fill)

    nans_after = int(result[matched_cols].isna().sum().sum())
    print(f"Imputación VCR completada. NaNs antes: {nans_before:,d} -> después: {nans_after:,d}")

    return result



# Impute the VCR columns of df_vcr (the input frame itself is left untouched)
# and show a summary of the resulting frame.
df_vcr_imp = impute_vcr_semantic(df_vcr)
df_vcr_imp.info()  


Imputación VCR completada. NaNs antes: 246,228 -> después: 0
<class 'pandas.core.frame.DataFrame'>
Index: 25211 entries, 0 to 25214
Columns: 194 entries, monto to has_bus
dtypes: float64(161), int64(33)
memory usage: 37.5 MB


In [9]:
# Split: TEST_SIZE of all rows becomes the test set, then VAL_SIZE of the
# remaining rows becomes the validation set (same seed for both splits).
X_df = df_vcr_imp.drop(columns=["monto", "log_monto"]).copy()
y = df_vcr_imp["log_monto"].to_numpy().astype(np.float32)

X_train_full, X_test, y_train_full, y_test = train_test_split(
    X_df.to_numpy(), y, test_size=TEST_SIZE, random_state=SEED
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=VAL_SIZE, random_state=SEED
)

# Standardise with statistics fitted on the training split only.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(part).astype(np.float32) for part in (X_train, X_val, X_test)
)

n_features = X_train.shape[1]
print(f"n_features: {n_features}")

n_features: 192


In [10]:
# ---- Run the study and export the best hyperparameters ----
study = optuna.create_study(
    direction="minimize",
    sampler=TPESampler(seed=SEED),
    pruner=MedianPruner(n_startup_trials=5),
)
print("Optuna FTT: iniciando búsqueda...")
study.optimize(objective_ftt, n_trials=N_TRIALS_FTT, show_progress_bar=True)

# Copy the tuned values out of the best trial in a fixed key order.
best = study.best_trial
best_params_FTT = {
    name: best.params[name]
    for name in (
        "d_token",
        "n_head",
        "n_layers",
        "ff_mult",
        "dropout",
        "lr",
        "weight_decay",
        "batch_size",
    )
}
print("\nMejores HP FTT Completo (val RMSE_log ↓):\n", json.dumps(best_params_FTT, indent=2))

# Persist them under SAVE_DIR, tagged with the current VERSION_TAG.
hp_path = os.path.join(SAVE_DIR, f"best_params_FTT_{VERSION_TAG}.json")
with open(hp_path, "w") as f:
    json.dump(best_params_FTT, f, indent=2)
print(f"Guardado: {hp_path}")

[I 2025-10-09 16:39:34,493] A new study created in memory with name: no-name-b3677ee5-d1a1-4452-a87c-94ffc7ae4a94


Optuna FTT: iniciando búsqueda...


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-10-09 16:43:07,553] Trial 0 finished with value: 0.18795368075370789 and parameters: {'d_token': 96, 'n_head': 4, 'n_layers': 4, 'ff_mult': 4, 'dropout': 0.0, 'lr': 0.002708160864249967, 'weight_decay': 0.0003142880890840109, 'batch_size': 256}. Best is trial 0 with value: 0.18795368075370789.
[I 2025-10-09 16:45:27,241] Trial 1 finished with value: 0.3773398697376251 and parameters: {'d_token': 128, 'n_head': 4, 'n_layers': 2, 'ff_mult': 4, 'dropout': 0.3, 'lr': 0.00019721610970574026, 'weight_decay': 3.489018845491386e-05, 'batch_size': 256}. Best is trial 0 with value: 0.18795368075370789.
[W 2025-10-09 16:46:15,033] Trial 2 failed with parameters: {'d_token': 256, 'n_head': 4, 'n_layers': 1, 'ff_mult': 2, 'dropout': 0.0, 'lr': 0.0005388108577817234, 'weight_decay': 1.2681352169084602e-06, 'batch_size': 256} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\Pc-ADS\.conda\envs\tf\lib\site-packages\optuna\study\_optimize.p

KeyboardInterrupt: 