In [1]:
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd



# The project root is whatever directory the kernel was started in, so this
# notebook must be launched from the repository root for "./data" to resolve.
PROJECT_ROOT = os.getcwd()

# Put the project root first on sys.path (only once, so re-running the cell is
# harmless).
if PROJECT_ROOT not in sys.path:
	sys.path.insert(0, PROJECT_ROOT)

# Sanity check: os.listdir raises if <PROJECT_ROOT>/data does not exist.
print("PROJECT_ROOT:", PROJECT_ROOT)
print("Contenido de ./data:", os.listdir(os.path.join(PROJECT_ROOT, "data")))

PROJECT_ROOT: c:\Users\adria.flores\Documents\Projects\hacks\datathon2025-smadex
Contenido de ./data: ['sample_submission.csv', 'split', 'test', 'train']


In [2]:
import dask
import dask.dataframe as dd
import pandas as pd
import json
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
# Turn off dask's "dataframe.convert-string" option for the whole session.
# NOTE(review): presumably this keeps string columns as plain Python objects
# instead of pyarrow-backed strings — confirm against the dask configuration docs.
dask.config.set({"dataframe.convert-string": False})

<dask.config.set at 0x1fb4027cb60>

In [3]:

# Paths (relative to the kernel's working directory) of the parquet dataset and
# of the two artifacts this notebook writes and reads.
DATASET_PATH = "./data/train/train"
EMBEDDINGS_MAPPING_FILE = "embeddings_mappings.json"
SCALER_FILE = "scaler.joblib"
# Initial list of columns treated as categorical.
# NOTE(review): some entries (e.g. 'avg_daily_sessions', 'avg_duration',
# 'bcat_bottom_taxonomy', 'ctr') are aggregated and dropped by
# transform_variables, and this list is reassigned further down from the dtypes
# of the transformed frame — treat it as a provisional default.
CATEGORICAL_FEATURES = [
	'advertiser_bundle',
	'advertiser_category',
	'advertiser_subcategory',
	'advertiser_bottom_taxonomy_level',
	'country',
	'dev_make',
	'dev_model',
	'dev_os',
	'dev_osv',
	'release_date',
	'avg_daily_sessions',
	'avg_duration',
	'bcat_bottom_taxonomy',
	'ctr',
	]
# Initial list of columns treated as numerical.
# NOTE(review): it contains TARGET itself and columns that transform_variables
# drops ('hour', 'last_buy', the timestamp lists, ...); it is also reassigned
# further down from the dtypes of the transformed frame.
NUMERICAL_FEATURES = ['buyer_d1', 'buyer_d7', 'buyer_d14', 'buyer_d28', 'buy_d7', 'buy_d14', 'buy_d28', 'iap_revenue_d7', 'iap_revenue_d14', 'iap_revenue_d28', 'registration', 'retention_d1_to_d7', 'retention_d3_to_d7', 'retention_d7_to_d14', 'retention_d1', 'retention_d3', 'retentiond7', 'hour', 'release_msrp', 'weekday', 'avg_act_days', 'bundles_cat_bottom_taxonomy', 'first_request_ts_bundle', 'iap_revenue_usd_bundle', 'last_buy', 'last_buy_ts_bundle', 'last_buy_ts_category', 'last_ins', 'user_actions_bundles_action_last_timestamp', 'num_buys_bundle', 'rev_by_adv', 'rwd_prank', 'weekend_ratio', 'weeks_since_first_seen', 'wifi_ratio', 'whale_users_bundle_num_buys_prank', 'whale_users_bundle_revenue_prank', 'whale_users_bundle_total_num_buys', 'whale_users_bundle_total_revenue', 'last_buy_safe', 'last_ins_safe', 'last_buy_dt', 'last_ins_dt', 'hours_since_last_buy', 'hours_since_last_ins']
# Regression target; transform_variables replaces it with log1p(target).
TARGET = "iap_revenue_d7"

def load_embeddings_mapping():
	"""Parse the JSON file at EMBEDDINGS_MAPPING_FILE and return its content.

	Raises whatever ``open`` / ``json.load`` raise if the file is missing or
	is not valid JSON.
	"""
	with open(EMBEDDINGS_MAPPING_FILE, "r") as handle:
		return json.load(handle)

#EMBEDDING_MAPPINGS = load_embeddings_mapping()
# Columns requested from the parquet dataset. The order below is the order in
# which they are passed to the reader.
COLS_TO_READ = [
	'buyer_d1', 'buyer_d7', 'buyer_d14', 'buyer_d28',
	'buy_d7', 'buy_d14', 'buy_d28',
	'iap_revenue_d7', 'iap_revenue_d14', 'iap_revenue_d28',
	'registration',
	'retention_d1_to_d7', 'retention_d3_to_d7', 'retention_d7_to_d14',
	'retention_d1', 'retention_d3', 'retentiond7',
	'advertiser_bundle', 'advertiser_category', 'advertiser_subcategory',
	'advertiser_bottom_taxonomy_level',
	'country',
	'dev_make', 'dev_model', 'dev_os', 'dev_osv',
	'hour',
	'release_date', 'release_msrp',
	'weekday',
	'avg_act_days', 'avg_daily_sessions', 'avg_duration',
	'bcat_bottom_taxonomy', 'bundles_cat_bottom_taxonomy',
	'ctr',
	'first_request_ts_bundle',
	'iap_revenue_usd_bundle',
	'last_buy', 'last_buy_ts_bundle', 'last_buy_ts_category', 'last_ins',
	'user_actions_bundles_action_last_timestamp',
	'num_buys_bundle',
	'rev_by_adv',
	'rwd_prank',
	'weekend_ratio', 'weeks_since_first_seen', 'wifi_ratio',
	'whale_users_bundle_num_buys_prank', 'whale_users_bundle_revenue_prank',
	'whale_users_bundle_total_num_buys', 'whale_users_bundle_total_revenue',
]

# Row filter handed to the parquet reader: keep rows whose "datetime" value is
# >= "2025-10-01-00-00" and < "2025-10-06-00-00".
# NOTE(review): the bounds are strings, so this presumably relies on "datetime"
# being a string (or partition) column in the same format — confirm.
train_filters = [("datetime", ">=", "2025-10-01-00-00"), ("datetime", "<", "2025-10-06-00-00")]

# Lazy dask frame restricted to COLS_TO_READ and the filter above; nothing is
# loaded until .compute() is called.
ddf_train = dd.read_parquet(
	DATASET_PATH,
	engine="pyarrow",
	columns=COLS_TO_READ,
	filters=train_filters
)

# loaded_scaler = joblib.load(SCALER_FILE)

In [17]:


def process_partition(df, scaler=None):
	"""Run the full preprocessing pipeline on one pandas frame and return it.

	Steps, in order: ``transform_variables`` -> ``scale_numerical_features``
	-> ``impute_missings``.

	Args:
		df: raw frame with the columns ``transform_variables`` expects.
		scaler: fitted scaler exposing ``transform``. When omitted, the scaler
			persisted at ``SCALER_FILE`` is loaded with joblib.

	Returns:
		The processed DataFrame. ``transform_variables`` returns a new frame,
		so callers must use this return value; the input is not the result.
	"""
	if scaler is None:
		# The previous code read a global `loaded_scaler` that no pipeline cell
		# defines; load the persisted artifact instead.
		scaler = joblib.load(SCALER_FILE)

	# Keep the returned frame: transform_variables drops columns via a copy.
	df = transform_variables(df)
	# The next two helpers modify `df` in place.
	scale_numerical_features(df, scaler)
	impute_missings(df)
	return df

def hours_since_now_from_list(tuples_list, now_ts):
	"""Hours between ``now_ts`` and the largest timestamp found in ``tuples_list``.

	Only elements that are 2-tuples are considered; their second item is used
	as the timestamp unless it is ``None`` or a list. The difference
	``now_ts - max(timestamps)`` is divided by 3600.

	Returns ``np.nan`` when ``tuples_list`` is not a non-empty list or when no
	usable timestamp is found.
	"""
	if not isinstance(tuples_list, list) or len(tuples_list) == 0:
		return np.nan

	candidates = [
		pair[1]
		for pair in tuples_list
		if isinstance(pair, tuple)
		and len(pair) == 2
		and pair[1] is not None
		and not isinstance(pair[1], list)
	]
	if len(candidates) == 0:
		return np.nan

	# The most recent timestamp is the one closest to `now_ts`.
	return (now_ts - max(candidates)) / 3600

def extract_numbers(tuple_list):
	"""Return the second item of every tuple of length >= 2 in ``tuple_list``.

	Non-tuple elements and shorter tuples are skipped. Any input that is not a
	list yields an empty list.
	"""
	if not isinstance(tuple_list, list):
		return []

	values = []
	for entry in tuple_list:
		if isinstance(entry, tuple) and len(entry) >= 2:
			values.append(entry[1])
	return values

def aggregate(values, mode):
	"""Reduce ``values`` to one number.

	Returns ``np.nan`` for an empty (falsy) ``values``, the sum when ``mode``
	is ``"sum"``, and the arithmetic mean for any other ``mode``.
	"""
	if not values:
		return np.nan
	total = sum(values)
	return total if mode == "sum" else total / len(values)

def transform_variables(df, now=None):
	"""Derive model features from the raw columns and return a new frame.

	What it does, in order:
	  1. ``last_buy`` / ``last_ins`` (Unix seconds) -> ``hours_since_last_buy``
	     / ``hours_since_last_ins``; values outside [0, 4102444800] become NaN.
	  2. ``hour`` -> ``hour_sin`` / ``hour_cos`` (24-hour cyclic encoding).
	  3. Four list-of-tuples timestamp columns -> ``<col>_hours_ago``.
	  4. Thirteen list-of-tuples value columns -> ``<col>_agg`` (sum or mean).
	  5. ``TARGET`` (if present) -> ``log1p(TARGET)``.
	All source columns of steps 1-4 are dropped.

	Args:
		df: pandas frame holding the raw columns listed above. It is not
			modified; a new frame is returned.
		now: reference instant for every "hours since" feature. Anything
			``pd.Timestamp`` accepts; naive values are taken as UTC. Defaults
			to the current UTC time. Pass a fixed value to make training and
			inference features comparable and the function reproducible.

	Returns:
		The transformed DataFrame.

	Side effects:
		Prints one line per aggregated column.
	"""
	# Resolve the reference instant ONCE, in UTC: the columns are decoded as
	# epoch seconds (UTC), so a local-time "now" would skew every feature by
	# the machine's UTC offset, and two separate clock reads would disagree.
	now = pd.Timestamp.now(tz="UTC") if now is None else pd.Timestamp(now)
	if now.tzinfo is not None:
		now = now.tz_convert("UTC").tz_localize(None)
	now_ts = int(now.timestamp())  # pandas treats a naive Timestamp as UTC here

	# Plausibility bounds for Unix timestamps, in seconds.
	min_ts = 0                  # 1970-01-01
	max_ts = 4102444800         # 2100-01-01

	# Compute the two "hours since" series without adding scratch columns to
	# the caller's frame.
	hours_since = {}
	for source_col, feature_col in (
		("last_buy", "hours_since_last_buy"),
		("last_ins", "hours_since_last_ins"),
	):
		in_range = df[source_col].where(df[source_col].between(min_ts, max_ts), np.nan)
		as_datetime = pd.to_datetime(in_range, unit="s")
		hours_since[feature_col] = (now - as_datetime).dt.total_seconds() / 3600

	# drop() returns a copy, so everything below works on a new frame.
	df = df.drop(columns=["last_buy", "last_ins"])
	for feature_col, series in hours_since.items():
		df[feature_col] = series

	# Cyclic encoding of the hour of day (full circle = 24 hours).
	# `hour` is assumed to be castable to int.
	hour_radians = df["hour"].astype(int) * (2 * np.pi / 24)
	df["hour_sin"] = np.sin(hour_radians)
	df["hour_cos"] = np.cos(hour_radians)
	df = df.drop(columns=["hour"])

	# List-of-tuples timestamp columns -> hours since the most recent entry.
	ts_cols = [
		"first_request_ts_bundle",
		"last_buy_ts_bundle",
		"last_buy_ts_category",
		"user_actions_bundles_action_last_timestamp"
	]
	for col in ts_cols:
		df[col + "_hours_ago"] = df[col].apply(
			lambda x: hours_since_now_from_list(x, now_ts)
		)
	df = df.drop(columns=ts_cols)

	# List-of-tuples value columns -> a single scalar per row.
	rules = {
		"iap_revenue_usd_bundle": "sum",
		"num_buys_bundle": "sum",
		"rev_by_adv": "sum",
		"rwd_prank": "mean",
		"whale_users_bundle_num_buys_prank": "mean",
		"whale_users_bundle_revenue_prank": "mean",
		"whale_users_bundle_total_num_buys": "sum",
		"whale_users_bundle_total_revenue": "sum",
		"avg_daily_sessions": "mean",
		"avg_duration": "mean",
		"bcat_bottom_taxonomy": "mean",
		"ctr": "sum",
		"bundles_cat_bottom_taxonomy": "sum",
	}
	for col, mode in rules.items():
		new_col = col + "_agg"
		# `mode` is bound as a default so each lambda keeps its own value.
		df[new_col] = df[col].apply(
			lambda lst, mode=mode: aggregate(extract_numbers(lst), mode)
		)
		print(f"Transformed {col} into {new_col} using {mode} aggregation.")
	df = df.drop(columns=list(rules.keys()))

	# The model is trained on log1p(target) when the target is available.
	if TARGET in df.columns:
		df[TARGET] = np.log1p(df[TARGET])

	return df

def impute_missings(df):
	"""Encode categoricals and impute numericals, modifying ``df`` in place.

	For every column in ``CATEGORICAL_FEATURES``: missing values become
	``"<MISSING>"`` and each value is replaced by its index in
	``EMBEDDING_MAPPINGS[column]`` (0 when the value is not in the mapping).

	For every column in ``NUMERICAL_FEATURES``: a float ``<column>_is_missing``
	indicator is added, then missing values are replaced by 0 and the column
	is cast to float.

	Returns ``None``.
	"""
	for cat_col in CATEGORICAL_FEATURES:
		df[cat_col] = df[cat_col].fillna("<MISSING>")

		# Values absent from the mapping fall back to index 0.
		lookup = EMBEDDING_MAPPINGS[cat_col]
		df[cat_col] = df[cat_col].map(lambda value: lookup.get(value, 0))

	for num_col in NUMERICAL_FEATURES:
		# Record missingness before it is overwritten by the imputation.
		missing_mask = df[num_col].isna()
		df[f"{num_col}_is_missing"] = missing_mask.astype(float)

		df[num_col] = df[num_col].fillna(0).astype(float)

def scale_numerical_features(df, scaler):
	"""Overwrite the ``NUMERICAL_FEATURES`` columns of ``df`` with ``scaler.transform`` of them.

	``df`` is modified in place; nothing is returned.
	"""
	scaled_values = scaler.transform(df[NUMERICAL_FEATURES])
	df[NUMERICAL_FEATURES] = scaled_values

def train_scaler(df):
	"""Fit a ``StandardScaler`` on ``df[NUMERICAL_FEATURES]`` and dump it to ``SCALER_FILE``.

	Prints a confirmation line; returns ``None``.
	"""
	fitted_scaler = StandardScaler().fit(df[NUMERICAL_FEATURES])
	joblib.dump(fitted_scaler, SCALER_FILE)
	print("Scaler saved.")

def generate_embeddings_mapping(df: pd.DataFrame):
	"""Build value -> index tables for every categorical column and save them as JSON.

	Intended to run after ``transform_variables`` and BEFORE ``impute_missings``.
	For each column in ``CATEGORICAL_FEATURES`` the table starts with
	``"<MISSING>"`` at index 0, followed by the column's distinct non-null
	values in order of first appearance. The result is written to
	``EMBEDDINGS_MAPPING_FILE``; nothing is returned.
	"""
	mappings = {}
	for feature in CATEGORICAL_FEATURES:
		observed = df[feature].dropna().unique().tolist()
		mappings[feature] = {
			value: position
			for position, value in enumerate(["<MISSING>"] + observed)
		}

	# Persist the tables so later cells can reload them.
	with open(EMBEDDINGS_MAPPING_FILE, "w") as handle:
		json.dump(mappings, handle)


In [7]:
# Materialise the lazy dask frame as a single in-memory pandas DataFrame.
df_train = ddf_train.compute()
print("Training data loaded.")

Training data loaded.


In [8]:
# Rebinds df_train to the transformed frame, so the raw frame is no longer
# reachable under this name and re-running this cell alone will fail.
df_train = transform_variables(df_train)

Transformed iap_revenue_usd_bundle into iap_revenue_usd_bundle_agg using sum aggregation.
Transformed num_buys_bundle into num_buys_bundle_agg using sum aggregation.
Transformed rev_by_adv into rev_by_adv_agg using sum aggregation.
Transformed rwd_prank into rwd_prank_agg using mean aggregation.
Transformed whale_users_bundle_num_buys_prank into whale_users_bundle_num_buys_prank_agg using mean aggregation.
Transformed whale_users_bundle_revenue_prank into whale_users_bundle_revenue_prank_agg using mean aggregation.
Transformed whale_users_bundle_total_num_buys into whale_users_bundle_total_num_buys_agg using sum aggregation.
Transformed whale_users_bundle_total_revenue into whale_users_bundle_total_revenue_agg using sum aggregation.
Transformed avg_daily_sessions into avg_daily_sessions_agg using mean aggregation.
Transformed avg_duration into avg_duration_agg using mean aggregation.
Transformed bcat_bottom_taxonomy into bcat_bottom_taxonomy_agg using mean aggregation.
Transformed ctr 

In [9]:
# Sanity checks on the transformed frame: index and column-label uniqueness.
print("Index único:", df_train.index.is_unique)
print("Columnas únicas:", df_train.columns.is_unique)

# List the duplicated column labels, if any.
if not df_train.columns.is_unique:
    print("Columnas duplicadas:")
    print(df_train.columns[df_train.columns.duplicated()])
    
# Collect every column that still holds at least one Python list value.
cols_with_lists = []

for col in df_train.columns:
    # Row-wise isinstance check over the whole column.
    has_list = df_train[col].apply(lambda x: isinstance(x, list)).any()
    if has_list:
        cols_with_lists.append(col)

print("Columnas con listas:", cols_with_lists)

Index único: False
Columnas únicas: True
Columnas con listas: []


In [None]:
# Split the columns of the transformed frame by dtype: numeric dtypes go to
# the numerical list, everything else to the categorical list.
numerical_features = []
categorical_features = []

for col in df_train.columns:
	(
		numerical_features
		if pd.api.types.is_numeric_dtype(df_train[col])
		else categorical_features
	).append(col)

print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)

# Replace the provisional module-level lists with the inferred ones.
CATEGORICAL_FEATURES = categorical_features
NUMERICAL_FEATURES = numerical_features

# The target must not be fed to the model as an input. NUMERICAL_FEATURES and
# numerical_features are the same list object, so both lose the target.
# NOTE(review): per the printed list, other d1/d7/d14/d28 outcome columns stay
# in NUMERICAL_FEATURES — confirm they are available at inference time.
if TARGET in NUMERICAL_FEATURES:
	NUMERICAL_FEATURES.remove(TARGET)
	



Numerical features: ['buyer_d1', 'buyer_d7', 'buyer_d14', 'buyer_d28', 'buy_d7', 'buy_d14', 'buy_d28', 'iap_revenue_d7', 'iap_revenue_d14', 'iap_revenue_d28', 'registration', 'retention_d1_to_d7', 'retention_d3_to_d7', 'retention_d7_to_d14', 'retention_d1', 'retention_d3', 'retentiond7', 'release_msrp', 'weekday', 'avg_act_days', 'weekend_ratio', 'weeks_since_first_seen', 'wifi_ratio', 'hours_since_last_buy', 'hours_since_last_ins', 'hour_sin', 'hour_cos', 'first_request_ts_bundle_hours_ago', 'last_buy_ts_bundle_hours_ago', 'last_buy_ts_category_hours_ago', 'user_actions_bundles_action_last_timestamp_hours_ago', 'iap_revenue_usd_bundle_agg', 'num_buys_bundle_agg', 'rev_by_adv_agg', 'rwd_prank_agg', 'whale_users_bundle_num_buys_prank_agg', 'whale_users_bundle_revenue_prank_agg', 'whale_users_bundle_total_num_buys_agg', 'whale_users_bundle_total_revenue_agg', 'avg_daily_sessions_agg', 'avg_duration_agg', 'bcat_bottom_taxonomy_agg', 'ctr_agg', 'bundles_cat_bottom_taxonomy_agg']
Categorica

In [18]:
# Write both preprocessing artifacts from the transformed training frame:
# the categorical index tables (EMBEDDINGS_MAPPING_FILE) and the fitted
# scaler (SCALER_FILE).
generate_embeddings_mapping(df_train)
train_scaler(df_train)

Scaler saved.


In [None]:
import pandas as pd
import numpy as np

import os

def _demo_scaler_roundtrip(scaler_filename='my_scaler.joblib'):
	"""Toy demonstration: fit a StandardScaler, save it, reload it, transform new data.

	Everything lives in local variables so the demo cannot overwrite the real
	``df_train`` or leave a toy ``loaded_scaler`` / ``scaler`` in the notebook
	namespace. The temporary scaler file is removed even if a step fails.

	Args:
		scaler_filename: path of the temporary joblib file.
	"""
	# --- 1. Define Data and Columns to Scale ---
	cols_to_scale = ['age', 'income']

	# Create a training DataFrame
	demo_train = pd.DataFrame({
		'age': [20, 30, 40, 50, np.nan],
		'income': [50000, 60000, np.nan, 80000, 75000],
		'city': ['New York', 'London', 'Paris', 'Tokyo', 'Sydney']
	})

	print("Original Training Data:")
	print(demo_train)
	print("-" * 30)

	# --- 2. Initialize, Fit, and Save Scaler ---

	# Fit the scaler ONLY on the specified columns
	# .fit() ignores NaNs when calculating mean and std
	demo_scaler = StandardScaler()
	print(f"Fitting scaler on columns: {cols_to_scale}")
	demo_scaler.fit(demo_train[cols_to_scale])

	# Save the fitted scaler to a file
	joblib.dump(demo_scaler, scaler_filename)
	try:
		print(f"Scaler saved to {scaler_filename}")
		print("-" * 30)

		# --- 3. Load Scaler and Transform New Data ---

		# Create new "test" data to transform
		demo_test = pd.DataFrame({
			'age': [35, 45, np.nan],
			'income': [55000, 90000, 62000],
			'city': ['Berlin', 'Moscow', 'Lisbon']
		})
		# Snapshot the labels BEFORE transforming so the final check compares
		# two different states instead of the frame with itself.
		columns_before = demo_test.columns.copy()

		print("New Data (Before Transform):")
		print(demo_test)
		print(f"Column names before: {columns_before.tolist()}")
		print("-" * 30)

		# Load the scaler from the file
		restored_scaler = joblib.load(scaler_filename)
		print("Scaler loaded.")

		# --- This is the key step ---
		# Transform only the specified columns and assign them back to the
		# *same columns*. .transform() scales non-NaNs and keeps NaNs as NaN.
		demo_test[cols_to_scale] = restored_scaler.transform(demo_test[cols_to_scale])

		# --- 4. Verify Results ---

		print("\nNew Data (After Transform):")
		print(demo_test)
		print(f"Column names after:  {demo_test.columns.tolist()}")

		# Check that column names and order are identical
		are_names_same = demo_test.columns.equals(columns_before)
		print(f"\nColumn names and order unchanged: {are_names_same}")
	finally:
		# Clean up the created file
		os.remove(scaler_filename)


_demo_scaler_roundtrip()

In [None]:
# Check all categorical features are present in df
# NOTE(review): `df` is not defined in any cell visible in this notebook; this
# cell only runs if an earlier (unseen or deleted) cell bound it.
for feature in CATEGORICAL_FEATURES:
	if feature not in df.columns:
		raise ValueError(f"Categorical feature '{feature}' not found in dataframe columns.")

# Show the other variables present in df
df_vars = [col for col in df.columns if col not in CATEGORICAL_FEATURES]
print("Other variables in df:", df_vars)

Other variables in df: ['buyer_d1', 'buyer_d7', 'buyer_d14', 'buyer_d28', 'buy_d7', 'buy_d14', 'buy_d28', 'iap_revenue_d7', 'iap_revenue_d14', 'iap_revenue_d28', 'registration', 'retention_d1_to_d7', 'retention_d3_to_d7', 'retention_d7_to_d14', 'retention_d1', 'retention_d3', 'retentiond7', 'hour', 'release_msrp', 'weekday', 'avg_act_days', 'bundles_cat', 'bundles_cat_bottom_taxonomy', 'city_hist', 'country_hist', 'dev_language_hist', 'dev_osv_hist', 'first_request_ts_bundle', 'iap_revenue_usd_bundle', 'last_buy', 'last_buy_ts_bundle', 'last_buy_ts_category', 'last_ins', 'user_actions_bundles_action_last_timestamp', 'num_buys_bundle', 'region_hist', 'rev_by_adv', 'rwd_prank', 'weekend_ratio', 'weeks_since_first_seen', 'wifi_ratio', 'whale_users_bundle_num_buys_prank', 'whale_users_bundle_revenue_prank', 'whale_users_bundle_total_num_buys', 'whale_users_bundle_total_revenue', 'last_buy_safe', 'last_ins_safe', 'last_buy_dt', 'last_ins_dt', 'hours_since_last_buy', 'hours_since_last_ins']


In [None]:
# For each categorical column, count how many values there are of each Python
# type (top 5 types). Relies on the same externally defined `df` as above.
for feature in CATEGORICAL_FEATURES:
	print("====", feature, "====")
	print(df[feature].map(type).value_counts().head())

==== advertiser_bundle ====
advertiser_bundle
<class 'str'>    17294102
Name: count, dtype: int64
==== advertiser_category ====
advertiser_category
<class 'str'>         15609385
<class 'NoneType'>     1684717
Name: count, dtype: int64
==== advertiser_subcategory ====
advertiser_subcategory
<class 'str'>         15609385
<class 'NoneType'>     1684717
Name: count, dtype: int64
==== advertiser_bottom_taxonomy_level ====
advertiser_bottom_taxonomy_level
<class 'str'>         10849228
<class 'NoneType'>     6444874
Name: count, dtype: int64
==== country ====
country
<class 'str'>         17287248
<class 'NoneType'>        6854
Name: count, dtype: int64
==== dev_make ====
dev_make
<class 'str'>         16974677
<class 'NoneType'>      319425
Name: count, dtype: int64
==== dev_model ====
dev_model
<class 'str'>         17265823
<class 'NoneType'>       28279
Name: count, dtype: int64
==== dev_os ====
dev_os
<class 'str'>         17294014
<class 'NoneType'>          88
Name: count, dtype: in

In [None]:
# NOTE(review): the recorded output of this cell is
# "TypeError: unhashable type: 'list'" — presumably some CATEGORICAL_FEATURES
# column of `df` still held list values when it was run. Confirm and remove
# the cell if it is a leftover experiment.
generate_embeddings_mapping(df)

TypeError: unhashable type: 'list'

In [None]:
import torch
import torch.nn as nn
from typing import Dict, List


class EmbeddingMLP(nn.Module):
    """MLP regressor over continuous features plus learned categorical embeddings.

    Each categorical feature gets its own ``nn.Embedding``. The embeddings are
    concatenated after the continuous features and fed to a stack of
    Linear -> ReLU -> Dropout layers ending in a single output unit.

    Args:
        cat_cardinalities: feature name -> number of embedding rows.
        cat_features: categorical feature names, in the column order used by
            the input tensor. Every name must be a key of ``cat_cardinalities``.
        num_cont_features: number of continuous input columns (numerical
            features plus their missing indicators).
        embedding_dim: width of every categorical embedding.
        hidden_dims: widths of the hidden layers.
        dropout: dropout probability applied after each hidden layer.
    """

    def __init__(
        self,
        cat_cardinalities: Dict[str, int],
        cat_features: List[str],
        num_cont_features: int,          # numerical features + missing indicators
        embedding_dim: int = 16,
        hidden_dims: List[int] = [128, 64],
        dropout: float = 0.1,
    ):
        super().__init__()
        self.cat_features = cat_features
        self.num_cat = len(cat_features)
        # Must be stored before it is read below; it was previously never
        # assigned, so building the model raised AttributeError.
        self.num_cont_features = num_cont_features

        # One embedding table per categorical feature.
        self.embeddings = nn.ModuleDict({
            feat: nn.Embedding(num_embeddings=cardinality, embedding_dim=embedding_dim)
            for feat, cardinality in cat_cardinalities.items()
        })

        # MLP input width: continuous columns + all concatenated embeddings.
        total_cat_dim = embedding_dim * self.num_cat
        input_dim = self.num_cont_features + total_cat_dim

        layers = []
        prev_dim = input_dim
        for h in hidden_dims:
            layers.append(nn.Linear(prev_dim, h))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            prev_dim = h

        layers.append(nn.Linear(prev_dim, 1))  # scalar regression output

        self.mlp = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Predict one value per row.

        Args:
            x: tensor of shape [batch, num_cat + num_cont_features] whose
                columns are ordered as
                [CATEGORICAL_FEATURES..., NUMERICAL_FEATURES..., NUMERICAL_IS_MISSING...].
                The first ``num_cat`` columns are cast to integer indices.

        Returns:
            Tensor of shape [batch].
        """
        # 1) Split the flat input into categorical ids and continuous values.
        x_cat_ids = x[:, :self.num_cat].long()     # [batch, num_cat]
        x_cont = x[:, self.num_cat:].float()       # [batch, num_cont_features]

        # 2) Look up one embedding per categorical feature.
        emb_list = []
        for i, feat in enumerate(self.cat_features):
            emb = self.embeddings[feat](x_cat_ids[:, i])  # [batch, embedding_dim]
            emb_list.append(emb)

        if emb_list:
            x_cat_emb = torch.cat(emb_list, dim=1)        # [batch, embedding_dim * num_cat]
            x_input = torch.cat([x_cont, x_cat_emb], dim=1)
        else:
            x_input = x_cont

        out = self.mlp(x_input)   # [batch, 1]
        return out.squeeze(-1)

In [None]:
# Reload the categorical index tables written by generate_embeddings_mapping.
with open("embeddings_mappings.json", "r") as f:
    EMBEDDING_MAPPINGS = json.load(f)

# Two continuous inputs per numerical feature: the value and its _is_missing flag.
num_cont_features = len(NUMERICAL_FEATURES) + len(NUMERICAL_FEATURES)

# Embedding table size per categorical feature = number of entries in its mapping.
cat_cardinalities = {
    feat: len(EMBEDDING_MAPPINGS[feat])
    for feat in CATEGORICAL_FEATURES
}

# `device` must exist before `.to(device)` below; it was previously defined
# only in the following cell, so this cell failed on a fresh kernel.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = EmbeddingMLP(
    cat_cardinalities=cat_cardinalities,
    cat_features=CATEGORICAL_FEATURES,
    num_cont_features=num_cont_features,
    embedding_dim=16,
    hidden_dims=[128, 64],
    dropout=0.1,
).to(device)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.MSELoss()   # the target is log1p-transformed, so this is MSE in log space
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)

num_epochs = 10


def _batch_to_model_input(batch):
    """Assemble the single tensor the model's ``forward(x)`` expects.

    ``forward`` takes ONE tensor laid out as [categorical ids..., continuous...],
    so the two parts of the batch are concatenated in that order. Calling
    ``model(x_num, x_cat)`` with two tensors raises TypeError.

    NOTE(review): assumes batch["cat"] is [B, num_cat] and batch["num"] is
    [B, num_cont_features] (values + missing indicators) — confirm against the
    Dataset that feeds train_loader / val_loader, which is not defined in the
    visible cells.
    """
    x_num = batch["num"].to(device).float()
    x_cat = batch["cat"].to(device).float()   # forward() casts the ids back to long
    return torch.cat([x_cat, x_num], dim=1)


for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    n_batches = 0

    for batch in train_loader:
        y = batch["y"].to(device).float()

        optimizer.zero_grad()
        preds = model(_batch_to_model_input(batch))   # [batch]
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        n_batches += 1

    # max(..., 1) guards against an empty loader.
    avg_loss = running_loss / max(n_batches, 1)
    print(f"Epoch {epoch+1}/{num_epochs} - train_loss: {avg_loss:.4f}")

    # Simple per-epoch validation pass (mean of the per-batch losses).
    model.eval()
    val_loss = 0.0
    n_val = 0
    with torch.inference_mode():
        for batch in val_loader:
            y = batch["y"].to(device).float()

            preds = model(_batch_to_model_input(batch))
            loss = criterion(preds, y)
            val_loss += loss.item()
            n_val += 1

    val_loss /= max(n_val, 1)
    print(f"           val_loss: {val_loss:.4f}")


In [None]:
class FTTransformer(nn.Module):
	"""Transformer encoder over one token per input feature, read out through a CLS token.

	Every categorical feature is embedded into a ``d_token``-wide token, every
	continuous column is turned into a token by a per-column scale and shift,
	a learned CLS token is prepended, the sequence goes through a Transformer
	encoder, and an MLP head maps the encoded CLS token to one scalar.

	Args:
		cat_cardinalities: feature name -> number of embedding rows.
		cat_features: categorical feature names in input-column order.
		num_cont_features: number of continuous columns (numerical features
			plus their missing indicators).
		d_token: width of every token; must be divisible by ``n_heads``.
		n_layers: number of encoder layers.
		n_heads: attention heads per layer.
		ff_mult: feed-forward width as a multiple of ``d_token``.
		dropout: dropout probability (encoder and head).
		mlp_hidden_dims: hidden widths of the regression head.
	"""

	def __init__(
		self,
		cat_cardinalities: Dict[str, int],
		cat_features: List[str],
		num_cont_features: int,        # numerical features + missing indicators
		d_token: int = 32,             # width of every token (embeddings and continuous)
		n_layers: int = 3,
		n_heads: int = 4,
		ff_mult: float = 4.0,
		dropout: float = 0.1,
		mlp_hidden_dims: List[int] = [128, 64],
	):
		super().__init__()

		assert d_token % n_heads == 0, "d_token debe ser divisible por n_heads"

		self.cat_features = cat_features
		self.num_cat = len(cat_features)
		self.num_cont_features = num_cont_features
		self.d_token = d_token

		# Categorical tokens: one embedding table per feature.
		self.cat_embeddings = nn.ModuleDict()
		for feat, cardinality in cat_cardinalities.items():
			self.cat_embeddings[feat] = nn.Embedding(
				num_embeddings=cardinality, embedding_dim=d_token
			)

		# Continuous tokens: column j becomes value * weight[j] + bias[j].
		self.cont_weight = nn.Parameter(torch.randn(num_cont_features, d_token))
		self.cont_bias = nn.Parameter(torch.zeros(num_cont_features, d_token))

		# CLS token that summarises the whole row.
		self.cls_token = nn.Parameter(torch.zeros(1, 1, d_token))

		# Encoder stack operating on [B, T, C] tensors.
		self.transformer = nn.TransformerEncoder(
			nn.TransformerEncoderLayer(
				d_model=d_token,
				nhead=n_heads,
				dim_feedforward=int(ff_mult * d_token),
				dropout=dropout,
				batch_first=True,
				activation="gelu",
			),
			num_layers=n_layers,
		)

		# Regression head: (Linear -> ReLU -> Dropout) per hidden width, then Linear -> 1.
		widths = [d_token, *mlp_hidden_dims]
		head_layers = []
		for fan_in, fan_out in zip(widths[:-1], widths[1:]):
			head_layers.extend([nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)])
		head_layers.append(nn.Linear(widths[-1], 1))
		self.head = nn.Sequential(*head_layers)

	def forward(self, x: torch.Tensor) -> torch.Tensor:
		"""Predict one value per row.

		Args:
			x: tensor of shape [batch, num_cat + num_cont_features] with columns
				ordered as
				[CATEGORICAL_FEATURES..., NUMERICAL_FEATURES..., NUMERICAL_IS_MISSING...].

		Returns:
			Tensor of shape [batch].
		"""
		batch_size = x.size(0)

		# Split the flat row into categorical ids and continuous values.
		cat_ids = x[:, :self.num_cat].long()          # [B, num_cat]
		cont_values = x[:, self.num_cat:].float()     # [B, num_cont]

		# One token per categorical feature.
		per_feature = [
			self.cat_embeddings[name](cat_ids[:, position])    # [B, d_token]
			for position, name in enumerate(self.cat_features)
		]
		if per_feature:
			cat_tok = torch.stack(per_feature, dim=1)          # [B, num_cat, d_token]
		else:
			# No categorical features: zero-length token axis.
			cat_tok = torch.empty(batch_size, 0, self.d_token, device=x.device)

		# One token per continuous column: [B, num_cont, 1] * [1, num_cont, d_token] + bias.
		cont_tok = (
			cont_values.unsqueeze(-1) * self.cont_weight.unsqueeze(0)
			+ self.cont_bias.unsqueeze(0)
		)

		# Sequence layout: [CLS, categorical tokens..., continuous tokens...].
		cls = self.cls_token.expand(batch_size, 1, self.d_token)
		sequence = torch.cat([cls, cat_tok, cont_tok], dim=1)  # [B, 1 + T, d_token]

		encoded = self.transformer(sequence)                   # [B, 1 + T, d_token]

		# Only the encoded CLS position feeds the head.
		return self.head(encoded[:, 0, :]).squeeze(-1)         # [B]


def main():
	AMP = False

	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	print(f"Using device: {device}")
	if not torch.cuda.is_available() or not torch.amp.autocast_mode.is_autocast_available("cuda"):
		AMP = False
	
	train_filters = [("datetime", "<", "2025-10-06-23-00")]
	val_filters = [("datetime", ">=", "2025-10-06-23-00")]
	
	ddf_train = dd.read_parquet(
		DATASET_PATH,
		engine="pyarrow",
		columns=COLS_TO_READ,
		filters=train_filters
	)

	ddf_val = dd.read_parquet(
		DATASET_PATH,
		engine="pyarrow",
		columns=COLS_TO_READ,
		filters=val_filters
	)

	val_pdf = ddf_val.compute()
	val_pdf = process_partition(val_pdf)

	val_features_tensor = torch.tensor(
		val_pdf[CATEGORICAL_FEATURES + NUMERICAL_FEATURES + [f"{col}_is_missing" for col in NUMERICAL_FEATURES]].values, 
		dtype=torch.float32
	)
	
	val_target_tensor = torch.tensor(
		val_pdf[TARGET].values,
		dtype=torch.float32
	)

	val_dataset = TensorDataset(val_features_tensor, val_target_tensor)
	test_loader = DataLoader(dataset=val_dataset, 
							 batch_size=BATCH_SIZE, 
							 num_workers=0,
							 pin_memory=True,
							 shuffle=False)

	num_cont_features = len(NUMERICAL_FEATURES) + len(NUMERICAL_FEATURES)  # numéricas + _is_missing

	cat_cardinalities = {
		feat: len(EMBEDDING_MAPPINGS[feat])
		for feat in CATEGORICAL_FEATURES
	}
	model = FTTransformer(
		cat_cardinalities=cat_cardinalities,
		cat_features=CATEGORICAL_FEATURES,
		num_cont_features=num_cont_features,
		d_token=64,          # puedes subir a 64 si la GPU aguanta
		n_layers=3,
		n_heads=4,
		ff_mult=4.0,
		dropout=0.1,
		mlp_hidden_dims=[128, 64],
	).to(device)