In [3]:
# %% imports básicos
import os
import json
import numpy as np
import pandas as pd
import dask.dataframe as dd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt
import joblib
from typing import List, Dict
from sklearn.preprocessing import StandardScaler

# %% paths y config (ajusta a tu setup)
DATASET_PATH = "./data/train"
CHECKPOINT_PATH = "model_finetuned_train_val.pth"   # <-- pon aquí tu checkpoint final

EMBEDDINGS_MAPPING_FILE = "embeddings_mappings.json"
SCALER_FILE = "scaler.joblib"

CATEGORICAL_FEATURES = ['advertiser_bundle', 'advertiser_category', 'advertiser_subcategory', 'advertiser_bottom_taxonomy_level', 'country', 'dev_make', 'dev_model', 'dev_os', 'dev_osv', 'release_date']
NUMERICAL_FEATURES = ['release_msrp', 'weekday', 'avg_act_days', 'weekend_ratio', 'weeks_since_first_seen', 'wifi_ratio', 'hours_since_last_buy', 'hours_since_last_ins', 'hour_sin', 'hour_cos', 'first_request_ts_bundle_hours_ago', 'last_buy_ts_bundle_hours_ago', 'last_buy_ts_category_hours_ago', 'user_actions_bundles_action_last_timestamp_hours_ago', 'iap_revenue_usd_bundle_agg', 'num_buys_bundle_agg', 'rev_by_adv_agg', 'rwd_prank_agg', 'whale_users_bundle_num_buys_prank_agg', 'whale_users_bundle_revenue_prank_agg', 'whale_users_bundle_total_num_buys_agg', 'whale_users_bundle_total_revenue_agg', 'avg_daily_sessions_agg', 'avg_duration_agg', 'bcat_bottom_taxonomy_agg', 'ctr_agg', 'bundles_cat_bottom_taxonomy_agg']
ALL_FEATURES = CATEGORICAL_FEATURES + NUMERICAL_FEATURES + [f"{col}_is_missing" for col in NUMERICAL_FEATURES]

TARGET = "iap_revenue_d7"
TARGET_BIN = "buy_d7_binary"        # 0/1 compra
TARGET_LOG = "iap_revenue_d7_log"  

def load_embeddings_mapping():
	with open(EMBEDDINGS_MAPPING_FILE, "r") as f:
		mappings = json.load(f)
	return mappings

EMBEDDING_MAPPINGS = load_embeddings_mapping()
COLS_TO_READ = [
			 'iap_revenue_d7', 
			 'advertiser_bundle', 
			 'advertiser_category', 
			 'advertiser_subcategory', 
			 'advertiser_bottom_taxonomy_level', 
			 'country', 
			 'dev_make', 
			 'dev_model', 
			 'dev_os', 
			 'dev_osv', 
			 'hour', 
			 'release_date', 
			 'release_msrp', 
			 'weekday', 
			 'avg_act_days', 
			 'avg_daily_sessions', 
			 'avg_duration', 
			 'bcat_bottom_taxonomy', 
			 'bundles_cat_bottom_taxonomy',  
			 'ctr',  
			 'first_request_ts_bundle', 
			 'iap_revenue_usd_bundle', 
			 'last_buy', 
			 'last_buy_ts_bundle', 
			 'last_buy_ts_category', 
			 'last_ins', 
			 'user_actions_bundles_action_last_timestamp', 
			 'num_buys_bundle', 
			 'rev_by_adv', 
			 'rwd_prank', 
			 'weekend_ratio', 
			 'weeks_since_first_seen', 
			 'wifi_ratio', 
			 'whale_users_bundle_num_buys_prank', 
			 'whale_users_bundle_revenue_prank', 
			 'whale_users_bundle_total_num_buys', 
			 'whale_users_bundle_total_revenue']

loaded_scaler = joblib.load(SCALER_FILE)


def process_partition(df):
	df = transform_variables(df)
	df = scale_numerical_features(df, loaded_scaler)
	df = impute_missings(df)
	return df

def hours_since_now_from_list(tuples_list, now_ts):
	# Check if the input is a list
	if isinstance(tuples_list, list) and len(tuples_list) > 0:
		# Extract all numeric timestamps from the tuples
		timestamps = []
		for t in tuples_list:
			if isinstance(t, tuple) and len(t) == 2:
				ts = t[1]
				if ts is not None and not isinstance(ts, list):
					timestamps.append(ts)
		if len(timestamps) > 0:
			# Use the largest timestamp (closest to now)
			max_ts = max(timestamps)
			return (now_ts - max_ts) / 3600  # seconds → hours
	return np.nan

def extract_numbers(tuple_list):
	"""Extract only the numeric part from a list of (id, value) tuples."""
	if isinstance(tuple_list, list):
		return [t[1] for t in tuple_list if isinstance(t, tuple) and len(t) >= 2]
	return []

def aggregate(values, mode):
	"""Apply either sum or mean depending on mode."""
	if not values:
		return np.nan
	if mode == "sum":
		return sum(values)
	return sum(values) / len(values)

def transform_variables(df):
	 
	try:

			now = pd.Timestamp.now()
			# Define reasonable bounds for Unix timestamps (seconds)
			min_ts = 0                  # 1970-01-01
			max_ts = 4102444800         # 2100-01-01 in Unix seconds

			# Replace invalid timestamps with NaN
			df["last_buy_safe"] = df["last_buy"].where(
				df["last_buy"].between(min_ts, max_ts), np.nan
			)
			df["last_ins_safe"] = df["last_ins"].where(
				df["last_ins"].between(min_ts, max_ts), np.nan
			)

			# Convert safe Unix timestamps to datetime
			df["last_buy_dt"] = pd.to_datetime(df["last_buy_safe"], unit="s")
			df["last_ins_dt"] = pd.to_datetime(df["last_ins_safe"], unit="s")

			# Compute hours ago
			df["hours_since_last_buy"] = (now - df["last_buy_dt"]).dt.total_seconds() / 3600
			df["hours_since_last_ins"] = (now - df["last_ins_dt"]).dt.total_seconds() / 3600

			# Drop the original Unix timestamp columns
			df = df.drop(columns=["last_buy", "last_ins", "last_buy_safe", "last_ins_dt", "last_buy_dt", "last_ins_safe"])




			# Convert 'hour' from string to integer
			df['hour'] = df['hour'].astype(int)

			# Convert hour to radians (full circle = 24 hours)
			radians = df['hour'] * (2 * np.pi / 24)

			# Create two new columns: sin_hour and cos_hour
			df['hour_sin'] = np.sin(radians)
			df['hour_cos'] = np.cos(radians)

			# Drop the original 'hour' column
			df.drop(columns=['hour'], inplace=True)




			# Ensure `now_ts` is a Unix timestamp
			now_ts = int(pd.Timestamp.now().timestamp())

			# Apply to your 4 timestamp columns
			ts_cols = [
				"first_request_ts_bundle",
				"last_buy_ts_bundle",
				"last_buy_ts_category",
				"user_actions_bundles_action_last_timestamp"
			]

			for col in ts_cols:
				new_col = col + "_hours_ago"
				df[new_col] = df[col].apply(lambda x: hours_since_now_from_list(x, now_ts))

			# Drop the original tuple columns if you want
			df.drop(columns=ts_cols, inplace=True)




			rules = {
				"iap_revenue_usd_bundle": "sum",
				"num_buys_bundle": "sum",
				"rev_by_adv": "sum",
				"rwd_prank": "mean",
				"whale_users_bundle_num_buys_prank": "mean",
				"whale_users_bundle_revenue_prank": "mean",
				"whale_users_bundle_total_num_buys": "sum",
				"whale_users_bundle_total_revenue": "sum",
				"avg_daily_sessions": "mean",
				"avg_duration": "mean",
				"bcat_bottom_taxonomy": "mean",
				"ctr": "sum",
				"bundles_cat_bottom_taxonomy": "sum",
			}

			for col, mode in rules.items():
				new_col = col + "_agg"
				df[new_col] = df[col].apply(
					lambda lst: aggregate(extract_numbers(lst), mode)
				)

			df.drop(columns=list(rules.keys()), inplace=True)

			# If the df contains the target column, transform it with the logarithm
			if TARGET in df.columns:
				# 1) binario de compra
				df[TARGET_BIN] = (df[TARGET] > 0).astype(float)

				# 2) log1p del importe (clip por si acaso)
				df[TARGET_LOG] = np.log1p(df[TARGET].clip(lower=0))

	# Remove all the variables and create the new ones by setting them to the default values
	except Exception:
		# If something failed, remove any partially created/intermediate columns
		ts_cols = [
			"first_request_ts_bundle",
			"last_buy_ts_bundle",
			"last_buy_ts_category",
			"user_actions_bundles_action_last_timestamp",
		]

		rules_keys = [
			"iap_revenue_usd_bundle",
			"num_buys_bundle",
			"rev_by_adv",
			"rwd_prank",
			"whale_users_bundle_num_buys_prank",
			"whale_users_bundle_revenue_prank",
			"whale_users_bundle_total_num_buys",
			"whale_users_bundle_total_revenue",
			"avg_daily_sessions",
			"avg_duration",
			"bcat_bottom_taxonomy",
			"ctr",
			"bundles_cat_bottom_taxonomy",
		]

		intermediate_cols = (
			[
				"last_buy",
				"last_ins",
				"last_buy_safe",
				"last_ins_safe",
				"last_buy_dt",
				"last_ins_dt",
				"hour",
				"hours_since_last_buy",
				"hours_since_last_ins",
				"hour_sin",
				"hour_cos",
			]
			+ [c + "_hours_ago" for c in ts_cols]
			+ [k + "_agg" for k in rules_keys]
		)

		# Drop any of those columns if they exist (silent if not)
		df.drop(columns=[c for c in intermediate_cols if c in df.columns], inplace=True, errors="ignore")

		# Create all expected features with default (NaN) values so subsequent steps won't fail
		for col in ALL_FEATURES:
			if col not in df.columns:
				df[col] = np.nan

		# Ensure target exists (set to NaN)
		if TARGET not in df.columns:
			df[TARGET] = np.nan

		# NUEVO: asegurarnos de que también existen los targets derivados
		if TARGET_BIN not in df.columns:
			df[TARGET_BIN] = np.nan

		if TARGET_LOG not in df.columns:
			df[TARGET_LOG] = np.nan

	
	return df

def impute_missings(df):

	for col in CATEGORICAL_FEATURES:
		df[col] = df[col].fillna("<MISSING>")

		# convert to embeddings indices (if category not seen before, assign index 0)
		mapping = EMBEDDING_MAPPINGS[col]
		df[col] = df[col].map(lambda x: mapping.get(x, 0))

	for col in NUMERICAL_FEATURES:
		# 1. Create missing indicator
		df[f"{col}_is_missing"] = df[col].isna().astype(float)

		# 2. Impute missings with 0
		df[col] = df[col].fillna(0).astype(float)

	return df

def scale_numerical_features(df, scaler):
	df[NUMERICAL_FEATURES] = scaler.transform(df[NUMERICAL_FEATURES])
	return df

def train_scaler(df):
	scaler = StandardScaler()
	scaler.fit(df[NUMERICAL_FEATURES])
	joblib.dump(scaler, SCALER_FILE)
	print("Scaler saved.")

def generate_embeddings_mapping(pdf):
	# Use after transform_variables but BEFORE imputing missings
	
	mappings = {}
	for feature in CATEGORICAL_FEATURES:
		unique_values = ["<MISSING>"] + pdf[feature].dropna().unique().tolist()
		embeddings = {val: idx for idx, val in enumerate(unique_values)}
		mappings[feature] = embeddings
	
	# Save mappings to disk
	with open(EMBEDDINGS_MAPPING_FILE, "w") as f:
		json.dump(mappings, f)


cat_cardinalities = {
    feat: len(EMBEDDING_MAPPINGS[feat])
    for feat in CATEGORICAL_FEATURES
}

class EmbeddingMLP(nn.Module):
    def _init_(
        self,
        cat_cardinalities,
        cat_features: List[str],
        num_cont_features: int,          # numéricas + indicadores de missing
        embedding_dim: int = 64,
        hidden_dims: List[int] = [1024, 512, 256, 128, 64],
        dropout: float = 0.1,
    ):
        super()._init_()
        self.cat_features = cat_features
        self.num_cat = len(cat_features)
        self.num_cont_features = num_cont_features

        # Embeddings por feature categórica
        self.embeddings = nn.ModuleDict({
            feat: nn.Embedding(num_embeddings=cardinality, embedding_dim=embedding_dim)
            for feat, cardinality in cat_cardinalities.items()
        })

        # Dimensión de entrada al MLP: [contínuas + embeddings]
        total_cat_dim = embedding_dim * self.num_cat
        input_dim = self.num_cont_features + total_cat_dim

        layers = []
        prev_dim = input_dim
        for h in hidden_dims:
            layers.append(nn.Linear(prev_dim, h))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            prev_dim = h

        layers.append(nn.Linear(prev_dim, 1))  # regresión escalar

        self.mlp = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: [batch, num_cat + num_cont_features]
        columnas en el orden:
        [CATEGORICAL_FEATURES..., NUMERICAL_FEATURES..., NUMERICAL_IS_MISSING...]
        """

        # 1) Separar categóricas y numéricas del gran vector
        x_cat_ids = x[:, :self.num_cat].long()     # [batch, num_cat]
        x_cont = x[:, self.num_cat:].float()       # [batch, num_cont_features]

        # 2) Embeddings categóricas
        emb_list = []
        for i, feat in enumerate(self.cat_features):
            emb = self.embeddings[feat](x_cat_ids[:, i])  # [batch, embedding_dim]
            emb_list.append(emb)

        if emb_list:
            x_cat_emb = torch.cat(emb_list, dim=1)        # [batch, embedding_dim * num_cat]
            x_input = torch.cat([x_cont, x_cat_emb], dim=1)
        else:
            x_input = x_cont

        out = self.mlp(x_input)   # [batch, 1]
        return out.squeeze(-1)

# %% aquí deberías importar tu pipeline real:
# from tu_modulo import process_partition, DeepEmbeddingMLP  (o EmbeddingMLP/FTTransformer)
#
# Para que el ejemplo sea autocontenido, asumo que process_partition y el modelo
# ya los tienes definidos igual que en el notebook de entrenamiento.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# %% helper para construir el tensor de features en el mismo orden que en train
def make_features_tensor(pdf: pd.DataFrame) -> torch.Tensor:
    return torch.tensor(
        pdf[
            CATEGORICAL_FEATURES
            + NUMERICAL_FEATURES
            + [f"{col}_is_missing" for col in NUMERICAL_FEATURES]
        ].values,
        dtype=torch.float32,
    )

# %% cargar validación (mismos filtros que en entrenamiento)
val_filters = [("datetime", ">=", "2025-10-06-23-00")]

ddf_val = dd.read_parquet(
    DATASET_PATH,
    engine="pyarrow",
    columns=COLS_TO_READ,   # usa la misma COLS_TO_READ del train
    filters=val_filters,
)

val_pdf = ddf_val.compute()
val_pdf = process_partition(val_pdf)

# y_true en escala original
y_true = val_pdf[TARGET].values.copy()

# si en el pipeline transformaste TARGET a log1p, asegúrate de que aquí
# estás usando la columna original sin log; si no la preservaste, mala decisión,
# pero entonces simplemente evalúas en escala log.

X_val = make_features_tensor(val_pdf)

val_dataset = TensorDataset(X_val, torch.tensor(y_true, dtype=torch.float32))
val_loader = DataLoader(
    val_dataset,
    batch_size=4096,
    shuffle=False,
    num_workers=0,
    pin_memory=True,
)

# %% reconstruir el modelo y cargar el checkpoint
# usa la MISMA arquitectura y parámetros que en entrenamiento
num_cont_features = len(NUMERICAL_FEATURES) * 2

with open(EMBEDDINGS_MAPPING_FILE, "r") as f:
    EMBEDDING_MAPPINGS = json.load(f)

cat_cardinalities = {
    feat: len(EMBEDDING_MAPPINGS[feat])
    for feat in CATEGORICAL_FEATURES
}

model = EmbeddingMLP(   # o EmbeddingMLP / FTTransformer
    cat_cardinalities=cat_cardinalities,
    cat_features=CATEGORICAL_FEATURES,
    num_cont_features=num_cont_features,
    embedding_dim=64,            # pon los mismos que en train
    hidden_dims=[512, 256, 128, 64],
    dropout=0.2,
).to(device)

state = torch.load(CHECKPOINT_PATH, map_location=device)
model.load_state_dict(state)
model.eval()

# %% obtener predicciones en validación
all_preds = []

with torch.no_grad():
    for (data, y_batch) in val_loader:
        data = data.to(device, non_blocking=True)

        pred = model(data)    # [batch], normalmente en log1p
        all_preds.append(pred.cpu())

y_pred = torch.cat(all_preds).numpy()

# si el modelo predice en log1p, deshaz la transformación para comparar en escala real
if TARGET_LOG:
    y_pred_real = np.expm1(y_pred)
else:
    y_pred_real = y_pred

# alineamos por si acaso
y_true = y_true[: len(y_pred_real)]

print("y_true mean:", y_true.mean())
print("y_pred mean:", y_pred_real.mean())

# %% gráfico: y_pred vs y_true

# para que el gráfico no sea un borrón, muestramos si hay demasiados puntos
max_points = 50000
n = len(y_true)
if n > max_points:
    idx = np.random.choice(n, size=max_points, replace=False)
    y_true_plot = y_true[idx]
    y_pred_plot = y_pred_real[idx]
else:
    y_true_plot = y_true
    y_pred_plot = y_pred_real

plt.figure(figsize=(8, 8))

# scatter denso
plt.scatter(y_true_plot, y_pred_plot, alpha=0.3, s=5)

# línea ideal y = x
min_val = min(y_true_plot.min(), y_pred_plot.min())
max_val = max(y_true_plot.max(), y_pred_plot.max())
plt.plot([min_val, max_val], [min_val, max_val], "k--", linewidth=1)

plt.xlabel("Real iap_revenue_d7")
plt.ylabel("Predicho iap_revenue_d7")
plt.title("Validación: predicción vs valor real")
plt.grid(True)
plt.tight_layout()
plt.show()


Using device: cuda


TypeError: EmbeddingMLP.__init__() got an unexpected keyword argument 'cat_cardinalities'