In [1]:
import os
import sys

PROJECT_ROOT = "/home/pablo/Documents/datathon2025-smadex"

if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("Contenido de ./data:", os.listdir(os.path.join(PROJECT_ROOT, "data")))

PROJECT_ROOT: /home/pablo/Documents/datathon2025-smadex
Contenido de ./data: ['test', 'sample_submission.csv', 'train']


In [2]:
import dask
import dask.dataframe as dd
import pandas as pd
import json
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from typing import Dict, List
dask.config.set({"dataframe.convert-string": False})

<dask.config.set at 0x7b4c3929e350>

In [3]:

DATASET_PATH = "../data/train"
EMBEDDINGS_MAPPING_FILE = "embeddings_mappings.json"
SCALER_FILE = "scaler.joblib"

CATEGORICAL_FEATURES = ['advertiser_bundle', 'advertiser_category', 'advertiser_subcategory', 'advertiser_bottom_taxonomy_level', 'country', 'dev_make', 'dev_model', 'dev_os', 'dev_osv', 'release_date']
NUMERICAL_FEATURES = ['release_msrp', 'weekday', 'avg_act_days', 'weekend_ratio', 'weeks_since_first_seen', 'wifi_ratio', 'hours_since_last_buy', 'hours_since_last_ins', 'hour_sin', 'hour_cos', 'first_request_ts_bundle_hours_ago', 'last_buy_ts_bundle_hours_ago', 'last_buy_ts_category_hours_ago', 'user_actions_bundles_action_last_timestamp_hours_ago', 'iap_revenue_usd_bundle_agg', 'num_buys_bundle_agg', 'rev_by_adv_agg', 'rwd_prank_agg', 'whale_users_bundle_num_buys_prank_agg', 'whale_users_bundle_revenue_prank_agg', 'whale_users_bundle_total_num_buys_agg', 'whale_users_bundle_total_revenue_agg', 'avg_daily_sessions_agg', 'avg_duration_agg', 'bcat_bottom_taxonomy_agg', 'ctr_agg', 'bundles_cat_bottom_taxonomy_agg']
ALL_FEATURES = CATEGORICAL_FEATURES + NUMERICAL_FEATURES + [f"{col}_is_missing" for col in NUMERICAL_FEATURES]
TARGET = "iap_revenue_d7"

def load_embeddings_mapping():
    with open(EMBEDDINGS_MAPPING_FILE, "r") as f:
        mappings = json.load(f)
    return mappings

EMBEDDING_MAPPINGS = load_embeddings_mapping()
COLS_TO_READ = [
			 'iap_revenue_d7', 
			 'advertiser_bundle', 
			 'advertiser_category', 
			 'advertiser_subcategory', 
			 'advertiser_bottom_taxonomy_level', 
			 'country', 
			 'dev_make', 
			 'dev_model', 
			 'dev_os', 
			 'dev_osv', 
			 'hour', 
			 'release_date', 
			 'release_msrp', 
			 'weekday', 
			 'avg_act_days', 
			 'avg_daily_sessions', 
			 'avg_duration', 
			 'bcat_bottom_taxonomy', 
			 'bundles_cat_bottom_taxonomy',  
			 'ctr',  
			 'first_request_ts_bundle', 
			 'iap_revenue_usd_bundle', 
			 'last_buy', 
			 'last_buy_ts_bundle', 
			 'last_buy_ts_category', 
			 'last_ins', 
			 'user_actions_bundles_action_last_timestamp', 
			 'num_buys_bundle', 
			 'rev_by_adv', 
			 'rwd_prank', 
			 'weekend_ratio', 
			 'weeks_since_first_seen', 
			 'wifi_ratio', 
			 'whale_users_bundle_num_buys_prank', 
			 'whale_users_bundle_revenue_prank', 
			 'whale_users_bundle_total_num_buys', 
			 'whale_users_bundle_total_revenue']

loaded_scaler = joblib.load(SCALER_FILE)

In [4]:
def process_partition(df):
    df = transform_variables(df)
    df = scale_numerical_features(df, loaded_scaler)
    df = impute_missings(df)
    return df

def hours_since_now_from_list(tuples_list, now_ts):
    # Check if the input is a list
    if isinstance(tuples_list, list) and len(tuples_list) > 0:
        # Extract all numeric timestamps from the tuples
        timestamps = []
        for t in tuples_list:
            if isinstance(t, tuple) and len(t) == 2:
                ts = t[1]
                if ts is not None and not isinstance(ts, list):
                    timestamps.append(ts)
        if len(timestamps) > 0:
            # Use the largest timestamp (closest to now)
            max_ts = max(timestamps)
            return (now_ts - max_ts) / 3600  # seconds → hours
    return np.nan

def extract_numbers(tuple_list):
    """Extract only the numeric part from a list of (id, value) tuples."""
    if isinstance(tuple_list, list):
        return [t[1] for t in tuple_list if isinstance(t, tuple) and len(t) >= 2]
    return []

def aggregate(values, mode):
    """Apply either sum or mean depending on mode."""
    if not values:
        return np.nan
    if mode == "sum":
        return sum(values)
    return sum(values) / len(values)

def transform_variables(df):
     
	now = pd.Timestamp.now()

	# Define reasonable bounds for Unix timestamps (seconds)
	min_ts = 0                  # 1970-01-01
	max_ts = 4102444800         # 2100-01-01 in Unix seconds

	# Replace invalid timestamps with NaN
	df["last_buy_safe"] = df["last_buy"].where(
		df["last_buy"].between(min_ts, max_ts), np.nan
	)
	df["last_ins_safe"] = df["last_ins"].where(
		df["last_ins"].between(min_ts, max_ts), np.nan
	)

	# Convert safe Unix timestamps to datetime
	df["last_buy_dt"] = pd.to_datetime(df["last_buy_safe"], unit="s")
	df["last_ins_dt"] = pd.to_datetime(df["last_ins_safe"], unit="s")

	# Compute hours ago
	df["hours_since_last_buy"] = (now - df["last_buy_dt"]).dt.total_seconds() / 3600
	df["hours_since_last_ins"] = (now - df["last_ins_dt"]).dt.total_seconds() / 3600

	# Drop the original Unix timestamp columns
	df = df.drop(columns=["last_buy", "last_ins", "last_buy_safe", "last_ins_dt", "last_buy_dt", "last_ins_safe"])




	# Convert 'hour' from string to integer
	df['hour'] = df['hour'].astype(int)

	# Convert hour to radians (full circle = 24 hours)
	radians = df['hour'] * (2 * np.pi / 24)

	# Create two new columns: sin_hour and cos_hour
	df['hour_sin'] = np.sin(radians)
	df['hour_cos'] = np.cos(radians)

	# Drop the original 'hour' column
	df.drop(columns=['hour'], inplace=True)




	# Ensure `now_ts` is a Unix timestamp
	now_ts = int(pd.Timestamp.now().timestamp())

	# Apply to your 4 timestamp columns
	ts_cols = [
		"first_request_ts_bundle",
		"last_buy_ts_bundle",
		"last_buy_ts_category",
		"user_actions_bundles_action_last_timestamp"
	]

	for col in ts_cols:
		new_col = col + "_hours_ago"
		df[new_col] = df[col].apply(lambda x: hours_since_now_from_list(x, now_ts))

	# Drop the original tuple columns if you want
	df.drop(columns=ts_cols, inplace=True)




	rules = {
		"iap_revenue_usd_bundle": "sum",
		"num_buys_bundle": "sum",
		"rev_by_adv": "sum",
		"rwd_prank": "mean",
		"whale_users_bundle_num_buys_prank": "mean",
		"whale_users_bundle_revenue_prank": "mean",
		"whale_users_bundle_total_num_buys": "sum",
		"whale_users_bundle_total_revenue": "sum",
		"avg_daily_sessions": "mean",
		"avg_duration": "mean",
		"bcat_bottom_taxonomy": "mean",
		"ctr": "sum",
		"bundles_cat_bottom_taxonomy": "sum",
	}

	for col, mode in rules.items():
		new_col = col + "_agg"
		df[new_col] = df[col].apply(
			lambda lst: aggregate(extract_numbers(lst), mode)
		)

	df.drop(columns=list(rules.keys()), inplace=True)

	# If the df contains the target column, transform it with the logarithm
	if TARGET in df.columns:
		df[TARGET] = np.log1p(df[TARGET])
	
	return df

def impute_missings(df):

    for col in CATEGORICAL_FEATURES:
        df[col] = df[col].fillna("<MISSING>")

        # convert to embeddings indices (if category not seen before, assign index 0)
        mapping = EMBEDDING_MAPPINGS[col]
        df[col] = df[col].map(lambda x: mapping.get(x, 0))

    for col in NUMERICAL_FEATURES:
        # 1. Create missing indicator
        df[f"{col}_is_missing"] = df[col].isna().astype(float)

        # 2. Impute missings with 0
        df[col] = df[col].fillna(0).astype(float)

    return df

def scale_numerical_features(df, scaler):
    df[NUMERICAL_FEATURES] = scaler.transform(df[NUMERICAL_FEATURES])
    return df

def train_scaler(df):
    scaler = StandardScaler()
    scaler.fit(df[NUMERICAL_FEATURES])
    joblib.dump(scaler, SCALER_FILE)
    print("Scaler saved.")

def generate_embeddings_mapping(pdf):
    # Use after transform_variables but BEFORE imputing missings
    
    mappings = {}
    for feature in CATEGORICAL_FEATURES:
        unique_values = ["<MISSING>"] + pdf[feature].dropna().unique().tolist()
        embeddings = {val: idx for idx, val in enumerate(unique_values)}
        mappings[feature] = embeddings
    
    # Save mappings to disk
    with open(EMBEDDINGS_MAPPING_FILE, "w") as f:
        json.dump(mappings, f)


In [5]:
VERSION = "v1"
EPOCHS = 5
BATCH_SIZE = 1024
LEARNING_RATE = 0.001
CHECKPOINT_PATH = "model_checkpoint_" + VERSION
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# AMP = False

cat_cardinalities = {
    feat: len(EMBEDDING_MAPPINGS[feat])
    for feat in CATEGORICAL_FEATURES
}

class EmbeddingMLP(nn.Module):
    def __init__(
        self,
        cat_cardinalities: Dict[str, int],
        cat_features: List[str],
        num_cont_features: int,          # numéricas + indicadores de missing
        embedding_dim: int = 32,
        hidden_dims: List[int] = [256, 128, 64],
        dropout: float = 0.1,
    ):
        super().__init__()
        self.cat_features = cat_features
        self.num_cat = len(cat_features)
        self.num_cont_features = num_cont_features

        # Embeddings por feature categórica
        self.embeddings = nn.ModuleDict({
            feat: nn.Embedding(num_embeddings=cardinality, embedding_dim=embedding_dim)
            for feat, cardinality in cat_cardinalities.items()
        })

        # Dimensión de entrada al MLP: [contínuas + embeddings]
        total_cat_dim = embedding_dim * self.num_cat
        input_dim = self.num_cont_features + total_cat_dim

        layers = []
        prev_dim = input_dim
        for h in hidden_dims:
            layers.append(nn.Linear(prev_dim, h))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            prev_dim = h

        layers.append(nn.Linear(prev_dim, 1))  # regresión escalar

        self.mlp = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: [batch, num_cat + num_cont_features]
        columnas en el orden:
        [CATEGORICAL_FEATURES..., NUMERICAL_FEATURES..., NUMERICAL_IS_MISSING...]
        """

        # 1) Separar categóricas y numéricas del gran vector
        x_cat_ids = x[:, :self.num_cat].long()     # [batch, num_cat]
        x_cont = x[:, self.num_cat:].float()       # [batch, num_cont_features]

        # 2) Embeddings categóricas
        emb_list = []
        for i, feat in enumerate(self.cat_features):
            emb = self.embeddings[feat](x_cat_ids[:, i])  # [batch, embedding_dim]
            emb_list.append(emb)

        if emb_list:
            x_cat_emb = torch.cat(emb_list, dim=1)        # [batch, embedding_dim * num_cat]
            x_input = torch.cat([x_cont, x_cat_emb], dim=1)
        else:
            x_input = x_cont

        out = self.mlp(x_input)   # [batch, 1]
        return out.squeeze(-1)   

In [6]:

def main():
    AMP = False

    
    print(f"Using device: {device}")
    if not torch.cuda.is_available() or not torch.amp.autocast_mode.is_autocast_available("cuda"):
        AMP = False
    
    train_filters = [("datetime", "<", "2025-10-06-23-00")]
    val_filters = [("datetime", ">=", "2025-10-06-23-00")]
    
    ddf_train = dd.read_parquet(
        DATASET_PATH,
        engine="pyarrow",
        columns=COLS_TO_READ,
        filters=train_filters
    )

    ddf_val = dd.read_parquet(
        DATASET_PATH,
        engine="pyarrow",
        columns=COLS_TO_READ,
        filters=val_filters
    )

    val_pdf = ddf_val.compute()
    val_pdf = process_partition(val_pdf)

    val_features_tensor = torch.tensor(
        val_pdf[CATEGORICAL_FEATURES + NUMERICAL_FEATURES + [f"{col}_is_missing" for col in NUMERICAL_FEATURES]].values, 
        dtype=torch.float32
    )
    
    val_target_tensor = torch.tensor(
        val_pdf[TARGET].values,
        dtype=torch.float32
    )

    val_dataset = TensorDataset(val_features_tensor, val_target_tensor)
    val_loader = DataLoader(dataset=val_dataset, 
                             batch_size=BATCH_SIZE, 
                             num_workers=0,
                             pin_memory=True,
                             shuffle=False)

    cat_cardinalities = {
        feat: len(EMBEDDING_MAPPINGS[feat])
        for feat in CATEGORICAL_FEATURES
    }
    model = EmbeddingMLP(cat_cardinalities=cat_cardinalities, cat_features=CATEGORICAL_FEATURES, num_cont_features=2*len(NUMERICAL_FEATURES)).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

    scaler = torch.amp.GradScaler(device="cuda", enabled=AMP)

    for epoch in range(EPOCHS):
        print(f"\n--- Starting Epoch {epoch + 1}/{EPOCHS} ---")
        
        model.train()
        
        partition_iterator = ddf_train.to_delayed()
        
        for partition_idx, partition in enumerate(partition_iterator):
            train_loss_sum = 0.0

            pdf = partition.compute()
            
            if pdf.empty:
                print(f"  Skipping empty partition {partition_idx+1}")
                continue


            pdf = process_partition(pdf)
            
            # 2. Convert Pandas DF to PyTorch Tensors
            features_tensor = torch.tensor(
                pdf[CATEGORICAL_FEATURES + NUMERICAL_FEATURES + [f"{col}_is_missing" for col in NUMERICAL_FEATURES]].values, 
                dtype=torch.float32
            )
            
            target_tensor = torch.tensor(
                pdf[TARGET].values,
                dtype=torch.float32
            )
            
            partition_dataset = TensorDataset(features_tensor, target_tensor)
            partition_loader = DataLoader(dataset=partition_dataset, 
                                          batch_size=BATCH_SIZE, 
                                          num_workers=0,
                                          pin_memory=True,
                                          shuffle=True)

            for batch_idx, (data, target) in enumerate(partition_loader):

                data, target = data.to(device, non_blocking=True), target.to(device, non_blocking=True)
                
                optimizer.zero_grad(set_to_none=True)

                with torch.amp.autocast(device_type="cuda", enabled=AMP):
                    output = model(data)
                    loss = criterion(output, target)

                if AMP:
                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    loss.backward()
                    optimizer.step()

                train_loss_sum += loss.item()

            print(f"  Epoch {epoch+1} | Processed Partition {partition_idx+1} | Partition Train Loss: {train_loss_sum / (len(partition_loader)):.4f}")

        val_loss_sum = 0.0
        model.eval()
        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(val_loader):
                data, target = data.to(device, non_blocking=True), target.to(device, non_blocking=True)

                with torch.amp.autocast(device_type="cuda", enabled=AMP):
                    output = model(data)
                    loss = criterion(output, target)

                val_loss_sum += loss.item()
        print(f"\n--- Epoch {epoch + 1} Completed ---")
        print(f"Validation Loss: {val_loss_sum / (len(val_loader)):.4f}")

        # Save checkpoint
        torch.save(model.state_dict(), f"{CHECKPOINT_PATH}_{epoch+1}.pth")
        print(f"Model checkpoint saved to {CHECKPOINT_PATH}_{epoch+1}.pth")

    print("\n--- Training Finished ---")


In [7]:
if __name__ == "__main__":
    main()

Using device: cuda


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- buy_d14
- buy_d28
- buy_d7
- buyer_d1
- buyer_d14
- ...


In [None]:
DATASET_TEST_PATH = "../data/test"

ddf_test = dd.read_parquet(
    DATASET_TEST_PATH,
    engine="pyarrow",
    columns=COLS_TO_READ + ["row_id"]
)

partition_iterator = ddf_test.to_delayed()

for partition in partition_iterator:


    test_pdf = partition.compute()


row_ids = test_pdf["row_id"].values
test_pdf = process_partition(test_pdf)

In [13]:
test_features_tensor = torch.tensor(
    test_pdf[CATEGORICAL_FEATURES + NUMERICAL_FEATURES + [f"{col}_is_missing" for col in NUMERICAL_FEATURES]].values, 
    dtype=torch.float32
)

test_dataset = TensorDataset(test_features_tensor)
test_loader = DataLoader(dataset=test_dataset, 
                            batch_size=4096, 
                            num_workers=0,
                            pin_memory=True,
                            shuffle=False)

# Load the trained model
model = EmbeddingMLP(cat_cardinalities=cat_cardinalities, cat_features=CATEGORICAL_FEATURES, num_cont_features=2*len(NUMERICAL_FEATURES)).to(device)
model.load_state_dict(torch.load("model_checkpoint_v1_4.pth"))
model.eval()

# Generate predictions row_id -> target and save to CSV
predictions = []
with torch.no_grad():
    for batch_idx, (data,) in enumerate(test_loader):
        data = data.to(device, non_blocking=True)

        output = model(data)

        predictions.extend(output.cpu().numpy())

pred_df = pd.DataFrame({
    "row_id": row_ids,
    "iap_revenue_d7": np.expm1(predictions)  # inverse of log1p
})
print(pred_df.head())
pred_df.to_csv("test_predictions.csv", index=False)
print("Test predictions saved to test_predictions.csv")

                                 row_id  iap_revenue_d7
0  819ecc0e-1a97-43ed-83f6-b9ede4f7fc48        2.286234
1  0a7fbf18-5041-42af-bd0a-0cb6586b8598        0.001353
2  fc1a2689-b136-4ffa-b23b-9d8215bd720f        0.001353
3  0340fcc6-50bd-42ab-b9f4-4c1184b640cb        0.001353
4  219d253f-bef4-4039-84b2-ed55f009cc43        0.001353
Test predictions saved to test_predictions.csv


In [None]:
DATASET_TEST_PATH = "../data/test"

ddf_test = dd.read_parquet(
    DATASET_PATH,
    engine="pyarrow",
    columns=COLS_TO_READ + ["row_id"]
)

test_pdf = ddf_test.compute()
row_ids = test_pdf["row_id"].values
test_pdf = process_partition(test_pdf)

test_features_tensor = torch.tensor(
    ddf_test[CATEGORICAL_FEATURES + NUMERICAL_FEATURES + [f"{col}_is_missing" for col in NUMERICAL_FEATURES]].values, 
    dtype=torch.float32
)

test_dataset = TensorDataset(test_features_tensor)
test_loader = DataLoader(dataset=test_dataset, 
                            batch_size=BATCH_SIZE, 
                            num_workers=0,
                            pin_memory=True,
                            shuffle=False)

# Load the trained model
model = EmbeddingMLP(cat_cardinalities=cat_cardinalities, cat_features=CATEGORICAL_FEATURES, num_cont_features=2*len(NUMERICAL_FEATURES)).to(device)
model.load_state_dict(torch.load(f"{CHECKPOINT_PATH}_3"))
model.eval()

# Generate predictions row_id -> target and save to CSV
predictions = []
with torch.no_grad():
    for batch_idx, (data,) in enumerate(test_loader):
        data = data.to(device, non_blocking=True)

        output = model(data)

        predictions.extend(output.cpu().numpy())

pred_df = pd.DataFrame({
    "row_id": row_ids,
    "iap_revenue_d7": np.expm1(predictions)  # inverse of log1p
})
pred_df.to_csv("test_predictions.csv", index=False)
print("Test predictions saved to test_predictions.csv")
