In [1]:
import os
import sys

# Resolve the project root relative to the notebook instead of hard-coding one
# user's home directory. The rest of the notebook reads "../data/train", i.e. it
# already assumes the kernel's working directory sits one level below the root.
# NOTE(review): assumes the kernel is started in the notebook's own folder — confirm.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Make project-local packages importable from the notebook.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("Contenido de ./data:", os.listdir(os.path.join(PROJECT_ROOT, "data")))

PROJECT_ROOT: /home/pablo/Documents/datathon2025-smadex
Contenido de ./data: ['test', 'sample_submission.csv', 'train']


In [None]:
import dask
import dask.dataframe as dd
import pandas as pd
import json
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
# Turn off dask's automatic conversion of text columns to PyArrow-backed strings,
# so string columns reach pandas as ordinary Python objects.
dask.config.set({"dataframe.convert-string": False})

In [None]:

# --- Paths -----------------------------------------------------------------
DATASET_PATH = "../data/train"
EMBEDDINGS_MAPPING_FILE = "embeddings_mappings.json"
SCALER_FILE = "scaler.joblib"

# --- Feature lists -----------------------------------------------------------
# These lists describe the frame AFTER transform_variables() has run, because
# they are consumed by scale_numerical_features(), impute_missings(),
# train_scaler() and generate_embeddings_mapping(). transform_variables() drops
# `hour`, `last_buy`, `last_ins`, the four list-of-timestamp columns and every
# aggregated column, replacing them with the derived names used below; naming
# the dropped raw columns here would raise KeyError downstream.

# Columns encoded as integer vocabulary indices.
CATEGORICAL_FEATURES = [
    'advertiser_bundle',
    'advertiser_category',
    'advertiser_subcategory',
    'advertiser_bottom_taxonomy_level',
    'country',
    'dev_make',
    'dev_model',
    'dev_os',
    'dev_osv',
    'release_date',
]

# Columns that are standardised and zero-imputed.
NUMERICAL_FEATURES = [
    # Read straight from the dataset.
    # NOTE(review): buyer_*/buy_*/iap_revenue_d14/d28/registration/retention_*
    # look like outcome columns from the same family as TARGET — confirm they
    # are available at prediction time, otherwise they leak the label.
    'buyer_d1', 'buyer_d7', 'buyer_d14', 'buyer_d28',
    'buy_d7', 'buy_d14', 'buy_d28',
    'iap_revenue_d14', 'iap_revenue_d28',
    'registration',
    'retention_d1_to_d7', 'retention_d3_to_d7', 'retention_d7_to_d14',
    'retention_d1', 'retention_d3', 'retentiond7',
    # NOTE(review): assumes these are already scalar numerics in the parquet
    # files (transform_variables leaves them untouched) — confirm dtypes.
    'release_msrp', 'weekday', 'avg_act_days',
    'bundles_cat', 'bundles_cat_bottom_taxonomy',
    'weekend_ratio', 'weeks_since_first_seen', 'wifi_ratio',
    # Derived from `last_buy` / `last_ins`.
    'hours_since_last_buy', 'hours_since_last_ins',
    # Cyclical encoding of `hour`.
    'hour_sin', 'hour_cos',
    # Derived from the list-of-(key, timestamp) columns.
    'first_request_ts_bundle_hours_ago',
    'last_buy_ts_bundle_hours_ago',
    'last_buy_ts_category_hours_ago',
    'user_actions_bundles_action_last_timestamp_hours_ago',
    # Sum / mean aggregates of the list-of-(key, value) columns.
    'iap_revenue_usd_bundle_agg',
    'num_buys_bundle_agg',
    'rev_by_adv_agg',
    'rwd_prank_agg',
    'whale_users_bundle_num_buys_prank_agg',
    'whale_users_bundle_revenue_prank_agg',
    'whale_users_bundle_total_num_buys_agg',
    'whale_users_bundle_total_revenue_agg',
    'avg_daily_sessions_agg',
    'avg_duration_agg',
    'bcat_bottom_taxonomy_agg',
    'ctr_agg',
]

# Every model input: categorical indices, numerics, and one missing-value flag per numeric.
ALL_FEATURES = CATEGORICAL_FEATURES + NUMERICAL_FEATURES + [f"{col}_is_missing" for col in NUMERICAL_FEATURES]
TARGET = "iap_revenue_d7"

def load_embeddings_mapping(path=None):
    """Load the per-feature category -> index vocabularies from a JSON file.

    Args:
        path: Location of the JSON file. When omitted, the module-level
            EMBEDDINGS_MAPPING_FILE is used, so existing zero-argument calls
            keep working while callers may also pass an explicit path.

    Returns:
        The decoded JSON object (as written by generate_embeddings_mapping: a
        dict of feature name -> {category: index}).

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    if path is None:
        path = EMBEDDINGS_MAPPING_FILE
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

# Vocabularies used by impute_missings(). The file is written by
# generate_embeddings_mapping(), so it must already exist on disk when this
# cell runs. The loader reads EMBEDDINGS_MAPPING_FILE itself, so no argument
# is passed and the file name is not repeated here.
EMBEDDING_MAPPINGS = load_embeddings_mapping()
# Columns requested from the parquet dataset, in the order they are read.
COLS_TO_READ = [
    "buyer_d1", "buyer_d7", "buyer_d14", "buyer_d28",
    "buy_d7", "buy_d14", "buy_d28",
    "iap_revenue_d7", "iap_revenue_d14", "iap_revenue_d28",
    "registration",
    "retention_d1_to_d7", "retention_d3_to_d7", "retention_d7_to_d14",
    "retention_d1", "retention_d3", "retentiond7",
    "advertiser_bundle", "advertiser_category", "advertiser_subcategory",
    "advertiser_bottom_taxonomy_level",
    "country",
    "dev_make", "dev_model", "dev_os", "dev_osv",
    "hour",
    "release_date", "release_msrp",
    "weekday",
    "avg_act_days", "avg_daily_sessions", "avg_duration",
    "bcat_bottom_taxonomy",
    "bundles_cat", "bundles_cat_bottom_taxonomy",
    "ctr",
    "first_request_ts_bundle",
    "iap_revenue_usd_bundle",
    "last_buy", "last_buy_ts_bundle", "last_buy_ts_category",
    "last_ins",
    "user_actions_bundles_action_last_timestamp",
    "num_buys_bundle",
    "rev_by_adv",
    "rwd_prank",
    "weekend_ratio", "weeks_since_first_seen", "wifi_ratio",
    "whale_users_bundle_num_buys_prank",
    "whale_users_bundle_revenue_prank",
    "whale_users_bundle_total_num_buys",
    "whale_users_bundle_total_revenue",
]

# Row filter pushed down to the parquet reader: keep rows whose `datetime`
# is >= "2025-10-01-00-00" and < "2025-10-06-00-00".
train_filters = [
    ("datetime", ">=", "2025-10-01-00-00"),
    ("datetime", "<", "2025-10-06-00-00"),
]

# Lazy dask frame over the training files; nothing is read until it is computed.
ddf_train = dd.read_parquet(DATASET_PATH, engine="pyarrow", columns=COLS_TO_READ, filters=train_filters)

# Scaler previously persisted by train_scaler(); SCALER_FILE must already exist.
loaded_scaler = joblib.load(SCALER_FILE)

In [None]:
def process_partition(df):
    """Turn one raw pandas partition into a model-ready frame.

    Steps, in order: transform_variables -> scale_numerical_features (using the
    module-level `loaded_scaler`) -> impute_missings. Scaling runs before
    imputation, so the zeros written by impute_missings are expressed in the
    scaled feature space.

    Args:
        df: Raw partition as read from the dataset.

    Returns:
        The frame returned by impute_missings.
    """
    transformed = transform_variables(df)
    scaled = scale_numerical_features(transformed, loaded_scaler)
    return impute_missings(scaled)

def hours_since_now_from_list(tuples_list, now_ts):
    """Return the hours elapsed between `now_ts` and the latest timestamp in a list of pairs.

    Args:
        tuples_list: Expected to be a list of 2-tuples whose second element is a
            Unix timestamp in seconds. Entries that are not 2-tuples, or whose
            second element is None or a list, are ignored.
        now_ts: Reference time as a Unix timestamp in seconds.

    Returns:
        (now_ts - latest timestamp) / 3600, or NaN when the input is not a
        non-empty list or holds no usable timestamp.
    """
    # Anything that is not a non-empty list carries no usable timestamps.
    if not isinstance(tuples_list, list) or len(tuples_list) == 0:
        return np.nan

    candidates = [
        entry[1]
        for entry in tuples_list
        if isinstance(entry, tuple)
        and len(entry) == 2
        and entry[1] is not None
        and not isinstance(entry[1], list)
    ]
    if not candidates:
        return np.nan

    # The largest timestamp is the one closest to now.
    latest = max(candidates)
    return (now_ts - latest) / 3600  # seconds → hours

def extract_numbers(tuple_list):
    """Extract the value part from a list of (id, value) tuples.

    Entries that are not tuples, have fewer than two elements, or carry a None
    value are skipped. Skipping None mirrors hours_since_now_from_list and keeps
    aggregate() from failing inside sum() on a missing value.

    Args:
        tuple_list: Expected to be a list of tuples; any other type yields [].

    Returns:
        A list with the second element of every usable tuple, in input order.
    """
    if not isinstance(tuple_list, list):
        return []
    return [
        pair[1]
        for pair in tuple_list
        if isinstance(pair, tuple) and len(pair) >= 2 and pair[1] is not None
    ]

def aggregate(values, mode):
    """Reduce `values` to a single number.

    Args:
        values: Sequence of numbers; a falsy value (e.g. an empty list) yields NaN.
        mode: "sum" returns the total; any other value returns the arithmetic mean.

    Returns:
        The sum, the mean, or NaN for empty input.
    """
    if not values:
        return np.nan
    total = sum(values)
    return total if mode == "sum" else total / len(values)

def transform_variables(df, now=None):
    """Derive model features from a raw partition and return them as a new frame.

    The input frame is left untouched: every derived column is computed from
    `df` and attached to a fresh frame, so a caller that ignores the return
    value does not end up with a half-transformed frame.

    Derived columns (appended in this order):
      * hours_since_last_buy / hours_since_last_ins  from `last_buy` / `last_ins`
        (Unix seconds; values outside [0, 4102444800] become NaN).
      * hour_sin / hour_cos                          cyclical encoding of `hour`.
      * <col>_hours_ago                              for the four list-of-(key,
        timestamp) columns, via hours_since_now_from_list.
      * <col>_agg                                    sum or mean of the values in
        the list-of-(key, value) columns, via extract_numbers + aggregate.
    The raw source columns of all of the above are dropped.

    Args:
        df: Raw partition containing the columns listed above.
        now: Reference time for every "hours ago" feature. Anything accepted by
            pd.Timestamp; timezone-aware values are converted to UTC. Defaults
            to the current UTC time. Pass a fixed value to get the same
            features on every run (scaler fitting, each epoch, inference).

    Returns:
        A new DataFrame with the raw columns replaced by the derived ones.
    """
    # Resolve the reference time ONCE, as naive UTC: the Unix timestamps in the
    # data are decoded as naive UTC by pd.to_datetime(unit="s"), and pandas
    # treats a naive Timestamp as UTC in .timestamp(), so a local wall clock
    # would skew every feature by the machine's UTC offset.
    now = pd.Timestamp.now(tz="UTC") if now is None else pd.Timestamp(now)
    if now.tzinfo is not None:
        now = now.tz_convert("UTC").tz_localize(None)
    now_ts = int(now.timestamp())

    # Reasonable bounds for Unix timestamps (seconds).
    min_ts = 0                  # 1970-01-01
    max_ts = 4102444800         # 2100-01-01 in Unix seconds

    derived = {}

    # Hours elapsed since the last purchase / install; invalid timestamps -> NaN.
    for raw_col, hours_col in (
        ("last_buy", "hours_since_last_buy"),
        ("last_ins", "hours_since_last_ins"),
    ):
        valid_seconds = df[raw_col].where(df[raw_col].between(min_ts, max_ts), np.nan)
        as_datetime = pd.to_datetime(valid_seconds, unit="s")
        derived[hours_col] = (now - as_datetime).dt.total_seconds() / 3600

    # Cyclical encoding of the hour of day (full circle = 24 hours).
    hour_radians = df["hour"].astype(int) * (2 * np.pi / 24)
    derived["hour_sin"] = np.sin(hour_radians)
    derived["hour_cos"] = np.cos(hour_radians)

    # List-of-(key, timestamp) columns -> hours since the most recent timestamp.
    ts_cols = [
        "first_request_ts_bundle",
        "last_buy_ts_bundle",
        "last_buy_ts_category",
        "user_actions_bundles_action_last_timestamp",
    ]
    for col in ts_cols:
        derived[col + "_hours_ago"] = df[col].apply(
            lambda pairs: hours_since_now_from_list(pairs, now_ts)
        )

    # List-of-(key, value) columns -> a single aggregated number per row.
    rules = {
        "iap_revenue_usd_bundle": "sum",
        "num_buys_bundle": "sum",
        "rev_by_adv": "sum",
        "rwd_prank": "mean",
        "whale_users_bundle_num_buys_prank": "mean",
        "whale_users_bundle_revenue_prank": "mean",
        "whale_users_bundle_total_num_buys": "sum",
        "whale_users_bundle_total_revenue": "sum",
        "avg_daily_sessions": "mean",
        "avg_duration": "mean",
        "bcat_bottom_taxonomy": "mean",
        "ctr": "sum",
    }
    for col, mode in rules.items():
        derived[col + "_agg"] = df[col].apply(
            lambda pairs, mode=mode: aggregate(extract_numbers(pairs), mode)
        )

    # drop() returns a new frame, so the caller's frame keeps its raw columns.
    consumed = ["last_buy", "last_ins", "hour", *ts_cols, *rules]
    return df.drop(columns=consumed).assign(**derived)

def impute_missings(df):
    """Encode categorical columns as vocabulary indices and impute numeric gaps.

    Categorical columns (CATEGORICAL_FEATURES): missing values become
    "<MISSING>", then every value is replaced by its index in
    EMBEDDING_MAPPINGS[col]. Values absent from the vocabulary get index 0.

    Numerical columns (NUMERICAL_FEATURES): a float `<col>_is_missing` flag is
    added (1.0 where the value was NaN), then NaNs are replaced by 0 and the
    column is cast to float.

    Args:
        df: Frame holding every categorical and numerical feature column.
            It is modified in place.

    Returns:
        The same frame, for chaining.
    """
    for col in CATEGORICAL_FEATURES:
        mapping = EMBEDDING_MAPPINGS[col]
        filled = df[col].fillna("<MISSING>")
        # The vocabulary is loaded from JSON, whose object keys are always
        # strings, so look values up by their string form; otherwise a
        # non-string category (e.g. an integer code) would never match.
        df[col] = filled.map(lambda value, mapping=mapping: mapping.get(str(value), 0))

    for col in NUMERICAL_FEATURES:
        # 1. Record where the value was missing before it is overwritten.
        df[f"{col}_is_missing"] = df[col].isna().astype(float)

        # 2. Impute missing values with 0.
        df[col] = df[col].fillna(0).astype(float)

    return df

def scale_numerical_features(df, scaler):
    """Replace the NUMERICAL_FEATURES columns of `df` with their scaled values.

    Args:
        df: Frame containing every column in NUMERICAL_FEATURES; modified in place.
        scaler: Fitted object exposing `transform` (e.g. the scaler saved by
            train_scaler).

    Returns:
        The same frame, for chaining.
    """
    scaled_values = scaler.transform(df[NUMERICAL_FEATURES])
    df[NUMERICAL_FEATURES] = scaled_values
    return df

def train_scaler(df):
    """Fit a StandardScaler on the numerical features and persist it to SCALER_FILE.

    Args:
        df: Frame containing every column in NUMERICAL_FEATURES.
    """
    numeric_frame = df[NUMERICAL_FEATURES]
    fitted = StandardScaler().fit(numeric_frame)  # fit() returns the estimator itself
    joblib.dump(fitted, SCALER_FILE)
    print("Scaler saved.")

def generate_embeddings_mapping(pdf):
    """Build and save a category -> index vocabulary for every categorical feature.

    Use after transform_variables but BEFORE imputing missings.

    Index 0 is always "<MISSING>"; observed categories follow in order of first
    appearance. Categories are stored by their string form because JSON object
    keys are strings: this keeps json.dump from rejecting or silently coercing
    non-string keys and makes the saved file match what a later load returns.

    Args:
        pdf: pandas frame containing every column in CATEGORICAL_FEATURES.

    Side effects:
        Writes the mappings as JSON to EMBEDDINGS_MAPPING_FILE.
    """
    mappings = {}
    for feature in CATEGORICAL_FEATURES:
        observed = pdf[feature].dropna().unique().tolist()
        # dict.fromkeys de-duplicates while preserving order, so "<MISSING>"
        # keeps index 0 even if the literal also occurs in the data, and the
        # indices stay contiguous when two values share a string form.
        vocabulary = dict.fromkeys(["<MISSING>", *map(str, observed)])
        mappings[feature] = {value: idx for idx, value in enumerate(vocabulary)}

    # Save mappings to disk
    with open(EMBEDDINGS_MAPPING_FILE, "w") as f:
        json.dump(mappings, f)


In [None]:
pdf_train = ddf_train.compute()
print("Training data loaded.")

# transform_variables returns the transformed frame; the vocabularies and the
# scaler must be fitted on that result, not on the raw columns. The name is
# reused (rather than keeping both frames) to avoid holding two copies of the
# training data in memory; the cell still re-runs cleanly because it starts
# from ddf_train.compute().
pdf_train = transform_variables(pdf_train)
generate_embeddings_mapping(pdf_train)
train_scaler(pdf_train)

# Refresh the in-memory artifacts so later cells use the ones just written
# instead of whatever was on disk when the config cell ran.
EMBEDDING_MAPPINGS = load_embeddings_mapping()
loaded_scaler = joblib.load(SCALER_FILE)

In [None]:
import dask.dataframe as dd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd

# Training hyperparameters read by main().
EPOCHS = 5             # number of full passes over all partitions
BATCH_SIZE = 1024      # rows per mini-batch inside a partition
LEARNING_RATE = 0.001  # learning rate handed to the AdamW optimizer


class SimpleMLP(nn.Module):
    """Feed-forward regressor: input_dim -> 64 -> 32 -> 1 with ReLU after each hidden layer."""

    def __init__(self, input_dim):
        """Create the layers.

        Args:
            input_dim: Number of input features per row.
        """
        super().__init__()
        self.layer_1 = nn.Linear(input_dim, 64)
        self.relu_1 = nn.ReLU()
        self.layer_2 = nn.Linear(64, 32)
        self.relu_2 = nn.ReLU()
        # Single output unit: one predicted target value per row.
        self.output_layer = nn.Linear(32, 1)

    def forward(self, x):
        """Map a (batch, input_dim) tensor to a (batch, 1) tensor of predictions."""
        hidden = self.layer_1(x)
        hidden = self.relu_1(hidden)
        hidden = self.layer_2(hidden)
        hidden = self.relu_2(hidden)
        return self.output_layer(hidden)


def main():
    """Train SimpleMLP on the training parquet data, one dask partition at a time.

    For every epoch each partition is computed to pandas, run through
    process_partition, converted to tensors and consumed in shuffled
    mini-batches of BATCH_SIZE rows with an MSE loss and the AdamW optimizer.

    Returns:
        The trained SimpleMLP (on the selected device), so the caller can
        evaluate or save it instead of losing it when the function returns.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    ddf_train = dd.read_parquet(
        DATASET_PATH,
        engine="pyarrow",
        columns=COLS_TO_READ,
        filters=train_filters
    )

    # Model inputs: numerical + categorical + numerical missing indicators.
    # Built once so the network width and the selected columns cannot diverge.
    feature_columns = (
        NUMERICAL_FEATURES
        + CATEGORICAL_FEATURES
        + [f"{col}_is_missing" for col in NUMERICAL_FEATURES]
    )

    model = SimpleMLP(input_dim=len(feature_columns)).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

    # The delayed partitions are re-computable, so build the list once.
    partitions = ddf_train.to_delayed()

    for epoch in range(EPOCHS):
        print(f"\n--- Starting Epoch {epoch + 1}/{EPOCHS} ---")

        for i, partition in enumerate(partitions):

            pdf = partition.compute()

            if pdf.empty:
                print(f"  Skipping empty partition {i+1}")
                continue

            # 1. Feature engineering, scaling and imputation
            pdf = process_partition(pdf)

            # 2. Convert Pandas DF to PyTorch Tensors
            features_tensor = torch.tensor(
                pdf[feature_columns].values,
                dtype=torch.float32
            )

            target_tensor = torch.tensor(
                pdf[TARGET].values,
                dtype=torch.float32
            ).view(-1, 1)

            partition_dataset = TensorDataset(features_tensor, target_tensor)
            partition_loader = DataLoader(dataset=partition_dataset, batch_size=BATCH_SIZE, shuffle=True)

            for batch_features, batch_target in partition_loader:
                # The model lives on `device`; its inputs must be there too,
                # otherwise the forward pass fails when CUDA is selected.
                batch_features = batch_features.to(device)
                batch_target = batch_target.to(device)

                # Forward pass
                outputs = model(batch_features)
                loss = criterion(outputs, batch_target)

                # Backward pass and optimization
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            print(f"  Epoch {epoch+1} | Processed Partition {i+1} | Last Batch Loss: {loss.item():.4f}")

    print("\n--- Training Finished ---")
    return model


# Entry point: start training when this code is executed directly. Inside a
# notebook kernel __name__ is also "__main__", so running this cell trains the model.
if __name__ == "__main__":
    main()