In [1]:
import h5py
import torch
import torch.nn.functional as F

In [None]:
import pandas as pd
# Load the auction index table and eyeball a handful of random rows.
pairs = pd.read_csv(filepath_or_buffer='../generated/auction_indices.csv')
pairs.sample(n=5)

In [None]:
from sklearn.model_selection import train_test_split
import json

batch_size = 32

# Re-read the index table so this cell does not rely on the preview cell's state.
pairs = pd.read_csv('../generated/auction_indices.csv')

# shuffle=False keeps the file order, so the trailing 5% of rows form the
# validation split (random_state is ignored when shuffling is off).
train_pairs, val_pairs = train_test_split(pairs, test_size=0.05, random_state=42, shuffle=False)

# Both datasets read the same sequence store with identical settings.
# NOTE(review): normalization is disabled here, presumably because this notebook
# is what produces feature_stats.pt -- confirm against AuctionDataset.
dataset_kwargs = dict(
    feature_stats_path='../generated/feature_stats.pt',
    path='../generated/sequences.h5',
    normalization=False,
)
train_dataset = AuctionDataset(train_pairs, **dataset_kwargs)
val_dataset = AuctionDataset(val_pairs, **dataset_kwargs)

# Loader options shared by both splits; only the shuffling policy differs.
loader_kwargs = dict(
    batch_size=batch_size,
    collate_fn=collate_auctions,
    num_workers=4,
    pin_memory=True,
    persistent_workers=True,
)
train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, **loader_kwargs)
# Validation data is iterated in a fixed order so runs are reproducible.
val_dataloader = torch.utils.data.DataLoader(val_dataset, shuffle=False, **loader_kwargs)

In [None]:
from tqdm import tqdm
import torch

def _mean_and_std(total, total_sq, count, out_dtype, label):
    """Turn running sums into ``(mean, std)`` using Var[x] = E[x^2] - E[x]^2.

    Raises:
        ValueError: if ``count`` is zero, i.e. nothing was accumulated.
    """
    if count == 0:
        raise ValueError(f'No valid {label} found; cannot compute statistics.')
    mean = total / count
    # Rounding can push the variance of a (near-)constant feature slightly below
    # zero, which would make sqrt return NaN; clamp it at zero instead.
    variance = (total_sq / count - mean ** 2).clamp_min(0)
    return mean.to(out_dtype), torch.sqrt(variance).to(out_dtype)


def compute_feature_stats(train_loader, output_dir='../generated/', max_batches=None):
    """
    Compute per-feature mean and (population) standard deviation over all valid
    (non-padded) auction timesteps, plus a scalar mean/std over all non-zero
    modifier values, and save them to ``{output_dir}/feature_stats.pt``.

    Each batch from ``train_loader`` must unpack as
    ``((auctions, item_index, contexts, bonus_lists, modifier_types,
    modifier_values), y)``. ``auctions`` is indexed as (B, T, F); a timestep is
    treated as padding when its first feature equals 0. Entries of
    ``modifier_values`` equal to 0 are treated as padding as well. The other
    batch members are ignored.

    Args:
        train_loader: iterable yielding batches in the layout described above.
        output_dir: directory the ``feature_stats.pt`` file is written into.
        max_batches: if not None, stop after this many batches.

    Returns:
        Tuple ``(means, stds, modifier_mean, modifier_std)``. ``means`` and
        ``stds`` have one entry per auction feature; the modifier statistics are
        0-dim tensors. A constant feature yields a standard deviation of 0.

    Raises:
        ValueError: if no valid auction timesteps or no non-zero modifier
            values were seen (including an empty ``train_loader``).
    """
    # Sums are accumulated in float64: the E[x^2] - E[x]^2 formula loses most of
    # its precision in float32 when values are large relative to their spread.
    sum_features = 0.0
    sum_sq_features = 0.0
    total_count = 0
    feature_dtype = None

    # For modifier values
    modifier_sum = 0.0
    modifier_sum_sq = 0.0
    modifier_count = 0
    modifier_dtype = None

    for i, batch in enumerate(tqdm(train_loader)):
        if max_batches is not None and i >= max_batches:
            break

        (auctions, item_index, contexts, bonus_lists, modifier_types, modifier_values), y = batch

        # Results are reported in the input's float dtype (or the default float
        # dtype for integer inputs), matching what plain division would give.
        feature_dtype = auctions.dtype if auctions.is_floating_point() else torch.get_default_dtype()
        modifier_dtype = (
            modifier_values.dtype if modifier_values.is_floating_point() else torch.get_default_dtype()
        )

        # Keep only rows whose first feature is non-zero -> shape (total_valid, F)
        X_valid = auctions[auctions[:, :, 0] != 0].to(torch.float64)
        sum_features = sum_features + X_valid.sum(dim=0)
        sum_sq_features = sum_sq_features + (X_valid ** 2).sum(dim=0)
        total_count += X_valid.size(0)

        # Modifier values are pooled into a single scalar distribution.
        valid_modifiers = modifier_values[modifier_values != 0].to(torch.float64)
        modifier_sum = modifier_sum + valid_modifiers.sum()
        modifier_sum_sq = modifier_sum_sq + (valid_modifiers ** 2).sum()
        modifier_count += valid_modifiers.size(0)

    means, stds = _mean_and_std(
        sum_features, sum_sq_features, total_count, feature_dtype, 'auction timesteps'
    )
    modifier_mean, modifier_std = _mean_and_std(
        modifier_sum, modifier_sum_sq, modifier_count, modifier_dtype, 'modifier values'
    )

    # Store in pt file
    torch.save({
        'means': means,
        'stds': stds,
        'modifiers_mean': modifier_mean,
        'modifiers_std': modifier_std
    }, f'{output_dir}/feature_stats.pt')

    return means, stds, modifier_mean, modifier_std

# Run the statistics pass, capped at 10,000 batches; the returned tuple is shown as the cell output.
compute_feature_stats(train_loader=train_dataloader, max_batches=10_000)