In [1]:
import h5py
import torch
import torch.nn.functional as F

from pathlib import Path
import sys
# Put the parent of the current working directory on sys.path so that the
# `src.` package imported below can be resolved.
# NOTE(review): this assumes the kernel's working directory sits directly
# under the repository root (e.g. a `notebooks/` folder) — confirm.
# Re-running the cell appends the same path again; that is harmless.
repo_root = Path.cwd().parent.resolve()
sys.path.append(str(repo_root))

from src.data.auction_dataset import AuctionDataset

In [2]:
import pandas as pd
# Load the auction index table; per the preview below each row carries a
# `record` timestamp, an `item_index`, and grouped hour statistics.
# NOTE(review): the `Unnamed: 0` column in the output suggests the CSV was
# written with its index; `index_col=0` would drop it — confirm that
# AuctionDataset does not rely on that column before changing this.
pairs = pd.read_csv('../generated/auction_indices.csv')
# Show a small random sample rather than dumping the whole frame.
pairs.sample(5)

Unnamed: 0,record,item_index,g_hours_on_sale_len,g_hours_on_sale_mean,g_hours_on_sale_std,g_hours_on_sale_min,g_hours_on_sale_max,g_current_hours_mean,g_current_hours_std,g_current_hours_min,g_current_hours_max
3330543,2025-03-19 20:00:00,9286,7,12.14,11.31,3.0,33.0,31.86,16.58,2.0,45.0
1989221,2025-03-16 04:00:00,16316,5,17.4,15.13,0.0,40.0,4.0,3.95,0.0,10.0
3234539,2025-03-19 12:00:00,13858,4,22.75,7.66,18.0,36.0,24.75,7.98,11.0,30.0
3468099,2025-03-20 05:00:00,16726,19,8.32,7.39,1.0,28.0,8.47,12.15,0.0,42.0
2393512,2025-03-17 07:00:00,14102,7,18.86,11.34,0.0,40.0,28.29,11.08,8.0,47.0


In [3]:
from sklearn.model_selection import train_test_split
from src.data.utils import collate_auctions

# Number of samples per mini-batch drawn from the training loader.
batch_size = 512

# Read the auction index table from disk.
pairs = pd.read_csv('../generated/auction_indices.csv')

# shuffle=False keeps row order: the first 75% of rows become the training
# split and the last 25% the validation split. random_state has no effect
# when shuffle is disabled; it is kept as-is.
train_pairs, val_pairs = train_test_split(
    pairs,
    test_size=0.25,
    random_state=42,
    shuffle=False,
)

# Dataset built from the training rows only.
train_dataset = AuctionDataset(train_pairs)

# Shuffled training loader; batches are assembled by `collate_auctions`
# across 4 worker processes that stay alive between epochs.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_auctions,
    num_workers=4,
    pin_memory=True,
    persistent_workers=True,
)

Dataset size: 3671496


In [4]:
from tqdm import tqdm
import torch

def compute_feature_stats(train_loader, output_dir='../generated/', max_batches=None):
    """
    Compute normalisation statistics over the training data and save them.

    Per-feature mean and standard deviation are computed over all valid
    (non-padded) auction timesteps, and a single scalar mean and standard
    deviation over all non-zero modifier values. The result is written to
    ``<output_dir>/feature_stats.pt`` as a dict with the keys ``'means'``,
    ``'stds'``, ``'modifiers_mean'`` and ``'modifiers_std'``.

    Each batch yielded by ``train_loader`` must unpack as
    ``((auctions, item_index, contexts, bonus_lists, modifier_types,
    modifier_values, current_hours), y)``. Only two members are read:

    * ``auctions`` is indexed as (batch, timestep, feature). A timestep is
      treated as padding, and skipped, when its first feature equals 0.
    * ``modifier_values``: entries equal to 0 are treated as padding.

    Sums are accumulated in float64 and the variance is clamped at 0, because
    the ``E[x^2] - E[x]^2`` formula can turn slightly negative through
    floating-point cancellation (which would make ``sqrt`` return NaN). The
    returned tensors are cast back to the input's floating dtype.

    Args:
        train_loader: Iterable of batches in the layout described above.
        output_dir: Directory for ``feature_stats.pt``; created if missing.
        max_batches: If not None, stop after this many batches.

    Returns:
        Tuple ``(means, stds, modifier_mean, modifier_std)``. The first two
        have one entry per auction feature; the last two are 0-dim tensors.
        The modifier statistics are NaN when no non-zero modifier was seen.

    Raises:
        ValueError: If no batch was processed or no valid timestep was found.
    """
    import os  # local import so the function does not depend on another cell

    def _result_dtype(tensor):
        # Mirror the dtype that true division yields for this input.
        return tensor.dtype if tensor.is_floating_point() else torch.get_default_dtype()

    # Accumulators for the auction features.
    feature_sum = None
    feature_sq_sum = None
    feature_dtype = None
    total_count = 0

    # Accumulators for the modifier values.
    modifier_sum = None
    modifier_sq_sum = None
    modifier_dtype = None
    modifier_count = 0

    for i, batch in enumerate(tqdm(train_loader)):
        if max_batches is not None and i >= max_batches:
            break

        (auctions, _item_index, _contexts, _bonus_lists, _modifier_types,
         modifier_values, _current_hours), _y = batch

        # Boolean-index with the (B, T) padding mask: result is (total_valid, F).
        valid_rows = auctions[auctions[:, :, 0] != 0].to(torch.float64)
        batch_sum = valid_rows.sum(dim=0)
        batch_sq_sum = (valid_rows ** 2).sum(dim=0)

        if feature_sum is None:
            feature_sum, feature_sq_sum = batch_sum, batch_sq_sum
            feature_dtype = _result_dtype(auctions)
        else:
            feature_sum += batch_sum
            feature_sq_sum += batch_sq_sum
        total_count += valid_rows.size(0)

        # Modifier values are pooled into one scalar statistic.
        valid_modifiers = modifier_values[modifier_values != 0].to(torch.float64)
        batch_mod_sum = valid_modifiers.sum()
        batch_mod_sq_sum = (valid_modifiers ** 2).sum()

        if modifier_sum is None:
            modifier_sum, modifier_sq_sum = batch_mod_sum, batch_mod_sq_sum
            modifier_dtype = _result_dtype(modifier_values)
        else:
            modifier_sum += batch_mod_sum
            modifier_sq_sum += batch_mod_sq_sum
        modifier_count += valid_modifiers.size(0)

    if total_count == 0:
        raise ValueError(
            "compute_feature_stats: no valid (non-padded) timesteps were seen; "
            "the loader is empty, max_batches is 0, or every timestep is padding"
        )

    # Auction feature statistics (population variance, clamped against
    # negative round-off before the square root).
    means64 = feature_sum / total_count
    variances64 = (feature_sq_sum / total_count - means64 ** 2).clamp(min=0)
    means = means64.to(feature_dtype)
    stds = torch.sqrt(variances64).to(feature_dtype)

    # Modifier statistics; dividing by a zero count yields NaN, not an error.
    modifier_mean64 = modifier_sum / modifier_count
    modifier_variance64 = (modifier_sq_sum / modifier_count - modifier_mean64 ** 2).clamp(min=0)
    modifier_mean = modifier_mean64.to(modifier_dtype)
    modifier_std = torch.sqrt(modifier_variance64).to(modifier_dtype)

    # Store in pt file
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    torch.save({
        'means': means,
        'stds': stds,
        'modifiers_mean': modifier_mean,
        'modifiers_std': modifier_std
    }, os.path.join(output_dir, 'feature_stats.pt'))

    return means, stds, modifier_mean, modifier_std

# Estimate the statistics from the first 3000 batches of the shuffled training
# loader (3000 of 7171 per the progress bar below) instead of a full pass.
# Side effect: writes ../generated/feature_stats.pt (the default output_dir).
compute_feature_stats(train_dataloader, max_batches=3000)

 42%|████▏     | 3000/7171 [09:15<12:52,  5.40it/s]


(tensor([8.1295e+00, 8.2789e+00, 1.0000e+00, 3.4194e+01, 1.8167e+01, 2.1206e-02,
         1.5043e-01]),
 tensor([ 2.5664,  2.5649,  0.0000, 18.2883, 13.3904,  0.7046,  0.6879]),
 tensor(5.8362),
 tensor(2.4271))