In [None]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell executes, so edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
!pip install -r ../requirements.txt

In [None]:
import os

target_dir = "../"
zip_file_path = "generated.zip"

if not os.path.exists(target_dir + 'generated'):
    print(f"The directory {target_dir} does not exist. Proceeding with download.")

    !apt-get update
    !apt-get install unzip
    
    !curl "https://drive.usercontent.google.com/download?id=10xaugPOoC3SraTwp90sfqSMYQEQK96Ls&confirm=xxx" -o {zip_file_path}
    !mkdir -p {target_dir}
    
    !unzip {zip_file_path} -d {target_dir}
    
    print(f"File downloaded and extracted to {target_dir}")
    
    !rm {zip_file_path}
else:
    print(f"The directory {target_dir} already exists. No action taken.")

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import h5py
import sys
from tqdm import tqdm
from datetime import datetime
# Silence pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None
# Render floats with two decimal places when DataFrames are displayed.
pd.options.display.float_format = '{:.2f}'.format

from pathlib import Path

# Add the parent of the current working directory to sys.path before the
# `src` import below. Assumes the notebook is run from a direct subdirectory
# of the repository root — TODO confirm.
repo_root = Path.cwd().parent.resolve()
sys.path.append(str(repo_root))

from sklearn.model_selection import train_test_split
from src.data.auction_dataset import AuctionDataset

# Select CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# Row filters handed to the parquet reader: three inclusive upper bounds on
# the `g_*` columns plus a lower bound on `record`.
upper_bounds = {
    "g_hours_on_sale_max": 50,
    "g_current_hours_max": 50,
    "g_hours_on_sale_len": 64,
}
filters = [(column, "<=", limit) for column, limit in upper_bounds.items()]
filters.append(("record", ">=", "2025-05-01"))

pairs = pd.read_parquet(
    "../generated/auction_indices.parquet",
    engine="pyarrow",
    filters=filters,
)

# Preview the first rows of the filtered index.
pairs.head()

## Prepare and balance data

In [None]:
# Positional split of `pairs`: rows from the 95% mark onward become the
# validation set, and the training set is the first 90% of the rows before
# that mark.
# NOTE(review): the rows between those two cut points end up in neither
# set — confirm this gap is intentional.
n_pairs = len(pairs)
print(f"Pairs: {n_pairs}")

split_idx = int(n_pairs * 0.95)
val_pairs = pairs.iloc[split_idx:]

head_pairs = pairs.iloc[:split_idx]
n_train = int(len(head_pairs) * 0.90)
train_pairs = head_pairs.iloc[:n_train]

print(f"Train pairs: {len(train_pairs)}")
print(f"Val pairs: {len(val_pairs)}")

# Drop the notebook-level reference to the full frame now that it is split.
del pairs

In [None]:
# Show the last rows of the training split.
train_pairs.tail()

In [None]:
# Show the first rows of the validation split.
val_pairs.head()

In [None]:
# Histogram of g_hours_on_sale_mean over the training pairs (30 bins).
plt.hist(train_pairs['g_hours_on_sale_mean'], bins=30)
# Render the figure explicitly so the cell does not echo plt.hist's return
# tuple, consistent with the other histogram cells in this notebook.
plt.show()

In [None]:
# Show five randomly drawn validation rows (no seed is set, so the rows
# differ between runs).
val_pairs.sample(5)

In [None]:
# Histogram of g_hours_on_sale_len over the training pairs (10 bins),
# drawn through an explicit Axes object.
fig, ax = plt.subplots()
ax.hist(train_pairs['g_hours_on_sale_len'], bins=10)
plt.show()

In [None]:
# Histogram of g_hours_on_sale_mean over the training pairs (15 bins),
# drawn through an explicit Axes object.
fig, ax = plt.subplots()
ax.hist(train_pairs['g_hours_on_sale_mean'], bins=15)
plt.show()

In [None]:
import json
import os

mappings_dir = '../generated/mappings'


def _load_mapping(filename):
    """Read one JSON file from `mappings_dir` and return its parsed content."""
    with open(os.path.join(mappings_dir, filename), 'r') as handle:
        return json.load(handle)


# One mapping per JSON file; their sizes are used later when building the model.
item_to_idx = _load_mapping('item_to_idx.json')
context_to_idx = _load_mapping('context_to_idx.json')
bonus_to_idx = _load_mapping('bonus_to_idx.json')
modtype_to_idx = _load_mapping('modtype_to_idx.json')

In [None]:
# Load the saved feature statistics; they are passed to both AuctionDataset
# instances below. NOTE(review): torch.load unpickles the file — only load
# feature_stats.pt from a trusted source.
feature_stats = torch.load('../generated/feature_stats.pt')

In [None]:
from src.data.auction_dataset import AuctionDataset
from src.data.utils import collate_auctions

batch_size = 1024
sequences_path = '../generated/sequences.h5'

# Both splits read from the same HDF5 file and share the feature statistics.
train_dataset = AuctionDataset(train_pairs, feature_stats=feature_stats, path=sequences_path)
val_dataset = AuctionDataset(val_pairs, feature_stats=feature_stats, path=sequences_path)

# The two loaders are configured identically, so the options live in one place.
# NOTE(review): shuffle=True also applies to the validation loader — confirm
# that is intended (it changes which batches are seen when validation is
# capped to a fixed number of batches).
loader_options = dict(
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_auctions,
    num_workers=4,
    prefetch_factor=8,
    pin_memory=True,
)

train_dataloader = torch.utils.data.DataLoader(train_dataset, **loader_options)
val_dataloader = torch.utils.data.DataLoader(val_dataset, **loader_options)

## Model definition

In [None]:
# Toggle for a one-batch sanity check of the training dataloader.
test_data_loader = True

if test_data_loader:
    # Fetch a single collated batch.
    iter_loader = iter(train_dataloader)
    batch = next(iter_loader)

    # Unpack every field read from the batch; a missing key fails fast here.
    auctions = batch['auctions']
    item_index = batch['item_index']
    contexts = batch['contexts']
    bonus_lists = batch['bonus_lists']
    modifier_types = batch['modifier_types']
    modifier_values = batch['modifier_values']
    current_hours_raw = batch['current_hours_raw']
    time_left_raw = batch['time_left_raw']
    y = batch['target']

    # Report the shape of each tensor, in a fixed order.
    shape_report = (
        ('auctions', auctions),
        ('item_index', item_index),
        ('contexts', contexts),
        ('bonus_lists', bonus_lists),
        ('modifier_types', modifier_types),
        ('modifier_values', modifier_values),
        ('y', y),
    )
    for label, tensor in shape_report:
        print(f'{label}: {tensor.shape}')

    # Mean and standard deviation along the last axis of `auctions`,
    # one line per index.
    print("\nAuction feature statistics:")
    n_features = auctions.shape[-1]
    for feat_idx in range(n_features):
        feature_slice = auctions[..., feat_idx]
        feat_mean = feature_slice.mean().item()
        feat_std = feature_slice.std().item()
        print(f"Feature {feat_idx}: mean = {feat_mean:.3f}, std = {feat_std:.3f}")

In [None]:
from src.models.auction_transformer import AuctionTransformer

# --- Architecture hyperparameters ---
input_size = 9
embedding_dim = 32
d_model = 256
dim_feedforward = 4 * d_model
nhead = 16
num_layers = 4
dropout_p = 0.0

# --- Sizes derived from the loaded index mappings ---
n_items = len(item_to_idx)
# NOTE(review): only the context count gets an extra slot — confirm why.
n_contexts = 1 + len(context_to_idx)
n_bonuses = len(bonus_to_idx)
n_modtypes = len(modtype_to_idx)

# Positional arguments follow the order used by AuctionTransformer's
# constructor; the remaining options are passed by keyword.
model = AuctionTransformer(
    input_size,
    n_items,
    n_contexts,
    n_bonuses,
    n_modtypes,
    embedding_dim,
    d_model,
    dim_feedforward,
    nhead,
    num_layers,
    dropout_p=dropout_p,
    learning_rate=1e-4,
    logging_interval=1000,
    quantiles=[0.1, 0.5, 0.9],
    log_raw_batch_data=True,
    log_step_predictions=True,
)

# Total number of elements across all model parameters.
total_params = sum(param.numel() for param in model.parameters())
print(total_params)

## Training

In [None]:
import lightning as L
from lightning.pytorch.loggers import WandbLogger
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.profilers import PyTorchProfiler
from torch.profiler import schedule, tensorboard_trace_handler


!rm -rf ../generated/logs

name = 'auction-transformer-40M-quantile'

logger = WandbLogger(
    project="auction_transformer",
    name=name
)

checkpoint_callback = ModelCheckpoint(
    dirpath=f'../models/{name}',
    filename='epoch_{epoch:02d}',
    save_top_k=-1,
    every_n_epochs=1,
    save_last=True
)
"""
profiler = PyTorchProfiler(
    dirpath="profiler_logs",              # where traces go
    filename="profile",                   # base name per rank
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
    on_trace_ready=tensorboard_trace_handler("profiler_logs"),
    schedule=schedule(wait=1, warmup=1, active=3, repeat=2),
)"""

trainer = L.Trainer(
    max_epochs=2,
    accelerator='gpu',
    devices=1,
    log_every_n_steps=10,
    logger=logger,
    limit_val_batches=500,
    val_check_interval=0.1,
    precision="bf16",
    callbacks=[checkpoint_callback],
    gradient_clip_val=3.0,
    #profiler=profiler,
)

trainer.fit(model, train_dataloader, val_dataloader)