In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to project source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
!pip install -r ../requirements.txt

In [None]:
import os

target_dir = "../"
zip_file_path = "generated.zip"

if not os.path.exists(target_dir + 'generated'):
    print(f"The directory {target_dir} does not exist. Proceeding with download.")

    !apt-get update
    !apt-get install unzip
    
    !curl "https://drive.usercontent.google.com/download?id=1SJkXUcdWqPvhBO0Ug5SlJYRvdSn9eY22&confirm=xxx" -o {zip_file_path}
    !mkdir -p {target_dir}
    
    !unzip {zip_file_path} -d {target_dir}
    
    print(f"File downloaded and extracted to {target_dir}")
    
    !rm {zip_file_path}
else:
    print(f"The directory {target_dir} already exists. No action taken.")

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import h5py
import sys
from tqdm import tqdm
from datetime import datetime
pd.options.mode.chained_assignment = None

from pathlib import Path

# Put the parent of the current working directory on sys.path so the `src`
# package imported below can be resolved.
repo_root = Path.cwd().parent.resolve()
# Guarded so that re-running this cell does not pile up duplicate entries.
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

from sklearn.model_selection import train_test_split
from src.data.auction_dataset import AuctionDataset

# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# Read the auction index table from the generated data folder and preview its first rows.
pairs = pd.read_csv(filepath_or_buffer='../generated/auction_indices.csv')
pairs.head(n=5)

## Prepare and balance data

In [None]:
# Render floats with two decimals in every DataFrame shown from here on,
# then summarise the columns of the raw table.
two_decimals = '{:.2f}'.format
pd.options.display.float_format = two_decimals
pairs.describe()

In [None]:
# Keep only the rows where both hour maxima are strictly below 50.
within_hour_limits = (pairs['g_hours_on_sale_max'] < 50) & (pairs['g_current_hours_max'] < 50)
pairs = pairs[within_hour_limits]

pairs.describe()

In [None]:
# Order-preserving split (shuffle=False): the trailing 15% of rows become the validation set.
train_pairs, val_pairs = train_test_split(pairs, shuffle=False, test_size=0.15, random_state=42)

print(f"Before filtering: {len(train_pairs)}")

# Drop rows whose g_hours_on_sale_len exceeds 32 from both splits.
max_group_len = 32
train_pairs = train_pairs.loc[train_pairs['g_hours_on_sale_len'] <= max_group_len]
val_pairs = val_pairs.loc[val_pairs['g_hours_on_sale_len'] <= max_group_len]

print(f"After filtering: {len(train_pairs)}\n")

# Keep the first 90% of the remaining training rows and the second half of the validation rows.
train_cutoff = int(len(train_pairs) * 0.9)
val_start = len(val_pairs) // 2
train_pairs = train_pairs.iloc[:train_cutoff]
val_pairs = val_pairs.iloc[val_start:]

print(f"Train pairs: {len(train_pairs)}")
print(f"Val pairs: {len(val_pairs)}")

In [None]:
# Last rows of the training split.
train_pairs.tail(n=5)

In [None]:
# First rows of the validation split.
val_pairs.head(n=5)

In [None]:
# Five randomly drawn rows of the training split.
train_pairs.sample(n=5)

In [None]:
# Histogram of g_hours_on_sale_mean over the training split (30 bins).
plt.hist(train_pairs['g_hours_on_sale_mean'], bins=30)
plt.xlabel('g_hours_on_sale_mean')
plt.ylabel('count')
# plt.show() renders the figure and keeps the raw (counts, edges, patches)
# return value of plt.hist from being dumped as the cell output.
plt.show()

In [None]:
# Five randomly drawn rows of the validation split.
val_pairs.sample(n=5)

In [None]:
# Histogram of g_hours_on_sale_len over the training split (10 bins).
group_lengths = train_pairs['g_hours_on_sale_len']
plt.hist(group_lengths, bins=10)
plt.show()

In [None]:
# Histogram of g_hours_on_sale_mean over the training split (15 bins).
mean_hours_on_sale = train_pairs['g_hours_on_sale_mean']
plt.hist(mean_hours_on_sale, bins=15)
plt.show()

In [16]:
import json
import os

mappings_dir = '../generated/mappings'


def _load_mapping(file_name, directory=mappings_dir):
    """Parse one JSON mapping file and return its contents.

    Args:
        file_name: Name of the JSON file inside ``directory``.
        directory: Folder holding the mapping files; defaults to ``mappings_dir``.

    Returns:
        Whatever ``json.load`` produces for the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default locale encoding.
    with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as f:
        return json.load(f)


# One lookup table per JSON file in the mappings folder.
item_to_idx = _load_mapping('item_to_idx.json')
context_to_idx = _load_mapping('context_to_idx.json')
bonus_to_idx = _load_mapping('bonus_to_idx.json')
modtype_to_idx = _load_mapping('modtype_to_idx.json')

In [17]:
# Load the feature statistics object that is later handed to AuctionDataset.
# NOTE(review): torch.load relies on pickle, so only load this file from a trusted source;
# consider passing weights_only=True if the file holds just tensors / plain containers — confirm its contents.
feature_stats = torch.load('../generated/feature_stats.pt')

In [None]:
from src.data.auction_dataset import AuctionDataset
from src.data.utils import collate_auctions

batch_size = 256

train_dataset = AuctionDataset(train_pairs, feature_stats=feature_stats, path='../generated/sequences.h5')

# Validation rows are shuffled ONCE with a fixed seed and the loader itself does not shuffle.
# With shuffle=True on the loader, every validation pass iterates in a new random order, so if the
# trainer caps the number of validation batches (e.g. limit_val_batches) each pass scores a different
# subset and the metrics are not comparable across passes. A fixed permutation keeps the subset
# random but identical every time.
# NOTE(review): assumes AuctionDataset addresses rows by position (as the shuffled training loader
# already requires) — confirm against the dataset implementation.
val_pairs_fixed_order = val_pairs.sample(frac=1.0, random_state=42)
val_dataset = AuctionDataset(val_pairs_fixed_order, feature_stats=feature_stats, path='../generated/sequences.h5')

# Settings shared by both loaders.
loader_kwargs = dict(
    batch_size=batch_size,
    collate_fn=collate_auctions,
    num_workers=8,
    prefetch_factor=8,
    pin_memory=True,
)

train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_dataloader = torch.utils.data.DataLoader(val_dataset, shuffle=False, **loader_kwargs)

## Model definition

In [None]:
test_data_loader = True

if test_data_loader:
    # Pull one batch from the training loader and print the shape of every tensor in it.
    iter_loader = iter(train_dataloader)
    batch_inputs, y = next(iter_loader)
    auctions, item_index, contexts, bonus_lists, modifier_types, modifier_values, current_hours = batch_inputs

    shapes_to_report = (
        ('auctions', auctions),
        ('item_index', item_index),
        ('contexts', contexts),
        ('bonus_lists', bonus_lists),
        ('modifier_types', modifier_types),
        ('modifier_values', modifier_values),
        ('current_hours', current_hours),
        ('y', y),
    )
    for field_name, field_value in shapes_to_report:
        print(f'{field_name}: {field_value.shape}')

In [None]:
from src.models.auction_transformer import AuctionTransformer

# Vocabulary sizes, taken from the mapping dictionaries.
n_items = len(item_to_idx)
n_contexts = len(context_to_idx) + 1  # one more than the number of mapped contexts
n_bonuses = len(bonus_to_idx)
n_modtypes = len(modtype_to_idx)

# Architecture hyperparameters (previous values noted where they were raised).
input_size = 7
embedding_dim = 64                # previously 32
d_model = 512                     # previously 256
nhead = 16                        # previously 8; should divide d_model evenly
dim_feedforward = d_model * 4     # scales automatically with d_model
num_layers = 12                   # previously 8
dropout_p = 0.1

model = AuctionTransformer(
    input_size,
    n_items,
    n_contexts,
    n_bonuses,
    n_modtypes,
    embedding_dim,
    d_model,
    dim_feedforward,
    nhead,
    num_layers,
    dropout_p=dropout_p,
    learning_rate=3e-5,
    logging_interval=500,
)

# Total number of parameters in the model.
total_params = sum(param.numel() for param in model.parameters())
print(total_params)

## Training

In [None]:
!rm -rf logs/train
!rm -rf logs/val

import lightning as L
from lightning.pytorch.loggers import TensorBoardLogger
from lightning.pytorch.callbacks import ModelCheckpoint

name = "tf_auctions_40M"

logger = TensorBoardLogger( # tensorboard --logdir=notebooks/logs
    save_dir="logs",
    name=name,
    version='full_train-128b-wpos-ch-weighted'
)

checkpoint_callback = ModelCheckpoint(
    dirpath='../models/auction_transformer_40M_128b_wpos_ch_weighted',
    filename='epoch_{epoch:02d}',
    save_top_k=-1,
    every_n_epochs=1,
    save_last=True
)

trainer = L.Trainer(
    max_epochs=5,
    accelerator='gpu',
    devices=1,
    log_every_n_steps=10,
    logger=logger,
    limit_val_batches=500,
    val_check_interval=0.25,  # Validate every 25% of training epoch
    callbacks=[checkpoint_callback]
)

trainer.fit(model, train_dataloader, val_dataloader)

## Overfit on single batch

!rm -rf logs/train
!rm -rf logs/val

import lightning as L
from lightning.pytorch.loggers import TensorBoardLogger

name = "tf_auctions_2.0M"  # NOTE(review): label differs from the "tf_auctions_40M" run name used for full training — confirm it still matches the model

logger = TensorBoardLogger( # tensorboard --logdir=logs
    save_dir="logs",
    name=name,
    version='overfit'
)

trainer = L.Trainer(
    max_epochs=1000,
    accelerator='gpu',
    devices=1,
    log_every_n_steps=1,
    logger=logger,
    limit_train_batches=1,  # Overfit on single batch
    limit_val_batches=1,
    val_check_interval=1,
    overfit_batches=1  # NOTE(review): presumably overlaps with the limit_*_batches settings above — confirm against the Lightning Trainer docs
)

trainer.fit(model, train_dataloader, val_dataloader)  # NOTE(review): reuses the existing `model` object; rebuild it first if it has already been trained