In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
!pip install -r requirements.txt

In [None]:
import os

target_dir = "../"
zip_file_path = "generated.zip"

if not os.path.exists(target_dir + 'generated'):
    print(f"The directory {target_dir} does not exist. Proceeding with download.")

    !apt-get update
    !apt-get install unzip
    
    !curl "https://drive.usercontent.google.com/download?id=1uBkBBph8tS7Rz2iQU8I41NSV2S9vsn_H&confirm=xxx" -o {zip_file_path}
    !mkdir -p {target_dir}
    
    !unzip {zip_file_path} -d {target_dir}
    
    print(f"File downloaded and extracted to {target_dir}")
    
    !rm {zip_file_path}
else:
    print(f"The directory {target_dir} already exists. No action taken.")

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import os
import wandb
import matplotlib.pyplot as plt
import h5py
from tqdm import tqdm
from datetime import datetime
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

from sklearn.preprocessing import OrdinalEncoder, StandardScaler, RobustScaler
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from auction_dataset import AuctionDataset

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Read the auction index table and preview its first rows.
pairs = pd.read_csv(
    '../generated/auction_indices.csv',
)
pairs.head(n=5)

## Prepare and balance data

In [None]:
# Render floats with two decimals so the summary table is readable.
pd.set_option('display.float_format', '{:.2f}'.format)
pairs.describe()

In [None]:
# Keep only wotlk rows whose two "max hours" columns are both below 50.
pairs = pairs.loc[
    (pairs['group_hours_on_sale_max'] < 50)
    & (pairs['group_hours_since_first_appearance_max'] < 50)
    & (pairs['expansion'] == 'wotlk')
]

pairs.describe()

In [None]:
# With shuffle=False the rows keep their order: the last 5% become the
# validation set, and random_state has no influence on the split.
train_pairs, val_pairs = train_test_split(
    pairs,
    test_size=0.05,
    random_state=42,
    shuffle=False,
)

print(f"Before filtering: {len(train_pairs)}")

# Drop rows whose group_hours_on_sale_len exceeds 32
# (presumably the sequence length of the group — TODO confirm).
train_pairs = train_pairs.loc[train_pairs['group_hours_on_sale_len'] <= 32]
val_pairs = val_pairs.loc[val_pairs['group_hours_on_sale_len'] <= 32]

print(f"After filtering: {len(train_pairs)}\n")

# Narrow both sets to groups first seen, on average, at most 12 hours ago;
# the model is tuned for this scenario.
train_pairs = train_pairs.loc[train_pairs['group_hours_since_first_appearance_mean'] <= 12]
val_pairs = val_pairs.loc[val_pairs['group_hours_since_first_appearance_mean'] <= 12]

print(f"Train pairs: {len(train_pairs)}")
print(f"Val pairs: {len(val_pairs)}")

In [None]:
# Peek at five random training rows (unseeded, so the rows differ between runs).
train_pairs.sample(n=5)

In [None]:
# Histogram of group_hours_on_sale_mean over the training pairs (30 bins).
plt.hist(train_pairs['group_hours_on_sale_mean'], bins=30)
plt.xlabel('group_hours_on_sale_mean')
plt.ylabel('count')
# Without plt.show() the cell echoes the raw (counts, bins, patches) tuple;
# the other histogram cells already end with it.
plt.show()

In [None]:
# Peek at five random validation rows (unseeded, so the rows differ between runs).
val_pairs.sample(n=5)

In [None]:
# Histogram of group_hours_on_sale_len over the training pairs (10 bins).
plt.hist(
    x=train_pairs['group_hours_on_sale_len'],
    bins=10,
)
plt.show()

In [None]:
# Histogram of group_hours_on_sale_mean over the training pairs (15 bins).
plt.hist(
    x=train_pairs['group_hours_on_sale_mean'],
    bins=15,
)
plt.show()

In [None]:
items = pd.read_csv('../data/items_wotlk.csv')
n_items = len(items)

# Map each item_id to a dense index. Indices 0 and 1 are reserved
# (padding and unknown), so real items are numbered from 2 upwards.
item_to_index = {
    item_id: index
    for index, item_id in enumerate(items['item_id'], start=2)
}
item_to_index[0] = 0 # padding
item_to_index[1] = 1 # unknown
n_items

In [15]:
def collate_auctions(batch):
    """Collate variable-length ``(x, y)`` samples into one zero-padded batch.

    Args:
        batch: iterable of ``(x, y)`` tensor pairs. ``x`` is padded along its
            second-to-last dimension and measured along dim 0, so it is
            assumed to be 2-D ``(seq_len, features)`` — TODO confirm against
            AuctionDataset. ``y`` is padded along its last dimension by
            ``max_length - y.size(0)``, so it is assumed to be 1-D with the
            same length as ``x``.

    Returns:
        Tuple ``(X, y, lengths)``: the stacked padded inputs, the stacked
        padded targets, and a ``LongTensor`` holding each sample's original
        ``x.size(0)``. Padding positions are filled with zeros.
    """
    X, y = zip(*batch)

    lengths = torch.LongTensor([x.size(0) for x in X])

    # Convert to a plain Python int: F.pad documents integer pad amounts, and
    # the original passed 0-dim tensors that only worked via implicit conversion.
    max_length = int(lengths.max())

    # (0, 0, 0, n): leave the last dim untouched, append n rows of zeros to the
    # second-to-last dim.
    X = torch.stack([F.pad(x, (0, 0, 0, max_length - x.size(0))) for x in X])
    # (0, n): append n zeros to the last dim of each target.
    y = torch.stack([F.pad(t, (0, max_length - t.size(0))) for t in y])

    return X, y, lengths

In [None]:
batch_size = 64

# Both datasets read from the same HDF5 file; they differ only in the rows they index.
train_dataset = AuctionDataset(
    train_pairs,
    item_to_index,
    path='../generated/sequences.h5',
)
val_dataset = AuctionDataset(
    val_pairs,
    item_to_index,
    path='../generated/sequences.h5',
)

train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_auctions,
    num_workers=8,
    prefetch_factor=2,
    pin_memory=True,
)
# NOTE(review): the validation loader is shuffled as well, so every validation
# run sees batches in a different order — confirm this is intentional.
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_auctions,
    num_workers=8,
    prefetch_factor=2,
    pin_memory=True,
)

In [17]:
# Opt-in switch: set to True to compute feature statistics over the training loader.
compute_stats = False

if compute_stats:
    # NOTE(review): compute_feature_stats is neither defined nor imported in the
    # visible cells — confirm where it comes from before enabling this branch.
    feature_means, feature_stds = compute_feature_stats(train_dataloader)
    print("Feature means (excluding item_id):", feature_means)
    print("Feature stds (excluding item_id):", feature_stds)

## Model definition

In [None]:
# Smoke test: pull one batch from the training loader and report the shape and
# dtype of each tensor returned by the collate function.
test_data_loader = True

if test_data_loader:
    iter_loader = iter(train_dataloader)
    X, y, lengths = next(iter_loader)

    print(X.shape, y.shape, lengths.shape, sep="\n")

    print(X.dtype, y.dtype, lengths.dtype, sep="\n")

In [None]:
from auction_transformer import AuctionTransformer

# Hyperparameters passed to AuctionTransformer.
input_size = 8
embedding_dim = 32
d_model = 256
nhead = 8
num_layers = 8
dropout_p = 0.0
dim_feedforward = 4 * d_model
# Number of keys in item_to_index; this overrides the earlier n_items
# (row count of the items table) and includes the padding and unknown entries.
n_items = len(item_to_index)

model = AuctionTransformer(
    input_size,
    n_items,
    embedding_dim,
    d_model,
    dim_feedforward,
    nhead,
    num_layers,
    dropout_p=dropout_p,
    learning_rate=3e-5,
)

# Total number of elements across all model parameters.
print(sum(param.numel() for param in model.parameters()))

## Training

In [None]:
!rm -rf logs/train
!rm -rf logs/val

import lightning as L
from lightning.pytorch.loggers import TensorBoardLogger

name = "tf_auctions_7.0M"

logger = TensorBoardLogger( # tensorboard --logdir=logs
    save_dir="logs",
    name=name,
    version='standarized-no-pos'
)

trainer = L.Trainer(
    max_epochs=1,
    accelerator='gpu',
    devices=1,
    log_every_n_steps=10,
    logger=logger,
    val_check_interval=1000,
    limit_val_batches=100,
)

trainer.fit(model, train_dataloader, val_dataloader)