In [1]:
# Enable IPython's autoreload extension in mode 2, so the project modules imported
# further down (model, auction_dataset, train) are re-imported after being edited,
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
!pip install -r requirements.txt

In [None]:
import os

target_dir = "../"
zip_file_path = "generated.zip"

if not os.path.exists(target_dir + 'generated'):
    print(f"The directory {target_dir} does not exist. Proceeding with download.")

    !apt-get update
    !apt-get install unzip
    
    !curl "https://drive.usercontent.google.com/download?id=1uBkBBph8tS7Rz2iQU8I41NSV2S9vsn_H&confirm=xxx" -o {zip_file_path}
    !mkdir -p {target_dir}
    
    !unzip {zip_file_path} -d {target_dir}
    
    print(f"File downloaded and extracted to {target_dir}")
    
    !rm {zip_file_path}
else:
    print(f"The directory {target_dir} already exists. No action taken.")

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import os
import wandb
import matplotlib.pyplot as plt
import h5py
from tqdm import tqdm
from datetime import datetime
pd.options.mode.chained_assignment = None

from sklearn.preprocessing import OrdinalEncoder, StandardScaler, RobustScaler
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from model import AuctionTransformer
from auction_dataset import AuctionDataset

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [None]:
# Load the auction index table and preview its first rows.
pairs = pd.read_csv(filepath_or_buffer='../generated/auction_indices.csv')
pairs.head(5)

## Prepare and balance data

In [None]:
# Show floats with two decimals so the summary statistics are easy to read.
pd.set_option('display.float_format', '{:.2f}'.format)
pairs.describe()

In [None]:
# Keep only groups whose maximum hours-on-sale and maximum hours-since-first-appearance
# are both at most 50.
pairs = pairs[
    (pairs['group_hours_on_sale_max'] <= 50)
    & (pairs['group_hours_since_first_appearance_max'] <= 50)
]

pairs.describe()

In [None]:
# Hold out 5% of the rows for validation; rows are not shuffled before the split.
train_pairs, val_pairs = train_test_split(
    pairs,
    test_size=0.05,
    random_state=42,
    shuffle=False,
)

print(f"Before filtering: {len(train_pairs)}")

# Drop groups longer than 64 entries from both splits.
train_pairs = train_pairs.loc[train_pairs['group_hours_on_sale_len'].le(64)]
val_pairs = val_pairs.loc[val_pairs['group_hours_on_sale_len'].le(64)]

print(f"After filtering: {len(train_pairs)}\n")

# Restrict both splits to groups first seen at most 12 hours ago on average;
# this filter is kept because it improves model performance in this scenario.
train_pairs = train_pairs.loc[train_pairs['group_hours_since_first_appearance_mean'].le(12)]
val_pairs = val_pairs.loc[val_pairs['group_hours_since_first_appearance_mean'].le(12)]

print(f"Train pairs: {len(train_pairs)}")
print(f"Val pairs: {len(val_pairs)}")

In [None]:
# Seeded like the other random steps in this notebook, so the preview is the same on every run.
train_pairs.sample(5, random_state=42)

In [None]:
# Seeded like the other random steps in this notebook, so the preview is the same on every run.
val_pairs.sample(5, random_state=42)

In [None]:
# Optional step: drop 85% of the 'wotlk' training rows to reduce that expansion's weight.
undersample_expansion = False

if undersample_expansion:
    print("Under-sampling expansion")
    train_pairs_wotlk = train_pairs[train_pairs['expansion'] == 'wotlk']

    # Seeded so a fresh kernel drops the same rows each time, matching the
    # random_state=42 used by the other sampling steps in this notebook.
    rows_to_delete = train_pairs_wotlk.sample(
        n=int(len(train_pairs_wotlk) * 0.85),
        random_state=42,
    ).index
    train_pairs = train_pairs.drop(rows_to_delete)

print(train_pairs.expansion.value_counts())
train_pairs.expansion.value_counts().plot(kind='bar')

In [None]:
# Explicit figure/axes plus a title and labels, so the plot is readable on its own.
fig, ax = plt.subplots()
ax.hist(train_pairs['group_hours_on_sale_len'], bins=10)
ax.set(
    title='Distribution of group_hours_on_sale_len (train)',
    xlabel='group_hours_on_sale_len',
    ylabel='Count',
)
plt.show()

In [None]:
# Explicit figure/axes plus a title and labels, so the plot is readable on its own.
fig, ax = plt.subplots()
ax.hist(train_pairs['group_hours_on_sale_mean'], bins=15)
ax.set(
    title='Distribution of group_hours_on_sale_mean (train)',
    xlabel='group_hours_on_sale_mean',
    ylabel='Count',
)
plt.show()

We undersample on the `hours_on_sale` variable. In previous experiments, we noticed that the model was better at predicting low values. To make the model more robust, we want a larger share of high values in the training dataset.

In [None]:
# Bucket each group by its mean hours on sale: [0, 12] -> 'very_short', (12, inf) -> 'long'.
# include_lowest=True closes the first bin on the left, so a mean of exactly 0.0 is
# labelled 'very_short' instead of becoming NaN (which a later balancing step would drop).
train_pairs["hours_cat"] = pd.cut(
    train_pairs["group_hours_on_sale_mean"],
    bins=[0., 12.0, np.inf],
    labels=['very_short', 'long'],
    include_lowest=True,
)
train_pairs["hours_cat"].value_counts().plot(kind='bar')

In [15]:
# Optional step: balance the two hours_cat classes by sampling each down to the size
# of the smaller one.
sample_size = False

if sample_size:
    min_samples = train_pairs["hours_cat"].value_counts().min()

    # One equally sized, seeded sample per category, in the order very_short -> long.
    per_category = [
        train_pairs.loc[train_pairs["hours_cat"] == cat].sample(
            n=min_samples,
            random_state=42,
            replace=False,
        )
        for cat in ('very_short', 'long')
    ]
    train_pairs = pd.concat(per_category)

    print(f"Train pairs: {len(train_pairs)} (after filtering)")

In [None]:
items = pd.read_csv('../data/items.csv')
n_items = len(items)

# Real items are numbered from 2 upward, in file order; indices 0 and 1 are reserved.
item_to_index = dict(zip(items['item_id'], range(2, n_items + 2)))
item_to_index.update({0: 0, 1: 1})  # 0 = padding, 1 = unknown
n_items

In [17]:
def collate_auctions(batch):
    """Collate variable-length auction samples into zero-padded batch tensors.

    Samples are ordered by sequence length, longest first, and every sample is
    right-padded with zeros to the longest length in the batch.

    Args:
        batch: list of ``(X, y)`` tensor pairs. ``X`` is padded along its first
            dimension with the last dimension left untouched (so it is expected
            to be 2-D); ``y`` is padded along its last dimension by the same
            amount (so it is expected to be 1-D with the same length as ``X``).

    Returns:
        Tuple ``(X, y, lengths)``: the stacked padded inputs, the stacked padded
        targets, and a ``LongTensor`` of the original sequence lengths in the
        same (descending) order.
    """
    # sorted() returns a new list, so the caller's batch is not reordered in place.
    ordered = sorted(batch, key=lambda sample: len(sample[0]), reverse=True)
    X, y = zip(*ordered)

    lengths = torch.LongTensor([x.size(0) for x in X])

    # Plain Python int, so the pad amounts passed to F.pad are ints, not 0-d tensors.
    max_length = int(lengths.max())

    # F.pad lists pad sizes from the last dimension backwards:
    # (0, 0) leaves the last dim alone, (0, n) appends n zero rows to the first dim.
    X = [F.pad(x, (0, 0, 0, max_length - x.size(0))) for x in X]
    y = [F.pad(t, (0, max_length - t.size(0))) for t in y]

    X = torch.stack(X)
    y = torch.stack(y)

    return X, y, lengths

## Model definition

In [None]:
batch_size = 32
sequences_path = '../generated/sequences.h5'

# Both splits read from the same HDF5 file and share identical loader settings.
loader_options = dict(
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_auctions,
    num_workers=8,
    prefetch_factor=2,
    pin_memory=True,
)

train_dataset = AuctionDataset(train_pairs, item_to_index, path=sequences_path)
val_dataset = AuctionDataset(val_pairs, item_to_index, path=sequences_path)

train_dataloader = torch.utils.data.DataLoader(train_dataset, **loader_options)
val_dataloader = torch.utils.data.DataLoader(val_dataset, **loader_options)

In [None]:
# Smoke test: pull one batch and print the shapes, then the dtypes, of its three tensors.
test_data_loader = True

if test_data_loader:
    iter_loader = iter(train_dataloader)
    X, y, lengths = next(iter_loader)

    print(X.shape, y.shape, lengths.shape, sep="\n")

    print(X.dtype, y.dtype, lengths.dtype, sep="\n")

In [None]:
# Model hyperparameters.
input_size = 8
embedding_dim = 128
d_model = 512
dim_feedforward = 4 * d_model
nhead = 4
num_layers = 4
dropout_p = 0.1
# Size of the item vocabulary, including the reserved padding/unknown entries.
n_items = len(item_to_index)

model = AuctionTransformer(
    input_size, n_items, embedding_dim, d_model, dim_feedforward, nhead, num_layers,
    dropout_p=dropout_p,
)
model = model.to(device)

# Total number of parameters in the model.
print(sum(param.numel() for param in model.parameters()))

In [21]:
# Optionally resume from saved weights instead of a freshly initialised model.
load_checkpoint = False

if load_checkpoint:
  # map_location puts the stored tensors on the device chosen earlier, so a checkpoint
  # saved on a GPU still loads on a CPU-only machine.
  checkpoint = torch.load('checkpoints/checkpoint_epoch_4.pt', map_location=device)
  model.load_state_dict(checkpoint['model_state_dict'])

In [None]:
# Training schedule.
epochs = 3
eval_steps = 2500
total_steps = epochs * len(train_dataloader)

# Optimisation objects: AdamW, a plateau-based LR scheduler, and a summed squared-error loss.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    patience=50,
    verbose=True,
)
criterion = nn.MSELoss(reduction='sum')

print(f'Iterations per epoch: {len(train_dataloader)}')

In [None]:
# Toggle Weights & Biases tracking; when off, wandb is initialised in "disabled" mode.
enable_logging = False

if not enable_logging:
  print("Logging disabled")
  wandb.init(mode="disabled")
else:
  print("Logging enabled")
  # Hyperparameters recorded with the run.
  run_config = {
      "epochs": epochs,
      "batch_size": train_dataloader.batch_size,
      "learning_rate": optimizer.param_groups[0]['lr'],
      "model_size": sum(p.numel() for p in model.parameters()),
      "embedding_dim": embedding_dim,
      "d_model": d_model,
      "dim_feedforward": dim_feedforward,
      "nhead": nhead,
      "num_layers": num_layers,
      "dropout_p": dropout_p
  }
  wandb.init(project="auction-classic", config=run_config)

## Training

In [None]:
from train import train

# Keyword settings for the training loop, gathered in one place.
train_options = dict(
  eval_steps=eval_steps,
  device=device,
  optimizer=optimizer,
  criterion=criterion,
  lr_scheduler=lr_scheduler,
)

train(model, train_dataloader, val_dataloader, epochs, **train_options)