In [None]:
!pip install -r requirements.txt

In [None]:
import os

target_dir = "../"
zip_file_path = "generated.zip"

if not os.path.exists(target_dir + 'generated'):
    print(f"The directory {target_dir} does not exist. Proceeding with download.")

    !apt-get update
    !apt-get install unzip
    
    !curl "https://drive.usercontent.google.com/download?id=1z5nV8cGfQ0L67VfNW-2UnU_q9ZAPfFZ_&confirm=xxx" -o {zip_file_path}
    !mkdir -p {target_dir}
    
    !unzip {zip_file_path} -d {target_dir}
    
    print(f"File downloaded and extracted to {target_dir}")
    
    !rm {zip_file_path}
else:
    print(f"The directory {target_dir} already exists. No action taken.")

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import os
import wandb
import matplotlib.pyplot as plt
import torch.profiler
import h5py
from tqdm import tqdm
from datetime import datetime
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

from sklearn.preprocessing import OrdinalEncoder, StandardScaler, RobustScaler
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from model import AuctionPredictor

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Load the auction index table. Later cells read its `record`, `item_id`,
# `expansion`, `group_len`, `group_max` and `group_mean` columns.
pairs = pd.read_csv('../generated/auction_indices.csv')
# Preview the first rows.
pairs.head()

## Prepare and balance data

In [None]:
# Render floats with two decimals in every DataFrame displayed from here on.
pd.options.display.float_format = '{:.2f}'.format
# Summary statistics of the numeric columns.
pairs.describe()

In [None]:
# 0.99 quantile of `group_len`.
# NOTE(review): presumably used to pick the `group_len` cut-off applied when
# filtering `pairs` — confirm.
pairs[['group_len']].quantile(0.99)

In [None]:
# Keep only rows whose `group_max` is below 50 and whose `group_len` is below 64,
# then show the summary statistics of what is left.
keep_rows = (pairs['group_max'] < 50) & (pairs['group_len'] < 64)
pairs = pairs[keep_rows]
pairs.describe()

In [None]:
# Hold out 10% of the rows for validation. With shuffle=False the split keeps
# the row order, so the validation set is the last 10% of `pairs`;
# random_state has no effect while shuffling is disabled.
train_pairs, val_pairs = train_test_split(pairs, test_size=0.1, random_state=42, shuffle=False)

print(f"Train pairs: {len(train_pairs)}")
print(f"Val pairs: {len(val_pairs)}")

# Preview the validation rows.
val_pairs.head()

In [None]:
# Drop 85% of the training rows whose expansion is 'wotlk'.
# NOTE: this cell rebinds `train_pairs`, so running it twice drops 85% of the
# remaining 'wotlk' rows again.
train_pairs_wotlk = train_pairs[train_pairs['expansion'] == 'wotlk']

# A fixed seed makes the set of dropped rows (and therefore the training set)
# reproducible across kernel restarts; 42 matches the seed used for the split.
rows_to_delete = train_pairs_wotlk.sample(
    n=int(len(train_pairs_wotlk) * 0.85),
    random_state=42,
).index
train_pairs = train_pairs.drop(rows_to_delete)

In [None]:
# Number of training rows per expansion, printed and drawn as a bar chart.
expansion_counts = train_pairs.expansion.value_counts()
print(expansion_counts)

expansion_counts.plot(kind='bar')

In [None]:
# Distribution of `group_mean` over the training rows, drawn with 15 bins.
plt.hist(train_pairs['group_mean'], bins=15)
plt.show()

In [None]:
def uniform_sample(df, column, n_samples, n_bins=48, random_state=None):
    """Down-sample ``df`` so that ``column`` is spread more evenly over its range.

    The values of ``column`` are cut into ``n_bins`` equal-width bins and every
    bin contributes at most ``n_samples // number_of_bins`` rows. Bins holding
    fewer rows than that quota contribute all of their rows, so the result can
    contain fewer than ``n_samples`` rows. The ``n_samples % number_of_bins``
    leftover rows are drawn from the rows that were not selected yet, so no row
    appears twice in the result.

    Args:
        df: Source DataFrame.
        column: Name of the numeric column to balance on.
        n_samples: Target number of rows.
        n_bins: Number of equal-width bins (defaults to the previous fixed 48).
        random_state: Optional integer seed. ``None`` keeps the previous
            behaviour of drawing from NumPy's global random state.

    Returns:
        A shuffled DataFrame with a fresh 0..n-1 index.
    """
    if len(df) == 0:
        # Nothing to bin; pd.cut cannot operate on an empty column.
        return df.reset_index(drop=True)

    # One generator shared by every draw, so a single seed fixes the whole result.
    rng = None if random_state is None else np.random.RandomState(random_state)

    # Work on unique positional labels: the result index is reset anyway, and
    # unique labels make "rows not selected yet" well defined below.
    rows = df.reset_index(drop=True)
    bins = pd.cut(rows[column], bins=n_bins)
    grouped = rows.groupby(bins)

    n_groups = len(grouped)
    if n_groups == 0:
        return rows.iloc[:0]

    samples_per_bin = n_samples // n_groups
    remainder = n_samples % n_groups

    # Collect the per-bin pieces and concatenate once instead of growing a
    # DataFrame inside the loop.
    pieces = []
    for _, group in grouped:
        if len(group) == 0:
            continue
        if len(group) > samples_per_bin:
            pieces.append(group.sample(n=samples_per_bin, replace=False, random_state=rng))
        else:
            pieces.append(group)

    if remainder > 0:
        # Top up from rows that were not picked, so the extras never duplicate
        # a row that is already part of the sample.
        unpicked = rows.drop(index=pd.concat(pieces).index) if pieces else rows
        n_extra = min(remainder, len(unpicked))
        if n_extra > 0:
            pieces.append(unpicked.sample(n=n_extra, replace=False, random_state=rng))

    if not pieces:
        return rows.iloc[:0]

    sampled_df = pd.concat(pieces)
    return sampled_df.sample(frac=1, random_state=rng).reset_index(drop=True)  # Shuffle the final result

# Rebalance the training rows over `group_mean`, asking for as many rows as
# there currently are, then report the resulting size and distribution.
n_requested = int(len(train_pairs))
train_pairs = uniform_sample(train_pairs, 'group_mean', n_samples=n_requested)
print(f"Train pairs: {len(train_pairs)}")
train_pairs.group_mean.hist(bins=10)

In [None]:
# Item catalogue; its `item_id` column defines the embedding vocabulary.
items = pd.read_csv('../data/items.csv')
n_items = len(items)

# Items are numbered from 2 upwards in file order; indices 0 and 1 are reserved.
item_to_index = {
    item_id: index
    for index, item_id in enumerate(items['item_id'], start=2)
}
# Reserved entries are written last so they win over any item with the same id.
item_to_index.update({
    0: 0,  # padding
    1: 1,  # unknown
})
n_items

## Preprocess data

In [None]:
import torch
from datetime import datetime

class AuctionDataset(torch.utils.data.Dataset):
    """Map-style dataset that reads one auction sequence per row of ``pairs``.

    Each row of ``pairs`` supplies a ``record`` timestamp string and an
    ``item_id``; together they form the key
    ``<YYYY-MM-DD>/<HH>/<item_id>`` of an HDF5 dataset inside ``path``. That
    dataset is read as a 2-D array whose last column is the target and whose
    remaining columns are the features listed in ``column_map``.
    """

    def __init__(self, pairs, item_to_index, path='../generated/sequences.h5'):
        """Store the index table, the item vocabulary and the HDF5 file path."""
        feature_order = (
            'bid',
            'buyout',
            'quantity',
            'item_id',
            'time_left',
            'hours_since_first_appearance',
        )
        self.pairs = pairs
        # Feature name -> column position in the feature matrix.
        self.column_map = {name: position for position, name in enumerate(feature_order)}
        self.item_to_index = item_to_index
        self.path = path

        print(f"Dataset size: {len(self)}")

    def __len__(self):
        """Number of rows in the index table."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return ``(X, y)``: the scaled feature matrix and the target column."""
        row = self.pairs.iloc[idx]
        item_id = row['item_id']

        # The record timestamp selects the day and hour groups inside the file.
        timestamp = datetime.strptime(row['record'], "%Y-%m-%d %H:%M:%S")
        day = timestamp.strftime("%Y-%m-%d")
        hour = timestamp.strftime("%H")

        # The file is opened per item, so no handle is kept on the instance.
        with h5py.File(self.path, 'r') as h5_file:
            raw = h5_file[f'{day}/{hour}/{item_id}'][:]

        sequence = torch.tensor(raw, dtype=torch.float32)
        y = sequence[:, -1]
        X = sequence[:, :-1]

        cols = self.column_map

        # Replace raw item ids by vocabulary indices; unseen ids map to 1.
        item_col = cols['item_id']
        X[:, item_col] = torch.tensor(
            [self.item_to_index.get(int(raw_id), 1) for raw_id in X[:, item_col].tolist()],
            dtype=torch.long,
        )

        # Both time features are divided by 48.
        for name in ('time_left', 'hours_since_first_appearance'):
            X[:, cols[name]] = X[:, cols[name]] / 48.0

        # Prices are log-compressed and divided by 15.
        for name in ('bid', 'buyout'):
            X[:, cols[name]] = torch.log1p(X[:, cols[name]]) / 15.0

        X[:, cols['quantity']] = X[:, cols['quantity']] / 200.0

        return X, y

In [None]:
def collate_auctions(batch):
    """Collate variable-length ``(X, y)`` samples into padded batch tensors.

    Samples are ordered by sequence length, longest first, and zero-padded at
    the end up to the longest sequence in the batch.

    Args:
        batch: Non-empty list of ``(X, y)`` pairs, where ``X`` is 2-D with the
            sequence on dimension 0 and ``y`` is 1-D with the same length.

    Returns:
        ``(X, y, lengths)``: the stacked padded features, the stacked padded
        targets, and a LongTensor with the original length of every sequence.

    Raises:
        ValueError: If ``batch`` is empty.
    """
    if len(batch) == 0:
        raise ValueError("collate_auctions received an empty batch")

    # sorted() instead of list.sort(): the caller's list is left untouched.
    ordered = sorted(batch, key=lambda sample: len(sample[0]), reverse=True)
    X, y = zip(*ordered)

    lengths = torch.LongTensor([x.size(0) for x in X])

    # A plain int, so the pad widths handed to F.pad are ints, not 0-dim tensors.
    max_length = int(lengths.max())

    # For X only the sequence dimension is padded (the last pair of the pad tuple).
    X = [F.pad(x, (0, 0, 0, max_length - x.size(0))) for x in X]
    y = [F.pad(t, (0, max_length - t.size(0))) for t in y]

    X = torch.stack(X)
    y = torch.stack(y)

    return X, y, lengths

## Model definition

In [None]:
# Hyperparameters; these names are reused when the run is logged.
embedding_size = 512
encoder_hidden_size = 1024
decoder_hidden_size = 1024
dropout = 0.1
bidirectional = False
# Vocabulary size, counting the reserved padding/unknown entries.
n_items = len(item_to_index)

model_kwargs = dict(
    input_size=5,
    encoder_hidden_size=encoder_hidden_size,
    decoder_hidden_size=decoder_hidden_size,
    item_index=3,
    embedding_size=embedding_size,
    dropout_p=dropout,
    bidirectional=bidirectional,
)
model = AuctionPredictor(n_items, **model_kwargs).to(device)

# Total number of parameters in the model.
print(sum(param.numel() for param in model.parameters()))

In [None]:
# Set to True to resume from previously saved weights.
load_checkpoint = False

if load_checkpoint:
    # NOTE(review): save_checkpoint in this notebook writes files named
    # `checkpoint_epoch_{epoch}_iter_{iters}.pt`; this path has no `_iter_` part,
    # so confirm the file exists before enabling the flag.
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    # map_location places the tensors on the active device, so a checkpoint
    # written on a GPU machine also loads on a CPU-only one.
    checkpoint = torch.load('checkpoints/checkpoint_epoch_4.pt', map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])

In [None]:
# Both datasets read from the same HDF5 file; only the index table differs.
sequences_path = '../generated/sequences.h5'
train_dataset = AuctionDataset(train_pairs, item_to_index, path=sequences_path)
val_dataset = AuctionDataset(val_pairs, item_to_index, path=sequences_path)

# The two loaders are configured identically.
loader_options = dict(
    batch_size=256,
    shuffle=True,
    collate_fn=collate_auctions,
    num_workers=8,
    prefetch_factor=2,
    pin_memory=True,
)
train_dataloader = torch.utils.data.DataLoader(train_dataset, **loader_options)
val_dataloader = torch.utils.data.DataLoader(val_dataset, **loader_options)

In [None]:
# Smoke test: pull one batch from the training loader and show the shape and
# dtype of each tensor it returns.
test_data_loader = True

if test_data_loader:
    iter_loader = iter(train_dataloader)
    X, y, lengths = next(iter_loader)

    for tensor in (X, y, lengths):
        print(tensor.shape)

    for tensor in (X, y, lengths):
        print(tensor.dtype)

In [None]:
# Optimisation setup.
epochs = 1
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
total_steps = len(train_dataloader) * epochs
# `verbose` is deprecated on PyTorch LR schedulers, so it is no longer passed.
# The training loop already prints and logs the current learning rate.
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=50)
# Summed squared error; the training loop divides it by the number of valid targets.
criterion = nn.MSELoss(reduction='sum')

print(f'Iterations per epoch: {len(train_dataloader)}')

In [None]:
# Toggle Weights & Biases logging. When it is off, wandb is still initialised
# in disabled mode, so later wandb.log calls remain valid.
enable_logging = True

if not enable_logging:
    print("Logging disabled")
    wandb.init(mode="disabled")
else:
    print("Logging enabled")
    # Hyperparameters recorded with the run.
    run_config = {
        "epochs": epochs,
        "batch_size": train_dataloader.batch_size,
        "learning_rate": optimizer.param_groups[0]['lr'],
        "encoder_hidden_size": encoder_hidden_size,
        "decoder_hidden_size": decoder_hidden_size,
        "model_size": sum(param.numel() for param in model.parameters()),
        "embedding_size": embedding_size,
        "bidirectional": bidirectional,
        "dropout": dropout
    }
    wandb.init(project="auction-classic", config=run_config)

## Training

In [None]:
def save_checkpoint(model, optimizer, epoch, iters, checkpoint_path='checkpoints'):
    """Write the model and optimizer state to ``checkpoint_path``.

    The directory is created when missing. The file is named after ``epoch``
    and ``iters`` and holds both values next to the two state dicts.
    """
    os.makedirs(checkpoint_path, exist_ok=True)

    file_name = f"checkpoint_epoch_{epoch}_iter_{iters}.pt"
    destination = os.path.join(checkpoint_path, file_name)

    state = {
        'epoch': epoch,
        'iter': iters,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    torch.save(state, destination)

    print(f"Checkpoint saved at {destination}")

def train(
    model,
    train_loader,
    val_loader,
    epochs,
    eval_steps,
    device,
    optimizer,
    criterion,
    lr_scheduler
):
    """Train ``model`` and log progress to wandb.

    Args:
        model: Module called as ``model(X, lengths)``.
        train_loader: Iterable with a length, yielding ``(X, y, lengths)`` batches.
        val_loader: Loader handed to ``evaluate``.
        epochs: Number of passes over ``train_loader``.
        eval_steps: Run a validation pass every ``eval_steps`` iterations.
        device: Device the inputs and targets are moved to.
        optimizer: Optimizer stepped after every batch.
        criterion: Loss called with the masked predictions and the targets; its
            result is divided by the number of valid targets.
        lr_scheduler: Scheduler stepped with the averaged training loss every
            50 iterations.

    A checkpoint is saved after every epoch and the wandb run is finished at
    the end.
    """
    print("Starting training for", epochs, "epochs")

    for epoch in tqdm(range(epochs)):
        model.train()

        # Losses collected since the last logging step.
        mse_losses = []
        mae_losses = []

        for i, (X, y, lengths) in enumerate(tqdm(train_loader, total=len(train_loader))):
            X = X.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)

            y_pred = model(X, lengths)

            # Positions whose target is exactly 0 are excluded from the loss.
            # NOTE(review): this also excludes real targets equal to 0, not only
            # the zero padding added by the collate function — confirm intended.
            mask = (y != 0).float().unsqueeze(2)

            # Guard against a batch without any valid target: dividing by zero
            # would give a NaN loss, and the NaN gradients would then be
            # written into every weight by optimizer.step().
            n_valid = mask.sum().clamp(min=1.0)

            loss = criterion(y_pred * mask, y.unsqueeze(2)) / n_valid
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            with torch.no_grad():
                mae = F.l1_loss(y_pred * mask, y.unsqueeze(2) * mask, reduction='sum') / n_valid

            mse_losses.append(loss.item())
            mae_losses.append(mae.item())

            if i % 50 == 0:
                mse_loss_avg = np.mean(mse_losses)
                mae_loss_avg = np.mean(mae_losses)
                lr = optimizer.param_groups[0]['lr']
                print(f"Epoch {epoch} Iteration {i} Loss {mse_loss_avg} MAE {mae_loss_avg} LR {lr}")
                # The scheduler sees the averaged training loss of this window.
                lr_scheduler.step(mse_loss_avg)

                wandb.log({
                    "train/mse_loss": mse_loss_avg,
                    "train/mae_loss": mae_loss_avg,
                    "train/learning_rate": lr,
                    "epoch": epoch,
                    "iter": i
                })

                # Start a new averaging window.
                mse_losses = []
                mae_losses = []

            if (i + 1) % eval_steps == 0:
                val_loss, val_mae = evaluate(model, val_loader, device, criterion)
                wandb.log({
                    "val/mse_loss": val_loss,
                    "val/mae_loss": val_mae,
                    "epoch": epoch
                })

        save_checkpoint(model, optimizer, epoch, len(train_loader))

    wandb.finish()

@torch.no_grad()
def evaluate(
    model,
    val_loader,
    device,
    criterion
):
    """Compute the masked MSE and MAE of ``model`` on up to 300 validation batches.

    Args:
        model: Module called as ``model(X, lengths)``.
        val_loader: Iterable yielding ``(X, y, lengths)`` batches.
        device: Device the inputs and targets are moved to.
        criterion: Loss called with the masked predictions and the targets; its
            result is divided by the number of valid targets.

    Returns:
        ``(mse, mae)``: the per-batch losses averaged over the evaluated batches.

    The model is put in eval mode for the pass and back in train mode before
    returning.
    """
    print("Evaluating model")
    model.eval()

    mse_losses = []
    mae_losses = []

    for i, (X, y, lengths) in enumerate(val_loader):

        # Only the first 300 batches are evaluated.
        if i >= 300:
            break

        if i % 15 == 0:
            print(f"Evaluating step {i}")

        X = X.to(device)
        y = y.to(device)
        lengths = lengths.cpu()

        y_pred = model(X, lengths)

        # Positions whose target is exactly 0 are excluded from both metrics.
        mask = (y != 0).float().unsqueeze(2)

        # Guard against a batch without any valid target, which would otherwise
        # turn the averaged metrics into NaN through a division by zero.
        n_valid = mask.sum().clamp(min=1.0)

        loss = criterion(y_pred * mask, y.unsqueeze(2)) / n_valid
        mae = F.l1_loss(y_pred * mask, y.unsqueeze(2) * mask, reduction='sum') / n_valid

        mse_losses.append(loss.item())
        mae_losses.append(mae.item())

        if i % 25 == 0:
            # Print the first ten targets and predictions of the first sequence.
            print(f"Evaluating step {i}")
            print(y[0][:10])
            print(y_pred[0,:, 0][:10])

    mse_loss_avg = np.mean(mse_losses)
    mae_loss_avg = np.mean(mae_losses)

    print(f"Validation loss: {mse_loss_avg} MAE: {mae_loss_avg}")
    model.train()

    return mse_loss_avg, mae_loss_avg

# Start the training run; validation is executed every 250 iterations.
train(
    model=model,
    train_loader=train_dataloader,
    val_loader=val_dataloader,
    epochs=epochs,
    eval_steps=250,
    device=device,
    optimizer=optimizer,
    criterion=criterion,
    lr_scheduler=lr_scheduler,
)