In [None]:
!pip install wandb

In [None]:
# Deep-learning, numerics, experiment tracking and plotting dependencies.
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import os
import wandb
import matplotlib.pyplot as plt
from tqdm import tqdm
from datetime import datetime
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

from sklearn.preprocessing import OrdinalEncoder, StandardScaler, RobustScaler
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# Colab-only: mount Google Drive into the runtime's file system.
from google.colab import drive
drive.mount('/content/drive')

# Work from the project directory; the relative paths used by later cells
# ('auction_indices.csv', 'items.csv', 'sequences', 'checkpoints') resolve against it.
%cd /content/drive/MyDrive/Projects/auction-classic

In [None]:
# Load the sample index. Later cells read its 'record', 'item_id', 'expansion',
# 'group_max' and 'group_mean' columns.
pairs = pd.read_csv('auction_indices.csv')
pairs.head()

## Prepare and balance data

In [None]:
# Render floats with two decimals from here on, then show summary statistics.
pd.options.display.float_format = '{:.2f}'.format
pairs.describe()

In [None]:
# Keep only rows whose 'group_max' is below 50.
# NOTE: this overwrites `pairs`, so the unfiltered table is no longer available afterwards.
pairs = pairs[pairs['group_max'] < 50]
pairs.describe()

In [None]:

# Hold out the last 10% of rows for validation. With shuffle=False the split follows
# row order and random_state has no effect.
train_pairs, val_pairs = train_test_split(pairs, test_size=0.1, random_state=42, shuffle=False)

print(f"Train pairs: {len(train_pairs)}")
print(f"Val pairs: {len(val_pairs)}")

val_pairs.head()

In [None]:
# Thin out the 'wotlk' expansion in the training set by dropping a random 85% of its rows.
train_pairs_wotlk = train_pairs[train_pairs['expansion'] == 'wotlk']

# Seeded so that Restart & Run All drops the same rows every time.
rows_to_delete = train_pairs_wotlk.sample(n=int(len(train_pairs_wotlk) * 0.85), random_state=42).index
train_pairs = train_pairs.drop(rows_to_delete)

In [None]:
# Check the per-expansion balance of the training rows, as numbers and as a bar chart.
print(train_pairs.expansion.value_counts())

train_pairs.expansion.value_counts().plot(kind='bar')

In [None]:
# Distribution of 'group_mean' in the training rows before rebalancing.
plt.hist(train_pairs['group_mean'], bins=15)
plt.show()

In [None]:
def uniform_sample(df, column, n_samples, n_bins=48, random_state=None):
    """Sample rows so that ``column`` is spread as evenly as possible over equal-width bins.

    The values of ``column`` are cut into ``n_bins`` equal-width bins. Each bin
    contributes at most ``n_samples // n_bins`` rows (a bin holding fewer rows
    contributes all of them). The ``n_samples % n_bins`` rows left over by the
    integer division are then drawn from rows that were not selected yet, so the
    result never contains the same source row twice. Because sparse bins cannot
    fill their quota, the result may hold fewer than ``n_samples`` rows.

    Args:
        df: source DataFrame.
        column: name of the numeric column to balance on.
        n_samples: target number of rows.
        n_bins: number of equal-width bins (anything ``pd.cut`` accepts as ``bins``).
        random_state: ``None`` (use NumPy's global RNG), an int seed, or an
            RNG object accepted by ``DataFrame.sample``.

    Returns:
        A shuffled DataFrame with a fresh ``0..len-1`` index.
    """
    if df.empty:
        return df.reset_index(drop=True)

    # One RNG shared by every draw, so a single int seed reproduces the whole sample.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    bins = pd.cut(df[column], bins=n_bins)
    # Divide by the total number of bins (empty ones included) so the quota does not
    # depend on how a given pandas version counts unobserved categorical groups.
    samples_per_bin, remainder = divmod(n_samples, len(bins.cat.categories))

    # Collect the per-bin picks in a list and concatenate once (no quadratic re-copying).
    pieces = []
    for _, group in df.groupby(bins, observed=True):
        if len(group) > samples_per_bin:
            group = group.sample(n=samples_per_bin, replace=False, random_state=rng)
        pieces.append(group)

    if remainder > 0:
        # Top up only from rows that have not been picked, to avoid duplicates.
        taken = pd.concat(pieces).index if pieces else df.index[:0]
        leftover = df.drop(index=taken)
        n_extra = min(remainder, len(leftover))
        if n_extra > 0:
            pieces.append(leftover.sample(n=n_extra, replace=False, random_state=rng))

    if not pieces:
        return df.iloc[0:0].reset_index(drop=True)

    # Shuffle so rows are not ordered by bin.
    return pd.concat(pieces).sample(frac=1, random_state=rng).reset_index(drop=True)

# Rebalance the training rows over 'group_mean'; the requested size equals the current row count.
# NOTE: this overwrites train_pairs, so re-running the cell samples from the already-sampled frame.
train_pairs = uniform_sample(train_pairs, 'group_mean', n_samples=int(len(train_pairs)))
print(f"Train pairs: {len(train_pairs)}")
train_pairs.group_mean.hist(bins=10)

In [None]:
# Eyeball five random training rows (unseeded, so the rows differ between runs).
train_pairs.sample(5)

In [None]:
# Build the item vocabulary: listed items get consecutive indices starting at 2,
# while indices 0 and 1 are reserved.
items = pd.read_csv('items.csv')
n_items = len(items)

item_to_index = {item_id: position for position, item_id in enumerate(items['item_id'], start=2)}
item_to_index.update({0: 0, 1: 1})  # 0 = padding, 1 = unknown
n_items

## Preprocess data

In [None]:
class AuctionDataset(torch.utils.data.Dataset):
    """Dataset that yields one (features, target) sequence per row of ``pairs``.

    Each row of ``pairs`` supplies a ``record`` timestamp (``%Y-%m-%d %H:%M:%S``)
    and an ``item_id``. The sequence is read from
    ``{path}/{YYYY-MM-DD}/{HH}.pt`` and looked up by ``item_id`` in the loaded
    object. The last column of the stored sequence is the target; the remaining
    columns follow ``column_map``.
    """

    def __init__(self, pairs, item_to_index, path='sequences'):
        """
        Args:
            pairs: DataFrame with at least 'record' and 'item_id' columns.
            item_to_index: mapping from raw item id to embedding index; ids that
                are missing from the mapping are encoded as 1.
            path: root directory of the per-hour ``.pt`` files.
        """
        self.pairs = pairs
        # Position of every feature inside a sequence row (target column excluded).
        self.column_map = {
            'bid': 0,
            'buyout': 1,
            'quantity': 2,
            'item_id': 3,
            'time_left': 4,
            'hours_since_first_appearance': 5
        }
        self.item_to_index = item_to_index
        self.path = path

        print(f"Dataset size: {len(self)}")

    def __len__(self):
        """Number of rows in ``pairs``."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Load, split and scale the sequence for row ``idx``.

        Returns:
            ``(X, y)`` where ``X`` holds the scaled feature columns and ``y`` is
            the untouched last column of the stored sequence.
        """
        pair = self.pairs.iloc[idx]

        record = pair['record']
        item_id = pair['item_id']

        # The timestamp selects the file: one folder per day, one file per hour.
        date_time_obj = datetime.strptime(record, "%Y-%m-%d %H:%M:%S")

        date_folder_name = date_time_obj.strftime("%Y-%m-%d")
        hour_folder_name = date_time_obj.strftime("%H")

        # NOTE(review): the whole hour file is loaded for every sample; consider caching if I/O dominates.
        data = torch.load(f'{self.path}/{date_folder_name}/{hour_folder_name}.pt')

        X = torch.tensor(data[item_id])

        y = X[:, -1]
        X = X[:, :-1]

        # Replace raw item ids by their embedding index (1 for ids not in the mapping).
        X[:, self.column_map['item_id']] = torch.tensor([self.item_to_index.get(int(item), 1) for item in X[:, self.column_map['item_id']]], dtype=torch.long)
        X[:, self.column_map['time_left']] = X[:, self.column_map['time_left']] / 48.0
        X[:, self.column_map['hours_since_first_appearance']] = X[:, self.column_map['hours_since_first_appearance']] / 48.0

        # Prices are compressed with log1p before being scaled.
        X[:, self.column_map['bid']] = torch.log1p(X[:, self.column_map['bid']]) / 15.0
        X[:, self.column_map['buyout']] = torch.log1p(X[:, self.column_map['buyout']]) / 15.0

        X[:, self.column_map['quantity']] = X[:, self.column_map['quantity']] / 200.0

        return X, y

In [None]:
def collate_auctions(batch):
    """Collate variable-length ``(X, y)`` samples into zero-padded batch tensors.

    Args:
        batch: iterable of ``(X, y)`` pairs, where ``X`` is ``(length, features)``
            and ``y`` is ``(length,)``; lengths may differ between samples.

    Returns:
        ``(X, y)`` with shapes ``(batch, max_length, features)`` and
        ``(batch, max_length)``. Shorter samples are padded with zeros at the end.
    """
    X, y = zip(*batch)

    # pad_sequence pads every sequence to the longest one and stacks them in one call.
    X = torch.nn.utils.rnn.pad_sequence(list(X), batch_first=True)
    y = torch.nn.utils.rnn.pad_sequence(list(y), batch_first=True)

    return X, y

# Smoke test: build a dataset over the full `pairs` table, pull a single batch of size 1
# and print the tensor shapes. train_dataset / train_dataloader are redefined further
# down with the real training configuration.
train_dataset = AuctionDataset(pairs, item_to_index)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=collate_auctions)

iter_loader = iter(train_dataloader)
X, y = next(iter_loader)

print(X.shape)
print(y.shape)

## Model definition

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class Encoder(nn.Module):
    """Embeds the item column and encodes the sequence with a 2-layer LSTM.

    Args:
        input_size: number of numeric feature columns fed to the LSTM (the item
            column is excluded; its embedding is appended instead).
        item_index: position of the item-index column in the last dimension of the input.
        embedding_size: width of the item embedding.
        hidden_size: LSTM hidden size per direction.
        dropout_p: dropout probability applied to the item embeddings.
        bidirectional: whether the LSTM runs in both directions.
        n_items: size of the embedding table. When omitted, the size of the
            notebook-level ``item_to_index`` mapping is used.
    """

    def __init__(self, input_size=5, item_index=3, embedding_size=16, hidden_size=16, dropout_p=0.1, bidirectional=True, n_items=None):
        super().__init__()

        self.hidden_size = hidden_size
        self.item_index = item_index
        if n_items is None:
            # Fall back to the vocabulary defined at notebook level.
            n_items = len(item_to_index)

        self.embedding = nn.Embedding(n_items, embedding_size)
        self.rnn = nn.LSTM(input_size + embedding_size, hidden_size, batch_first=True, num_layers=2, bidirectional=bidirectional)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, X):
        """Encode a batch shaped ``(batch, seq, input_size + 1)``.

        Returns:
            ``(output, (hidden, cell))`` exactly as produced by ``nn.LSTM``.
        """
        item_ids = X[:, :, self.item_index].long()

        # Drop the item column from the numeric features ...
        X = torch.cat([X[:, :, :self.item_index], X[:, :, self.item_index + 1:]], dim=2)

        # ... and append its (dropout-regularised) embedding instead.
        item_embeddings = self.dropout(self.embedding(item_ids))

        X = torch.cat([X, item_embeddings], dim=2)

        output, (hidden, cell) = self.rnn(X)

        return output, (hidden, cell)


class Decoder(nn.Module):
    """2-layer LSTM followed by an MLP head that emits one value per time step.

    Args:
        input_size: feature width of the incoming sequence.
        hidden_size: LSTM hidden size per direction.
        bidirectional: whether the LSTM runs in both directions (doubles the
            width seen by the MLP head).
    """

    def __init__(self, input_size, hidden_size, bidirectional=True):
        super().__init__()
        directions = 2 if bidirectional else 1
        self.rnn = nn.LSTM(input_size, hidden_size, num_layers=2, batch_first=True, bidirectional=bidirectional)
        head_layers = [
            nn.Linear(hidden_size * directions, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        ]
        self.projection = nn.Sequential(*head_layers)

    def forward(self, encoder_outputs, encoder_hidden):
        """Run the LSTM from ``encoder_hidden`` over ``encoder_outputs`` and project each step to a scalar."""
        rnn_output = self.rnn(encoder_outputs, encoder_hidden)[0]
        return self.projection(rnn_output)


class AuctionPredictor(nn.Module):
    """Encoder/decoder model that predicts one value per element of the input sequence.

    The decoder LSTM is initialised with the encoder's final ``(hidden, cell)``
    state, so both LSTMs must use the same hidden size.

    Args:
        input_size: number of numeric feature columns (item column excluded).
        encoder_hidden_size: hidden size of the encoder LSTM.
        decoder_hidden_size: hidden size of the decoder LSTM; must equal
            ``encoder_hidden_size``.
        item_index: position of the item-index column in the input.
        embedding_size: width of the item embedding.
        dropout_p: dropout probability applied to the item embeddings.
        bidirectional: whether both LSTMs are bidirectional.

    Raises:
        ValueError: if the encoder and decoder hidden sizes differ.
    """

    def __init__(self, input_size=5, encoder_hidden_size=16, decoder_hidden_size=16, item_index=3, embedding_size=16, dropout_p=0.1, bidirectional=True):
        super().__init__()
        # Fail fast: a mismatch would otherwise only surface as a shape error inside
        # the decoder LSTM on the first forward pass.
        if encoder_hidden_size != decoder_hidden_size:
            raise ValueError(
                "encoder_hidden_size and decoder_hidden_size must match because the decoder "
                f"is initialised with the encoder state (got {encoder_hidden_size} and {decoder_hidden_size})"
            )
        decoder_input_size = encoder_hidden_size * 2 if bidirectional else encoder_hidden_size
        self.encoder = Encoder(input_size, item_index, embedding_size, encoder_hidden_size, dropout_p, bidirectional=bidirectional)
        self.decoder = Decoder(decoder_input_size, decoder_hidden_size, bidirectional=bidirectional)

    def forward(self, X):
        """Return per-step predictions shaped ``(batch, seq, 1)`` for a batch ``X``."""
        encoder_outputs, encoder_hidden = self.encoder(X)
        decoder_outputs = self.decoder(encoder_outputs, encoder_hidden)
        return decoder_outputs

In [None]:
# Real training / validation datasets and loaders (batches of 64, 8 worker processes each).
# NOTE: the validation loader is shuffled as well, and evaluate() only consumes the first
# 100 batches, so every evaluation pass scores a different random subset.
train_dataset = AuctionDataset(train_pairs, item_to_index)
val_dataset = AuctionDataset(val_pairs, item_to_index)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_auctions, num_workers=8, prefetch_factor=4)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=True, collate_fn=collate_auctions, num_workers=8, prefetch_factor=4)

In [None]:
# Hyper-parameters of this run.
embedding_size = 128
encoder_hidden_size = 256
decoder_hidden_size = 256
epochs = 1
save_every_iters = 5000  # read as a global by train() to decide when to checkpoint

model = AuctionPredictor(input_size=5,
                         encoder_hidden_size=encoder_hidden_size,
                         decoder_hidden_size=decoder_hidden_size,
                         item_index=3,
                         embedding_size=embedding_size,
                         dropout_p=0.2,
                         bidirectional=False
                         ).to(device)

# Total number of parameters.
print(sum(p.numel() for p in model.parameters()))

load_checkpoint = False

if load_checkpoint:
    # map_location lets a checkpoint written on a GPU be restored on a CPU-only runtime.
    # NOTE(review): save_checkpoint() writes 'checkpoint_epoch_{epoch}_iter_{iters}.pt';
    # point this path at a file that actually exists before enabling load_checkpoint.
    checkpoint = torch.load('checkpoints/checkpoint_epoch_4.pt', map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])

optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
# The learning rate decays linearly from its initial value to ~0 over the whole run;
# the scheduler is stepped once per batch.
total_steps = len(train_dataloader) * epochs
lr_scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=1e-06, total_iters=total_steps)
# Summed (not averaged) squared error; train()/evaluate() normalise it themselves for logging.
criterion = nn.MSELoss(reduction='sum')

print(f'Iterations per epoch: {len(train_dataloader)}')

In [None]:
# Start a Weights & Biases run and record this session's hyper-parameters.
# NOTE: "bidirectional" and "dropout" are literals and must be kept in sync by hand
# with the AuctionPredictor(...) call.
wandb.init(project="auction-classic", config={
    "epochs": epochs,
    # The loader is named train_dataloader at notebook scope; `train_loader` only
    # exists as a parameter inside train() and raised a NameError here.
    "batch_size": train_dataloader.batch_size,
    "learning_rate": optimizer.param_groups[0]['lr'],
    "encoder_hidden_size": encoder_hidden_size,
    "decoder_hidden_size": decoder_hidden_size,
    "model_size": sum(p.numel() for p in model.parameters()),
    "embedding_size": embedding_size,
    "bidirectional": False,
    "dropout": 0.2
})

## Training

In [None]:
def save_checkpoint(model, optimizer, epoch, iters, checkpoint_path='checkpoints'):
    """Write model and optimizer state to ``checkpoint_path``.

    The directory is created when missing. The file is named after the epoch
    and iteration and stores both counters next to the two state dicts.
    """
    os.makedirs(checkpoint_path, exist_ok=True)

    file_name = f"checkpoint_epoch_{epoch}_iter_{iters}.pt"
    checkpoint_file = os.path.join(checkpoint_path, file_name)

    state = dict(
        epoch=epoch,
        iter=iters,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
    )
    torch.save(state, checkpoint_file)

    print(f"Checkpoint saved at {checkpoint_file}")

def train(
    model,
    train_loader,
    val_loader,
    epochs,
    eval_steps,
    device,
    optimizer,
    criterion,
    lr_scheduler
):
    """Train ``model`` and log metrics to Weights & Biases.

    Args:
        model: network mapping a ``(batch, seq, features)`` batch to ``(batch, seq, 1)``.
        train_loader: loader yielding padded ``(X, y)`` batches.
        val_loader: loader handed to ``evaluate`` every ``eval_steps`` iterations.
        epochs: number of passes over ``train_loader``.
        eval_steps: run a validation pass every this many iterations.
        device: device the batches are moved to.
        optimizer: optimizer stepped once per batch.
        criterion: loss taking ``(prediction, target)``; its value is divided by
            the number of non-zero targets for logging only.
        lr_scheduler: scheduler stepped once per batch.

    Side effects:
        Saves a checkpoint every ``save_every_iters`` iterations (a notebook-level
        global) and at the end of each epoch, and closes the W&B run when done.

    NOTE(review): zero-padded positions still contribute to the loss, while the
    normaliser counts only non-zero targets - confirm this is intended.
    """
    print("Starting training for", epochs, "epochs")

    for epoch in tqdm(range(epochs)):
        model.train()

        mse_losses = []
        mae_losses = []

        for i, (X, y) in enumerate(tqdm(train_loader, total=len(train_loader))):
            X = X.to(device)
            y = y.to(device)
            target = y.unsqueeze(2)

            # Clear gradients before the backward pass so that leftovers from an
            # interrupted earlier run cannot leak into the first update.
            optimizer.zero_grad()

            y_pred = model(X)

            loss = criterion(y_pred, target)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

            with torch.no_grad():
                # At least 1, so a batch without non-zero targets cannot divide by zero.
                n = max((y != 0).sum().item(), 1)
                mae = F.l1_loss(y_pred, target, reduction='sum') / n

            mse_losses.append(loss.item() / n)
            mae_losses.append(mae.item())

            if i % 50 == 0:
                # Report the running averages since the previous report, then reset them.
                mse_loss_avg = np.mean(mse_losses)
                mae_loss_avg = np.mean(mae_losses)
                lr = lr_scheduler.get_last_lr()[0]
                print(f"Epoch {epoch} Iteration {i} Loss {mse_loss_avg} MAE {mae_loss_avg} LR {lr}")

                wandb.log({
                    "train/mse_loss": mse_loss_avg,
                    "train/mae_loss": mae_loss_avg,
                    "train/learning_rate": lr,
                    "epoch": epoch
                })

                mse_losses = []
                mae_losses = []

            if (i + 1) % eval_steps == 0:
                val_loss, val_mae = evaluate(model, val_loader, device, criterion)
                wandb.log({
                    "val/mse_loss": val_loss,
                    "val/mae_loss": val_mae,
                    "epoch": epoch
                })

            if (i + 1) % save_every_iters == 0:
                save_checkpoint(model, optimizer, epoch, i)

        save_checkpoint(model, optimizer, epoch, len(train_loader))

    wandb.finish()

@torch.no_grad()
def evaluate(
    model,
    val_loader,
    device,
    criterion
):
    """Score ``model`` on at most the first 100 batches of ``val_loader``.

    Args:
        model: network mapping a ``(batch, seq, features)`` batch to ``(batch, seq, 1)``.
        val_loader: iterable of padded ``(X, y)`` batches.
        device: device the batches are moved to.
        criterion: loss taking ``(prediction, target)``; its value is divided by
            the number of non-zero targets in the batch.

    Returns:
        ``(mse, mae)``: the mean over the evaluated batches of the normalised
        criterion value and of the normalised absolute error.

    The model is switched to eval mode for the pass and put back into whatever
    mode it was in when the function was called.
    """
    print("Evaluating model")
    was_training = model.training
    model.eval()

    mse_losses = []
    mae_losses = []

    for i, (X, y) in enumerate(val_loader):
        if i >= 100:
            break

        if i % 15 == 0:
            print(f"Evaluating step {i}")

        X = X.to(device)
        y = y.to(device)
        target = y.unsqueeze(2)

        y_pred = model(X)

        loss = criterion(y_pred, target)

        # At least 1, so a batch without non-zero targets cannot divide by zero.
        n = max((y != 0).sum().item(), 1)
        mae = F.l1_loss(y_pred, target, reduction='sum') / n

        mse_losses.append(loss.item() / n)
        mae_losses.append(mae.item())

        if i % 25 == 0:
            # Print the first ten targets and predictions of the first sequence for a visual check.
            print(y[0][:10])
            print(y_pred[0, :, 0][:10])

    mse_loss_avg = np.mean(mse_losses)
    mae_loss_avg = np.mean(mae_losses)

    print(f"Validation loss: {mse_loss_avg} MAE: {mae_loss_avg}")
    # Restore the caller's mode instead of forcing train mode.
    model.train(was_training)

    return mse_loss_avg, mae_loss_avg

# Launch training; a validation pass runs every 250 iterations.
train(
    model,
    train_dataloader,
    val_dataloader,
    epochs,
    eval_steps=250,
    device=device,
    optimizer=optimizer,
    criterion=criterion,
    lr_scheduler=lr_scheduler
)

In [None]:
# Close the W&B run. train() already does this on normal completion; presumably this
# cell is for runs that were interrupted before reaching that point.
wandb.finish()

## Inference

In [None]:
# Print tensors in fixed-point notation instead of scientific notation.
torch.set_printoptions(sci_mode=False)

In [None]:
# Run the trained model on one validation sequence and print it in readable units.
cols = val_dataset.column_map
sample_X, sample_y = val_dataset[0]  # already scaled by AuctionDataset.__getitem__

print(sample_X)

# Add a batch dimension, run on the model's device without gradients or dropout,
# then reduce the (1, seq, 1) output to one value per sequence element.
model.eval()
with torch.no_grad():
    pred = model(sample_X.unsqueeze(0).to(device)).squeeze(0).squeeze(-1).cpu()

# Undo the dataset scaling on a copy for display. The 'item_id' column holds the
# embedding index, not the raw item id.
readable_X = sample_X.clone()
readable_X[:, cols['bid']] = torch.expm1(readable_X[:, cols['bid']] * 15.0)
readable_X[:, cols['buyout']] = torch.expm1(readable_X[:, cols['buyout']] * 15.0)
readable_X[:, cols['quantity']] = readable_X[:, cols['quantity']] * 200.0
readable_X[:, cols['hours_since_first_appearance']] = readable_X[:, cols['hours_since_first_appearance']] * 48.0
readable_X[:, cols['time_left']] = readable_X[:, cols['time_left']] * 48.0

print(readable_X)
print(sample_y)

print(pred)

In [None]:
# Inverse vocabulary: embedding index -> raw item id (0 and 1 map to themselves).
index_to_item = dict(enumerate(items['item_id'], start=2))
index_to_item.update({0: 0, 1: 1})

# Look up which item sits at embedding index 7631.
print(index_to_item.get(7631))