In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from datetime import datetime
pd.options.mode.chained_assignment = None

from sklearn.preprocessing import OrdinalEncoder, StandardScaler, RobustScaler
from sklearn.compose import make_column_transformer

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [None]:
# Mount Google Drive so the project files are reachable from the Colab VM.
from google.colab import drive
drive.mount('/content/drive')

# IPython magic: switch the working directory to the project folder. The
# relative paths used by later cells ('auction_indices.csv', 'items.csv',
# 'sequences', 'checkpoints') are resolved against it.
%cd /content/drive/MyDrive/Projects/AuctionHouse

Mounted at /content/drive
/content/drive/MyDrive/Projects/AuctionHouse


In [None]:
# Index of samples: each row carries a `record` timestamp string and an
# `item_id`; AuctionDataset uses the pair to locate one stored sequence.
pairs = pd.read_csv('auction_indices.csv')
pairs.head()

Unnamed: 0,record,item_id
0,2024-01-02 00:00:00,754
1,2024-01-02 00:00:00,774
2,2024-01-02 00:00:00,785
3,2024-01-02 00:00:00,870
4,2024-01-02 00:00:00,1076


In [None]:
from sklearn.model_selection import train_test_split

# shuffle=False keeps the row order of `pairs`, so the last 10% of rows become
# the validation set.
# NOTE(review): random_state presumably has no effect when shuffle=False —
# confirm against the scikit-learn documentation.
train_pairs, val_pairs = train_test_split(pairs, test_size=0.1, random_state=42, shuffle=False)

val_pairs.head()

Unnamed: 0,record,item_id
1002395,2024-02-25 10:00:00,32676
1002396,2024-02-25 10:00:00,32751
1002397,2024-02-25 10:00:00,32772
1002398,2024-02-25 10:00:00,33208
1002399,2024-02-25 10:00:00,33365


In [None]:
# Item catalogue; its row count is displayed at the end of the cell.
items = pd.read_csv('items.csv')
n_items = len(items)

# Catalogue items are numbered from 2 upwards in file order, because the first
# two indices are reserved for the special ids below.
item_to_index = {
    item_id: index
    for index, item_id in enumerate(items['item_id'], start=2)
}
item_to_index.update({
    0: 0,  # padding
    1: 1,  # unknown
})
n_items

10396

In [None]:
class AuctionDataset(torch.utils.data.Dataset):
    """Dataset that yields one stored auction sequence per (record, item_id) pair.

    Each sample is read from ``{path}/{dd-mm-YYYY}/{HH}.pt``, a file that holds
    a mapping from ``str(item_id)`` to a 2-D tensor. The last column of that
    tensor is returned as the target ``y``; the remaining columns are the
    features described by ``column_map``.

    Args:
        pairs: DataFrame with a ``record`` column (``"%Y-%m-%d %H:%M:%S"``
            strings) and an ``item_id`` column.
        item_to_index: mapping from raw item id to embedding index; ids that
            are missing from it are mapped to index 1.
        path: root directory of the stored sequence files.
    """

    def __init__(self, pairs, item_to_index, path='sequences'):
        self.pairs = pairs
        # Not read anywhere in this class; kept because it is a public attribute.
        self.time_left_to_int = {
            'VERY_LONG': 48,
            'LONG': 24,
            'MEDIUM': 12,
            'SHORT': 2
        }
        # Position of every feature inside the feature matrix X.
        self.column_map = {
            'bid': 0,
            'buyout': 1,
            'quantity': 2,
            'item_id': 3,
            'time_left': 4,
            'hours_since_first_appearance': 5
        }
        self.item_to_index = item_to_index
        self.path = path

        print(f"Dataset size: {len(self)}")

    def __len__(self):
        """Return the number of (record, item_id) pairs."""
        return len(self.pairs)

    def scale(self, column):
        """Robust-scale a 1-D floating-point tensor.

        Uses ``(x - median) / IQR`` when the inter-quartile range is at least
        1e-3, and falls back to mean/std standardisation otherwise. A
        single-element column is returned as zeros.

        Returns:
            A tensor with the same shape, dtype and device as ``column``.
        """
        # torch.quantile needs q to match the input's dtype and device.
        quantile_points = torch.tensor(
            [0.25, 0.5, 0.75], dtype=column.dtype, device=column.device
        )
        q1, median, q3 = torch.quantile(column, quantile_points, dim=0)
        iqr = q3 - q1

        if iqr < 1e-3:
            if len(column) == 1:
                # Always hand back a tensor (previously a bare Python float),
                # so callers get the same type on every path.
                return torch.zeros_like(column)
            return (column - torch.mean(column)) / (torch.std(column) + 1e-6)
        return (column - median) / (iqr + 1e-6)

    def __getitem__(self, idx):
        """Load and normalise the sequence for the ``idx``-th pair.

        Returns:
            ``(X, y)`` where ``X`` holds the feature columns and ``y`` is the
            last column of the stored tensor.
        """
        pair = self.pairs.iloc[idx]

        record = pair['record']
        item_id = pair['item_id']

        # The timestamp selects the day folder and the hour file.
        date_time_obj = datetime.strptime(record, "%Y-%m-%d %H:%M:%S")

        date_folder_name = date_time_obj.strftime("%d-%m-%Y")
        hour_folder_name = date_time_obj.strftime("%H")

        data = torch.load(f'{self.path}/{date_folder_name}/{hour_folder_name}.pt')

        X = data[str(item_id)]

        # Both slices are views of the freshly loaded tensor, so the in-place
        # writes below only touch this sample's private copy.
        y = X[:, -1]
        X = X[:, :-1]

        bid = self.column_map['bid']
        buyout = self.column_map['buyout']
        quantity = self.column_map['quantity']
        item_col = self.column_map['item_id']
        time_left = self.column_map['time_left']
        hours_seen = self.column_map['hours_since_first_appearance']

        # Prices are multiplied by 10000 here and divided by 1000 further down
        # (a net factor of 10). The two steps are kept separate so the values
        # match what existing checkpoints were trained on.
        # NOTE(review): the unit of the stored prices is not visible here.
        X[:, bid] = X[:, bid] * 10000
        X[:, buyout] = X[:, buyout] * 10000
        # Raw item ids become embedding indices; unknown ids fall back to 1.
        X[:, item_col] = torch.tensor(
            [self.item_to_index.get(int(item), 1) for item in X[:, item_col]],
            dtype=torch.long
        )
        X[:, time_left] = X[:, time_left] / 48.0
        X[:, hours_seen] = X[:, hours_seen] / 48.0

        X[:, bid] = X[:, bid] / 1000
        X[:, buyout] = X[:, buyout] / 1000

        X[:, quantity] = X[:, quantity] / 200.0

        return X, y

In [None]:
def collate_auctions(batch):
    """Zero-pad a list of variable-length ``(X, y)`` samples into one batch.

    Args:
        batch: sequence of ``(X, y)`` pairs where ``X`` is ``(length, features)``
            and ``y`` is ``(length,)``.

    Returns:
        ``(X, y)`` stacked to ``(batch, max_length, features)`` and
        ``(batch, max_length)``. Shorter samples are padded with zeros at the
        end of the sequence axis.
    """
    sequences, targets = zip(*batch)

    # Plain Python int: F.pad documents its pad widths as ints, so avoid
    # handing it 0-dim tensors.
    max_length = max(seq.size(0) for seq in sequences)

    # Pad spec is given last-dimension first: (0, 0) leaves the feature axis
    # alone and (0, k) appends k zero rows to the sequence axis.
    padded_X = [F.pad(seq, (0, 0, 0, max_length - seq.size(0))) for seq in sequences]
    padded_y = [F.pad(tgt, (0, max_length - tgt.size(0))) for tgt in targets]

    return torch.stack(padded_X), torch.stack(padded_y)

# Smoke test: build a dataset over all pairs and pull a single padded batch of
# two sequences to check that loading and collation work end to end.
# Leaves `X` and `y` behind as notebook globals.
train_dataset = AuctionDataset(pairs, item_to_index)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_auctions)

iter_loader = iter(train_loader)
X, y = next(iter_loader)

Dataset size: 1113773


In [None]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class Encoder(nn.Module):
    """Embeds the item-id column and runs a 2-layer LSTM over the sequence.

    Args:
        input_size: number of numeric features left after the item-id column
            has been removed.
        item_index: position of the item-id column in the last axis of ``X``.
        embedding_size: width of the item embedding.
        hidden_size: LSTM hidden size per direction.
        dropout_p: dropout probability applied to the item embeddings.
        bidirectional: whether the LSTM is bidirectional.
        n_items: number of rows in the embedding table. When ``None`` (the
            default) it falls back to ``len(item_to_index)`` from the notebook
            namespace, which is the original behaviour.
    """

    def __init__(self, input_size=5, item_index=3, embedding_size=16, hidden_size=16, dropout_p=0.1, bidirectional=True, n_items=None):
        super(Encoder, self).__init__()

        self.hidden_size = hidden_size
        self.item_index = item_index
        if n_items is None:
            # Backward-compatible default: size the table from the global mapping.
            n_items = len(item_to_index)

        self.embedding = nn.Embedding(n_items, embedding_size)
        self.rnn = nn.LSTM(input_size + embedding_size, hidden_size, batch_first=True, num_layers=2, bidirectional=bidirectional)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, X):
        """Encode a batch of sequences.

        Args:
            X: tensor of shape ``(batch, seq, features)`` whose ``item_index``
                column holds embedding indices stored as numbers.

        Returns:
            ``(output, (hidden, cell))`` exactly as produced by ``nn.LSTM``.
        """
        item_ids = X[:, :, self.item_index].long()

        # Drop the item-id column; it re-enters below as a learned embedding.
        X = torch.cat([X[:, :, :self.item_index], X[:, :, self.item_index + 1:]], dim=2)

        item_embeddings = self.dropout(self.embedding(item_ids))

        X = torch.cat([X, item_embeddings], dim=2)

        # NOTE(review): sequences are not packed, so any padded time steps in
        # the batch are processed by the LSTM like real ones.
        output, (hidden, cell) = self.rnn(X)

        return output, (hidden, cell)


class Decoder(nn.Module):
    """2-layer LSTM followed by an MLP head that emits one value per time step.

    Args:
        input_size: feature width of the sequence fed to the LSTM.
        hidden_size: LSTM hidden size per direction.
        bidirectional: whether the LSTM is bidirectional; this doubles the
            width seen by the projection head.
    """

    def __init__(self, input_size, hidden_size, bidirectional=True):
        super().__init__()
        directions = 2 if bidirectional else 1
        self.rnn = nn.LSTM(
            input_size,
            hidden_size,
            num_layers=2,
            batch_first=True,
            bidirectional=bidirectional,
        )
        # Layer order is kept so the state_dict keys (projection.0 ... projection.4)
        # stay compatible with saved checkpoints.
        head_layers = [
            nn.Linear(hidden_size * directions, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        ]
        self.projection = nn.Sequential(*head_layers)

    def forward(self, encoder_outputs, encoder_hidden):
        """Run the LSTM from ``encoder_hidden`` and project each step to a scalar.

        Args:
            encoder_outputs: ``(batch, seq, input_size)`` sequence.
            encoder_hidden: ``(hidden, cell)`` tuple used as the LSTM's
                initial state.

        Returns:
            Tensor of shape ``(batch, seq, 1)``.
        """
        recurrent_out, _final_state = self.rnn(encoder_outputs, encoder_hidden)
        return self.projection(recurrent_out)


class AuctionPredictor(nn.Module):
    """Encoder followed by a decoder, producing one prediction per time step.

    The decoder consumes the encoder's output sequence and starts from the
    encoder's final ``(hidden, cell)`` state.
    NOTE(review): because that state is passed straight into the decoder LSTM,
    the two hidden sizes presumably have to be equal — confirm.
    """

    def __init__(self, input_size=5, encoder_hidden_size=16, decoder_hidden_size=16, item_index=3, embedding_size=16, dropout_p=0.1, bidirectional=True):
        super().__init__()
        self.encoder = Encoder(
            input_size,
            item_index,
            embedding_size,
            encoder_hidden_size,
            dropout_p,
            bidirectional=bidirectional,
        )
        # A bidirectional encoder concatenates both directions in its output.
        encoder_output_width = encoder_hidden_size * (2 if bidirectional else 1)
        self.decoder = Decoder(
            encoder_output_width,
            decoder_hidden_size,
            bidirectional=bidirectional,
        )

    def forward(self, X):
        """Return the decoder output for the batch ``X``."""
        sequence_features, final_state = self.encoder(X)
        return self.decoder(sequence_features, final_state)

In [None]:
# Datasets over the train/validation split of `pairs` made earlier.
train_dataset = AuctionDataset(train_pairs, item_to_index)
val_dataset = AuctionDataset(val_pairs, item_to_index)

# Both loaders pad variable-length sequences through collate_auctions and use
# 8 worker processes. The validation loader is shuffled too.
# NOTE(review): presumably so that the capped evaluation loop sees a different
# random subset each time — confirm this is intended.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_auctions, num_workers=8, prefetch_factor=4)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=True, collate_fn=collate_auctions, num_workers=8, prefetch_factor=4)

Dataset size: 1002395
Dataset size: 111378


In [None]:
# Model and schedule hyper-parameters.
embedding_size = 64
encoder_hidden_size = 128
decoder_hidden_size = 128
epochs = 5

model = AuctionPredictor(input_size=5,
                         encoder_hidden_size=encoder_hidden_size,
                         decoder_hidden_size=decoder_hidden_size,
                         item_index=3,
                         embedding_size=embedding_size,
                         dropout_p=0.1,
                         bidirectional=False
                         ).to(device)

# Total number of parameters.
print(sum(p.numel() for p in model.parameters()))

# load checkpoints/checkpoint_epoch_4.pt
# Warm start from a checkpoint written by an earlier run. map_location remaps
# the stored tensors onto the current device, so a checkpoint saved on a GPU
# also loads on a CPU-only runtime. Only the model weights are restored; the
# optimizer below starts fresh.
checkpoint = torch.load('checkpoints/checkpoint_epoch_4.pt', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)
# The scheduler is stepped once per batch, so it spans every batch of every epoch.
total_steps = len(train_dataloader) * epochs
lr_scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.00001, total_iters=total_steps)
# Summed (not averaged) squared error; the training loop normalises it for logging.
criterion = nn.MSELoss(reduction='sum')

1174017


In [None]:
def save_checkpoint(model, optimizer, epoch, checkpoint_path='checkpoints'):
    """Write the model and optimizer state for ``epoch`` to disk.

    The file is named ``checkpoint_epoch_{epoch}.pt`` inside
    ``checkpoint_path``, which is created if it does not exist yet.
    """
    os.makedirs(checkpoint_path, exist_ok=True)

    payload = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    checkpoint_file = os.path.join(checkpoint_path, f"checkpoint_epoch_{epoch}.pt")
    torch.save(payload, checkpoint_file)

    print(f"Checkpoint saved at {checkpoint_file}")

def train(
    model,
    train_loader,
    val_loader,
    epochs,
    eval_steps,
    device,
    optimizer,
    criterion,
    lr_scheduler
):
    """Train ``model`` for ``epochs`` epochs and checkpoint after each one.

    Args:
        model: module mapping a ``(batch, seq, features)`` tensor to
            ``(batch, seq, 1)`` predictions.
        train_loader: iterable of ``(X, y)`` batches with ``y`` of shape
            ``(batch, seq)``.
        val_loader: iterable passed on to ``evaluate``.
        epochs: number of passes over ``train_loader``.
        eval_steps: run ``evaluate`` every this many training batches; a falsy
            value (``0`` or ``None``) disables in-epoch evaluation.
        device: device the batches are moved to.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(y_pred, y.unsqueeze(2))``.
        lr_scheduler: scheduler stepped once per batch.
    """
    print("Starting training for", epochs, "epochs")

    for epoch in tqdm(range(epochs)):
        model.train()

        mse_losses = []
        mae_losses = []

        for i, (X, y) in enumerate(train_loader):
            X = X.to(device)
            y = y.to(device)
            target = y.unsqueeze(2)

            # Clear gradients before the forward pass so that leftovers from an
            # interrupted earlier run cannot leak into this step.
            optimizer.zero_grad()

            y_pred = model(X)

            # NOTE(review): the loss covers every position of the padded batch,
            # while the logged metrics are normalised by the number of non-zero
            # targets only. Confirm whether padded steps should be masked out.
            loss = criterion(y_pred, target)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

            with torch.no_grad():
                # Clamp to 1 so a batch without non-zero targets cannot cause a
                # division by zero.
                n = max((y != 0).sum().item(), 1)
                mae = F.l1_loss(y_pred, target, reduction='sum') / n

            mse_losses.append(loss.item() / n)
            mae_losses.append(mae.item())

            if i % 50 == 0:
                # Report the running means since the previous report, then reset.
                print(f"Epoch {epoch} Iteration {i} Loss {np.mean(mse_losses)} MAE {np.mean(mae_losses)} LR {lr_scheduler.get_last_lr()[0]}")
                mse_losses = []
                mae_losses = []

            if eval_steps and (i + 1) % eval_steps == 0:
                evaluate(model, val_loader, device, criterion)

        # One checkpoint per epoch.
        save_checkpoint(model, optimizer, epoch)

@torch.no_grad()
def evaluate(
    model,
    val_loader,
    device,
    criterion,
    max_batches=100
):
    """Estimate validation loss and MAE on at most ``max_batches`` batches.

    Args:
        model: module mapping ``(batch, seq, features)`` to ``(batch, seq, 1)``.
        val_loader: iterable of ``(X, y)`` batches with ``y`` of shape
            ``(batch, seq)``.
        device: device the batches are moved to.
        criterion: loss called as ``criterion(y_pred, y.unsqueeze(2))``.
        max_batches: number of batches to evaluate before stopping.

    Returns:
        ``(mean_loss, mean_mae)``: per-batch values normalised by the number
        of non-zero targets, averaged over the evaluated batches.
    """
    print("Evaluating model")
    # Remember the current mode so it can be restored afterwards, not
    # unconditionally forced back to training mode.
    was_training = model.training
    model.eval()

    mse_losses = []
    mae_losses = []

    for i, (X, y) in enumerate(val_loader):
        if i >= max_batches:
            break

        if i % 15 == 0:
            print(f"Evaluating step {i}")

        X = X.to(device)
        y = y.to(device)
        target = y.unsqueeze(2)

        y_pred = model(X)

        loss = criterion(y_pred, target)

        # Clamp to 1 so a batch without non-zero targets cannot cause a
        # division by zero.
        n = max((y != 0).sum().item(), 1)
        mae = F.l1_loss(y_pred, target, reduction='sum') / n

        mse_losses.append(loss.item() / n)
        mae_losses.append(mae.item())

        if i % 25 == 0:
            # Eyeball check: first ten targets and predictions of one sequence.
            print(y[0][:10])
            print(y_pred[0, :, 0][:10])

    mean_mse = np.mean(mse_losses)
    mean_mae = np.mean(mae_losses)
    print(f"Validation loss: {mean_mse} MAE: {mean_mae}")
    model.train(was_training)
    return mean_mse, mean_mae

# Run the full training loop, evaluating on validation batches after every
# 1000 training batches. Long-running: the recorded output below shows roughly
# 2.6 hours per epoch.
train(
    model,
    train_dataloader,
    val_dataloader,
    epochs,
    eval_steps=1000,
    device=device,
    optimizer=optimizer,
    criterion=criterion,
    lr_scheduler=lr_scheduler
)

Starting training for 5 epochs


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 0 Iteration 0 Loss 387.74139197127937 MAE 16.728565216064453 LR 0.0002999961696992913
Epoch 0 Iteration 50 Loss 341.43650337382553 MAE 16.906766510009767 LR 0.00029980465466385793
Epoch 0 Iteration 100 Loss 258.1453592647156 MAE 14.06164945602417 LR 0.0002996131396284243
Epoch 0 Iteration 150 Loss 163.49432383409533 MAE 10.134545240402222 LR 0.00029942162459299084
Epoch 0 Iteration 200 Loss 155.2483641341591 MAE 9.923890647888184 LR 0.0002992301095575574
Epoch 0 Iteration 250 Loss 149.19825456594478 MAE 9.697921285629272 LR 0.0002990385945221238
Epoch 0 Iteration 300 Loss 144.4887900161598 MAE 9.617882661819458 LR 0.0002988470794866904
Epoch 0 Iteration 350 Loss 144.4611324280949 MAE 9.404248542785645 LR 0.0002986555644512569
Epoch 0 Iteration 400 Loss 156.59676921302244 MAE 9.785256271362305 LR 0.0002984640494158232
Epoch 0 Iteration 450 Loss 141.44065185498633 MAE 9.400469379425049 LR 0.0002982725343803896
Epoch 0 Iteration 500 Loss 137.90751923391988 MAE 9.114545831680298 LR 0

 20%|██        | 1/5 [2:37:37<10:30:31, 9457.76s/it]

Checkpoint saved at checkpoints/checkpoint_epoch_0.pt
Epoch 1 Iteration 0 Loss 40.21374311257563 MAE 4.468859672546387 LR 0.00024000216969939346
Epoch 1 Iteration 50 Loss 72.59356893115157 MAE 6.124475927352905 LR 0.00023981065466395949
Epoch 1 Iteration 100 Loss 69.01309597827397 MAE 5.85589298248291 LR 0.0002396191396285256
Epoch 1 Iteration 150 Loss 66.91184265419555 MAE 5.772394471168518 LR 0.0002394276245930918
Epoch 1 Iteration 200 Loss 63.225954398695094 MAE 5.604863777160644 LR 0.00023923610955765793
Epoch 1 Iteration 250 Loss 71.18922708166815 MAE 5.8794188308715825 LR 0.00023904459452222385
Epoch 1 Iteration 300 Loss 65.57650654303194 MAE 5.6242072057724 LR 0.00023885307948678984
Epoch 1 Iteration 350 Loss 73.05049762624854 MAE 6.095512895584107 LR 0.00023866156445135605
Epoch 1 Iteration 400 Loss 68.1220819766015 MAE 5.797743291854858 LR 0.00023847004941592208
Epoch 1 Iteration 450 Loss 68.06593890971835 MAE 5.8053111743927 LR 0.0002382785343804883
Epoch 1 Iteration 500 Loss

 40%|████      | 2/5 [5:15:55<7:54:03, 9481.20s/it] 

Checkpoint saved at checkpoints/checkpoint_epoch_1.pt
Epoch 2 Iteration 0 Loss 77.748914297862 MAE 5.9417009353637695 LR 0.00018000816969935865
Epoch 2 Iteration 50 Loss 67.03021397161905 MAE 5.555707139968872 LR 0.0001798166546639247
Epoch 2 Iteration 100 Loss 61.74754737541488 MAE 5.330478038787842 LR 0.0001796251396284908
Epoch 2 Iteration 150 Loss 59.25876862821334 MAE 5.253379578590393 LR 0.00017943362459305694
Epoch 2 Iteration 200 Loss 58.84966441024446 MAE 5.239739398956299 LR 0.00017924210955762299
Epoch 2 Iteration 250 Loss 59.69230212390392 MAE 5.377727022171021 LR 0.00017905059452218898
Epoch 2 Iteration 300 Loss 62.76048214371118 MAE 5.651548280715942 LR 0.000178859079486755
Epoch 2 Iteration 350 Loss 64.81365107320127 MAE 5.491477370262146 LR 0.00017866756445132103
Epoch 2 Iteration 400 Loss 63.1889686659222 MAE 5.4221864652633665 LR 0.0001784760494158871
Epoch 2 Iteration 450 Loss 66.24445307268545 MAE 5.635470457077027 LR 0.0001782845343804531
Epoch 2 Iteration 500 Loss

 60%|██████    | 3/5 [7:52:56<5:15:07, 9453.95s/it]

Checkpoint saved at checkpoints/checkpoint_epoch_2.pt
Epoch 3 Iteration 0 Loss 62.07037741324536 MAE 4.741634845733643 LR 0.0001200141696993054
Epoch 3 Iteration 50 Loss 60.65375428292836 MAE 5.083800010681152 LR 0.00011982265466387166
Epoch 3 Iteration 100 Loss 61.003044434831466 MAE 5.192513461112976 LR 0.00011963113962843778
Epoch 3 Iteration 150 Loss 59.283032700788105 MAE 5.151449122428894 LR 0.00011943962459300392
Epoch 3 Iteration 200 Loss 57.83025053847074 MAE 5.106484980583191 LR 0.00011924810955757008
Epoch 3 Iteration 250 Loss 58.823592385556175 MAE 5.143420548439026 LR 0.00011905659452213623
Epoch 3 Iteration 300 Loss 59.36708352985109 MAE 5.241384673118591 LR 0.00011886507948670238
Epoch 3 Iteration 350 Loss 60.9713658456985 MAE 5.2146401643753055 LR 0.00011867356445126853
Epoch 3 Iteration 400 Loss 58.28736707228202 MAE 5.096413898468017 LR 0.00011848204941583466
Epoch 3 Iteration 450 Loss 66.65729881166135 MAE 5.508370280265808 LR 0.00011829053438040083
Epoch 3 Iteration

 80%|████████  | 4/5 [10:31:18<2:37:52, 9472.89s/it]

Checkpoint saved at checkpoints/checkpoint_epoch_3.pt
Epoch 4 Iteration 0 Loss 68.53375234962407 MAE 5.896768093109131 LR 6.002016969931219e-05
Epoch 4 Iteration 50 Loss 61.213519047702995 MAE 5.094015970230102 LR 5.982865466387831e-05
Epoch 4 Iteration 100 Loss 59.457481210769735 MAE 5.0392138671875 LR 5.9637139628444466e-05
Epoch 4 Iteration 150 Loss 53.30390284006513 MAE 4.725748348236084 LR 5.944562459301057e-05
Epoch 4 Iteration 200 Loss 57.00452119418108 MAE 4.875680317878723 LR 5.925410955757669e-05
Epoch 4 Iteration 250 Loss 56.68901117319919 MAE 4.960416550636292 LR 5.9062594522142804e-05
Epoch 4 Iteration 300 Loss 58.66597653662594 MAE 5.005424580574036 LR 5.88710794867089e-05
Epoch 4 Iteration 350 Loss 63.98354915853766 MAE 5.288551650047302 LR 5.867956445127498e-05
Epoch 4 Iteration 400 Loss 56.31032955248066 MAE 4.969975929260254 LR 5.848804941584107e-05
Epoch 4 Iteration 450 Loss 53.06448429824889 MAE 4.814384655952454 LR 5.829653438040714e-05
Epoch 4 Iteration 500 Loss 5

100%|██████████| 5/5 [13:10:14<00:00, 9482.85s/it]

Checkpoint saved at checkpoints/checkpoint_epoch_4.pt





In [None]:
# Show which feature sits at which column index of X.
val_dataset.column_map

{'bid': 0,
 'buyout': 1,
 'quantity': 2,
 'item_id': 3,
 'time_left': 4,
 'hours_since_first_appearance': 5}

In [None]:
# Print tensors in fixed-point notation so the values below are easier to read.
torch.set_printoptions(sci_mode=False)

In [None]:
# NOTE(review): this cell indexes X as (sequence_length, features) and adds the
# batch axis itself, so it expects X to be a single sequence, not the padded
# batch left over from the DataLoader smoke test — confirm where X comes from.
# It also rewrites X in place, so re-running the cell scales the values again.
X[:, val_dataset.column_map['bid']] = X[:, val_dataset.column_map['bid']] / 1000
X[:, val_dataset.column_map['buyout']] = X[:, val_dataset.column_map['buyout']] / 1000

print(X)

# Send the input to the device the model lives on (it was hard-coded to 'cpu',
# which fails when the model is on the GPU). Switch to eval mode and disable
# autograd so dropout is off for the prediction.
model.eval()
with torch.no_grad():
    pred = model(X.unsqueeze(0).to(device))

# Undo the scaling so the printed features are back in their stored units.
X[:, val_dataset.column_map['bid']] = X[:, val_dataset.column_map['bid']] * 1000
X[:, val_dataset.column_map['buyout']] = X[:, val_dataset.column_map['buyout']] * 1000
X[:, val_dataset.column_map['quantity']] = X[:, val_dataset.column_map['quantity']] * 200
X[:, val_dataset.column_map['hours_since_first_appearance']] = X[:, val_dataset.column_map['hours_since_first_appearance']] * 48
X[:, val_dataset.column_map['time_left']] = X[:, val_dataset.column_map['time_left']] * 48

print(X)
print(y)

print(pred)

In [None]:
# Reverse lookup from embedding index to raw item id: catalogue items are
# numbered from 2 in file order, and indices 0 and 1 map to themselves.
index_to_item = dict(enumerate(items['item_id'], start=2))
index_to_item.update({0: 0, 1: 1})

print(index_to_item.get(7631))

36050
