In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from datetime import datetime
pd.options.mode.chained_assignment = None 

from sklearn.preprocessing import OrdinalEncoder, StandardScaler, RobustScaler
from sklearn.compose import make_column_transformer

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Index table: AuctionDataset reads the 'record' and 'item_id' columns of
# each row to locate one stored sample.
pairs = pd.read_csv(
    'auction_indices.csv',
)
pairs.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation. shuffle=False keeps the rows in
# file order, so the validation rows are the trailing slice of `pairs`.
train_pairs, val_pairs = train_test_split(
    pairs,
    test_size=0.1,
    random_state=42,
    shuffle=False,
)

val_pairs.head()

In [None]:
# Item catalogue; every item_id gets a dense 1-based index in file order.
items = pd.read_csv('items.csv')
n_items = len(items)

item_to_index = dict(zip(items['item_id'], range(1, n_items + 1)))
# Index 0 is reserved: raw id 0 maps to it, and lookups use it as the default.
item_to_index[0] = 0
n_items

In [None]:
class AuctionDataset(torch.utils.data.Dataset):
    """Dataset of auction snapshots stored as one tensor file per (record, item).

    Each row of ``pairs`` supplies a ``record`` timestamp string in the format
    ``"%Y-%m-%d %H:%M:%S"`` and an ``item_id``. The sample is loaded from
    ``{path}/{dd-mm-YYYY}/{HH}/{item_id}.pt``. The last column of the stored
    tensor is returned as the target ``y``; the remaining columns, laid out as
    in ``column_map``, are normalised and returned as ``X``.

    Args:
        pairs: pandas DataFrame with ``record`` and ``item_id`` columns.
        item_to_index: dict mapping raw item ids to embedding indices; ids
            missing from the dict map to 0.
        path: root directory of the stored ``.pt`` files.
    """

    def __init__(self, pairs, item_to_index, path='data'):
        self.pairs = pairs
        # Not used by any method of this class; kept so existing code that
        # reads the attribute keeps working.
        self.time_left_to_int = {
            'VERY_LONG': 48,
            'LONG': 24,
            'MEDIUM': 12,
            'SHORT': 2
        }
        # Column layout of the feature part of each stored tensor.
        self.column_map = {
            'bid': 0,
            'buyout': 1,
            'quantity': 2,
            'item_id': 3,
            'time_left': 4,
            'hours_since_first_appearance': 5
        }
        self.item_to_index = item_to_index
        self.path = path

        print(f"Dataset size: {len(self)}")

    def __len__(self):
        """Number of (record, item_id) rows."""
        return len(self.pairs)

    def scale(self, column):
        """Robust-scale a 1-D float tensor: ``(x - median) / IQR``.

        When the inter-quartile range is below 1e-3 the column is standardised
        with mean / std instead. A single-element column (whose std is
        undefined) is mapped to zeros, and an empty column is returned as is.

        Returns:
            A tensor with the same shape and dtype as ``column``.
        """
        if column.numel() == 0:
            # torch.quantile raises on empty input; nothing to scale anyway.
            return column

        # torch.quantile requires q to have the same dtype as its input, so
        # build it from the column instead of relying on the float32 default.
        q = torch.tensor([0.25, 0.5, 0.75], dtype=column.dtype, device=column.device)
        q1, median, q3 = torch.quantile(column, q, dim=0)
        iqr = q3 - q1

        if iqr < 1e-3:
            if len(column) == 1:
                return torch.zeros_like(column)
            return (column - torch.mean(column)) / (torch.std(column) + 1e-6)
        else:
            return (column - median) / (q3 - q1 + 1e-6)

    def _lookup_item_index(self, raw_id):
        """Map one raw item id (a Python number) to its index; unknown ids map to 0."""
        try:
            key = int(raw_id)
        except (ValueError, OverflowError):
            # NaN / inf cannot be an item id; treat it as unknown.
            return 0
        return self.item_to_index.get(key, 0)

    def __getitem__(self, idx):
        """Load, normalise and return the ``(X, y)`` sample at row ``idx``.

        Assumes the stored tensor is 2-D floating point with the feature
        columns of ``column_map`` followed by one target column - TODO confirm
        against the code that writes the ``.pt`` files.
        """
        pair = self.pairs.iloc[idx]

        record = pair['record']
        item_id = pair['item_id']

        date_time_obj = datetime.strptime(record, "%Y-%m-%d %H:%M:%S")

        date_folder_name = date_time_obj.strftime("%d-%m-%Y")
        hour_folder_name = date_time_obj.strftime("%H")

        X = torch.load(f'{self.path}/{date_folder_name}/{hour_folder_name}/{item_id}.pt')

        y = X[:, -1]
        X = X[:, :-1]

        cols = self.column_map
        bid, buyout, quantity = cols['bid'], cols['buyout'], cols['quantity']
        item_col, time_left = cols['item_id'], cols['time_left']
        first_seen = cols['hours_since_first_appearance']

        X[:, bid] = X[:, bid] * 10000
        X[:, buyout] = X[:, buyout] * 10000

        # Convert to plain Python numbers before the dict lookup: a tensor
        # element hashes by object identity, so using it as the key never
        # matches an integer id and every item would collapse to index 0.
        indices = [self._lookup_item_index(raw_id) for raw_id in X[:, item_col].tolist()]
        X[:, item_col] = torch.tensor(indices, dtype=X.dtype, device=X.device)

        # 48 is the largest value in time_left_to_int.
        X[:, time_left] = X[:, time_left] / 48.0
        X[:, first_seen] = X[:, first_seen] / 48.0

        X[:, bid] = self.scale(X[:, bid])
        X[:, buyout] = self.scale(X[:, buyout])
        # Fixed divisor; presumably the maximum stack size - TODO confirm.
        X[:, quantity] = X[:, quantity] / 200.0

        return X, y

In [None]:
def collate_auctions(batch):
    """Zero-pad variable-length samples to a common length and stack them.

    Args:
        batch: sequence of ``(X, y)`` pairs; each ``X`` is indexed as
            (length, features) and each ``y`` as (length,).

    Returns:
        ``(X, y)`` stacked along a new leading batch dimension, with every
        sample padded with zeros at the end up to the longest ``X`` in the batch.
    """
    features, targets = zip(*batch)

    longest = max(sample.size(0) for sample in features)

    padded_features = []
    for sample in features:
        missing_rows = longest - sample.size(0)
        # The pad spec runs from the last dimension backwards: leave the
        # feature axis alone, append rows at the end of the length axis.
        padded_features.append(F.pad(sample, (0, 0, 0, missing_rows)))

    padded_targets = []
    for target in targets:
        padded_targets.append(F.pad(target, (0, longest - target.size(0))))

    return torch.stack(padded_features), torch.stack(padded_targets)

# Build the loader from the training split only: the full `pairs` table also
# contains every row of `val_pairs`.
train_dataset = AuctionDataset(train_pairs, item_to_index)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_auctions)

# Pull a single batch as a smoke test of the loading / collation path.
iter_loader = iter(train_loader)
X, y = next(iter_loader)

In [None]:
class Encoder(nn.Module):
    """Embeds the item-id column and runs an LSTM over the sequence.

    Args:
        input_size: number of feature columns left after the item-id column
            has been removed.
        item_index: position of the item-id column in the last dimension of
            the input.
        embedding_size: width of the item embedding.
        hidden_size: LSTM hidden size.
        dropout_p: dropout probability applied to the item embeddings.
        n_items: number of rows in the embedding table. When omitted it falls
            back to ``len(item_to_index)`` from the notebook namespace, which
            is what earlier versions always used.
    """

    def __init__(self, input_size=5, item_index=3, embedding_size=16, hidden_size=16, dropout_p=0.1, n_items=None):
        super(Encoder, self).__init__()

        self.hidden_size = hidden_size
        self.item_index = item_index
        if n_items is None:
            # Backward-compatible default: size the table from the global
            # item mapping defined earlier in the notebook.
            n_items = len(item_to_index)

        self.embedding = nn.Embedding(n_items, embedding_size)
        self.rnn = nn.LSTM(input_size + embedding_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, X):
        """Encode a batch indexed as (batch, sequence, features).

        Returns:
            ``(output, (hidden, cell))`` exactly as produced by ``nn.LSTM``.
        """
        item_ids = X[:, :, self.item_index].long()

        # Drop the item-id column from the numeric features; it is replaced
        # by its embedding below.
        X = torch.cat([X[:, :, :self.item_index], X[:, :, self.item_index + 1:]], dim=2)

        item_embeddings = self.dropout(self.embedding(item_ids))

        X = torch.cat([X, item_embeddings], dim=2)

        output, (hidden, cell) = self.rnn(X)

        return output, (hidden, cell)
    

class Decoder(nn.Module):
    """LSTM followed by an MLP head that emits one value per time step.

    Args:
        input_size: feature width of the sequence fed to the LSTM.
        hidden_size: LSTM hidden size, which is also the MLP input width.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        # The head ends in a ReLU, so every prediction is >= 0.
        head_layers = [
            nn.Linear(hidden_size, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.ReLU(),
        ]
        self.projection = nn.Sequential(*head_layers)

    def forward(self, encoder_outputs, encoder_hidden):
        """Run the LSTM from ``encoder_hidden`` over ``encoder_outputs`` and project.

        Returns:
            Tensor whose last dimension has size 1 (one prediction per step).
        """
        sequence_states, _final_state = self.rnn(encoder_outputs, encoder_hidden)
        return self.projection(sequence_states)
    
    
class AuctionPredictor(nn.Module):
    """Encoder/decoder pair producing one prediction per sequence position.

    The encoder's per-step outputs are the decoder's input sequence and the
    encoder's final (hidden, cell) state is the decoder LSTM's initial state.
    NOTE(review): that hand-off presumably requires ``decoder_hidden_size`` to
    equal ``encoder_hidden_size`` - confirm before using different sizes.
    """

    def __init__(self, input_size=5, encoder_hidden_size=16, decoder_hidden_size=16, item_index=3, embedding_size=16, dropout_p=0.1):
        super().__init__()
        self.encoder = Encoder(
            input_size=input_size,
            item_index=item_index,
            embedding_size=embedding_size,
            hidden_size=encoder_hidden_size,
            dropout_p=dropout_p,
        )
        self.decoder = Decoder(
            input_size=encoder_hidden_size,
            hidden_size=decoder_hidden_size,
        )

    def forward(self, X):
        """Encode ``X`` and decode the encoder states into predictions."""
        step_states, final_state = self.encoder(X)
        return self.decoder(step_states, final_state)

In [None]:
# Hyper-parameters.
embedding_size = 16
encoder_hidden_size = 32
decoder_hidden_size = 32
epochs = 1

# Train on the training split only. The full `pairs` table also contains
# every row of `val_pairs`, so training on it would leak the validation data
# into training and make the validation metrics meaningless.
train_dataset = AuctionDataset(train_pairs, item_to_index)
val_dataset = AuctionDataset(val_pairs, item_to_index)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True, collate_fn=collate_auctions)
# evaluate() only reads a capped number of batches, so shuffling makes each
# evaluation a different random subset of the validation split.
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=128, shuffle=True, collate_fn=collate_auctions)

model = AuctionPredictor(input_size=5, encoder_hidden_size=encoder_hidden_size, decoder_hidden_size=decoder_hidden_size, item_index=3, embedding_size=embedding_size).to(device)

# Total number of model parameters.
print(sum(p.numel() for p in model.parameters()))

optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
# Summed (not averaged) squared error over every position of the padded batch.
criterion = nn.MSELoss(reduction='sum')

In [None]:
import os

def save_checkpoint(model, optimizer, epoch, checkpoint_path='checkpoints'):
    """Write model and optimizer state for ``epoch`` to ``checkpoint_path``.

    The directory is created if needed and the file is named
    ``checkpoint_epoch_{epoch}.pt``. It holds a dict with the keys ``epoch``,
    ``model_state_dict`` and ``optimizer_state_dict``.
    """
    os.makedirs(checkpoint_path, exist_ok=True)

    state = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    checkpoint_file = os.path.join(checkpoint_path, f"checkpoint_epoch_{epoch}.pt")
    torch.save(state, checkpoint_file)

    print(f"Checkpoint saved at {checkpoint_file}")

def train(
    model,
    train_loader,
    val_loader,
    epochs,
    eval_steps,
    device,
    optimizer,
    criterion
):
    """Train ``model`` for ``epochs`` epochs with periodic logging and evaluation.

    Args:
        model: module called as ``model(X)``; its output is compared against
            ``y.unsqueeze(2)``.
        train_loader: iterable of ``(X, y)`` batches.
        val_loader: iterable of ``(X, y)`` batches passed to ``evaluate``.
        epochs: number of passes over ``train_loader``.
        eval_steps: run ``evaluate`` every this many iterations (including
            iteration 0). A falsy value (``0`` / ``None``) disables it.
        device: device the batches are moved to.
        optimizer: optimizer stepping the parameters of ``model``.
        criterion: loss called as ``criterion(prediction, target)``.

    A checkpoint is written after every 5th epoch (starting with epoch 0) and
    after the final epoch.
    """
    print("Starting training for", epochs, "epochs")

    for epoch in tqdm(range(epochs)):
        model.train()

        mse_losses = []
        mae_losses = []

        for i, (X, y) in enumerate(train_loader):
            X = X.to(device)
            y = y.to(device)
            target = y.unsqueeze(2)

            # Clear gradients before the backward pass so nothing left over
            # from earlier use of the model is accumulated into this step.
            optimizer.zero_grad()

            y_pred = model(X)

            loss = criterion(y_pred, target)
            loss.backward()
            optimizer.step()

            with torch.no_grad():
                mae = F.l1_loss(y_pred, target, reduction='sum')
                # Zero targets are treated as padding and not counted. Guard
                # the denominator so an all-zero batch logs 0 rather than NaN/inf.
                n = (y != 0).sum().item()
                mae = mae / max(n, 1)

            mae_losses.append(mae.item())
            mse_losses.append(loss.item())

            if i % 50 == 0:
                print(f"Epoch {epoch} Iteration {i} Loss {np.mean(mse_losses)} MAE {np.mean(mae_losses)}")
                mse_losses = []
                mae_losses = []

            # `eval_steps and ...` avoids a modulo-by-zero when evaluation is disabled.
            if eval_steps and i % eval_steps == 0:
                evaluate(model, val_loader, device, criterion)

        # Always keep the weights of the last epoch, even when it is not a
        # multiple of 5.
        if epoch % 5 == 0 or epoch == epochs - 1:
            save_checkpoint(model, optimizer, epoch)

@torch.no_grad()
def evaluate(
    model,
    val_loader,
    device,
    criterion,
    max_batches=100
):
    """Evaluate ``model`` on up to ``max_batches`` batches of ``val_loader``.

    Args:
        model: module called as ``model(X)``.
        val_loader: iterable of ``(X, y)`` batches.
        device: device the batches are moved to.
        criterion: loss called as ``criterion(prediction, y.unsqueeze(2))``.
        max_batches: maximum number of batches to read; ``None`` reads the
            whole loader. Defaults to 100, the previous hard-coded cap.

    Returns:
        ``(mean_loss, mean_mae)`` as Python floats, averaged over the batches
        read; both are NaN when the loader yields no batch.

    The model is put in eval mode for the duration of the call and afterwards
    restored to whichever mode it was in on entry.
    """
    print("Evaluating model")
    was_training = model.training
    model.eval()

    mse_losses = []
    mae_losses = []

    for i, (X, y) in enumerate(val_loader):
        X = X.to(device)
        y = y.to(device)
        target = y.unsqueeze(2)

        y_pred = model(X)

        loss = criterion(y_pred, target)

        mae = F.l1_loss(y_pred, target, reduction='sum')
        # Zero targets are treated as padding and not counted. Guard the
        # denominator so an all-zero batch does not produce NaN/inf.
        n = (y != 0).sum().item()
        mae = mae / max(n, 1)

        mse_losses.append(loss.item())
        mae_losses.append(mae.item())

        # Check after processing so no extra batch is fetched from the loader.
        if max_batches is not None and (i + 1) >= max_batches:
            break

    if mse_losses:
        val_loss = float(np.mean(mse_losses))
        val_mae = float(np.mean(mae_losses))
    else:
        # np.mean([]) would emit a RuntimeWarning; report NaN explicitly.
        val_loss = float('nan')
        val_mae = float('nan')

    print(f"Validation loss: {val_loss} MAE: {val_mae}")

    # Only switch back to train mode if that is how the model arrived.
    if was_training:
        model.train()

    return val_loss, val_mae
    
# Kick off training; evaluation on the validation loader runs every 300 iterations.
train(
    model=model,
    train_loader=train_dataloader,
    val_loader=val_dataloader,
    epochs=epochs,
    eval_steps=300,
    device=device,
    optimizer=optimizer,
    criterion=criterion,
)