In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import mysql.connector
import os
from mysql.connector.pooling import MySQLConnectionPool, PooledMySQLConnection
from tqdm import tqdm
# Disable pandas' chained-assignment (SettingWithCopy) check for the whole notebook.
pd.options.mode.chained_assignment = None 

from sklearn.preprocessing import OrdinalEncoder, StandardScaler, RobustScaler
from sklearn.compose import make_column_transformer

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# Connection settings are read from the environment so real credentials never
# have to be committed with the notebook. The fallbacks reproduce the previous
# hard-coded local development values.
conn = mysql.connector.connect(
    host=os.environ.get("AUCTION_DB_HOST", "localhost"),
    user=os.environ.get("AUCTION_DB_USER", "root"),
    password=os.environ.get("AUCTION_DB_PASSWORD", "password"),
    database=os.environ.get("AUCTION_DB_NAME", "auction_db"),
)

In [None]:
# Materialise one row per auction with its first appearance and the number of
# records it appeared in. The cell is safe to re-run: the table and its index
# are only created when the table does not exist yet.
cursor = conn.cursor()

try:
    cursor.execute("SHOW TABLES LIKE 'AuctionHours'")
    # fetchall() (not fetchone()) so no unread result is left on the connection.
    auction_hours_exists = bool(cursor.fetchall())

    if not auction_hours_exists:
        cursor.execute("""  
            CREATE TABLE AuctionHours AS
            SELECT 
                a.auction_id,
                a.item_id,
                a.bid / 10000.0 AS bid, 
                a.buyout / 10000.0 AS buyout, 
                a.quantity,
                h.first_appearance_timestamp,
                h.total_hours_on_sale
            FROM 
                Auctions a
            INNER JOIN (
                SELECT 
                    ae.auction_id, 
                    COUNT(ae.record) AS total_hours_on_sale, 
                    MIN(ae.record) AS first_appearance_timestamp
                FROM 
                    ActionEvents ae
                GROUP BY 
                    ae.auction_id
            ) h 
            ON 
                a.auction_id = h.auction_id;
        """)

        # The per-sample queries filter AuctionHours by item_id.
        cursor.execute("""
            CREATE INDEX index_item
            ON AuctionHours (item_id);
        """)
finally:
    # Release the cursor even if one of the statements fails.
    cursor.close()

In [None]:
# Export every distinct (record, item_id) combination; each one identifies a
# single training sample (all auctions of one item at one record timestamp).
cursor = conn.cursor()

try:
    # ORDER BY makes the exported row order deterministic. Without it the order
    # is unspecified, and the unshuffled train/validation split that follows
    # would depend on whatever order the server happened to return.
    cursor.execute("""  
        SELECT DISTINCT ae.record, a.item_id
        FROM Auctions a
        JOIN ActionEvents ae ON a.auction_id = ae.auction_id
        ORDER BY ae.record, a.item_id;
    """)

    data = cursor.fetchall()

    headers = [column[0] for column in cursor.description]
finally:
    # Release the cursor even if the query fails.
    cursor.close()

pairs = pd.DataFrame(data, columns=headers)
pairs.to_csv('auction_indices.csv', index=False)

In [None]:
# Reload the (record, item_id) pairs from the CSV written by the export cell.
pairs = pd.read_csv('auction_indices.csv')
pairs.head()

In [None]:
from sklearn.model_selection import train_test_split

# Keep the row order (no shuffling): the first 80% of the pairs are used for
# training and the trailing 20% are held out for validation.
train_pairs, val_pairs = train_test_split(
    pairs,
    test_size=0.2,
    shuffle=False,
    random_state=42,
)

print(train_pairs.shape, val_pairs.shape)

In [None]:
def get_random_sample():
    """Fetch the auctions of one randomly chosen (record, item_id) pair.

    Draws one row from the notebook-level ``pairs`` frame and queries, through
    the notebook-level ``conn``, every auction of that item that is present at
    that record timestamp.

    Returns:
        pandas.DataFrame with the selected columns plus ``hours_on_sale``
        (``total_hours_on_sale - hours_since_first_appearance``). ``record``
        and ``first_appearance_timestamp`` are converted to datetimes.
    """
    pair = pairs.sample(1).iloc[0]

    # Convert to plain Python types before binding: the values may be numpy
    # scalars, which are not reliably accepted as query parameters.
    item_id = int(pair['item_id'])
    record = str(pair['record'])

    cursor = conn.cursor()

    try:
        # Values are bound as parameters instead of being formatted into the SQL.
        cursor.execute("""
            SELECT ae.auction_id, 
            ae.record, 
            ae.time_left,
            bid / 10000.0 as bid, 
            buyout / 10000.0 as buyout, 
            ah.quantity, 
            ah.item_id,
            ah.first_appearance_timestamp,
            CAST(TIMESTAMPDIFF(HOUR, ah.first_appearance_timestamp, ae.record) AS SIGNED) AS hours_since_first_appearance,
            ah.total_hours_on_sale
            FROM ActionEvents ae
            INNER JOIN (
                SELECT auction_id, item_id, bid, buyout, quantity, first_appearance_timestamp, total_hours_on_sale
                FROM AuctionHours ah
                WHERE ah.item_id = %s
            ) ah ON ah.auction_id = ae.auction_id 
            WHERE ae.record = %s;
        """, (item_id, record))

        data = cursor.fetchall()

        headers = [column[0] for column in cursor.description]
    finally:
        # Release the cursor even if the query fails.
        cursor.close()

    sample = pd.DataFrame(data, columns=headers)

    sample['first_appearance_timestamp'] = pd.to_datetime(sample['first_appearance_timestamp'])
    sample['record'] = pd.to_datetime(sample['record'])

    # Regression target: record count of the auction minus the hours already elapsed.
    sample['hours_on_sale'] = sample['total_hours_on_sale'] - sample['hours_since_first_appearance']

    return sample

# Pull one random sample and show its median prices plus the first rows.
sample = get_random_sample()
print(f"Median buyout: {sample['buyout'].median()}")
print(f"Median bid: {sample['bid'].median()}")
    
sample.head(5)

In [None]:
items = pd.read_csv('items.csv')
n_items = len(items)

# Item ids are numbered from 1 in file order; key 0 maps to index 0, which is
# also the index used for ids missing from this table.
item_to_index = dict(zip(items['item_id'], range(1, n_items + 1)))
item_to_index[0] = 0
n_items

In [None]:
class AuctionDataset(torch.utils.data.Dataset):
    """Dataset of auction snapshots, one per (record, item_id) pair.

    Each item is the set of auctions of one item present at one record
    timestamp, returned as ``(X, y)``:

    * ``X`` -- float32 tensor with one row per auction and the columns
      ``bid``, ``buyout``, ``quantity`` (scaled with a RobustScaler fitted on
      this snapshot only), the item's index from ``item_to_index``,
      ``time_left`` / 48 and ``hours_since_first_appearance`` / 48.
    * ``y`` -- float32 tensor with one value per auction: ``hours_on_sale``
      (``total_hours_on_sale - hours_since_first_appearance``).

    Args:
        conn: open database connection exposing ``cursor()``.
        pairs: DataFrame whose first column is the record timestamp and whose
            second column is the item id.
        item_to_index: mapping from item id to embedding index; ids that are
            missing from the mapping fall back to index 0.
    """

    # Columns that are scaled per snapshot.
    NUMERICAL_COLUMNS = [
        'bid',
        'buyout',
        'quantity'
    ]

    # Columns appended after the scaled ones, in this order.
    PASSTHROUGH_COLUMNS = [
        'item_id',
        'time_left',
        'hours_since_first_appearance'
    ]

    # The item id and record timestamp are bound as parameters (in that order)
    # instead of being formatted into the statement.
    SNAPSHOT_QUERY = """
        SELECT ae.auction_id, 
        ae.record, 
        ae.time_left,
        bid / 10000.0 as bid, 
        buyout / 10000.0 as buyout, 
        ah.quantity, 
        ah.item_id,
        ah.first_appearance_timestamp,
        CAST(TIMESTAMPDIFF(HOUR, ah.first_appearance_timestamp, ae.record) AS SIGNED) AS hours_since_first_appearance,
        ah.total_hours_on_sale
        FROM ActionEvents ae
        INNER JOIN (
            SELECT auction_id, item_id, bid, buyout, quantity, first_appearance_timestamp, total_hours_on_sale
            FROM AuctionHours ah
            WHERE ah.item_id = %s
        ) ah ON ah.auction_id = ae.auction_id 
        WHERE ae.record = %s;
    """

    def __init__(self, conn, pairs, item_to_index):
        self.conn = conn
        self.pairs = pairs
        # Maps the categorical time_left value to a number of hours.
        self.time_left_to_int = {
            'VERY_LONG': 48,
            'LONG': 24,
            'MEDIUM': 12,
            'SHORT': 2
        }
        self.item_to_index = item_to_index

        print(f"Dataset size: {len(self)}")

    def __len__(self):
        """Number of (record, item_id) pairs."""
        return len(self.pairs)

    def _fetch_sample(self, record, item_id):
        """Query all auctions of ``item_id`` at ``record`` and add ``hours_on_sale``."""
        cursor = self.conn.cursor()
        try:
            # Plain Python types: the values may be numpy scalars coming from pandas.
            cursor.execute(self.SNAPSHOT_QUERY, (int(item_id), str(record)))
            data = cursor.fetchall()
            headers = [column[0] for column in cursor.description]
        finally:
            # Release the cursor even if the query fails.
            cursor.close()

        sample = pd.DataFrame(data, columns=headers)
        sample['hours_on_sale'] = sample['total_hours_on_sale'] - sample['hours_since_first_appearance']
        return sample

    def __getitem__(self, idx):
        """Return the ``(X, y)`` tensors of the pair at position ``idx``."""
        pair = self.pairs.iloc[idx]
        # Positional access: column 0 is the record timestamp, column 1 the item id.
        sample = self._fetch_sample(pair.iloc[0], pair.iloc[1])

        # Work on a copy so the column assignments below never write into a
        # view of `sample`.
        X = sample[self.NUMERICAL_COLUMNS + self.PASSTHROUGH_COLUMNS].copy()
        y = sample['hours_on_sale']

        X['time_left'] = X['time_left'].map(self.time_left_to_int) / 48.0
        X['item_id'] = X['item_id'].map(lambda x: self.item_to_index.get(x, 0))
        X['hours_since_first_appearance'] = X['hours_since_first_appearance'] / 48.0

        # The scaler is fitted on this snapshot only; the remaining columns are
        # appended unchanged after the scaled ones.
        column_transformer = make_column_transformer(
            (RobustScaler(), self.NUMERICAL_COLUMNS),
            remainder='passthrough'
        )

        X = column_transformer.fit_transform(X)
        y = np.array(y)

        X = torch.tensor(X, dtype=torch.float32)
        y = torch.tensor(y, dtype=torch.float32)

        return X, y

In [None]:
def collate_auctions(batch):
    """Collate variable-length ``(X, y)`` samples into zero-padded batch tensors.

    Every sample is padded with zeros at the end of its first dimension up to
    the longest sample in the batch, and the padded samples are stacked along
    a new leading batch dimension.
    """
    features, targets = zip(*batch)

    padded_features = nn.utils.rnn.pad_sequence(list(features), batch_first=True)
    padded_targets = nn.utils.rnn.pad_sequence(list(targets), batch_first=True)

    return padded_features, padded_targets

# Smoke test: pull a single padded batch through the loader.
# The training dataset is built from train_pairs (not the full `pairs` frame)
# so that the held-out validation pairs never end up in a training batch.
train_dataset = AuctionDataset(conn, train_pairs, item_to_index)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_auctions)

iter_loader = iter(train_loader)
X, y = next(iter_loader)

In [None]:
class Encoder(nn.Module):
    """Embeds the item index of every auction and runs an LSTM over the snapshot.

    Args:
        input_size: number of feature columns, excluding the item-index column.
        item_index: position of the item-index column in the last dimension of
            the input.
        embedding_size: size of the item embedding.
        hidden_size: LSTM hidden size.
        dropout_p: dropout probability applied to the item embeddings.
        n_items: number of rows of the embedding table. When omitted it falls
            back to ``len(item_to_index)``, the notebook-level mapping.
    """

    def __init__(self, input_size=5, item_index=3, embedding_size=16, hidden_size=16, dropout_p=0.1, n_items=None):
        super().__init__()

        if n_items is None:
            # Previous behaviour: size the table from the notebook-level mapping.
            n_items = len(item_to_index)

        self.hidden_size = hidden_size
        self.item_index = item_index

        self.embedding = nn.Embedding(n_items, embedding_size)
        self.rnn = nn.LSTM(input_size + embedding_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, X):
        """Encode ``X`` (indexed as batch, sequence, feature).

        Returns:
            ``(output, (hidden, cell))`` exactly as produced by ``nn.LSTM``.
        """
        # The item index travels as a float column; cast it back for the lookup.
        item_ids = X[:, :, self.item_index].long()

        # Remove the item-index column from the numeric features.
        X = torch.cat([X[:, :, :self.item_index], X[:, :, self.item_index + 1:]], dim=2)

        item_embeddings = self.dropout(self.embedding(item_ids))

        X = torch.cat([X, item_embeddings], dim=2)

        output, (hidden, cell) = self.rnn(X)

        return output, (hidden, cell)
    

class Decoder(nn.Module):
    """LSTM over the encoder outputs followed by an MLP producing one value per step."""

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)

        # hidden_size -> 64 -> 32 -> 1, with a ReLU after every linear layer
        # (including the last one).
        head_layers = [
            nn.Linear(hidden_size, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.ReLU(),
        ]
        self.projection = nn.Sequential(*head_layers)

    def forward(self, encoder_outputs, encoder_hidden):
        """Decode the encoder outputs, starting from the encoder's final state."""
        decoded_steps, _ = self.rnn(encoder_outputs, encoder_hidden)
        return self.projection(decoded_steps)
    
    
class AuctionPredictor(nn.Module):
    """Encoder/decoder model producing one prediction per auction in a snapshot.

    The encoder's final LSTM state is used as the decoder's initial state, so
    both hidden sizes have to be equal.

    Args:
        input_size: number of feature columns, excluding the item-index column.
        encoder_hidden_size: hidden size of the encoder LSTM.
        decoder_hidden_size: hidden size of the decoder LSTM.
        item_index: position of the item-index column in the input features.
        embedding_size: size of the item embedding.
        dropout_p: dropout probability applied to the item embeddings.

    Raises:
        ValueError: if ``encoder_hidden_size`` and ``decoder_hidden_size`` differ.
    """

    def __init__(self, input_size=5, encoder_hidden_size=16, decoder_hidden_size=16, item_index=3, embedding_size=16, dropout_p=0.1):
        super().__init__()

        # Fail early with a clear message instead of a shape error deep inside
        # the decoder LSTM on the first forward pass.
        if encoder_hidden_size != decoder_hidden_size:
            raise ValueError(
                "encoder_hidden_size and decoder_hidden_size must be equal because the "
                "encoder state initialises the decoder LSTM "
                f"(got {encoder_hidden_size} and {decoder_hidden_size})"
            )

        self.encoder = Encoder(input_size, item_index, embedding_size, encoder_hidden_size, dropout_p)
        self.decoder = Decoder(encoder_hidden_size, decoder_hidden_size)

    def forward(self, X):
        """Return the decoder output for the batch ``X``."""
        encoder_outputs, encoder_hidden = self.encoder(X)
        decoder_outputs = self.decoder(encoder_outputs, encoder_hidden)
        return decoder_outputs

In [None]:
embedding_size = 16
encoder_hidden_size = 32
decoder_hidden_size = 32
epochs = 1

# Train on train_pairs only: building the training set from the full `pairs`
# frame would include the validation pairs and make the validation metrics
# meaningless.
train_dataset = AuctionDataset(conn, train_pairs, item_to_index)
val_dataset = AuctionDataset(conn, val_pairs, item_to_index)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_auctions)
# Validation is not shuffled: evaluation stops after a fixed number of batches,
# so a fixed order makes successive evaluations score the same samples.
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_auctions)

model = AuctionPredictor(input_size=5, encoder_hidden_size=encoder_hidden_size, decoder_hidden_size=decoder_hidden_size, item_index=3, embedding_size=embedding_size).to(device)

# Total number of model parameters.
print(sum(p.numel() for p in model.parameters()))

optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
criterion = nn.MSELoss(reduction='sum')

In [None]:
import os

def save_checkpoint(model, optimizer, epoch, checkpoint_path='checkpoints'):
    """Write the model and optimizer state of ``epoch`` into ``checkpoint_path``.

    The directory is created when missing and the file is named after the epoch.
    """
    os.makedirs(checkpoint_path, exist_ok=True)

    state = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }

    checkpoint_file = os.path.join(checkpoint_path, f"checkpoint_epoch_{epoch}.pt")
    torch.save(state, checkpoint_file)

    print(f"Checkpoint saved at {checkpoint_file}")

def train(
    model,
    train_loader,
    val_loader,
    epochs,
    eval_steps,
    device,
    optimizer,
    criterion
):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Args:
        model: module mapping a padded batch ``X`` to one prediction per step.
        train_loader: iterable of padded ``(X, y)`` batches.
        val_loader: loader handed to ``evaluate``.
        epochs: number of passes over ``train_loader``.
        eval_steps: ``evaluate`` runs on every iteration whose index is a
            multiple of this value (including iteration 0).
        device: device the batches are moved to.
        optimizer: optimizer updating ``model``'s parameters.
        criterion: loss called as ``criterion(prediction, target)``.

    The running loss and MAE are printed every 50 iterations. A checkpoint is
    written every 5th epoch and after the final epoch.
    """
    print("Starting training for", epochs, "epochs")

    for epoch in tqdm(range(epochs)):
        model.train()

        mse_losses = []
        mae_losses = []

        for i, (X, y) in enumerate(train_loader):
            X = X.to(device)
            target = y.to(device).unsqueeze(2)

            # Clear gradients before the forward pass so that gradients left
            # over from an interrupted earlier run cannot leak into this step.
            optimizer.zero_grad()

            y_pred = model(X)

            # NOTE(review): the loss is still computed over every position,
            # including zero-padded ones -- confirm whether that is intended.
            loss = criterion(y_pred, target)
            loss.backward()
            optimizer.step()

            with torch.no_grad():
                # Targets equal to 0 are treated as padding (same convention as
                # the original count): only the remaining positions contribute
                # to the error sum and to the count it is divided by.
                mask = target != 0
                n = mask.sum().item()
                abs_error = (y_pred.detach() - target).abs()
                # max(n, 1) avoids a division by zero on an all-padding batch.
                mae = (abs_error * mask).sum().item() / max(n, 1)

            mae_losses.append(mae)
            mse_losses.append(loss.item())

            if i % 50 == 0:
                print(f"Epoch {epoch} Iteration {i} Loss {np.mean(mse_losses)} MAE {np.mean(mae_losses)}")
                mse_losses = []
                mae_losses = []

            if i % eval_steps == 0:
                evaluate(model, val_loader, device, criterion)

        # Also save after the last epoch so the final weights are never lost.
        if epoch % 5 == 0 or epoch == epochs - 1:
            save_checkpoint(model, optimizer, epoch)

@torch.no_grad()
def evaluate(
    model,
    val_loader,
    device,
    criterion,
    max_batches=100
):
    """Score ``model`` on at most ``max_batches`` batches of ``val_loader``.

    Args:
        model: module mapping a padded batch ``X`` to one prediction per step.
        val_loader: iterable of padded ``(X, y)`` batches.
        device: device the batches are moved to.
        criterion: loss called as ``criterion(prediction, target)``.
        max_batches: stop after this many batches; ``None`` uses all of them.

    Returns:
        ``(mean_loss, mean_mae)`` over the evaluated batches, or
        ``(nan, nan)`` when the loader yields no batch. The MAE ignores
        positions whose target is 0, which are treated as padding.
    """
    print("Evaluating model")
    # Remember the current mode so it can be restored afterwards.
    was_training = model.training
    model.eval()

    mse_losses = []
    mae_losses = []

    for i, (X, y) in enumerate(val_loader):
        X = X.to(device)
        target = y.to(device).unsqueeze(2)

        y_pred = model(X)

        loss = criterion(y_pred, target)

        # Only non-padded positions contribute to the error sum and the count.
        mask = target != 0
        n = mask.sum().item()
        abs_error = (y_pred - target).abs()
        # max(n, 1) avoids a division by zero on an all-padding batch.
        mae = (abs_error * mask).sum().item() / max(n, 1)

        mse_losses.append(loss.item())
        mae_losses.append(mae)

        if max_batches is not None and i + 1 >= max_batches:
            break

    if mse_losses:
        mean_loss = float(np.mean(mse_losses))
        mean_mae = float(np.mean(mae_losses))
    else:
        # Empty loader: report NaN instead of averaging an empty list.
        mean_loss = float('nan')
        mean_mae = float('nan')

    print(f"Validation loss: {mean_loss} MAE: {mean_mae}")
    model.train(was_training)

    return mean_loss, mean_mae
    
# Launch training; validation runs on every 300th iteration.
train(
    model=model,
    train_loader=train_dataloader,
    val_loader=val_dataloader,
    epochs=epochs,
    eval_steps=300,
    device=device,
    optimizer=optimizer,
    criterion=criterion,
)