# Transformer Evaluation

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import os
from pathlib import Path

# Put the parent of the notebook's working directory on sys.path so the `src`
# package can be imported. `__file__` is not defined in a notebook kernel, so
# the working directory is taken from Path.cwd() directly.
wd = Path.cwd().parent.resolve()
if str(wd) not in sys.path:  # re-running this cell must not stack duplicate entries
    sys.path.append(str(wd))

import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm
from datetime import datetime, timedelta
from src.models.auction_transformer import AuctionTransformer
from src.models.inference import predict_dataframe

# Clear the column-count cap and the fixed display width for DataFrame output.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)

# Use the GPU when PyTorch reports one as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [42]:
# Reference timestamp passed to the loading and inference helpers.
prediction_time = datetime.strptime("2025-04-01 00:00:00", "%Y-%m-%d %H:%M:%S")

mappings_dir = '../generated/mappings'

# Read the four JSON lookup tables in a fixed order, then unpack them by name.
mapping_files = (
    'item_to_idx.json',
    'context_to_idx.json',
    'bonus_to_idx.json',
    'modtype_to_idx.json',
)
loaded_mappings = []
for mapping_file in mapping_files:
    with open(os.path.join(mappings_dir, mapping_file), 'r') as f:
        loaded_mappings.append(json.load(f))
item_to_idx, context_to_idx, bonus_to_idx, modtype_to_idx = loaded_mappings

feature_stats = torch.load('../generated/feature_stats.pt')

# time_left label -> numeric value (presumably hours; confirm against
# load_auctions_from_sample, which consumes this mapping).
time_left_mapping = dict(VERY_LONG=48, LONG=12, MEDIUM=2, SHORT=0.5)

In [None]:
from src.data.utils import load_auctions_from_sample

data_dir = '../data/sample/'

# Build the auctions frame from the sample directory, handing the helper the
# reference time, the time_left mapping and the four lookup tables.
df_auctions = load_auctions_from_sample(
    data_dir,
    prediction_time,
    time_left_mapping,
    item_to_idx,
    context_to_idx,
    bonus_to_idx,
    modtype_to_idx,
)

print(f"Auctions shape: {df_auctions.shape}")
df_auctions.head()

In [None]:
# Summary statistics for the two hour columns.
hour_columns = ['current_hours', 'hours_on_sale']
df_auctions[hour_columns].describe()

In [None]:
# Restore the checkpoint onto the selected device and switch to eval mode.
checkpoint_path = '../models/auction_transformer_40M/last.ckpt'
model = AuctionTransformer.load_from_checkpoint(checkpoint_path, map_location=device)

n_params = sum(p.numel() for p in model.parameters())
print(f'Number of model parameters: {n_params}')
model.eval()
print('Pre-trained Transformer model loaded successfully.')

In [None]:
from src.models.inference import predict_dataframe

# Run inference on the device chosen at the top of the notebook; a hard-coded
# 'cuda' raises on machines without a GPU even though `device` falls back to CPU.
model = model.to(device)
df_auctions = predict_dataframe(model, df_auctions, prediction_time, feature_stats)

# Quick sanity check: compare the mean target with the mean model outputs.
print("Mean hours on sale:", df_auctions['hours_on_sale'].mean())
print("Mean prediction:", df_auctions['prediction'].mean())
print("Mean sale probability:", df_auctions['sale_probability'].mean())

df_auctions.head()

In [None]:
# Evaluation subset: auctions with current_hours <= 0 whose time_left exceeds 12.
# .copy() makes the subset an independent frame, so later cells can add columns
# to it without writing into a slice of df_auctions.
recent_mask = (df_auctions['current_hours'] <= 0) & (df_auctions['time_left'] > 12.0)
df_auctions_12h = df_auctions[recent_mask].copy()
# A bare `len(...)` in the middle of a cell is discarded, so print the count instead.
print(f"Auctions evaluated: {len(df_auctions_12h)}")

mae = mean_absolute_error(df_auctions_12h['hours_on_sale'], df_auctions_12h['prediction'])
print(f"Mean absolute error: {mae}")

In [None]:
# Show a few random rows. The fixed seed keeps the displayed rows stable across
# re-runs, and the cap avoids an error if the frame has fewer than five rows.
df_auctions.sample(min(5, len(df_auctions)), random_state=42)

In [10]:
# Export the evaluation subset (without the index column) to an Excel workbook.
predictions_path = '../generated/predictions.xlsx'
df_auctions_12h.to_excel(predictions_path, index=False)

### Binary classification

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Ground truth "sold": the auction was on sale for at most 12 hours.
# Predicted "sold": the model's sale probability reaches the 0.70 threshold.
# assign() returns a new frame, so no column is written into a filtered slice
# (which would trigger pandas' SettingWithCopy warning).
df_auctions_12h = df_auctions_12h.assign(
    sold_gt=df_auctions_12h['hours_on_sale'] <= 12,
    sold_pred=df_auctions_12h['sale_probability'] >= 0.70,
)

sold_gt = df_auctions_12h['sold_gt']
sold_pred = df_auctions_12h['sold_pred']

accuracy = accuracy_score(sold_gt, sold_pred)
precision = precision_score(sold_gt, sold_pred)
recall = recall_score(sold_gt, sold_pred)
f1 = f1_score(sold_gt, sold_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1}")

## Error analysis

In [None]:
# Columns kept for the error analysis: raw auction fields, timing fields and
# the model outputs.
columns = [
    'item_index', 'bid', 'buyout', 'quantity', 'time_left',
    'first_appearance', 'last_appearance',
    'current_hours', 'hours_on_sale',
    'prediction', 'sale_probability',
]

# Work on an independent copy and attach the absolute prediction error.
df_error = df_auctions[columns].copy()
df_error['error'] = (df_error['hours_on_sale'] - df_error['prediction']).abs()

df_error.head()

In [None]:
# Histogram (10 bins) of the time_left values in the error-analysis frame.
df_error['time_left'].hist(bins=10)

In [None]:
# Side-by-side box plots of the ground truth and the model prediction.
boxplot_columns = ['hours_on_sale', 'prediction']
plt.boxplot(df_error[boxplot_columns])
# Boxes are drawn at positions 1..N by default; name them so the figure can be
# read without looking at the code.
plt.xticks([1, 2], boxplot_columns)
plt.title("Hours on sale vs. prediction")
plt.grid()
plt.show()

In [None]:
# Bins for hours_on_sale, as (start, end) pairs.
bins = [(0, 12), (12, 24), (24, 48)]

# One pass per bin: print the mean error and collect the errors for the boxplot.
hours = df_error['hours_on_sale']
error_by_bin = []
labels = []
for position, (start, end) in enumerate(bins):
    # Intervals are (start, end]; only the first bin also includes its lower
    # edge. With both edges inclusive, a row at exactly 12 or 24 hours would be
    # counted in two adjacent bins.
    above_start = (hours >= start) if position == 0 else (hours > start)
    mask = above_start & (hours <= end)
    bin_errors = df_error.loc[mask, 'error']

    mean_error = bin_errors.mean()
    print(f"Mean error for hours {start}-{end}: {mean_error:.2f}")

    error_by_bin.append(bin_errors)
    labels.append(f"{start}-{end}h")

# Boxplot of the error distribution in each bin.
# NOTE: Matplotlib 3.9 renamed boxplot's `labels` parameter to `tick_labels`;
# `labels` is kept here to match the Matplotlib version this notebook targets.
plt.boxplot(error_by_bin, labels=labels)
plt.title("Error Distribution by Hours on Sale")
plt.ylabel("Absolute Error")
plt.xlabel("Hours on Sale Range")
plt.grid()
plt.show()

In [None]:
# Overlay the distributions of the ground truth and the prediction.
overlay_series = (
    ('hours_on_sale', 'Hours on sale'),
    ('prediction', 'Prediction'),
)
for column_name, legend_label in overlay_series:
    plt.hist(df_error[column_name], bins=100, alpha=0.5, label=legend_label)
plt.legend(loc='upper right')
plt.grid()
plt.show()

In [None]:
# Distribution of current_hours across the error-analysis frame.
current_hours_values = df_error['current_hours']
plt.hist(current_hours_values, bins=15)
plt.grid()
plt.show()

We are mainly interested in how the model performs on recently listed items, because that is its primary use case.

In [None]:
# Recently listed auctions: current_hours of at most 12 and a time_left of 48.
query = (df_error['current_hours'] <= 12) & (df_error['time_left'] == 48.0)
query_df = df_error[query]
print(f"Mean sale probability: {query_df['sale_probability'].mean()}")
print(f"Mean error: {query_df['error'].mean()}")
print(f"Mean hours on sale: {query_df['hours_on_sale'].mean()}")
# sample() raises when asked for more rows than exist, so cap the request.
# The fixed seed keeps the displayed rows stable across re-runs.
query_df.sample(min(10, len(query_df)), random_state=42)

In [None]:
# Distribution of hours_on_sale among the recently listed auctions.
query_df['hours_on_sale'].hist(bins=10)
# Series.hist already enables the grid, and a bare plt.grid() *toggles* it,
# which would switch it back off. Request it explicitly instead.
plt.grid(True)
plt.show()

In [None]:
# Pairwise correlations between the numeric inputs, the targets and the error.
corr_columns = [
    'bid', 'buyout', 'quantity', 'time_left', 'current_hours',
    'sale_probability', 'hours_on_sale', 'prediction', 'error',
]
corr_matrix = df_error[corr_columns].corr()

plt.figure(figsize=(10, 8))

# Diverging palette centred on zero and pinned to the full [-1, 1] range.
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    fmt='.2f',
    square=True,
)
sns.heatmap(corr_matrix, **heatmap_options)

plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Evaluation on the validation set

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import h5py
import sys
from tqdm import tqdm
from datetime import datetime
pd.options.mode.chained_assignment = None

from pathlib import Path

# Make the repository root (the parent of the working directory) importable.
repo_root = Path.cwd().parent.resolve()
# The same directory may already have been added by an earlier cell or a
# previous run of this one; avoid stacking duplicate sys.path entries.
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

from sklearn.model_selection import train_test_split
from src.data.auction_dataset import AuctionDataset

# Select the GPU when PyTorch can see one, otherwise the CPU, and report the choice.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

In [None]:
# Load the auction index table and keep rows whose two *_max columns are below 50.
pairs = pd.read_csv('../generated/auction_indices.csv')
within_limits = (pairs['g_hours_on_sale_max'] < 50) & (pairs['g_current_hours_max'] < 50)
pairs = pairs[within_limits]

# Ordered (unshuffled) split: the last 15% of rows become the validation set.
train_pairs, val_pairs = train_test_split(pairs, test_size=0.15, random_state=42, shuffle=False)

print(f"Before filtering: {len(train_pairs)}")

# Keep rows whose g_hours_on_sale_len is at most 32.
max_group_len = 32
train_pairs = train_pairs[train_pairs['g_hours_on_sale_len'] <= max_group_len]
val_pairs = val_pairs[val_pairs['g_hours_on_sale_len'] <= max_group_len]

print(f"After filtering: {len(train_pairs)}\n")

# Keep validation records strictly before this timestamp.
val_pairs = val_pairs[val_pairs['record'] < '2025-03-23 00:00:00']

print(f"Train pairs: {len(train_pairs)}")
print(f"Val pairs: {len(val_pairs)}")

In [None]:
# Narrow the validation set to a single record timestamp.
eval_record = '2025-03-20 00:00:00'
val_pairs = val_pairs.loc[val_pairs['record'] == eval_record]
val_pairs.head()

In [22]:
import json
import os

mappings_dir = '../generated/mappings'

# Read the four JSON lookup tables in a fixed order, then unpack them by name.
mapping_files = (
    'item_to_idx.json',
    'context_to_idx.json',
    'bonus_to_idx.json',
    'modtype_to_idx.json',
)
loaded_mappings = []
for mapping_file in mapping_files:
    with open(os.path.join(mappings_dir, mapping_file), 'r') as f:
        loaded_mappings.append(json.load(f))
item_to_idx, context_to_idx, bonus_to_idx, modtype_to_idx = loaded_mappings

feature_stats = torch.load('../generated/feature_stats.pt')

In [None]:
# Show the last rows of the filtered validation pairs.
val_pairs.tail()

In [None]:
from src.models.auction_transformer import AuctionTransformer

# Restore the checkpoint onto the selected device and switch to eval mode.
checkpoint_path = '../models/auction_transformer_7.2M_128b_wpos_ch_filtered/last-v1.ckpt'
model = AuctionTransformer.load_from_checkpoint(checkpoint_path, map_location=device)

n_params = sum(p.numel() for p in model.parameters())
print(f'Number of model parameters: {n_params}')
model.eval()
print('Pre-trained Transformer model loaded successfully.')

In [None]:
from src.data.auction_dataset import AuctionDataset
from src.data.utils import collate_auctions

batch_size = 32

# Dataset over the validation pairs, read from the HDF5 sequence file.
val_dataset = AuctionDataset(
    val_pairs,
    feature_stats=feature_stats,
    path='../generated/sequences.h5',
)
# Fixed order (no shuffling); batches are assembled by collate_auctions.
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_auctions,
)

In [None]:
# Evaluate the model on the validation set.
# Squared and absolute errors are accumulated as plain sums over the selected
# positions and divided once at the end, so every selected position carries the
# same weight regardless of which batch it came from.
total_mse = 0.0
total_mae = 0.0
total_samples = 0.0

model.eval()  # loop-invariant, so set once before iterating
with torch.no_grad():
    for batch in tqdm(val_dataloader):
        (auctions, item_index, contexts, bonus_lists, modifier_types, modifier_values, current_hours), y = batch

        auctions = auctions.to(device)
        item_index = item_index.to(device)
        contexts = contexts.to(device)
        bonus_lists = bonus_lists.to(device)
        modifier_types = modifier_types.to(device)
        modifier_values = modifier_values.to(device)
        current_hours = current_hours.to(device)
        y = y.to(device)

        y_hat = model((auctions, item_index, contexts, bonus_lists, modifier_types, modifier_values))
        target = y.unsqueeze(2)

        # Score only positions with item_index != 0 (index 0 is presumably
        # padding; confirm against collate_auctions) and current_hours <= 12.
        mask = (item_index != 0).float().unsqueeze(-1)
        current_hours_mask = (current_hours <= 12.0).float().unsqueeze(-1)
        mask = mask * current_hours_mask

        # reduction='sum' is required here: the default 'mean' already divides
        # by the total element count, so dividing by mask.sum() afterwards
        # would normalise twice and under-report the MSE.
        total_mse += torch.nn.functional.mse_loss(
            y_hat * mask,
            target * mask,
            reduction='sum'
        ).item()
        # The MAE is computed on values scaled by 48.0; the MSE stays in the
        # model's output units.
        total_mae += torch.nn.functional.l1_loss(
            y_hat * mask * 48.0,
            target * mask * 48.0,
            reduction='sum'
        ).item()
        total_samples += mask.sum().item()

# No selected positions at all means the metrics are undefined, not zero.
if total_samples > 0:
    avg_mse = total_mse / total_samples
    avg_mae = total_mae / total_samples
else:
    avg_mse = avg_mae = float('nan')
print(f'Validation MSE: {avg_mse}')
print(f'Validation MAE: {avg_mae}')

In [None]:
from src.models.inference import predict_dataframe

# Run inference on the rows of a single item to inspect its predictions.
item_of_interest = 13815
item_rows = df_auctions[df_auctions['item_index'] == item_of_interest]
predict_dataframe(model, item_rows, prediction_time, feature_stats)