In [None]:
import sys
import os
from pathlib import Path

# Make the repository root (the parent of the notebook's working directory)
# importable so that `prediction_engine` resolves. `__file__` is not defined
# inside a notebook, so the current working directory is used instead.
wd = Path.cwd().parent.resolve()
# Guard the append so re-running this cell does not pile up duplicate entries.
if str(wd) not in sys.path:
    sys.path.append(str(wd))

import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm
from datetime import datetime, timedelta
from prediction_engine.model import AuctionPredictor

# Show every column and do not wrap wide frames when displaying DataFrames.
pd.options.display.max_columns = None
pd.options.display.width = None

# The evaluation is pinned to the CPU. Flip FORCE_CPU to False to let the
# notebook pick up a CUDA device when one is available.
FORCE_CPU = True
device = torch.device("cuda" if torch.cuda.is_available() and not FORCE_CPU else "cpu")
print(f"Using device: {device}")

In [None]:
# Item catalogue: one row per item, keyed by the `item_id` column.
items = pd.read_csv('../data/items.csv')
print("Items shape:", items.shape)

n_items = len(items)

# Catalogue items are numbered from 2 upwards. Indices 0 and 1 are reserved:
# 1 is the fallback used for item ids missing from the catalogue (see
# process_auction_data); 0 is presumably the padding index -- TODO confirm
# against the training code.
item_to_index = dict(zip(items['item_id'], range(2, n_items + 2)))
item_to_index.update({0: 0, 1: 1})
print(f"Number of unique items: {n_items}")

# Numeric value assigned to each `time_left` bucket (presumably hours -- the
# value is later divided by 48 to normalise it).
time_left_mapping = dict(VERY_LONG=48, LONG=12, MEDIUM=2, SHORT=0.5)

In [None]:
def load_auctions_from_sample(data_dir):
    """Load every auction snapshot found under ``data_dir``.

    Snapshot files are discovered recursively. The part of each file name
    before the first ``.`` must be a ``%Y%m%dT%H`` timestamp (for example
    ``20241012T00.json``); files that do not match are reported and skipped
    instead of aborting the whole load. Files are processed in chronological
    order.

    Args:
        data_dir: Root directory that is walked for snapshot files.

    Returns:
        A tuple ``(all_auctions, auction_appearances)`` where

        * ``all_auctions`` is a flat list of the auction dicts of every
          readable snapshot, each annotated in place with a ``'timestamp'``
          key formatted as ``%Y-%m-%d %H:%M:%S``;
        * ``auction_appearances`` maps an auction id to
          ``{'first': datetime, 'last': datetime}``, the earliest and latest
          snapshot in which that id was seen.
    """
    snapshot_times = {}
    for root, _dirs, files in os.walk(data_dir):
        for filename in files:
            try:
                snapshot_time = datetime.strptime(filename.split('.')[0], '%Y%m%dT%H')
            except ValueError:
                # Stray files (.DS_Store, README, ...) must not kill the load.
                print(f"Skipping {filename}: name does not match the %Y%m%dT%H snapshot pattern.")
                continue
            snapshot_times[os.path.join(root, filename)] = snapshot_time

    # Chronological order is required so that 'first' / 'last' are correct.
    ordered_snapshots = sorted(snapshot_times.items(), key=lambda entry: entry[1])

    all_auctions = []
    auction_appearances = {}

    # The progress bar wraps the JSON parsing, which is where the time goes.
    for filepath, timestamp in tqdm(ordered_snapshots):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                json_data = json.load(f)

            if 'auctions' not in json_data:
                print(f"File {filepath} does not contain 'auctions' key, skipping.")
                continue

            auction_data = json_data['auctions']
            # Identical for every auction in the file, so format it once.
            timestamp_str = timestamp.strftime("%Y-%m-%d %H:%M:%S")

            for auction in auction_data:
                auction_id = auction['id']
                auction['timestamp'] = timestamp_str

                if auction_id not in auction_appearances:
                    auction_appearances[auction_id] = {'first': timestamp, 'last': timestamp}
                else:
                    auction_appearances[auction_id]['last'] = timestamp

            all_auctions.extend(auction_data)
        except json.JSONDecodeError as e:
            print(f"Error loading file {filepath}: {e}")
            continue
        except Exception as e:
            # Best effort: one broken snapshot should not abort the whole run.
            print(f"Unexpected error loading file {filepath}: {e}")
            continue

    return all_auctions, auction_appearances

def process_auction_data(auctions, auction_appearances, prediction_time):
    """Build per-item feature rows for the auctions listed at ``prediction_time``.

    Only auctions whose ``'timestamp'`` equals ``prediction_time`` are used.
    The filter runs before any feature is computed, so auctions from other
    snapshots are never touched (and cannot fail on missing keys).

    Each feature row is
    ``[bid, buyout, quantity, item_index, time_left, hours_since_first_appearance]``
    where bid/buyout are ``log1p(value / 10000) / 15``, quantity is divided by
    200, and the two time features are divided by 48. These constants
    presumably have to match the ones used at training time -- TODO confirm.

    Relies on the notebook-level ``item_to_index`` and ``time_left_mapping``.

    Args:
        auctions: Iterable of auction dicts carrying a ``'timestamp'`` string
            in ``%Y-%m-%d %H:%M:%S`` format.
        auction_appearances: Mapping ``auction_id -> {'first': datetime,
            'last': datetime}``.
        prediction_time: ``datetime`` of the snapshot to evaluate.

    Returns:
        ``(auctions_by_item, auction_ids_by_item, hours_on_sale)``:

        * ``auctions_by_item``: item index -> list of feature rows;
        * ``auction_ids_by_item``: item index -> auction ids, aligned with the
          rows of ``auctions_by_item``;
        * ``hours_on_sale``: auction id -> hours between ``prediction_time``
          and the last snapshot in which the auction was seen.
    """
    auctions_by_item = {}
    auction_ids_by_item = {}
    hours_on_sale = {}
    hours_since_first_appearance_values = []

    for auction in auctions:
        timestamp = datetime.strptime(auction['timestamp'], "%Y-%m-%d %H:%M:%S")
        if timestamp != prediction_time:
            continue

        auction_id = auction['id']
        # Items missing from the catalogue share the fallback index 1.
        item_index = item_to_index.get(auction['item']['id'], 1)
        appearance = auction_appearances[auction_id]

        hours_since_first_appearance = (prediction_time - appearance['first']).total_seconds() / 3600
        hours_since_first_appearance_values.append(hours_since_first_appearance)
        hours_on_sale[auction_id] = (appearance['last'] - prediction_time).total_seconds() / 3600

        processed_auction = [
            np.log1p(auction['bid'] / 10000.0) / 15.0,
            np.log1p(auction['buyout'] / 10000.0) / 15.0,
            auction['quantity'] / 200.0,
            item_index,
            # Unknown time_left labels fall back to 0.
            time_left_mapping.get(auction['time_left'], 0) / 48.0,
            hours_since_first_appearance / 48.0,
        ]

        auctions_by_item.setdefault(item_index, []).append(processed_auction)
        auction_ids_by_item.setdefault(item_index, []).append(auction_id)

    if hours_since_first_appearance_values:
        print(f"Hours since first appearance statistics: Min: {min(hours_since_first_appearance_values)}, Max: {max(hours_since_first_appearance_values)}, Mean: {np.mean(hours_since_first_appearance_values)}")

    return auctions_by_item, auction_ids_by_item, hours_on_sale

# Snapshot for which predictions are evaluated.
prediction_time = datetime.strptime("2024-10-12 00:00:00", "%Y-%m-%d %H:%M:%S")

data_dir = '../data/sample/'
auction_data, auction_appearances = load_auctions_from_sample(data_dir)
auctions_by_item, auction_ids_by_item, hours_on_sale = process_auction_data(auction_data, auction_appearances, prediction_time)
print(f"Processed auctions for {len(auctions_by_item)} different items.")

# Show one example of each structure. Guarded because nothing is produced when
# no snapshot matches prediction_time, and indexing [0] would then raise.
if auctions_by_item:
    first_item_auctions = next(iter(auctions_by_item.values()))
    print(f"Example of processed auctions for an item: {first_item_auctions[0]}")
    print(f"Example of hours_on_sale for an auction: {next(iter(hours_on_sale.items()))}")
else:
    print(f"No auctions found for snapshot {prediction_time}; check data_dir and prediction_time.")

In [None]:
# Architecture hyperparameters. They have to match the ones the checkpoint was
# trained with, otherwise load_state_dict below fails on mismatched shapes.
# NOTE(review): feature rows have 6 entries while input_size is 5; presumably
# the column at item_index=3 is routed through the embedding -- confirm in
# prediction_engine.model.
model_config = dict(
    n_items=len(item_to_index),
    input_size=5,
    encoder_hidden_size=1024,
    decoder_hidden_size=1024,
    item_index=3,
    embedding_size=512,
    dropout_p=0.2,
    bidirectional=False,
)
model = AuctionPredictor(**model_config).to(device)

n_params = sum(p.numel() for p in model.parameters())
print(f'Number of model parameters: {n_params}')

# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
model_path = '../models/checkpoint_epoch_1_iter_10336.pt'
checkpoint = torch.load(model_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

# Inference mode: disables dropout.
model.eval()
print('Pre-trained RNN model loaded successfully.')

In [None]:
def evaluate_rnn_with_inspection_and_mae(model, auctions_by_item, auction_ids_by_item, hours_on_sale, prediction_time):
    """Run the model item by item and pair each prediction with its target.

    For every item the feature rows are stacked into a float32 tensor of shape
    ``(1, n_auctions, n_features)`` and passed to ``model(X, lengths)`` under
    ``torch.no_grad()``. The output is flattened to one value per auction, so
    both ``(1, n_auctions, 1)`` and ``(1, n_auctions)`` outputs are accepted,
    including items with a single auction. Items whose flattened output does
    not contain exactly one value per auction are reported and skipped.

    Relies on the notebook-level ``device``.

    Args:
        model: Callable taking ``(X, lengths)`` and returning a tensor.
        auctions_by_item: item index -> list of feature rows.
        auction_ids_by_item: item index -> auction ids aligned with the rows.
        hours_on_sale: auction id -> target value; missing ids count as 0.
        prediction_time: Unused; kept so existing callers keep working.

    Returns:
        ``(all_predictions, all_actual_values)`` as two flat lists of equal
        length, or ``(None, None)`` when nothing could be evaluated.
    """
    all_predictions = []
    all_actual_values = []

    print(f"Total number of items: {len(auctions_by_item)}")
    print(f"Total number of auctions: {sum(len(auctions) for auctions in auctions_by_item.values())}")

    if hours_on_sale:
        print(f"Hours on sale statistics: Min: {min(hours_on_sale.values())}, Max: {max(hours_on_sale.values())}, Mean: {np.mean(list(hours_on_sale.values()))}")
    else:
        print("No 'hours_on_sale' data to calculate statistics.")

    for item_idx, auctions in auctions_by_item.items():
        if not auctions:
            continue

        # One batch per item: (1, n_auctions, n_features).
        X = torch.tensor(np.array(auctions), dtype=torch.float32).to(device).unsqueeze(0)
        lengths = torch.tensor([X.size(1)], dtype=torch.long)

        with torch.no_grad():
            predictions = model(X, lengths)

        # Flatten instead of squeezing: a squeeze can collapse a single-auction
        # item to a 0-d tensor, which cannot be iterated by list.extend.
        flat_predictions = predictions.reshape(-1)

        auction_ids = auction_ids_by_item.get(item_idx, [])
        actual_values = [hours_on_sale.get(auction_id, 0) for auction_id in auction_ids]

        if flat_predictions.numel() == len(actual_values):
            all_predictions.extend(flat_predictions.cpu().numpy())
            all_actual_values.extend(actual_values)
        else:
            print(f"Skipping item {item_idx} due to size mismatch: {flat_predictions.numel()} predictions vs {len(actual_values)} actual values.")

    if not all_predictions:
        print("No valid auctions were processed. Check your data.")
        return None, None

    print(f"Number of predictions: {len(all_predictions)}")
    print(f"Number of actual values: {len(all_actual_values)}")
    print(f"Predictions statistics: Min: {min(all_predictions)}, Max: {max(all_predictions)}, Mean: {np.mean(all_predictions)}")
    print(f"Actual values statistics: Min: {min(all_actual_values)}, Max: {max(all_actual_values)}, Mean: {np.mean(all_actual_values)}")

    return all_predictions, all_actual_values

def calculate_mae(all_predictions, all_actual_values):
    """Return the mean absolute error between predictions and targets.

    Prints a notice and returns ``None`` when either sequence is empty;
    otherwise both are converted to NumPy arrays and handed to
    ``mean_absolute_error`` with the targets as the first argument.
    """
    has_data = len(all_predictions) > 0 and len(all_actual_values) > 0
    if not has_data:
        print("No valid data for MAE calculation.")
        return None

    predicted = np.array(all_predictions)
    actual = np.array(all_actual_values)
    return mean_absolute_error(actual, predicted)

# Score the checkpoint on the selected snapshot.
all_predictions, all_actual_values = evaluate_rnn_with_inspection_and_mae(
    model, auctions_by_item, auction_ids_by_item, hours_on_sale, prediction_time
)

if all_predictions is None or all_actual_values is None:
    # The evaluation returned (None, None): nothing could be scored.
    print('No predictions were made.')
else:
    rnn_mae = calculate_mae(all_predictions, all_actual_values)
    if rnn_mae is None:
        print('Evaluation failed due to lack of valid data.')
    else:
        print(f'RNN Model MAE: {rnn_mae}')

In [None]:
# One row per evaluated auction: target, model prediction and absolute error.
df = pd.DataFrame(
    dict(target=all_actual_values, prediction=all_predictions)
).assign(error=lambda frame: (frame['target'] - frame['prediction']).abs())

df.sample(10)

In [None]:
# Summary statistics of the target, prediction and error columns.
df.describe()

In [None]:
# Up to 10 random rows with a target above 30. The sample size is capped at the
# subset size because sample() raises when asked for more rows than exist.
long_targets = df.query('target > 30')
long_targets.sample(min(10, len(long_targets)))

In [None]:
# Distribution of the targets (hours_on_sale values, in hours).
ax = df['target'].hist()
ax.set_title('Target distribution')
ax.set_xlabel('target (hours)')
ax.set_ylabel('number of auctions');

In [None]:
# Distribution of the model outputs.
ax = df['prediction'].hist()
ax.set_title('Prediction distribution')
ax.set_xlabel('prediction')
ax.set_ylabel('number of auctions');

In [None]:
# Distribution of |target - prediction|.
ax = df['error'].hist()
ax.set_title('Absolute error distribution')
ax.set_xlabel('absolute error')
ax.set_ylabel('number of auctions');

In [None]:
# Side-by-side box plots of targets and predictions. The boxes are labelled
# explicitly; by default they would only be numbered 1 and 2.
fig, ax = plt.subplots()
ax.boxplot(df[['target', 'prediction']])
ax.set_xticklabels(['target', 'prediction'])
ax.set_title('Target vs prediction')
ax.grid()
plt.show()

In [None]:
# Up to 10 random rows with a target of at most 10. The sample size is capped
# at the subset size because sample() raises when asked for more rows than exist.
short_targets = df[df.target <= 10]
short_targets.sample(min(10, len(short_targets)))

In [None]:
# Mean absolute error restricted to rows whose target is at least 30.
df.loc[df['target'] >= 30, 'error'].mean()

In [None]:
# Up to 10 random rows with a target of at least 35. The sample size is capped
# at the subset size because sample() raises when asked for more rows than exist.
very_long_targets = df[df.target >= 35]
very_long_targets.sample(min(10, len(very_long_targets)))

In [None]:
# Persist the per-auction results. The output directory is created first
# because to_csv does not create missing parent directories.
output_path = Path('../generated/predictions.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path)