In [32]:
import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np
import os
import json
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm
from datetime import datetime, timedelta
from auction_predictor import AuctionPredictor
from auction_dataset import AuctionDataset

pd.options.display.max_columns = None
pd.options.display.width = None

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [33]:
items = pd.read_csv('../data/items.csv')
print("Items shape:", items.shape)
n_items = len(items)
item_to_index = {item_id: i + 2 for i, item_id in enumerate(items['item_id'])}
item_to_index[0] = 0 
item_to_index[1] = 1  
print(f"Number of unique items: {n_items}")

historical_prices_path = '../data/historical_prices.csv'
if not os.path.exists(historical_prices_path):
    historical_prices_path = 'historical_prices.csv'

try:
    historical_prices = pd.read_csv(historical_prices_path)
    historical_prices['datetime'] = pd.to_datetime(historical_prices['datetime'])
    print('Historical prices loaded successfully.')
except FileNotFoundError:
    print(f'Error: The historical prices file {historical_prices_path} was not found.')
    historical_prices = pd.DataFrame(columns=['item_id', 'datetime', 'price'])

if historical_prices.empty:
    print("Warning: historical_prices is empty. This may cause issues later.")

historical_prices['week_start'] = historical_prices['datetime'].dt.to_period('W').apply(lambda r: r.start_time)
weekly_historical_prices = historical_prices.groupby(['item_id', 'week_start'])['price'].mean().reset_index()
weekly_historical_prices['datetime'] = weekly_historical_prices['week_start'].astype(str)
weekly_historical_prices.set_index(['item_id', 'datetime'], inplace=True)

time_left_mapping = {
    'VERY_LONG': 48,
    'LONG': 12,
    'MEDIUM': 2,
    'SHORT': 0.5
}

Items shape: (10396, 13)
Number of unique items: 10396
Historical prices loaded successfully.


In [34]:
def load_auctions_from_sample(data_dir='sample/'):
    file_info = {}
    auction_appearances = {}

    for root, dirs, files in os.walk(data_dir):
        for filename in tqdm(files):
            filepath = os.path.join(root, filename)
            date = datetime.strptime(filename.split('.')[0], '%Y%m%dT%H')
            file_info[filepath] = date

    file_info = {k: v for k, v in sorted(file_info.items(), key=lambda item: item[1])}
    
    all_auctions = []
    
    for filepath in list(file_info.keys()):
        with open(filepath, 'r') as f:
            try:
                json_data = json.load(f)
                
                if 'auctions' not in json_data:
                    print(f"File {filepath} does not contain 'auctions' key, skipping.")
                    continue
                
                auction_data = json_data['auctions']
                
                if not auction_data:
                    print(f"File {filepath} is empty, skipping.")
                    continue
                
                timestamp = file_info[filepath]
                for auction in auction_data:
                    auction_id = auction['id']
                    auction['timestamp'] = timestamp.strftime("%Y-%m-%d %H:%M:%S")
                    
                    if auction_id not in auction_appearances:
                        auction_appearances[auction_id] = {'first': timestamp, 'last': timestamp}
                    else:
                        auction_appearances[auction_id]['last'] = timestamp
                
                all_auctions.extend(auction_data)
            except json.JSONDecodeError as e:
                print(f"Error loading file {filepath}: {e}")
                continue
            except Exception as e:
                print(f"Unexpected error loading file {filepath}: {e}")
                continue

    return all_auctions, auction_appearances

def process_auction_data(auctions, auction_appearances, prediction_time, max_auctions_per_item=1000):
    auctions_by_item = {}
    hours_on_sale = {}
    
    for auction in auctions:
        if not isinstance(auction, dict) or 'item' not in auction or 'id' not in auction['item']:
            print(f"Unexpected structure in auction: {auction}")
            continue

        auction_id = auction['id']
        item_id = auction['item']['id']
        time_left_numeric = time_left_mapping.get(auction['time_left'], 0)
        bid = auction['bid'] * 10000 / 1000
        buyout = auction['buyout'] * 10000 / 1000
        quantity = auction['quantity'] / 200
        time_left = time_left_numeric / 48
        item_index = item_to_index.get(item_id, 1)
        
        timestamp = datetime.strptime(auction['timestamp'], "%Y-%m-%d %H:%M:%S")
        hours_on_sale[auction_id] = (auction_appearances[auction_id]['last'] - auction_appearances[auction_id]['first']).total_seconds() / 3600
        hours_since_first_appearance = (prediction_time - auction_appearances[auction_id]['first']).total_seconds() / 3600
        
        week_start = (prediction_time - timedelta(days=prediction_time.weekday())).strftime("%Y-%m-%d")
        if (item_id, week_start) in weekly_historical_prices.index:
            historical_price = weekly_historical_prices.loc[(item_id, week_start), 'price']
        else:
            historical_price = buyout
        
        processed_auction = [
            bid, 
            buyout,  
            quantity, 
            item_index,
            time_left, 
            hours_since_first_appearance,
            historical_price  
        ]
        
        if item_index not in auctions_by_item:
            auctions_by_item[item_index] = []
        
        if len(auctions_by_item[item_index]) < max_auctions_per_item:
            auctions_by_item[item_index].append(processed_auction)
    
    return auctions_by_item, hours_on_sale

data_dir = 'sample/'
prediction_time = datetime.strptime("2024-08-25 00:00:00", "%Y-%m-%d %H:%M:%S")
auction_data, auction_appearances = load_auctions_from_sample(data_dir)
auctions_by_item, hours_on_sale = process_auction_data(auction_data, auction_appearances, prediction_time)

print(f"Processed auctions for {len(auctions_by_item)} different items.")
print(f"Example of processed auctions for an item: {auctions_by_item[list(auctions_by_item.keys())[0]][0]}")
print(f"Example of hours_on_sale for an auction: {list(hours_on_sale.items())[0]}")

0it [00:00, ?it/s]
100%|██████████| 24/24 [00:00<00:00, 40853.61it/s]
100%|██████████| 24/24 [00:00<00:00, 31418.01it/s]


Error loading file sample/26-08-2024/20240826T22.json: Expecting value: line 1 column 1 (char 0)
Processed auctions for 1817 different items.
Example of processed auctions for an item: [23844810.0, 25099790.0, 0.005, 1, 0.010416666666666666, 0.0, np.float64(283.13385810810814)]
Example of hours_on_sale for an auction: (1360771519, 2.0)


In [35]:
embedding_size = 64
encoder_hidden_size = 128
decoder_hidden_size = 128
epochs = 10

model = AuctionPredictor(
    n_items=n_items,             
    input_size=6,                   
    encoder_hidden_size=encoder_hidden_size,
    decoder_hidden_size=decoder_hidden_size,
    item_index=3,                   
    embedding_size=embedding_size,
    dropout_p=0.1,
    bidirectional=False
).to(device)

print(f'Number of model parameters: {sum(p.numel() for p in model.parameters())}')

model_path = 'models/rnn_model.pt'
if not os.path.exists(model_path):
    model_path = '../eval/models/rnn_model.pt'  

try:
    checkpoint = torch.load(model_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()  
    print('Pre-trained RNN model loaded successfully.')
except FileNotFoundError:
    print(f'Error: The model file {model_path} was not found.')
except Exception as e:
    print(f'An error occurred while loading the model: {str(e)}')

Number of model parameters: 1174401
An error occurred while loading the model: Error(s) in loading state_dict for AuctionPredictor:
	size mismatch for encoder.embedding.weight: copying a param with shape torch.Size([10398, 64]) from checkpoint, the shape in current model is torch.Size([10396, 64]).


  checkpoint = torch.load(model_path, map_location=device)


In [38]:
def evaluate_rnn_with_consistent_length(model, auctions_by_item, hours_on_sale, prediction_time):
    all_predictions = []
    all_actual_values = []

    print(f"Total number of items: {len(auctions_by_item)}")
    print(f"Total number of auctions: {sum(len(auctions) for auctions in auctions_by_item.values())}")
    
    if hours_on_sale:
        print(f"Hours on sale statistics: Min: {min(hours_on_sale.values())}, Max: {max(hours_on_sale.values())}, Mean: {np.mean(list(hours_on_sale.values()))}")
    else:
        print("No 'hours_on_sale' data to calculate statistics.")

    for item_idx, auctions in auctions_by_item.items():
        if not auctions: 
            continue
        auctions_np = np.array(auctions)
        X = torch.tensor(auctions_np, dtype=torch.float32).to(device)
        X = X.unsqueeze(0) 
        with torch.no_grad():
            predictions = model(X)

        auction_ids = [auction[0] for auction in auctions]
        actual_values = [hours_on_sale.get(auction_id, 0) for auction_id in auction_ids]

        if len(predictions.squeeze(0)) == len(actual_values):  
            all_predictions.extend(predictions.squeeze(0).cpu().numpy())
            all_actual_values.extend(actual_values)
        else:
            print(f"Skipping item {item_idx} due to size mismatch: {len(predictions.squeeze(0))} predictions vs {len(actual_values)} actual values.")

    if not all_predictions:
        print("No valid auctions were processed. Check your data.")
        return None, None
    
    print(f"Number of predictions: {len(all_predictions)}")
    print(f"Number of actual values: {len(all_actual_values)}")
    print(f"Predictions statistics: Min: {min(all_predictions)}, Max: {max(all_predictions)}, Mean: {np.mean(all_predictions)}")
    print(f"Actual values statistics: Min: {min(all_actual_values)}, Max: {max(all_actual_values)}, Mean: {np.mean(all_actual_values)}")

    return all_predictions, all_actual_values

In [39]:
def calculate_mae(all_predictions, all_actual_values):
    if len(all_predictions) == 0 or len(all_actual_values) == 0:
        print("No valid data for MAE calculation.")
        return None
    all_predictions = np.array(all_predictions)
    all_actual_values = np.array(all_actual_values)
    mae = mean_absolute_error(all_actual_values, all_predictions)
    return mae

all_predictions, all_actual_values = evaluate_rnn_with_consistent_length(model, auctions_by_item, hours_on_sale, prediction_time)

if all_predictions is not None and all_actual_values is not None:
    rnn_mae = calculate_mae(all_predictions, all_actual_values)
    if rnn_mae is not None:
        print(f'RNN Model MAE: {rnn_mae}')
    else:
        print('Evaluation failed due to lack of valid data.')
else:
    print('No predictions were made.')

Total number of items: 1817
Total number of auctions: 256596
Hours on sale statistics: Min: 0.0, Max: 47.0, Mean: 19.541915316851377
Number of predictions: 256596
Number of actual values: 256596
Predictions statistics: Min: [-0.3113299], Max: [43.916637], Mean: 8.294195175170898
Actual values statistics: Min: 0, Max: 0, Mean: 0.0
RNN Model MAE: 8.294197472924338
