In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import mysql.connector
import os
import datetime
from mysql.connector.pooling import MySQLConnectionPool, PooledMySQLConnection
from tqdm import tqdm
# Turn off pandas' chained-assignment check (no warning, no exception) for this session.
pd.options.mode.chained_assignment = None 

from sklearn.preprocessing import OrdinalEncoder, StandardScaler, RobustScaler
from sklearn.compose import make_column_transformer

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(device)

cuda


In [16]:
# Connection settings are read from the environment so credentials do not have
# to live in the notebook; the fallbacks reproduce the original local values.
# NOTE(review): the fallback password is a development placeholder - set
# AUCTION_DB_PASSWORD when pointing at any shared database.
conn = mysql.connector.connect(
  host=os.environ.get("AUCTION_DB_HOST", "localhost"),
  user=os.environ.get("AUCTION_DB_USER", "root"),
  password=os.environ.get("AUCTION_DB_PASSWORD", "password"),
  database=os.environ.get("AUCTION_DB_NAME", "auction_db")
)

In [None]:
# One-off setup: materialise per-auction statistics into the AuctionHours table.
# For every auction the inner query counts its ActionEvents rows
# (total_hours_on_sale) and takes the earliest record
# (first_appearance_timestamp); bid and buyout are stored divided by 10000.0.
# NOTE(review): not idempotent - the statements fail if the table or index
# already exists, so run this cell only once per database.
create_auction_hours_sql = """  
    CREATE TABLE AuctionHours AS
    SELECT 
        a.auction_id,
        a.item_id,
        a.bid / 10000.0 AS bid, 
        a.buyout / 10000.0 AS buyout, 
        a.quantity,
        h.first_appearance_timestamp,
        h.total_hours_on_sale
    FROM 
        Auctions a
    INNER JOIN (
        SELECT 
            ae.auction_id, 
            COUNT(ae.record) AS total_hours_on_sale, 
            MIN(ae.record) AS first_appearance_timestamp
        FROM 
            ActionEvents ae
        GROUP BY 
            ae.auction_id
    ) h 
    ON 
        a.auction_id = h.auction_id;
"""

# Index on item_id; the export query further down filters AuctionHours by item.
create_item_index_sql = """
    CREATE INDEX index_item
    ON AuctionHours (item_id);
"""

cursor = conn.cursor()

try:
    cursor.execute(create_auction_hours_sql)
    cursor.execute(create_item_index_sql)
finally:
    # Release the cursor even when one of the statements raises.
    cursor.close()

In [17]:
def randomly_delete_data(df, percentage, random_state=None):
    """Return a copy of ``df`` with a random ``percentage`` of its rows removed.

    Rows are chosen by position, so exactly the requested number of rows is
    removed even when ``df.index`` contains duplicate labels. The surviving
    rows keep their original order and index labels.

    Args:
        df: DataFrame to thin out. It is not modified.
        percentage: Share of rows to delete, between 0 and 100 inclusive.
        random_state: Optional seed for a reproducible selection. When left as
            ``None`` the global NumPy random state is used.

    Returns:
        A new DataFrame with ``int(len(df) * percentage / 100)`` rows removed.

    Raises:
        ValueError: If ``percentage`` is outside ``[0, 100]``.
    """
    if not (0 <= percentage <= 100):
        raise ValueError("Percentage must be between 0 and 100")

    # Multiply before dividing: ``100 * (29 / 100)`` is 28.999... in floating
    # point and would truncate to 28 instead of 29.
    num_rows_to_delete = int(len(df) * percentage / 100)

    if num_rows_to_delete == 0:
        return df.copy()

    rng = np.random if random_state is None else np.random.default_rng(random_state)

    # Sample row positions rather than index labels; dropping by label would
    # remove every row that shares a sampled label.
    positions_to_delete = rng.choice(len(df), size=num_rows_to_delete, replace=False)

    keep_mask = np.ones(len(df), dtype=bool)
    keep_mask[positions_to_delete] = False

    return df[keep_mask]

In [18]:
# Fetch every distinct (record, item_id) combination that appears in the
# auction events, keep a random subset, and persist it to auction_indices.csv.
pairs_sql = """  
    SELECT DISTINCT ae.record, a.item_id
    FROM Auctions a
    JOIN ActionEvents ae ON a.auction_id = ae.auction_id;       
"""

cursor = conn.cursor()

try:
    cursor.execute(pairs_sql)
    data = cursor.fetchall()
    headers = [column[0] for column in cursor.description]
finally:
    # Release the cursor even if the query or the fetch raises.
    cursor.close()

pairs = pd.DataFrame(data, columns=headers)

print(pairs.shape)
# NOTE(review): the recorded output of this cell keeps ~10% of the rows, which
# corresponds to percentage=90 rather than 95 - confirm the intended value.
pairs = randomly_delete_data(pairs, 95)
print(pairs.shape)

pairs.to_csv('auction_indices.csv', index=False)

(11137725, 2)
(1113773, 2)


In [25]:
# Reload the sampled (record, item_id) pairs from auction_indices.csv.
# No date parsing is requested, so `record` comes back as plain strings.
pairs = pd.read_csv('auction_indices.csv')
# Show the first rows as the cell output.
pairs.head()

Unnamed: 0,record,item_id
0,2024-01-02 00:00:00,754
1,2024-01-02 00:00:00,774
2,2024-01-02 00:00:00,785
3,2024-01-02 00:00:00,870
4,2024-01-02 00:00:00,1076


In [19]:
# Load the item catalogue and number its item ids 1..n_items in file order.
items = pd.read_csv('items.csv')
n_items = items.shape[0]

item_to_index = dict(zip(items['item_id'], range(1, n_items + 1)))
# Item id 0 is mapped to index 0 (presumably a padding / unknown slot - confirm).
item_to_index[0] = 0
n_items

10396

In [20]:
# Numeric value used for each auction `time_left` label
# (presumably a number of hours - confirm against the data source).
time_left_to_int = dict(VERY_LONG=48, LONG=24, MEDIUM=12, SHORT=2)

In [26]:
# Export one feature tensor per sampled (record, item_id) pair to
# data/<dd-mm-YYYY>/<HH>/<item_id>.pt. Pairs whose file already exists are
# skipped, so an interrupted run can be resumed by re-running the cell.
os.makedirs('data', exist_ok=True)

# Open a fresh connection for the long-running export. Settings are read from
# the environment; the fallbacks reproduce the original local values.
# NOTE(review): the fallback password is a development placeholder, and any
# connection previously bound to `conn` is not closed here.
conn = mysql.connector.connect(
    host=os.environ.get("AUCTION_DB_HOST", "localhost"),
    user=os.environ.get("AUCTION_DB_USER", "root"),
    password=os.environ.get("AUCTION_DB_PASSWORD", "password"),
    database=os.environ.get("AUCTION_DB_NAME", "auction_db")
)

cursor = conn.cursor()

date_format = "%Y-%m-%d %H:%M:%S"

# Column groups and query text do not change between iterations, so they are
# built once instead of once per pair.
numerical_columns = ['bid', 'buyout', 'quantity']
categorical_columns_ordinal = ['item_id', 'time_left', 'hours_since_first_appearance']
feature_columns = numerical_columns + categorical_columns_ordinal + ['hours_on_sale']

# All auctions of one item that have an event at one record timestamp, joined
# with their precomputed AuctionHours statistics. Parameters: (item_id, record).
snapshot_query = """
        SELECT ae.auction_id, 
               ae.record, 
               ae.time_left,
               bid, 
               buyout, 
               ah.quantity, 
               ah.item_id,
               ah.first_appearance_timestamp,
               CAST(TIMESTAMPDIFF(HOUR, ah.first_appearance_timestamp, ae.record) AS SIGNED) AS hours_since_first_appearance,
               ah.total_hours_on_sale
        FROM ActionEvents ae
        INNER JOIN (
            SELECT auction_id, item_id, bid, buyout, quantity, first_appearance_timestamp, total_hours_on_sale
            FROM AuctionHours ah
            WHERE ah.item_id = %s
        ) ah ON ah.auction_id = ae.auction_id 
        WHERE ae.record = %s;
    """

try:
    for record, item_id in tqdm(zip(pairs['record'], pairs['item_id']), total=len(pairs)):
        date_time_obj = datetime.datetime.strptime(record, date_format)

        date_folder_name = date_time_obj.strftime("%d-%m-%Y")
        hour_folder_name = date_time_obj.strftime("%H")

        output_dir = f'data/{date_folder_name}/{hour_folder_name}'
        output_path = f'{output_dir}/{item_id}.pt'

        os.makedirs(output_dir, exist_ok=True)

        if os.path.exists(output_path):
            print(f'Skipping {item_id} at {record}')
            continue

        cursor.execute(snapshot_query, (item_id, record))
        data = cursor.fetchall()

        if not data:
            continue

        headers = [column[0] for column in cursor.description]
        sample = pd.DataFrame(data, columns=headers)

        # Total event count minus hours elapsed since the first appearance.
        sample['hours_on_sale'] = sample['total_hours_on_sale'] - sample['hours_since_first_appearance']

        # Labels that are not keys of time_left_to_int become NaN in the tensor.
        sample['time_left'] = sample['time_left'].map(time_left_to_int)
        sample[numerical_columns] = sample[numerical_columns].astype(np.float32)

        X = sample[feature_columns].to_numpy()
        X = torch.tensor(X, dtype=torch.float32)

        # Save under a temporary name and rename afterwards: an interrupted
        # write must not leave a partial .pt file that the existence check
        # above would treat as finished on the next run.
        temp_path = f'{output_path}.tmp'
        torch.save(X, temp_path)
        os.replace(temp_path, output_path)
finally:
    # Release the cursor even when the export stops early.
    cursor.close()

100%|██████████| 1113773/1113773 [12:37:23<00:00, 24.51it/s]   


True