SQLite Database Initialization
This section initializes the SQLite database by creating necessary tables such as Items, Auctions, and ActionEvents. It ensures the existence of the required tables and sets up the database schema.

File and Configuration Setup
Additionally, it scans the `sample/` data directory and builds a chronologically sorted list of the snapshot files that the subsequent cells process.

In [None]:
!rm auction.db

In [None]:
import sqlite3
import os

# Path of the SQLite file created by this cell.
db_path = 'auction.db'
conn = sqlite3.connect(db_path)

# Create the three tables if they are missing. The try/finally guarantees the
# connection is released even when one of the statements fails.
try:
    cursor = conn.cursor()

    # One row per item, keyed by item_id.
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS Items (
            item_id INT PRIMARY KEY,
            item_name TEXT,
            quality TEXT,
            item_level INT,
            required_level INT,
            item_class TEXT,
            item_subclass TEXT,
            purchase_price_gold INT,
            purchase_price_silver INT,
            sell_price_gold INT,
            sell_price_silver INT,
            max_count INT,
            is_stackable INT
        )
    ''')

    # One row per auction, keyed by auction_id; item_id refers to Items but is
    # not declared as a foreign key.
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS Auctions (
            auction_id INT PRIMARY KEY,
            bid INT,
            buyout INT,
            quantity INT,
            time_left TEXT,
            item_id INT
        )
    ''')

    # One row per (auction_id, record) pair.
    # NOTE: SQLite only enforces the FOREIGN KEY clause when
    # `PRAGMA foreign_keys = ON` is set on the connection, which this
    # notebook does not do.
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS ActionEvents (
            auction_id INT,
            record DATETIME,
            PRIMARY KEY (auction_id, record),
            FOREIGN KEY (auction_id) REFERENCES Auctions(auction_id)
        )
    ''')

    conn.commit()
finally:
    conn.close()

In [None]:
from datetime import datetime
from tqdm import tqdm

# Map every file found under data_dir to the timestamp encoded in the part of
# its name before the first dot (format %Y%m%dT%H, e.g. "20240131T07").
file_info = {}
data_dir = 'sample/'

for root, dirs, files in os.walk(data_dir):
    for filename in tqdm(files):
        filepath = os.path.join(root, filename)
        try:
            date = datetime.strptime(filename.split('.')[0], '%Y%m%dT%H')
        except ValueError:
            # Files that do not follow the naming scheme (hidden files,
            # editor checkpoints, ...) are skipped instead of aborting the scan.
            print(f"Skipping file with unexpected name: {filepath}")
            continue

        file_info[filepath] = date

# Re-build the dict in chronological order so that `filenames` lists the
# snapshots from oldest to newest.
file_info = dict(sorted(file_info.items(), key=lambda item: item[1]))
filenames = list(file_info)

MySQL Items Data Retrieval
In this part, the script retrieves data from a MySQL database. It reads the MySQL connection settings from a JSON configuration file, establishes a connection, and fetches all rows of the Items table. The retrieved rows are kept in memory until they are written to SQLite.

SQLite Database Update
After retrieving the data, the script connects to the SQLite database and writes the fetched rows into the Items table, replacing any existing row that has the same `item_id`.

In [None]:
import json
import os
from datetime import datetime
from tqdm import tqdm
import sqlite3

# Load every snapshot listed in `filenames` (built by the file-discovery cell)
# into the Auctions and ActionEvents tables.
db_path = 'auction.db'
data_dir = 'sample/'
db = sqlite3.connect(db_path)
cursor = db.cursor()

# try/finally so the cursor and connection are released even if a snapshot has
# an unexpected shape and the loop aborts.
try:
    # Wrap the list itself (not the enumerate object) so tqdm knows the total.
    for i, filepath in enumerate(tqdm(filenames)):
        try:
            with open(filepath, "r") as snapshot_file:
                data = json.load(snapshot_file)
        except (FileNotFoundError, json.JSONDecodeError) as e:
            print(f"Error reading file {filepath}: {e}")
            continue

        # Take the timestamp from the name of the file being processed. The
        # previous code parsed a stale `filename` variable left over from
        # another cell, which stamped every snapshot with the same time.
        snapshot_name = os.path.basename(filepath).split('.')[0]
        auction_record = datetime.strptime(snapshot_name, "%Y%m%dT%H")
        record_text = auction_record.strftime('%Y-%m-%d %H:%M:%S')

        # NOTE(review): only the first entry of `filenames` populates Auctions;
        # auctions that first show up in later snapshots get ActionEvents rows
        # but no Auctions row. Presumably intentional -- confirm.
        if i == 0:
            auction_ids = set()
            auctions_data = []

            # Keep the first occurrence of each auction id; a duplicate would
            # violate the primary key and fail the whole batch.
            for auction in data["auctions"]:
                if auction["id"] not in auction_ids:
                    auctions_data.append((auction["id"], auction["bid"], auction["buyout"], auction["quantity"], auction["time_left"], auction["item"]["id"]))
                    auction_ids.add(auction["id"])

            try:
                cursor.executemany("""
                    INSERT INTO Auctions (auction_id, bid, buyout, quantity, time_left, item_id)
                    VALUES (?, ?, ?, ?, ?, ?)
                """, auctions_data)
                db.commit()
            except sqlite3.Error as err:
                db.rollback()
                print(f"Error inserting auction data for file {filepath} in Auctions: {err}")

        # One event per auction present in this snapshot.
        action_events_data = [(auction["id"], record_text) for auction in data["auctions"]]

        try:
            cursor.executemany("""
                INSERT OR REPLACE INTO ActionEvents (auction_id, record)
                VALUES (?, ?)
            """, action_events_data)
            db.commit()
        except sqlite3.Error as err:
            db.rollback()
            print(f"Error inserting auction events for file {filepath} in ActionEvents: {err}")
finally:
    cursor.close()
    db.close()

Auction Data and Items Storage
This section processes JSON files containing auction data. It iterates through the files, extracts relevant information, and inserts it into the Auctions and ActionEvents tables of the SQLite database.

Data Import from Pandas DataFrame
Moreover, it opens a connection to the SQLite database and writes the rows fetched from the MySQL Items table into the SQLite Items table, ensuring the SQLite Items table is up-to-date.

In [None]:
import pandas as pd
import mysql.connector
import json
import sqlite3

# Settings are read from a JSON file; its 'database' entry is passed to the
# MySQL connector by import_items().
with open('../data/config.json') as config_file:
    config = json.loads(config_file.read())

# SQLite target file and the query run against the MySQL source.
db_path = 'auction.db'
query = "SELECT * FROM Items"

def import_items():
    """Copy the rows returned by ``query`` from MySQL into the SQLite Items table.

    Uses the module-level ``config`` (MySQL connection settings under the
    ``'database'`` key), ``query`` and ``db_path``. Existing SQLite rows with
    the same primary key are replaced (``INSERT OR REPLACE``).

    Connection failures on either side are printed and end the import early;
    errors raised while querying or inserting propagate to the caller after
    the connections have been closed.

    NOTE(review): the rows are bound positionally, so the MySQL result is
    assumed to have exactly the 13 columns listed in the INSERT, in that
    order -- confirm against the MySQL schema.
    """
    try:
        mysql_db = mysql.connector.connect(**config['database'])
    except mysql.connector.Error as err:
        print(err)
        return

    # Close the cursor before its connection, and do both even if the query fails.
    try:
        cursor = mysql_db.cursor()
        try:
            cursor.execute(query)
            items = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        mysql_db.close()

    try:
        db = sqlite3.connect(db_path)
        print("Connected to SQLite")
    except sqlite3.Error as err:
        # Report the reason instead of returning silently.
        print(f"Could not connect to SQLite: {err}")
        return

    try:
        cursor = db.cursor()
        cursor.executemany("""
            INSERT OR REPLACE INTO Items (item_id, item_name, quality, item_level, required_level, item_class, item_subclass, purchase_price_gold, purchase_price_silver, sell_price_gold, sell_price_silver, max_count, is_stackable)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, items)

        db.commit()
        cursor.close()
    finally:
        db.close()

    print("Inserted items into SQLite: " + str(len(items)))


import_items()

This cell connects to the SQLite database, executes a SQL query that joins the Auctions, ActionEvents, and Items tables, and retrieves the results. These results are stored in the DataFrame `df`.

In [None]:
import sqlite3

# Build one row per auction: its listing data, the attributes of its item, the
# timestamp of its first ActionEvents record, and the number of ActionEvents
# rows it has (hours_on_sale).
conn = sqlite3.connect(db_path)

# NOTE(review): bid, buyout and quantity are declared INT in the schema, so `/`
# below is SQLite integer division and the gold / unit prices are truncated.
# Confirm this matches how the model was trained before changing it.
query = """
    SELECT
        a.auction_id,
        a.bid / 10000 AS bid_in_gold,
        a.buyout / 10000 AS buyout_in_gold,
        (a.buyout / 10000) / a.quantity AS unit_price,
        a.quantity,
        a.time_left,
        a.item_id,
        i.item_name,
        i.quality,
        i.item_class,
        i.item_subclass,
        i.is_stackable,
        i.purchase_price_gold,
        i.required_level,
        i.item_level,
        i.sell_price_gold,
        MIN(ae.record) AS first_appearance_timestamp,
        strftime('%Y', MIN(ae.record)) AS first_appearance_year,
        strftime('%m', MIN(ae.record)) AS first_appearance_month,
        strftime('%d', MIN(ae.record)) AS first_appearance_day,
        strftime('%H', MIN(ae.record)) AS first_appearance_hour,
        COUNT(*) AS hours_on_sale
    FROM Auctions a
    JOIN ActionEvents ae ON a.auction_id = ae.auction_id
    JOIN Items i ON i.item_id = a.item_id
    GROUP BY a.auction_id
"""

try:
    cursor = conn.cursor()
    cursor.execute(query)
    results = cursor.fetchall()
    # Read the column names while the connection is still open rather than
    # relying on cursor metadata surviving conn.close().
    column_names = [column[0] for column in cursor.description]
finally:
    conn.close()

df = pd.DataFrame(results, columns=column_names)
df.head()

Apply preprocessing to the DataFrame built from the SQL query results, using the helper functions defined below, which are chained together in `preprocess_data`.

In [None]:
import pandas as pd
import numpy as np

def map_time_left(df):
    """Replace the ``time_left`` labels with a numeric value per label.

    The mapping is SHORT -> 0.5, MEDIUM -> 2, LONG -> 12, VERY_LONG -> 48
    (presumably hours remaining -- confirm against the data source). Values
    that are not one of these labels, including missing values, are kept
    unchanged.

    The column is overwritten in place and the same DataFrame is returned.
    When every value is a known label the column ends up with a float dtype
    instead of the object dtype produced by chained ``np.where`` calls.
    """
    label_to_value = {'SHORT': 0.5, 'MEDIUM': 2, 'LONG': 12, 'VERY_LONG': 48}

    original = df['time_left']
    mapped = original.map(label_to_value)

    # map() turns unknown labels into NaN; put the original values back so
    # nothing is silently discarded.
    unrecognised = mapped.isna() & original.notna()
    if unrecognised.any():
        mapped = mapped.astype(object)
        mapped.loc[unrecognised] = original.loc[unrecognised]
        # Restores a numeric dtype when the restored values are numbers
        # (e.g. the function is applied twice).
        mapped = mapped.infer_objects()

    df['time_left'] = mapped
    return df


def compute_median_competitor_price(df):
    """Add group-wise median prices and price ranks to ``df``.

    Rows are grouped by ``item_id`` and by the year, month and day of first
    appearance. For buyout, bid and unit price, a ``median_*`` column holds
    the group median and a ``rank_*`` column holds the ascending rank of the
    row within its group. The columns are added to ``df`` in place and the
    same frame is returned.
    """
    group_keys = ['item_id', 'first_appearance_year', 'first_appearance_month', 'first_appearance_day']

    median_targets = (
        ('median_buyout_price', 'buyout_in_gold'),
        ('median_bid_price', 'bid_in_gold'),
        ('median_unit_price', 'unit_price'),
    )
    rank_targets = (
        ('rank_buyout_price', 'buyout_in_gold'),
        ('rank_bid_price', 'bid_in_gold'),
        ('rank_unit_price', 'unit_price'),
    )

    # transform() broadcasts the group median back onto every row of the group.
    for target, source in median_targets:
        df[target] = df.groupby(by=group_keys)[source].transform('median')

    # Lowest price in the group gets the lowest rank.
    for target, source in rank_targets:
        df[target] = df.groupby(by=group_keys)[source].rank(ascending=True)

    return df


def compute_avg_competitor_price(df):
    """Return a copy of ``df`` with group-wise mean and std of ``unit_price``.

    Groups are defined by ``item_id`` and the year, month and day of first
    appearance. The result has two extra columns, ``avg_competitor_price``
    and ``std_competitor_price``; missing values in either (for example the
    standard deviation of a single-row group) are replaced by 0. The result
    comes from a left merge, so it is a new frame and ``df`` is not modified.
    """
    group_keys = ['item_id', 'first_appearance_year', 'first_appearance_month', 'first_appearance_day']
    grouped_prices = df.groupby(by=group_keys)['unit_price']

    per_group_tables = (
        ('avg_competitor_price', grouped_prices.mean()),
        ('std_competitor_price', grouped_prices.std()),
    )

    enriched = df
    for column, statistic in per_group_tables:
        lookup = statistic.rename(column).reset_index()
        enriched = enriched.merge(lookup, on=group_keys, how='left')
        enriched[column] = enriched[column].fillna(0)

    return enriched


def compute_competitor_count(df):
    """Return a copy of ``df`` with a ``competitor_count`` column.

    The count is the number of non-null ``unit_price`` values among rows that
    share the same ``item_id`` and year, month and day of first appearance
    (the row itself included). Rows without a matching group get 0. The
    result comes from a left merge, so ``df`` itself is not modified.
    """
    group_keys = ['item_id', 'first_appearance_year', 'first_appearance_month', 'first_appearance_day']

    counts = df.groupby(by=group_keys)['unit_price'].count()
    lookup = counts.rename('competitor_count').reset_index()

    enriched = df.merge(lookup, on=group_keys, how='left')
    enriched['competitor_count'] = enriched['competitor_count'].fillna(0)
    return enriched


def compute_minimum_competitor_price(df):
    """Return a copy of ``df`` with a ``lowest_competitor_price`` column.

    Despite the name, the value is the 15th percentile (``quantile(0.15)``)
    of ``unit_price`` within each group of rows sharing ``item_id`` and the
    year, month and day of first appearance, not the strict minimum. Missing
    values are replaced by 0. The result comes from a left merge, so ``df``
    itself is not modified.
    """
    group_keys = ['item_id', 'first_appearance_year', 'first_appearance_month', 'first_appearance_day']

    low_quantile = df.groupby(by=group_keys)['unit_price'].quantile(0.15)
    lookup = low_quantile.rename('lowest_competitor_price').reset_index()

    enriched = df.merge(lookup, on=group_keys, how='left')
    enriched['lowest_competitor_price'] = enriched['lowest_competitor_price'].fillna(0)
    return enriched


def compute_top_competitor_price(df):
    """Return a copy of ``df`` with a ``top_competitor_price`` column.

    The value is the 80th percentile (``quantile(0.80)``) of ``unit_price``
    within each group of rows sharing ``item_id`` and the year, month and day
    of first appearance. Missing values are replaced by 0. The result comes
    from a left merge, so ``df`` itself is not modified.
    """
    group_keys = ['item_id', 'first_appearance_year', 'first_appearance_month', 'first_appearance_day']

    high_quantile = df.groupby(by=group_keys)['unit_price'].quantile(0.80)
    lookup = high_quantile.rename('top_competitor_price').reset_index()

    enriched = df.merge(lookup, on=group_keys, how='left')
    enriched['top_competitor_price'] = enriched['top_competitor_price'].fillna(0)
    return enriched


def compute_relative_differences(df):
    """Add six ``relative_*`` columns comparing each price to a group statistic.

    Every column is ``(value - baseline) / (scale + 1e-6)`` with missing
    results replaced by 0. The scale equals the baseline except for
    ``relative_avg_price_difference``, which divides the distance to the
    group mean by the group standard deviation. The small constant keeps the
    division defined when the scale is 0. Columns are added to ``df`` in
    place and the same frame is returned.
    """
    epsilon = 1e-6

    # (new column, value column, baseline column, scale column)
    definitions = (
        ('relative_price_difference', 'unit_price', 'median_unit_price', 'median_unit_price'),
        ('relative_avg_price_difference', 'unit_price', 'avg_competitor_price', 'std_competitor_price'),
        ('relative_buyout_difference', 'buyout_in_gold', 'median_buyout_price', 'median_buyout_price'),
        ('relative_bid_difference', 'bid_in_gold', 'median_bid_price', 'median_bid_price'),
        ('relative_price_to_lowest_competitor', 'unit_price', 'lowest_competitor_price', 'lowest_competitor_price'),
        ('relative_price_to_top_competitor', 'unit_price', 'top_competitor_price', 'top_competitor_price'),
    )

    for target, value, baseline, scale in definitions:
        ratio = (df[value] - df[baseline]) / (df[scale] + epsilon)
        df[target] = ratio.fillna(0)

    return df


def preprocess_data(df):
    """Run every feature-engineering step on ``df`` and return the result.

    The steps run in a fixed order because ``compute_relative_differences``
    reads columns produced by the steps before it. Each step receives the
    frame returned by the previous one.
    """
    steps = (
        map_time_left,
        compute_median_competitor_price,
        compute_avg_competitor_price,
        compute_competitor_count,
        compute_minimum_competitor_price,
        compute_top_competitor_price,
        compute_relative_differences,
    )

    for step in steps:
        df = step(df)

    return df


# Replace the raw query result with the feature-enriched frame.
df = preprocess_data(df)
df.head()

In [None]:
# Numeric feature columns: raw listing values, the group statistics added by
# the preprocessing functions, and numeric item attributes.
numerical_columns = [
    'quantity',
    'unit_price',
    'bid_in_gold',
    'buyout_in_gold',
    'time_left',
    'median_buyout_price',
    'median_bid_price',
    'median_unit_price',
    'lowest_competitor_price',
    'avg_competitor_price',
    'std_competitor_price',
    'top_competitor_price',
    'competitor_count',
    'rank_buyout_price',
    'rank_bid_price',
    'rank_unit_price',
    'relative_price_difference',
    'relative_avg_price_difference',
    'relative_buyout_difference',
    'relative_bid_difference',
    'relative_price_to_lowest_competitor',
    'relative_price_to_top_competitor',
    'purchase_price_gold',
    'sell_price_gold',
    'required_level',
    'item_level'
]

# Categorical columns, split by the encoding they are meant to receive.
categorical_columns_ordinal = [
    'item_id',
    'quality',
    'item_class',
    'item_subclass'
]
categorical_columns_onehot = [
  'is_stackable'
]

# Feature matrix (columns in the order listed above) and target.
# NOTE(review): the column order presumably has to match the order used when
# the model was trained -- confirm before reordering any list.
X = df[numerical_columns + categorical_columns_ordinal + categorical_columns_onehot]
# hours_on_sale is the COUNT(*) of ActionEvents rows per auction from the SQL query.
y = df['hours_on_sale']

In [None]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer

# The scaler is instantiated but not used: its entry in the column transformer
# below is commented out, so numeric columns reach the model unscaled.
num_transformer = StandardScaler()
ordinal_transformer = OrdinalEncoder()
onehot_transformer = OneHotEncoder(sparse_output=False)

# Encoded categorical columns come first in the output; all remaining columns
# are passed through unchanged after them.
column_transformer = make_column_transformer(
    #(num_transformer, numerical_columns),
    (ordinal_transformer, categorical_columns_ordinal),
    (onehot_transformer, categorical_columns_onehot),
    remainder='passthrough'
)

# NOTE(review): the encoders are fitted on this data set rather than loaded
# from training, so category codes depend on the values present here.
# Presumably they need to match the encoding the pickled model was trained
# with -- confirm.
X = column_transformer.fit_transform(X)
y = np.array(y)

Loads a trained model from a file, makes predictions on the data set, calculates the RMSE, and displays the result.

In [None]:
import pickle
import pandas as pd
from sklearn.metrics import mean_squared_error

# Render floats without decimals in DataFrames displayed from here on.
pd.options.display.float_format = '{:.0f}'.format

# NOTE: unpickling runs code stored in the file; only load model files that
# come from a trusted source.
model_path = 'models/linear_model.pkl'
with open(model_path, 'rb') as model_file:
    model = pickle.load(model_file)

# Predict for every row of X and keep the values next to the source rows.
predictions = model.predict(X)
df['prediction'] = predictions

preview_columns = ['item_name', 'item_class', 'unit_price', 'buyout_in_gold', 'hours_on_sale', 'prediction']
df[preview_columns].head(20)

In [None]:
# Root of the mean squared error between the observed hours on sale and the
# predictions. The `squared=False` argument was deprecated in scikit-learn 1.4
# and removed in 1.6, so the square root is taken explicitly; this form works
# on every scikit-learn version.
rmse = mean_squared_error(y, predictions) ** 0.5

print("RMSE:", rmse)