In [1]:
import pandas as pd
import numpy as np
import os
import json
import mysql.connector
import csv
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

## Retrieve most recent data

In [2]:
# Connection settings are read from the environment so real credentials never have to be
# written into the notebook; each fallback is the value that used to be hard-coded here.
mydb = mysql.connector.connect(
    host=os.environ.get("AUCTION_DB_HOST", "localhost"),
    user=os.environ.get("AUCTION_DB_USER", "root"),
    password=os.environ.get("AUCTION_DB_PASSWORD", "password"),
    database=os.environ.get("AUCTION_DB_NAME", "auction_db"),
)

In [3]:
# Read the extraction query text that the next cell sends to the database.
with open("data/sql/get_data.sql", "r") as query_file:
    sql = query_file.read()

In [4]:
# Run the query and pull every row plus the column names into memory.
# The try/finally guarantees the cursor is released even when the query fails.
cursor = mydb.cursor()
try:
    cursor.execute(sql)
    auctions = cursor.fetchall()
    # cursor.description holds one tuple per result column; its first item is the column name.
    headers = [column[0] for column in cursor.description]
finally:
    cursor.close()

print('Imported {} auctions'.format(len(auctions)))

Imported 16532184 auctions


True

In [5]:
# Snapshot the query result to disk: one header row followed by every fetched auction.
with open('auction_data.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(headers)
    writer.writerows(auctions)

## Read data from CSV

In [None]:
# Load the CSV snapshot written above and preview the first five rows.
df = pd.read_csv('auction_data.csv')
df.head(5)

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Number of auctions in each time_left bucket.
df['time_left'].value_counts()

### Since we can't know if an item was sold or just expired, we'll focus on VERY_LONG time left items (between 12 and 48 hours left)

In [None]:
# Keep only VERY_LONG auctions. The explicit copy makes the result an independent frame, so the
# column assignments in later cells do not write into a slice (SettingWithCopyWarning).
df = df[df['time_left'] == 'VERY_LONG'].copy()

# Exploratory Data Analysis

In [None]:
# Column dtypes, non-null counts and memory footprint.
df.info()

In [None]:
# An auction counts as sold (1) when it was on sale for fewer than 12 hours, otherwise 0.
sold_mask = df['hours_on_sale'] < 12
df['sold'] = sold_mask.astype(int)

In [None]:
# Summary statistics for the price, quantity and duration columns.
columns = ['bid_in_gold', 'buyout_in_gold', 'quantity', 'hours_on_sale']
df[columns].describe()

## Percentage of items sold

In [None]:
# Percentage of auctions per value of the 'sold' flag.
# rename_axis() + reset_index(name=...) produce exactly the columns 'sold' and 'percent' on
# every pandas version. Looking up df_count['index'] raises KeyError on pandas >= 2.0, where
# value_counts() already names the index after the source column.
df_count = (
    df['sold']
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis('sold')
    .reset_index(name='percent')
)

g = sns.catplot(x='sold', y='percent', kind='bar', data=df_count)
# Write each bar's percentage slightly above the bar.
for p in g.ax.patches:
    height = float(p.get_height())
    g.ax.text(p.get_x(), height + 3, str(round(height, 2)) + '%')

## Most popular Item

In [None]:
# The sold flag is created in lowercase ('sold') earlier in the notebook; 'Sold' does not exist.
# NOTE(review): 'ItemName' is CamelCase while the feature cells use snake_case columns
# (e.g. 'item_id') — confirm this column is present in auction_data.csv.
items_sold = df[df['sold'] == 1]

# Number of sold auctions per item name, most frequent first.
count_items = (
    items_sold.groupby(['ItemName']).size()
    .reset_index(name='Counts')
    .sort_values(by=['Counts'], ascending=False)
)

print(count_items.head(10))

plot = sns.barplot(x='ItemName', y='Counts', data=count_items[:10])
for item in plot.get_xticklabels():
    item.set_rotation(45)

## Most popular Item Class

In [None]:
# The sold flag is created in lowercase ('sold') earlier in the notebook; 'Sold' does not exist.
# NOTE(review): 'ItemClass' is CamelCase while the feature cells use snake_case columns —
# confirm this column is present in auction_data.csv.
items_sold = df[df['sold'] == 1]

# Number of sold auctions per item class, most frequent first.
count_items = (
    items_sold.groupby(['ItemClass']).size()
    .reset_index(name='Counts')
    .sort_values(by=['Counts'], ascending=False)
)

print(count_items.head(100))

# Only the five most frequent classes are plotted.
plot = sns.barplot(x='ItemClass', y='Counts', data=count_items[:5])
for item in plot.get_xticklabels():
    item.set_rotation(45)

## Most popular Item Subclass

In [None]:
# The sold flag is created in lowercase ('sold') earlier in the notebook; 'Sold' does not exist.
# NOTE(review): 'ItemSubClass' is CamelCase while the feature cells use snake_case columns —
# confirm this column is present in auction_data.csv.
items_sold = df[df['sold'] == 1]

# Number of sold auctions per item subclass, most frequent first.
count_items = (
    items_sold.groupby(['ItemSubClass']).size()
    .reset_index(name='Counts')
    .sort_values(by=['Counts'], ascending=False)
)

print(count_items.head(10))

plot = sns.barplot(x='ItemSubClass', y='Counts', data=count_items[:10])
for item in plot.get_xticklabels():
    item.set_rotation(45)

## Most sold quality type

In [None]:
# The sold flag is created in lowercase ('sold') earlier in the notebook; 'Sold' does not exist.
# NOTE(review): 'Quality' is CamelCase while the feature cells use snake_case columns —
# confirm this column is present in auction_data.csv.
items_sold = df[df['sold'] == 1]

# Number of sold auctions per quality value, most frequent first.
count_items = (
    items_sold.groupby(['Quality']).size()
    .reset_index(name='Counts')
    .sort_values(by=['Counts'], ascending=False)
)

print(count_items.head(10))

plot = sns.barplot(x='Quality', y='Counts', data=count_items[:10])
for item in plot.get_xticklabels():
    item.set_rotation(45)

## Are most sold items equippable?

In [None]:
# The sold flag is created in lowercase ('sold') earlier in the notebook; 'Sold' does not exist.
# NOTE(review): 'IsEquippable' is CamelCase while the feature cells use snake_case columns —
# confirm this column is present in auction_data.csv.
items_sold = df[df['sold'] == 1]

# Number of sold auctions per IsEquippable value, most frequent first (at most ten rows).
count_items = (
    items_sold.groupby(['IsEquippable']).size()
    .reset_index(name='Counts')
    .sort_values(by=['Counts'], ascending=False)[:10]
)

print(count_items.head())

plot = sns.barplot(x='IsEquippable', y='Counts', data=count_items)

## Best week day to sell

In [None]:
# Weekday (Monday=0 ... Sunday=6) of the day each auction was first seen. pd.to_datetime
# assembles the dates from year/month/day columns in one vectorized pass, replacing a
# Python-level datetime construction per row.
# NOTE(review): the FirstTimeSeen* columns are CamelCase while the feature cells use snake_case
# columns — confirm they are present in auction_data.csv.
first_seen = pd.to_datetime(
    pd.DataFrame({
        'year': df['FirstTimeSeenYear'],
        'month': df['FirstTimeSeenMonth'],
        'day': df['FirstTimeSeenDay'],
    })
)
df['WeekDay'] = first_seen.dt.weekday

# NOTE(review): this counts every auction in df, not only those with sold == 1 — confirm that
# is what "best week day to sell" should measure.
count_items = df.groupby(['WeekDay']).size().reset_index(name='Counts')
count_items = count_items.sort_values(by=['Counts'], ascending=False)[:10]

print(count_items.head(10))

plot = sns.barplot(x='WeekDay', y='Counts', data=count_items)

# Data Preparation

In [None]:
# Hours assigned to each time_left label (presumably the upper bound of the bucket — the
# notebook describes VERY_LONG as "between 12 and 48 hours").
TIME_LEFT_HOURS = {'SHORT': 0.5, 'MEDIUM': 2, 'LONG': 12, 'VERY_LONG': 48}

# One pass instead of four chained np.where calls. Values that are not a known label (for
# instance numbers left by a previous run of this cell) pass through unchanged, so the cell can
# be re-run; pd.to_numeric then guarantees a numeric column and fails loudly on unknown labels.
df['time_left'] = pd.to_numeric(
    df['time_left'].map(lambda label: TIME_LEFT_HOURS.get(label, label))
)

df['time_left'].value_counts()

In [None]:
# Slice the textual timestamp into integer year / month / day / hour columns.
# Character positions assume a 'YYYY-MM-DD HH...' layout, as implied by the slice bounds.
timestamp_text = df['first_appearance_timestamp']
timestamp_slices = {'year': (0, 4), 'month': (5, 7), 'day': (8, 10), 'hour': (11, 13)}
for part, (start, stop) in timestamp_slices.items():
    df[f'first_appearance_{part}'] = timestamp_text.str.slice(start, stop).astype(int)


In [None]:
def compute_median_competitor_price(df):
    """Add per-item, per-day median and rank columns for the three price columns.

    Rows are grouped by item_id and first-appearance year/month/day. For each of
    buyout_in_gold, bid_in_gold and unit_price the group median is written to
    median_{buyout,bid,unit}_price and the ascending within-group rank to
    rank_{buyout,bid,unit}_price.

    The columns are added to `df` in place and the same frame is returned.
    """
    group_keys = ['item_id', 'first_appearance_year', 'first_appearance_month', 'first_appearance_day']
    price_sources = {'buyout': 'buyout_in_gold', 'bid': 'bid_in_gold', 'unit': 'unit_price'}

    per_day = df.groupby(by=group_keys)

    # Compute everything before touching df; medians first, then ranks, which is the column order.
    new_columns = {}
    for label, source in price_sources.items():
        new_columns[f'median_{label}_price'] = per_day[source].transform('median')
    for label, source in price_sources.items():
        new_columns[f'rank_{label}_price'] = per_day[source].rank(ascending=True)

    for column_name, values in new_columns.items():
        df[column_name] = values

    return df

# Add the per-item, per-day median and rank price columns, then preview the result.
df = compute_median_competitor_price(df)
df.head()

In [None]:
def compute_avg_competitor_price(df):
    """Return a copy of `df` with the per-item, per-day mean and std of unit_price.

    Rows are grouped by item_id and first-appearance year/month/day. The group mean is
    stored in avg_competitor_price and the sample standard deviation in
    std_competitor_price; missing values (e.g. the std of a single-row group) become 0.
    The input frame is not modified; the result comes from a left merge.
    """
    group_keys = ['item_id', 'first_appearance_year', 'first_appearance_month', 'first_appearance_day']

    price_stats = (
        df.groupby(by=group_keys)['unit_price']
        .agg(avg_competitor_price='mean', std_competitor_price='std')
        .reset_index()
    )

    enriched = pd.merge(df, price_stats, on=group_keys, how='left')
    for stat_column in ('avg_competitor_price', 'std_competitor_price'):
        enriched[stat_column] = enriched[stat_column].fillna(0)

    return enriched

# Add avg_competitor_price and std_competitor_price, then preview the result.
df = compute_avg_competitor_price(df)
df.head()

In [None]:
def compute_competitor_count(df):
    """Return a copy of `df` with competitor_count: listings per item and day.

    The count is the number of non-null unit_price values among rows sharing the same
    item_id and first-appearance year/month/day (the row itself included). Rows with no
    matching group get 0. The input frame is not modified.
    """
    group_keys = ['item_id', 'first_appearance_year', 'first_appearance_month', 'first_appearance_day']

    listings_per_group = (
        df.groupby(by=group_keys)['unit_price']
        .count()
        .rename('competitor_count')
        .reset_index()
    )

    enriched = df.merge(listings_per_group, how='left', on=group_keys)
    enriched['competitor_count'] = enriched['competitor_count'].fillna(0)
    return enriched

# Add competitor_count (listings of the same item first seen on the same day), then preview.
df = compute_competitor_count(df)
df.head()

In [None]:
def compute_minimum_competitor_price(df, quantile=0.15):
    """Return a copy of `df` with lowest_competitor_price per item and day.

    Despite the name, the value is not the strict minimum: it is the `quantile`
    (default 0.15, the previously hard-coded value) of unit_price among rows sharing
    item_id and first-appearance year/month/day. Pass quantile=0.0 for the true minimum.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain item_id, first_appearance_year/month/day and unit_price.
    quantile : float, default 0.15
        Quantile in [0, 1] used as the "lowest" price.

    Returns
    -------
    pandas.DataFrame
        Left merge of `df` with the new column; missing values are filled with 0.
        The input frame is not modified.
    """
    group_keys = ['item_id', 'first_appearance_year', 'first_appearance_month', 'first_appearance_day']

    minimum_competitor_price = (
        df.groupby(by=group_keys)['unit_price']
        .quantile(quantile)
        .reset_index(name='lowest_competitor_price')
    )

    df_merged = pd.merge(df, minimum_competitor_price, on=group_keys, how='left')
    df_merged['lowest_competitor_price'] = df_merged['lowest_competitor_price'].fillna(0)

    return df_merged
    
# Add lowest_competitor_price (a low quantile of unit_price per item and day), then preview.
df = compute_minimum_competitor_price(df)
df.head()

In [None]:
def compute_top_competitor_price(df, quantile=0.80):
    """Return a copy of `df` with top_competitor_price per item and day.

    The value is the `quantile` (default 0.80, the previously hard-coded value) of
    unit_price among rows sharing item_id and first-appearance year/month/day.
    Pass quantile=1.0 for the true maximum.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain item_id, first_appearance_year/month/day and unit_price.
    quantile : float, default 0.80
        Quantile in [0, 1] used as the "top" price.

    Returns
    -------
    pandas.DataFrame
        Left merge of `df` with the new column; missing values are filled with 0.
        The input frame is not modified.
    """
    group_keys = ['item_id', 'first_appearance_year', 'first_appearance_month', 'first_appearance_day']

    top_competitor_price = (
        df.groupby(by=group_keys)['unit_price']
        .quantile(quantile)
        .reset_index(name='top_competitor_price')
    )

    df_merged = pd.merge(df, top_competitor_price, on=group_keys, how='left')
    df_merged['top_competitor_price'] = df_merged['top_competitor_price'].fillna(0)

    return df_merged

# Add top_competitor_price (a high quantile of unit_price per item and day), then preview.
df = compute_top_competitor_price(df)
df.head()

In [None]:
# Each relative feature is (value - reference) / (reference + 1e-6); the small constant keeps
# the division defined when the reference price is 0, and missing results become 0.
relative_features = [
    # (new column, value column, reference column)
    ('relative_price_difference', 'unit_price', 'median_unit_price'),
    ('relative_buyout_difference', 'buyout_in_gold', 'median_buyout_price'),
    ('relative_bid_difference', 'bid_in_gold', 'median_bid_price'),
    ('relative_price_to_lowest_competitor', 'unit_price', 'lowest_competitor_price'),
    ('relative_price_to_top_competitor', 'unit_price', 'top_competitor_price'),
]

for feature_name, value_column, reference_column in relative_features:
    reference = df[reference_column]
    relative = (df[value_column] - reference) / (reference + 1e-6)
    df[feature_name] = relative.fillna(0)

df[[spec[0] for spec in relative_features]].head()

In [None]:
# Preview the frame with all engineered columns.
df.head(5)

In [None]:
def randomly_delete_data(df, percentage, random_state=None):
    """Return `df` without a randomly chosen `percentage` of its rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sub-sample; it is not modified.
    percentage : float
        Share of rows to remove, between 0 and 100. The number of removed rows is
        ``int(len(df) * percentage / 100)``.
    random_state : int, optional
        Seed for a dedicated random generator, making the selection reproducible.
        When omitted, NumPy's global random state is used, as before.

    Returns
    -------
    pandas.DataFrame
        The surviving rows in their original order, with their original index labels.

    Raises
    ------
    ValueError
        If `percentage` is outside [0, 100].
    """
    if not (0 <= percentage <= 100):
        raise ValueError("Percentage must be between 0 and 100")

    # Calculate the number of rows to delete
    num_rows_to_delete = int(len(df) * (percentage / 100))
    if num_rows_to_delete == 0:
        return df.copy()

    rng = np.random if random_state is None else np.random.default_rng(random_state)

    # Pick row *positions* rather than index labels: dropping by label would remove every row
    # sharing a chosen label whenever the index contains duplicates.
    positions_to_delete = rng.choice(len(df), size=num_rows_to_delete, replace=False)

    keep_mask = np.ones(len(df), dtype=bool)
    keep_mask[positions_to_delete] = False

    return df.iloc[keep_mask]

# Randomly drop 70% of the rows (presumably to keep model training tractable — TODO confirm),
# then show the remaining shape.
df = randomly_delete_data(df, 70)
df.shape

In [None]:
# Numeric model inputs: raw auction fields followed by the engineered per-item, per-day price
# statistics. The neural network cell further down hard-codes this count (16).
numerical_columns = [
    'quantity', 
    'unit_price', 
    'bid_in_gold', 
    'buyout_in_gold', 
    'time_left', 
    'median_buyout_price', 
    'median_bid_price', 
    'median_unit_price', 
    'lowest_competitor_price', 
    'avg_competitor_price', 
    'std_competitor_price', 
    'competitor_count', 
    'rank_buyout_price', 
    'rank_bid_price', 
    'rank_unit_price',
    'relative_price_difference'
]

# Categorical model inputs; ordinal-encoded by the column transformer defined below.
categorical_columns = ['item_id']

In [None]:
# Feature frame (numeric columns first, then the categorical item id) and regression target.
X = df[numerical_columns + categorical_columns]
y = df['hours_on_sale']

# Remember the feature names in their column order.
columns = X.columns

X.head()

In [None]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import make_column_transformer

# Standardize the numeric features and map each item id to an integer code.
num_transformer = StandardScaler()
categorical_transformer = OrdinalEncoder()

column_transformer = make_column_transformer(
    (num_transformer, numerical_columns),
    (categorical_transformer, categorical_columns),
    remainder='passthrough'
)

# Build the matrices directly from `df` rather than from `X`/`y`: this cell rebinds X and y to
# NumPy arrays, so reading them back as inputs made a second run of the cell fail.
# NOTE(review): the transformer is fitted on the full data set before the train/test split
# below, so test-set statistics leak into the scaling — consider fitting on the training split.
X = column_transformer.fit_transform(df[numerical_columns + categorical_columns])
y = np.array(df['hours_on_sale'])

In [None]:
# Count of distinct item ids; used as the embedding table size of the neural network.
number_of_items = len(pd.unique(df['item_id']))
number_of_items

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    np.array(X), np.array(y), test_size=0.1, random_state=0
)

for split_name, split in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    print(f'{split_name} shape: {split.shape}')

# Model Selection

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
from sklearn.dummy import DummyRegressor

# Baseline that always predicts the mean training target; real models must beat this RMSE.
dummy = DummyRegressor(strategy='mean')
dummy.fit(X_train, y_train)

train_mse = mean_squared_error(y_train, dummy.predict(X_train))
test_mse = mean_squared_error(y_test, dummy.predict(X_test))
train_rmse, test_rmse = np.sqrt(train_mse), np.sqrt(test_mse)

print(f'Train RMSE: {train_rmse:.2f}')
print(f'Test RMSE: {test_rmse:.2f}')

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the transformed features.
reg = LinearRegression()
reg.fit(X_train, y_train)

train_mse = mean_squared_error(y_train, reg.predict(X_train))
test_mse = mean_squared_error(y_test, reg.predict(X_test))
train_rmse, test_rmse = np.sqrt(train_mse), np.sqrt(test_mse)

print(f'Train RMSE: {train_rmse}')
print(f'Test RMSE: {test_rmse}')

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Depth-limited regression tree. random_state pins the tie-breaking between equally good
# splits so the reported RMSE is reproducible (same seed as the train/test split).
reg = DecisionTreeRegressor(max_depth=10, random_state=0)
reg.fit(X_train, y_train)

train_rmse = np.sqrt(mean_squared_error(y_train, reg.predict(X_train)))
test_rmse = np.sqrt(mean_squared_error(y_test, reg.predict(X_test)))

print(f'Train RMSE: {train_rmse}')
print(f'Test RMSE: {test_rmse}')

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 100 depth-limited trees. random_state makes the bootstrap samples and feature draws
# reproducible; n_jobs=-1 trains the trees on all available cores.
reg = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=0, n_jobs=-1)
reg.fit(X_train, y_train)

train_rmse = np.sqrt(mean_squared_error(y_train, reg.predict(X_train)))
test_rmse = np.sqrt(mean_squared_error(y_test, reg.predict(X_test)))

print(f'Train RMSE: {train_rmse}')
print(f'Test RMSE: {test_rmse}') # 9.21 (recorded from an earlier, unseeded run)

In [None]:
# show cpu info
import multiprocessing
multiprocessing.cpu_count()

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees with the library's default hyper-parameters.
reg = XGBRegressor()
reg.fit(X_train, y_train)

train_mse = mean_squared_error(y_train, reg.predict(X_train))
test_mse = mean_squared_error(y_test, reg.predict(X_test))
train_rmse, test_rmse = np.sqrt(train_mse), np.sqrt(test_mse)

print(f'Train RMSE: {train_rmse}')
print(f'Test RMSE: {test_rmse}') # 8.80

In [None]:
import lightgbm as lgb

# LightGBM regressor with the library's default hyper-parameters.
reg = lgb.LGBMRegressor()
reg.fit(X_train, y_train)

train_mse = mean_squared_error(y_train, reg.predict(X_train))
test_mse = mean_squared_error(y_test, reg.predict(X_test))
train_rmse, test_rmse = np.sqrt(train_mse), np.sqrt(test_mse)

print(f'Train RMSE: {train_rmse}')
print(f'Test RMSE: {test_rmse}') # 9.19

In [None]:
# Importance of each input column for the most recently fitted `reg`; the names follow the
# column order produced by the transformer (numeric columns first, then item_id).
feature_names = numerical_columns + categorical_columns
importance = reg.feature_importances_

for position, score in enumerate(importance):
    print(f'{position}. Feature: {feature_names[position]}, Score: {score}')

plt.bar(range(len(importance)), importance)
plt.show()

### Neural Network

In [None]:
import torch
import torch.nn as nn

class NeuralAuctioneer(nn.Module):
    """Small feed-forward regressor over numeric features plus a learned item embedding.

    Each input row holds `num_numeric_features` numeric values followed by one column with
    the integer-coded item id. The id is looked up in an embedding table, concatenated
    with the numeric values and passed through Linear -> BatchNorm -> ReLU -> Linear.

    Parameters
    ----------
    num_of_items : int
        Number of distinct item codes (rows of the embedding table).
    num_embeddings : int, default 8
        Width of each item embedding vector.
    num_numeric_features : int, default 16
        Number of leading numeric columns in the input; the item code sits right after.
    """

    HIDDEN_SIZE = 16

    def __init__(self, num_of_items, num_embeddings=8, num_numeric_features=16):
        super().__init__()
        self.num_numeric_features = num_numeric_features
        self.item_embeddings = nn.Embedding(num_of_items, num_embeddings)
        self.fc1 = nn.Linear(num_numeric_features + num_embeddings, self.HIDDEN_SIZE)
        self.fc2 = nn.Linear(self.HIDDEN_SIZE, 1)
        self.relu = nn.ReLU()
        # Batch norm must match the width of fc1's output (16, not 32).
        self.bn1 = nn.BatchNorm1d(self.HIDDEN_SIZE)

    def forward(self, x):
        """Return predictions of shape (batch, 1) for a (batch, num_numeric_features + 1) input."""
        item_codes = x[:, self.num_numeric_features].long()
        item_embeddings = self.item_embeddings(item_codes)
        x = torch.cat([x[:, :self.num_numeric_features], item_embeddings], dim=1)
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.relu(x)

        return self.fc2(x)

In [None]:
from torch.utils.data import TensorDataset, DataLoader

# Convert the NumPy splits to float32 tensors under separate names. Rebinding X_train / y_train
# themselves made this cell fail on a second run and silently changed what the scikit-learn
# cells further down receive.
X_train_tensor = torch.from_numpy(X_train).float()
y_train_tensor = torch.from_numpy(y_train).float()

X_test_tensor = torch.from_numpy(X_test).float()
y_test_tensor = torch.from_numpy(y_test).float()

train_ds = TensorDataset(X_train_tensor, y_train_tensor)
test_ds = TensorDataset(X_test_tensor, y_test_tensor)

# Shuffle only the training batches; evaluation order does not matter.
train_dl = DataLoader(train_ds, batch_size=512, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=512, shuffle=False)

In [None]:
import torch.optim as optim

criterion = nn.MSELoss()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Width of the learned item embedding.
# NOTE(review): the previous call never supplied this required value; 8 is a starting point.
EMBEDDING_DIM = 8
net = NeuralAuctioneer(number_of_items, EMBEDDING_DIM).to(device)
optimizer = optim.Adam(net.parameters(), lr=1e-4)

for epoch in range(100):
    # Batch norm uses batch statistics while training...
    net.train()
    # Batch variables get their own names so the X_train / y_train splits are not clobbered.
    for step, (X_batch, y_batch) in enumerate(train_dl):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        # The network returns (batch, 1) while the targets are (batch,); without the squeeze
        # MSELoss broadcasts the pair to (batch, batch) and optimizes the wrong quantity.
        output = net(X_batch).squeeze(1)

        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()

        if step % 1000 == 0:
            rmse = torch.sqrt(loss)
            print(f'Epoch: {epoch}, Loss: {loss.item()} RMSE: {rmse.item()}')

    # ...and its running statistics while evaluating.
    net.eval()
    squared_error_sum = 0.0
    sample_count = 0
    with torch.no_grad():
        for X_batch, y_batch in test_dl:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            output = net(X_batch).squeeze(1)
            squared_error_sum += torch.sum((output - y_batch) ** 2).item()
            sample_count += y_batch.numel()

    # RMSE over the whole test set; averaging per-batch RMSEs is not the same number.
    print(f'Epoch: {epoch}, RMSE: {np.sqrt(squared_error_sum / sample_count)}')

### Full data

In [None]:
# Mean cross-validated score (negated mean absolute error) of `reg` on the training split.
# NOTE(review): `reg`, `X_train` and `y_train` are whatever earlier cells last bound them to —
# confirm they are the intended model and training data.
cross_val_score(reg, X_train, y_train, scoring='neg_mean_absolute_error').mean()

In [None]:
# Refit the current `reg` on the training split.
reg.fit(X_train, y_train)

In [None]:
# Feature names in the column order the transformer emits: numeric columns first, then the
# ordinal-encoded categorical columns (one output column each). The previous lookup called
# get_feature_names on the first fitted transformer, which is the StandardScaler and has no
# such method, and it listed the categorical names first.
features = np.array(numerical_columns + categorical_columns)

In [None]:
# Horizontal bar chart of feature importances, smallest at the bottom.
importances = reg.feature_importances_
sorted_idx = importances.argsort()

plt.figure(figsize=(8,12))
plt.barh(features[sorted_idx], importances[sorted_idx])

# Finetuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Two grids: bootstrapped forests over a wider range, and non-bootstrapped forests over a
# smaller one. Keys are prefixed with the pipeline step name that make_pipeline generates.
param_grid = [
    {'randomforestregressor__n_estimators': [3, 10, 30], 'randomforestregressor__max_features': [2, 4, 6, 8]},
    {'randomforestregressor__bootstrap': [False], 'randomforestregressor__n_estimators': [3, 10], 'randomforestregressor__max_features': [2, 3, 4]}
]

reg = RandomForestRegressor(random_state=0)

pipeline = make_pipeline(
    column_transformer,
    reg
)


grid_search = GridSearchCV(pipeline, param_grid, cv=5,scoring='neg_mean_absolute_error',return_train_score=True)

# The pipeline's column transformer selects its inputs by column name, so it needs the raw
# feature frame. `X` has been replaced by an already-transformed NumPy array at this point
# and cannot be used.
# NOTE(review): OrdinalEncoder raises on item ids unseen during fit by default; rare items
# that fall only into a validation fold would make that fold fail — confirm or configure
# handle_unknown.
grid_search.fit(df[numerical_columns + categorical_columns], df['hours_on_sale'])

In [None]:
# Hyper-parameter combination with the best mean cross-validated score.
grid_search.best_params_

In [None]:
# Print the mean cross-validated test score next to every hyper-parameter combination tried.
# Scores are negated mean absolute errors, so values closer to zero are better.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

In [None]:
# Feature names in model-input order. `X` is a NumPy array by now and has no `.columns`.
pd.Index(numerical_columns + categorical_columns)

In [None]:
# Feature importances of the forest inside the best pipeline, smallest at the bottom.
best_forest = grid_search.best_estimator_.steps[-1][1]
importances = best_forest.feature_importances_
sorted_idx = importances.argsort()

# Labels follow the transformer's output order (numeric columns, then item_id); `X` is a
# NumPy array at this point, so `X.columns` is not available.
feature_labels = np.array(numerical_columns + categorical_columns)

plt.figure(figsize=(8,24))
plt.barh(feature_labels[sorted_idx], importances[sorted_idx])

# Save model

In [None]:
import pickle

filename = 'model.sav'
# Persist the fitted pipeline chosen by the grid search. `reg` itself is never fitted:
# GridSearchCV trains clones of the pipeline and leaves the object passed in untouched.
# The with-block closes the file even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(grid_search.best_estimator_, model_file)

# Load model

In [None]:
import pickle

filename = 'model.sav'
# Unpickling can execute arbitrary code: only load model files written by this project.
with open(filename, 'rb') as model_file:
    reg = pickle.load(model_file)

# Data for prediction

Historical data from database

In [None]:
# NOTE(review): `get_data` and `config` are not defined in any cell of this notebook — they
# must be imported or defined before this cell can run.
result, headers = get_data(sql, config)

df_historical = pd.DataFrame.from_records(result, columns=headers)

# Keep priced VERY_LONG auctions. The copy makes the result an independent frame so the column
# assignments below do not write into a slice.
priced_very_long = (df_historical['BuyoutGold'] > 0) & (df_historical['TimeLeft'] == 'VERY_LONG')
df_historical = df_historical[priced_very_long].copy()

# An auction seen at most 12 times is treated as sold.
df_historical['Sold'] = df_historical['TimesSeen'] <= 12
df_historical['UnitPrice'] = df_historical['BuyoutGold'] / df_historical['Quantity']
df_historical = df_historical[df_historical['Sold'] == True]

df_historical.head()

Item data

In [None]:
# Load the item catalogue query and run it.
with open("sql/get_items.sql", "r") as items_query_file:
    sql_items = items_query_file.read()

result, headers = get_data(sql_items, config)

items = pd.DataFrame.from_records(result, columns=headers)
# Expose the primary key under the name used for merging with auction data.
items['ItemId'] = items['Id']
# Vendor sell price in gold: silver is converted at 100 silver per gold.
items['SellPrice'] = items['SellPriceSilver'] / 100.0 + items['SellPriceGold']

items.head(3)

Get actual auction data

In [None]:
# Fetch the live auction listing without saving it.
auctions = get_auction_data(save=False)

auction_columns = ['Id', 'ItemId', 'BidGold', 'BidSilver', 'BuyoutGold', 'BuyoutSilver', 'Quantity', 'TimeLeft', 'Rand', 'Seed']
df_actual = pd.DataFrame.from_records(auctions, columns=auction_columns)

# Fold silver into the gold amount (100 silver per gold), then derive the per-unit price.
df_actual['BuyoutGold'] = df_actual['BuyoutSilver'] / 100.0 + df_actual['BuyoutGold']
df_actual['UnitPrice'] = df_actual['BuyoutGold'] / df_actual['Quantity']

# Listings without a buyout price are discarded.
has_buyout = df_actual['BuyoutGold'] > 0
df_actual = df_actual[has_buyout]

df_actual.head()

In [None]:
# Median unit price of historically sold auctions, per item.
sold_unit_prices = df_historical.groupby(by=['ItemId'])['UnitPrice']
historical_price = sold_unit_prices.median().reset_index(name='HistoricalPrice')

# Median unit price of the current listings, per item.
listed_unit_prices = df_actual.groupby(by=['ItemId'])['UnitPrice']
median_competitor_price = listed_unit_prices.median().reset_index(name='MedianCompetitorPrice')

# Cheapest strictly positive unit price among the current listings, per item.
positive_listings = df_actual[df_actual['UnitPrice'] > 0]
lowest_competitor_price = (
    positive_listings.groupby(by=['ItemId'])['UnitPrice']
    .min()
    .reset_index(name='LowestCompetitorPrice')
)

# Model Prediction

In [None]:
# Items to price, read from predict.csv; the following cells use its ItemId and Quantity columns.
predict = pd.read_csv('predict.csv')
predict.head()

In [None]:
# Attach the price lookups and item attributes to the items to be priced (left joins on ItemId).
df_merged = predict
for lookup in (
    historical_price,
    median_competitor_price,
    lowest_competitor_price,
    items[['ItemId', 'Name', 'Quality', 'ItemClass']],
):
    df_merged = pd.merge(df_merged, lookup, on=['ItemId'], how='left')

# Items without any price information get 0 for the corresponding feature.
for price_column in ('HistoricalPrice', 'MedianCompetitorPrice', 'LowestCompetitorPrice'):
    df_merged[price_column] = df_merged[price_column].fillna(0)

df_merged.head(10)

In [None]:
# NOTE(review): these four CamelCase inputs differ from the feature columns the model cells
# above train on — confirm `reg` was trained on this feature set.
quantity = 1
model_inputs = df_merged[['Quantity', 'HistoricalPrice', 'MedianCompetitorPrice', 'LowestCompetitorPrice']]
predictions = reg.predict(model_inputs) * quantity

# Store the model output next to the inputs and export everything.
df_merged['RecommendedPrice'] = predictions
df_merged.to_csv('predictions.csv')

# Get best offers

In [None]:
# Attach the price lookups and item attributes to the live listings (left joins on ItemId).
df_merged = df_actual
for lookup in (
    historical_price,
    median_competitor_price,
    lowest_competitor_price,
    items[['ItemId', 'Name', 'Quality', 'ItemClass']],
):
    df_merged = pd.merge(df_merged, lookup, on=['ItemId'], how='left')

# Listings without any price information get 0 for the corresponding feature.
for price_column in ('HistoricalPrice', 'MedianCompetitorPrice', 'LowestCompetitorPrice'):
    df_merged[price_column] = df_merged[price_column].fillna(0)

df_merged.head()

In [None]:
# Recommended price for every live listing, keyed by the listing Id.
model_inputs = df_merged[['Quantity', 'HistoricalPrice', 'MedianCompetitorPrice', 'LowestCompetitorPrice']]
predictions = reg.predict(model_inputs)

df_predictions = pd.DataFrame(predictions, columns=['RecommendedPrice'])
# Index-aligned assignment, as before.
df_predictions['Id'] = df_merged['Id']

df_predictions.head()

In [None]:
interest_columns = ['Id', 'ItemId', 'Name', 'Quantity', 'HistoricalPrice', 'MedianCompetitorPrice', 'LowestCompetitorPrice', 'UnitPrice', 'BuyoutGold']

df_merged_recommended = pd.merge(df_merged[interest_columns], df_predictions, on=['Id'], how='left')
# Only listings with a known historical price are considered. The copy makes the filtered
# result an independent frame so the new columns below are not written into a slice.
df_merged_recommended = df_merged_recommended[df_merged_recommended['HistoricalPrice'] > 0].copy()

# How far the recommended price lies above the listing's buyout.
df_merged_recommended['PriceDifference'] = df_merged_recommended['RecommendedPrice'] - df_merged_recommended['BuyoutGold']
# NOTE(review): MedianCompetitorPrice can be 0 after the earlier fillna(0), which yields inf/NaN here.
df_merged_recommended['RelativeCompetitorPrice'] = df_merged_recommended['LowestCompetitorPrice'] / df_merged_recommended['MedianCompetitorPrice']
df_merged_recommended['RelativeHistoricalPrice'] = df_merged_recommended['LowestCompetitorPrice'] / df_merged_recommended['HistoricalPrice']

# Listings whose recommended price exceeds the buyout by at least 1 gold; filtered once and
# reused for both the export and the preview.
profitable = df_merged_recommended[df_merged_recommended['PriceDifference'] >= 1]
profitable.to_csv('recommended_sales.csv')
profitable.head()

# Get items sold in auction at a price lower than vendor price

In [None]:
columns = ['Name', 'UnitPrice', 'SellPrice', 'Underpriced']

# Flag listings whose per-unit auction price is below the vendor sell price.
df_merged = df_actual.merge(items, how='left', on=['ItemId'])
df_merged['Underpriced'] = df_merged['UnitPrice'].lt(df_merged['SellPrice'])
df_merged[columns].head()

In [None]:
# Export only the underpriced listings.
df_merged.loc[df_merged['Underpriced'] == True, columns].to_csv('underpriced.csv')