In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2022/11/08 15:54
# @Author  : Wang Yujia
# @File    : basicinfo_GT_models.ipynb
# @Description : 关于GT模型的计算结果的基本信息

# 0. What for
1. 关于GT模型的计算结果的基本信息

# 1. Preparations
## 1.1 全局设置

In [2]:
import os
import pandas as pd
import numpy as np
from visdom import Visdom
import csv
from tqdm import tqdm
from IPython.display import clear_output

In [3]:
# --- Input locations ---------------------------------------------------------
# Small dataset: record table and settings table (paths relative to the notebook).
small_data_np_path = r'../data/small_auctions_np.csv'
settings_small_NN_path = r"../data/small_settings_NN.csv"

# Large dataset: record table and settings table.
# NOTE(review): these are absolute paths on a local Windows drive, so the
# notebook only runs on a machine with the same layout — consider moving the
# files next to the small dataset or making the directory configurable.
large_data_np_path = r'E:\DATA\large_dta\large_auctions_np.csv'
settings_large_NN_path = r'E:\DATA\large_dta\large_settings_NN.csv'

# Directory holding the target files that are read one by one further down.
target_root_path = "../data/targets_all/"
# os.listdir() yields entries in arbitrary, platform-dependent order. The file
# names are later accessed by position, so sort them to make that order
# reproducible across machines and runs.
# NOTE(review): the sort is lexicographic; if the file names embed un-padded
# numbers, confirm that this order really matches the row order of the
# settings table.
train_all_path = sorted(os.listdir(target_root_path))

# Columns of the settings table.
unique_setting_NN = ['desc','bidincrement','bidfee','retail','flg_fixedprice']


## 1.2 Read

In [4]:
# Record-level tables of the small and the large dataset.
data_small, data_large = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in (small_data_np_path, large_data_np_path)
)

# Settings tables of both datasets, plus their row-wise concatenation
# (small first, then large) under a fresh 0..n-1 index.
data_key_small, data_key_large = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in (settings_small_NN_path, settings_large_NN_path)
)
data_key = pd.concat([data_key_small, data_key_large], ignore_index=True)

# Row counts of the three settings tables.
len_small, len_large, len_all = (
    len(frame) for frame in (data_key_small, data_key_large, data_key)
)
print(f"小数据集有 *{len_small}* 场auction，大数据集有 *{len_large}* 场auction，一共*{len_all}*")

小数据集有 *1226* 场auction，大数据集有 *80* 场auction，一共*1306*


# 1. Count out data records

In [7]:
# Total number of record rows across both datasets and the mean number of
# records per row of the combined settings table.
records_all = sum(len(frame) for frame in (data_small, data_large))
records_all_avg = records_all / len_all
print(f"一共有 *{records_all}* 条records，平均每场auction有 *{records_all_avg}* 条records")

一共有 *117625* 条records，平均每场auction有 *90.06508422664625* 条records


In [15]:
# Tally the number of auctions and the number of record rows separately for
# ascending-price (flg_fixedprice != 1) and fixed-price (flg_fixedprice == 1).
records_asc = 0
records_fixed = 0
auction_asc = 0
auction_fixed = 0

# NOTE(review): file i of train_all_path is matched to row i of data_key purely
# by position — confirm that the directory listing order equals the row order
# of the settings table, otherwise the split below is attributed wrongly.
for i in tqdm(range(len_all)):
    # Pass directory and file name as separate components so the result does
    # not depend on target_root_path ending with a path separator.
    data_path = os.path.join(target_root_path, train_all_path[i])
    data = pd.read_csv(data_path,encoding="utf-8")
    n_records = data.shape[0]
    if data_key.loc[i,'flg_fixedprice'] == 1:
        auction_fixed += 1
        records_fixed += n_records
    else:
        auction_asc += 1
        records_asc += n_records

100%|██████████| 1306/1306 [00:00<00:00, 1673.17it/s]


In [16]:
# Mean number of records per auction for each price type. A type without any
# auction yields NaN instead of raising ZeroDivisionError.
avg_records_asc = records_asc/auction_asc if auction_asc else float("nan")
avg_records_fixed = records_fixed/auction_fixed if auction_fixed else float("nan")

print(f"Ascending-price中，一共有 *{records_asc}* 条records，平均每场auction有 *{avg_records_asc}* 条records")
print(f"Fixed-price中，一共有 *{records_fixed}* 条records，平均每场auction有 *{avg_records_fixed}* 条records")

Ascending-price中，一共有 *115880* 条records，平均每场auction有 *90.81504702194357* 条records
Fixed-price中，一共有 *1745* 条records，平均每场auction有 *58.166666666666664* 条records


# 2. 统计settings的信息

In [5]:
# Preview the first five rows of the combined settings table.
data_key.head(n=5)

Unnamed: 0,desc,bidincrement,bidfee,retail,flg_fixedprice
0,Sony Ericsson S500i Unlocked Mysterious Green,0.15,0.75,499.99,0
1,PSP Slim & Lite Sony Piano Black,0.15,0.75,169.99,0
2,iPod Touch Apple 8GB with Software Upgrade,0.15,0.75,299.99,0
3,LG KU990 Viewty Unlocked Black,0.0,0.75,899.99,1
4,Logitech Cordless Wave Keyboard and Mouse,0.15,0.75,89.99,0


In [19]:
# Number of distinct values in each numeric settings column.
n_bidincrement = data_key.bidincrement.nunique()
n_bidfee = data_key.bidfee.nunique()
n_retail = data_key.retail.nunique()

print(f"所有dataset中一共有 {n_bidincrement} 种unique ‘bid increment’")
print(f"所有dataset中一共有 {n_bidfee} 种‘bid fee’")
print(f"所有dataset中一共有 {n_retail} 种‘retail’")

# Size of the full cross product of the three columns, derived from the counts
# above rather than hard-coded so the figures stay correct when the data changes.
# The "-1" removes one bid-increment value for the fixed-price case
# (presumably the 0.0 increment — TODO confirm).
print(f"去掉fixed-data的话：({n_bidincrement}-1)*{n_bidfee}*{n_retail} = {(n_bidincrement-1)*n_bidfee*n_retail}种组合")
print(f"包括fixed-data的话：({n_bidincrement})*{n_bidfee}*{n_retail} = {(n_bidincrement)*n_bidfee*n_retail}种组合")

所有dataset中一共有 7 种unique ‘bid increment’
所有dataset中一共有 3 种‘bid fee’
所有dataset中一共有 506 种‘retail’
去掉fixed-data的话：(7-1)*3*506 = 9108种组合
包括fixed-data的话：(7)*3*506 = 10626种组合


In [20]:
# Show the distinct values themselves (the previous cell only reports counts).
bidincrement_values = data_key['bidincrement'].unique()
bidfee_values = data_key['bidfee'].unique()

print(f"所有dataset中一共有 {bidincrement_values} 种unique ‘bid increment’")
print(f"所有dataset中一共有 {bidfee_values} 种‘bid fee’")

所有dataset中一共有 [0.15 0.   0.01 0.12 0.24 0.02 0.06] 种unique ‘bid increment’
所有dataset中一共有 [0.75 0.6  0.01] 种‘bid fee’


In [11]:
# Group the settings rows by the three numeric columns and display how many
# rows fall into each (bidincrement, bidfee, retail) combination.
setting_cols = ['bidincrement', 'bidfee', 'retail']
data_grouped = data_key.groupby(by=setting_cols, as_index=False)
pd.DataFrame(data_grouped.size())

Unnamed: 0,bidincrement,bidfee,retail,size
0,0.00,0.75,129.95,1
1,0.00,0.75,249.99,3
2,0.00,0.75,299.99,1
3,0.00,0.75,349.99,1
4,0.00,0.75,449.99,1
...,...,...,...,...
697,0.24,0.60,149.99,1
698,0.24,0.60,169.99,2
699,0.24,0.60,205.00,1
700,0.24,0.60,259.99,1
