In [37]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2022/9/29 11:07
# @Author  : Wang Yujia
# @File    : data_extract_for_fixed_symmetry.ipynb
# @Description : 为fixed-price auction从outcomes中提取features和data。ref：信息不对称的paper里symmetry的情况。

# 0. what for
1. 提取fixed-price auction（symmetry）的GT model需要的features, or settings
    - 并不是所有settings都会被considered into, 样本数小于`threshold`的settings不予考虑
2. 选择method-2来计算`n`与`p`，毕竟`traces.tsv`中不包含任何fixed-price auctions
3. 代码参考自`../BasicInfo/calculate_n.np`，对于列名/参数名进行了优化和更改

# 1. Preparations
## 1.1 全局设置

In [38]:
# outcomes dataset
outcomes_orignal_path = "../data/outcomes.tsv"
# traces dataset
traces_original_path = "../data/traces.tsv"
# 计算完n和p后，可以使用的dataset被output在
output_path = "../data/info_asymm/datawithnp_fixed_symmetry.csv"
# 经过threshold后，得到的dataset
data_selected_path = "../data/info_asymm/datawithnp_fixed_symmetry_selected.csv"

# 样本数小于`threshold`的settings不予考虑
threshold = 16

# 最终GT model做generate需要的features（不包括np），其中`product_id` is not necessary
# fixed model 多了一个'finalprice'
features_GT = ['auction_id','product_id','bidincrement','bidfee','retail','finalprice']
# 衡量一场auction是否unique的标志
# 需要喂给NN的necessary features
unique_setting = ['product_id', 'bidincrement', 'bidfee', 'retail']
# fixed-auction flag
flg_fixed = ['flg_fixedprice']
# To get 'n' by method-2, need this feature:
price_feature = ['price']

import numpy as np
import pandas as pd

## 1.2 读取fixed-price的data
1. 从outcomes中筛选那些flg_fixed == 1的data
2. outcomes.tsv中一共有 *2203* 场auctions 是 fixed-price auction.

In [39]:
outcomes = pd.read_csv(outcomes_orignal_path, sep='\t')
print("outcomes.tsv 中一共有 *{}* 场auctions.".format(outcomes.shape[0]))

outcomes = outcomes[(features_GT+flg_fixed+price_feature)]
outcomes['product_id'] = outcomes['product_id'].astype(int)
idx = np.array(outcomes[flg_fixed] == 1)  # 记得转化成array，否则容易因为index不对应产生NaN值
outcomes_fixed = outcomes[idx]
print("outcomes.tsv 中一共有 *{}* 场auctions 是 fixed-price auction.".format(outcomes_fixed.shape[0]))

outcomes.tsv 中一共有 *121419* 场auctions.
outcomes.tsv 中一共有 *2203* 场auctions 是 fixed-price auction.


# 2. 计算n和p
## 2.1 计算n
1. 这里计算n是“方法二”：在outcomes.tsv中通过(price-0)/bidincrement来计算

In [40]:
data_withn = outcomes_fixed.copy()
data_withn.loc[:,'n_2'] = outcomes.loc[:,'price'] / (outcomes.loc[:,'bidincrement']*0.01)
data_withn['n_2'] = data_withn['n_2'].astype(int)                      # 把n变成int

data_withn = data_withn.drop(flg_fixed+price_feature,axis=1)           # 去掉不用的列

## 2.2 计算p
1. `unique setting=['product_id', 'bidincrement', 'bidfee','retail']`，unique setting一样的auction认为是同一个
2. `P = cnt_n_2 / cnt_uniq`
3. cnt_uniq = ['n_2'].nunique() under the same 'unique setting'

In [41]:
# 2.2.1 each unique setting对应了'cnt_uniq'场auction: data_withn_cnt
# 注意'cnt_uniq'并不需要出现在最后的data中，但是后面和threshold会联动一下
data_grouped_tmp = data_withn.groupby(unique_setting,as_index=False)
tmp = pd.DataFrame(data_grouped_tmp.size())

data_withn_cnt = pd.merge(data_withn, tmp, on=unique_setting, how="left")
data_withn_cnt.rename(columns={'size': 'cnt_uniq'}, inplace=True)

- P = cnt_n / cnt_uniq,上面已经算了' cnt_uniq'，下面需要算cnt_n

In [42]:
# 2.2.2 计算cnt_n_2，并添加到data_withn的一列: data_withn_cnt_n1
        # cnt_n_2表示某个setting下的n_1某数值出现了几次/ Example: cnt_n=2表示在某个setting下，有2场拍卖持续了n_1轮
features_cnt_n_2 = unique_setting+['n_2']
data_grouped_tmp = data_withn.groupby(features_cnt_n_2,as_index=False)
tmp = pd.DataFrame(data_grouped_tmp.size())

data_withn_cnt_n2 = pd.merge(data_withn_cnt, tmp, on=features_cnt_n_2, how="left")
data_withn_cnt_n2.rename(columns={'size': 'cnt_n_2'}, inplace=True)

- P = cnt_n / cnt_uniq
- 输出的结果在`data_withn_cnt_n12`中，其中包含了所需要的3个值：cnt_n_1 cnt_uniq

In [43]:
# 2.2.3 计算P=cnt_n_2 / cnt_uniq: data_withn_cnt_n2

tmp = data_withn_cnt_n2['cnt_n_2'] / data_withn_cnt_n2['cnt_uniq']
data_withn_cnt_n2['P'] = tmp

## 2.3 保存结果
0. 逻辑检查请在之前进行
1. 这里保存的是没有经过`threshold`筛选的data
2. 去重注意不能按照`unique_setting`去搞，毕竟一个`unique_setting`对应一组`np`值，也就是对应一个概率分布

In [44]:
# 去重，重命名与输出
data_withn_cnt_n2.rename(columns={'n_2': 'N'}, inplace=True)
data_withn_cnt_n2.drop_duplicates(subset=(['N','P']+features_GT),inplace=True)
data_withn_cnt_n2.to_csv(output_path, header=True, encoding="utf-8",index=False)
print("The data is like: ")
print(data_withn_cnt_n2.head())

The data is like: 
   auction_id  product_id  bidincrement  bidfee  retail  finalprice     N  \
0       90391    10007682            15      75  899.99         1.0   175   
1       90392    10007682            15      75  899.99         1.0  1074   
2       92731    10009561            15      75  799.99         1.0    21   
3       92744    10009440            15      75  899.99         1.0  1073   
4       92933    10008521            15      75  749.99         1.0  1242   

   cnt_uniq  cnt_n_2         P  
0         8        1  0.125000  
1         8        1  0.125000  
2         3        1  0.333333  
3        15        1  0.066667  
4        93        1  0.010753  


# 3. 根据threshold筛选data
1. 取样本数在threshold之上的setting作为数据集来使用
    - 根据每个unique setting[product_id,bidincrement,bidfee]下的样本数: data_withn_cnt_n1['cnt_uniq']，用`threshold`筛选
2. GT model可以根据这个进行计算

In [45]:
# 筛选unique setting对应的样本数(data_withn_cnt_n1['cnt_uniq'])在threshold之上的部分
data_selected = data_withn_cnt_n2[data_withn_cnt_n2['cnt_uniq'] >= threshold][:]
data_selected.to_csv(data_selected_path,header=True,encoding="utf-8",index=False)

# 4. output
1.输出一些信息

In [46]:
total_amount = data_withn_cnt_n2.shape[0]
data_selected_size = data_selected.shape[0]
print("在当前threshold下，dataset包括*{}*个setting\n".format(data_selected_size))
print("当前threshold为*{0}*，相当于取了*{1}%*个unique settings\n".format(threshold, round(data_selected_size/total_amount*100, 3)))


在当前threshold下，dataset包括*1597*个setting

当前threshold为*16*，相当于取了*72.492%*个unique settings

