In [35]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2022/9/26 14:40
# @Author  : Wang Yujia
# @File    : data_extract_for_asc_symmetry_2.ipynb
# @Description : 1. 使用【method-2】计算target_n和target_p 2.为ascending-price 的GT模型 **以及** PT模型，从outcomes中提取features和data。

# 0. what for
1. 提取ascending-price auction（symmetry）的GT model需要的features, or settings
    - 并不是所有settings都会被considered into, 样本数小于`threshold`的settings不予考虑
2. 根据1中的settings，提取对应的data. 选择**method-2**计算`n`与`p`，作为target data使用
3. 代码参考自`../BasicInfo/calculate_n.np`，对于列名/参数名进行了优化和更改
4. Output：2个csv
    - `datawithnp_asc_symmetry_2.csv`：去除了重复行，但是未没经过threshold筛选的data
    - `datawithnp_asc_symmetry_2_selected.csv`去除了重复行，且经过threshold筛选，GTmodel需要的参数和target data都在这里了
5. **当GT模型计算需要更多的feature时**，更改`unique_setting`就好，把需要的feature提取出来
6. **检查逻辑的2个思路**：
    - 对于同一个unique setting, 是不是 sum(['cnt_n_1'])==['cnt_uniq'].
    - 对于同一个unique setting, 是不是['cnt_n_1']==2对应的['P']是['cnt_n_1']==1对应的['P']的 2倍
    - 逻辑检查 see 2.2.4
7. 刚意识到，'n'这个值其实是不会变的，变的是'p'，后者取决于我们把什么样的feature归为一类，而前者就是一场拍卖的轮数，是一个既已发生的客观值

# 1. Preparations
## 1.1 全局设置

In [36]:
# outcomes dataset
outcomes_orignal_path = "../data/outcomes.tsv"

# 计算完n和p后，可以使用的dataset被output在
output_path = "../data/info_asymm/datawithnp_asc_symmetry_2.csv"
# 经过threshold后，得到的dataset
data_selected_path = "../data/info_asymm/datawithnp_asc_symmetry_2_selected.csv"

# 最终GT model需要的features（不包括np），其中`product_id` is not necessary
# features_GT = ['bidincrement','bidfee','retail']

# 衡量一场auction是否unique的标志
unique_setting = ['product_id', 'bidincrement', 'bidfee','retail']
# To get 'n' by method-2, need this feature:
price_feature = ['price']

# 样本数小于`threshold`的settings不予考虑
threshold = 16

import numpy as np
import pandas as pd

## 1.2 读取data
1. 清洗掉data where `(outcomes['bidfee']*0.01)>=outcomes['retail']`

In [37]:
outcomes = pd.read_csv(outcomes_orignal_path, sep='\t')
outcomes = outcomes[(unique_setting+price_feature)]
idx_drop = outcomes[(outcomes['bidfee']*0.01)>=outcomes['retail']].index
outcomes.drop(idx_drop,axis = 0,inplace=True)
print("Drop 掉了 *{}* 行data，他们的bidfee >= retail".format(len(idx_drop)))

Drop 掉了 *32* 行data


# 2. 计算n和p
## 2.1 计算n
1. 这里计算n是 **“方法二”** ：在`outcomes.tsv`中通过(price-0)/bidincrement来计算

In [38]:
data_withn = outcomes.copy()
data_withn.loc[:,'n_2'] = outcomes.loc[:,'price'] / (outcomes.loc[:,'bidincrement']*0.01)
data_withn['n_2'] = data_withn['n_2'].astype(int)                      # 把n变成int

data_withn = data_withn.drop(price_feature,axis=1)                     # 去掉【绝对】不用的列

## 2.2 计算p
1. unique setting一样的auction认为是同一个
2. `P = cnt_n_2 / cnt_uniq`
3. `cnt_uniq = ['n_2'].nunique()` when 'unique setting' is the same

In [39]:
# 2.2.1 each unique setting对应了'cnt_uniq'场auction: data_withn_cnt
# 注意'cnt_uniq'本来并不需要出现在最后的data中，但是后面用threshold会联动一下
# 最后的data中保留了这一项，方便计算likelihood
data_grouped_tmp = data_withn.groupby(unique_setting,as_index=False)
tmp = pd.DataFrame(data_grouped_tmp.size())

data_withn_cnt = pd.merge(data_withn, tmp, on=unique_setting, how="left")
data_withn_cnt.rename(columns={'size': 'cnt_uniq'}, inplace=True)
data_withn_cnt.head()

Unnamed: 0,product_id,bidincrement,bidfee,retail,n_2,cnt_uniq
0,10009602,15,75,499.99,89,10
1,10009881,15,75,169.99,498,21
2,10009881,15,75,169.99,554,21
3,10006115,15,75,499.99,131,33
4,10006115,15,75,499.99,314,33


- P = cnt_n / cnt_uniq,上面已经算了' cnt_uniq'，下面需要算cnt_n

In [40]:
# 2.2.2 计算cnt_n_2，并添加到data_withn的一列: data_withn_cnt_n1
        # cnt_n_2表示某个setting下的n_1某数值出现了几次/ Example: cnt_n=2表示在某个setting下，有2场拍卖持续了n_1轮
features_cnt_n_2 = unique_setting+['n_2']
data_grouped_tmp = data_withn.groupby(features_cnt_n_2,as_index=False)
tmp = pd.DataFrame(data_grouped_tmp.size())

data_withn_cnt_n2 = pd.merge(data_withn_cnt, tmp, on=features_cnt_n_2, how="left")
data_withn_cnt_n2.rename(columns={'size': 'cnt_n_2'}, inplace=True)
data_withn_cnt_n2.head()

Unnamed: 0,product_id,bidincrement,bidfee,retail,n_2,cnt_uniq,cnt_n_2
0,10009602,15,75,499.99,89,10,1
1,10009881,15,75,169.99,498,21,1
2,10009881,15,75,169.99,554,21,1
3,10006115,15,75,499.99,131,33,1
4,10006115,15,75,499.99,314,33,1


- P = cnt_n / cnt_uniq
- 输出的结果在`data_withn_cnt_n12`中，其中包含了所需要的3个值：cnt_n_2 cnt_uniq

In [41]:
# 2.2.3 计算P=cnt_n_2 / cnt_uniq: data_withn_cnt_n2

tmp = data_withn_cnt_n2['cnt_n_2'] / data_withn_cnt_n2['cnt_uniq']
data_withn_cnt_n2['P'] = tmp
data_withn_cnt_n2.head()

Unnamed: 0,product_id,bidincrement,bidfee,retail,n_2,cnt_uniq,cnt_n_2,P
0,10009602,15,75,499.99,89,10,1,0.1
1,10009881,15,75,169.99,498,21,1,0.047619
2,10009881,15,75,169.99,554,21,1,0.047619
3,10006115,15,75,499.99,131,33,1,0.030303
4,10006115,15,75,499.99,314,33,1,0.030303


In [42]:
# 2.2.4 逻辑检查
# data_withn_cnt_n1.to_csv("data_check_tmp.csv", header=True, encoding="utf-8",index=False)

## 2.3 保存结果
0. 逻辑检查请在之前进行
1. **这里保存的是没有经过`threshold`筛选的data**
2. 去重注意不能按照`unique_setting`去搞，毕竟一个`unique_setting`对应一组`np`值，也就是对应一个概率分布

In [43]:
# 去重，重命名
data_withn_cnt_n2.rename(columns={'n_2': 'N'}, inplace=True)
len_before_drop = data_withn_cnt_n2.shape[0]
data_withn_cnt_n2.drop_duplicates(subset=(['N','P']+unique_setting),inplace=True)
print("Drop 掉了 *{}* 行duplicate data".format(data_withn_cnt_n2.shape[0] - len_before_drop))

# cent to dollar
data_withn_cnt_n2['bidincrement'] = data_withn_cnt_n2['bidincrement']*0.01
data_withn_cnt_n2['bidfee'] = data_withn_cnt_n2['bidfee']*0.01
# output to csv
data_withn_cnt_n2.to_csv(output_path, header=True, encoding="utf-8",index=False)
print("The data is like: ")
print(data_withn_cnt_n2.head())

Drop 掉了 *-36886* 行duplicate data
The data is like: 
   product_id  bidincrement  bidfee  retail    N  cnt_uniq  cnt_n_2         P
0    10009602          0.15    0.75  499.99   89        10        1  0.100000
1    10009881          0.15    0.75  169.99  498        21        1  0.047619
2    10009881          0.15    0.75  169.99  554        21        1  0.047619
3    10006115          0.15    0.75  499.99  131        33        1  0.030303
4    10006115          0.15    0.75  499.99  314        33        1  0.030303


# 3. 根据threshold筛选data
1. 取样本数在threshold之上的setting作为数据集来使用
    - 根据每个unique setting下的样本数: data_withn_cnt_n2['cnt_uniq']，用`threshold`筛选
2. GTmodel可以根据这个进行计算

In [44]:
# 筛选unique setting对应的样本数(data_withn_cnt_n2['cnt_uniq'])在threshold之上的部分
data_selected = data_withn_cnt_n2[data_withn_cnt_n2['cnt_uniq'] >= threshold][:]
data_selected.to_csv(data_selected_path,header=True,encoding="utf-8",index=False)

# 4. output
1.输出一些信息

In [45]:
total_amount = data_withn_cnt_n2.shape[0]
data_selected_size = data_selected.shape[0]
print("在当前threshold设置下，dataset包括{}个setting\n".format(data_selected_size))
print("当前threshold为{0}，相当于取了{1}%个unique settings\n".format(threshold, round(data_selected_size/total_amount*100, 3)))

在当前threshold设置下，dataset包括71178个setting

当前threshold为16，相当于取了84.233%个unique settings

