In [35]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2022/11/21 14:28
# @Author  : Wang Yujia
# @File    : data_cal_np.ipynb
# @Description : 1. 计算N和P 2. 设置threshold

# 0. what for
1. 计算N和P，从`data_extract_for_asc_symmetry_2.ipynb`提取出来的
    - 计算N的方法：**method-2**，`(price-0)/bidincrement`
2. 计算完后，样本数小于`threshold`的settings不予考虑
3. Output：2个csv
    - `datawithnp_2.csv`：去除了重复行，但是未经过threshold筛选的data
    - `datawithnp_2_selected.csv`去除了重复行，且经过threshold筛选
4. **检查逻辑的2个思路**：
    - 对于同一个unique setting, 是不是 sum(['cnt_n_2'])==['cnt_uniq'].
    - 对于同一个unique setting, 是不是['cnt_n_2']==2对应的['P']是['cnt_n_2']==1对应的['P']的 2倍
    - 逻辑检查 see 2.2.4
# 1. Preparations
## 1.1 全局设置

In [36]:
# --- Input -----------------------------------------------------------------
# Cleaned outcomes dataset that this notebook reads.
outcomes_path = "../data/outcomes_cleaned.csv"

# --- Output ----------------------------------------------------------------
# Destination of the dataset that remains after threshold filtering.
data_selected_path = "../data/datawithnp_2_selected.csv"

# --- Parameters ------------------------------------------------------------
# Columns that together identify one unique auction setting.
unique_setting = [
    'id',
    'bidincrement',
    'bidfee',
    'retail',
]

# Minimum number of samples a unique setting must have to be kept.
threshold = 16

import numpy as np
import pandas as pd

## 1.2 读取data

In [37]:
# Load the outcomes table from `outcomes_path` (decoded as UTF-8).
outcomes = pd.read_csv(
    outcomes_path,
    encoding="utf-8",
)

# 2. 计算N和P
## 2.1 计算N
1. 这里计算N是 **“方法二”** ：在`outcomes.tsv`中通过(price-0)/bidincrement来计算

In [38]:
# Work on a copy so the `outcomes` frame loaded above is left untouched.
data_withn = outcomes.copy()

# N ("method 2"): price / bidincrement, stored as an integer column `n_2`.
#
# Binary floating point cannot represent most decimal fractions exactly, so a
# quotient that should be a whole number can land just below it
# (e.g. 0.3 / 0.1 evaluates to 2.9999999999999996). Casting that straight to
# int truncates it to N-1. Rounding to 6 decimals first removes this noise
# while keeping the original truncation for genuinely fractional quotients
# (2.5 still becomes 2).
data_withn['n_2'] = (
    (outcomes['price'] / outcomes['bidincrement'])
    .round(6)
    .astype(int)
)
print("Done")

Done


## 2.2 threshold
1. **取样本数在threshold之上的setting作为数据集来使用**
    - 根据每个unique setting下的样本数: `data_withn_cnt['cnt_uniq']`，用`threshold`筛选
2. GTmodel可以根据这个进行计算

In [39]:
# Each unique setting corresponds to `cnt_uniq` auctions.
# Note: `cnt_uniq` would not strictly need to appear in the final data, but
# the threshold step below depends on it. It is kept in the final data
# because it is convenient for computing the likelihood.

# Number of rows per unique setting (the count lands in a column named 'size').
data_grouped_tmp = data_withn.groupby(by=unique_setting, as_index=False)
tmp = pd.DataFrame(data_grouped_tmp.size())

# Attach the per-setting count to every auction row and name it `cnt_uniq`.
data_withn_cnt = (
    data_withn
    .merge(tmp, how="left", on=unique_setting)
    .rename(columns={'size': 'cnt_uniq'})
)

# A left merge on the grouping keys must not add or drop rows.
assert len(data_withn) == len(data_withn_cnt), "Wrong!"
data_withn_cnt.head()

Unnamed: 0,auction_id,product_id,item,desc,retail,price,finalprice,bidincrement,bidfee,winner,placedbids,freebids,endtime_str,flg_click_only,flg_beginnerauction,flg_fixedprice,flg_endprice,id,n_2,cnt_uniq
0,86827,10009602,sony-ericsson-s500i-unlocked-mysterious-,Sony Ericsson S500i Unlocked Mysterious Green,499.99,13.35,13.35,0.15,0.75,Racer11,26,0,19:52 PDT 09-16-2008,0,0,0,0,0,89,69
1,87964,10009881,psp-slim-lite-sony-piano-black,PSP Slim & Lite Sony Piano Black,169.99,74.7,74.7,0.15,0.75,Cemo23,65,0,11:17 PDT 08-28-2008,0,0,0,0,1,498,60
2,87965,10009881,psp-slim-lite-sony-piano-black,PSP Slim & Lite Sony Piano Black,169.99,83.1,83.1,0.15,0.75,Jacobsonnich,94,0,22:52 PDT 11-07-2008,0,1,0,0,1,554,60
3,88638,10006115,sony-ericsson-s500i-unlocked-mysterious-,Sony Ericsson S500i Unlocked Mysterious Green,499.99,19.65,19.65,0.15,0.75,Mokkis,10,0,22:02 PDT 08-23-2008,0,0,0,0,0,131,69
4,88639,10006115,sony-ericsson-s500i-unlocked-mysterious-,Sony Ericsson S500i Unlocked Mysterious Green,499.99,47.1,47.1,0.15,0.75,Superloeffel,80,0,14:23 PDT 08-24-2008,0,0,0,0,0,314,69


In [40]:
# Keep only the rows whose unique setting has at least `threshold` samples
# (`cnt_uniq` is the per-setting sample count). `.copy()` detaches the result
# from `data_withn_cnt`.
data_selected = data_withn_cnt.loc[data_withn_cnt['cnt_uniq'] >= threshold].copy()

# The rows removed here fall below the sample-count threshold; they are not
# duplicates, so the message says so explicitly.
print("Drop 掉了 *{}* 行样本数小于threshold的data".format(len(data_withn_cnt) - len(data_selected)))

Drop 掉了 *12175* 行duplicate data


In [41]:
# Re-count the unique settings that survive the threshold filter so they can
# be compared with the count before filtering (`tmp`).
data_grouped_tmp_2 = data_selected.groupby(by=unique_setting, as_index=False)
tmp_2 = pd.DataFrame(data_grouped_tmp_2.size())

# One row per unique setting in both frames, so the row counts are the
# numbers of settings before / after filtering.
print("之前有 *{0}* 个uniq settings，现在有 *{1}*个".format(len(tmp), len(tmp_2)))
print("Drop 掉了 *{}* 个uniq auction settings".format(len(tmp) - len(tmp_2)))
print(f"uniq settings = {unique_setting}")

之前有 *3402* 个uniq settings，现在有 *1196*个
Drop 掉了 *2206* 个uniq auction settings
uniq settings = ['id', 'bidincrement', 'bidfee', 'retail']


In [42]:
# Drop the temporary group-by helpers from the namespace.
del tmp, tmp_2, data_grouped_tmp_2, data_grouped_tmp

## 2.3 计算p
1. unique setting一样的auction认为是同一个
2. `P = cnt_n_2 / cnt_uniq`，下面需要计算`cnt_n_2`
    - `cnt_n_2`表示某个setting下的`n_2`某数值出现了几次
    - Example: `cnt_n_2=2`表示在某个setting下，有2场拍卖持续了`n_2`轮
3. 分组所用的列：`features_cnt_n_2 = unique_setting + ['n_2']`

In [43]:
# Preview the filtered data: first rows together with its (rows, columns) shape.
(data_selected.head(), data_selected.shape)

(   auction_id  product_id                                      item  \
 0       86827    10009602  sony-ericsson-s500i-unlocked-mysterious-   
 1       87964    10009881            psp-slim-lite-sony-piano-black   
 2       87965    10009881            psp-slim-lite-sony-piano-black   
 3       88638    10006115  sony-ericsson-s500i-unlocked-mysterious-   
 4       88639    10006115  sony-ericsson-s500i-unlocked-mysterious-   
 
                                             desc  retail  price  finalprice  \
 0  Sony Ericsson S500i Unlocked Mysterious Green  499.99  13.35       13.35   
 1               PSP Slim & Lite Sony Piano Black  169.99  74.70       74.70   
 2               PSP Slim & Lite Sony Piano Black  169.99  83.10       83.10   
 3  Sony Ericsson S500i Unlocked Mysterious Green  499.99  19.65       19.65   
 4  Sony Ericsson S500i Unlocked Mysterious Green  499.99  47.10       47.10   
 
    bidincrement  bidfee        winner  placedbids  freebids  \
 0          0.15    

In [44]:
# `cnt_n_2` is how many times a given `n_2` value occurs within one setting.
# Example: cnt_n_2 == 2 means that, under that setting, two auctions lasted
# n_2 rounds.
features_cnt_n_2 = [*unique_setting, 'n_2']

# Number of rows per (setting, n_2) pair; the count lands in a column 'size'.
data_grouped_tmp = data_selected.groupby(by=features_cnt_n_2, as_index=False)
tmp = pd.DataFrame(data_grouped_tmp.size())

# Attach the count to every row of the filtered data and name it `cnt_n_2`.
data_withn_cnt_n2 = (
    data_selected
    .merge(tmp, how="left", on=features_cnt_n_2)
    .rename(columns={'size': 'cnt_n_2'})
)
(data_withn_cnt_n2.head(), data_withn_cnt_n2.shape)

(   auction_id  product_id                                      item  \
 0       86827    10009602  sony-ericsson-s500i-unlocked-mysterious-   
 1       87964    10009881            psp-slim-lite-sony-piano-black   
 2       87965    10009881            psp-slim-lite-sony-piano-black   
 3       88638    10006115  sony-ericsson-s500i-unlocked-mysterious-   
 4       88639    10006115  sony-ericsson-s500i-unlocked-mysterious-   
 
                                             desc  retail  price  finalprice  \
 0  Sony Ericsson S500i Unlocked Mysterious Green  499.99  13.35       13.35   
 1               PSP Slim & Lite Sony Piano Black  169.99  74.70       74.70   
 2               PSP Slim & Lite Sony Piano Black  169.99  83.10       83.10   
 3  Sony Ericsson S500i Unlocked Mysterious Green  499.99  19.65       19.65   
 4  Sony Ericsson S500i Unlocked Mysterious Green  499.99  47.10       47.10   
 
    bidincrement  bidfee        winner  ...  freebids           endtime_str  \
 0   

- `P = cnt_n_2 / cnt_uniq`
- 输出的结果在`data_withn_cnt_n2`中，其中包含了计算P所需要的2个值：`cnt_n_2`和`cnt_uniq`

In [45]:
# Compute P = cnt_n_2 / cnt_uniq and store it as column `P` of
# `data_withn_cnt_n2`: the share of a setting's auctions that have this n_2.
data_withn_cnt_n2['P'] = data_withn_cnt_n2['cnt_n_2'].div(data_withn_cnt_n2['cnt_uniq'])
(data_withn_cnt_n2.head(), data_withn_cnt_n2.shape)

(   auction_id  product_id                                      item  \
 0       86827    10009602  sony-ericsson-s500i-unlocked-mysterious-   
 1       87964    10009881            psp-slim-lite-sony-piano-black   
 2       87965    10009881            psp-slim-lite-sony-piano-black   
 3       88638    10006115  sony-ericsson-s500i-unlocked-mysterious-   
 4       88639    10006115  sony-ericsson-s500i-unlocked-mysterious-   
 
                                             desc  retail  price  finalprice  \
 0  Sony Ericsson S500i Unlocked Mysterious Green  499.99  13.35       13.35   
 1               PSP Slim & Lite Sony Piano Black  169.99  74.70       74.70   
 2               PSP Slim & Lite Sony Piano Black  169.99  83.10       83.10   
 3  Sony Ericsson S500i Unlocked Mysterious Green  499.99  19.65       19.65   
 4  Sony Ericsson S500i Unlocked Mysterious Green  499.99  47.10       47.10   
 
    bidincrement  bidfee        winner  ...           endtime_str  \
 0          0.1

# 3. 保存结果
## 3.1 去重
1. 去重注意**不能**按照`unique_setting`去搞，毕竟一个`unique_setting`对应多对`np`值
2. **应该按照** `unique_setting`+`np`+`cnt_n_2`
3. 'cnt_uniq'不是必须的列

In [46]:
# Expose the round count under its final column name `N`.
data_withn_cnt_n2 = data_withn_cnt_n2.rename(columns={'n_2': 'N'})

# Collapse rows that share the same (N, P, cnt_n_2, unique setting); the
# first row of each such group is the one that is kept.
len_before_drop = len(data_withn_cnt_n2)
data_withn_cnt_n2 = data_withn_cnt_n2.drop_duplicates(
    subset=['N', 'P', 'cnt_n_2', *unique_setting],
)
print("Drop 掉了 *{}* 行duplicate data".format(len_before_drop - len(data_withn_cnt_n2)))
# 221124: 300 fewer rows of data are dropped

Drop 掉了 *38684* 行duplicate data


In [47]:
# Write the de-duplicated result to `data_selected_path` as UTF-8 CSV,
# with a header row and without the DataFrame index.
data_withn_cnt_n2.to_csv(
    data_selected_path,
    index=False,
    header=True,
    encoding="utf-8",
)
data_withn_cnt_n2.head()

Unnamed: 0,auction_id,product_id,item,desc,retail,price,finalprice,bidincrement,bidfee,winner,...,endtime_str,flg_click_only,flg_beginnerauction,flg_fixedprice,flg_endprice,id,N,cnt_uniq,cnt_n_2,P
0,86827,10009602,sony-ericsson-s500i-unlocked-mysterious-,Sony Ericsson S500i Unlocked Mysterious Green,499.99,13.35,13.35,0.15,0.75,Racer11,...,19:52 PDT 09-16-2008,0,0,0,0,0,89,69,1,0.014493
1,87964,10009881,psp-slim-lite-sony-piano-black,PSP Slim & Lite Sony Piano Black,169.99,74.7,74.7,0.15,0.75,Cemo23,...,11:17 PDT 08-28-2008,0,0,0,0,1,498,60,1,0.016667
2,87965,10009881,psp-slim-lite-sony-piano-black,PSP Slim & Lite Sony Piano Black,169.99,83.1,83.1,0.15,0.75,Jacobsonnich,...,22:52 PDT 11-07-2008,0,1,0,0,1,554,60,1,0.016667
3,88638,10006115,sony-ericsson-s500i-unlocked-mysterious-,Sony Ericsson S500i Unlocked Mysterious Green,499.99,19.65,19.65,0.15,0.75,Mokkis,...,22:02 PDT 08-23-2008,0,0,0,0,0,131,69,1,0.014493
4,88639,10006115,sony-ericsson-s500i-unlocked-mysterious-,Sony Ericsson S500i Unlocked Mysterious Green,499.99,47.1,47.1,0.15,0.75,Superloeffel,...,14:23 PDT 08-24-2008,0,0,0,0,0,314,69,1,0.014493
