In [56]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2022/9/26 14:40
# @Author  : Wang Yujia
# @File    : data_cleaning.ipynb
# @Description : 1. 对outcomes.tsv进行清洗

# 0. what for
1. 对outcomes.tsv进行清洗
# 1. Preparations
## 1.1 全局设置

In [16]:
# ---- Input ----
# Raw auction outcomes (read later as a tab-separated file).
# Columns listed by the original author for this file:
# ['auction_id', 'product_id', 'item', 'desc', 'retail', 'price','finalprice', 'bidincrement', 'bidfee', 'winner', 'placedbids', 'freebids', 'endtime_str', 'flg_click_only', 'flg_beginnerauction', 'flg_fixedprice', 'flg_endprice']
outcomes_orignal_path = "../data/outcomes.tsv"

# ---- Outputs ----
# Cleaned outcomes table, and the lookup table mapping each description to its id.
output_path = "../data/outcomes_cleaned.csv"
prod_id_path = "../data/prod_id.csv"

# De-duplicated setting tables (one row per distinct combination of key columns).
# NOTE(review): unlike the outputs above, these two are written to the parent
# directory rather than ../data — confirm that this is intentional.
settings_GT_path = "../outcomes_settings_GT.csv"
settings_NN_path = "../outcomes_settings_NN.csv"

import numpy as np
import pandas as pd

## 1.2 读取data，统一单位
1. 把'bidfee'和'bidincrement'单位统一成dollar

In [5]:
# Load the raw tab-separated outcomes table and rescale 'bidfee' and
# 'bidincrement' by 0.01 in a single chained expression. Per the notebook's
# section note, the intent is to express both columns in dollars.
# NOTE(review): multiplying by 0.01 is not guaranteed to give the nearest
# double to the decimal value for every input; dividing by 100 would be
# correctly rounded. Left as-is so downstream values stay unchanged.
outcomes = pd.read_csv(outcomes_orignal_path, sep='\t').assign(
    bidfee=lambda raw: raw['bidfee'] * 0.01,
    bidincrement=lambda raw: raw['bidincrement'] * 0.01,
)
outcomes.head()

Unnamed: 0,auction_id,product_id,item,desc,retail,price,finalprice,bidincrement,bidfee,winner,placedbids,freebids,endtime_str,flg_click_only,flg_beginnerauction,flg_fixedprice,flg_endprice
0,86827,10009602,sony-ericsson-s500i-unlocked-mysterious-,Sony Ericsson S500i Unlocked Mysterious Green,499.99,13.35,13.35,0.15,0.75,Racer11,26,0,19:52 PDT 09-16-2008,0,0,0,0
1,87964,10009881,psp-slim-lite-sony-piano-black,PSP Slim & Lite Sony Piano Black,169.99,74.7,74.7,0.15,0.75,Cemo23,65,0,11:17 PDT 08-28-2008,0,0,0,0
2,87965,10009881,psp-slim-lite-sony-piano-black,PSP Slim & Lite Sony Piano Black,169.99,83.1,83.1,0.15,0.75,Jacobsonnich,94,0,22:52 PDT 11-07-2008,0,1,0,0
3,88638,10006115,sony-ericsson-s500i-unlocked-mysterious-,Sony Ericsson S500i Unlocked Mysterious Green,499.99,19.65,19.65,0.15,0.75,Mokkis,10,0,22:02 PDT 08-23-2008,0,0,0,0
4,88639,10006115,sony-ericsson-s500i-unlocked-mysterious-,Sony Ericsson S500i Unlocked Mysterious Green,499.99,47.1,47.1,0.15,0.75,Superloeffel,80,0,14:23 PDT 08-24-2008,0,0,0,0


- 只保留部分columns

In [8]:
# Columns retained for the rest of the cleaning steps; every other column
# of the loaded table is discarded here.
col_keep = [
    'auction_id',
    'product_id',
    'item',
    'desc',
    'retail',
    'price',
    'bidincrement',
    'bidfee',
    'flg_fixedprice',
]
# .copy() detaches the result from the full table so later edits do not alias it.
outcomes = outcomes.loc[:, col_keep].copy()

# 2. 根据规则drop数据
## 2.1 drop data where ['bidfee'] >= ['retail']
1. 异常数据，bid fee比retail都高。

In [10]:
# Flag rows whose bid fee is greater than or equal to the retail price;
# the notebook treats these as anomalous records.
fee_ge_retail = outcomes['bidfee'] >= outcomes['retail']

# Index labels of the flagged rows, then a new frame without them.
idx_drop = outcomes.index[fee_ge_retail.to_numpy()]
outcomes = outcomes.drop(index=idx_drop)

# Report how many rows were removed by this rule.
print("Drop 掉了 *{}* 行data，他们的bidfee >= retail".format(len(idx_drop)))

Drop 掉了 *0* 行data，他们的bidfee >= retail


## 2.2 drop data where 'desc'="-"
1. 有的data的desc是“-”，而且item是一个“xxxx.html”，认为异常数据
    - 'product_id' == 10010818

In [12]:
# Flag rows whose description is the literal placeholder '-'; the notebook
# treats these as anomalous records.
desc_is_dash = outcomes['desc'].eq('-')

# Index labels of the flagged rows, then a new frame without them.
idx_drop = outcomes.index[desc_is_dash.to_numpy()]
outcomes = outcomes.drop(index=idx_drop)

# Report how many rows were removed by this rule.
print(f"Drop 掉了 *{len(idx_drop)}* 行data，它们的desc是“-”）")

Drop 掉了 *36* 行data，它们的desc是“-”）


# 3. 添加id for desc
1. 有的product_id和item以及desc对不上，
2. 所以添加一列id，保留原来的'product_id'，希望“id”一样时，对应的desc是唯一的
    - 不应该以item为标准，因为有很多data的desc不同但是item一样
    - 而且item项的信息过于简略，不全
3. 可以预想的是，reformat之后，settings数目会减少，但是由于并没有drop数据，samples会增多一点

In [13]:
# Distinct descriptions, in order of first appearance.
desc_arr = pd.unique(outcomes['desc'])

# Lookup table: one consecutive integer id (starting at 0) per distinct description.
desc_df = pd.DataFrame(
    {
        'id': np.arange(len(desc_arr)),
        'desc': desc_arr,
    }
)

# Attach the id to every outcome row; a left join keeps all rows of `outcomes`.
outcomes_with_id = outcomes.merge(desc_df, how="left", on='desc')

# Number of distinct descriptions (equals the number of ids assigned).
print(len(desc_arr))

1778


# 4. save
1. 保存清洗完的数据，以及prod对应的id

In [15]:
# Write the cleaned outcomes (with ids) first, then the desc -> id lookup table.
# Both are saved as UTF-8 CSV with a header row and without the index column.
for frame, target_path in (
    (outcomes_with_id, output_path),
    (desc_df, prod_id_path),
):
    frame.to_csv(target_path, index=False, header=True, encoding="utf-8")
print("Done")

Done


2. 保存data key（即对setting列去重后得到的不同setting组合）

In [17]:
# Columns whose distinct value combinations define one "GT" setting.
unique_setting_GT = [
    'bidincrement',
    'bidfee',
    'retail',
]

# Project onto the key columns, then keep one row per distinct combination,
# renumbering the index from 0.
data_auction_GT = outcomes_with_id.loc[:, unique_setting_GT].copy()
settings_GT = data_auction_GT.drop_duplicates().reset_index(drop=True)
settings_GT.to_csv(settings_GT_path, index=False, header=True, encoding="utf-8")

# Report the key columns and the number of distinct combinations found.
print(f"当setting = {unique_setting_GT}")
print(f"一共有 *{settings_GT.shape[0]}*场不同的auction")

当setting = ['bidincrement', 'bidfee', 'retail']
一共有 *1638*场不同的auction


In [18]:
# Columns whose distinct value combinations define one "NN" setting;
# this key additionally includes the description.
unique_setting_NN = [
    'desc',
    'bidincrement',
    'bidfee',
    'retail',
]

# Project onto the key columns, then keep one row per distinct combination,
# renumbering the index from 0.
data_auction_NN = outcomes_with_id.loc[:, unique_setting_NN].copy()
settings_NN = data_auction_NN.drop_duplicates().reset_index(drop=True)
settings_NN.to_csv(settings_NN_path, index=False, header=True, encoding="utf-8")

# Report the key columns and the number of distinct combinations found.
print(f"当setting = {unique_setting_NN}")
print(f"一共有 *{settings_NN.shape[0]}*场不同的auction")

当setting = ['desc', 'bidincrement', 'bidfee', 'retail']
一共有 *3455*场不同的auction
