In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2022/11/25 14:22
# @Author  : Wang Yujia
# @File    : data_extract_for_training.ipynb
# @Description : 为了training data分别提取csv并保存

# 1. Preparations
## 1.1 全局设置

In [8]:
# ---------------------------------------------------------------------------
# Global configuration: dataset switches, input paths and output locations.
# ---------------------------------------------------------------------------

# Artificial dataset (True) or real dataset (False).
ARTIFICIAL = True
# Whether GT_3 is used in addition to GT_1 / GT_2.
W_GT3 = True
# seed = [3,31,62,204,223,407,508,626], [4,31,35,204,407,66,508]
seed=35
# Noise ratio: the maximum percentage by which the generated data is allowed
# to fluctuate overall.
noise_pct = 0.05

# Small dataset
settings_small_NN_path = r"../data/small_settings_NN.csv"
prod_embedding_small_path = "../data/small_prod_embedding_300.csv"

# Large dataset.
# Both Windows paths are raw strings: in a normal string literal "\D" and "\l"
# are invalid escape sequences, which newer Python versions warn about.
# NOTE(review): these are absolute, machine-specific paths.
settings_large_NN_path = r'E:\DATA\large_dta\large_settings_NN.csv'
prod_embedding_large_path = r"E:\DATA\large_dta\large_prod_embedding_300.csv"

# Input GT tables and the output directory both depend on the switches above.
# The output directory name encodes the dataset kind, whether GT_3 is included
# ("v3") or not, and the seed (plus the noise ratio for artificial data).
if ARTIFICIAL:
    GT_1_path = r"../data/info_asymm/results/GT_1_artificial_LEN=300.csv"
    GT_2_path = f"../data/SA_PT/results/GT_2_artificial_SA_LEN=300_noise={noise_pct}_seed={seed}.csv"
    GT_3_path = f"../data/SA_PT/results/GT_3_artificial_SA_LEN=300_noise={noise_pct}_seed={seed}.csv"
    if W_GT3:
        train_root_path = f"../data/artificial_train_v3_noise={noise_pct}_seed={seed}/"
    else:
        train_root_path = f"../data/artificial_train_v2_noise={noise_pct}_seed={seed}/"
else:
    GT_1_path = r"../data/info_asymm/results/GT_1_LEN=300.csv"
    GT_2_path = f"../data/SA_PT/results/GT_2_NN_LEN=300_seed={seed}.csv"
    GT_3_path = f"../data/SA_PT/results/GT_3_NN_LEN=300_seed={seed}.csv"
    if W_GT3:
        train_root_path = f"../data/train_300_uniq_all_v3_seed={seed}/"
    else:
        train_root_path = f"../data/train_300_uniq_all_seed={seed}/"

# Output file name = train_file_head + zero-padded index + train_file_tail.
train_file_head = "train_data_NP_"
train_file_tail= ".csv"

unique_setting_NN = ['desc','bidincrement','bidfee','retail','flg_endprice']

import pandas as pd
import numpy as np
from tqdm import tqdm
import os

## 1.2 data读取与合并

In [9]:
# Make sure the output directory exists. With exist_ok=True the call creates
# the directory (and any missing parents) and does nothing when it is already
# there, so no separate os.path.exists() check is needed.
os.makedirs(train_root_path, exist_ok=True)

In [10]:
# --- GT tables --------------------------------------------------------------
GT_1 = pd.read_csv(GT_1_path, encoding="utf-8")
GT_2 = pd.read_csv(GT_2_path, encoding="utf-8")
# GT_3 is only read (and the name only defined) when W_GT3 is enabled.
if W_GT3:
    GT_3 = pd.read_csv(GT_3_path, encoding="utf-8")

# --- Product embeddings (small / large dataset) ------------------------------
prod_embedding_small = pd.read_csv(prod_embedding_small_path, encoding="utf-8")
prod_embedding_large = pd.read_csv(prod_embedding_large_path, encoding="utf-8")

# --- Settings tables (small / large dataset) ---------------------------------
data_key_small = pd.read_csv(settings_small_NN_path, encoding="utf-8")
data_key_large = pd.read_csv(settings_large_NN_path, encoding="utf-8")

# The row count of each settings table is reported as the number of auctions
# in that dataset.
len_small = len(data_key_small)
len_large = len(data_key_large)
print(f"小数据集有 *{len_small}* 场auction，大数据集有 *{len_large}* 场auction，一共*{len_small+len_large}*")

小数据集有 *1196* 场auction，大数据集有 *80* 场auction，一共*1276*


- 检查

In [11]:
# Each embedding table must have exactly one row per row of its settings table.
assert prod_embedding_small.shape[0] == len_small, (
    f"prod_embedding_small has {prod_embedding_small.shape[0]} rows, "
    f"expected {len_small} (rows in data_key_small)"
)
assert prod_embedding_large.shape[0] == len_large, (
    f"prod_embedding_large has {prod_embedding_large.shape[0]} rows, "
    f"expected {len_large} (rows in data_key_large)"
)

# The export loops read GT rows 0 .. len_small+len_large-1 by position, so
# every GT table in use needs at least that many rows. Checking here gives a
# clear message instead of an IndexError in the middle of the export.
n_rows_needed = len_small + len_large
gt_tables_in_use = {"GT_1": GT_1, "GT_2": GT_2}
if W_GT3:
    gt_tables_in_use["GT_3"] = GT_3
for gt_name, gt_table in gt_tables_in_use.items():
    assert gt_table.shape[0] >= n_rows_needed, (
        f"{gt_name} has {gt_table.shape[0]} rows, need at least {n_rows_needed}"
    )

- prod_embedding去掉desc列

In [12]:
# Remove the 'desc' column from both embedding tables.
# Reassigning (instead of inplace=True) together with errors="ignore" makes
# this cell safe to re-run: a second execution is a no-op rather than a
# KeyError because 'desc' has already been removed.
prod_embedding_small = prod_embedding_small.drop(columns=['desc'], errors="ignore")
prod_embedding_large = prod_embedding_large.drop(columns=['desc'], errors="ignore")

# 2. 按照data key拼接
1. 拼接GT1+GT2(+GT3，当`W_GT3=True`时)+embedding
2. 先拼小数据集，再拼大数据集

In [13]:
# Export one training CSV per row of the small dataset.
# Each file stacks, in this order: the GT_1 row, the GT_2 row, the GT_3 row
# (only when W_GT3 is set) and the matching product-embedding row.
LEN = 300
train_col = [str(i) for i in range(0, LEN)]

# GT tables contributing one row each; the only difference between the
# W_GT3 / non-W_GT3 variants is whether GT_3 is part of this list.
gt_tables = [GT_1, GT_2, GT_3] if W_GT3 else [GT_1, GT_2]

for i in tqdm(range(len_small)):
    # Series -> single-row DataFrame for every source table.
    stacked_rows = [pd.DataFrame(gt_table.iloc[i, :]).T for gt_table in gt_tables]
    stacked_rows.append(pd.DataFrame(prod_embedding_small.iloc[i, :]).T)
    train_tmp = pd.concat(stacked_rows, ignore_index=True)
    # Relabel the columns as "0".."LEN-1" (original note: to avoid problems
    # when merging). This raises if the frame does not have exactly LEN columns.
    train_tmp.columns = train_col
    # Save as <train_root_path>/train_data_NP_<4-digit index>.csv
    output_path = train_root_path + train_file_head + str(i).zfill(4) + train_file_tail
    train_tmp.to_csv(output_path, header=True, index=False, encoding="utf-8")

100%|██████████| 1196/1196 [00:02<00:00, 415.53it/s]


In [14]:
# Export one training CSV per row of the large dataset.
# The GT rows for the large dataset are read at positions len_small + i, and
# the output file index continues from len_small as well.
# Each file stacks, in this order: the GT_1 row, the GT_2 row, the GT_3 row
# (only when W_GT3 is set) and the matching product-embedding row.
LEN = 300
train_col = [str(i) for i in range(0, LEN)]

# GT tables contributing one row each; the only difference between the
# W_GT3 / non-W_GT3 variants is whether GT_3 is part of this list.
gt_tables = [GT_1, GT_2, GT_3] if W_GT3 else [GT_1, GT_2]

for i in tqdm(range(len_large)):
    gt_row = i + len_small
    # Series -> single-row DataFrame for every source table.
    stacked_rows = [pd.DataFrame(gt_table.iloc[gt_row, :]).T for gt_table in gt_tables]
    stacked_rows.append(pd.DataFrame(prod_embedding_large.iloc[i, :]).T)
    train_tmp = pd.concat(stacked_rows, ignore_index=True)
    # Relabel the columns as "0".."LEN-1" (original note: to avoid problems
    # when merging). This raises if the frame does not have exactly LEN columns.
    train_tmp.columns = train_col
    # Save as <train_root_path>/train_data_NP_<4-digit index>.csv
    output_path = train_root_path + train_file_head + str(gt_row).zfill(4) + train_file_tail
    train_tmp.to_csv(output_path, header=True, index=False, encoding="utf-8")

100%|██████████| 80/80 [00:00<00:00, 396.56it/s]


# 3. 拆分小表与保存【LEN<300】(用不到了..)
1. 缩小training data的粒度
    - scale=3时相当于LEN=100
2. **是在原LEN=300的基础上进行的，必须读入GT长度为300的那2个表+长度为新粒度的embedding表**

In [51]:
def rescale(data, SCALE, expected_len=300):
    """Coarsen a table by summing every SCALE consecutive columns.

    Columns are grouped left to right into buckets of SCALE columns; the last
    bucket holds whatever columns remain when the width is not a multiple of
    SCALE. Each bucket is reduced to one column with a row-wise sum.

    Args:
        data: DataFrame whose columns are to be bucketed.
        SCALE: bucket width in columns; must be >= 1.
        expected_len: required number of columns in ``data`` (default 300,
            the width this function was originally written for). Pass ``None``
            to accept any width.

    Returns:
        DataFrame with the same rows as ``data`` and ceil(width / SCALE)
        columns, labelled by position (0, 1, 2, ...).

    Raises:
        ValueError: if SCALE is smaller than 1.
        AssertionError: if ``expected_len`` is given and ``data`` has a
            different number of columns.
    """
    if SCALE < 1:
        raise ValueError(f"SCALE must be >= 1, got {SCALE}")

    n_cols = data.shape[1]
    if expected_len is not None:
        assert n_cols == expected_len, f"!={expected_len}"

    # Slicing past the last column simply stops at the end, so the final
    # (possibly shorter) bucket needs no special case.
    bucket_sums = [
        data.iloc[:, start:start + SCALE].sum(axis=1)
        for start in range(0, n_cols, SCALE)
    ]

    # Concatenate once instead of growing a frame inside the loop.
    if bucket_sums:
        data_new = pd.concat(bucket_sums, axis=1)
    else:
        # No columns at all: keep the rows, return zero columns.
        data_new = pd.DataFrame(index=data.index)
    assert data_new.shape[0] == data.shape[0],"Shape不等"

    return data_new

In [52]:
# Export coarser training files: every SCALE columns of the LEN=300 GT rows
# are summed into one, then the matching embedding row is appended.
#
# NOTE(review): GT_1_withid, GT_2_withid and prod_embedding are not defined in
# any cell visible in this notebook, so this cell fails on a fresh kernel
# unless they are created elsewhere - confirm before running.
# NOTE(review): files are written under train_root_path using the same file
# names as the LEN=300 export; the recorded output shows a different directory
# ("../data/train_60/"), so train_root_path was presumably reassigned before
# this cell was run - confirm, otherwise the LEN=300 files are overwritten.
SCALE = 5
LEN = np.ceil(300/SCALE).astype(int)

print(f"SCALE={SCALE}, LEN={LEN}")

# Column labels "0".."LEN-1" for the rescaled rows.
train_col = [str(i) for i in range(0,LEN)]

for i in tqdm(range(0,GT_1_withid.shape[0])):

    # Remember the 'id' of this row; named auction_id so the builtin id() is
    # not shadowed for the rest of the notebook.
    # NOTE(review): .loc[i] (label) and .iloc[i] (position) below address the
    # same row only if GT_1_withid has a default 0..n-1 index - confirm.
    auction_id = GT_1_withid.loc[i,'id']

    # Stack the GT_1 and GT_2 rows first.
    train_tmp = pd.concat([pd.DataFrame(GT_1_withid.iloc[i,:]).T,
                           pd.DataFrame(GT_2_withid.iloc[i,:]).T],
                          ignore_index=True)

    # Drop the non-series columns; rescale() asserts that exactly 300 remain.
    train_tmp = train_tmp.drop(columns=['id','bidincrement','bidfee','retail'])
    # Sum every SCALE consecutive columns.
    scaled_data = rescale(train_tmp,SCALE)
    # Relabel the columns (original note: to avoid problems when merging).
    scaled_data.columns = train_col

    # Look up the embedding row(s) by 'id' and drop the non-embedding columns.
    embedding = prod_embedding[prod_embedding['id'] == auction_id].drop(columns=['id','desc'])

    # Append the embedding below the rescaled GT rows.
    train_pd = pd.concat([scaled_data, embedding], ignore_index=True)
    # Save
    output_path = train_root_path+train_file_head+ str(i).zfill(4) + train_file_tail
    train_pd.to_csv(output_path,header=True,index=False,encoding="utf-8")

print("Done")
print(train_root_path)

SCALE=5, LEN=60


100%|██████████| 1196/1196 [00:35<00:00, 33.58it/s]

Done
../data/train_60/



