In [101]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2022/11/25 14:22
# @Author  : Wang Yujia
# @File    : data_extract_for_training.ipynb
# @Description : 为了training data分别提取csv并保存

# 1. Preparations
## 1.1 全局设置
1. output一共`target_data_key.shape[0]`个文件（`target_data_key`的每一行对应一个输出文件）

In [135]:
# ---- Input files (read by the loading cell) ----
GT_1_data_path = "../data/info_asymm/results/asc_symmetry/GT_asc_symmetry_P2_K=300.csv"
GT_2_data_path = "../data/SA_PT/results/PT_oneforall_P_K=300.csv"
prod_embedding_output_path = "../data/prod_embedding.csv"
target_data_key_path = "../data/targets/target_datakey.csv"

# ---- Output files ----
# Each output file is named train_root_path + train_file_head + <row index> + train_file_tail.
train_root_path = "../data/train/"
train_file_head = "train_data_NP_"
train_file_tail = ".csv"
# The training data key file is the same file as the target data key.
train_data_key_path = target_data_key_path

# ---- Column groups ----
# Columns shared by the key table and the GT tables; used as the merge keys.
GT_features = ['bidincrement', 'bidfee', 'retail']
# Merge keys plus the product 'id'.
unique_features = ['id', 'bidincrement', 'bidfee', 'retail']
# Column used to look up a product's row in the embedding table.
prod_embedding_features = ['id']

import os

import numpy as np
import pandas as pd
from tqdm import tqdm

## 1.2 data读取与保存data_key

In [103]:
# Load the two GT tables, the product embedding table and the target key table,
# all as UTF-8 encoded CSV files.
data_1, data_2, prod_embedding, target_data_key = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in (
        GT_1_data_path,
        GT_2_data_path,
        prod_embedding_output_path,
        target_data_key_path,
    )
)

print(data_1.shape, data_2.shape, target_data_key.shape)

# Both GT tables must have identical dimensions before they are paired up.
assert data_1.shape == data_2.shape, "Wrong"

(605, 303) (605, 303) (1196, 4)


In [104]:
# Preview the first five rows of the first GT table.
data_1.head(n=5)

Unnamed: 0,bidincrement,bidfee,retail,0,1,2,3,4,5,6,...,290,291,292,293,294,295,296,297,298,299
0,0.15,0.75,499.99,0.003988,0.003983,0.003978,0.003973,0.003969,0.003964,0.003959,...,0.00277,0.002766,0.002763,0.002759,0.002755,0.002752,0.002748,0.002745,0.002741,0.002737
1,0.15,0.75,169.99,0.005615,0.005595,0.005575,0.005556,0.005536,0.005516,0.005497,...,0.001716,0.001708,0.0017,0.001692,0.001684,0.001676,0.001668,0.00166,0.001652,0.001644
2,0.15,0.75,299.99,0.004491,0.004482,0.004473,0.004464,0.004455,0.004446,0.004437,...,0.002398,0.002392,0.002387,0.002381,0.002375,0.00237,0.002364,0.002359,0.002353,0.002348
3,0.15,0.75,89.99,0.008598,0.008541,0.008484,0.008427,0.00837,0.008314,0.008258,...,0.000603,0.000595,0.000587,0.00058,0.000572,0.000565,0.000557,0.00055,0.000543,0.000535
4,0.15,0.75,59.99,0.012513,0.012388,0.012263,0.01214,0.012017,0.011895,0.011775,...,6.7e-05,6.4e-05,6.2e-05,6e-05,5.7e-05,5.5e-05,5.3e-05,5.1e-05,4.9e-05,4.7e-05


# 2. 合并与拆分
## 2.1 合并大表
1. 合并的是target_key+GT的P vector
2. 合并之后会多一列`id`，后面会去掉这一列
3. 注意：**一定要让`target_data_key`成为左表**，这样连接之后，index可以一一对应

In [105]:
# Preview the first five rows of the target key table (the left side of the merges).
target_data_key.head(n=5)

Unnamed: 0,id,bidincrement,bidfee,retail
0,0,0.15,0.75,499.99
1,1,0.15,0.75,169.99
2,2,0.15,0.75,299.99
3,3,0.15,0.75,89.99
4,5,0.15,0.75,59.99


In [106]:
# Left-join data_1 onto the target keys using the GT_features columns,
# so every row of target_data_key is kept and carries its 'id'.
GT_1_withid = target_data_key.merge(data_1, how="left", on=GT_features)
(GT_1_withid.head(), GT_1_withid.shape)

(   id  bidincrement  bidfee  retail         0         1         2         3  \
 0   0          0.15    0.75  499.99  0.003988  0.003983  0.003978  0.003973   
 1   1          0.15    0.75  169.99  0.005615  0.005595  0.005575  0.005556   
 2   2          0.15    0.75  299.99  0.004491  0.004482  0.004473  0.004464   
 3   3          0.15    0.75   89.99  0.008598  0.008541  0.008484  0.008427   
 4   5          0.15    0.75   59.99  0.012513  0.012388  0.012263  0.012140   
 
           4         5  ...       290       291       292       293       294  \
 0  0.003969  0.003964  ...  0.002770  0.002766  0.002763  0.002759  0.002755   
 1  0.005536  0.005516  ...  0.001716  0.001708  0.001700  0.001692  0.001684   
 2  0.004455  0.004446  ...  0.002398  0.002392  0.002387  0.002381  0.002375   
 3  0.008370  0.008314  ...  0.000603  0.000595  0.000587  0.000580  0.000572   
 4  0.012017  0.011895  ...  0.000067  0.000064  0.000062  0.000060  0.000057   
 
         295       296       2

In [107]:
# Left-join data_2 onto the target keys using the GT_features columns,
# so every row of target_data_key is kept and carries its 'id'.
GT_2_withid = target_data_key.merge(data_2, how="left", on=GT_features)
(GT_2_withid.head(), GT_2_withid.shape)

(   id  bidincrement  bidfee  retail         0         1         2         3  \
 0   0          0.15    0.75  499.99  0.001190  0.001197  0.001204  0.001212   
 1   1          0.15    0.75  169.99  0.004120  0.004129  0.004137  0.004145   
 2   2          0.15    0.75  299.99  0.001548  0.001557  0.001565  0.001574   
 3   3          0.15    0.75   89.99  0.014231  0.014097  0.013964  0.013831   
 4   5          0.15    0.75   59.99  0.026807  0.026188  0.025580  0.024982   
 
           4         5  ...           290           291           292  \
 0  0.001219  0.001227  ...  6.739960e-03  6.779261e-03  6.818779e-03   
 1  0.004153  0.004161  ...  1.316296e-03  1.302101e-03  1.288004e-03   
 2  0.001582  0.001591  ...  5.199096e-03  5.209410e-03  5.219623e-03   
 3  0.013697  0.013564  ...  4.994846e-05  4.879520e-05  4.766925e-05   
 4  0.024395  0.023818  ...  4.603279e-31  1.240681e-31  3.296644e-32   
 
             293           294           295           296           297  \
 0

## 2.2 拆分小表与保存



In [133]:
# Sanity checks on the merged tables before they are split into per-row files.
print(GT_1_withid.shape[0],GT_2_withid.shape[0],prod_embedding.shape[0])
print(GT_1_withid.shape[1],GT_2_withid.shape[1],prod_embedding.shape[1])

# Both merged tables are read row by row with the same position, so they must
# have the same number of rows.
assert GT_1_withid.shape[0]==GT_2_withid.shape[0],"wrong!"

# A left merge adds rows when the right table holds duplicate keys. Comparing the
# two merged tables with each other cannot detect that (both could grow alike),
# so compare against the key table as well.
assert GT_1_withid.shape[0]==target_data_key.shape[0],(
    "left merge changed the row count: "
    f"{GT_1_withid.shape[0]} merged rows vs {target_data_key.shape[0]} target keys"
)

# The split step stacks GT rows (without 'id') on top of embedding rows
# (without 'id' and 'desc'), so the remaining widths have to agree.
assert GT_1_withid.shape[1]==GT_2_withid.shape[1],"GT tables differ in column count"
assert GT_1_withid.shape[1]-1==prod_embedding.shape[1]-2,(
    "GT rows and embedding rows differ in width: "
    f"{GT_1_withid.shape[1]-1} vs {prod_embedding.shape[1]-2}"
)

1196 1196 907
304 304 305


1. 把大表按照unique_features分成小表，
2. 小表里按照'id'，merge进去embedding的信息,然后删除'id'列，输出保存

In [136]:
# Write one training CSV per row of the merged tables. Each file holds the row
# from GT_1_withid, the row from GT_2_withid and the embedding row(s) of the same
# product id, in that order.

# Create the output directory up front so to_csv does not fail on a fresh checkout.
os.makedirs(train_root_path, exist_ok=True)

# Positional labels '0'..'N-1' for the GT columns that remain once 'id' is dropped.
# Renaming avoids label clashes when the GT rows are stacked with the embedding rows.
# NOTE(review): assumes the embedding feature columns carry the same labels — confirm.
train_col = [str(i) for i in range(0, GT_1_withid.shape[1] - 1)]

# Loop-invariant: the embedding table without 'id' and 'desc'. It keeps the index
# of prod_embedding, so a boolean mask built from prod_embedding['id'] still aligns.
embedding_features = prod_embedding.drop(columns=['id', 'desc'])

# Product ids whose embedding lookup did not return exactly one row.
ids_without_single_embedding = []

for i in tqdm(range(0, GT_1_withid.shape[0])):
    # Remember the product id before the 'id' column is dropped
    # (named product_id so the builtin id() is not shadowed).
    product_id = GT_1_withid['id'].iloc[i]

    # Stack row i of both merged tables and drop 'id' so the width matches the embedding.
    train_tmp = pd.concat(
        [GT_1_withid.iloc[[i]], GT_2_withid.iloc[[i]]],
        ignore_index=True,
    ).drop(columns='id')
    train_tmp.columns = train_col

    # Look up the embedding row(s) that belong to this product.
    embedding = embedding_features[prod_embedding['id'] == product_id]
    if embedding.shape[0] != 1:
        ids_without_single_embedding.append(product_id)

    # Append the embedding below the two GT rows.
    train_pd = pd.concat([train_tmp, embedding], ignore_index=True)

    # save
    output_path = train_root_path + train_file_head + str(i) + train_file_tail
    train_pd.to_csv(output_path, header=True, index=False, encoding="utf-8")

# Report lookups that would otherwise silently produce files with an unexpected
# number of rows; the files themselves are still written as before.
if ids_without_single_embedding:
    print(
        "Warning: ids without exactly one embedding row:",
        sorted(set(ids_without_single_embedding)),
    )

print("Done")