In [1]:
import pandas as pd
import numpy as np
import os
import time


In [2]:
# Root directory that holds all data files.
# Defaults to the original location; set the WECHAT_DATA_ROOT environment
# variable to run the notebook on another machine without editing this cell.
ROOT_PATH = os.environ.get("WECHAT_DATA_ROOT") or "/testcbd017_gujinfang/GJFCode/WeChat_2021/Code/data"
# Directory of the competition dataset
DATASET_PATH = os.path.join(ROOT_PATH, "wechat_algo_data1")
# Training-set files
USER_ACTION = os.path.join(DATASET_PATH, "user_action.csv")
FEED_INFO = os.path.join(DATASET_PATH, "feed_info.csv")
FEED_EMBEDDINGS = os.path.join(DATASET_PATH, "feed_embeddings.csv")

In [3]:
# Test-set file
TEST_FILE = os.path.join(DATASET_PATH, "test_a.csv")
# NOTE(review): END_DAY is not referenced in the visible cells; presumably the
# final day index (STAGE_END_DAY["submit"] is also 15) — TODO confirm.
END_DAY = 15
# Random seed; generate_sample passes it to DataFrame.sample when down-sampling negatives
SEED = 1997

# Behaviours to predict in the preliminary round
ACTION_LIST = [
    "read_comment",
    "like",
    "click_avatar",
    "forward",
]
# Alternative subsets (disabled):
# ACTION_LIST = ["read_comment", "like"]
# ACTION_LIST = ["click_avatar",  "forward"]
# Behaviours to predict in the semi-final round (disabled):
# ACTION_LIST = ["read_comment", "like", "click_avatar",  "forward", "comment", "follow", "favorite"]

# Behaviour columns used when building features
FEA_COLUMN_LIST = [
    "read_comment",
    "like",
    "click_avatar",
    "forward",
    "comment",
    "follow",
    "favorite",
]

# Negative down-sampling ratio per behaviour (negatives : positives)
ACTION_SAMPLE_RATE = {
    "read_comment": 15,
    "like": 15,
    "click_avatar": 10,
    "forward": 10,
    "comment": 10,
    "follow": 10,
    "favorite": 10,
}

# Last day of the dataset used by each stage
STAGE_END_DAY = {
    "online_train": 14,
    "offline_train": 12,
    "evaluate": 13,
    "submit": 15,
}

# Number of days of history used to build training data for each behaviour
ACTION_DAY_NUM = {
    "read_comment": 14,
    "like": 14,
    "click_avatar": 14,
    "forward": 14,
    "comment": 14,
    "follow": 14,
    "favorite": 14,
}

In [4]:
# Sample the interaction history
def generate_sample(day=14,stage="offline_train"):
    """
    Down-sample negatives and build one sample frame (and CSV) per action.

    Reads USER_ACTION, drops duplicated (userid, feedid, action) rows for every
    action in ACTION_LIST, then for each action:
      * keeps rows with ``day - ACTION_DAY_NUM[action] + 1 <= date_ <= day``;
      * keeps every row where the action column equals 1;
      * randomly draws at most ``ACTION_SAMPLE_RATE[action]`` negatives per
        positive (without replacement, seeded with SEED);
      * writes the result to
        ``{ROOT_PATH}/generater_data/{action}_{day}_sample.csv``.

    Args:
        day: last ``date_`` value (inclusive) of the history window.
        stage: not used by this function; kept so existing callers keep working.

    Returns:
        list of DataFrames, one per action in ACTION_LIST order, each with the
        columns userid, feedid, date_, device followed by ACTION_LIST.
    """
    history = pd.read_csv(USER_ACTION)

    # Drop repeated (user, feed, action) rows. keep='last' retains the last
    # occurrence in file order (presumably the most recent one - assumes the
    # CSV is ordered by time; TODO confirm).
    for action in ACTION_LIST:
        history = history.drop_duplicates(subset=['userid', 'feedid', action], keep='last')

    # DataFrame.to_csv does not create missing directories, so make sure the
    # output folder exists before the first write.
    os.makedirs('{}/generater_data'.format(ROOT_PATH), exist_ok=True)

    col = ["userid", "feedid", "date_", "device"] + ACTION_LIST
    df_arr = []
    for action in ACTION_LIST:
        # History window for this action: the ACTION_DAY_NUM[action] days ending on `day`.
        first_day = day - ACTION_DAY_NUM[action] + 1
        action_df = history[(history["date_"] <= day) & (history["date_"] >= first_day)]
        df_pos = action_df[action_df[action] == 1]
        df_neg = action_df[action_df[action] == 0]

        # Negative-sampling variants tried earlier and currently disabled:
        #   - excluding negatives that are positive for any other behaviour;
        #   - sampling negatives only from days before `day`;
        #   - keeping the negatives with the longest `stay` instead of sampling;
        #   - additionally appending every negative of `day` itself.
        all_pos_num = len(df_pos)
        # Cap at the number of available negatives so sampling without
        # replacement cannot fail.
        sample_neg_num = min(len(df_neg), all_pos_num * ACTION_SAMPLE_RATE[action])
        df_neg = df_neg.sample(n=sample_neg_num, random_state=SEED, replace=False)
        print('-----------{}-------------'.format(action))
        print('pos num:{};neg num:{}'.format(all_pos_num,sample_neg_num))

        # Sampled negatives first, then all positives.
        df_all = pd.concat([df_neg, df_pos])[col]

        file_path='{}/generater_data/{}_{}_sample.csv'.format(ROOT_PATH,action,day)
        print('Save to: {}'.format(file_path))
        df_all.to_csv(file_path, index=False)
        print(df_all.shape)
        df_arr.append(df_all)
    return df_arr

In [4]:
# Reload the sampled history frames from the CSV files written earlier
def get_generate_sample(day):
    """
    Read back the per-action sample CSVs for `day` and print their statistics.

    For every action in ACTION_LIST the file
    ``{ROOT_PATH}/generater_data/{action}_{day}_sample.csv`` is loaded; its
    shape, overall positive/negative counts and the counts restricted to
    ``date_ == day`` are printed.

    Args:
        day: the day used in the sample file names.

    Returns:
        list of DataFrames in ACTION_LIST order.
    """
    frames = []
    for action in ACTION_LIST:
        csv_path = '{}/generater_data/{}_{}_sample.csv'.format(ROOT_PATH,action,day)
        frame = pd.read_csv(csv_path)

        is_neg = frame[action] == 0
        on_day = frame['date_'] == day

        print('--------{}---------'.format(action))
        print(frame.shape)
        print('pos num:{};neg num:{}'.format(int((frame[action] == 1).sum()), int(is_neg.sum())))

        day_total_num = int(on_day.sum())
        day_neg_num = int((on_day & is_neg).sum())
        # Positives on `day` are derived as "everything that is not a negative".
        day_pos_num = day_total_num - day_neg_num
        print('day pos num:{};neg num:{},day total num:{}'.format(day_pos_num,day_neg_num,day_total_num))

        frames.append(frame)
    return frames

In [6]:
# Join user and feed features onto the sampled history
def sample_concat(sample_arr,day):
    """
    Attach user / feed features to each sampled frame and save it as CSV.

    Loads ``user_info.csv``, ``user_feature_sum_avg.csv`` and
    ``feed_feature.csv`` from DATASET_PATH, left-joins them onto every frame in
    `sample_arr` (feed by feedid, user info by userid, user statistics by
    (userid, date_)) and writes the selected feature columns to
    ``{ROOT_PATH}/generater_data/{action}_{day}_concat_sample.csv``.

    Args:
        sample_arr: sequence of sample DataFrames; the i-th frame is treated as
            belonging to ``ACTION_LIST[i]``.
        day: only used to build the output file name.

    Returns:
        None. The result is written to disk.
    """
    # Basic user attributes, indexed by userid
    user_info = pd.read_csv(DATASET_PATH+'/user_info.csv').set_index('userid')
    # Per-(user, day) statistics; keep the last row per key and use the pair as
    # index so it can be joined on both columns.
    user_stats = pd.read_csv(DATASET_PATH+'/user_feature_sum_avg.csv')
    user_stats = user_stats.drop_duplicates(subset=['userid','date_'], keep='last')
    user_stats = user_stats.set_index(['userid','date_'])
    # Feed (video) features, indexed by feedid
    feed_info = pd.read_csv(DATASET_PATH+'/feed_feature.csv').set_index('feedid')

    # Column groups that are identical for every action.
    base_cols = ["userid", "feedid", "device", "authorid", "bgm_song_id", "bgm_singer_id",
                 'watch_count_group','video_time_group','feed_cluter',
                 "videoplayseconds","watch_count","play_times",'date_','des_words','ocr_words','asr_words',
                 'manual_tag','machine_tag','manual_keywords','machine_keywords','feed_emb_id']
    group_cols = ['user_'+b+'_sum_group' for b in FEA_COLUMN_LIST] \
        + ['user_'+b+'_mean_group' for b in FEA_COLUMN_LIST]
    stat_cols = [b+"_sum" for b in FEA_COLUMN_LIST] + [b+"_mean" for b in FEA_COLUMN_LIST]
    id_cols = ["authorid", "bgm_song_id", "bgm_singer_id",'watch_count_group','video_time_group']
    fill_cols = ["authorid", "bgm_song_id", "bgm_singer_id", "videoplayseconds",
                 'watch_count_group','video_time_group']

    for position, history in enumerate(sample_arr):
        action = ACTION_LIST[position]
        print(action)

        merged = history.join(feed_info, on="feedid", how="left", rsuffix="_feed")
        merged = merged.join(user_info, on=["userid"], how="left", rsuffix="_user_id")
        merged = merged.join(user_stats, on=["userid", "date_"], how="left", rsuffix="_user_static")

        # Rows without matching user statistics get 0.0.
        merged[stat_cols] = merged[stat_cols].fillna(0.0)

        # Shift the id-like columns by one so that 0 is free to mark "unknown"
        # (rows that found no match in the left join), then fill the gaps.
        merged[id_cols] = merged[id_cols] + 1
        merged[fill_cols] = merged[fill_cols].fillna(0)

        # Non-linear transforms (log / square / exp) of the dense columns are
        # currently disabled.

        # The joins may have turned the id columns into floats; cast back to int.
        merged[id_cols] = merged[id_cols].astype(int)

        # Only the label of the current action is kept, because the saved
        # samples may not contain the other action columns.
        features = base_cols + group_cols + stat_cols + [action]

        file_path='{}/generater_data/{}_{}_concat_sample.csv'.format(ROOT_PATH,action,day)
        print('Save to: {}'.format(file_path))
        merged[features].to_csv(file_path, index=False)

#### TODO: apply non-linear transforms to the numeric features during concat

In [8]:
# 把test数据 拼接上 u i特征
def test_concat(df_test):
    # 用户基本特征
    df_users=pd.read_csv(DATASET_PATH+'/user_info.csv')
    df_users = df_users.set_index('userid')
    # 用户统计特征
    df_users_static=pd.read_csv(DATASET_PATH+'/user_feature_sum_avg.csv')
    df_users_static=df_users_static.drop_duplicates(subset=['userid','date_'], keep='last')
    # test的时候直接使用14天的统计数据
    df_users_static=df_users_static[df_users_static['date_']==14]
    df_users_static=df_users_static.set_index('userid')
    
    # 视频特征
    df_feed=pd.read_csv(DATASET_PATH+'/feed_feature.csv')
    df_feed = df_feed.set_index('feedid')
    
    features = ["userid", "feedid", "device", "authorid", "bgm_song_id", "bgm_singer_id",\
                'watch_count_group','video_time_group','feed_cluter',\
                "videoplayseconds","watch_count","play_times",'des_words','ocr_words','asr_words',\
                'manual_tag','machine_tag','manual_keywords','machine_keywords','feed_emb_id']
    
    features=features+['user_'+b+'_sum_group' for b in FEA_COLUMN_LIST]+['user_'+b+'_mean_group' for b in FEA_COLUMN_LIST]

    sample=df_test
    sample = sample.join(df_feed, on="feedid", how="left", rsuffix="_feed")
    sample = sample.join(df_users, on="userid", how="left", rsuffix="_user_id")
    sample = sample.join(df_users_static, on="userid", how="left", rsuffix="_user_static")

    # 把各种统计信息更新到features中
    user_feature_col = [b+"_sum" for b in FEA_COLUMN_LIST]+[b+"_mean" for b in FEA_COLUMN_LIST]
    # test中可能有冷启动 所以必须填充空值
    sample[user_feature_col] = sample[user_feature_col].fillna(0.0)

    features += user_feature_col
#     features += ACTION_LIST #test里没有ACTION_LIST了

    # id=0 填充未知分类数据和离散数据
    sample[["authorid", "bgm_song_id", "bgm_singer_id",'watch_count_group','video_time_group']] += 1  
    sample[["authorid", "bgm_song_id", "bgm_singer_id", "videoplayseconds",'watch_count_group','video_time_group']] = \
        sample[["authorid", "bgm_song_id", "bgm_singer_id", "videoplayseconds",\
                'watch_count_group','video_time_group']].fillna(0)

    # 连续型数据进行规范化尺度
#     sample["videoplayseconds"] = np.log(sample["videoplayseconds"] + 1.0)
# #         sample["watch_count"] = np.log(sample["watch_count"] + 1.0)

    dense_cols=['videoplayseconds','watch_count']+user_feature_col
#     for c in dense_cols:
#         # 先log再增加非线性
#         sample[c] = np.log(sample[c] + 1.0)
#         # log
#         sample['{}_log'.format(c)]=np.log(sample[c] + 1.0)
#         # ^2
#         sample['{}_square'.format(c)]=np.square(sample[c])
#         # e
#         sample['{}_exp'.format(c)]=np.exp(sample[c])

#     features += [b+'_log' for b in dense_cols]
#     features += [b+'_square' for b in dense_cols]
#     features += [b+'_exp' for b in dense_cols]

    # 把分类数据id转化成int格式
    sample[["authorid", "bgm_song_id", "bgm_singer_id",'watch_count_group','video_time_group']] = \
        sample[["authorid", "bgm_song_id", "bgm_singer_id",'watch_count_group','video_time_group']].astype(int)

    file_path='{}/test_a_concat.csv'.format(DATASET_PATH)
    print('Save to: {}'.format(file_path))
    sample[features].to_csv(file_path, index=False)

In [5]:
# Step 1: down-sample the interaction history (window ending on day 14) and time the run
t0=time.time()
df_arr=generate_sample(14)
t1=time.time()
print('generate sample history data cost time {:.2f}s'.format(t1-t0))

-----------read_comment-------------
pos num:249710;neg num:3745650
Save to: /testcbd017_gujinfang/GJFCode/WeChat_2021/Code/data/generater_data/read_comment_14_sample.csv
(3995360, 8)
-----------like-------------
pos num:183556;neg num:2753340
Save to: /testcbd017_gujinfang/GJFCode/WeChat_2021/Code/data/generater_data/like_14_sample.csv
(2936896, 8)
-----------click_avatar-------------
pos num:52876;neg num:528760
Save to: /testcbd017_gujinfang/GJFCode/WeChat_2021/Code/data/generater_data/click_avatar_14_sample.csv
(581636, 8)
-----------forward-------------
pos num:27161;neg num:271610
Save to: /testcbd017_gujinfang/GJFCode/WeChat_2021/Code/data/generater_data/forward_14_sample.csv
(298771, 8)
generate sample history data cost time 24.78s


In [5]:
# Step 1 (alternative): instead of re-sampling, reload the sampled frames
# from the CSV files that generate_sample wrote for day 14
df_arr=get_generate_sample(14)

--------read_comment---------
(3995360, 8)
pos num:249710;neg num:3745650
day pos num:20597;neg num:317305,day total num:337902
--------like---------
(2936896, 8)
pos num:183556;neg num:2753340
day pos num:14614;neg num:233385,day total num:247999
--------click_avatar---------
(581636, 8)
pos num:52876;neg num:528760
day pos num:4540;neg num:44722,day total num:49262
--------forward---------
(298771, 8)
pos num:27161;neg num:271610
day pos num:2061;neg num:23138,day total num:25199


In [13]:
# Step 2: join user / feed features onto the sampled frames and time the run
t0=time.time()
sample_concat(df_arr,14)
t1=time.time()
print('concat sample history and user/feed feature cost time {:.2f}s'.format(t1-t0))

read_comment
Save to: /testcbd017_gujinfang/GJFCode/WeChat_2021/Code/data/generater_data/read_comment_14_concat_sample.csv
like
Save to: /testcbd017_gujinfang/GJFCode/WeChat_2021/Code/data/generater_data/like_14_concat_sample.csv
click_avatar
Save to: /testcbd017_gujinfang/GJFCode/WeChat_2021/Code/data/generater_data/click_avatar_14_concat_sample.csv
forward
Save to: /testcbd017_gujinfang/GJFCode/WeChat_2021/Code/data/generater_data/forward_14_concat_sample.csv
concat sample history and user/feed feature cost time 730.38s


In [12]:
# Step 3: the test data needs the same feature join (extra feature columns were added to the samples)
df_test=pd.read_csv(TEST_FILE)
t0=time.time()
test_concat(df_test)
t1=time.time()
print('concat test history and user/feed feature cost time {:.2f}s'.format(t1-t0))

Save to: /testcbd017_gujinfang/GJFCode/WeChat_2021/Code/data/wechat_algo_data1/test_a_concat.csv
concat test history and user/feed feature cost time 34.44s


In [None]:
#### TODO: decide whether all id-type columns in train/test need a LabelEncoder
#### TODO: if they do, decide whether to encode here or right before train_model