In [1]:
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
import gc, os
import pickle
import warnings
import multiprocessing as mp
import lightgbm as lgb
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt 

# 节省内存的一个函数
# 减少内存
def reduce_mem(df):
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if pd.isnull(c_min) or pd.isnull(c_max):
                continue
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,100*(start_mem-end_mem)/start_mem,(time.time()-starttime)/60))
    return df

In [2]:
data_path = './data_v2/'
save_path = './5000_sample/'
pd.set_option('display.max_columns', None)

# 预处理

In [3]:
vid_info = pd.read_csv(data_path + 'vid_info.csv')
# 榜A
# candidate_items = pd.read_csv(data_path + 'candidate_items_A.csv')
# 榜 B 更新
candidate_items = pd.read_csv(data_path + 'candidate_items_B.csv')
seq_train = pd.read_csv(data_path + 'main_vv_seq_train.csv')
# 榜 B 
seq_train_A = pd.read_csv(data_path + 'main_vv_seq_train_A.csv')
seq_train = pd.concat((seq_train,seq_train_A))
seq_train = seq_train.sort_values(by=['did','seq_no'],ascending=False)
del seq_train_A

vid_info = reduce_mem(vid_info)
candidate_items = reduce_mem(candidate_items)
seq_train = reduce_mem(seq_train)

# 视频库标签编码 , 这里的 训练和测试都一样
data_did_lb = LabelEncoder()
vid_info_lb = LabelEncoder()
cid_info_lb = LabelEncoder()

seq_train['did'] = data_did_lb.fit_transform(seq_train[['did']])

vid_info['vid'] = vid_info_lb.fit_transform(vid_info[['vid']])
vid_info['cid'] = cid_info_lb.fit_transform(vid_info[['cid']])

seq_train['vid'] = vid_info_lb.transform(seq_train[['vid']])
candidate_items['vid'] = vid_info_lb.transform(candidate_items[['vid']])

vid_info['stars'] = vid_info['stars'].apply(eval)
vid_info['tags'] = vid_info['tags'].apply(eval)
vid_info['key_word'] = vid_info['key_word'].apply(eval)

vid_info['stars'] = vid_info['stars'].apply(set)
vid_info['tags'] = vid_info['tags'].apply(set)
vid_info['key_word'] = vid_info['key_word'].apply(set)

# 获取当前数据的历史点击和最后一次点击
def get_test_train(train_):
    
    train_.sort_values(by=['did','seq_no'],inplace=True,ascending=False)
    train_['site'] =  train_.groupby('did').cumcount()+1
    
    local_final_log = train_[train_['site'] == 1].reset_index(drop=True)
    train_d = train_[train_['site'] != 1].reset_index(drop=True)

    del local_final_log['site']
    del train_d['site']

    return local_final_log,train_d

# 划分数据集
local_final_log,train_data = get_test_train(seq_train.copy()) 
all_data = seq_train 

del seq_train
print(all_data.shape,train_data.shape)

-- Mem. usage decreased to  7.56 Mb (44.8% reduction),time spend:0.00 min
-- Mem. usage decreased to  0.09 Mb (0.0% reduction),time spend:0.00 min
-- Mem. usage decreased to 203.51 Mb (48.6% reduction),time spend:0.01 min
(5767322, 8) (5596413, 8)


## 工程第一个步骤 简单的召回

In [4]:
def recall_data(data,vid_info_,candidate_items_,log):
    
##################################################################################################################
    data = data.merge(vid_info_[['vid','cid']],on='vid',how='left')
    # 用户历史日志中观看的视频，包含了合集的信息，以及需要追的剧
    tmp = data.groupby(['did','cid']).size().sort_values(ascending=False).reset_index()
    tmp.columns = ['did','cid','cid_n']
    data = data.merge(tmp,on=['did','cid'],how='left')
    del tmp 
##################################################################################################################
    print("———————————————————————— 候选视频序列字典 ——————————————————————")
    vid_info_cand = vid_info_[vid_info_.vid.isin(candidate_items_.vid.unique())]
    vid_info_cand['vid_pop'] = vid_info_cand['vid'].map(data.vid.value_counts())
    vid_info_cand.rename(columns={'vid':'candi_vid'},inplace=True)
    vid_info_cand = vid_info_cand.fillna(0)

    print("_______________________ 召回序列行为字典**最重要的** _______________________")
    ################### 规则一号 ####################
    data['next_vid'] = data.groupby(['did']).vid.shift(1)
    vid_habit = data.groupby(['vid','next_vid']).size().sort_values(ascending=False).reset_index()
    vid_habit = vid_habit[vid_habit.next_vid.isin(candidate_items_.vid.unique())]
    vid_habit.sort_values(by=['vid','next_vid',0],ascending=False)
    vid_habit.rename(columns={0:'next_score','next_vid':'candi_vid'},inplace=True)
    # vid_habit = vid_habit.merge(vid_info[['vid','cid']],on='vid',how='left')
    # 规则字典
    vid_action = dict()
    for vid,cand in tqdm(vid_habit.groupby('vid')):
        vid_action[vid] = cand['candi_vid']

    # 热度视频
    hot_200 = vid_info_cand.sort_values(by=['vid_pop'],ascending=False).head(200).reset_index(drop=True)['candi_vid']
##################################################################################################################
    ## 召回的字典格式转化为 pd 
    def user_cid_recall_df_func(recall_df,did_df):
        print(" —————————————————————————— 召回字典转换为 pd ——————————————————————————")
        user_did = did_df.did.unique()
        user_cid_recall_list = []
        for i in tqdm(range(len(user_did))):
            did = user_did[i]
            recall_ = recall_df[did]
            for j in range(len(recall_)):
                user_cid_recall_list.append((did,recall_.iloc[j]))
        user_cid_recall_df = pd.DataFrame(user_cid_recall_list,columns=['did','candi_vid'])
        return user_cid_recall_df

    ## 检查训练数据的合集召回的候选集命中率
    def user_cid_recall_hit_score_func(recall_dict,valid):
        print("——————————————————————————— 召回集击中率模块 ————————————————————————————")
        hit_total = 0
        sample_n = 0
        total = valid.shape[0]
        for i in tqdm(range(len(valid))):
            local_valid_vid = valid.iloc[i].vid
            local_valid_did = valid.iloc[i].did  
            candi_vid = recall_dict[local_valid_did]
            if local_valid_vid in candi_vid.values: 
                hit_total += 1
        print(hit_total / total)
##################################################################################################################
    # 而候选视频在 日志文件的下一个统计里面时最好的，不需要在频繁筛选
    def rr(his):
        recall = pd.Series()
        # 策略 1、用户前一条视频推荐 ，保证序列性, 但对新的视频可能不是很好
        last = his.iloc[0] 
        last_vid = last.vid 
        last_fpn = last.fpn
        last_did = last.did
        if last_vid in vid_action: 
            recall = recall.append(vid_action[last_vid][~vid_action[last_vid].isin(his['vid'].unique())].head(300)) # 100:0.820 

        # 策略 2、历史更新视频召回，保证序列有效性，这里cid_count 需要做一个巧妙的计算，这是必须的，因为很多在日志文件里面没有找到, 还给你看这个，我全部给你召了，最多不就是 157 吗，给你办了,？？？ 添加一个 cid - candi
        user_h = his[his['cid_n'] >= 2]
        user_h = user_h.groupby('cid').head(1)
        for i in range(len(user_h)):
            vid_ = user_h.iloc[i].vid 
            if vid_ in vid_action:
                vid_A = vid_action[vid_]
                recall = recall.append(vid_A[~vid_A.isin(his['vid'].unique())].head(10))

        if len(recall) < 50:
            recall = recall.append(hot_200[~hot_200.isin(his['vid'].unique())].head(50 - len(recall)))

        recall = recall.drop_duplicates()
        return recall
##################################################################################################################
    def user_cid_recall_dict_func(user_hist_watch,vid_info_cand):
        print("—————————————————————————— 召回模块 ————————————————————————")
        user_cid_recall = dict()
        user_hist_watch = user_hist_watch.sort_values(by=['did','seq_no'],ascending=False)
        for did, hist in tqdm(user_hist_watch.groupby('did')):
            user_cid_recall[did] = rr(hist)
        return user_cid_recall

    gc.collect()
    # # 本地数据召回
    data_recall = user_cid_recall_dict_func(data,vid_info_cand)
    # # 查看击中率
    user_cid_recall_hit_score_func(data_recall,local_final_log)
    # 转化为 pd
    data_recall_df = user_cid_recall_df_func(data_recall,data)
    print(data_recall_df.shape)


    data_recall_df = reduce_mem(data_recall_df)
    gc.collect()
    print("mo sir, !内劳人科滴票准共乘浣乘辣!")
    return data_recall_df

In [23]:
# 本地数据模型
train_recall = recall_data(train_data.copy(),vid_info,candidate_items,local_final_log)
# 线上需要提交的召回
# test_recall = recall_data(all_data.copy(),vid_info,candidate_items,local_final_log)

———————————————————————— 候选视频序列字典 ——————————————————————
_______________________ 召回序列行为字典**最重要的** _______________________


100%|██████████| 62363/62363 [00:03<00:00, 19554.40it/s]


—————————————————————————— 召回模块 ————————————————————————


100%|██████████| 170909/170909 [09:25<00:00, 302.23it/s]


——————————————————————————— 召回集击中率模块 ————————————————————————————


100%|██████████| 170909/170909 [00:24<00:00, 6959.24it/s]


0.8579185414460326
 —————————————————————————— 召回字典转换为 pd ——————————————————————————


100%|██████████| 170909/170909 [02:53<00:00, 985.70it/s] 


(42885803, 2)
-- Mem. usage decreased to 327.19 Mb (33.3% reduction),time spend:0.01 min
mo sir, !内劳人科滴票准共乘浣乘辣!


In [24]:
train_recall.to_csv(save_path + 'train_recall.csv',index=False)
# train_recall = pd.read_csv(save_path + 'train_recall.csv')
# print(train_recall.shape)

## 本地的召回的数据集分为 训练 验证 测试，但是本题 是 按照用户来划分的

In [25]:
import random 
def user_train_valid(tr,va,te): 
    # 召回的数据 
    all_num = set(train_recall.did.unique()) 
    # 训练数据 
    num1 = random.sample(all_num,tr) 
    # 验证数据 
    all_num = all_num.difference(num1) 
    num2 = random.sample(all_num,va) 
    # 测试数据 
    all_num = all_num.difference(num2) 
    num3 = random.sample(all_num,te)

    print(len(num1))
    print(len(num2))
    print(len(num3))
    return num1,num2,num3

num1, num2, num3 = user_train_valid(110000,30000,30909)
# 用于训练的数据
trn_ = train_recall[train_recall['did'].isin(num1)]
trn_.sort_values(by=['did'],ascending=False,inplace=True) 

# 用于验证的数据
val_ = train_recall[train_recall['did'].isin(num2)]
val_.sort_values(by=['did'],ascending=False,inplace=True) 

# # 用于测试的数据
tst_ = train_recall[train_recall['did'].isin(num3)] 
tst_.sort_values(by=['did'],ascending=False,inplace=True) 

del recall_data
print(tst_.shape)

110000
30000
30909
(7758800, 2)


# 负采样函数

In [26]:

def neg_and_label(data_recall_,final_log,test=False):
    print("———————————————————————————— 转化为监督学习 ————————————————————————————")

    def get_rank_label_df(recall_df, label_df):
        local_ = label_df.rename(columns={'vid': 'candi_vid'},inplace=False)
        recall_df_ = recall_df.merge(local_[['did','candi_vid','seq_no']], how='left', on=['did','candi_vid'])
        recall_df_['label'] = recall_df_['seq_no'].apply(lambda x: 0 if np.isnan(x) else 1)
        del recall_df_['seq_no']
        return recall_df_
    # 给用户数据贴上标签，转换成监督学习的方式，但召回的数据都服从用户下一视频的观看，这样不行，需要负采样
    print("贴上标签")
    print("召回数据的大小: ",data_recall_.shape)
    data_label_df = get_rank_label_df(data_recall_,final_log)

    # 负采样函数，这里可以控制负采样时的比例, 这里给了一个默认的值
    def neg_sample_recall_data(recall_data_df, sample_rate=0.1):
        pos_data = recall_data_df[recall_data_df['label'] == 1]
        neg_data = recall_data_df[recall_data_df['label'] == 0]

        print('pos_data_num:', len(pos_data), 'neg_data_num:', len(neg_data), 'pos/neg:', len(pos_data)/len(neg_data))

        def neg_sample_func(group_df):
            neg_num = len(group_df)
            sample_num = max(int(neg_num * sample_rate), 50) 
            sample_num = min(sample_num, 50) # 保证最多不超过5个，这里可以根据实际情况进行选择
            return group_df.sample(n=sample_num, replace=True)

        neg_data_did_sample = neg_data.groupby('did',group_keys=False).apply(neg_sample_func)

        neg_data_vid_sample = neg_data.groupby('candi_vid', group_keys=False).apply(neg_sample_func)

        neg_data_new = neg_data_did_sample.append(neg_data_vid_sample)
        # neg_data_new = neg_data_new.sort_values(['did','candi_vid']).drop_duplicates(['did','candi_vid'], keep='last')
        neg_data_new = neg_data_new.drop_duplicates(['did','candi_vid'], keep='last')

        # 将正样本数据合并
        data_new = pd.concat([pos_data, neg_data_new], ignore_index=True)

        return data_new
        
    if test == True:
        return data_label_df 

    print("对负标签下采样")
    data_label_df = neg_sample_recall_data(data_label_df)
    
    print("负采样后的数据大小：",data_label_df.shape)
    data_label_df['candi_vid'] = data_label_df['candi_vid'].apply(int)

    return data_label_df

In [27]:
# 训练数据的负采样
trn_ = neg_and_label(trn_,local_final_log)
# 验证数据的负采样
val_ = neg_and_label(val_,local_final_log)
# 测试集不需要负采样
tst_ = neg_and_label(tst_,local_final_log,True)

———————————————————————————— 转化为监督学习 ————————————————————————————
贴上标签
召回数据的大小:  (27605835, 2)
对负标签下采样
pos_data_num: 94396 neg_data_num: 27511439 pos/neg: 0.0034311545826446956
负采样后的数据大小： (5208109, 3)
———————————————————————————— 转化为监督学习 ————————————————————————————
贴上标签
召回数据的大小:  (7521168, 2)
对负标签下采样
pos_data_num: 25747 neg_data_num: 7495421 pos/neg: 0.003435030533975343
负采样后的数据大小： (1572523, 3)
———————————————————————————— 转化为监督学习 ————————————————————————————
贴上标签
召回数据的大小:  (7758800, 2)


In [28]:
trn_['data'] = 1
val_['data'] = 2
tst_['data'] = 3 
local_recall = pd.concat([trn_ ,val_ ,tst_])
del trn_ 
del val_ 
del tst_
print(local_recall.shape)

(14539432, 4)


In [6]:
# local_recall = reduce_mem(local_recall)
# local_recall.to_csv(save_path + 'local_recall.csv',index=False)
local_recall = pd.read_csv(save_path + 'local_recall.csv')

In [20]:
print(local_recall.shape)

(14539432, 4)


## 工程的第二个步骤：特征工程

In [7]:
print("———————————————————— 一阶用户特征抽取 ——————————————————————")
# 一、数值类型特征
def value_feature(data_,data_recall_,vid_info_):
    # 在用户日志中的信息
    data_ = data_.sort_values(by=['did','seq_no'],ascending=False)
    data_recall_ = data_recall_.sort_values(by=['did'],ascending=False)
    data_ = data_.merge(vid_info_,on='vid',how='left')
    # 本题是 芒果TV 过去两周的数据 
    data_['n_time'] = data_.groupby('did',group_keys=False)['time_gap'].apply(np.cumsum)
    data_['wr'] = data_['vts'] / data_['duration']
    data_['fr'] = data_['hb'] / data_['duration']
    data_['did_count'] = data_['did'].map(data_.did.value_counts())
    data_['next_vid'] = data_.groupby('did').vid.shift(1)
    data_['nn_vid'] = data_.groupby('did').vid.shift(2)
    
    # fpn_score : 用户在刷视频的时候统计得到的分数 {pd} ( 这里有两种模式：连续剧、上下联动观看m) , 最后一条视频在 fpn下 得到的分数
    data_fpn = data_[data_['fpn'] != 130] 
    data_fpn['next_vid'] = data_fpn.groupby('did').vid.shift(1) 
    fpn_habit = data_fpn.groupby(['vid','next_vid']).size().sort_values(ascending=False).reset_index() 
    fpn_habit = fpn_habit[fpn_habit.next_vid.isin(candidate_items.vid.unique())] 
    fpn_habit.rename(columns={0:'fpn_score','next_vid':'candi_vid'},inplace=True) 

    # nn_score: 下下视频的统计得分 {pd}
    nnv = data_.groupby(['vid','nn_vid']).size().sort_values(ascending=False).reset_index()
    nnv = nnv[nnv.nn_vid.isin(candidate_items.vid.unique())]
    nnv.rename(columns={0:'nen_score','nn_vid':'candi_vid'},inplace=True)

    # next_score
    nv = data_.groupby(['vid','next_vid']).size().sort_values(ascending=False).reset_index()
    nv = nv[nv.next_vid.isin(candidate_items.vid.unique())]
    nv.rename(columns={0:'next_score','next_vid':'candi_vid'},inplace=True)

    # 视频在过去两周的热度、一周的热度、一天的热度和过去两小时的热度
    data_ = data_.sort_values(by=['did','seq_no'],ascending=False)
    data_7 = data_[data_['n_time'] <= 604800]
    data_2 = data_[data_['n_time'] <= 7200] 

    # 用户最后一次观看的视频日志，连接上下观看视频
    d = data_.groupby('did').head(1) 

    # 合并召回数据和用户日志数据 , 最后一次的观看vid 的 fpn_score, next_score, nen_score 
    data_recall_ = data_recall_.merge(d[['did','vid']],on='did',how='left')
    data_recall_ = data_recall_.merge(fpn_habit,on=['vid','candi_vid'],how='left')
    data_recall_ = data_recall_.merge(nnv,on=['vid','candi_vid'],how='left')
    data_recall_ = data_recall_.merge(nv,on=['vid','candi_vid'],how='left')
    del data_recall_['vid']

    # 用户日志中的视频热度的评分
    data_recall_['vid_pop'] = data_recall_['candi_vid'].map(data_['vid'].value_counts())
    data_recall_['vid_pop_7'] = data_recall_['candi_vid'].map(data_7['vid'].value_counts())
    data_recall_['vid_pop_2'] = data_recall_['candi_vid'].map(data_2['vid'].value_counts())

    # wr_mean , fr_mean
    col = ['vid','wr','fr']
    data_7 = data_7[col].groupby('vid').agg(list).reset_index()
    data_7['wr_mean'] = data_7['wr'].apply(np.mean)
    data_7['fr_mean'] = data_7['fr'].apply(np.mean)

    data_recall_ = data_recall_.merge(data_7[['vid','wr_mean','fr_mean']],left_on='candi_vid',right_on='vid',how='left')
    del data_recall_['vid']
    data_recall_ = data_recall_.fillna(0)
    
    del data_
    del data_7 
    del data_2 
    data_recall_ = reduce_mem(data_recall_)
    gc.collect()

    return data_recall_

———————————————————— 一阶用户特征抽取 ——————————————————————


In [8]:
data_fea_val = value_feature(train_data.copy(),local_recall.copy(),vid_info)
data_fea_val

# 提交数据
# all_fea_val = value_feature(all_data.copy,all_recall.copy(),vid_info)

-- Mem. usage decreased to 499.17 Mb (65.4% reduction),time spend:0.04 min


Unnamed: 0,did,candi_vid,label,data,fpn_score,nen_score,next_score,vid_pop,vid_pop_7,vid_pop_2,wr_mean,fr_mean
0,170908,98102.0,1,1,18.0,593.0,23.0,755,477.0,49.0,0.409424,0.873047
1,170908,93812.0,0,1,0.0,0.0,0.0,17964,12008.0,772.0,0.445312,0.689941
2,170908,76264.0,0,1,0.0,0.0,1.0,284,165.0,21.0,0.154663,0.366455
3,170908,67500.0,0,1,0.0,0.0,1.0,1042,621.0,129.0,0.201782,0.492676
4,170908,61876.0,0,1,1.0,1.0,1.0,9404,8808.0,511.0,0.854004,0.416016
...,...,...,...,...,...,...,...,...,...,...,...,...
14539427,0,31359.0,0,1,28.0,19.0,23.0,14722,10816.0,1465.0,0.479248,0.856445
14539428,0,63964.0,0,1,6.0,11.0,11.0,676,374.0,33.0,0.431641,0.510254
14539429,0,22143.0,0,1,4.0,40.0,10.0,5418,2796.0,421.0,0.629883,0.740723
14539430,0,55044.0,0,1,13.0,30.0,12.0,9828,4156.0,478.0,0.470459,0.746582


In [12]:
data_fea_val.to_csv(save_path + 'data_fea_val.csv',index=False)

In [9]:
def stat_feature(data_,data_recall_,vid_info_):
    data_ = data_.merge(vid_info_,on='vid',how='left')
    data_['vid_pop'] = data_['vid'].map(data_.vid.value_counts())
    data_['wr'] = data_['vts'] / data_['duration']
    data_['fr'] = data_['hb'] / data_['duration']

    user_habit = data_.groupby('did').head(1)[['did','wr','fr']]

    data_ = data_.sort_values(by=['did','seq_no'],ascending=False)
    data_['next_vid'] = data_.groupby('did').vid.shift(-1)
    data_['next_wr'] = data_.groupby('did').wr.shift(-1)
    data_['next_fr'] = data_.groupby('did').fr.shift(-1)
    data_['next_vts'] = data_.groupby('did').vts.shift(-1)
    vid_info_n = vid_info_.copy()
    # 下一序列的视频信息做特征
    vid_info_n.rename(columns={'vid':'next_vid','duration':'n_duration','title_length':'n_title_length'},inplace=True)

    data_ = data_.merge(vid_info_n,on='next_vid',how='left')

    data_ = data_.sort_values(by=['did','seq_no'],ascending=False)
    data_seq = data_.groupby('did').head(1).copy()
    # data_seq = data_seq.groupby('did').tail(1)

    # 序列上的差异特征
    data_seq['time_diff'] = abs((data_seq['time_gap'].values - data_seq['next_vts'].values))
    data_seq['wr_favor'] = (data_seq['next_wr'].values + data_seq['wr'].values) / 2
    data_seq['fr_favor'] = (data_seq['next_fr'].values + data_seq['fr'].values) / 2

    # 保存用户在序列上的特征
    user_habit = user_habit.merge(data_seq[['did','time_diff','wr_favor','fr_favor']],on='did',how='left')                                   
    del data_seq
    del vid_info_n 

    # 所有用户日志做成一个列表
    data_ = data_.sort_values(by=['did','seq_no'],ascending=False)
    data_ = data_.groupby('did').head(6)

    cols = ['did','duration','title_length','vid_pop']
    data_ = data_[cols]
    data_ = data_.groupby('did').agg(list).reset_index()
    data_['dura_mean'] = data_['duration'].apply(np.mean)
    data_['dura_max'] = data_['duration'].apply(np.max)
    data_['dura_min'] = data_['duration'].apply(np.min)
    data_['dura_last'] = data_['duration'].apply(lambda x: x[0])

    data_['vp_mean'] = data_['vid_pop'].apply(np.mean)
    data_['vp_max'] = data_['vid_pop'].apply(np.max)
    data_['vp_min'] = data_['vid_pop'].apply(np.min)
    data_['vp_last'] = data_['vid_pop'].apply(lambda x: x[0])

    data_['til_mean'] = data_['title_length'].apply(np.mean)
    data_['til_last'] = data_['title_length'].apply(lambda x: x[0])
    data_ = data_[['did','dura_mean','dura_max','dura_min','vp_mean','vp_max','vp_min','til_mean','dura_last','vp_last','til_last']]

    user_habit = user_habit.merge(data_,on='did',how='left') 
    user_habit = reduce_mem(user_habit)
    del data_ 
    gc.collect()
    return user_habit

In [10]:
data_fea_sta = stat_feature(train_data.copy(),local_recall.copy(),vid_info)
data_fea_sta
# 提交数据
# all_fea_sta = stat_feature(all_data,all_recall,vid_info)

-- Mem. usage decreased to  8.15 Mb (47.9% reduction),time spend:0.00 min


Unnamed: 0,did,wr,fr,time_diff,wr_favor,fr_favor,dura_mean,dura_max,dura_min,vp_mean,vp_max,vp_min,til_mean,dura_last,vp_last,til_last
0,170908,0.582520,0.782715,8029.0,0.367432,0.870605,2258.0,2543,1674,765.5,809,746,24.00000,1674,756,24
1,170907,0.292236,0.930176,2222.0,0.581055,0.928711,1859.0,4799,91,10904.0,34167,33,43.65625,2550,10238,21
2,170906,1.033203,0.983398,17.0,0.612793,0.975098,2264.0,4194,60,12232.0,19300,4533,39.50000,60,9678,86
3,170905,0.025452,0.236694,7.0,0.014290,0.118896,4328.0,5837,1834,9728.0,26950,354,76.18750,4832,26950,105
4,170904,0.019073,0.008904,76868.0,0.084900,0.031860,2248.0,4739,60,12024.0,24920,3132,78.81250,1573,6038,121
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170904,4,0.749512,0.904297,177.0,0.506348,0.937012,4616.0,6495,137,13752.0,33496,64,76.31250,6495,25376,97
170905,3,0.617676,0.970215,3.0,0.315918,0.488770,3696.0,5861,787,5436.0,26249,46,102.50000,5241,26249,90
170906,2,0.745117,0.939453,0.0,0.374023,0.969238,4072.0,8240,2134,1355.0,4219,30,36.84375,2516,430,30
170907,1,0.399414,1.000000,5.0,0.370605,0.971191,2148.0,2453,1252,1543.0,2008,1119,38.15625,1252,1119,64


In [17]:
data_fea_sta = reduce_mem(data_fea_sta)
data_fea_sta.to_csv(save_path + 'data_fea_sta.csv',index=False)

-- Mem. usage decreased to  8.15 Mb (0.0% reduction),time spend:0.00 min


In [11]:
# 类别类型特征 
print('———————————————————— 一阶用户类别特征抽取 ————————————————————')
def cate_feature(data_,recall_data_,vid_info):
    data_ = data_.merge(vid_info,on='vid',how='left')
    recall_data_ = recall_data_.merge(vid_info,left_on='candi_vid',right_on='vid',how='left')
    
    cat_cols = ['did','seq_no','cpn','fpn','cid','is_intact','series_id','serialno','classify_id']
    cat_cols_ = ['did','candi_vid','cid','is_intact','series_id','serialno','classify_id']

    data_ = data_[cat_cols]
    recall_data_ = recall_data_[cat_cols_]
    data_ = data_.sort_values(by=['did','seq_no'],ascending=False)
    data_ = data_.groupby('did').agg(list).reset_index()
    data_['cpn'] = data_['cpn'].apply(lambda x: x[0])
    data_['fpn'] = data_['fpn'].apply(lambda x: x[0])
    cid_h = dict()
    isi_h = dict()
    cla_h = dict()
    ser_h = dict()
    for did, hist in tqdm(data_.groupby('did')):
        cid_h[did] = Counter(hist.iloc[0].cid)
        isi_h[did] = Counter(hist.iloc[0].is_intact)
        cla_h[did] = Counter(hist.iloc[0].classify_id)
        ser_h[did] = Counter(hist.iloc[0].series_id)
    print("类型特征计数开始")
    def sim_(df):
        did = df.iloc[0].did
        df['cid_s'] = df['cid'].apply(lambda x: cid_h[did][x])
        df['isi_s'] = df['is_intact'].apply(lambda x: isi_h[did][x])
        df['cla_s'] = df['classify_id'].apply(lambda x: cla_h[did][x])
        df['ser_s'] = df['series_id'].apply(lambda x: ser_h[did][x])
        return df[['cid_s','isi_s','cla_s','ser_s']]

    recall_data_[['cid_s','isi_s','cla_s','ser_s']] = recall_data_.groupby('did',group_keys=False).apply(sim_)
    gc.collect()
    recall_data_ = recall_data_.merge(data_[['did','cpn','fpn']],on='did',how='left')
    recall_data_['cp_fp'] = recall_data_['cpn'].astype('str') + '_' + recall_data_['fpn'].astype('str')
    recall_data_['sn_is'] = recall_data_['serialno'].astype('str') + '_' + recall_data_['is_intact'].astype('str')
    recall_data_['cla_ser'] = recall_data_['classify_id'].astype('str') + '_' + recall_data_['series_id'].astype('str')

    recall_data_ = recall_data_[['did','candi_vid','cp_fp','sn_is','cla_ser','cid_s','isi_s','cla_s','ser_s']]
    recall_data_ = reduce_mem(recall_data_)
    return recall_data_

———————————————————— 一阶用户类别特征抽取 ————————————————————


In [30]:
data_fea_cat = cate_feature(train_data.copy(),local_recall.copy(),vid_info)
data_fea_cat
# 提交数据
# all_fea_cat = cate_feature(all_data.copy(),all_recall.copy(),vid_info)

100%|██████████| 170909/170909 [00:50<00:00, 3412.63it/s]


类型特征计数开始
-- Mem. usage decreased to 665.56 Mb (33.3% reduction),time spend:0.01 min


Unnamed: 0,did,candi_vid,cp_fp,sn_is,cla_ser,cid_s,isi_s,cla_s,ser_s
0,170908,98102.0,1_26,33_1,2_0,30,67,69,69
1,170907,7358.0,1_130,3_1,1_97664,1,79,2,1
2,170904,13785.0,1_68,13_1,2_0,6,6,7,7
3,170903,73120.0,1_68,1_5,2_0,24,0,26,26
4,170902,94570.0,1_0,2_1,2_0,1,26,9,3
...,...,...,...,...,...,...,...,...,...
14539427,1,21778.0,32_139,42_3,1_51680,0,11,2,1
14539428,1,24172.0,32_139,1_1,2_105039,0,47,57,0
14539429,1,25648.0,32_139,37_1,1_51680,0,47,2,1
14539430,1,27955.0,32_139,5_1,2_0,0,47,57,58


In [12]:
# data_fea_cat = reduce_mem(data_fea_cat)
# data_fea_cat.to_csv(save_path + 'data_fea_cat.csv',index=False)
data_fea_cat = pd.read_csv(save_path + 'data_fea_cat.csv')

In [32]:
# 这个要改改 Counter
def lab_feature(data_,recall_data_,vid_info_):
    combine = lambda bt : {i for p in bt for i in p}

    data_ = data_.merge(vid_info,on='vid',how='left')
    recall_data_ = recall_data_.merge(vid_info,left_on='candi_vid',right_on='vid',how='left')

    lab_cols_ = ['did','candi_vid','stars','key_word','tags']
    lab_cols = ['did','stars','key_word','tags']
    data_ = data_[lab_cols]
    recall_data_ = recall_data_[lab_cols_]
    data_ = data_.groupby('did').agg(list).reset_index()

    data_['stars'] = data_['stars'].apply(combine)
    data_['tags'] = data_['tags'].apply(combine)
    data_['key_word'] = data_['key_word'].apply(combine)

    data_.rename(columns={'stars':'stars_h','tags':'tags_h','key_word':'key_word_h'},inplace=True)
    recall_data_ = recall_data_.merge(data_[['did','stars_h','tags_h','key_word_h']],on='did',how='left')

    gc.collect()

    def sim_score(df1,df2):
        return len(df1 & df2)
    recall_data_['stars_sim'] = recall_data_.apply(lambda row: sim_score(row['stars'],row['stars_h']),axis=1)
    recall_data_['tags_sim'] = recall_data_.apply(lambda row: sim_score(row['tags'],row['tags_h']),axis=1)
    recall_data_['key_word_sim'] = recall_data_.apply(lambda row: sim_score(row['key_word'],row['key_word_h']),axis=1)

    recall_data_ = reduce_mem(recall_data_)
    return recall_data_[['did','candi_vid','stars_sim','tags_sim','key_word_sim']]

In [33]:
data_fea_lab = lab_feature(train_data.copy(),local_recall.copy(),vid_info)
data_fea_lab

# all_fea_lab = lab_feature(all_data.copy(),all_recall.copy(),vid_info)

-- Mem. usage decreased to 942.88 Mb (22.7% reduction),time spend:0.01 min


Unnamed: 0,did,candi_vid,stars_sim,tags_sim,key_word_sim
0,170908,98102.0,5,23,10
1,170907,7358.0,7,0,14
2,170904,13785.0,5,0,12
3,170903,73120.0,6,1,9
4,170902,94570.0,3,1,8
...,...,...,...,...,...
14539427,1,21778.0,1,0,1
14539428,1,24172.0,0,7,1
14539429,1,25648.0,3,0,5
14539430,1,27955.0,0,3,0


In [13]:
# data_fea_lab = reduce_mem(data_fea_lab)
# data_fea_lab.to_csv(save_path + 'data_fea_lab.csv',index=False)
data_fea_lab = pd.read_csv(save_path + 'data_fea_lab.csv')

In [66]:
vid_emb = pd.read_csv(save_path + 'emb_vid.csv')

In [75]:
data_fea = data_fea_val.merge(data_fea_sta,on='did',how='left')
data_fea = data_fea.merge(data_fea_cat,on=['did','candi_vid'],how='left')
data_fea = data_fea.merge(data_fea_lab,on=['did','candi_vid'],how='left')
data_fea = data_fea.merge(vid_emb,on=['did','candi_vid'],how='left')
data_fea = reduce_mem(data_fea)

-- Mem. usage decreased to 1552.98 Mb (30.9% reduction),time spend:0.06 min


In [25]:
import sys
print(sys.getsizeof(data_fea) / 1024 / 1024 / 1024, 'GB')

3.619150928221643 GB


In [15]:
# t = train_data.groupby('did').head(1)
# data_fea = data_fea.merge(t[['did','vid']],on='did',how='left')
# data_fea

Unnamed: 0,did,candi_vid,label,data,fpn_score,nen_score,next_score,vid_pop,vid_pop_7,vid_pop_2,wr_mean,fr_mean,wr,fr,time_diff,wr_favor,fr_favor,dura_mean,dura_max,dura_min,vp_mean,vp_max,vp_min,til_mean,dura_last,vp_last,til_last,cp_fp,sn_is,cla_ser,cid_s,isi_s,cla_s,ser_s,stars_sim,tags_sim,key_word_sim,vid
0,170908,98102.0,1,1,18.0,593.0,23.0,755,477.0,49.0,0.409424,0.873047,0.582520,0.782715,8029.0,0.367432,0.870605,2258.0,2543,1674,765.5,809,746,24.0,1674,756,24,1_26,33_1,2_0,30,67,69,69,5,23,10,110982
1,170908,93812.0,0,1,0.0,0.0,0.0,17964,12008.0,772.0,0.445312,0.689941,0.582520,0.782715,8029.0,0.367432,0.870605,2258.0,2543,1674,765.5,809,746,24.0,1674,756,24,1_26,28_1,1_106430,0,67,0,0,0,0,1,110982
2,170908,76264.0,0,1,0.0,0.0,1.0,284,165.0,21.0,0.154663,0.366455,0.582520,0.782715,8029.0,0.367432,0.870605,2258.0,2543,1674,765.5,809,746,24.0,1674,756,24,1_26,1_1,2_0,0,67,69,69,0,17,2,110982
3,170908,67500.0,0,1,0.0,0.0,1.0,1042,621.0,129.0,0.201782,0.492676,0.582520,0.782715,8029.0,0.367432,0.870605,2258.0,2543,1674,765.5,809,746,24.0,1674,756,24,1_26,2_1,1_0,0,67,0,69,0,0,1,110982
4,170908,61876.0,0,1,1.0,1.0,1.0,9404,8808.0,511.0,0.854004,0.416016,0.582520,0.782715,8029.0,0.367432,0.870605,2258.0,2543,1674,765.5,809,746,24.0,1674,756,24,1_26,3_3,1_97664,0,0,0,0,0,0,1,110982
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14539427,0,31359.0,0,1,28.0,19.0,23.0,14722,10816.0,1465.0,0.479248,0.856445,0.505859,0.974609,3005.0,0.403076,0.670898,5204.0,7918,2717,16384.0,38545,4829,47.0,6521,38545,47,1_130,7_1,2_0,0,6,0,0,0,0,0,75604
14539428,0,63964.0,0,1,6.0,11.0,11.0,676,374.0,33.0,0.431641,0.510254,0.505859,0.974609,3005.0,0.403076,0.670898,5204.0,7918,2717,16384.0,38545,4829,47.0,6521,38545,47,1_130,14_1,1_51680,0,6,6,6,3,0,7,75604
14539429,0,22143.0,0,1,4.0,40.0,10.0,5418,2796.0,421.0,0.629883,0.740723,0.505859,0.974609,3005.0,0.403076,0.670898,5204.0,7918,2717,16384.0,38545,4829,47.0,6521,38545,47,1_130,2_1,2_0,0,6,0,0,0,0,0,75604
14539430,0,55044.0,0,1,13.0,30.0,12.0,9828,4156.0,478.0,0.470459,0.746582,0.505859,0.974609,3005.0,0.403076,0.670898,5204.0,7918,2717,16384.0,38545,4829,47.0,6521,38545,47,1_130,4_1,1_0,0,6,6,0,0,0,1,75604


# embedding 特征

In [16]:
from gensim.models import Word2Vec
# 用户侧的 embedding
vid_list = train_data.groupby(['did'])['vid'].apply(list).values
vid_w2v = Word2Vec(sentences=vid_list[:], window=5, min_count=1, sg=1, workers=8,seed=2022)

user_w2v_emb_dict = {k: vid_w2v.wv[k] for k in train_data['vid']}

def emb_v(df1,df2):
    return np.dot(user_w2v_emb_dict[df1],user_w2v_emb_dict[df2])
data_fea['vid_emb'] = data_fea.apply(lambda row: emb_v(row['candi_vid'],row['vid']),axis=1)

In [76]:
del data_fea['vid_pop']

In [84]:
gc.collect()
data_fea

Unnamed: 0,did,candi_vid,label,data,fpn_score,nen_score,next_score,vid_pop_7,vid_pop_2,wr_mean,fr_mean,wr,fr,time_diff,wr_favor,fr_favor,dura_mean,dura_max,dura_min,vp_mean,vp_max,vp_min,til_mean,dura_last,vp_last,til_last,cp_fp,sn_is,cla_ser,cid_s,isi_s,cla_s,ser_s,stars_sim,tags_sim,key_word_sim,vid_emb
0,170908,98102.0,1,1,18.0,593.0,23.0,477.0,49.0,0.409424,0.873047,0.582520,0.782715,2.230278,0.367432,0.870605,2258.0,2543,1674,765.5,809,746,24.0,1674,756,24,1_26,33_1,2_0,30,67,69,69,5,23,10,41.781250
1,170908,93812.0,0,1,0.0,0.0,0.0,12008.0,772.0,0.445312,0.689941,0.582520,0.782715,2.230278,0.367432,0.870605,2258.0,2543,1674,765.5,809,746,24.0,1674,756,24,1_26,28_1,1_106430,0,67,0,0,0,0,1,24.250000
2,170908,76264.0,0,1,0.0,0.0,1.0,165.0,21.0,0.154663,0.366455,0.582520,0.782715,2.230278,0.367432,0.870605,2258.0,2543,1674,765.5,809,746,24.0,1674,756,24,1_26,1_1,2_0,0,67,69,69,0,17,2,12.335938
3,170908,67500.0,0,1,0.0,0.0,1.0,621.0,129.0,0.201782,0.492676,0.582520,0.782715,2.230278,0.367432,0.870605,2258.0,2543,1674,765.5,809,746,24.0,1674,756,24,1_26,2_1,1_0,0,67,0,69,0,0,1,21.218750
4,170908,61876.0,0,1,1.0,1.0,1.0,8808.0,511.0,0.854004,0.416016,0.582520,0.782715,2.230278,0.367432,0.870605,2258.0,2543,1674,765.5,809,746,24.0,1674,756,24,1_26,3_3,1_97664,0,0,0,0,0,0,1,21.531250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14539427,0,31359.0,0,1,28.0,19.0,23.0,10816.0,1465.0,0.479248,0.856445,0.505859,0.974609,0.834722,0.403076,0.670898,5204.0,7918,2717,16384.0,38545,4829,47.0,6521,38545,47,1_130,7_1,2_0,0,6,0,0,0,0,0,29.296875
14539428,0,63964.0,0,1,6.0,11.0,11.0,374.0,33.0,0.431641,0.510254,0.505859,0.974609,0.834722,0.403076,0.670898,5204.0,7918,2717,16384.0,38545,4829,47.0,6521,38545,47,1_130,14_1,1_51680,0,6,6,6,3,0,7,22.671875
14539429,0,22143.0,0,1,4.0,40.0,10.0,2796.0,421.0,0.629883,0.740723,0.505859,0.974609,0.834722,0.403076,0.670898,5204.0,7918,2717,16384.0,38545,4829,47.0,6521,38545,47,1_130,2_1,2_0,0,6,0,0,0,0,0,18.953125
14539430,0,55044.0,0,1,13.0,30.0,12.0,4156.0,478.0,0.470459,0.746582,0.505859,0.974609,0.834722,0.403076,0.670898,5204.0,7918,2717,16384.0,38545,4829,47.0,6521,38545,47,1_130,4_1,1_0,0,6,6,0,0,0,1,20.781250


In [83]:
data_fea['time_diff'] = data_fea['time_diff'] / 3600 

In [None]:
# 合集大于 2 的用户 在时间粒度越大的时候越为重要
# add = lambda x: x[x['cid_s'] >= 2]
# data_fea[data_fea['cid_s'] >= 2]['cid_s'] = data_fea[data_fea['cid_s'] >= 2]['cid_s'] * data_fea[data_fea['cid_s'] >= 2]['time_diff']
# data_fea.apply(add)

In [85]:
# 在最近 2 小时观看中 ，最后一次时间粒度越大的用户，越发重要
data_fea['vid_pop_2'] = data_fea['vid_pop_2'] * data_fea['time_diff']

In [86]:
data_fea['time_diff'] = data_fea['time_diff'] * data_fea['cid_s']

In [87]:
# del data_fea['vid']
data_fea = data_fea.merge(vid_info[['vid','duration','title_length']],left_on='candi_vid',right_on='vid',how='left')
del data_fea['vid']

In [89]:
data_fea['duration_s'] = abs(data_fea['duration'] - data_fea['dura_last'])
data_fea['title_s'] = abs(data_fea['title_length'] - data_fea['til_last'])

In [99]:
train_data['did_c'] = train_data['did'].map(train_data.did.value_counts())
t = train_data.groupby('did').head(1)

data_fea = data_fea.merge(t[['did','did_c']],on='did',how='left')

data_fea['cid_ratio'] = data_fea['cid_s'] / data_fea['did_c']

In [101]:
# del data_fea['duration']
# del data_fea['title_length']
# del data_fea['dura_last']
# del data_fea['vp_last']
# del data_fea['til_last']

In [93]:
gc.collect()
# 二阶特征组合
category_k = ['sn_is','cla_ser','cp_fp']
for c in category_k:
    data_fea[c] = data_fea[c].astype('category')
tst_data = reduce_mem(data_fea)

-- Mem. usage decreased to 1247.97 Mb (2.2% reduction),time spend:0.06 min


In [102]:
trn_data = data_fea[data_fea['data'] == 1]
val_data = data_fea[data_fea['data'] == 2]
tst_data = data_fea[data_fea['data'] == 3]

# del data_fea 

g_train = trn_data.groupby(['did'], as_index=False).count()["label"].values
g_valid = val_data.groupby(['did'], as_index=False).count()["label"].values

del trn_data['data']
del val_data['data']
del tst_data['data']

In [95]:
def mrr6(stat_data,log,topk=6):

    stat_data = stat_data.sort_values(by=['did','pred_score'],ascending=False)
    stat_data = stat_data.groupby('did').head(topk)

    comp_dict = dict()
    for did,cand_vid in tqdm(stat_data.groupby('did')):
        comp_dict[did] = cand_vid['candi_vid']

    # 评分函数 dict()版本
    score = np.zeros(log.shape[0])

    for i in tqdm(range(len(log))):
        test_did = log.iloc[i].did
        test_vid = log.iloc[i].vid 
        
        recall_ = comp_dict[test_did]
        # mrr 评分
        for j in range(len(recall_)):
            if test_vid == recall_.iloc[j]:
                score[i] = 1/(1+j)
    # print(len(score))
    # print(np.mean(score))
    return np.mean(score)

In [35]:
def submit(submit_data,name):
    
    submit_data = submit_data.sort_values(by=['did','pred_score'],ascending=False)
    res = submit_data.groupby('did').head(6)

    res = res[['did','candi_vid','pred_score']]
    res.rename(columns={'candi_vid':'vid'},inplace=True)

    res['rank'] = res.groupby('did').cumcount()+1 
    res['vid'] = res['vid'].apply(int)
    # 标签编码还原

    res['did'] = data_did_lb.inverse_transform(res['did'])
    res['vid'] = vid_info_lb.inverse_transform(res['vid'])

    res[['did','vid','rank']].to_csv('./result/' + name,index=False)
    print('结果已保存：','./result/' + name)

In [97]:
trn_data

Unnamed: 0,did,candi_vid,label,fpn_score,nen_score,next_score,vid_pop_7,vid_pop_2,wr_mean,fr_mean,wr,fr,time_diff,wr_favor,fr_favor,dura_mean,dura_max,dura_min,vp_mean,vp_max,vp_min,til_mean,cp_fp,sn_is,cla_ser,cid_s,isi_s,cla_s,ser_s,stars_sim,tags_sim,key_word_sim,vid_emb,duration,title_length,duration_s,title_s,did_c,cid_ratio
0,170908,98102.0,1,18.0,593.0,23.0,477.0,109.283607,0.409424,0.873047,0.582520,0.782715,66.9375,0.367432,0.870605,2258.0,2543,1674,765.5,809,746,24.0,1_26,33_1,2_0,30,67,69,69,5,23,10,41.781250,2087,24,413,0,69,0.434783
1,170908,93812.0,0,0.0,0.0,0.0,12008.0,1721.774414,0.445312,0.689941,0.582520,0.782715,0.0000,0.367432,0.870605,2258.0,2543,1674,765.5,809,746,24.0,1_26,28_1,1_106430,0,67,0,0,0,0,1,24.250000,4459,104,2785,80,69,0.000000
2,170908,76264.0,0,0.0,0.0,1.0,165.0,46.835835,0.154663,0.366455,0.582520,0.782715,0.0000,0.367432,0.870605,2258.0,2543,1674,765.5,809,746,24.0,1_26,1_1,2_0,0,67,69,69,0,17,2,12.335938,3967,20,2293,4,69,0.000000
3,170908,67500.0,0,0.0,0.0,1.0,621.0,287.705841,0.201782,0.492676,0.582520,0.782715,0.0000,0.367432,0.870605,2258.0,2543,1674,765.5,809,746,24.0,1_26,2_1,1_0,0,67,0,69,0,0,1,21.218750,5450,98,3776,74,69,0.000000
4,170908,61876.0,0,1.0,1.0,1.0,8808.0,1139.671997,0.854004,0.416016,0.582520,0.782715,0.0000,0.367432,0.870605,2258.0,2543,1674,765.5,809,746,24.0,1_26,3_3,1_97664,0,0,0,0,0,0,1,21.531250,48,98,1626,74,69,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5208104,0,31359.0,0,28.0,19.0,23.0,10816.0,1222.868042,0.479248,0.856445,0.505859,0.974609,0.0000,0.403076,0.670898,5204.0,7918,2717,16384.0,38545,4829,47.0,1_130,7_1,2_0,0,6,0,0,0,0,0,29.296875,2299,20,4222,27,6,0.000000
5208105,0,63964.0,0,6.0,11.0,11.0,374.0,27.545834,0.431641,0.510254,0.505859,0.974609,0.0000,0.403076,0.670898,5204.0,7918,2717,16384.0,38545,4829,47.0,1_130,14_1,1_51680,0,6,6,6,3,0,7,22.671875,6916,57,395,10,6,0.000000
5208106,0,22143.0,0,4.0,40.0,10.0,2796.0,351.418060,0.629883,0.740723,0.505859,0.974609,0.0000,0.403076,0.670898,5204.0,7918,2717,16384.0,38545,4829,47.0,1_130,2_1,2_0,0,6,0,0,0,0,0,18.953125,818,20,5703,27,6,0.000000
5208107,0,55044.0,0,13.0,30.0,12.0,4156.0,398.997223,0.470459,0.746582,0.505859,0.974609,0.0000,0.403076,0.670898,5204.0,7918,2717,16384.0,38545,4829,47.0,1_130,4_1,1_0,0,6,6,0,0,0,1,20.781250,3976,50,2545,3,6,0.000000


In [104]:
lgb_cols = trn_data.columns
# 不用的特征列
lgb_cols_n = ['did','candi_vid','label','vid','cp_fp','sn_is','cid_s']

lgb_cols = [x for x in lgb_cols if x not in lgb_cols_n]
print(len(lgb_cols))
lgb_cols

32


['fpn_score',
 'nen_score',
 'next_score',
 'vid_pop_7',
 'vid_pop_2',
 'wr_mean',
 'fr_mean',
 'wr',
 'fr',
 'time_diff',
 'wr_favor',
 'fr_favor',
 'dura_mean',
 'dura_max',
 'dura_min',
 'vp_mean',
 'vp_max',
 'vp_min',
 'til_mean',
 'cla_ser',
 'isi_s',
 'cla_s',
 'ser_s',
 'stars_sim',
 'tags_sim',
 'key_word_sim',
 'vid_emb',
 'duration',
 'title_length',
 'duration_s',
 'title_s',
 'cid_ratio']

In [105]:
lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt',
                       objective='binary',
                       num_leaves=64,
                       reg_alpha=0.3,
                       reg_lambda=0.5,
                       max_depth=-1,
                       n_estimators=20000,
                       subsample=0.8,
                       feature_fraction=0.8,
                       bagging_fraction=0.8,
                       colsample_bytree=0.8,
                       learning_rate=0.06,
                       min_child_samples=10,
                       random_state=1998,
                       n_jobs= 16,
                       )

In [106]:
gc.collect() 
offline = True 
if offline: 
    lgb_ranker.fit(trn_data[lgb_cols], trn_data['label'], group=g_train,  
                eval_set=[(val_data[lgb_cols], val_data['label'])],  
                eval_group= [g_valid], eval_at=[6], 
                verbose=50, eval_metric=['map'], early_stopping_rounds=50,) 
else: 
    lgb_ranker.fit(trn_data[lgb_cols], trn_data['label'], group=g_train) 

[50]	valid_0's map@6: 0.818511	valid_0's binary_logloss: 0.0386358
[100]	valid_0's map@6: 0.824229	valid_0's binary_logloss: 0.0368862
[150]	valid_0's map@6: 0.826654	valid_0's binary_logloss: 0.0363287
[200]	valid_0's map@6: 0.827469	valid_0's binary_logloss: 0.0360893
[250]	valid_0's map@6: 0.82835	valid_0's binary_logloss: 0.0359439
[300]	valid_0's map@6: 0.829045	valid_0's binary_logloss: 0.0358134
[350]	valid_0's map@6: 0.829134	valid_0's binary_logloss: 0.0357214
[400]	valid_0's map@6: 0.829782	valid_0's binary_logloss: 0.0356311
[450]	valid_0's map@6: 0.83015	valid_0's binary_logloss: 0.0355723
[500]	valid_0's map@6: 0.830251	valid_0's binary_logloss: 0.0355259
[550]	valid_0's map@6: 0.830541	valid_0's binary_logloss: 0.0354855
[600]	valid_0's map@6: 0.830697	valid_0's binary_logloss: 0.0354429
[650]	valid_0's map@6: 0.831269	valid_0's binary_logloss: 0.03541
[700]	valid_0's map@6: 0.831089	valid_0's binary_logloss: 0.0353756


### 线下预测，调试

In [107]:
gc.collect()
# 模型泛化预测
tst_data['pred_score'] = lgb_ranker.predict(tst_data[lgb_cols], num_iteration=lgb_ranker.best_iteration_)
# 评分函数
tst_d = tst_data.did.unique()
tst_log = local_final_log[local_final_log['did'].isin(tst_d)]
mrr6(tst_data,tst_log,topk=6)

100%|██████████| 30909/30909 [00:01<00:00, 17137.99it/s]
100%|██████████| 30909/30909 [00:04<00:00, 6625.78it/s]


0.4733912452683684

In [None]:
df_importance = pd.DataFrame({
    'column': lgb_cols,
    'importance': lgb_ranker.feature_importances_,
})
df_importance.sort_values(by='importance',ascending=False)

In [None]:
lgb_ranker.booster_.save_model('model.txt')

In [44]:
train_f = pd.concat((trn_data,val_data))

In [49]:
import sys
print(sys.getsizeof(train_f) / 1024 / 1024 / 1024, 'GB')

0.6126533821225166 GB


In [47]:
train_f = reduce_mem(train_f)

-- Mem. usage decreased to 627.29 Mb (11.0% reduction),time spend:0.03 min


In [51]:
del train_f['vid_pop']

## lgb Ranker 的五折交叉验证

In [None]:
offline= False
res_path = './result/'

def get_kfold_users(trn_df, n=5):
    user_ids = trn_df['did'].unique()
    user_set = [user_ids[i::n] for i in range(n)]
    return user_set

k_fold = 5
trn_df = train_f
user_set = get_kfold_users(trn_df, n=k_fold)

score_list = []
df_importance_list = []
score_df = trn_df[['did', 'candi_vid','label']]
# sub_preds = np.zeros(all_f.shape[0])

# 五折交叉验证，并将中间结果保存用于staking
for n_fold, valid_user in enumerate(user_set):
    train_idx = trn_df[~trn_df['did'].isin(valid_user)] # add slide user
    valid_idx = trn_df[trn_df['did'].isin(valid_user)]
    
    # 训练集与验证集的用户分组
    train_idx.sort_values(by=['did'], inplace=True)
    g_train = train_idx.groupby(['did'], as_index=False).count()["label"].values
    
    valid_idx.sort_values(by=['did'], inplace=True)
    g_val = valid_idx.groupby(['did'], as_index=False).count()["label"].values
        
    lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt',
                        objective='binary',
                        num_leaves=64,
                        reg_alpha=0.3,
                        reg_lambda=0.5,
                        max_depth=-1,
                        n_estimators=20000,
                        subsample=0.8,
                        feature_fraction=0.8,
                        bagging_fraction=0.9,
                        learning_rate=0.06,
                        min_child_samples=10,
                        random_state=1998,
                        n_jobs= 16,
                        )


    lgb_ranker.fit(train_idx[lgb_cols], train_idx['label'], group=g_train,
                   eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], eval_group= [g_val], 
                   eval_at=[6], eval_metric=['ndcg','auc' ], early_stopping_rounds=50, verbose=50)

    gc.collect()
    
    valid_idx['pred_score'] = lgb_ranker.predict(valid_idx[lgb_cols], num_iteration=lgb_ranker.best_iteration_)
    

    df_importance = pd.DataFrame({
        'column': lgb_cols,
        'importance': lgb_ranker.feature_importances_,
    })
    df_importance_list.append(df_importance)



    valid_idx.sort_values(by=['did', 'pred_score'])

    score_list.append(valid_idx[['did', 'candi_vid', 'pred_score']])

    # 如果是线上测试，需要计算每次交叉验证的结果相加，最后求平均
    # if not offline:
    #     sub_preds += lgb_ranker.predict(all_f[lgb_cols], lgb_ranker.best_iteration_)
    
score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['did', 'candi_vid'])

df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['column'])['importance'].agg(
    'mean').sort_values(ascending=False).reset_index()
df_importance

In [152]:
# 评分函数
# mrr6(score_df,local_final_log,topk=6)
t = local_final_log[local_final_log['did'].isin(num1 + num2)]
# mrr6(score_df,local_final_log,topk=6)
mrr6(score_df,t,topk=6)

100%|██████████| 140000/140000 [00:07<00:00, 18288.09it/s]
100%|██████████| 140000/140000 [00:27<00:00, 5092.20it/s]


0.6308615476190477

## 2、LGBM 分类模型

In [146]:
# 模型及参数的定义
lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt',objective='binary', num_leaves=301, reg_alpha=0, reg_lambda=1,
                            max_depth=21, n_estimators=100, subsample=0.7,feature_fraction=0.7,
                            learning_rate=0.06, random_state=1998, n_jobs= 16)  

In [147]:
# 模型训练
offline=True 
if offline:
    lgb_Classfication.fit(trn_data[lgb_cols], trn_data['label'],
                    eval_set=[(val_data[lgb_cols], val_data['label'])], 
                    eval_metric=['auc','logloss'], early_stopping_rounds=50,verbose=50)
else:
    lgb_Classfication.fit(trn_data[lgb_cols], trn_data['label'])

[50]	valid_0's auc: 0.931355	valid_0's binary_logloss: 0.0460436
[100]	valid_0's auc: 0.934681	valid_0's binary_logloss: 0.0450174


In [148]:
# 模型预测
trn_data['pred_score'] = lgb_Classfication.predict_proba(trn_data[lgb_cols])[:,1]
# 评分函数
tst_log = local_final_log[local_final_log['did'].isin(num1)]
mrr6(trn_data,tst_log,topk=6)
# 模型预测
tst_data['pred_score'] = lgb_Classfication.predict_proba(tst_data[lgb_cols])[:,1]
# 评分函数
tst_log = local_final_log[local_final_log['did'].isin(num3)]
mrr6(tst_data,tst_log,topk=6)

100%|██████████| 30909/30909 [00:03<00:00, 8803.64it/s] 
100%|██████████| 30909/30909 [00:05<00:00, 5197.00it/s]


0.4479380331510779

## LGBClassifyer 的交叉五折验证

In [None]:
offline=False

def get_kfold_users(trn_df, n=5):
    user_ids = trn_df['did'].unique()
    user_set = [user_ids[i::n] for i in range(n)]
    return user_set

k_fold = 5
trn_df = train_data_vid_fea
user_set = get_kfold_users(trn_df, n=k_fold)

score_list = []
score_df = trn_df[['did', 'candi_vid','label']]
sub_preds = np.zeros(all_data_vid_fea.shape[0])

# 五折交叉验证，并将中间结果保存用于staking
for n_fold, valid_user in enumerate(user_set):
    train_idx = trn_df[~trn_df['did'].isin(valid_user)] # add slide user
    valid_idx = trn_df[trn_df['did'].isin(valid_user)]
    
    # 训练集与验证集的用户分组
    train_idx.sort_values(by=['did'], inplace=True)
    g_train = train_idx.groupby(['did'], as_index=False).count()["label"].values
    
    valid_idx.sort_values(by=['did'], inplace=True)
    g_val = valid_idx.groupby(['did'], as_index=False).count()["label"].values
    

    # 模型及参数的定义
    lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=127, reg_alpha=0.0, reg_lambda=1,
                                max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                                learning_rate=0.01, min_child_weight=50, random_state=1998, n_jobs= 16, verbose=10)  

    lgb_Classfication.fit(trn_data[lgb_cols], trn_data['label'],
                    eval_set=[(val_data[lgb_cols], val_data['label'])], 
                    eval_metric=['auc', ],early_stopping_rounds=50,)
    
    valid_idx['pred_score'] = lgb_ranker.predict(valid_idx[lgb_cols], num_iteration=lgb_ranker.best_iteration_)
    
    # 对输出结果进行归一化
    # valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))
    
    valid_idx.sort_values(by=['did', 'pred_score'])

    score_list.append(valid_idx[['did', 'candi_vid', 'pred_score']])
    
    # 如果是线上测试，需要计算每次交叉验证的结果相加，最后求平均
    if not offline:
        sub_preds += lgb_ranker.predict(all_data_vid_fea[lgb_cols], lgb_ranker.best_iteration_)
    
score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['did', 'candi_vid'])
# 保存训练集交叉验证产生的新特征
# score_df[['did', 'candi_vid', 'pred_score', 'label']].to_csv(save_path + 'trn_lgb_ranker_feats.csv', index=False)
    
# 测试集的预测结果，多次交叉验证求平均,将预测的score和对应的rank特征保存，可以用于后面的staking，这里还可以构造其他更多的特征
all_data_vid_fea['pred_score'] = sub_preds / k_fold
# all_data_vid_fea['pred_score'] = all_data_vid_fea['pred_score'].transform(lambda x: norm_sim(x))
# all_data_vid_fea.sort_values(by=['did', 'pred_score'])
# all_data_vid_fea['pred_rank'] = all_data_vid_fea.groupby(['did'])['pred_score'].rank(ascending=False, method='first')

# 保存测试集交叉验证的新特征
# all_data_vid_fea[['did', 'candi_vid', 'pred_score']].to_csv(save_path + 'all_data_lgbRanker.csv', index=False)

In [136]:
score_df.sort_values(by=['did','pred_score'],ascending=False,inplace=True)
# 评分函数
mrr6(score_df,local_final_log,topk=6)

100%|██████████| 170909/170909 [00:08<00:00, 21041.12it/s]
100%|██████████| 170909/170909 [00:36<00:00, 4654.25it/s]


170909
0.4575035057642762
