## LGB 排序模型模块

In [1]:
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
import gc, os
import time
from datetime import datetime
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

In [2]:
def mrr_score(stat_data,cols,local_test_log,topk=6):
    stat_ = stat_data[cols].copy()
    if 'pred_rank' in cols:
        stat_ = stat_.sort_values(by=['did','pred_rank'],ascending=False)
    else:
        stat_ = stat_.sort_values(by=['did','pred_score'],ascending=False)
    stat_ = stat_.groupby('did').head(topk)

    # 转为字典会更快
    # 用一点trick 吧
    comp_dict = dict()
    for did,cand_vid in tqdm(stat_.groupby('did')):
        comp_dict[did] = cand_vid['candi_vid']

    # 评分函数 dict()版本
    score = []
    for i in tqdm(range(len(local_test_log))):
        test_did = local_test_log.iloc[i].did
        test_vid = local_test_log.iloc[i].vid 

        recall_ = comp_dict[test_did]
        # mrr 评分
        flag = 0
        for i in range(len(recall_)):
            if test_vid == recall_.iloc[i]:
                score.append(1/(i+1))
                flag = 1
        if flag == 0:
            score.append(0) 
    print(len(score))
    print(np.mean(score))

In [3]:
# 排序结果归一化
def norm_sim(sim_df, weight=0.0):
    # print(sim_df.head())
    min_sim = sim_df.min()
    max_sim = sim_df.max()
    if max_sim == min_sim:
        sim_df = sim_df.apply(lambda sim: 1.0)
    else:
        sim_df = sim_df.apply(lambda sim: 1.0 * (sim - min_sim) / (max_sim - min_sim))

    sim_df = sim_df.apply(lambda sim: sim + weight)  # plus one
    return sim_df

In [4]:
save_path = './5000_sample/'
offline = True

In [5]:
## 用于验证的数据
local_train_log = pd.read_csv(save_path + 'local_train_log.csv')
local_valid_log = pd.read_csv(save_path + 'local_valid_log.csv')
local_test_log = pd.read_csv(save_path + 'local_test_log.csv')

## 原始的数据
local_train = pd.read_csv(save_path + 'local_train.csv')

In [6]:
local_valid = pd.concat((local_train,local_train_log))
local_test = pd.concat((local_valid,local_valid_log))
train = pd.concat((local_test,local_test_log))

# ____________________________________ 极其重要的
local_train = local_train.sort_values(by=['did','seq_no'],ascending=False).reset_index(drop=True)
local_valid = local_valid.sort_values(by=['did','seq_no'],ascending=False).reset_index(drop=True)
local_test = local_test.sort_values(by=['did','seq_no'],ascending=False).reset_index(drop=True)
train = train.sort_values(by=['did','seq_no'],ascending=False).reset_index(drop=True)

In [7]:
trn_user_recall = pd.read_csv(save_path + 'trn_user_recall_feature.csv')
val_user_recall = pd.read_csv(save_path + 'val_user_recall_feature.csv')
tst_user_recall = pd.read_csv(save_path + 'tst_user_recall_feature.csv')

In [8]:
local_test.columns

Index(['did', 'vid', 'cid', 'seq_no', 'serialno', 'is_intact', 'classify_id',
       'series_id', 'duration', 'vts', 'hb', 'time_gap', 'cpn', 'fpn',
       'title_length', 'upgc_flag', 'user_cid_n', 'cid_count', 'vid_count',
       'cid_own_vid', 'did_count'],
      dtype='object')

In [8]:
local_te = local_test[['did','cid','seq_no','serialno','is_intact','classify_id','series_id','duration','vid_count','time_gap','user_cid_n']].copy()
tr = train[['did','cid','seq_no','serialno','is_intact','classify_id','series_id','duration','vid_count','time_gap','user_cid_n']].copy()

## —————————分割线—————————————

In [9]:
trn_user_recall_feature = trn_user_recall.copy()
val_user_recall_feature = val_user_recall.copy()
tst_user_recall_feature = tst_user_recall.copy()

In [10]:
## 类别标签编码
from sklearn.preprocessing import LabelEncoder


lbe = LabelEncoder()
feature = ['V_cid','U_cid_like_hist','U_cid_like_rect','candi_vid']
for f in feature:
    trn_user_recall_feature[f] = lbe.fit_transform(trn_user_recall_feature[f])
    val_user_recall_feature[f] = lbe.fit_transform(val_user_recall_feature[f])
    tst_user_recall_feature[f] = lbe.fit_transform(tst_user_recall_feature[f])
trn_user_recall_feature.head(2)

Unnamed: 0,did,candi_vid,label,V_cid,V_is_intact,serialno,V_classify_id,V_series_id,V_duration,V_title_length,...,U_dura_max,U_dura_min,U_tl_mean,U_vid_count_mean,U_vid_count_max,U_vid_count_min,U_series_id,U_classify_id,U_is_intact,U_upgc_flag
0,00014f1f94c9dc0c98785386b89fa0e8,2923,0.0,918,2,1,3,0,57,39,...,8240,18,56.678571,9888.75,81484,30,105039,2,1,0
1,00014f1f94c9dc0c98785386b89fa0e8,954,0.0,442,1,25,2,105039,2606,31,...,8240,18,56.678571,9888.75,81484,30,105039,2,1,0


#### 从这里开始 调试 模型，———come on————！！！！

In [12]:
# # 防止中间出错之后重新读取数据
# trn_user_recall_feature_rank_model = trn_user_recall_feature.copy()
# val_user_recall_feature_rank_model = val_user_recall_feature.copy()

# tst_user_recall_feature_rank_model = tst_user_recall_feature.copy()

# # if offline:
# #     val_user_recall_feature_rank_model = val_user_recall_feature.copy()

In [11]:
# 排序模型分组
trn_user_recall_feature.sort_values(by=['did'], inplace=True)
g_train = trn_user_recall_feature.groupby(['did'], as_index=False).count()["label"].values
 
val_user_recall_feature.sort_values(by=['did'], inplace=True)
g_valid = val_user_recall_feature.groupby(['did'], as_index=False).count()["label"].values

len(g_train)


5000

In [39]:
# 定义特征列
# 三类特征，用户侧，物品侧，交叉侧，从分数来看，物品侧的特征更为重要
lgb_cols = [ 
            # 物品侧的基础特征
            'V_cid',
            'V_is_intact',
            'V_classify_id',
            'V_series_id',
            'V_duration',
            'V_title_length',
            'V_upgc_flag',      # 这两可能也较为重要
            # 物品侧的统计特征
            'V_vid_pop',
            'V_cid_pop',        # 这两个较为重要
            'V_cid_traffic_mean',
            
            'candi_vid',

            # 用户侧  用户侧的分数提升很少，只有 0.3 - 0.4 左右
            'U_cid_like_hist',
            'U_cid_like_rect',    # 去掉这两个 0.3645
            'U_dura_min',
            'U_dura_max',
            'U_dura_mean',   # 这三个的 不是很重要！
            'U_tl_mean',     # 有必要
            'U_vid_count_mean',
            'U_vid_count_max',
            'U_vid_count_min',
            # 用户侧
            'U_series_id',
            'U_classify_id',
            'U_is_intact',
            'U_upgc_flag', # 这个有比较大的提升

            # 交叉侧
            'V_follow', # 这是我究极上分的利器特征
            'V_collect',
            ]

## 1、LGBM 排序模型

In [40]:
# 排序模型定义
lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt',
                       num_leaves=127,
                       reg_alpha=0.0,
                       reg_lambda=1,
                       max_depth=-1,
                       n_estimators=5000,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       subsample_freq=1,
                       learning_rate=0.01,
                       min_child_weight=50,
                       random_state=1000,
                       n_jobs= 16
                       )


In [23]:
trn_user_recall_feature.shape

(476814, 31)

In [None]:
# 排序模型训练
offline = True
if offline:
    lgb_ranker.fit(trn_user_recall_feature[lgb_cols], trn_user_recall_feature['label'], group=g_train,
                eval_set=[(val_user_recall_feature[lgb_cols], val_user_recall_feature['label'])], 
                eval_group= [g_valid], eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, )
else:
    lgb_ranker.fit(trn_user_recall_feature[lgb_cols], trn_user_recall_feature['label'], group=g_train)

In [42]:
# 模型预测
tst_user_recall_feature['pred_score'] = lgb_ranker.predict(tst_user_recall_feature[lgb_cols], num_iteration=lgb_ranker.best_iteration_)

# 将这里的排序结果保存一份，用户后面的模型融合
# tst_user_recall_feature[['did', 'candi_vid', 'pred_score']].to_csv(save_path + 'lgb_tst_5000_ranker_score.csv', index=False)

In [43]:
def mark(cols,tst_user_recall_feature_):
    tst_user_recall_feature_['candi_vid'] = lbe.inverse_transform(tst_user_recall_feature_['candi_vid'])
    mrr_score(tst_user_recall_feature_,cols,local_test_log,topk=6)
cols = ['did','candi_vid','pred_score']
mark(cols,tst_user_recall_feature.copy())

100%|██████████| 5000/5000 [00:00<00:00, 30112.63it/s]
100%|██████████| 5000/5000 [00:00<00:00, 6086.88it/s]


5000
0.3649466666666667


In [27]:
# cols = ['did','candi_vid','pred_score']
# mrr_score(tst_user_recall_feature,cols,local_test_log,topk=6)

In [None]:
# 召回率：0.75 （12.8万）
# ———————————————————————— 禁止表 ———————————————————————
# 标签编码禁止： 'U_cid_like_hist', 'U_cid_like_rect'

# ———————————————————————— 记录表 ———————————————————————
# 全量 : 170909 》Ranker : 0.2170 》 head(6) : 61748 
# 5000样本：Ranker : 0.326 head(6) 正标签：2114 

# 恭喜mo.sir，贺喜mo.sir 找到了一个极其重要的特征
# ———————————————————————— 对比表 ———————————————————————
# no follow  0.2054          | follow  0.3348
# no LE      0.3308          | LE      0.3209    
# MD -1      0.3616 - 0.357  | Md 3    0.3308   超参数：1000(better)
# candi_vid  0.3633
# no cid(hist,rect) 0.3645
# no collect 0.3518          | collect 0.3555

# ----------------------- 召回率表 ---------------------
# + habit_action.head(10)    0.79   -> 提升 0.36

# ##################################################### 目前最好的评分 ： 0.3649


In [44]:
tst_user_recall_feature.sort_values(by=['did','pred_score'],ascending=False,inplace=True)

In [76]:
did = np.random.choice(tst_user_recall_feature.did.unique())
print(tst_user_recall_feature[tst_user_recall_feature['did'] == did].label.value_counts())

0.0    96
1.0     1
Name: label, dtype: int64


In [77]:
tst_user_recall_feature[tst_user_recall_feature['did'] == did].head(20)

Unnamed: 0,did,candi_vid,label,V_cid,V_is_intact,serialno,V_classify_id,V_series_id,V_duration,V_title_length,...,U_dura_min,U_tl_mean,U_vid_count_mean,U_vid_count_max,U_vid_count_min,U_series_id,U_classify_id,U_is_intact,U_upgc_flag,pred_score
90661,2fecb7c1e53bb6e3a1164ef7354adf32,4947,0.0,451,1,5,1,0,5475,116,...,1293,93.722222,15283.666667,81484,346,97664,1,1,0,0.233926
90704,2fecb7c1e53bb6e3a1164ef7354adf32,2470,0.0,142,1,2,1,105053,3249,105,...,1293,93.722222,15283.666667,81484,346,97664,1,1,0,0.134001
90656,2fecb7c1e53bb6e3a1164ef7354adf32,2487,0.0,887,1,44,1,51680,4832,105,...,1293,93.722222,15283.666667,81484,346,97664,1,1,0,0.131506
90629,2fecb7c1e53bb6e3a1164ef7354adf32,3378,0.0,651,1,1,1,0,4707,105,...,1293,93.722222,15283.666667,81484,346,97664,1,1,0,0.107009
90716,2fecb7c1e53bb6e3a1164ef7354adf32,3162,0.0,887,1,42,1,51680,4890,47,...,1293,93.722222,15283.666667,81484,346,97664,1,1,0,0.085305
90660,2fecb7c1e53bb6e3a1164ef7354adf32,4194,0.0,142,1,1,1,105053,2918,107,...,1293,93.722222,15283.666667,81484,346,97664,1,1,0,-0.02279
90721,2fecb7c1e53bb6e3a1164ef7354adf32,2666,0.0,357,1,11,1,0,6495,97,...,1293,93.722222,15283.666667,81484,346,97664,1,1,0,-0.054728
90710,2fecb7c1e53bb6e3a1164ef7354adf32,6228,0.0,51,1,12,1,55601,4790,117,...,1293,93.722222,15283.666667,81484,346,97664,1,1,0,-0.146136
90709,2fecb7c1e53bb6e3a1164ef7354adf32,1831,0.0,357,1,8,1,0,5698,91,...,1293,93.722222,15283.666667,81484,346,97664,1,1,0,-0.236164
90637,2fecb7c1e53bb6e3a1164ef7354adf32,3994,1.0,907,3,4,1,97664,60,98,...,1293,93.722222,15283.666667,81484,346,97664,1,1,0,-0.282345


In [78]:
# 测试集
local_te[local_te['did'] == did].head(5)

Unnamed: 0,did,cid,seq_no,serialno,is_intact,classify_id,series_id,duration,vid_count,time_gap,user_cid_n
140225,2fecb7c1e53bb6e3a1164ef7354adf32,d0ca271c9f8c6860498759f1bc30ee0d,18.0,1,1,1,106595,2030,32183,12.0,1
140226,2fecb7c1e53bb6e3a1164ef7354adf32,f703a7e171878113d1854d9e25b1df7f,17.0,3,1,1,97664,6244,61467,143942.0,3
140227,2fecb7c1e53bb6e3a1164ef7354adf32,81adb9534cfe82a795d343a8ff2eab64,16.0,50,1,1,106049,1493,6341,169695.0,2
140228,2fecb7c1e53bb6e3a1164ef7354adf32,81adb9534cfe82a795d343a8ff2eab64,15.0,49,1,1,106049,5241,26249,24334.0,2
140229,2fecb7c1e53bb6e3a1164ef7354adf32,4eef2762d472c86e4fd801c13974f71a,14.0,17,1,1,51680,6207,457,85586.0,3


In [79]:
# 查看测试集的答案
print(tr[tr['did'] == did].shape)
tr[tr['did'] == did].head(10)

(19, 11)


Unnamed: 0,did,cid,seq_no,serialno,is_intact,classify_id,series_id,duration,vid_count,time_gap,user_cid_n
144290,2fecb7c1e53bb6e3a1164ef7354adf32,f703a7e171878113d1854d9e25b1df7f,19.0,4,3,1,97664,60,9001,5758.0,3
144291,2fecb7c1e53bb6e3a1164ef7354adf32,d0ca271c9f8c6860498759f1bc30ee0d,18.0,1,1,1,106595,2030,32183,12.0,1
144292,2fecb7c1e53bb6e3a1164ef7354adf32,f703a7e171878113d1854d9e25b1df7f,17.0,3,1,1,97664,6244,61467,143942.0,3
144293,2fecb7c1e53bb6e3a1164ef7354adf32,81adb9534cfe82a795d343a8ff2eab64,16.0,50,1,1,106049,1493,6341,169695.0,2
144294,2fecb7c1e53bb6e3a1164ef7354adf32,81adb9534cfe82a795d343a8ff2eab64,15.0,49,1,1,106049,5241,26249,24334.0,2
144295,2fecb7c1e53bb6e3a1164ef7354adf32,4eef2762d472c86e4fd801c13974f71a,14.0,17,1,1,51680,6207,457,85586.0,3
144296,2fecb7c1e53bb6e3a1164ef7354adf32,f703a7e171878113d1854d9e25b1df7f,13.0,2,1,1,97664,6170,81484,84280.0,3
144297,2fecb7c1e53bb6e3a1164ef7354adf32,4eef2762d472c86e4fd801c13974f71a,12.0,14,1,1,51680,3311,346,1954.0,3
144298,2fecb7c1e53bb6e3a1164ef7354adf32,4eef2762d472c86e4fd801c13974f71a,11.0,11,1,1,51680,5508,583,754.0,3
144299,2fecb7c1e53bb6e3a1164ef7354adf32,f27d513482d67a5112e36152836de88f,10.0,41,1,1,51680,6521,38545,57224.0,2


In [80]:
# t = tst_user_recall_feature.copy()
# t.sort_values(by=['did','pred_score'],ascending=False,inplace=True)
# t = t.groupby('did').head(20)
# t.label.value_counts()

In [81]:
# t.to_csv('./5000_sample/lgb_rank20.csv',index=False)

In [82]:
# 五折交叉验证，这里的五折交叉是以用户为目标进行五折划分
#  这一部分与前面的单独训练和验证是分开的
def get_kfold_users(trn_df, n=5):
    user_ids = trn_df['did'].unique()
    user_set = [user_ids[i::n] for i in range(n)]
    return user_set

k_fold = 5
trn_df = trn_user_recall_feature
user_set = get_kfold_users(trn_df, n=k_fold)

score_list = []
score_df = trn_df[['did', 'candi_vid','label']]
sub_preds = np.zeros(tst_user_recall_feature.shape[0])

# 五折交叉验证，并将中间结果保存用于staking
for n_fold, valid_user in enumerate(user_set):
    train_idx = trn_df[~trn_df['did'].isin(valid_user)] # add slide user
    valid_idx = trn_df[trn_df['did'].isin(valid_user)]
    
    # 训练集与验证集的用户分组
    train_idx.sort_values(by=['did'], inplace=True)
    g_train = train_idx.groupby(['did'], as_index=False).count()["label"].values
    
    valid_idx.sort_values(by=['did'], inplace=True)
    g_val = valid_idx.groupby(['did'], as_index=False).count()["label"].values
    
    # 定义模型
    lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=1000, n_jobs= 16)  
    # 训练模型
    lgb_ranker.fit(train_idx[lgb_cols], train_idx['label'], group=g_train,
                   eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], eval_group= [g_val], 
                   eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, )
    
    # 预测验证集结果
    valid_idx['pred_score'] = lgb_ranker.predict(valid_idx[lgb_cols], num_iteration=lgb_ranker.best_iteration_)
    
    # 对输出结果进行归一化
    valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))
    
    valid_idx.sort_values(by=['did', 'pred_score'])
    valid_idx['pred_rank'] = valid_idx.groupby(['did'])['pred_score'].rank(ascending=False, method='first')
    
    # 将验证集的预测结果放到一个列表中，后面进行拼接
    score_list.append(valid_idx[['did', 'candi_vid', 'pred_score', 'pred_rank']])
    
    # 如果是线上测试，需要计算每次交叉验证的结果相加，最后求平均
    if not offline:
        sub_preds += lgb_ranker.predict(tst_user_recall_feature_rank_model[lgb_cols], lgb_ranker.best_iteration_)
    
score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['did', 'candi_vid'])
# 保存训练集交叉验证产生的新特征
score_df[['did', 'candi_vid', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_ranker_feats.csv', index=False)
    
# 测试集的预测结果，多次交叉验证求平均,将预测的score和对应的rank特征保存，可以用于后面的staking，这里还可以构造其他更多的特征
tst_user_recall_feature_rank_model['pred_score'] = sub_preds / k_fold
tst_user_recall_feature_rank_model['pred_score'] = tst_user_recall_feature_rank_model['pred_score'].transform(lambda x: norm_sim(x))
tst_user_recall_feature_rank_model.sort_values(by=['did', 'pred_score'])
tst_user_recall_feature_rank_model['pred_rank'] = tst_user_recall_feature_rank_model.groupby(['did'])['pred_score'].rank(ascending=False, method='first')

# 保存测试集交叉验证的新特征
tst_user_recall_feature_rank_model[['did', 'candi_vid', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_ranker_feats.csv', index=False)

In [83]:
# cols = ['did','candi_vid','pred_rank']
# mrr_score(tst_user_recall_feature_rank_model,cols,local_test_log)