In [4]:
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
import gc, os
import time
from datetime import datetime
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

In [6]:
trn_user_item_feats_df = pd.read_csv('./FE/final_trn_user_item_feats_df.csv')
trn_user_item_feats_df['click_article_id'] = trn_user_item_feats_df['click_article_id'].astype(int)
tst_user_item_feats_df = pd.read_csv('./FE/final_tst_user_item_feats_df.csv')
tst_user_item_feats_df['click_article_id'] = tst_user_item_feats_df['click_article_id'].astype(int)

del tst_user_item_feats_df['label']

In [8]:
del trn_user_item_feats_df['Unnamed: 0']
del tst_user_item_feats_df['Unnamed: 0']

In [10]:

trn_user_item_feats_df_rank_model = trn_user_item_feats_df.copy()
    
tst_user_item_feats_df_rank_model = tst_user_item_feats_df.copy()

In [11]:
# feature columns
lgb_cols = ['sim0', 'time_diff0', 'word_diff0','sim_max', 'sim_min', 'sim_sum', 
            'sim_mean', 'score','click_size', 'time_diff_mean', 'active_level',
            'click_environment','click_deviceGroup', 'click_os', 'click_country', 
            'click_region','click_referrer_type',
            'words_hbo', 'category_id', 'created_at_ts','words_count']

In [12]:
trn_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)
g_train = trn_user_item_feats_df_rank_model.groupby(['user_id'], as_index=False).count()["label"].values

In [15]:
lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16)  

In [16]:
lgb_ranker.fit(trn_user_item_feats_df[lgb_cols], trn_user_item_feats_df['label'], group=g_train)

LGBMRanker(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
           importance_type='split', learning_rate=0.01, max_depth=-1,
           min_child_samples=20, min_child_weight=50, min_split_gain=0.0,
           n_estimators=100, n_jobs=16, num_leaves=31, objective=None,
           random_state=2018, reg_alpha=0.0, reg_lambda=1, silent=True,
           subsample=0.7, subsample_for_bin=200000, subsample_freq=1)

In [17]:
tst_user_item_feats_df['pred_score'] = lgb_ranker.predict(tst_user_item_feats_df[lgb_cols], num_iteration=lgb_ranker.best_iteration_)

In [19]:
rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']]

In [25]:
def submit(recall_df, topk=5, model_name=None):
    recall_df = recall_df.sort_values(by=['user_id', 'pred_score'])
    recall_df['rank'] = recall_df.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
    
    # 判断是不是每个用户都有5篇文章及以上
    tmp = recall_df.groupby('user_id').apply(lambda x: x['rank'].max())
    assert tmp.min() >= topk
    
    del recall_df['pred_score']
    submit = recall_df[recall_df['rank'] <= topk].set_index(['user_id', 'rank']).unstack(-1).reset_index()
    
    submit.columns = [int(col) if isinstance(col, int) else col for col in submit.columns.droplevel(0)]
    # 按照提交格式定义列名
    submit = submit.rename(columns={'': 'user_id', 1: 'article_1', 2: 'article_2', 
                                                  3: 'article_3', 4: 'article_4', 5: 'article_5'})
    
    save_name = model_name + '_' + datetime.today().strftime('%m-%d') + '.csv'
    submit.to_csv(save_name, index=False, header=True)

In [28]:
tst_user_item_feats_df[tst_user_item_feats_df["user_id"]==200001]

Unnamed: 0,user_id,click_article_id,sim0,time_diff0,word_diff0,sim_max,sim_min,sim_sum,sim_mean,score,...,click_os,click_country,click_region,click_referrer_type,words_hbo,category_id,created_at_ts,words_count,is_cat_hab,pred_score
10233,200001,336221,10.237462,638233000,18,10.237462,10.237462,10.237462,10.237462,1,...,2,1,18,7,176.0,437,1507613161000,158,0,-0.611670
93366,200001,272143,11.962455,13058000,8,11.962455,11.962455,11.962455,11.962455,3,...,2,1,18,7,176.0,399,1506961870000,184,0,-1.019778
116844,200001,234698,10.895900,643669000,7,10.895900,10.895900,10.895900,10.895900,1,...,2,1,18,7,176.0,375,1507618597000,183,0,-0.617775
327387,200001,235616,10.165027,661541000,44,10.165027,10.165027,10.165027,10.165027,1,...,2,1,18,7,176.0,375,1507636469000,220,0,-0.627089
369092,200001,300473,5.471592,14013000,6,5.471592,5.471592,5.471592,5.471592,1,...,2,1,18,7,176.0,428,1506960915000,182,0,0.064181
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2423470,200001,187067,5.292248,25713000,54,5.292248,5.292248,5.292248,5.292248,1,...,2,1,18,7,176.0,302,1506949215000,122,0,0.074599
2437100,200001,70033,5.824065,43667000,41,5.824065,5.824065,5.824065,5.824065,1,...,2,1,18,7,176.0,136,1507018595000,217,0,-0.044256
2438515,200001,236613,4.835308,78904000,2,4.835308,4.835308,4.835308,4.835308,1,...,2,1,18,7,176.0,375,1506896024000,174,0,-0.178915
2452083,200001,50864,5.461739,48726000,26,5.461739,5.461739,5.461739,5.461739,1,...,2,1,18,7,176.0,99,1506926202000,202,0,-0.043501


In [26]:
submit(rank_results, topk=5, model_name="lgb_ranker")

In [29]:
# 模型及参数的定义
lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=500, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10)  

In [30]:
lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'])

[LightGBM] [Info] Number of positive: 154872, number of negative: 224410
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.048712
[LightGBM] [Debug] init for col-wise cost 0.000446 seconds, init for row-wise cost 0.013703 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 3397
[LightGBM] [Info] Number of data points in the train set: 379282, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.408329 -> initscore=-0.370876
[LightGBM] [Info] Start training from score -0.370876
[LightGBM] [Debug] Re-bagging, using 265663 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 8
[LightGBM] [Debug] Re-bagging, using 265164 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 7
[LightGBM] [Debug] Re-bagging, using 265552 data to train
[LightGBM

[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 265744 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 8
[LightGBM] [Debug] Re-bagging, using 265228 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 8
[LightGBM] [Debug] Re-bagging, using 265338 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 9
[LightGBM] [Debug] Re-bagging, using 265268 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 8
[LightGBM] [Debug] Re-bagging, using 265320 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[LightGBM] [Debug] Re-bagging, using 265177 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 265208 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 8
[LightGBM] [Debug] Re-bagging, using 26

[LightGBM] [Debug] Re-bagging, using 265648 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[LightGBM] [Debug] Re-bagging, using 265534 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 8
[LightGBM] [Debug] Re-bagging, using 265201 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 9
[LightGBM] [Debug] Re-bagging, using 265351 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 8
[LightGBM] [Debug] Re-bagging, using 265191 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 9
[LightGBM] [Debug] Re-bagging, using 265600 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 9
[LightGBM] [Debug] Re-bagging, using 265480 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 8
[LightGBM] [Debug] Re-bagging, using 265418 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 a

[LightGBM] [Debug] Re-bagging, using 265752 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[LightGBM] [Debug] Re-bagging, using 265098 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 265519 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 9
[LightGBM] [Debug] Re-bagging, using 265329 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[LightGBM] [Debug] Re-bagging, using 265639 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[LightGBM] [Debug] Re-bagging, using 265516 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 265774 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 8
[LightGBM] [Debug] Re-bagging, using 265124 data to train
[LightGBM] [Debug] Trained a tree with leaves = 

[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[LightGBM] [Debug] Re-bagging, using 265194 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 13
[LightGBM] [Debug] Re-bagging, using 265008 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 265282 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 265469 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 13
[LightGBM] [Debug] Re-bagging, using 265690 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 9
[LightGBM] [Debug] Re-bagging, using 265253 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 265669 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[LightGBM] [Debug] Re-bagging, usin

[LightGBM] [Debug] Re-bagging, using 265398 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[LightGBM] [Debug] Re-bagging, using 265739 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 9
[LightGBM] [Debug] Re-bagging, using 265734 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[LightGBM] [Debug] Re-bagging, using 265049 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 265609 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 265306 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[LightGBM] [Debug] Re-bagging, using 265658 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[LightGBM] [Debug] Re-bagging, using 265456 data to train
[LightGBM] [Debug] Trained a tree with leaves =

[LightGBM] [Debug] Re-bagging, using 265500 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[LightGBM] [Debug] Re-bagging, using 265587 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 265386 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 265544 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 8
[LightGBM] [Debug] Re-bagging, using 265523 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 265163 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[LightGBM] [Debug] Re-bagging, using 265361 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 265179 data to train
[LightGBM] [Debug] Trained a tree with leaves =

[LightGBM] [Debug] Re-bagging, using 265013 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[LightGBM] [Debug] Re-bagging, using 265077 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 9
[LightGBM] [Debug] Re-bagging, using 265621 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[LightGBM] [Debug] Re-bagging, using 265336 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 13
[LightGBM] [Debug] Re-bagging, using 265433 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[LightGBM] [Debug] Re-bagging, using 265677 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 9
[LightGBM] [Debug] Re-bagging, using 266025 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 13
[LightGBM] [Debug] Re-bagging, using 265972 data to train
[LightGBM] [Debug] Trained a tree with leaves = 

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
               importance_type='split', learning_rate=0.01, max_depth=-1,
               min_child_samples=20, min_child_weight=50, min_split_gain=0.0,
               n_estimators=500, n_jobs=16, num_leaves=31, objective=None,
               random_state=2018, reg_alpha=0.0, reg_lambda=1, silent=True,
               subsample=0.7, subsample_for_bin=200000, subsample_freq=1,
               verbose=10)

In [31]:
tst_user_item_feats_df['pred_score'] = lgb_Classfication.predict_proba(tst_user_item_feats_df[lgb_cols])[:,1]

In [33]:
tst_user_item_feats_df[tst_user_item_feats_df["user_id"]==200001]

Unnamed: 0,user_id,click_article_id,sim0,time_diff0,word_diff0,sim_max,sim_min,sim_sum,sim_mean,score,...,click_os,click_country,click_region,click_referrer_type,words_hbo,category_id,created_at_ts,words_count,is_cat_hab,pred_score
10233,200001,336221,10.237462,638233000,18,10.237462,10.237462,10.237462,10.237462,1,...,2,1,18,7,176.0,437,1507613161000,158,0,0.132551
93366,200001,272143,11.962455,13058000,8,11.962455,11.962455,11.962455,11.962455,3,...,2,1,18,7,176.0,399,1506961870000,184,0,0.016482
116844,200001,234698,10.895900,643669000,7,10.895900,10.895900,10.895900,10.895900,1,...,2,1,18,7,176.0,375,1507618597000,183,0,0.134010
327387,200001,235616,10.165027,661541000,44,10.165027,10.165027,10.165027,10.165027,1,...,2,1,18,7,176.0,375,1507636469000,220,0,0.128850
369092,200001,300473,5.471592,14013000,6,5.471592,5.471592,5.471592,5.471592,1,...,2,1,18,7,176.0,428,1506960915000,182,0,0.245270
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2423470,200001,187067,5.292248,25713000,54,5.292248,5.292248,5.292248,5.292248,1,...,2,1,18,7,176.0,302,1506949215000,122,0,0.247401
2437100,200001,70033,5.824065,43667000,41,5.824065,5.824065,5.824065,5.824065,1,...,2,1,18,7,176.0,136,1507018595000,217,0,0.180622
2438515,200001,236613,4.835308,78904000,2,4.835308,4.835308,4.835308,4.835308,1,...,2,1,18,7,176.0,375,1506896024000,174,0,0.124243
2452083,200001,50864,5.461739,48726000,26,5.461739,5.461739,5.461739,5.461739,1,...,2,1,18,7,176.0,99,1506926202000,202,0,0.196978


In [34]:
rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_cls')