In [2]:
import time
from tqdm import tqdm
import collections
import math
import pickle
from datetime import datetime
import numpy as np
import pandas as pd

In [3]:
def _load_pickle(path):
    """Load a single pickled object from ``path`` and close the file afterwards.

    NOTE(review): ``pickle.load`` can execute arbitrary code while unpickling;
    only point this at files produced by this project's own recall notebooks.
    """
    with open(path, "rb") as fh:
        return pickle.load(fh)


# Per-channel recall results. The evaluation and fusion cells treat each value as
# a mapping from user_id to a list of (item, score) pairs.
# The trailing "# 50" notes presumably record the number of candidates kept per
# user -- TODO confirm against the notebooks that wrote these files.
itemcf_recall_dict = _load_pickle("./recall/offline_itemcf_user_recall_items_dict.pkl")  # 50
ytb_recall_dict = _load_pickle("./recall/ytb_sort_recall_dict.pkl")  # 50
# Cold-start channel is currently disabled.
# cold_start_recall_dict = _load_pickle("./recall/cold_start_user_items_dict.pkl")  # ?
popular_recall_dict = _load_pickle("./recall/popular_sort_recall_dict.pkl")  # 50

In [95]:
def _accumulate_min_max_scaled(fused, recall_dict, weight):
    """Add ``weight`` times the per-user min-max scaled scores of one channel to ``fused``.

    Args:
        fused: dict ``{user_id: {item: accumulated_score}}``, updated in place.
        recall_dict: dict ``{user_id: [(item, score), ...]}`` for one recall channel.
        weight: multiplier applied to the scaled score (scaled range is [0, 1]).

    Every user in ``recall_dict`` gets an entry in ``fused``, even when the
    candidate list is empty.
    """
    for user_id, item_list in tqdm(recall_dict.items()):
        user_scores = fused.setdefault(user_id, {})
        if not item_list:
            # Nothing to scale; previously item_list[-1] raised IndexError here.
            continue

        # Take the true min/max instead of trusting the list to be sorted descending.
        raw_scores = [score for _, score in item_list]
        min_sim = min(raw_scores)
        max_sim = max(raw_scores)
        span = max_sim - min_sim

        for item, score in item_list:
            if span > 0:
                scaled = weight * (score - min_sim) / span
            else:
                # Single candidate or all scores equal: avoid dividing by zero and
                # give every candidate the full channel weight.
                scaled = weight * 1.0
            user_scores[item] = user_scores.get(item, 0) + scaled


# Fused recall: {user_id: {item: combined_score}}.
final_recall_items_dict = {}

# Item-CF channel: min-max scaled per user, weight 1.5.
_accumulate_min_max_scaled(final_recall_items_dict, itemcf_recall_dict, 1.5)

# The ytb channel is disabled (it was previously fused with weight 0). To enable:
# _accumulate_min_max_scaled(final_recall_items_dict, ytb_recall_dict, <weight>)

# Popularity channel: scores are added as-is (no rescaling), weight 1.5.
for user_id, item_list in tqdm(popular_recall_dict.items()):
    user_scores = final_recall_items_dict.setdefault(user_id, {})

    for item, score in item_list:
        user_scores[item] = user_scores.get(item, 0) + 1.5 * score

100%|██████████| 200000/200000 [00:08<00:00, 23087.26it/s]
100%|██████████| 200000/200000 [00:07<00:00, 27393.57it/s]


In [31]:
#pickle.dump(final_recall_items_dict, open('final_recall_items_dict.pkl','wb'))

In [4]:
# Compact column dtypes, used to reduce memory when reading the CSV files.
article_dtypes = dict(
    article_id="int32",
    category_id="int16",
    created_at_ts="int64",
    words_count="int16",
)

# Every click_* categorical column is stored as int8; ids are int32 and the
# timestamp is int64.
click_log_dtypes = dict(
    user_id="int32",
    click_article_id="int32",
    click_timestamp="int64",
    click_environment="int8",
    click_deviceGroup="int8",
    click_os="int8",
    click_country="int8",
    click_region="int8",
    click_referrer_type="int8",
)

def get_val_ans_click_df(data_save_path, name='offline_val_ans_df.csv'):
    """Read the validation-answer click log as a DataFrame.

    Args:
        data_save_path: directory prefix; it is concatenated directly with
            ``name``, so it must already end with a path separator.
        name: CSV file name inside ``data_save_path``.

    Returns:
        DataFrame parsed with the ``click_log_dtypes`` column types.
    """
    csv_path = data_save_path + name
    return pd.read_csv(csv_path, dtype=click_log_dtypes)

# Load the validation answers and index them as {user_id: click_article_id}.
# If a user_id occurs more than once, the last row wins.
data_save_path = "../data/"
val_ans_df = get_val_ans_click_df(data_save_path)
val_ans_dict = {
    user_id: article_id
    for user_id, article_id in zip(val_ans_df['user_id'], val_ans_df['click_article_id'])
}

In [75]:
def offline_recall_metrics(user_recall_items_dict, val_ans_dict, k=150):
    """Compute and print the hit rate@k of a score-dict style recall result.

    Args:
        user_recall_items_dict: dict ``{user_id: {item: score}}``.
        val_ans_dict: dict ``{user_id: answer_item}`` with one answer per user.
        k: number of highest-scoring candidates considered per user.

    Returns:
        Fraction of users in ``val_ans_dict`` whose answer is among their top-k
        candidates. Users without any candidates count as misses; an empty
        ``val_ans_dict`` yields 0.0.
    """
    if not val_ans_dict:
        hit = 0.0
        print("hit rate: " + str(hit))
        return hit

    recall_cnt = 0
    for user_id, val_ans in val_ans_dict.items():
        # A user missing from the recall result (or with no candidates) is a miss,
        # not a KeyError/IndexError.
        item_scores = user_recall_items_dict.get(user_id)
        if not item_scores:
            continue
        # One stable descending sort by score is enough; only membership matters afterwards.
        top_k = sorted(item_scores.items(), key=lambda pair: pair[1], reverse=True)[:k]
        if val_ans in {item for item, _ in top_k}:
            recall_cnt += 1

    hit = recall_cnt / len(val_ans_dict)
    print("hit rate: " + str(hit))
    return hit

In [94]:
# Hit rate@5 of the fused recall scores against the offline validation answers.
offline_recall_metrics(final_recall_items_dict,val_ans_dict,k=5)

hit rate: 0.3439


0.3439

In [6]:
def offline_recall_metrics_other(user_recall_items_dict, val_ans_dict, k=150):
    """Compute and print the hit rate@k of a ranked-list style recall result.

    Args:
        user_recall_items_dict: dict ``{user_id: [(item, score), ...]}``; each list
            is used in its stored order, so it should already be ranked best-first.
        val_ans_dict: dict ``{user_id: answer_item}`` with one answer per user.
        k: number of leading candidates considered per user.

    Returns:
        Fraction of users in ``val_ans_dict`` whose answer is among their first
        ``k`` candidates. Users without any candidates count as misses; an empty
        ``val_ans_dict`` yields 0.0.
    """
    if not val_ans_dict:
        hit = 0.0
        print("hit rate: " + str(hit))
        return hit

    recall_cnt = 0
    for user_id, val_ans in val_ans_dict.items():
        # Missing users / empty lists are misses rather than KeyError/IndexError.
        ranked_items = user_recall_items_dict.get(user_id)
        if not ranked_items:
            continue
        if val_ans in {entry[0] for entry in ranked_items[:k]}:
            recall_cnt += 1

    hit = recall_cnt / len(val_ans_dict)
    print("hit rate: " + str(hit))
    return hit

In [81]:
# Hit rate@5 of the item-CF recall lists on their own.
offline_recall_metrics_other(itemcf_recall_dict, val_ans_dict, k=5)

hit rate: 0.34645


0.34645

In [9]:
# Hit rate of the ytb recall lists. NOTE(review): this uses k=50 while the other
# evaluation cells use k=5, so the resulting number is not directly comparable.
offline_recall_metrics_other(ytb_recall_dict, val_ans_dict, k=50)

hit rate: 0.40765


0.40765

In [18]:
# Hit rate@5 of the popularity recall lists on their own.
offline_recall_metrics_other(popular_recall_dict, val_ans_dict, k=5)

hit rate: 0.207975


0.207975

In [25]:
# Flatten the nested {user_id: {item: score}} dict into one row per (user, item) pair.
user_item_score_list = []

for user_id, item_scores in tqdm(final_recall_items_dict.items()):
    # Only users with id >= 200000 are exported.
    # NOTE(review): presumably these are the test-set users -- confirm.
    if user_id >= 200000:
        user_item_score_list.extend(
            [user_id, item, score] for item, score in item_scores.items()
        )

recall_df = pd.DataFrame(user_item_score_list, columns=['user_id', 'click_article_id', 'pred_score'])



  0%|          | 0/200000 [00:00<?, ?it/s][A[A

100%|██████████| 200000/200000 [00:00<00:00, 1950512.47it/s][A[A


In [44]:
# Generate the submission file
def submit(recall_df, topk=5, model_name=None):
    """Write each user's top-``topk`` articles to ``./submit/<MM-DD>.csv``.

    Args:
        recall_df: DataFrame with columns ``user_id``, ``click_article_id`` and
            ``pred_score``. It is not modified.
        topk: number of articles exported per user; the output columns are
            ``user_id, article_1, ..., article_<topk>``.
        model_name: currently unused; kept for interface compatibility.

    Raises:
        AssertionError: if any user has fewer than ``topk`` ranked candidates
            (or ``recall_df`` is empty).
    """
    import os  # local import: only needed to make sure the output directory exists

    ranked = recall_df.sort_values(by=['user_id', 'pred_score'])
    # Rank 1 is the highest score within each user; ties are broken by row order.
    ranked['rank'] = ranked.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

    # Every user must have at least `topk` candidates.
    candidates_per_user = ranked.groupby('user_id')['rank'].max()
    assert candidates_per_user.min() >= topk, \
        "every user needs at least {} recalled articles".format(topk)

    top_rows = ranked.loc[ranked['rank'] <= topk].drop(columns=['pred_score'])
    # rank() returns floats; after the filter no NaN is left, so use integer
    # labels to get clean column keys after unstacking.
    top_rows['rank'] = top_rows['rank'].astype(int)
    wide = top_rows.set_index(['user_id', 'rank']).unstack(-1).reset_index()
    wide.columns = wide.columns.droplevel(0)

    # Name the columns according to the submission format, for any topk.
    column_names = {'': 'user_id'}
    for position in range(1, topk + 1):
        column_names[position] = 'article_' + str(position)
    wide = wide.rename(columns=column_names)

    save_dir = './submit/'
    os.makedirs(save_dir, exist_ok=True)
    save_name = save_dir + datetime.today().strftime('%m-%d') + '.csv'
    wide.to_csv(save_name, index=False, header=True)

In [45]:
# Write the top-5 submission CSV for the users in recall_df.
submit(recall_df, topk=5, model_name=None)

In [48]:
# Spot-check the recalled candidates and scores for a single user (id 200001).
recall_df[recall_df["user_id"]==200001]

Unnamed: 0,user_id,click_article_id,pred_score
7088105,200001,324823,3
7088106,200001,166581,3
7088107,200001,224658,3
7088108,200001,285808,2
7088109,200001,285343,3
...,...,...,...
7088223,200001,187067,1
7088224,200001,70033,1
7088225,200001,236613,1
7088226,200001,50864,1
