In [1]:
import time
from tqdm import tqdm
import collections
import math
import pickle
from datetime import datetime
import numpy as np
import pandas as pd

from loaddata_z import loadData

In [2]:
# Directory handed to the project-local loader below (relative to this notebook).
data_path = "../data/"
# Loader instance from loaddata_z; later cells call get_all_data() and get_item_info() on it.
_loadData = loadData(data_path)

In [3]:
# Click log used by every recall step below; the code in this notebook reads its
# user_id, click_article_id and click_timestamp columns.
# `offline=False` is forwarded to the project loader — presumably it selects the
# full (online/submission) data split; TODO confirm against loaddata_z.
sample_df = _loadData.get_all_data(offline=False)
# Article metadata; later cells read click_article_id, category_id, words_count
# and created_at_ts from it.
item_info_df = _loadData.get_item_info()

### itemCF

In [4]:
# Build each user's click sequence ordered by click time:
#   {user1: [(item1, time1), (item2, time2), ...], ...}
def get_user_item_time(click_df):
    """Return every user's clicked articles in chronological order.

    :param click_df: click log with ``user_id``, ``click_article_id`` and
        ``click_timestamp`` columns. The frame itself is not modified.
    :return: dict mapping user_id -> list of ``(click_article_id, click_timestamp)``
        tuples sorted by ascending ``click_timestamp``.
    """
    click_df = click_df.sort_values('click_timestamp')

    def make_item_time_pair(df):
        return list(zip(df['click_article_id'], df['click_timestamp']))

    # The columns must be selected with a *list*: indexing a GroupBy with a bare
    # tuple of names (``gb['a', 'b']``) was deprecated in pandas 1.0 and raises
    # in pandas 2.x.
    item_time_by_user = (
        click_df.groupby('user_id')[['click_article_id', 'click_timestamp']]
        .apply(make_item_time_pair)
    )

    # The groupby result is indexed by user_id, so it maps straight to a dict
    # without the reset_index()/rename(columns={0: ...}) round trip.
    return dict(zip(item_time_by_user.index, item_time_by_user))

# Build per-article attribute lookups (dicts keyed by article id) so the recall
# and cold-start stages can use them directly.
def get_item_info_dict(item_info_df):
    """Turn the article metadata frame into four lookup dictionaries.

    The input frame is only read, never modified, so calling this function
    repeatedly always yields the same result (the previous version overwrote
    ``item_info_df['created_at_ts']`` with the scaled values, which made a
    second call return scaled values as the "raw" timestamps).

    :param item_info_df: frame with ``click_article_id``, ``category_id``,
        ``words_count`` and ``created_at_ts`` columns.
    :return: tuple ``(item_type_dict, item_words_dict, item_created_time_dict,
        item_created_a_time_dict)`` where, keyed by article id:
        * ``item_type_dict`` holds ``category_id``,
        * ``item_words_dict`` holds ``words_count``,
        * ``item_created_time_dict`` holds ``created_at_ts`` min-max scaled to [0, 1],
        * ``item_created_a_time_dict`` holds the raw ``created_at_ts``.
    """
    article_ids = item_info_df['click_article_id']
    created_raw = item_info_df['created_at_ts']

    item_created_a_time_dict = dict(zip(article_ids, created_raw))

    # Min-max scale into a *new* Series instead of writing back into the frame.
    ts_min, ts_max = created_raw.min(), created_raw.max()
    ts_span = ts_max - ts_min
    if ts_span == 0:
        # All articles share one timestamp: avoid 0/0 -> NaN, every distance is 0.
        created_scaled = created_raw * 0.0
    else:
        created_scaled = (created_raw - ts_min) / ts_span

    item_type_dict = dict(zip(article_ids, item_info_df['category_id']))
    item_words_dict = dict(zip(article_ids, item_info_df['words_count']))
    item_created_time_dict = dict(zip(article_ids, created_scaled))

    return item_type_dict, item_words_dict, item_created_time_dict, item_created_a_time_dict

def itemcf_sim(sample_df, item_created_time_dict):
    """
    Compute the article-to-article similarity matrix (item-based collaborative filtering).

    Two articles co-clicked by the same user contribute to each other's similarity;
    each contribution is weighted by click order/distance in the user's sequence,
    by the click-time gap and by the creation-time gap, and is damped for users
    with long histories. The accumulated weights are finally divided by a
    popularity term built from how often each article was clicked.

    Side effect: the resulting matrix is pickled to 'itemcf_i2i_sim.pkl' in the
    current working directory.

    :param sample_df: click log (user_id, click_article_id, click_timestamp columns)
    :param item_created_time_dict: dict article_id -> creation time used for the
        creation-time weight (every clicked article must be present)
    :return: dict of dicts, ``{item_i: {item_j: similarity}}``
    """
    user_item_time_dict = get_user_item_time(sample_df)

    # Accumulate raw co-occurrence weights
    i2i_sim = {}
    item_cnt = collections.defaultdict(int)
    for user, item_time_list in tqdm(user_item_time_dict.items()):
        # Damping for active users; invariant for the whole sequence, so computed once.
        hist_len_penalty = math.log(len(item_time_list) + 1)
        for loc1, (i, i_click_time) in enumerate(item_time_list):
            item_cnt[i] += 1
            sim_row = i2i_sim.setdefault(i, {})
            for loc2, (j, j_click_time) in enumerate(item_time_list):
                if i == j:
                    continue

                # Forward-order clicks (j after i) count fully, backward ones are discounted
                loc_alpha = 1.0 if loc2 > loc1 else 0.7
                # Position weight: decays with the distance inside the sequence (tunable)
                loc_weight = loc_alpha * (0.9 ** (np.abs(loc2 - loc1) - 1))
                # Click-time weight (tunable)
                click_time_weight = np.exp(0.7 ** np.abs(i_click_time - j_click_time))
                # Creation-time weight of the two articles (tunable)
                created_time_weight = np.exp(0.8 ** np.abs(item_created_time_dict[i] - item_created_time_dict[j]))

                # Combine all factors into the pair's similarity contribution
                sim_row[j] = sim_row.get(j, 0) + loc_weight * click_time_weight * created_time_weight / hist_len_penalty

    # Popularity penalty so that very hot articles do not dominate.
    # A fresh nested dict is built: the previous shallow ``i2i_sim.copy()`` shared
    # the inner dicts, so the "copy" silently overwrote the raw weights as well.
    popular_weight = 0.5
    i2i_sim_ = {}
    for i, related_items in i2i_sim.items():
        normalized_row = {}
        for j, wij in related_items.items():
            tmpMax, tmpMin = max(item_cnt[i], item_cnt[j]), min(item_cnt[i], item_cnt[j])
            normalized_row[j] = wij / ((tmpMax ** popular_weight) * (tmpMin ** (1 - popular_weight)))
        i2i_sim_[i] = normalized_row

    # Persist the matrix; the context manager guarantees the handle is flushed and closed.
    with open('itemcf_i2i_sim.pkl', 'wb') as sim_file:
        pickle.dump(i2i_sim_, sim_file)

    return i2i_sim_


def item_based_recommend(user_id, user_item_time_dict, i2i_sim, maxCnt, recall_item_num, item_topk_click, item_created_time_dict, emb_i2i_sim, item_a_created_time_dict):
    """
        Item-similarity based recall for a single user.

        For every article in the user's history, the most similar articles are
        walked in descending similarity; unseen candidates inside the allowed
        creation-time window are scored and accumulated.

        :param user_id: user id
        :param user_item_time_dict: dict, click sequences ordered by click time
            {user1: [(item1, time1), (item2, time2)..]...}
        :param i2i_sim: dict of dicts, article similarity matrix {item_i: {item_j: sim}}
        :param maxCnt: int, maximum number of accepted candidates per history article
        :param recall_item_num: int, number of articles finally returned
        :param item_topk_click: most-clicked articles; currently unused because the
            popularity padding is disabled, kept for interface compatibility
        :param item_created_time_dict: dict article_id -> creation time used in the
            creation-time-gap weight
        :param emb_i2i_sim: dict of dicts with content (embedding) similarities; pairs
            may be missing
        :param item_a_created_time_dict: dict article_id -> creation timestamp compared
            against the user's last click time (same unit as the click timestamps —
            presumably milliseconds, TODO confirm)

        :return: recalled articles [(item1, score1), (item2, score2)...], best first
    """
    # Articles the user already interacted with
    user_hist_items = user_item_time_dict[user_id]
    if not user_hist_items:
        return []
    user_hist_items_ = {item_id for item_id, _ in user_hist_items}
    hist_len = len(user_hist_items)

    # Creation-time window relative to the user's last click; invariant for the
    # whole call, so computed once instead of once per candidate.
    last_click_time = user_hist_items[-1][1]
    newest_allowed = last_click_time + 1 * (10 ** 5)
    oldest_allowed = last_click_time - 5 * (10 ** 8)

    item_rank = {}

    for loc, (i, click_time) in enumerate(user_hist_items):
        # Position weight: the closer to the end of the history, the larger
        loc_weight = (0.9 ** (hist_len - loc))

        cnt = 0
        # A history article without a similarity row simply yields no candidates
        # (previously this raised KeyError).
        candidates = sorted(i2i_sim.get(i, {}).items(), key=lambda x: x[1], reverse=True)
        for j, wij in candidates:
            if j in user_hist_items_:
                # The user has already seen article j
                continue

            # Drop candidates created too long after / before the user's last click
            created_at = item_a_created_time_dict[j]
            if created_at > newest_allowed or created_at < oldest_allowed:
                continue

            if cnt == maxCnt:
                break
            cnt += 1

            # Creation-time-gap weight
            created_time_weight = np.exp(0.8 ** np.abs(item_created_time_dict[i] - item_created_time_dict[j]))

            # Content weight: add the embedding similarity in both directions when available
            content_weight = 1.0
            forward_sim = emb_i2i_sim.get(i, {}).get(j, None)
            if forward_sim is not None:
                content_weight += forward_sim
            backward_sim = emb_i2i_sim.get(j, {}).get(i, None)
            if backward_sim is not None:
                content_weight += backward_sim
            content_weight = np.exp(1.08 ** content_weight)

            item_rank.setdefault(j, 0)
            item_rank[j] += content_weight * created_time_weight * loc_weight * wij

    # NOTE: padding with item_topk_click when fewer than recall_item_num articles
    # were found is intentionally disabled (it used to be parked in a string
    # literal); callers receive only genuinely similar articles.
    item_rank = sorted(item_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num]

    return item_rank

# Most frequently clicked articles in the given click log
def get_item_topk_click(click_df, k):
    """Return the ids of the k most-clicked articles, most clicked first.

    :param click_df: click log with a ``click_article_id`` column
    :param k: number of article ids to keep
    :return: pandas Index holding the top-k article ids
    """
    click_counts = click_df['click_article_id'].value_counts()
    return click_counts.index[:k]

In [5]:
# Article lookups keyed by click_article_id: category id, word count,
# min-max scaled creation time, and raw creation time (in that order).
item_type_dict, item_words_dict, item_created_time_dict, item_created_a_time_dict = get_item_info_dict(item_info_df)

In [6]:
# Inspect the raw created_at_ts lookup (article id -> timestamp).
item_created_a_time_dict 

{0: 1513144419000,
 1: 1405341936000,
 2: 1408667706000,
 3: 1408468313000,
 4: 1407071171000,
 5: 1407413929000,
 6: 1409896802000,
 7: 1412559620000,
 8: 1414351550000,
 9: 1412526792000,
 10: 1412517036000,
 11: 1412557141000,
 12: 1412558276000,
 13: 1412560598000,
 14: 1412546405000,
 15: 1412445832000,
 16: 1414418564000,
 17: 1414336074000,
 18: 1414361017000,
 19: 1477602915000,
 20: 1475848477000,
 21: 1475452367000,
 22: 1475450244000,
 23: 1475452376000,
 24: 1475443950000,
 25: 1475442203000,
 26: 1475449619000,
 27: 1478811408000,
 28: 1471976442000,
 29: 1405164375000,
 30: 1408359430000,
 31: 1418493805000,
 32: 1424231781000,
 33: 1423743072000,
 34: 1437910575000,
 35: 1443272076000,
 36: 1483243750000,
 37: 1487984363000,
 38: 1501174042000,
 39: 1514210186000,
 40: 1363197043000,
 41: 1367147115000,
 42: 1367078306000,
 43: 1366902489000,
 44: 1366622705000,
 45: 1366555193000,
 46: 1366970968000,
 47: 1367153075000,
 48: 1366734592000,
 49: 1366973840000,
 50: 13667

In [7]:
# Per-user recall results, filled by the recall loop below.
user_recall_items_dict = collections.defaultdict(dict)

# Content (embedding) similarity matrix.
# NOTE(security): pickle.load can execute arbitrary code; only load pickles
# produced by this project.
with open('emb_i2i_sim1.pkl', 'rb') as emb_sim_file:
    emb_i2i_sim = pickle.load(emb_sim_file)

# The recall similarity matrix was loaded from the very same file, so the
# already-deserialized object is reused instead of reading and holding a second
# copy in memory.
# NOTE(review): confirm that 'emb_i2i_sim1.pkl' (rather than the item-CF matrix
# 'itemcf_i2i_sim.pkl') is really the intended recall matrix here.
i2i_sim = emb_i2i_sim

sim_item_topk = 15000
recall_item_num = 100  # recall generously so the later rule-based filtering has enough candidates

# Click sequences per user (computed once; it used to be computed twice in this cell).
user_item_time_dict = get_user_item_time(sample_df)

item_topk_click = get_item_topk_click(sample_df, k=50)

In [8]:
# Sanity check: number of neighbours stored for article 1 in the similarity matrix.
len(i2i_sim[1].items())

100

In [9]:
# Run the recall for every user in the click log.
# The literal 150 is passed as maxCnt (cap on accepted candidates per history
# article); the sim_item_topk variable defined earlier is not used by this call.
for user in tqdm(sample_df['user_id'].unique()):
    user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, i2i_sim, 150, 
                                                        recall_item_num, item_topk_click,item_created_time_dict, emb_i2i_sim, item_created_a_time_dict)
#pickle.dump(user_recall_items_dict, open('cold_start_items_raw_dict.pkl', 'wb'))

100%|██████████| 250000/250000 [02:25<00:00, 1719.58it/s]


In [10]:
# 基于规则进行文章过滤
# 保留文章主题与用户历史浏览主题相似的文章
# 保留文章字数与用户历史浏览文章字数相差不大的文章
# 保留最后一次点击当天的文章
# 按照相似度返回最终的结果

def get_click_article_ids_set(all_click_df):
    """Return the set of all article ids that occur in the click log."""
    clicked_ids = all_click_df['click_article_id'].values
    return set(clicked_ids)

def cold_start_items(user_recall_items_dict, user_hist_item_typs_dict, user_hist_item_words_dict,
                     user_last_item_created_time_dict, item_type_dict, item_words_dict,
                     item_created_time_dict, click_article_ids_set, recall_item_num):
    """
        Filter recalled candidates down to cold-start articles for each user.

        A candidate is kept only if all of the following hold:
          * its category is among the categories the user has clicked before,
          * it never appears in the click log (``click_article_ids_set``),
          * its word count is within 200 of the user's mean historical word count,
          * its creation date is within 90 days of the creation date of the
            user's last clicked article.

        Both creation-time mappings are passed to ``datetime.fromtimestamp``, so they
        need to hold POSIX timestamps in seconds for the 90-day rule to be meaningful.

        Side effect: the result is pickled to 'cold_start_user_items_dict.pkl' in the
        current working directory.

        :param user_recall_items_dict: dict, recalled candidates {user1: [(item1, score1), ...], ...}
        :param user_hist_item_typs_dict: dict, user -> set of clicked article categories
        :param user_hist_item_words_dict: dict, user -> mean word count of clicked articles
        :param user_last_item_created_time_dict: dict, user -> creation time of the last clicked article
        :param item_type_dict: dict, article -> category
        :param item_words_dict: dict, article -> word count
        :param item_created_time_dict: dict, article -> creation time
        :param click_article_ids_set: set of articles that occur in the click log
        :param recall_item_num: maximum number of cold-start articles kept per user
        :return: dict {user1: [(item1, score1), (item2, score2)..]...}, best score first
    """

    cold_start_user_items_dict = {}
    for user, item_list in tqdm(user_recall_items_dict.items()):
        kept_items = cold_start_user_items_dict.setdefault(user, [])
        if not item_list:
            # Nothing recalled: skip the history lookups (as before, users without
            # candidates never need to be present in the history dicts).
            continue

        # History information depends only on the user, so fetch it once per user
        # instead of once per candidate.
        hist_item_type_set = user_hist_item_typs_dict[user]
        hist_mean_words = user_hist_item_words_dict[user]
        hist_last_item_created_time = datetime.fromtimestamp(user_last_item_created_time_dict[user])

        for item, score in item_list:
            # Information about the current candidate
            curr_item_type = item_type_dict[item]
            curr_item_words = item_words_dict[item]
            curr_item_created_time = datetime.fromtimestamp(item_created_time_dict[item])

            # Reject candidates that violate any of the rules listed in the docstring
            if curr_item_type not in hist_item_type_set or \
                item in click_article_ids_set or \
                abs(curr_item_words - hist_mean_words) > 200 or \
                abs((curr_item_created_time - hist_last_item_created_time).days) > 90:
                continue

            kept_items.append((item, score))      # {user1: [(item1, score1), (item2, score2)..]...}

    # Cap the number of cold-start articles per user
    cold_start_user_items_dict = {k: sorted(v, key=lambda x: x[1], reverse=True)[:recall_item_num]
                                  for k, v in cold_start_user_items_dict.items()}

    # The context manager guarantees the pickle is flushed and the handle closed.
    with open('cold_start_user_items_dict.pkl', 'wb') as out_file:
        pickle.dump(cold_start_user_items_dict, out_file)

    return cold_start_user_items_dict


def get_user_hist_item_info_dict(all_click):
    """Summarise each user's click history into four lookup dictionaries.

    :param all_click: click log joined with article metadata; needs the columns
        ``user_id``, ``click_article_id``, ``click_timestamp``, ``category_id``,
        ``words_count`` and ``created_at_ts``.
    :return: tuple of dicts keyed by user_id:
        * set of clicked category ids,
        * set of clicked article ids,
        * mean word count of the clicked articles,
        * creation time of the user's last clicked article, min-max scaled
          across users to [0, 1].

    NOTE(review): the last dict is scaled over the users' last articles only, which
    is a different scale from a min-max over all articles — confirm that consumers
    comparing it with per-article creation times expect this.
    """
    def _as_dict(per_user_series):
        # The groupby results are indexed by user_id
        return dict(zip(per_user_series.index, per_user_series))

    clicks_by_user = all_click.groupby('user_id')

    # user_id -> set of clicked category ids
    user_hist_item_typs_dict = _as_dict(clicks_by_user['category_id'].agg(set))

    # user_id -> set of clicked article ids
    user_hist_item_ids_dict = _as_dict(clicks_by_user['click_article_id'].agg(set))

    # user_id -> mean word count of the clicked articles
    user_hist_item_words_dict = _as_dict(clicks_by_user['words_count'].agg('mean'))

    # user_id -> creation time of the article clicked last (by click_timestamp)
    chronological_clicks = all_click.sort_values('click_timestamp')
    last_created = chronological_clicks.groupby('user_id')['created_at_ts'].apply(lambda ts: ts.iloc[-1])

    # Min-max scale across users
    lowest, highest = np.min(last_created), np.max(last_created)
    last_created_scaled = (last_created - lowest) / (highest - lowest)
    user_last_item_created_time_dict = _as_dict(last_created_scaled)

    return user_hist_item_typs_dict, user_hist_item_ids_dict, user_hist_item_words_dict, user_last_item_created_time_dict

In [11]:
# Join article metadata onto the click log so per-user history statistics can be computed.
all_click_df_ = sample_df.copy()
all_click_df_ = all_click_df_.merge(item_info_df, how='left', on='click_article_id')
user_hist_item_typs_dict, user_hist_item_ids_dict, user_hist_item_words_dict, user_last_item_created_time_dict = get_user_hist_item_info_dict(all_click_df_)
click_article_ids_set = get_click_article_ids_set(sample_df)
# Note:
# many rules are applied when filtering cold-start articles, so the recall stage should
# return as many candidates as possible; otherwise most of them are filtered away here.
# NOTE(review): item_created_time_dict holds min-max scaled creation times and
# user_last_item_created_time_dict is min-max scaled as well, while cold_start_items
# feeds both into datetime.fromtimestamp; the 90-day creation-time rule therefore
# appears to have no effect — confirm whether raw timestamps in seconds were intended.
cold_start_user_items_dict = cold_start_items(user_recall_items_dict, user_hist_item_typs_dict, user_hist_item_words_dict, \
                                              user_last_item_created_time_dict, item_type_dict, item_words_dict, \
                                              item_created_time_dict, click_article_ids_set, recall_item_num)

100%|██████████| 250000/250000 [00:07<00:00, 34492.45it/s]


In [12]:
# Inspect the filtered cold-start candidates: {user_id: [(article_id, score), ...]}.
cold_start_user_items_dict

{199999: [],
 199998: [(1833, 2.5469202159403515), (172603, 0.8152072954587558)],
 199997: [(224878, 6.341208547257652)],
 199996: [(343128, 2.971895430976518)],
 199995: [],
 199994: [(16720, 5.776935660673582),
  (277110, 5.197281958375263),
  (277320, 4.09210778067405),
  (182211, 2.236865991883298),
  (184204, 2.205328731484932),
  (183107, 2.051181940307662),
  (181456, 2.021806863348979),
  (58940, 1.8733291472446163),
  (292941, 1.6497661019016368),
  (273127, 0.7874681564722706)],
 199993: [],
 199992: [(132785, 1.2985657045973946)],
 199991: [],
 199990: [(83737, 4.324934090073864)],
 199989: [],
 199988: [],
 199987: [],
 199986: [(301779, 5.167329841310373),
  (198561, 4.102492844795607),
  (199157, 4.039163226768637),
  (132785, 2.7149783002929655)],
 199985: [],
 199984: [],
 199983: [],
 199982: [(132785, 2.7149783002929655)],
 199981: [(288433, 1.1459302648168161), (132785, 0.2673631630188433)],
 199980: [],
 199979: [],
 199978: [(283039, 4.993087386553273), (299403, 4.