In [46]:
import time
from tqdm import tqdm
import collections
import math
import pickle
from datetime import datetime
import numpy as np
import pandas as pd

In [47]:
from loaddata_z import loadData

### 加载数据模块
确定数据类型减少内存占用

In [48]:
data_path = "../data/"
_loadData = loadData(data_path)

In [49]:
sample_df = _loadData.get_sample_data()

In [50]:
sample_df

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
32,199990,272143,1507029564434,4,1,17,1,7,5
33,199990,198659,1507029670707,4,1,17,1,7,1
34,199990,84629,1507029700707,4,1,17,1,7,1
52,199984,70594,1507029553199,4,3,2,1,5,7
53,199984,272143,1507029583199,4,3,2,1,5,7
...,...,...,...,...,...,...,...,...,...
1112454,187635,177653,1508210907604,4,3,2,1,25,1
1112547,69654,70986,1508211726802,4,3,2,1,25,1
1112548,69654,50644,1508211756802,4,3,2,1,25,1
1112573,5,211442,1508211243884,4,4,2,1,25,2


In [51]:
item_info_df = _loadData.get_item_info()

In [52]:
# item_emb_df = _loadData.get_item_emb()

In [53]:
import time
for t in sample_df[sample_df["user_id"] == 1000]["click_timestamp"]:
    print(time.gmtime(t // 1000))

### ItemCF

In [56]:
# 根据点击时间获取用户的点击文章序列   {user1: [(item1, time1), (item2, time2)..]...}
def get_user_item_time(click_df):
    
    click_df = click_df.sort_values('click_timestamp')
    
    def make_item_time_pair(df):
        return list(zip(df['click_article_id'], df['click_timestamp']))
    
    user_item_time_df = click_df.groupby('user_id')['click_article_id', 'click_timestamp'].apply(lambda x: make_item_time_pair(x))\
                                                            .reset_index().rename(columns={0: 'item_time_list'})
    user_item_time_dict = dict(zip(user_item_time_df['user_id'], user_item_time_df['item_time_list']))
    
    return user_item_time_dict

In [57]:
# 获取文章id对应的基本属性，保存成字典的形式，方便后面召回阶段，冷启动阶段直接使用
def get_item_info_dict(item_info_df):
    max_min_scaler = lambda x : (x-np.min(x))/(np.max(x)-np.min(x))
    item_info_df['created_at_ts'] = item_info_df[['created_at_ts']].apply(max_min_scaler)
    
    item_type_dict = dict(zip(item_info_df['click_article_id'], item_info_df['category_id']))
    item_words_dict = dict(zip(item_info_df['click_article_id'], item_info_df['words_count']))
    item_created_time_dict = dict(zip(item_info_df['click_article_id'], item_info_df['created_at_ts']))
    
    return item_type_dict, item_words_dict, item_created_time_dict

### UserCF

In [60]:
# 根据时间获取商品被点击的用户序列  {item1: [(user1, time1), (user2, time2)...]...}
def get_item_user_time_dict(click_df):
    def make_user_time_pair(df):
        return list(zip(df['user_id'], df['click_timestamp']))
    
    click_df = click_df.sort_values('click_timestamp')
    item_user_time_df = click_df.groupby('click_article_id')['user_id', 'click_timestamp'].apply(lambda x: make_user_time_pair(x))\
                                                            .reset_index().rename(columns={0: 'user_time_list'})
    
    item_user_time_dict = dict(zip(item_user_time_df['click_article_id'], item_user_time_df['user_time_list']))
    return item_user_time_dict

In [61]:
from sklearn.preprocessing import MinMaxScaler

In [62]:
def get_user_activate_degree_dict(click_df):
    click_cnt_df = click_df.groupby('user_id')['click_article_id'].count().reset_index()
    
    # 用户活跃度归一化[0, 1]
    mm = MinMaxScaler()
    click_cnt_df['click_article_id'] = mm.fit_transform(click_cnt_df[['click_article_id']])
    user_activate_degree_dict = dict(zip(click_cnt_df['user_id'], click_cnt_df['click_article_id']))
    
    return user_activate_degree_dict

In [63]:
def get_item_popular_degree_dict(click_df):
    item_cnt_df = click_df.groupby('click_article_id')['user_id'].count().reset_index()
    # 物品流行度归一化[0, 1]
    mm = MinMaxScaler()
    item_cnt_df['user_id'] = mm.fit_transform(item_cnt_df[['user_id']])
    item_popular_degree_dict = dict(zip(item_cnt_df['click_article_id'], item_cnt_df['user_id']))
    
    return item_popular_degree_dict

In [74]:
def usercf_sim(sample_df, user_activate_degree_dict):
    """
    用户相似性矩阵计算
    :param all_click_df: 数据表
    :param user_activate_degree_dict: 用户活跃度的字典
    return 用户相似性矩阵
        
    思路: 基于用户的协同过滤 + 关联规则
    """
    item_user_time_dict = get_item_user_time_dict(sample_df)
    u2u_sim = {}
    user_cnt = collections.defaultdict(int)
    
    for item, user_time_list in tqdm(item_user_time_dict.items()):
        for u, _ in user_time_list:
            user_cnt[u] += 1
            u2u_sim.setdefault(u, {})
            for v, _ in user_time_list:
                
                if u == v:
                    continue
                # 用户平均活跃度作为活跃度的权重，这里的式子也可以改善
                activate_weight = 100 * 0.5 * (user_activate_degree_dict[u] + user_activate_degree_dict[v])
                #item_weight = np.exp(0.9 ** item_popular_degree_dict[item])
                # 惩罚热门物品的权重
                u2u_sim[u].setdefault(v, 0)
                u2u_sim[u][v] += activate_weight / math.log(len(user_time_list) + 1)
                
    u2u_sim_ = u2u_sim.copy()
    for u, related_users in u2u_sim.items():
        for v, wij in related_users.items():
            u2u_sim_[u][v] = wij / math.sqrt(user_cnt[u] * user_cnt[v])
            
    # 将得到的相似性矩阵保存到本地
    pickle.dump(u2u_sim_, open('usercf_u2u_sim.pkl', 'wb'))

    return u2u_sim_

In [79]:
def user_based_recommend(user_id, user_item_time_dict, u2u_sim, sim_user_topk, recall_item_num, item_topk_click, item_created_time_dict):
    """
    基于文章协同过滤的召回
    :param user_id: 用户id
    :param user_item_time_dict: 字典, 根据点击时间获取用户的点击文章序列   {user1: [(item1, time1), (item2, time2)..]...}
    :param u2u_sim: 字典，文章相似性矩阵
    :param sim_user_topk: 整数， 选择与当前用户最相似的前k个用户
    :param recall_item_num: 整数， 最后的召回文章数量
    :param item_topk_click: 列表，点击次数最多的文章列表，用户召回补全
    :param item_created_time_dict: 文章创建时间列表
    :param emb_i2i_sim: 字典基于内容embedding算的文章相似矩阵

    return: 召回的文章列表 [(item1, score1), (item2, score2)...]
    """
    # 历史交互
    user_item_time_list = user_item_time_dict[user_id] #  [(item1, time1), (item2, time2)..]
    user_hist_items = set([i for i, t in user_item_time_list]) # 存在一个用户与某篇文章的多次交互，这里得去重
    items_rank = {}
    for sim_u, wuv in sorted(u2u_sim[user_id].items(), key=lambda x: x[1], reverse=True)[: sim_user_topk]:
        for i, click_time in user_item_time_dict[sim_u]:
            if i in user_hist_items:
                continue
            items_rank.setdefault(i, 0)
        
            
            for loc, (j, click_time) in enumerate(user_item_time_list):
                # 点击时的相对位置权重
                loc_weight = 0.95 ** (len(user_item_time_list) - loc)
                # 创建时间差权重
                created_time_weight = np.exp(0.9 * np.abs(item_created_time_dict[i] - item_created_time_dict[j]))
                
            items_rank[i] += loc_weight * content_weight * created_time_weight * wuv
    
    # 热度补全
    if len(items_rank) < recall_item_num:
        for i, item in enumerate(item_topk_click):
            if item in items_rank:
                continue
            items_rank[item] = - 100 # 随便给个复数就行
            if len(items_rank) == recall_item_num:
                break
    
    items_rank = sorted(items_rank.items(), key=lambda x: x[1], reverse=True)[: recall_item_num]
    
    return items_rank

In [66]:
user_item_time_dict = get_user_item_time(sample_df)

  if __name__ == '__main__':


In [67]:
user_activate_degree_dict = get_user_activate_degree_dict(sample_df)

In [68]:
item_type_dict, item_words_dict, item_created_time_dict = get_item_info_dict(item_info_df)

In [75]:
user_item_time_dict = get_user_item_time(sample_df)
item_type_dict, item_words_dict, item_created_time_dict = get_item_info_dict(item_info_df)
u2u_sim = usercf_sim(sample_df, user_activate_degree_dict)

  if __name__ == '__main__':
  import sys
100%|████████████████████████████████████████████████████████████████████████████| 6545/6545 [00:05<00:00, 1266.22it/s]


In [76]:
submit_df = pd.read_csv("../data/sample_submit.csv")

In [80]:
sim_user_topk = 10
recall_item_num = 10
item_topk_click = get_item_topk_click(sample_df, k=50)

user_recall_items_dict = collections.defaultdict(dict)
for user in tqdm(submit_df['user_id'].unique()):
    user_recall_items_dict[user] = user_based_recommend(user, user_item_time_dict, u2u_sim, sim_user_topk, recall_item_num, item_topk_click, item_created_time_dict)

  0%|                                                                                        | 0/50000 [00:00<?, ?it/s]


KeyError: 200000