In [17]:
import pandas as pd  
import numpy as np
from tqdm import tqdm  
from collections import defaultdict  
import os, math, warnings, math, pickle
import faiss
import collections
import random
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from deepctr.feature_column import SparseFeat, VarLenSparseFeat
from deepmatch.models import YoutubeDNN
from deepmatch.utils import sampledsoftmaxloss,NegativeSampler
warnings.filterwarnings('ignore')
from data import get_all_click_sample, get_all_click_df, get_item_info_df, get_item_emb_dict
from config import get_user_item_time, get_item_user_time_dict, get_hist_and_last_click, get_item_info_dict, get_user_hist_item_info_dict, get_item_topk_click
from metrics_recall import metrics_recall
from sim_matrix import itemcf_sim, get_user_activate_degree_dict, embedding_sim
from sim_matrix import usercf_sim
from sim_matrix import embedding_sim
from recall import item_based_recommend, user_based_recommend, combine_recall_results
from cold_start_items import get_click_article_ids_set, cold_start_items

In [18]:
# 定义数据路径
data_path='/Users/linjiaxi/Desktop/RecommendationSystem/Competition/Alibaba - News Recommendation Competition/data/'
save_path='/Users/linjiaxi/Desktop/RecommendationSystem/Competition/Alibaba - News Recommendation Competition/tmp/'
metric_recall=True

In [19]:
# 定义归一化
max_min_scaler=lambda x: (x-np.min(x))/(np.max(x)-np.min(x))

In [20]:
# 定义数据
# 使用debug模式

# 用户的点击记录
all_click_df=get_all_click_sample(data_path,1000)
# 文章的基本属性
item_info_df=get_item_info_df(data_path)
# 文章的embedding字典
item_emb_dict=get_item_emb_dict(data_path)

In [21]:
all_click_df.head()

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
621,199777,156624,1507029874807,4,3,2,1,21,2
622,199777,160417,1507029923847,4,3,2,1,21,2
623,199777,159847,1507029953847,4,3,2,1,21,2
1429,199490,336476,1507034135792,4,1,17,1,25,2
1430,199490,208322,1507038604498,4,1,17,1,25,2


In [22]:
item_info_df.head()

Unnamed: 0,click_article_id,category_id,created_at_ts,words_count
0,0,0,1513144419000,168
1,1,1,1405341936000,189
2,2,1,1408667706000,250
3,3,1,1408468313000,230
4,4,1,1407071171000,162


In [23]:
num_items=2
for key, value in list(item_emb_dict.items())[:num_items]:
    print(f"{key}:{value}")

0:[-0.02118425 -0.12580893 -0.01813001  0.00668391  0.10909397  0.11846624
 -0.04404838 -0.07354293 -0.06579412  0.02170995  0.05630901  0.04666488
  0.11492702 -0.06951095  0.08220764  0.03534407 -0.10814503 -0.09250724
 -0.08225472 -0.02008969 -0.08756393  0.00569023  0.02347829  0.00616275
  0.07813909 -0.02409734  0.02564285 -0.06146177 -0.04006071  0.04641773
  0.03656221  0.07079111 -0.04878796  0.06438719 -0.01364673  0.01566297
  0.01740611 -0.08162891 -0.0595786   0.04555704 -0.00811461 -0.09602179
 -0.05048424 -0.12364366  0.00806219  0.06342559  0.038073   -0.08184084
 -0.00657207  0.05539924 -0.03188176  0.08788847 -0.06689828 -0.06069421
  0.00577     0.03791584  0.05912034 -0.03743939  0.12048548  0.09241205
  0.11193531 -0.08243855  0.04701659  0.0512825   0.08581513  0.01362305
  0.10491944 -0.01347765 -0.02687927  0.04976083 -0.01186304 -0.03300955
  0.04284385  0.0112872  -0.0237316   0.00624597 -0.02099457 -0.08150581
  0.03947292 -0.10468995  0.06278647  0.05387553 

In [24]:
# 获取文章的属性信息，保存成字典的形式方便查询
item_type_dict, item_words_dict, item_created_time_dict=get_item_info_dict(item_info_df)

In [25]:
num_items=2
for key, value in list(item_type_dict.items())[:num_items]:
    print(f"{key}:{value}")

0:0
1:1


In [26]:
num_items=2
for key,value in list(item_created_time_dict.items())[:num_items]:
    print(f"{key}:{value}")

0:0.9784319658749242
1:0.6802953033702287


In [27]:
num_items=2
for key,value in list(item_words_dict.items())[:num_items]:
    print(f"{key}:{value}")

0:168
1:189


In [28]:
# 定义一个多路召回的字典，将各路召回的结果都保存在这个字典中
user_multi_recall_dict={'itemcf_sim_itemcf_recall':{},
                        'embedding_sim_item_recall':{},
                        'cold_start_recall':{}}

In [29]:
user_item_time_dict=get_user_item_time(all_click_df)

In [30]:
# 基于embedding的相似性矩阵
item_emb_df = pd.read_csv(data_path + '/articles_emb.csv')
emb_i2i_sim = embedding_sim(all_click_df, item_emb_df, save_path, topk=10) # topk可以自行设置

255756it [00:19, 13248.84it/s]


In [31]:
# 基于物品的协同过滤的相似性矩阵
i2i_sim = itemcf_sim(all_click_df, item_created_time_dict)

100%|██████████| 1000/1000 [00:02<00:00, 358.11it/s]


In [32]:
# 基于用户的协同过滤的相似性矩阵
user_activate_degree_dict = get_user_activate_degree_dict(all_click_df)
u2u_sim = usercf_sim(all_click_df, user_activate_degree_dict)

100%|██████████| 1783/1783 [00:00<00:00, 2469.94it/s]


In [33]:
# itemcf sim召回
# 先进行itemcf召回, 为了召回评估，所以提取最后一次点击

if metric_recall:
    trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)
else:
    trn_hist_click_df = all_click_df

user_recall_items_dict = collections.defaultdict(dict)
user_item_time_dict = get_user_item_time(trn_hist_click_df)

i2i_sim = pickle.load(open(save_path + 'itemcf_i2i_sim.pkl', 'rb'))
emb_i2i_sim = pickle.load(open(save_path + 'emb_i2i_sim.pkl', 'rb'))

sim_item_topk = 20
recall_item_num = 10
item_topk_click = get_item_topk_click(trn_hist_click_df, k=50)

for user in tqdm(trn_hist_click_df['user_id'].unique()):
    user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, \
                                                        i2i_sim, sim_item_topk, recall_item_num, \
                                                        item_topk_click, item_created_time_dict, emb_i2i_sim)

user_multi_recall_dict['itemcf_sim_itemcf_recall'] = user_recall_items_dict
pickle.dump(user_multi_recall_dict['itemcf_sim_itemcf_recall'], open(save_path + 'itemcf_recall_dict.pkl', 'wb'))

if metric_recall:
    # 召回效果评估
    metrics_recall(user_multi_recall_dict['itemcf_sim_itemcf_recall'], trn_last_click_df)

100%|██████████| 1000/1000 [00:01<00:00, 644.31it/s]


In [34]:
# embedding sim召回
# 这里是为了召回评估，所以提取最后一次点击
if metric_recall:
    trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)
else:
    trn_hist_click_df = all_click_df

user_recall_items_dict = collections.defaultdict(dict)
user_item_time_dict = get_user_item_time(trn_hist_click_df)
i2i_sim = pickle.load(open(save_path + 'emb_i2i_sim.pkl','rb'))

sim_item_topk = 20
recall_item_num = 10

item_topk_click = get_item_topk_click(trn_hist_click_df, k=50)

for user in tqdm(trn_hist_click_df['user_id'].unique()):
    user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, i2i_sim, sim_item_topk, 
                                                        recall_item_num, item_topk_click, item_created_time_dict, emb_i2i_sim)
    
user_multi_recall_dict['embedding_sim_item_recall'] = user_recall_items_dict
pickle.dump(user_multi_recall_dict['embedding_sim_item_recall'], open(save_path + 'embedding_sim_item_recall.pkl', 'wb'))

if metric_recall:
    # 召回效果评估
    metrics_recall(user_multi_recall_dict['embedding_sim_item_recall'], trn_last_click_df, topk=recall_item_num)

100%|██████████| 1000/1000 [00:01<00:00, 871.26it/s]

 topk:  10  :  hit_num:  25 hit_rate:  0.025 user_num :  1000





In [35]:
# 冷启动召回
# 先进行itemcf召回
trn_hist_click_df=all_click_df

user_recall_items_dict=collections.defaultdict(dict)
user_item_time_dict=get_user_item_time(trn_hist_click_df)
i2i_sim=pickle.load(open(save_path+'emb_i2i_sim.pkl','rb'))

sim_item_topk=150
recall_item_num=100 # 召回的文章数量

item_topk_click=get_item_topk_click(trn_hist_click_df,k=50)
for user in tqdm(trn_hist_click_df['user_id'].unique()):
    user_recall_items_dict[user]=item_based_recommend(user,user_item_time_dict,i2i_sim,sim_item_topk,recall_item_num,item_topk_click,item_created_time_dict,emb_i2i_sim)
pickle.dump(user_recall_items_dict,open(save_path+'cold_start_items_raw_dict.pkl','wb'))

100%|██████████| 1000/1000 [00:00<00:00, 1246.00it/s]


In [36]:
all_click_df_=all_click_df.copy()
all_click_df_=all_click_df_.merge(item_info_df,how='left',on='click_article_id')
user_hist_item_typs_dict,user_hist_item_ids_dict,user_hist_item_words_dict,user_last_item_created_time_dict=get_user_hist_item_info_dict(all_click_df_)
click_article_ids_set = get_click_article_ids_set(all_click_df)
# 需要注意的是
# 这里使用了很多规则来筛选冷启动的文章，所以前面再召回的阶段就应该尽可能的多召回一些文章，否则很容易被删掉
cold_start_user_items_dict = cold_start_items(user_recall_items_dict, user_hist_item_typs_dict, user_hist_item_words_dict, \
                                              user_last_item_created_time_dict, item_type_dict, item_words_dict, \
                                              item_created_time_dict, click_article_ids_set, recall_item_num)

user_multi_recall_dict['cold_start_recall'] = cold_start_user_items_dict

100%|██████████| 1000/1000 [00:01<00:00, 950.02it/s]


In [37]:
# 多路召回合并
# 对多路召回的权重给了一个相同的值
weight_dict = {'itemcf_sim_itemcf_recall': 1.0,
               'embedding_sim_item_recall': 1.0,
               'cold_start_recall': 1.0}

In [38]:
# 最终合并之后每个用户召回150个商品进行排序
final_recall_items_dict_rank = combine_recall_results(user_multi_recall_dict, weight_dict, topk=150)

多路召回合并...


 67%|██████▋   | 2/3 [00:00<00:00, 14.53it/s]

itemcf_sim_itemcf_recall...
embedding_sim_item_recall...
cold_start_recall...


100%|██████████| 3/3 [00:00<00:00, 12.49it/s]
