数据加载

In [1]:
import pandas as pd

# Demo validation: load the review data from a CSV path relative to the notebook.
train_df = pd.read_csv('Data/New_Movie_RS.csv')
# Print the (rows, columns) shape as a quick sanity check, then show the first row.
print(train_df.shape)
train_df.head(1)

(50000, 13)


Unnamed: 0,ID,Movie_Name,Movie_Score,Review_Count,Movie_Star_Distribution,Collect_Date,Username,Post_Date,Score,User_Comment,User_Comment_Distribution,Comment_Like_Count,Movie_Tags
0,0,"1988年的妮可 Nico, 1988",7.5,565,15.2%48.2%32.3%3.4%0.8%,2019-10-05,尾黑,2018-06-23,3,成本低廉的PPT电影，用Nico生命中最后一年发生的事给Nico的歌配上情节，倒不算尴尬。女...,66%31%3%,4,"['音乐', '电影', '儿子', '丝绒', '人物', '传记', '传记片', '歌..."


#### 数据预处理

In [2]:
# Drop every row that contains a missing value, then keep only the first
# review for each (movie, user) pair.
train_df = (
    train_df
    .dropna(axis=0, how='any')
    .drop_duplicates(subset=['Movie_Name', 'Username'], keep='first')
)

train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49994 entries, 0 to 49999
Data columns (total 13 columns):
ID                           49994 non-null int64
Movie_Name                   49994 non-null object
Movie_Score                  49994 non-null float64
Review_Count                 49994 non-null int64
Movie_Star_Distribution      49994 non-null object
Collect_Date                 49994 non-null object
Username                     49994 non-null object
Post_Date                    49994 non-null object
Score                        49994 non-null int64
User_Comment                 49994 non-null object
User_Comment_Distribution    49994 non-null object
Comment_Like_Count           49994 non-null int64
Movie_Tags                   49994 non-null object
dtypes: float64(1), int64(4), object(8)
memory usage: 5.3+ MB


In [3]:
from sklearn.metrics.pairwise import cosine_similarity
from lightfm import LightFM, cross_validation
from scipy.sparse import csr_matrix, coo_matrix
from lightfm.evaluation import auc_score
from lightfm.data import Dataset
import numpy as np

### 数据预处理：建立用户与电影的整数 ID 映射

In [4]:
# Username -> dense integer id, numbered in order of first appearance.
user_dict = {name: idx for idx, name in enumerate(train_df['Username'].unique())}
train_df['uid_int'] = train_df['Username'].map(user_dict)

# Inverse lookup: integer id -> username.
reverse_user_dict = dict(zip(user_dict.values(), user_dict.keys()))

# Movie name -> dense integer id, numbered in order of first appearance.
item_dict = {name: idx for idx, name in enumerate(train_df['Movie_Name'].unique())}
train_df['item_int'] = train_df['Movie_Name'].map(item_dict)

# Inverse lookup: integer id -> movie name.
reverse_item_dict = dict(zip(item_dict.values(), item_dict.keys()))

#### 设置电影和用户特征

In [5]:
# Show the first row to check the newly added uid_int / item_int columns.
train_df.head(1)

Unnamed: 0,ID,Movie_Name,Movie_Score,Review_Count,Movie_Star_Distribution,Collect_Date,Username,Post_Date,Score,User_Comment,User_Comment_Distribution,Comment_Like_Count,Movie_Tags,uid_int,item_int
0,0,"1988年的妮可 Nico, 1988",7.5,565,15.2%48.2%32.3%3.4%0.8%,2019-10-05,尾黑,2018-06-23,3,成本低廉的PPT电影，用Nico生命中最后一年发生的事给Nico的歌配上情节，倒不算尴尬。女...,66%31%3%,4,"['音乐', '电影', '儿子', '丝绒', '人物', '传记', '传记片', '歌...",0,0


In [6]:
# Movie (item) feature columns.
# NOTE(review): the 'New_*' columns are not among the train_df columns displayed
# in this notebook; presumably they are derived in cells that are not included
# here. Confirm they exist before selecting train_df[items_f] / train_df[users_f].
items_f = ['Movie_Score', 'Review_Count', 'item_int',
           'New_Movie_Tags', 'New_Movie_Star_Distribution']

# User feature columns. Each column is listed once: a repeated name would yield
# duplicate DataFrame columns and emit the same feature twice per user.
users_f = ['uid_int', 'New_User_Comment_Distribution']

#### 数据划分

优化特征提取:
1. 电影信息拆分:单独编码
2. 用户信息拆分：单独编码
3. 通过交互信息进行 join

用户交互表划分

In [26]:
# Interaction table: which user scored which movie, with what score, and when.
user_post_event = train_df.loc[:, ['uid_int', 'item_int', 'Score', 'Post_Date']]
user_post_event.shape

(49994, 4)

In [27]:
# Sort by post date in descending order so the most recent interactions come first.
user_post_event = user_post_event.sort_values(by='Post_Date', ascending=False)
# Show the last rows, i.e. the oldest interactions after the sort.
user_post_event.tail()

Unnamed: 0,uid_int,item_int,Score,Post_Date
39566,20882,92,4,2005-09-12
39784,3747,92,4,2005-09-12
47225,33341,107,4,2005-08-22
204,203,1,2,2005-07-19
14868,12971,39,5,2005-07-12


增加时间过滤：只保留时间阈值之后的交互记录，过于久远的数据不参与后续计算。

In [28]:
Time_Threshold = '2018-01-01'
# Keep only the rows whose Post_Date compares strictly greater than the threshold.
# NOTE(review): Post_Date is an object column, so this looks like a plain string
# comparison that relies on the YYYY-MM-DD format — confirm.
user_post_event = user_post_event.loc[user_post_event['Post_Date'].gt(Time_Threshold)]
user_post_event.tail()

Unnamed: 0,uid_int,item_int,Score,Post_Date
44727,20373,102,3,2018-01-02
155,155,0,5,2018-01-02
18989,16045,50,3,2018-01-02
22853,18625,59,5,2018-01-02
35372,26784,82,4,2018-01-02


In [29]:
# Row / column count left after the time filter.
user_post_event.shape

(11971, 4)

In [39]:
# 找到每位用户请求到的集内id
%time raw_sentences = [user_post_event[user_post_event['uid_int'] == i]['item_int'].unique().tolist() for i in user_post_event['uid_int'].unique()]
len(raw_sentences)

CPU times: user 9.77 s, sys: 0 ns, total: 9.77 s
Wall time: 9.94 s


10158

Word2Vec 的输入是字符串 token，因此需要先把电影 id 转换为字符串。

In [54]:
# Convert every item id in every per-user list to its string form.
raw_id = [[str(item) for item in sentence] for sentence in raw_sentences]
len(raw_id)

10158

In [55]:
from gensim import corpora, similarities
from gensim.models import Word2Vec 
import multiprocessing

# Train Word2Vec on the per-user item-id lists to capture co-occurrence between items
# (each user's list is one "sentence", each item id one token).
# NOTE(review): `size` is the pre-4.0 gensim keyword (later renamed `vector_size`) — confirm
# the installed gensim version. The name `model` is rebound to a LightFM model further down.
%time model = Word2Vec(raw_id, size=300, workers=multiprocessing.cpu_count()*2,min_count=1)

CPU times: user 128 ms, sys: 0 ns, total: 128 ms
Wall time: 131 ms


In [None]:
# Related-item query against the Word2Vec model trained in the previous cell.
# The earlier version of this cell referenced names that are never defined in
# this notebook (topic_model, now_topic, data, item_feature, 'ISBN'), so it is
# rewired to the notebook's own objects. Run it before `model` is rebound to
# the LightFM model further down.
topn = 10
# Query token: the first item of the first user sequence (tokens are item ids as strings).
now_topic = raw_id[0][0]
topic_list = model.wv.most_similar(now_topic, topn=topn)
# most_similar yields (token, similarity) pairs; convert tokens back to integer item ids.
topic_list = [int(i[0]) for i in topic_list]
print(topic_list)
topic_rec_list = train_df.loc[train_df['item_int'].isin(topic_list)][[
    'Movie_Name', 'item_int', 'Movie_Score', 'Movie_Tags']].drop_duplicates('item_int', keep='first', inplace=False)
topic_rec_list.head(10)

In [None]:
# (Stray undefined token removed: evaluating it raised NameError and stopped "Run All".)

电影信息表划分

In [None]:
# One row per movie: select the item feature columns and keep the first row of each item id.
movie_info_temp = train_df.loc[:, items_f].drop_duplicates(subset='item_int', keep='first')
movie_info_temp.shape

用户信息表划分

In [None]:
# One row per user: select the user feature columns and keep the first row of each user id.
user_info_temp = train_df.loc[:, users_f].drop_duplicates(subset='uid_int', keep='first')
user_info_temp.shape

In [None]:
def create_features(dataframe, features_name, id_col_name):
    """Turn selected columns into per-row feature token lists.

    Each row's values in ``features_name`` are converted to ``str``, joined
    with ',' and split on ',' again, so a value that itself contains commas
    contributes several tokens.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table holding the id column and the feature columns.
    features_name : list
        Labels of the columns used as features.
    id_col_name : label
        Column whose value identifies each row.

    Returns
    -------
    features1 : list of (id, list of str)
        One ``(id, tokens)`` pair per row of ``dataframe``.
    features2 : pandas.Series
        Every token of every row, flattened row by row, with a fresh 0..n-1 index.
    """
    # Build one comma-joined string per row, then split it back into tokens.
    features = dataframe[features_name].apply(
        lambda x: ','.join(x.map(str)), axis=1)
    features = features.str.split(',')
    features1 = list(zip(dataframe[id_col_name], features))
    # Flatten directly instead of `apply(pd.Series).stack()`: that route pads
    # ragged rows with NaN and depends on stack() silently dropping the padding,
    # which is tied to the pandas version; it is also much slower.
    features2 = pd.Series(
        [token for tokens in features for token in tokens], dtype=object)
    return features1, features2


# Encode movie features: the per-movie (id, token list) pairs are stored in a new
# 'items_features' column, and the flattened tokens are kept in item_feature_list.
%time movie_info_temp['items_features'], item_feature_list = create_features(movie_info_temp, items_f, 'item_int')

In [None]:
# Encode user features: the per-user (id, token list) pairs are stored in a new
# 'users_features' column, and the flattened tokens are kept in user_feature_list.
%time user_info_temp['users_features'], user_feature_list = create_features(user_info_temp, users_f, 'uid_int')

In [None]:
# Build the LightFM dataset: register the user ids, the item ids and both feature vocabularies.
make_dataset = Dataset()
make_dataset.fit(user_info_temp['uid_int'], movie_info_temp['item_int'],
                 item_features=item_feature_list, user_features=user_feature_list)

# Build the interaction and weight matrices from (user, item, score) triples.
# Only the time-filtered rows in user_post_event are used here.
interactions, weights = make_dataset.build_interactions(
    list(zip(user_post_event['uid_int'], user_post_event['item_int'], user_post_event['Score'])))

# Show the distinct user scores present in the full table.
train_df['Score'].unique()

### 训练推荐系统的模型

In [None]:
# Model used for the hold-out evaluation (seeded).
model_test = LightFM(no_components=30, random_state=1)

# Fixed 80/20 split. The shuffle is seeded as well; without a random_state the
# split, and therefore the AUC reported below, changes on every run.
train, test = cross_validation.random_train_test_split(
    interactions, test_percentage=0.2,
    random_state=np.random.RandomState(1))

模型训练

In [None]:
# Build the item feature matrix from the per-movie (id, token list) pairs.
items_features = make_dataset.build_item_features(
    movie_info_temp['items_features'])

# Build the user feature matrix from the per-user (id, token list) pairs.
users_features = make_dataset.build_user_features(
    user_info_temp['users_features'])

# Train the evaluation model on the training split for 10 epochs, using both feature matrices.
%time model_test.fit(train, item_features=items_features, user_features=users_features, epochs=10, verbose=False)

模型验证

In [None]:
# Score the evaluation model on the held-out split.
auc = auc_score(
    model_test,
    test,
    user_features=users_features,
    item_features=items_features,
)

# Mean of the returned AUC values.
auc.mean()

### 首页用户个性化推荐
1. 包含召回和排序两个部分。
2. 适合首页 Feed 流的离线推荐计算场景。

使用全量数据进行推荐模型训练。

In [None]:
# Train the recommendation model on all interactions for 30 epochs, passing the
# matrix returned alongside the interactions as per-interaction sample weights.
# NOTE(review): this rebinds `model`, which earlier in the notebook held the Word2Vec model.
model = LightFM(no_components=30, random_state=1)

%time model.fit(interactions, sample_weight=weights, item_features=items_features, user_features=users_features, epochs=30,  verbose=False)

召回
1. 过滤用户看过的
2. 尽可能找到用户喜欢的

In [None]:
# User to recommend for, looked up by name.
Username = '尾黑'

# Translate the name into the integer user id.
user_x = user_dict[Username]

# Rows of this user, one per distinct movie.
allready_knews = train_df.loc[train_df['uid_int'] == user_x].drop_duplicates(
    subset='item_int', keep='first')

暂时使用全量用户没看过的，且电影分数大于 Threshold 的电影作为召回。

In [None]:
# Candidate pool: one row per movie the user has no record for.
known_items = allready_knews['item_int'].tolist()
df_use_for_prediction = (
    train_df
    .loc[~train_df['item_int'].isin(known_items)]
    .drop_duplicates(subset='item_int', keep='first')
)

# Keep only movies whose Movie_Score is strictly above the threshold.
Threshold = 7
df_use_for_prediction = df_use_for_prediction[df_use_for_prediction['Movie_Score'].gt(Threshold)]

df_use_for_prediction.head(1)

模型开始对召回的电影候选集进行打分。

In [None]:
# Score every candidate movie for this user and store the scores in a new 'rec_score' column.
df_use_for_prediction['rec_score'] = model.predict(user_ids=user_x, item_ids=df_use_for_prediction['item_int'].tolist(),
                                                   item_features=items_features, user_features=users_features)

对排序结果进行 topk 选取。

In [12]:
# Number of recommendations to keep.
top_rec_item = 5
ranked_candidates = df_use_for_prediction.sort_values(by='rec_score', ascending=False)
rec_list_ = ranked_candidates.iloc[:top_rec_item]['item_int']

# Item ids of the recommended movies, best score first.
rec_list_.tolist()

NameError: name 'df_use_for_prediction' is not defined

In [None]:
# Look up the movie details for the recommended item ids (one row per movie name).
result = train_df.loc[train_df['item_int'].isin(rec_list_.tolist())][[
    'Movie_Name', 'item_int', 'Movie_Score', 'Movie_Tags']].drop_duplicates('Movie_Name', keep='first', inplace=False)

# Show the recommendation result.
result.head()

### 看了还看
1. 根据用户当前观看的电影，找到与之相关的其他电影。
2. 对结果依旧使用阈值过滤，挑选优质内容。

In [None]:
# Wrap the model's item embedding table and the item feature matrix as CSR matrices.
embedding_movie, feature_movie = csr_matrix(
    model.item_embeddings), csr_matrix(items_features)

# Per-movie representation: feature matrix times the embedding table.
# Use the sparse matrix's own .dot(): np.dot is not sparse-aware, and SciPy
# documents that calling it on sparse matrices gives unexpected results or errors.
movie_inner_max = feature_movie.dot(embedding_movie)

# Pairwise cosine similarity between movies.
movie_cosine_sim = cosine_similarity(movie_inner_max)

In [None]:
# Find the movies closest to the query movie by cosine similarity.
def next_movie_recommendations(movie_name, cosine_sim, topk):
    """Return the ``topk`` movies most similar to ``movie_name``.

    Parameters
    ----------
    movie_name : str
        Movie name; must be a key of the module-level ``item_dict``.
    cosine_sim : 2-D indexable
        ``cosine_sim[i]`` holds the similarities between item ``i`` and all items.
    topk : int
        Maximum number of neighbours to return.

    Returns
    -------
    list of (int, float)
        ``(item_id, similarity)`` pairs sorted by descending similarity; the
        query movie itself is never included.

    Raises
    ------
    KeyError
        If ``movie_name`` is not in ``item_dict``.
    """
    # Translate the movie name into its integer id.
    int_id = item_dict[movie_name]
    # Exclude the query by id rather than assuming it sorts first: when another
    # item ties with it on similarity, dropping position 0 would discard that
    # neighbour and keep the query movie in the result.
    candidates = [(idx, score)
                  for idx, score in enumerate(cosine_sim[int_id])
                  if idx != int_id]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates[:topk]

假设当前看的电影。

In [13]:
# Movie used as the query for the following sections, looked up by name.
movie_name = '24小时狂欢派对 24 Hour Party People'

# Show one row of train_df for this movie.
train_df.loc[train_df['Movie_Name'].isin([movie_name])].drop_duplicates(
    'item_int', keep='first', inplace=False).head(1)

Unnamed: 0,ID,Movie_Name,Movie_Score,Review_Count,Movie_Star_Distribution,Collect_Date,Username,Post_Date,Score,User_Comment,User_Comment_Distribution,Comment_Like_Count,Movie_Tags,uid_int,item_int
159,159,24小时狂欢派对 24 Hour Party People,7.9,1509,26.2%46.9%23.0%3.2%0.7%,2019-10-05,Lane,2008-09-22,3,080921 可能和真是一部for fans only的片子，反正我没看出特别多乐趣。只是W...,75%19%6%,0,"['音乐', '纪录片', '电影', '时候', '演员', '片子', '工厂', '乐...",159,1


推理用户接下来看的 topk 个电影，并去除当前看的电影和自己已经看过的电影。

In [14]:
topk = 20
# Item ids of the topk most similar movies, best match first.
next_movie = [i[0] for i in next_movie_recommendations(
    movie_name, movie_cosine_sim, topk)]

# Remove the movies the user already has a record for. Filtering in place of a
# set difference keeps the similarity ranking, which a set would discard.
known_item_set = set(known_items)
next_list = [item for item in next_movie if item not in known_item_set]
next_list

NameError: name 'next_movie_recommendations' is not defined

增加阈值过滤进行下一个结果输出，保证推荐质量。

In [15]:
# One row per candidate movie from the "watched next" list.
next_movie_result = (
    train_df
    .loc[train_df['item_int'].isin(next_list)]
    .drop_duplicates(subset='item_int', keep='first')
)

# Keep only movies whose Movie_Score reaches the threshold.
# Note: this rebinds Threshold, which was set to a different value earlier.
Threshold = 8
next_movie_result = next_movie_result[next_movie_result['Movie_Score'].ge(Threshold)]

# Show the "watched next" recommendations.
next_movie_result[items_f + ['Movie_Name']].head()

NameError: name 'next_list' is not defined

### Push 推荐
1. 通过计算每位用户对单一电影的兴趣度，去除看过的即可得出 topk 的 push 结果。
2. 通过电影找用户，解决电影冷启动问题。

In [16]:
# Integer id of the query movie.
movie_int = item_dict[movie_name]

# Show one row of train_df for the current movie.
item_rec_user = train_df.loc[train_df['item_int'].isin(
    [movie_int])].drop_duplicates('item_int', keep='first', inplace=False)
item_rec_user.head(1)

Unnamed: 0,ID,Movie_Name,Movie_Score,Review_Count,Movie_Star_Distribution,Collect_Date,Username,Post_Date,Score,User_Comment,User_Comment_Distribution,Comment_Like_Count,Movie_Tags,uid_int,item_int
159,159,24小时狂欢派对 24 Hour Party People,7.9,1509,26.2%46.9%23.0%3.2%0.7%,2019-10-05,Lane,2008-09-22,3,080921 可能和真是一部for fans only的片子，反正我没看出特别多乐趣。只是W...,75%19%6%,0,"['音乐', '纪录片', '电影', '时候', '演员', '片子', '工厂', '乐...",159,1


In [17]:
# Show the movie name currently used as the query.
movie_name

'24小时狂欢派对 24 Hour Party People'

In [18]:
# Take the user and movie counts from the shape of the interaction matrix.
n_users, n_movies = interactions.shape

# Predict a score for every user index against the single query movie.
movie2user_rec = model.predict(user_ids=np.arange(n_users), item_ids=np.repeat(
    movie_int, n_users), item_features=items_features, user_features=users_features)

# User indices ordered by descending predicted score.
wait_rec_list = np.argsort(-movie2user_rec).tolist()

NameError: name 'interactions' is not defined

In [None]:
# Integer ids of the users who already have a record for this movie.
item_known_users_int = train_df[train_df['item_int']
                                == movie_int]['uid_int'].unique().tolist()

推荐候选集合过滤，得出最终的 Push 用户列表。

In [19]:
push_topk = 10
push_result = []
count = 0
# Set membership keeps the exclusion check cheap for every candidate.
known_user_set = set(item_known_users_int)
# Walk the ranked candidates and stop early once enough users are collected.
for x in wait_rec_list:
    if x in known_user_set:
        continue
    push_result.append(x)
    count += 1
    # `>=` stops at exactly push_topk users; the previous `>` collected push_topk + 1.
    if count >= push_topk:
        break
push_result

NameError: name 'wait_rec_list' is not defined

打印待 Push 用户的信息。

In [None]:
# One row per user selected for the push, showing the user feature columns plus the username.
item_rec_user = train_df.loc[train_df['uid_int'].isin(
    push_result)].drop_duplicates('uid_int', keep='first', inplace=False)
item_rec_user.head()[users_f+['Username']]

### 相关用户推荐
1. 相关用户推荐，找到用户相关的用户，挖掘用户的潜在兴趣。
2. 增加平台和用户的交互。

In [20]:
# Wrap the model's user embedding table and the user feature matrix as CSR matrices.
embedding_user, feature_user = csr_matrix(
    model.user_embeddings), csr_matrix(users_features)

# Per-user representation: feature matrix times the embedding table.
# Use the sparse matrix's own .dot(): np.dot is not sparse-aware, and SciPy
# documents that calling it on sparse matrices gives unexpected results or errors.
user_inner_max = feature_user.dot(embedding_user)

# Pairwise cosine similarity between users; with many users this can easily
# raise MemoryError.
user_cosine_sim = cosine_similarity(user_inner_max)

NameError: name 'model' is not defined

In [None]:
# Find the users closest to the query user by cosine similarity.
def sim_user_recommendations(user_name, cosine_sim, topk):
    """Return the ``topk`` users most similar to ``user_name``.

    Parameters
    ----------
    user_name : str
        Username; must be a key of the module-level ``user_dict``.
    cosine_sim : 2-D indexable
        ``cosine_sim[i]`` holds the similarities between user ``i`` and all users.
    topk : int
        Maximum number of neighbours to return.

    Returns
    -------
    list of (int, float)
        ``(user_id, similarity)`` pairs sorted by descending similarity; the
        query user is never included.

    Raises
    ------
    KeyError
        If ``user_name`` is not in ``user_dict``.
    """
    # Translate the username into its integer id.
    int_id = user_dict[user_name]
    # Exclude the query by id rather than assuming it sorts first: when another
    # user ties with it on similarity, dropping position 0 would discard that
    # neighbour and keep the query user in the result.
    candidates = [(idx, score)
                  for idx, score in enumerate(cosine_sim[int_id])
                  if idx != int_id]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates[:topk]

找到与当前用户 Username 相关的 topk 个相关用户。

In [21]:
# Integer ids of the topk users most similar to Username, best match first.
topk = 5
sim_user_list = [i[0] for i in sim_user_recommendations(
    Username, user_cosine_sim, topk)]
sim_user_list

NameError: name 'sim_user_recommendations' is not defined

In [None]:
# One row per similar user.
sim_user_result = train_df.loc[train_df['uid_int'].isin(
    sim_user_list)].drop_duplicates('uid_int', keep='first', inplace=False)

# TODO: a filtering strategy for high-quality users could be added here.
# Threshold
# sim_user_result = sim_user_result[sim_user_result['User_level'] >= Threshold]

# Show the related users.
sim_user_result.head()[users_f+['Username']]