In [1]:
import pandas as pd

In [2]:
# Read the review table from disk, report its dimensions and preview the first row.
csv_path = 'Data/New_Movie_RS.csv'
train_df = pd.read_csv(csv_path)
print(train_df.shape)
train_df.head(1)

(10000, 13)


Unnamed: 0,ID,Movie_Name,Movie_Score,Review_Count,Movie_Star_Distribution,Collect_Date,Username,Post_Date,Score,User_Comment,User_Comment_Distribution,Comment_Like_Count,Movie_Tags
0,0,"1988年的妮可 Nico, 1988",7.5,565,15.2%48.2%32.3%3.4%0.8%,2019-10-05,尾黑,2018-06-23,3,成本低廉的PPT电影，用Nico生命中最后一年发生的事给Nico的歌配上情节，倒不算尴尬。女...,66%31%3%,4,"['音乐', '电影', '儿子', '丝绒', '人物', '传记', '传记片', '歌..."


In [3]:
!pip install lightfm

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [4]:
import numpy as np
from lightfm import LightFM, cross_validation
from scipy.sparse import csr_matrix, coo_matrix
from lightfm.evaluation import auc_score, reciprocal_rank, recall_at_k, precision_at_k
from lightfm.data import Dataset
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# Map every distinct user name to an integer id (ids follow order of first appearance).
user_dict = {name: idx for idx, name in enumerate(train_df['Username'].unique())}
train_df['uid_int'] = [user_dict[name] for name in train_df['Username']]
# Inverse lookup: integer id -> user name.
reverse_user_dict = dict(zip(user_dict.values(), user_dict.keys()))

# Map every distinct movie name to an integer id in the same way.
item_dict = {title: idx for idx, title in enumerate(train_df['Movie_Name'].unique())}
train_df['item_int'] = [item_dict[title] for title in train_df['Movie_Name']]
# Inverse lookup: integer id -> movie name.
reverse_item_dict = dict(zip(item_dict.values(), item_dict.keys()))

#### 设置电影和用户特征

In [7]:
def create_features(dataframe, features_name, id_col_name):
    """Turn the selected columns of every row into a list of string tokens.

    For each row the values of ``features_name`` are converted with ``str``,
    joined with ',' and split on ',' again. A value that itself contains
    commas (for example a stringified list) therefore yields several tokens,
    including any brackets or quotes that surround its pieces.

    Args:
        dataframe: pandas DataFrame holding the feature and id columns.
        features_name: list of column names whose values become tokens.
        id_col_name: name of the column holding the entity id of each row.

    Returns:
        A ``(features1, features2)`` tuple:
        features1 is a list with one ``(id, [token, ...])`` tuple per row;
        features2 is a pandas Series with every token of every row, in row
        order, on a fresh 0..n-1 index.
    """
    tokens_per_row = dataframe[features_name].apply(
        lambda row: ','.join(row.map(str)), axis=1).str.split(',')
    features1 = list(zip(dataframe[id_col_name], tokens_per_row))
    # Flatten directly instead of `apply(pd.Series).stack()`: that builds one
    # Series per row and only works because stack() discards the NaN padding
    # added to rows with fewer tokens. The tokens and their order are unchanged.
    features2 = pd.Series(
        [token for tokens in tokens_per_row for token in tokens], dtype=object)
    return features1, features2

In [8]:
# Columns used as movie features (the movie id itself is one of them).
items_f = [
    'Movie_Score',
    'Review_Count',
    'Movie_Star_Distribution',
    'item_int',
    'Movie_Tags',
]

# Columns used as user features (the user id itself is one of them).
users_f = ['User_Comment_Distribution', 'uid_int']

# NOTE(review): tokens are plain strings without a column prefix, so an integer
# Review_Count and an equal item_int map to the same token — confirm this is acceptable.
item_rows, item_feature_list = create_features(train_df, items_f, 'item_int')
train_df['items_features'] = item_rows

user_rows, user_feature_list = create_features(train_df, users_f, 'uid_int')
train_df['users_features'] = user_rows


train_df.head(1)

Unnamed: 0,ID,Movie_Name,Movie_Score,Review_Count,Movie_Star_Distribution,Collect_Date,Username,Post_Date,Score,User_Comment,User_Comment_Distribution,Comment_Like_Count,Movie_Tags,uid_int,item_int,items_features,users_features
0,0,"1988年的妮可 Nico, 1988",7.5,565,15.2%48.2%32.3%3.4%0.8%,2019-10-05,尾黑,2018-06-23,3,成本低廉的PPT电影，用Nico生命中最后一年发生的事给Nico的歌配上情节，倒不算尴尬。女...,66%31%3%,4,"['音乐', '电影', '儿子', '丝绒', '人物', '传记', '传记片', '歌...",0,0,"(0, [7.5, 565, 15.2%48.2%32.3%3.4%0.8%, 0, ['音...","(0, [66%31%3%, 0])"


In [9]:
# Create the LightFM Dataset and register all user ids, item ids and feature tokens.
make_dataset = Dataset()
make_dataset.fit(
    users=train_df['uid_int'],
    items=train_df['item_int'],
    user_features=user_feature_list,
    item_features=item_feature_list,
)

In [10]:
# Build the interaction and weight matrices from (user id, item id, score) triples.
rating_triples = list(
    zip(train_df['uid_int'], train_df['item_int'], train_df['Score']))
interactions, weights = make_dataset.build_interactions(rating_triples)

# Show the distinct user rating values.
train_df['Score'].unique()

array([3, 1, 2, 4, 5])

model train

In [11]:
# Model used for the hold-out evaluation.
model = LightFM(no_components=30, random_state=1)

# Fixed 80/20 split: seed the shuffle so that the same train/test partition
# (and therefore a comparable AUC) is produced on every run. Without a
# random_state the split changes each time the cell is executed.
train, test = cross_validation.random_train_test_split(
    interactions, test_percentage=0.2,
    random_state=np.random.RandomState(1))

In [12]:
# Build the item and user feature matrices from the per-row (id, tokens) pairs
# stored in train_df.
items_features = make_dataset.build_item_features(train_df['items_features'])
users_features = make_dataset.build_user_features(train_df['users_features'])
# Fit on the training split only and report the wall time.
# NOTE(review): no sample_weight is passed here, whereas the later fit on the
# full interaction matrix passes `weights` — confirm this difference is intended.
%time model.fit(train, item_features=items_features, user_features=users_features, epochs=10, verbose=False)

# Evaluate on the held-out split: mean of the values returned by auc_score.
auc = auc_score(model, test, item_features=items_features,
                user_features=users_features)
np.mean(auc)

CPU times: user 620 ms, sys: 4 ms, total: 624 ms
Wall time: 626 ms


0.6973772

### 首页用户个性化推荐
1. 包含召回和排序两个部分。
2. 适合首页 Feed 流的离线推荐计算场景。

In [13]:
# Train on all interactions for 30 epochs, passing `weights` as sample_weight,
# and report the wall time.
%time model.fit(interactions, sample_weight=weights, item_features=items_features, user_features=users_features, epochs=30,  verbose=False)

CPU times: user 2.47 s, sys: 0 ns, total: 2.47 s
Wall time: 2.47 s


<lightfm.lightfm.LightFM at 0x7f10e372dac8>

召回
1. 过滤用户看过的
2. 尽可能找到用户喜欢的

In [14]:
# The user to recommend for, selected by name.
Username = '尾黑'

# Convert the name to its integer id.
user_x = user_dict[Username]

# All rows of this user, reduced to one row per movie (first occurrence kept).
rows_of_user = train_df[train_df['uid_int'] == user_x]
allready_knews = rows_of_user.drop_duplicates('item_int')

allready_knews.head()

[0]


Unnamed: 0,ID,Movie_Name,Movie_Score,Review_Count,Movie_Star_Distribution,Collect_Date,Username,Post_Date,Score,User_Comment,User_Comment_Distribution,Comment_Like_Count,Movie_Tags,uid_int,item_int,items_features,users_features
0,0,"1988年的妮可 Nico, 1988",7.5,565,15.2%48.2%32.3%3.4%0.8%,2019-10-05,尾黑,2018-06-23,3,成本低廉的PPT电影，用Nico生命中最后一年发生的事给Nico的歌配上情节，倒不算尴尬。女...,66%31%3%,4,"['音乐', '电影', '儿子', '丝绒', '人物', '传记', '传记片', '歌...",0,0,"(0, [7.5, 565, 15.2%48.2%32.3%3.4%0.8%, 0, ['音...","(0, [66%31%3%, 0])"


暂时使用全量用户没看过的，且电影分数大于 Threshold 的电影作为召回。

In [15]:
# Drop the movies the user already knows; keep one row per remaining movie.
known_items = allready_knews['item_int'].tolist()
is_unseen = ~train_df['item_int'].isin(known_items)
unseen_movies = train_df[is_unseen].drop_duplicates('item_int')

# Keep only well-rated movies: Movie_Score strictly above the threshold.
Threshold = 8
df_use_for_prediction = unseen_movies[unseen_movies['Movie_Score'] > Threshold]

对召回结果进行打分预测。

In [16]:
# Score every recalled movie for the user. assign() returns a new frame, so the
# score column is not written onto a boolean-filtered slice of train_df (which
# raises SettingWithCopyWarning), and re-running the cell is safe.
candidate_ids = df_use_for_prediction['item_int'].tolist()
df_use_for_prediction = df_use_for_prediction.assign(
    rec_score=model.predict(user_x, candidate_ids,
                            item_features=items_features,
                            user_features=users_features))

# Keep the top-k movies by predicted score.
top_rec_item = 5
rec_list_ = df_use_for_prediction.sort_values(
    by='rec_score', ascending=False)[:top_rec_item]['item_int']

# Recommended movie ids, best first.
rec_list_.tolist()

[8, 15]

In [17]:
# Look up the movie information for the recommended ids (one row per movie).
rec_ids = rec_list_.tolist()
result = train_df.loc[
    train_df['item_int'].isin(rec_ids),
    ['Movie_Name', 'item_int', 'Movie_Score', 'Movie_Tags'],
].drop_duplicates('Movie_Name', keep='first')

# isin() returns rows in train_df order, which discards the ranking; put the
# rows back in recommendation order (best first).
rank_of = {item_id: pos for pos, item_id in enumerate(rec_ids)}
result = result.loc[result['item_int'].map(rank_of).sort_values().index]

result.head(top_rec_item)

Unnamed: 0,Movie_Name,item_int,Movie_Score,Movie_Tags
2364,一年级生 The First Grader,8,8.1,"['老人', '故事', '电影', '孩子', '历史', '励志', '影片', '剧情..."
5394,三个臭皮匠 The Three Stooges,15,9.3,"['喜剧', '小时候', '时候', '经典', '电视', '电影', '电视剧', '..."


### 看了还看
1. 根据当前正在看的内容，找到与其相关的其他内容。
2. 对结果依旧使用阈值过滤，挑选优质内容。

假设当前看的电影。

In [18]:
# Look the movie up by its title and show one row for it.
movie_name = '1988年的妮可 Nico, 1988'
matches_title = train_df['Movie_Name'] == movie_name
train_df[matches_title].drop_duplicates('item_int').head()

Unnamed: 0,ID,Movie_Name,Movie_Score,Review_Count,Movie_Star_Distribution,Collect_Date,Username,Post_Date,Score,User_Comment,User_Comment_Distribution,Comment_Like_Count,Movie_Tags,uid_int,item_int,items_features,users_features
0,0,"1988年的妮可 Nico, 1988",7.5,565,15.2%48.2%32.3%3.4%0.8%,2019-10-05,尾黑,2018-06-23,3,成本低廉的PPT电影，用Nico生命中最后一年发生的事给Nico的歌配上情节，倒不算尴尬。女...,66%31%3%,4,"['音乐', '电影', '儿子', '丝绒', '人物', '传记', '传记片', '歌...",0,0,"(0, [7.5, 565, 15.2%48.2%32.3%3.4%0.8%, 0, ['音...","(0, [66%31%3%, 0])"


In [39]:
# Embedding matrix learned by the model for the item side, as a sparse matrix.
embedding_movie = csr_matrix(model.item_embeddings)

# Item feature matrix built earlier, as CSR.
feature_movie = csr_matrix(items_features)

# Per-movie vectors = feature matrix x embedding matrix. Use the sparse
# matrix's own dot(): np.dot is not sparse-aware and only gives a matrix
# product here through an object-array fallback.
inner_max = feature_movie.dot(embedding_movie)

# Pairwise cosine similarity between the movie vectors.
movie_cosine_sim = cosine_similarity(inner_max)

In [48]:
# Find the movies with the highest cosine similarity to the queried movie.
def next_movie_recommendations(movie_name, cosine_sim, topk, name_to_id=None):
    """Return the ``topk`` movies most similar to ``movie_name``.

    Args:
        movie_name: title of the query movie; must be a key of the mapping.
        cosine_sim: square similarity matrix indexed by integer movie id
            (anything supporting ``cosine_sim[i]`` and iteration over a row).
        topk: maximum number of neighbours to return.
        name_to_id: optional mapping from title to integer id; defaults to
            the notebook-level ``item_dict``.

    Returns:
        A list of ``(movie_id, similarity)`` tuples sorted by similarity,
        highest first, never containing the query movie itself.

    Raises:
        KeyError: if ``movie_name`` is not in the mapping.
    """
    if name_to_id is None:
        name_to_id = item_dict
    # Convert the movie title to its integer id.
    int_id = name_to_id[movie_name]
    # Exclude the query by id rather than by dropping the first sorted entry:
    # another movie with the same similarity can sort ahead of the query, in
    # which case slicing off position 0 would drop that movie and keep the query.
    candidates = [
        (idx, score)
        for idx, score in enumerate(cosine_sim[int_id])
        if idx != int_id
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates[:topk]

推理用户接下来看的 topk 个电影。

In [50]:
# Ask for the 20 movies most similar to the current one.
topk = 20
next_movie = next_movie_recommendations(
    movie_name=movie_name, cosine_sim=movie_cosine_sim, topk=topk)

查询看了还看的电影信息，并增加阈值过滤，保证推荐质量。

In [60]:
# Fetch one row per similar movie.
similar_ids = [pair[0] for pair in next_movie]
next_movie_result = train_df.loc[
    train_df['item_int'].isin(similar_ids)
].drop_duplicates('item_int', keep='first')

# isin() returns rows in train_df order; restore the similarity ranking so that
# head() below shows the most similar movies rather than the earliest rows.
rank_of = {item_id: pos for pos, item_id in enumerate(similar_ids)}
next_movie_result = next_movie_result.loc[
    next_movie_result['item_int'].map(rank_of).sort_values().index]

# Keep only movies whose score reaches the quality threshold.
next_movie_result = next_movie_result[next_movie_result['Movie_Score'] >= Threshold]

# Show the recommended next movies.
next_movie_result.head()[['Movie_Name', 'item_int', 'Movie_Score', 'Movie_Tags']]

Unnamed: 0,Movie_Name,item_int,Movie_Score,Movie_Tags
5394,三个臭皮匠 The Three Stooges,15,9.3,"['喜剧', '小时候', '时候', '经典', '电视', '电影', '电视剧', '..."
8811,不朽真情 Immortal Beloved,23,8.0,"['音乐', '电影', '爱情', '故事', '有点', '剧情', '天才', '狗血..."


### Push 推荐
1. 通过电影找用户，解决电影冷启动问题

### 相关用户推荐
1. 相关用户推荐，找到用户相关的用户，挖掘用户的潜在兴趣。
2. 增加平台和用户的交互。

假设当前的用户。

In [47]:
# Look the user up by name and show one row for them.
user_name = '尾黑'
matches_user = train_df['Username'] == user_name
train_df[matches_user].drop_duplicates('uid_int').head()

Unnamed: 0,ID,Movie_Name,Movie_Score,Review_Count,Movie_Star_Distribution,Collect_Date,Username,Post_Date,Score,User_Comment,User_Comment_Distribution,Comment_Like_Count,Movie_Tags,uid_int,item_int,items_features,users_features
0,0,"1988年的妮可 Nico, 1988",7.5,565,15.2%48.2%32.3%3.4%0.8%,2019-10-05,尾黑,2018-06-23,3,成本低廉的PPT电影，用Nico生命中最后一年发生的事给Nico的歌配上情节，倒不算尴尬。女...,66%31%3%,4,"['音乐', '电影', '儿子', '丝绒', '人物', '传记', '传记片', '歌...",0,0,"(0, [7.5, 565, 15.2%48.2%32.3%3.4%0.8%, 0, ['音...","(0, [66%31%3%, 0])"


In [43]:
# Embedding matrix learned by the model for the user side, as a sparse matrix.
embedding_user = csr_matrix(model.user_embeddings)

# User feature matrix built earlier, as CSR.
feature_user = csr_matrix(users_features)

# Per-user vectors = feature matrix x embedding matrix. Use the sparse
# matrix's own dot(): np.dot is not sparse-aware and only gives a matrix
# product here through an object-array fallback.
inner_max_user = feature_user.dot(embedding_user)

# Pairwise cosine similarity between the user vectors.
user_cosine_sim = cosine_similarity(inner_max_user)

In [52]:
# Find the users with the highest cosine similarity to the queried user.
def sim_user_recommendations(user_name, cosine_sim, topk, name_to_id=None):
    """Return the ``topk`` users most similar to ``user_name``.

    Args:
        user_name: name of the query user; must be a key of the mapping.
        cosine_sim: square similarity matrix indexed by integer user id
            (anything supporting ``cosine_sim[i]`` and iteration over a row).
        topk: maximum number of neighbours to return.
        name_to_id: optional mapping from user name to integer id; defaults
            to the notebook-level ``user_dict``.

    Returns:
        A list of ``(user_id, similarity)`` tuples sorted by similarity,
        highest first, never containing the query user itself.

    Raises:
        KeyError: if ``user_name`` is not in the mapping.
    """
    if name_to_id is None:
        name_to_id = user_dict
    # Convert the user name to its integer id.
    int_id = name_to_id[user_name]
    # Exclude the query by id rather than by dropping the first sorted entry:
    # another user with the same similarity can sort ahead of the query, in
    # which case slicing off position 0 would drop that user and keep the query.
    candidates = [
        (idx, score)
        for idx, score in enumerate(cosine_sim[int_id])
        if idx != int_id
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates[:topk]

找到 topk 个相关用户。

In [61]:
# Ask for the 5 users most similar to the current one.
topk = 5
sim_user = sim_user_recommendations(
    user_name=user_name, cosine_sim=user_cosine_sim, topk=topk)

In [62]:
# Fetch one row per similar user.
similar_uids = [pair[0] for pair in sim_user]
sim_user_result = train_df.loc[
    train_df['uid_int'].isin(similar_uids)
].drop_duplicates('uid_int', keep='first')

# isin() returns rows in train_df order; restore the similarity ranking so the
# most similar user is listed first.
rank_of = {uid: pos for pos, uid in enumerate(similar_uids)}
sim_user_result = sim_user_result.loc[
    sim_user_result['uid_int'].map(rank_of).sort_values().index]

# TODO: a quality filter for users could be added here, e.g.
# Threshold
# sim_user_result = sim_user_result[sim_user_result['User_level'] >= Threshold]

# Show the related users.
sim_user_result.head()[['Username','User_Comment_Distribution']]

Unnamed: 0,Username,User_Comment_Distribution
38,青原,66%31%3%
85,琴萧一曲,66%31%3%
125,夜间门房,66%31%3%
151,豆友185134492,66%31%3%
158,张家玮,66%31%3%
