In [1]:
import pandas as pd

In [2]:
# demo 验证
train_df = pd.read_csv('Data/New_Movie_RS.csv',nrows=10000)
print(train_df.shape)
train_df.head(1)

(10000, 13)


Unnamed: 0,ID,Movie_Name,Movie_Score,Review_Count,Movie_Star_Distribution,Collect_Date,Username,Post_Date,Score,User_Comment,User_Comment_Distribution,Comment_Like_Count,Movie_Tags
0,0,"1988年的妮可 Nico, 1988",7.5,565,15.2%48.2%32.3%3.4%0.8%,2019-10-05,尾黑,2018-06-23,3,成本低廉的PPT电影，用Nico生命中最后一年发生的事给Nico的歌配上情节，倒不算尴尬。女...,66%31%3%,4,"['音乐', '电影', '儿子', '丝绒', '人物', '传记', '传记片', '歌..."


In [3]:
# 去除空值
train_df.dropna(axis=0, how='any', inplace=True)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9999 entries, 0 to 9999
Data columns (total 13 columns):
ID                           9999 non-null int64
Movie_Name                   9999 non-null object
Movie_Score                  9999 non-null float64
Review_Count                 9999 non-null int64
Movie_Star_Distribution      9999 non-null object
Collect_Date                 9999 non-null object
Username                     9999 non-null object
Post_Date                    9999 non-null object
Score                        9999 non-null int64
User_Comment                 9999 non-null object
User_Comment_Distribution    9999 non-null object
Comment_Like_Count           9999 non-null int64
Movie_Tags                   9999 non-null object
dtypes: float64(1), int64(4), object(8)
memory usage: 1.1+ MB


In [4]:
!pip install lightfm

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [5]:
from sklearn.metrics.pairwise import cosine_similarity
from lightfm import LightFM, cross_validation
from scipy.sparse import csr_matrix, coo_matrix
from lightfm.evaluation import auc_score 
from lightfm.data import Dataset
import numpy as np

In [6]:
# 建立用户名和 id 映射的字典
user_dict = {}
for index, value in enumerate(train_df['Username'].unique()):
    user_dict[value] = index
train_df['uid_int'] = train_df['Username'].apply(lambda x: user_dict[x])
# 字典翻转
reverse_user_dict = {v: k for k, v in user_dict.items()}

# 建立电影名和 id 映射的字典
item_dict = {}
for index, value in enumerate(train_df['Movie_Name'].unique()):
    item_dict[value] = index
train_df['item_int'] = train_df['Movie_Name'].apply(lambda x: item_dict[x])
# 字典翻转
reverse_item_dict = {v: k for k, v in item_dict.items()}

#### 设置电影和用户特征

In [7]:
def create_features(dataframe, features_name, id_col_name):
    features = dataframe[features_name].apply(
        lambda x: ','.join(x.map(str)), axis=1)
    features = features.str.split(',')
    features1 = list(zip(dataframe[id_col_name], features))
    features2 = features.apply(pd.Series).stack().reset_index(drop=True)
    return features1, features2

In [8]:
# 电影特征
items_f = ['Movie_Score', 'Review_Count','item_int' ]

# 用户特征
users_f = ['uid_int','User_Comment_Distribution']

train_df['items_features'], item_feature_list = create_features(
    train_df, items_f, 'item_int')

train_df['users_features'], user_feature_list = create_features(
    train_df, users_f, 'uid_int')


train_df.head(1)

Unnamed: 0,ID,Movie_Name,Movie_Score,Review_Count,Movie_Star_Distribution,Collect_Date,Username,Post_Date,Score,User_Comment,User_Comment_Distribution,Comment_Like_Count,Movie_Tags,uid_int,item_int,items_features,users_features
0,0,"1988年的妮可 Nico, 1988",7.5,565,15.2%48.2%32.3%3.4%0.8%,2019-10-05,尾黑,2018-06-23,3,成本低廉的PPT电影，用Nico生命中最后一年发生的事给Nico的歌配上情节，倒不算尴尬。女...,66%31%3%,4,"['音乐', '电影', '儿子', '丝绒', '人物', '传记', '传记片', '歌...",0,0,"(0, [7.5, 565.0, 0.0])","(0, [0, 66%31%3%])"


In [9]:
# 开始制作数据集
make_dataset = Dataset()
make_dataset.fit(train_df['uid_int'], train_df['item_int'],
                 item_features=item_feature_list, user_features=user_feature_list)

In [10]:
# 构建打分矩阵
interactions, weights = make_dataset.build_interactions(
    list(zip(train_df['uid_int'], train_df['item_int'], train_df['Score'])))

# 打印用户评分
train_df['Score'].unique()

array([3, 1, 2, 4, 5])

数据集划分

In [11]:
# model load
model_test = LightFM(no_components=30, random_state=1)

# 固定划分数据
train, test = cross_validation.random_train_test_split(
    interactions, test_percentage=0.2)

模型训练

In [12]:
# 电影特征处理
items_features = make_dataset.build_item_features(train_df['items_features'])

# 用户特征处理
users_features = make_dataset.build_user_features(train_df['users_features'])

# model train
%time model_test.fit(train, item_features=items_features, user_features=users_features, epochs=10, verbose=False)

CPU times: user 236 ms, sys: 8 ms, total: 244 ms
Wall time: 242 ms


<lightfm.lightfm.LightFM at 0x7f348552fa20>

模型验证

In [13]:
# 开始验证模型
auc = auc_score(model_test, test, item_features=items_features,
                user_features=users_features)

# 计算 auc 的均值
np.mean(auc)

0.736518

### 首页用户个性化推荐
1. 包含召回和排序两个部分。
2. 适合首页 Feed 流的离线推荐计算场景。

使用全量数据进行推荐模型训练。

In [14]:
# 推荐模型训练
model = LightFM(no_components=30, random_state=1)

%time model.fit(interactions, sample_weight=weights, item_features=items_features, user_features=users_features, epochs=30,  verbose=False)

CPU times: user 856 ms, sys: 4 ms, total: 860 ms
Wall time: 859 ms


<lightfm.lightfm.LightFM at 0x7f34855127f0>

召回
1. 过滤用户看过的
2. 尽可能找到用户喜欢的

In [15]:
# 按照用户名进行推荐
Username = '尾黑'

# id 转换
user_x = user_dict[Username]

# 对电影进行去重
allready_knews = train_df.loc[train_df['uid_int'].isin([user_x])].drop_duplicates('item_int', keep='first', inplace=False)

暂时使用全量用户没看过的，且电影分数大于 Threshold 的电影作为召回。

In [16]:
# 过滤看过的内容
known_items = allready_knews['item_int'].tolist()
df_use_for_prediction = train_df.loc[~train_df['item_int'].isin(known_items)].drop_duplicates(
    'item_int', keep='first', inplace=False)

# 挑选高质量的电影
Threshold = 8
df_use_for_prediction = df_use_for_prediction[df_use_for_prediction['Movie_Score'] > Threshold]

df_use_for_prediction.head(1)

Unnamed: 0,ID,Movie_Name,Movie_Score,Review_Count,Movie_Star_Distribution,Collect_Date,Username,Post_Date,Score,User_Comment,User_Comment_Distribution,Comment_Like_Count,Movie_Tags,uid_int,item_int,items_features,users_features
2364,2394,一年级生 The First Grader,8.1,2483,28.9%47.6%22.5%0.9%0.2%,2019-10-05,mark,2012-05-08,3,这是一部典型的慢热电影，前面部分预示着电影的两条线，一条是老人要读小学，一条是老人早年为肯尼...,77%21%2%,0,"['老人', '故事', '电影', '孩子', '历史', '励志', '影片', '剧情...",1025,8,"(8, [8.1, 2483.0, 8.0])","(1025, [1025, 77%21%2%])"


对召回的结果进行排序。

In [17]:
# 对过滤之后的内容进行排序
df_use_for_prediction['rec_score'] = model.predict(user_ids=user_x, item_ids=df_use_for_prediction['item_int'].tolist(),
                                                   item_features=items_features, user_features=users_features)

df_use_for_prediction.head(1)

Unnamed: 0,ID,Movie_Name,Movie_Score,Review_Count,Movie_Star_Distribution,Collect_Date,Username,Post_Date,Score,User_Comment,User_Comment_Distribution,Comment_Like_Count,Movie_Tags,uid_int,item_int,items_features,users_features,rec_score
2364,2394,一年级生 The First Grader,8.1,2483,28.9%47.6%22.5%0.9%0.2%,2019-10-05,mark,2012-05-08,3,这是一部典型的慢热电影，前面部分预示着电影的两条线，一条是老人要读小学，一条是老人早年为肯尼...,77%21%2%,0,"['老人', '故事', '电影', '孩子', '历史', '励志', '影片', '剧情...",1025,8,"(8, [8.1, 2483.0, 8.0])","(1025, [1025, 77%21%2%])",5.618755


对排序结果进行 topk 选取。

In [18]:
# 取 topk
top_rec_item = 5
rec_list_ = df_use_for_prediction.sort_values(by='rec_score', ascending=False)[:top_rec_item]['item_int']

# 排序后的电影推荐列表
rec_list_.tolist()

[8, 15]

In [19]:
# 利用推荐列表找出电影信息
result = train_df.loc[train_df['item_int'].isin(rec_list_.tolist())][[
    'Movie_Name', 'item_int', 'Movie_Score', 'Movie_Tags']].drop_duplicates('Movie_Name', keep='first', inplace=False)

# 推荐结果显示
result.head(1)

Unnamed: 0,Movie_Name,item_int,Movie_Score,Movie_Tags
2364,一年级生 The First Grader,8,8.1,"['老人', '故事', '电影', '孩子', '历史', '励志', '影片', '剧情..."


### 看了还看
1. 根据当前看的内容找到看过此次内容相关的内容。
2. 对结果依旧使用阈值过滤，挑选优质内容。

In [20]:
# 提取电影特征向量
embedding_movie, feature_movie = csr_matrix(
    model.item_embeddings), csr_matrix(items_features)

# 电影特征提取
movie_inner_max = np.dot(feature_movie, embedding_movie)

# 电影之间求得余弦相似度
movie_cosine_sim = cosine_similarity(movie_inner_max)

In [21]:
# 找到与查询电影最近余弦距离上的电影
def next_movie_recommendations(movie_name, cosine_sim, topk):
    # 电影名转换唯一的 id
    int_id = item_dict[movie_name]
    # 去除自身 id
    sim_scores = sorted(
        list(enumerate(cosine_sim[int_id])),
        key=lambda x: x[1], reverse=True)[1:topk+1]
    return sim_scores

假设当前看的电影。

In [22]:
# 利用电影名查询
movie_name = '55步 55 Steps'

train_df.loc[train_df['Movie_Name'].isin([movie_name])].drop_duplicates(
    'item_int', keep='first', inplace=False).head(1)

Unnamed: 0,ID,Movie_Name,Movie_Score,Review_Count,Movie_Star_Distribution,Collect_Date,Username,Post_Date,Score,User_Comment,User_Comment_Distribution,Comment_Like_Count,Movie_Tags,uid_int,item_int,items_features,users_features
1411,1433,55步 55 Steps,7.9,890,23.2%51.5%24.3%0.8%0.3%,2019-10-05,vivi,2018-10-20,3,Helena Bonham Carter 的演技其实确实一直都这么好啊。我们之所以被她们所感...,75%22%3%,4,"['电影', '律师', '女主', '法律', '题材', '患者', '故事', '精神...",727,3,"(3, [7.9, 890.0, 3.0])","(727, [727, 75%22%3%])"


推理用户接下来看的 topk 个电影，并去除当前看的电影和自己已经看过的电影。

In [23]:
topk = 20
next_movie = [i[0] for i in next_movie_recommendations(
    movie_name, movie_cosine_sim, topk)]

#  推荐和看过的取差集
next_list = list(set(next_movie).difference(set(known_items)))
next_list

[1, 2, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 17, 18, 19, 21, 22, 23, 25]

增加阈值过滤进行下一个结果输出，保证推荐质量。

In [24]:
next_movie_result = train_df.loc[train_df['item_int'].isin(
    next_list)].drop_duplicates('item_int', keep='first', inplace=False)


# 使用 Threshold 进行过滤
Threshold = 7.5
next_movie_result = next_movie_result[next_movie_result['Movie_Score'] >= Threshold]

# 显示下一个电影的推荐结果
next_movie_result.head()[['Movie_Name', 'item_int',
                          'Movie_Score', 'Movie_Tags']]

Unnamed: 0,Movie_Name,item_int,Movie_Score,Movie_Tags
159,24小时狂欢派对 24 Hour Party People,1,7.9,"['音乐', '纪录片', '电影', '时候', '演员', '片子', '工厂', '乐..."
373,42号传奇 42,2,7.9,"['成就', '励志', '桥段', '夫妻', '棒球', '电影', '核心', '数字..."
2042,一个人的遭遇 Судьба человека,6,7.9,"['电影', '镜头', '时代', '摄影', '画面', '影片', '小说', '感觉..."
2132,一代巨星桑杰君 Sanju,7,7.5,"['电影', '故事', '父亲', '友情', '人生', '媒体', '亲情', '朋友..."
2364,一年级生 The First Grader,8,8.1,"['老人', '故事', '电影', '孩子', '历史', '励志', '影片', '剧情..."


### Push 推荐
1. 通过计算每位用户对单一电影的兴趣度，去除看过的即可得出 topk 的 push 结果。
2. 通过电影找用户，解决电影冷启动问题。

In [25]:
movie_int = item_dict[movie_name]

# 查看当前物品
item_rec_user = train_df.loc[train_df['item_int'].isin([movie_int])].drop_duplicates('item_int', keep='first', inplace=False)
item_rec_user.head(1)

Unnamed: 0,ID,Movie_Name,Movie_Score,Review_Count,Movie_Star_Distribution,Collect_Date,Username,Post_Date,Score,User_Comment,User_Comment_Distribution,Comment_Like_Count,Movie_Tags,uid_int,item_int,items_features,users_features
1411,1433,55步 55 Steps,7.9,890,23.2%51.5%24.3%0.8%0.3%,2019-10-05,vivi,2018-10-20,3,Helena Bonham Carter 的演技其实确实一直都这么好啊。我们之所以被她们所感...,75%22%3%,4,"['电影', '律师', '女主', '法律', '题材', '患者', '故事', '精神...",727,3,"(3, [7.9, 890.0, 3.0])","(727, [727, 75%22%3%])"


In [26]:
movie_name

'55步 55 Steps'

In [27]:
# 利用交互矩阵的维度
n_users, n_movies = interactions.shape

movie2user_rec = model.predict(user_ids=np.arange(n_users), item_ids=np.repeat(
    movie_int, n_users), item_features=items_features, user_features=users_features)

# 建立感兴趣的用户索引倒排
wait_rec_list = np.argsort(-movie2user_rec).tolist()

In [28]:
# 找出看过此商品的用户 id
item_known_users_int =  train_df[train_df['item_int'] == movie_int]['uid_int'].unique().tolist()

推荐候选集合过滤，得出最终的 Push 用户列表。

In [29]:
push_topk = 10
push_result = []
count = 0
# 增加早停
for x in wait_rec_list:
    if x not in item_known_users_int:
        push_result.append(x)
        count +=1
        if count > push_topk:
            break
push_result

[764, 748, 737, 813, 200, 1285, 212, 750, 1276, 1077, 990]

打印待 Push 用户的信息。

In [30]:
item_rec_user = train_df.loc[train_df['uid_int'].isin(push_result)].drop_duplicates('uid_int', keep='first', inplace=False)[users_f]
item_rec_user.head()

Unnamed: 0,uid_int,User_Comment_Distribution
201,200,75%19%6%
213,212,75%19%6%
743,737,72%25%3%
754,748,72%25%3%
756,750,72%25%3%


### 相关用户推荐
1. 相关用户推荐，找到用户相关的用户，挖掘用户的潜在兴趣。
2. 增加平台和用户的交互。

In [31]:
#  提取用户特征向量
embedding_user, feature_user = csr_matrix(
    model.user_embeddings), csr_matrix(users_features)

# 用户特征提取
user_inner_max = np.dot(feature_user, embedding_user)

# 用户之间求得余弦相似度，容易因用户数量过大造成 MemoryError
user_cosine_sim = cosine_similarity(user_inner_max)

In [32]:
# 找到与查询电影最近余弦距离上的电影
def sim_user_recommendations(user_name, cosine_sim, topk):
    # 用户名转换唯一的 id
    int_id = user_dict[user_name]
    # 去除自身 id，建立倒排索引
    sim_scores = sorted(
        list(enumerate(cosine_sim[int_id])),
        key=lambda x: x[1], reverse=True)[1:topk+1]
    return sim_scores

假设当前的用户。

In [36]:
# 直接使用 Username 作为当前用户名

train_df.loc[train_df['Username'].isin([Username])].drop_duplicates(
    'uid_int', keep='first', inplace=False).head()

Unnamed: 0,ID,Movie_Name,Movie_Score,Review_Count,Movie_Star_Distribution,Collect_Date,Username,Post_Date,Score,User_Comment,User_Comment_Distribution,Comment_Like_Count,Movie_Tags,uid_int,item_int,items_features,users_features
0,0,"1988年的妮可 Nico, 1988",7.5,565,15.2%48.2%32.3%3.4%0.8%,2019-10-05,尾黑,2018-06-23,3,成本低廉的PPT电影，用Nico生命中最后一年发生的事给Nico的歌配上情节，倒不算尴尬。女...,66%31%3%,4,"['音乐', '电影', '儿子', '丝绒', '人物', '传记', '传记片', '歌...",0,0,"(0, [7.5, 565.0, 0.0])","(0, [0, 66%31%3%])"


找到 topk 个相关用户。

In [37]:
topk = 5
sim_user_list = [i[0] for i in sim_user_recommendations(user_name, user_cosine_sim, topk)]
sim_user_list

[6953, 7161, 6711, 7041, 7924]

In [38]:
sim_user_result = train_df.loc[train_df['uid_int'].isin(sim_user_list)].drop_duplicates('uid_int', keep='first', inplace=False)

# todo：可以添加优质用户筛选策略
# Threshold 
# sim_user_result = sim_user_result[sim_user_result['User_level'] >= Threshold]

# 显示相关用户结果
sim_user_result.head()[['Username','User_Comment_Distribution']]

Unnamed: 0,Username,User_Comment_Distribution
7272,宇宙电影患者,44%44%12%
7561,根本英俊石小叔,44%44%12%
7674,冥王星的灯笼裤,44%44%12%
7829,请叫我王老师,44%44%12%
8685,林潮,44%44%12%
