## 推荐模型训练

In [1]:
import pandas as pd

# 数据加载
train_df = pd.read_csv('Data/Movie_RS.csv')
print(train_df.shape)
train_df.head(1)

(10000, 13)


Unnamed: 0,ID,Movie_Name,Movie_Score,Review_Count,Movie_Star_Distribution,Collect_Date,Username,Post_Date,Score,User_Comment,User_Comment_Distribution,Comment_Like_Count,Movie_Tags
0,0,"1988年的妮可 Nico, 1988",7.5,565,15.2%48.2%32.3%3.4%0.8%,2019-10-05,尾黑,2018-06-23,3,成本低廉的PPT电影，用Nico生命中最后一年发生的事给Nico的歌配上情节，倒不算尴尬。女...,66%31%3%,4,"['音乐', '电影', '儿子', '丝绒', '人物', '传记', '传记片', '歌..."


In [2]:
# 去除空值
train_df.dropna(axis=0, how='any', inplace=True)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9999 entries, 0 to 9999
Data columns (total 13 columns):
ID                           9999 non-null int64
Movie_Name                   9999 non-null object
Movie_Score                  9999 non-null float64
Review_Count                 9999 non-null int64
Movie_Star_Distribution      9999 non-null object
Collect_Date                 9999 non-null object
Username                     9999 non-null object
Post_Date                    9999 non-null object
Score                        9999 non-null int64
User_Comment                 9999 non-null object
User_Comment_Distribution    9999 non-null object
Comment_Like_Count           9999 non-null int64
Movie_Tags                   9999 non-null object
dtypes: float64(1), int64(4), object(8)
memory usage: 1.1+ MB


In [3]:
!pip install lightfm

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [4]:
from sklearn.metrics.pairwise import cosine_similarity
from lightfm import LightFM, cross_validation
from scipy.sparse import csr_matrix, coo_matrix
from lightfm.evaluation import auc_score
from lightfm.data import Dataset
import numpy as np

### 数据预处理

In [5]:
# 建立用户名和 id 映射的字典
user_dict = {value: index for index, value in enumerate(train_df['Username'].unique())}    
    
# 开始对原数据进行转换
train_df['uid_int'] = train_df['Username'].apply(lambda x: user_dict[x])

# 用户 id 字典翻转
reverse_user_dict = {v: k for k, v in user_dict.items()}

# 建立电影名和 id 映射的字典
item_dict = {value: index for index, value in enumerate(train_df['Movie_Name'].unique())}  
    
# 开始对原数据进行转换
train_df['item_int'] = train_df['Movie_Name'].apply(lambda x: item_dict[x])

# 电影 id 字典翻转
reverse_item_dict = {v: k for k, v in item_dict.items()}

# 对电影标签进行处理
train_df['New_Movie_Tags'] = train_df['Movie_Tags'].apply(
    lambda x: ','.join(eval(x)))

# 对电影打分分布进行处理
train_df['New_Movie_Star_Distribution'] = train_df['Movie_Star_Distribution'].apply(
    lambda x: ','.join(x.split('%')))

# 对用户打分分布进行处理
train_df['New_User_Comment_Distribution'] = train_df['User_Comment_Distribution'].apply(
    lambda x: ','.join(x.split('%')))

#### 设置电影和用户特征

In [6]:
train_df.head(1)

Unnamed: 0,ID,Movie_Name,Movie_Score,Review_Count,Movie_Star_Distribution,Collect_Date,Username,Post_Date,Score,User_Comment,User_Comment_Distribution,Comment_Like_Count,Movie_Tags,uid_int,item_int,New_Movie_Tags,New_Movie_Star_Distribution,New_User_Comment_Distribution
0,0,"1988年的妮可 Nico, 1988",7.5,565,15.2%48.2%32.3%3.4%0.8%,2019-10-05,尾黑,2018-06-23,3,成本低廉的PPT电影，用Nico生命中最后一年发生的事给Nico的歌配上情节，倒不算尴尬。女...,66%31%3%,4,"['音乐', '电影', '儿子', '丝绒', '人物', '传记', '传记片', '歌...",0,0,"音乐,电影,儿子,丝绒,人物,传记,传记片,歌声,巅峰,人生,经历,故事,电影节,女主,女人...","15.2,48.2,32.3,3.4,0.8,",66313


In [7]:
# 电影特征
items_f = ['Movie_Score', 'Review_Count', 'item_int',
           'New_Movie_Tags', 'New_Movie_Star_Distribution']

# 用户特征
users_f = ['uid_int', 'New_User_Comment_Distribution']

#### 数据划分

优化特征提取:
1. 电影信息拆分:单独编码
2. 用户信息拆分：单独编码
3. 通过交互信息进行 join

用户交互表划分

In [8]:
user_post_event = train_df[['uid_int', 'item_int', 'Score']]
user_post_event.shape

(9999, 3)

电影信息表划分

In [9]:
movie_info_temp = train_df[items_f].drop_duplicates(
    'item_int', keep='first', inplace=False)
movie_info_temp.shape

(26, 5)

用户信息表划分

In [10]:
user_info_temp = train_df[users_f].drop_duplicates(
    'uid_int', keep='first', inplace=False)
user_info_temp.shape

(9004, 2)

In [11]:
def create_features(dataframe, features_name, id_col_name):
    # 特征分离
    features = dataframe[features_name].apply(
        lambda x: ','.join(x.map(str)), axis=1)
    features = features.str.split(',')
    features1 = list(zip(dataframe[id_col_name], features))
    features2 = features.apply(pd.Series).stack().reset_index(drop=True)
    return features1, features2


# 电影特征编码
%time movie_info_temp['items_features'], item_feature_list = create_features(movie_info_temp, items_f, 'item_int')

CPU times: user 84 ms, sys: 0 ns, total: 84 ms
Wall time: 84.3 ms


In [12]:
# 用户特征编码
%time user_info_temp['users_features'], user_feature_list = create_features(user_info_temp, users_f, 'uid_int')

CPU times: user 4.86 s, sys: 56 ms, total: 4.92 s
Wall time: 4.93 s


In [13]:
# 开始制作数据集
make_dataset = Dataset()
make_dataset.fit(user_info_temp['uid_int'], movie_info_temp['item_int'],
                 item_features=item_feature_list, user_features=user_feature_list)

# 构建打分矩阵
interactions, weights = make_dataset.build_interactions(
    list(zip(user_post_event['uid_int'], user_post_event['item_int'], user_post_event['Score'])))

# 打印用户评分
train_df['Score'].unique()

array([3, 1, 2, 4, 5])

### 训练推荐系统的模型

In [14]:
# model load
model_test = LightFM(no_components=30, random_state=1)

# 固定划分数据
train, test = cross_validation.random_train_test_split(
    interactions, test_percentage=0.2)

模型训练

In [15]:
# 电影特征处理
items_features = make_dataset.build_item_features(
    movie_info_temp['items_features'])

# 用户特征处理
users_features = make_dataset.build_user_features(
    user_info_temp['users_features'])

# model train
%time model_test.fit(train, item_features=items_features, user_features=users_features, epochs=10, verbose=False)

CPU times: user 744 ms, sys: 0 ns, total: 744 ms
Wall time: 745 ms


<lightfm.lightfm.LightFM at 0x7f07ca5c2780>

模型验证

In [16]:
# 对推荐模型的性能进行评测
auc = auc_score(model_test, test, item_features=items_features,
                user_features=users_features)

# 计算 auc 的均值
np.mean(auc)

0.7186843

## 推荐场景梳理

### 首页用户个性化推荐
1. 包含召回和排序两个部分。
2. 适合首页 Feed 流的离线推荐计算场景。

使用全量数据进行推荐模型训练。

In [17]:
# 推荐模型训练
model = LightFM(no_components=30, random_state=1)

%time model.fit(interactions, sample_weight=weights, item_features=items_features, user_features=users_features, epochs=30,  verbose=False)

CPU times: user 3.04 s, sys: 4 ms, total: 3.04 s
Wall time: 3.04 s


<lightfm.lightfm.LightFM at 0x7f07cb5207b8>

召回
1. 过滤用户看过的
2. 尽可能找到用户喜欢的

In [18]:
# 按照用户名进行推荐
Username = '尾黑'

# id 转换
user_x = user_dict[Username]

# 对电影进行去重
allready_knews = train_df.loc[train_df['uid_int'].isin(
    [user_x])].drop_duplicates('item_int', keep='first', inplace=False)

暂时使用全量用户没看过的，且电影分数大于 Threshold 的电影作为召回。

In [19]:
# 过滤看过的内容
known_items = allready_knews['item_int'].tolist()
df_use_for_prediction = train_df.loc[~train_df['item_int'].isin(known_items)].drop_duplicates(
    'item_int', keep='first', inplace=False)

# 挑选高质量的电影
Threshold = 7
df_use_for_prediction = df_use_for_prediction[df_use_for_prediction['Movie_Score'] > Threshold]

df_use_for_prediction.head(1)

Unnamed: 0,ID,Movie_Name,Movie_Score,Review_Count,Movie_Star_Distribution,Collect_Date,Username,Post_Date,Score,User_Comment,User_Comment_Distribution,Comment_Like_Count,Movie_Tags,uid_int,item_int,New_Movie_Tags,New_Movie_Star_Distribution,New_User_Comment_Distribution
159,159,24小时狂欢派对 24 Hour Party People,7.9,1509,26.2%46.9%23.0%3.2%0.7%,2019-10-05,Lane,2008-09-22,3,080921 可能和真是一部for fans only的片子，反正我没看出特别多乐趣。只是W...,75%19%6%,0,"['音乐', '纪录片', '电影', '时候', '演员', '片子', '工厂', '乐...",159,1,"音乐,纪录片,电影,时候,演员,片子,工厂,乐队,感觉,流水,幻觉,文化,厂牌,手法,酸性,...","26.2,46.9,23.0,3.2,0.7,",75196


模型开始对召回的电影候选集进行打分。

In [20]:
df_use_for_prediction['rec_score'] = model.predict(user_ids=user_x, item_ids=df_use_for_prediction['item_int'].tolist(),
                                                   item_features=items_features, user_features=users_features)

对排序结果进行 topk 选取。

In [21]:
# 取 topk
top_rec_item = 5
rec_list_ = df_use_for_prediction.sort_values(by='rec_score', ascending=False)[
    :top_rec_item]['item_int']

# 排序后的电影推荐列表
rec_list_.tolist()

[11, 19, 2, 8, 9]

In [22]:
# 利用推荐列表找出电影信息
result = train_df.loc[train_df['item_int'].isin(rec_list_.tolist())][[
    'Movie_Name', 'item_int', 'Movie_Score', 'Movie_Tags']].drop_duplicates('Movie_Name', keep='first', inplace=False)

# 推荐结果显示
result.head()

Unnamed: 0,Movie_Name,item_int,Movie_Score,Movie_Tags
373,42号传奇 42,2,7.9,"['成就', '励志', '桥段', '夫妻', '棒球', '电影', '核心', '数字..."
2364,一年级生 The First Grader,8,8.1,"['老人', '故事', '电影', '孩子', '历史', '励志', '影片', '剧情..."
2973,一曲难忘 A Song to Remember,9,7.8,"['电影', '音乐', '老师', '教授', '音乐课', '时候', '艺术', '有..."
3679,一轮明月,11,7.9,"['电影', '大师', '人物', '感觉', '人生', '主旋律', '流水账', '..."
6230,不可告人 Ne le dis à personne,19,7.1,"['有点', '故事', '悬疑片', '节奏', '电影', '法国人', '剧情', '..."


### 看了还看
1. 根据当前看的内容找到看过此次内容相关的内容。
2. 对结果依旧使用阈值过滤，挑选优质内容。

In [23]:
# 提取电影特征向量
embedding_movie, feature_movie = csr_matrix(
    model.item_embeddings), csr_matrix(items_features)

# 电影特征提取
movie_inner_max = np.dot(feature_movie, embedding_movie)

# 电影之间求得余弦相似度
movie_cosine_sim = cosine_similarity(movie_inner_max)

In [24]:
# 找到与查询电影最近余弦距离上的电影
def next_movie_recommendations(movie_name, cosine_sim, topk):
    # 电影名转换唯一的 id
    int_id = item_dict[movie_name]
    # 去除自身 id
    sim_scores = sorted(
        list(enumerate(cosine_sim[int_id])),
        key=lambda x: x[1], reverse=True)[1:topk+1]
    return sim_scores

假设当前看的电影。

In [25]:
# 利用电影名查询
movie_name = '24小时狂欢派对 24 Hour Party People'

train_df.loc[train_df['Movie_Name'].isin([movie_name])].drop_duplicates(
    'item_int', keep='first', inplace=False).head(1)

Unnamed: 0,ID,Movie_Name,Movie_Score,Review_Count,Movie_Star_Distribution,Collect_Date,Username,Post_Date,Score,User_Comment,User_Comment_Distribution,Comment_Like_Count,Movie_Tags,uid_int,item_int,New_Movie_Tags,New_Movie_Star_Distribution,New_User_Comment_Distribution
159,159,24小时狂欢派对 24 Hour Party People,7.9,1509,26.2%46.9%23.0%3.2%0.7%,2019-10-05,Lane,2008-09-22,3,080921 可能和真是一部for fans only的片子，反正我没看出特别多乐趣。只是W...,75%19%6%,0,"['音乐', '纪录片', '电影', '时候', '演员', '片子', '工厂', '乐...",159,1,"音乐,纪录片,电影,时候,演员,片子,工厂,乐队,感觉,流水,幻觉,文化,厂牌,手法,酸性,...","26.2,46.9,23.0,3.2,0.7,",75196


推理用户接下来看的 topk 个电影，并去除当前看的电影和自己已经看过的电影。

In [26]:
topk = 20
next_movie = [i[0] for i in next_movie_recommendations(
    movie_name, movie_cosine_sim, topk)]

#  推荐和看过的取差集
next_list = list(set(next_movie).difference(set(known_items)))
next_list

[2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 20, 22, 23, 24, 25]

增加阈值过滤进行下一个结果输出，保证推荐质量。

In [27]:
next_movie_result = train_df.loc[train_df['item_int'].isin(
    next_list)].drop_duplicates('item_int', keep='first', inplace=False)


# 使用 Threshold 进行过滤
Threshold = 8
next_movie_result = next_movie_result[next_movie_result['Movie_Score'] >= Threshold]

# 显示看了还看的推荐结果
next_movie_result.head()[items_f+['Movie_Name']]

Unnamed: 0,Movie_Score,Review_Count,item_int,New_Movie_Tags,New_Movie_Star_Distribution,Movie_Name
2364,8.1,2483,8,"老人,故事,电影,孩子,历史,励志,影片,剧情,力量,国家,片子,民族,有点,政治,黑人,感...","28.9,47.6,22.5,0.9,0.2,",一年级生 The First Grader
5394,9.3,1010,15,"喜剧,小时候,时候,经典,电视,电影,电视剧,西部片,有点,先生,意味,时间,趣事,探险,规...","73.4,20.3,5.8,0.4,0.1,",三个臭皮匠 The Three Stooges
8811,8.0,3581,23,"音乐,电影,爱情,故事,有点,剧情,天才,狗血,时候,传记,月光,星空,音乐家,钢琴,爱人,...","27.5,47.5,21.9,2.5,0.6,",不朽真情 Immortal Beloved


### Push 推荐
1. 通过计算每位用户对单一电影的兴趣度，去除看过的即可得出 topk 的 push 结果。
2. 通过电影找用户，解决电影冷启动问题。

In [28]:
movie_int = item_dict[movie_name]

# 查看当前电影
item_rec_user = train_df.loc[train_df['item_int'].isin(
    [movie_int])].drop_duplicates('item_int', keep='first', inplace=False)
item_rec_user.head(1)

Unnamed: 0,ID,Movie_Name,Movie_Score,Review_Count,Movie_Star_Distribution,Collect_Date,Username,Post_Date,Score,User_Comment,User_Comment_Distribution,Comment_Like_Count,Movie_Tags,uid_int,item_int,New_Movie_Tags,New_Movie_Star_Distribution,New_User_Comment_Distribution
159,159,24小时狂欢派对 24 Hour Party People,7.9,1509,26.2%46.9%23.0%3.2%0.7%,2019-10-05,Lane,2008-09-22,3,080921 可能和真是一部for fans only的片子，反正我没看出特别多乐趣。只是W...,75%19%6%,0,"['音乐', '纪录片', '电影', '时候', '演员', '片子', '工厂', '乐...",159,1,"音乐,纪录片,电影,时候,演员,片子,工厂,乐队,感觉,流水,幻觉,文化,厂牌,手法,酸性,...","26.2,46.9,23.0,3.2,0.7,",75196


In [29]:
movie_name

'24小时狂欢派对 24 Hour Party People'

In [30]:
# 利用交互矩阵的维度
n_users, n_movies = interactions.shape

movie2user_rec = model.predict(user_ids=np.arange(n_users), item_ids=np.repeat(
    movie_int, n_users), item_features=items_features, user_features=users_features)

# 建立感兴趣的用户索引倒排
wait_rec_list = np.argsort(-movie2user_rec).tolist()

In [31]:
# 找出看过此商品的用户 id
item_known_users_int = train_df[train_df['item_int']
                                == movie_int]['uid_int'].unique().tolist()

推荐候选集合过滤，得出最终的 Push 用户列表。

In [32]:
push_topk = 10
push_result = []
count = 0
# 增加早停
for x in wait_rec_list:
    if x not in item_known_users_int:
        push_result.append(x)
        count += 1
        if count > push_topk:
            break
push_result

[3, 6, 833, 1184, 1316, 368, 1390, 1003, 673, 805, 685]

打印待 Push 用户的信息。

In [33]:
item_rec_user = train_df.loc[train_df['uid_int'].isin(
    push_result)].drop_duplicates('uid_int', keep='first', inplace=False)
item_rec_user.head()[users_f+['Username']]

Unnamed: 0,uid_int,New_User_Comment_Distribution,Username
3,3,66313,张晚禾
6,6,66313,Lycidas
373,368,72253,(๑⁼̴̀д⁼̴́๑)
679,673,72253,Justin
691,685,72253,BobMing


### 相关用户推荐
1. 相关用户推荐，找到用户相关的用户，挖掘用户的潜在兴趣。
2. 增加平台和用户的交互。

In [34]:
#  提取用户特征向量
embedding_user, feature_user = csr_matrix(
    model.user_embeddings), csr_matrix(users_features)

# 用户特征提取
user_inner_max = np.dot(feature_user, embedding_user)

# 用户之间求得余弦相似度，容易因用户数量过大造成 MemoryError
user_cosine_sim = cosine_similarity(user_inner_max)

In [35]:
# 找到与查询电影最近余弦距离上的电影
def sim_user_recommendations(user_name, cosine_sim, topk):
    # 用户名转换唯一的 id
    int_id = user_dict[user_name]
    # 去除自身 id，建立倒排索引
    sim_scores = sorted(
        list(enumerate(cosine_sim[int_id])),
        key=lambda x: x[1], reverse=True)[1:topk+1]
    return sim_scores

找到与当前用户 Username 相关的 topk 个相关用户。

In [36]:
topk = 5
sim_user_list = [i[0] for i in sim_user_recommendations(
    Username, user_cosine_sim, topk)]
sim_user_list

[108, 96, 135, 26, 29]

打印相关用户的信息。

In [37]:
sim_user_result = train_df.loc[train_df['uid_int'].isin(
    sim_user_list)].drop_duplicates('uid_int', keep='first', inplace=False)

# todo：可以添加优质用户筛选策略
# Threshold
# sim_user_result = sim_user_result[sim_user_result['User_level'] >= Threshold]

# 显示相关用户结果
sim_user_result.head()[users_f+['Username']]

Unnamed: 0,uid_int,New_User_Comment_Distribution,Username
26,26,66313,宁二狗
29,29,66313,李草木
96,96,66313,Memento
108,108,66313,朴儿
135,135,66313,小五在冬眠
