# Trend趋势推荐 - 加权分数排序

## 加权分数计算公式

$$
\text{Weighted Rating (WR)} = \left(\frac{v}{v+m} \cdot R\right) + \left(\frac{m}{v+m} \cdot C\right)
$$

- $v$ is the number of votes for the movie
- $m$ is the minimum number of votes required to be listed in the chart
- $R$ is the average rating of the movie
- $C$ is the mean vote across the whole report

In [9]:
# 读取ratings得出对于每个电影id的平均得分与评分人数
import pandas as pd
import numpy as np

ratings = pd.read_csv('dataset/themovie/data_origin/ratings.csv')
# Expected columns: ['userId', 'movieId', 'rating', 'timestamp']

# The raw timestamp is parsed as epoch seconds. Only the calendar year is
# needed, so the column is overwritten with it. The vectorized `.dt.year`
# accessor avoids calling a Python lambda once per rating row.
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s').dt.year

# Keep only recent ratings: the latest calendar year present in the data
# plus the year before it.
max_year = ratings['timestamp'].max()
ratings = ratings[ratings['timestamp'] >= max_year - 1]

# Distinct movie ids that still have at least one rating in that window.
movieId = ratings['movieId'].unique()

# Mean rating and number of ratings per movie, computed in a single grouped
# pass (named aggregation) instead of two group-bys followed by a merge.
movie_rating = ratings.groupby('movieId')['rating'].agg(
    rating='mean',
    rating_count='count',
)

# Persist the per-movie statistics; movieId is written as the index column.
movie_rating.to_csv('dataset/themovie/movie_rating.csv', index=True, header=True)

In [10]:
# Vote threshold m: the 90th percentile of per-movie rating counts, so only
# movies at or above it (roughly the top 10% most-rated) stay in the chart.
# NOTE: the printed labels read "keep at least 90% of movieIds", which does
# not match what the filter does; the strings are left untouched here.
m = movie_rating['rating_count'].quantile(0.9)
print('保留至少90%的movieId的评分人数为：', m)

# Apply the threshold and continue with an independent copy of the
# surviving rows.
has_enough_votes = movie_rating['rating_count'] >= m
movie_rating = movie_rating.loc[has_enough_votes].copy()

print('保留至少90%的movieId后，剩余电影数量为：', len(movie_rating))

保留至少90%的movieId的评分人数为： 93.0
保留至少90%的movieId后，剩余电影数量为： 3992


In [11]:
# C: mean of the per-movie average ratings over the movies currently held in
# movie_rating.
C = movie_rating['rating'].mean()

def weighted_rating(x, m=m, C=C):
    """Return the weighted rating WR = v/(v+m) * R + m/(v+m) * C.

    x supplies 'rating_count' (v, the number of votes) and 'rating' (R, the
    average rating). m is the vote threshold and C the prior mean; both
    defaults are captured from the module-level values at the moment this
    function is defined.
    """
    votes = x['rating_count']
    average = x['rating']
    total = votes + m

    return (votes / total * average) + (m / total * C)

# Score every movie in one vectorized call. weighted_rating only indexes the
# 'rating_count' and 'rating' columns and does arithmetic on them, so passing
# the whole DataFrame yields the same per-movie values as a row-by-row
# apply(axis=1), without a Python-level function call per row.
movie_rating['score'] = weighted_rating(movie_rating)

# Rank by score, best first.
movie_rating = movie_rating.sort_values('score', ascending=False)

# Show the top 15.
movie_rating.head(15)

Unnamed: 0_level_0,rating,rating_count,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
159817,4.47878,754,4.357643
318,4.353308,14193,4.346943
1203,4.287671,2263,4.251665
858,4.243621,7760,4.233341
5618,4.234541,4334,4.216495
296,4.222421,9509,4.214218
2959,4.214777,11931,4.208285
1221,4.218783,4781,4.202693
5971,4.205483,1769,4.16403
3000,4.192801,2028,4.156965


In [12]:
# Only the score (indexed by movieId) is exported, so drop the two input
# columns.
movie_rating = movie_rating.drop(columns=['rating', 'rating_count'])

# Write the trend ranking to trend_movie.csv.
movie_rating.to_csv('result/trend_movie.csv', index=True, header=True)

# Content-Based推荐 - 电影内容相似度

## 词向量相似度

In [1]:
import pandas as pd

# Load the movie table from movie.csv.
movie = pd.read_csv('dataset/themovie/data_sql/movie.csv')

# Preview the first five overviews.
overview_preview = movie['overview'].head(5)
print(overview_preview)

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object


  movie = pd.read_csv('dataset/themovie/data_sql/movie.csv')


为每个overview计算`Term Frequency-Inverse Document Frequency` (TF-IDF)向量, 使用scikit-learn的`TfidfVectorizer`函数来生成TF-IDF矩阵


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing overviews become empty strings so the vectorizer only receives text.
movie['overview'] = movie['overview'].fillna('')

# Vectorizer that removes English stop words such as 'the' and 'a'.
tfidf = TfidfVectorizer(stop_words='english')

# Fit on the overviews and build the TF-IDF matrix from them.
tfidf_matrix = tfidf.fit_transform(movie['overview'])

# Display the matrix shape.
tfidf_matrix.shape

(45466, 48620)

计算相似度分数

将使用cosine similarity score来计算overview之间的相似度
$$
similarity = \cos(\theta) = \frac{A \cdot B}{\|A\| \, \|B\|} = \frac{\sum_{i=1}^{n}A_iB_i}{\sqrt{\sum_{i=1}^{n}A_i^2}\sqrt{\sum_{i=1}^{n}B_i^2}}
$$

In [3]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between all TF-IDF rows; with Y omitted,
# linear_kernel compares the matrix with itself.
# NOTE(review): this equals cosine similarity only if the rows are
# unit-normalized - presumably TfidfVectorizer's default; confirm.
cosine_sim = linear_kernel(tfidf_matrix)

我们将定义一个以电影标题为输入的函数，输出20个最相似电影的列表。为此，首先需要构建从电影标题到DataFrame索引的反向映射。

In [4]:
# Reverse lookup from movie title to row label in `movie`. The index holds
# titles, so a title shared by several movies maps to several rows.
# (drop_duplicates acts on the values, i.e. the row labels, not the titles.)
indices = pd.Series(movie.index, index=movie['title']).drop_duplicates()

def get_recommendations(title, cosine_sim=cosine_sim, top_n=20):
    """Return the movies most similar to the one called *title*.

    Args:
        title: Movie title to look up in ``indices``. If several movies
            share the title, the first one is used.
        cosine_sim: Pairwise similarity matrix; row i holds the similarity
            of movie i to every movie.
        top_n: Number of neighbours to return (default 20).

    Returns:
        A tuple ``(movie_indices, sim_scores)``. ``sim_scores`` is a list of
        ``(row_position, similarity)`` pairs ordered from most to least
        similar, and ``movie_indices`` holds just the row positions. These
        are positions in ``cosine_sim``, not movieId values.

    Raises:
        KeyError: If *title* is not present in ``indices``.
    """
    import heapq

    # Row of the requested movie.
    # NOTE(review): assumes `movie` has a default 0..n-1 index, so that row
    # labels and positions in cosine_sim coincide - confirm.
    idx = indices[title]

    # A duplicated title yields a Series of row labels; keep the first one.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Leave the query movie out explicitly. Relying on it being sorted to
    # position 0 breaks when another row ties with it (identical overview)
    # or when its overview is empty and every similarity is 0.
    candidates = (
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    )

    # Same ordering as sorted(..., reverse=True)[:top_n], without sorting
    # every row.
    sim_scores = heapq.nlargest(top_n, candidates, key=lambda pair: pair[1])

    # Row positions of the selected neighbours.
    movie_indices = [position for position, _ in sim_scores]

    return movie_indices, sim_scores

In [66]:
# Example call: neighbours of 'Heat', returned as a list of row positions
# plus a list of (row position, similarity) pairs.
get_recommendations('Heat')

([38878,
  42488,
  13449,
  1436,
  1402,
  11620,
  28894,
  15543,
  20067,
  22153,
  21025,
  5729,
  14187,
  45145,
  4418,
  6491,
  16868,
  20324,
  9418,
  7768],
 [(38878, 0.1999008154988109),
  (42488, 0.1881612126089842),
  (13449, 0.1767770034217985),
  (1436, 0.17138503558603266),
  (1402, 0.15806322749256596),
  (11620, 0.15503867219226491),
  (28894, 0.1548683789157499),
  (15543, 0.152394592626787),
  (20067, 0.15222697691572412),
  (22153, 0.14842828514745462),
  (21025, 0.14573239508646105),
  (5729, 0.1415591028351987),
  (14187, 0.14024451670660942),
  (45145, 0.13889345135547232),
  (4418, 0.13765307408806274),
  (6491, 0.13621694290098224),
  (16868, 0.13604858544970225),
  (20324, 0.1334226789983759),
  (9418, 0.13282367450099755),
  (7768, 0.13159000409852512)])

In [7]:
from tqdm import tqdm

# Distinct movie ids to produce recommendations for.
movieId = movie['movieId'].unique()

# Title of the first row for each movieId, built once so the loop does not
# rescan the whole table with a boolean mask for every movie.
first_title = movie.drop_duplicates('movieId').set_index('movieId')['title']

# get_recommendations returns row positions in `movie`; this array turns
# them back into real movieId values.
movieId_by_position = movie['movieId'].to_numpy()

# Rows of the result table, one per (movie, related movie) pair.
related_movie_list = []

# NOTE: the lookup still goes through the title, so movies sharing a title
# all receive the neighbours of the first movie carrying that title.
for i in tqdm(movieId):
    related_positions, score = get_recommendations(first_title.loc[i])

    for position, (_, similarity) in zip(related_positions, score):
        related_movie_list.append({
            'movieId': i,
            # Store the related movie's id, not its row position.
            'related_movieId': movieId_by_position[position],
            'score': similarity,
        })

# Convert the collected rows to a DataFrame.
related_movie = pd.DataFrame(related_movie_list)

# Save without the positional index, which would otherwise become an
# unnamed first column in the CSV.
related_movie.to_csv('result/related_movie.csv', index=False, header=True)

100%|██████████| 45436/45436 [11:52<00:00, 63.79it/s]


In [8]:
# Re-save related_movie without the DataFrame index so the CSV has no
# unnamed leading column.

related_movie.to_csv('result/related_movie.csv', index=False, header=True)

# 协同过滤用户主页推荐


In [15]:
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

# Load the small ratings file.
ratings = pd.read_csv('dataset/themovie/data_origin/ratings_small.csv')

# Tell Surprise the actual rating range. Reader() defaults to a 1-5 scale,
# while this data uses half-star ratings (e.g. 2.5); taking the bounds from
# the data keeps any rating below 1 inside the scale that predictions are
# clipped to.
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))

ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [16]:
# Hand Surprise the three columns it needs, in (user, item, rating) order.
rating_triples = ratings.loc[:, ['userId', 'movieId', 'rating']]
data = Dataset.load_from_df(rating_triples, reader)

# Evaluate an SVD model with 5-fold cross-validation, reporting RMSE and MAE.
svd = SVD()
cross_validate(svd, data, cv=5, measures=['RMSE', 'MAE'])

{'test_rmse': array([0.89837285, 0.88938036, 0.90212185, 0.90468165, 0.89173562]),
 'test_mae': array([0.69297839, 0.687292  , 0.69400552, 0.69531867, 0.68556561]),
 'fit_time': (0.625288724899292,
  0.6209487915039062,
  0.6138882637023926,
  0.6668648719787598,
  0.643740177154541),
 'test_time': (0.0625617504119873,
  0.06306624412536621,
  0.0644834041595459,
  0.07670426368713379,
  0.06699919700622559)}

In [17]:
# Refit the model on every available rating (no hold-out split) before it
# is used for predictions.
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x277463299c0>

In [18]:
# Show every rating given by user 1.
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [19]:
# Predict user 1's rating for movie 302; the third argument (3) is echoed
# back as r_ui in the returned Prediction.
svd.predict(1, 302, 3)

Prediction(uid=1, iid=302, r_ui=3, est=2.8076565821376933, details={'was_impossible': False})

In [20]:
def predict_rating(userId, movieId):
    """Return the fitted SVD model's estimated rating of movieId by userId."""
    prediction = svd.predict(userId, movieId)
    return prediction.est

def recommend_movie(userId, movieId_list, n=20):
    """Return the n highest-predicted movies for userId.

    Every id in movieId_list is scored with predict_rating. The result is a
    list of (movieId, predicted_rating) tuples, best first.
    """
    # Keyed by movieId, so an id that appears twice is kept once.
    estimates = {
        candidate: predict_rating(userId, candidate)
        for candidate in movieId_list
    }

    def by_estimate(entry):
        return entry[1]

    # Highest estimate first, then cut to the requested length.
    ranked = sorted(estimates.items(), key=by_estimate, reverse=True)
    return ranked[:n]

# Imported here so the cell does not depend on an earlier cell having run.
from tqdm import tqdm

# All users and all movies present in the ratings data.
userId_list = ratings['userId'].unique()
movieId_list = ratings['movieId'].unique()

# Movies each user has already rated. They are excluded from that user's
# candidates so the top 20 is not filled with titles the user has seen.
rated_by_user = ratings.groupby('userId')['movieId'].apply(set)

# Rows of the result table, one per (user, recommended movie) pair.
recommend_movie_list = []

# Top-20 recommendations per user, chosen among movies the user has not rated.
for userId in tqdm(userId_list):
    already_rated = rated_by_user.loc[userId]
    candidates = [
        candidate for candidate in movieId_list
        if candidate not in already_rated
    ]
    for recommended_id, predicted in recommend_movie(userId, candidates):
        recommend_movie_list.append({
            'userId': userId,
            'movieId': recommended_id,
            'rating': predicted,
        })

# Convert the collected rows to a DataFrame.
# NOTE: this rebinds the name `recommend_movie`, shadowing the function of
# the same name from here on; the name is kept because later code may rely
# on the DataFrame.
recommend_movie = pd.DataFrame(recommend_movie_list)

# Save as CSV without the index column.
recommend_movie.to_csv('result/recommend_movie.csv', index=False, header=True)

100%|██████████| 671/671 [00:22<00:00, 29.53it/s]
