# Trend趋势推荐 - 加权分数排序

## 加权分数计算公式

$$
\begin{aligned}
&\text{Weighted Rating (WR)} = \left(\frac{v}{v+m} \cdot R\right) + \left(\frac{m}{v+m} \cdot C\right) \\
&v \text{ is the number of votes for the movie} \\
&m \text{ is the minimum number of votes required to be listed in the chart} \\
&R \text{ is the average rating of the movie} \\
&C \text{ is the mean vote across the whole report}
\end{aligned}
$$

In [9]:
# Load ratings.csv and derive, for every movie id, its mean rating and its
# number of ratings, restricted to the most recent years in the data.
import pandas as pd
import numpy as np

ratings = pd.read_csv('dataset/themovie/data_origin/ratings.csv')
# header = ['userId', 'movieId', 'rating', 'timestamp']

# Replace the epoch-seconds timestamp with its calendar year. The vectorized
# `.dt.year` accessor avoids invoking a Python lambda once per row.
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s').dt.year

# Most recent year present in the data.
max_year = ratings['timestamp'].max()

# Keep ratings whose year is max_year or max_year - 1, i.e. at most one
# calendar year older than the newest rating.
ratings = ratings[ratings['timestamp'] >= max_year - 1]

# Distinct movie ids that still have ratings after the year filter.
movieId = ratings['movieId'].unique()

# A single group-by pass yields both per-movie statistics (mean rating and
# number of ratings); the resulting frame is indexed by movieId.
movie_rating = ratings.groupby('movieId')['rating'].agg(['mean', 'count'])
movie_rating.columns = ['rating', 'rating_count']

# Persist the per-movie statistics, including the movieId index and a header.
movie_rating.to_csv('dataset/themovie/movie_rating.csv', index=True, header=True)

In [10]:
# m is the 0.9 quantile of the per-movie rating counts; it is used as the
# minimum number of ratings a movie needs in order to be ranked.
m = movie_rating['rating_count'].quantile(0.9)
print('保留至少90%的movieId的评分人数为：', m)

# Keep only movies with at least m ratings. Filtering first and copying the
# filtered result gives an independent frame without duplicating the full
# table beforehand.
movie_rating = movie_rating.loc[movie_rating['rating_count'] >= m].copy()

print('保留至少90%的movieId后，剩余电影数量为：', len(movie_rating))

保留至少90%的movieId的评分人数为： 93.0
保留至少90%的movieId后，剩余电影数量为： 3992


In [11]:
# C: mean of the per-movie average ratings currently held in movie_rating.
# NOTE(review): if movie_rating has already been reduced to movies meeting the
# minimum-vote threshold, C reflects only those movies - confirm this is the
# intended "mean across the whole report".
C = movie_rating['rating'].mean()

def weighted_rating(x, m=m, C=C):
    """Return the weighted rating (v/(v+m))*R + (m/(v+m))*C.

    x must support lookup of 'rating_count' (v) and 'rating' (R).
    m and C default to the module-level values that exist when this
    function is defined.
    """
    votes = x['rating_count']
    avg_rating = x['rating']
    total = votes + m

    return (votes / total * avg_rating) + (m / total * C)

# Compute the score for all movies at once. weighted_rating only performs
# element-wise arithmetic on the 'rating' and 'rating_count' columns, so it
# can take the whole frame instead of being applied row by row; this yields
# the same values and also behaves sensibly on an empty frame.
movie_rating['score'] = weighted_rating(movie_rating)

# Order movies from highest to lowest score.
movie_rating = movie_rating.sort_values('score', ascending=False)

# Display the top 15 rows.
movie_rating.head(15)

Unnamed: 0_level_0,rating,rating_count,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
159817,4.47878,754,4.357643
318,4.353308,14193,4.346943
1203,4.287671,2263,4.251665
858,4.243621,7760,4.233341
5618,4.234541,4334,4.216495
296,4.222421,9509,4.214218
2959,4.214777,11931,4.208285
1221,4.218783,4781,4.202693
5971,4.205483,1769,4.16403
3000,4.192801,2028,4.156965


In [None]:
# Remove the raw 'rating' and 'rating_count' columns before exporting.
movie_rating = movie_rating.drop(columns=['rating', 'rating_count'])

# Write the remaining data to trend_movie.csv, with index and header.
movie_rating.to_csv('result/trend_movie.csv', header=True, index=True)