In [2]:
import pandas as pd
import random

In [3]:
class MostPopular():
    """Non-personalised "most popular" movie recommender.

    Every user is recommended the highest-weighted movies they have not
    already interacted with in the training split.  The weight of a movie
    shrinks its average rating towards the global mean when it has few
    ratings (see :meth:`get_weight`).

    Typical usage::

        rec = MostPopular()
        rec.get_dataset(ratings_csv)
        rec.movie_metadata(movies_csv, ratings_csv)
        rec.recommend('3')
        rec.evaluate()
    """

    # Articles that may be moved to the end of a title ("Shining, The");
    # compared case-insensitively when restoring natural word order.
    _TRAILING_ARTICLES = ('the', 'a', 'an', 'le', 'la', 'les',
                          'il', 'el', 'das', 'der', 'die')

    def __init__(self):
        """Set default hyper-parameters and empty containers."""
        # Number of popular, not-yet-seen movies recommended per user.
        self.n_rec_movies = 10

        # Train/test split: probability that a rating goes to the training
        # set, and the per-user lists of movie ids (both stored as strings).
        self.ratio = 0.7
        self.train = {}
        self.test = {}

        # Movie metadata; replaced by a DataFrame in movie_metadata().
        self.movies = 0
        self.movie_count = 0

        # Parameters of the rating weight: `m` is the `quantile`-th quantile
        # of the per-movie rating counts, `c` the mean of the average ratings.
        self.quantile = 0.8
        self.m = 0
        self.c = 0

        print('Recommended movies number = %d' % self.n_rec_movies)

    def load_file(self, filepath):
        """Lazily yield the rows of a text file, skipping the header line.

        Args:
            filepath: path of the file to read (decoded as UTF-8).

        Yields:
            Each line after the first, with trailing CR/LF removed.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i == 0:
                    continue
                yield line.strip('\r\n')
        print("Succeed in loading file!")

    def get_dataset(self, filepath, seed=None):
        """Randomly split a ratings CSV into ``self.train`` and ``self.test``.

        Each rating goes to the training set with probability ``self.ratio``.
        Entries are appended to whatever the two dicts already contain.

        Args:
            filepath: ratings file with four comma-separated columns
                (user, movie id, rating, timestamp) and a header line.
            seed: optional seed for a private random generator, making the
                split reproducible.  With ``None`` (default) the module-level
                ``random`` generator is used, as before.
        """
        draw = random.random if seed is None else random.Random(seed).random
        train_len = 0
        test_len = 0
        for line in self.load_file(filepath):
            # A blank (e.g. trailing) line would break the unpacking below.
            if not line.strip():
                continue
            user, movie_id, _rating, _timestamp = line.split(',')
            if draw() < self.ratio:
                self.train.setdefault(user, []).append(movie_id)
                train_len += 1
            else:
                self.test.setdefault(user, []).append(movie_id)
                test_len += 1
        print('Succeed in spliting training dataset and testing dataset!')
        print('Training dataset = %d' % train_len)
        print('Testing dataset = %d' % test_len)

    @staticmethod
    def _split_title_year(raw):
        """Split ``"Name (YYYY)"`` into ``("Name", "YYYY")``.

        Returns ``(stripped_title, None)`` when the title does not end with a
        four-digit year in parentheses.
        """
        text = raw.strip()
        if (len(text) >= 6 and text[-6] == '(' and text[-1] == ')'
                and text[-5:-1].isdigit()):
            return text[:-6].rstrip(), text[-5:-1]
        return text, None

    def reset_title(self, x):
        """Return a display title without the year and with natural word order.

        Examples:
            "Shining, The (1980)" -> "The Shining"
            "Toy Story (1995)"    -> "Toy Story"
            "Monsters, Inc. (2001)" -> "Monsters, Inc."

        Only a trailing article (see ``_TRAILING_ARTICLES``) is moved to the
        front; other commas are left untouched.  Titles without a "(YYYY)"
        suffix are returned unchanged apart from surrounding whitespace.
        """
        name, _year = self._split_title_year(x)
        head, comma, tail = name.rpartition(',')
        if comma and tail.strip().lower() in self._TRAILING_ARTICLES:
            return '%s %s' % (tail.strip(), head.strip())
        return name

    def get_weight(self, x):
        """Compute the rating weight ``v/(v+m) * r + m/(v+m) * c``.

        Args:
            x: a mapping with ``'rate_count'`` (v) and ``'avg_rating'`` (r);
                works for a single row as well as for a whole DataFrame.

        Returns:
            The weight(s): the average rating pulled towards the global mean
            ``self.c``, the more so the fewer ratings a movie has relative to
            ``self.m``.
        """
        v = x['rate_count']
        r = x['avg_rating']
        return (v / (v + self.m) * r + self.m / (v + self.m) * self.c)

    def movie_metadata(self, movies_csv, ratings_csv):
        """Build ``self.movies``: titles, year, rating count/mean and weight.

        Args:
            movies_csv: CSV with at least ``movieId`` and ``title`` columns.
            ratings_csv: CSV with at least ``movieId`` and ``rating`` columns.

        Side effects:
            Sets ``self.movies`` (sorted by ``r_weight`` descending; movies
            without any rating have NaN statistics and sort last),
            ``self.movie_count``, ``self.m`` and ``self.c``.
        """
        print("Preparing movie metadate...")
        movies = pd.read_csv(movies_csv)
        ratings = pd.read_csv(ratings_csv)
        self.movie_count = len(movies)

        # The year must be read from the raw title before it is rewritten.
        movies['year'] = [self._split_title_year(t)[1] for t in movies['title']]
        movies['title'] = movies['title'].apply(self.reset_title)

        # One grouped pass instead of filtering the ratings once per movie.
        # Missing rating values are ignored by both aggregates.
        stats = (
            ratings.groupby('movieId')['rating']
            .agg(['count', 'mean'])
            .rename(columns={'count': 'rate_count', 'mean': 'avg_rating'})
            .reset_index()
        )
        stats['avg_rating'] = stats['avg_rating'].round(2)

        movies = pd.merge(movies, stats, how='left', on='movieId')

        self.m = movies['rate_count'].quantile(self.quantile)
        self.c = movies['avg_rating'].mean()

        # get_weight only uses column arithmetic, so it can be applied to the
        # whole frame at once.
        movies['r_weight'] = self.get_weight(movies)
        self.movies = movies.sort_values('r_weight', ascending=False)

        print("Succeed in preparing Movie metadate!")

    def recommend(self, user):
        """Return the top ``n_rec_movies`` movies the user has not seen.

        Args:
            user: user id as stored in ``self.train`` (a string).  A user with
                no training interactions simply gets the overall top movies.

        Returns:
            DataFrame with columns ``movieId`` and ``title``, best first,
            indexed 0..k-1.

        Raises:
            RuntimeError: if :meth:`movie_metadata` has not been called yet.
        """
        if not isinstance(self.movies, pd.DataFrame):
            raise RuntimeError('movie_metadata() must be called before recommend()')
        watched = {int(i) for i in self.train.get(user, [])}
        # self.movies is already sorted by weight, so filtering keeps the order.
        unseen = self.movies[~self.movies['movieId'].isin(watched)]
        top_movies = unseen[['movieId', 'title']].head(self.n_rec_movies)
        return top_movies.reset_index(drop=True)

    def evaluate(self):
        """Recommend for every training user and report quality metrics.

        Precision is hits / recommendations actually made, recall is
        hits / held-out test interactions, and coverage is the share of all
        movies that were recommended to at least one user.  A metric whose
        denominator is zero is reported as 0.0.

        Returns:
            Tuple ``(precision, recall, coverage)``; the values are also printed.
        """
        print("Evaluation start...")
        # Precision and recall counters.
        hit = 0
        rec_count = 0
        test_count = 0
        # Coverage: distinct movies recommended to anyone.
        all_rec_movies = set()

        for user in self.train:
            # A user may have no rating in the test split at all.
            test_movies = self.test.get(user, [])
            test_lookup = set(test_movies)
            rec_movies = self.recommend(user)['movieId'].tolist()
            for movie in rec_movies:
                # Test ids are strings as read from the CSV.
                if str(movie) in test_lookup:
                    hit += 1
                all_rec_movies.add(movie)
            rec_count += len(rec_movies)
            test_count += len(test_movies)

        precision = hit / rec_count if rec_count else 0.0
        recall = hit / test_count if test_count else 0.0
        coverage = len(all_rec_movies) / self.movie_count if self.movie_count else 0.0

        result = (precision, recall, coverage)
        print('Precision = %.4f\nRecall = %.4f\nCoverage = %.4f' % result)
        return result
    

In [4]:
if __name__ == '__main__':
    # Demo run: split the ratings, build the popularity table, show the
    # recommendations for one user and evaluate over all training users.
    movies_file = './input_data/small/movies.csv'
    ratings_file = './input_data/small/ratings.csv'
    user = '3'

    # The train/test split is random; seed it so that the split and the
    # reported metrics are reproducible on "Restart & Run All".
    random.seed(42)

    mostpopular = MostPopular()
    mostpopular.get_dataset(ratings_file)
    mostpopular.movie_metadata(movies_file,ratings_file)
    # Report the configured list size instead of a hard-coded "10".
    print("for user {:}, {:d} movies are recommended as follows:".format(
        user, mostpopular.n_rec_movies))
    print(mostpopular.recommend(user))
    mostpopular.evaluate()

Recommended movies number = 10
Succeed in loading file!
Succeed in spliting training dataset and testing dataset!
Training dataset = 70324
Testing dataset = 30512
Preparing movie metadate...
Succeed in preparing Movie metadate!
for user 3, 10 movies are recommended as follows:
   movieId                                           title
0      318                        The Shawshank Redemption
1      858                                   The Godfather
2     2959                                      Fight Club
3      260              Star Wars: Episode IV - A New Hope
4       50                              The Usual Suspects
5     1221                          The Godfather: Part II
6     1196  Star Wars: Episode V - The Empire Strikes Back
7    58559                                 The Dark Knight
8      296                                    Pulp Fiction
9     1213                                      Goodfellas
Evaluation start...
Precision = 0.1569
Recall = 0.0314
Coverage = 0.0037
