In [13]:
import random
import math
import pandas as pd
from operator import itemgetter

In [88]:
class UserBasedCF():
    # 初始化相关参数
    def __init__(self):
        # 找到与目标用户兴趣相似的20个用户,为其推荐10部电影
        self.n_sim_user = 20
        self.n_rec_movie = 10
        
        # 将数据集划分为训练集和测试集
        self.train_dataset = {}
        self.test_dataset = {}
        
        # 用户相似度矩阵
        self.user_sim_matrix = {}
        self.movie_count = 0
        
        print('Similar user number = %d' % self.n_sim_user)
        print('Recommended movie number = %d' % self.n_rec_movie)
        
    
    # 建立“用户-电影”表
    def get_dataset(self, filepath, pivot = 0.75):
        train_dataset_len = 0
        test_dataset_len = 0
        for line in self.load_file(filepath):
            user, movie, rating, timetamp = line.split(',')
            if random.random() < pivot:
                self.train_dataset.setdefault(user, {})
                self.train_dataset[user][movie] = rating
                train_dataset_len += 1
            else:
                self.test_dataset.setdefault(user, {})
                self.test_dataset[user][movie] =  rating
                test_dataset_len += 1
        print('Split training dataset and testing dataset success!')
        print('Training dataset = %d' % train_dataset_len)
        print('Testing dataset = %d' % test_dataset_len)
        
        
    # 载入数据集文件, 并返回文件的每一行
    def load_file(self, filepath):
        with open(filepath, 'r') as f:
            for i, line in enumerate(f):
                if i == 0: # 去掉表格的第一行, 即列名
                    continue
                yield line.strip('\r\n')
        print('Succeed to loading file!')
            
        
    # 计算用户之间的相似度
    def cal_user_sim(self):
        # 构件“电影-用户”倒排索引
        # key = movieId, value = list of userId who have seen this movie
        print('Building movie-user table...')
        movie_user = {}
        for user, movies in self.train_dataset.items():
            for movie in movies:
                if movie not in movie_user:
                    movie_user[movie] = set()
                movie_user[movie].add(user)
        print('Build movie-user table success!')
            
        self.movie_count = len(movie_user)
        print('Total movie number = %d' % self.movie_count)
            
        print('Build user co-rated movies matrix...')
        for movie, users in movie_user.items():
            for u in users:
                for v in users:
                    if u == v:
                        continue
                    self.user_sim_matrix.setdefault(u, {})
                    self.user_sim_matrix[u].setdefault(v, 0)
                    self.user_sim_matrix[u][v] += 1
        print('Build user co-rated movies matrix success!')
            
        # 计算相似性
        print('Calculating user similarity matrix...')
        for u, related_users in self.user_sim_matrix.items():
            for v, count in related_users.items():
                n_u = len(self.train_dataset[u])
                n_v = len(self.test_dataset[v])
                self.user_sim_matrix[u][v] = count / math.sqrt(n_u * n_v)
        print('Calculate user similarity matrix success!')
            
    
    # 建立为用户推荐电影的id以及兴趣
    def recommend(self, user):
        k = self.n_sim_user
        n = self.n_rec_movie
        rank = {}
        watched_movies = self.train_dataset[user]
            
        # v=similar user, w_uv = similar factor
        k_sim_users = sorted(self.user_sim_matrix[user].items(),
                            key=itemgetter(1), reverse=True)[0:k]
        for v,w_uv in k_sim_users:
            for movie, rating in self.train_dataset[v].items():
                if movie in watched_movies:
                    continue
                rank.setdefault(movie, 0)
                rank[movie] += w_uv
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:n]
        
    
    # 输出为用户推荐的电影
    def top_n_movies(self, filepath, user):
        movies = pd.read_csv(filepath)
        rec_movies = pd.DataFrame(self.recommend(user),columns=['movieId','interest'])
        rec_movies['movieId'] = rec_movies['movieId'].apply(lambda x: int(x))
        top_movies = pd.merge(rec_movies,movies,how='left',on='movieId')
        top_movies['title'] = top_movies['title'].apply(lambda x: 
            ' '.join(x[:-7].split(',')[::-1]).strip() if ',' in x else x[:-7])
        print("For user {:}, 10 movies are recommended as follows:".format(user))
        print(top_movies[['movieId','title']])
        return top_movies[['movieId','title']]
    
                          
    # 产生推荐并通过准确率、召回率和覆盖率进行评估
    def evaluate(self):
        print('Evaluation start...')
        n = self.n_rec_movie
        # 准确率和召回率
        hit = 0
        rec_count = 0
        test_count = 0
        # 覆盖率
        all_rec_movies = set()
        
        for i, user in enumerate(self.train_dataset):
            test_movies = self.test_dataset.get(user, {})
            rec_movies = self.recommend(user)
            for movie, w in rec_movies:
                if movie in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
            rec_count += n
            test_count += len(test_movies)
            
        precision = hit / rec_count
        recall    = hit / test_count
        coverate  = len(all_rec_movies) / self.movie_count
        result = (precision, recall, coverate)
        print('Precision = %.4f\nRecall = %.4f\nCoverage = %.4f' % result)

In [100]:
if __name__ == '__main__':
    movies_file = './input_data/small/movies.csv'
    ratings_file = './input_data/small/ratings.csv'
    user = '2'
    userCF = UserBasedCF()
    userCF.get_dataset(ratings_file)
    userCF.cal_user_sim()
    userCF.top_n_movies(movies_file,user)
    userCF.evaluate()

Similar user number = 20
Recommended movie number = 10
Succeed to loading file!
Split training dataset and testing dataset success!
Training dataset = 75379
Testing dataset = 25457
Building movie-user table...
Build movie-user table success!
Total movie number = 8767
Build user co-rated movies matrix...
Build user co-rated movies matrix success!
Calculating user similarity matrix...
Calculate user similarity matrix success!
For user 2, 10 movies are recommended as follows:
   movieId                                          title
0    58559                                The Dark Knight
1     2959                                     Fight Club
2     2571                                     The Matrix
3      318                       The Shawshank Redemption
4      356                                   Forrest Gump
5      527                               Schindler's List
6    59315                                       Iron Man
7     2028                            Saving Private Ryan
