In [1]:
import random
import math
import pandas as pd
from operator import itemgetter

In [2]:
class ItemBasedCF:
    """Item-based collaborative-filtering recommender over (user, movie, rating) rows.

    Intended call order: ``get_dataset`` -> ``cal_movie_sim`` -> ``recommend`` /
    ``top_movies`` / ``evaluate``.

    Attributes are plain dicts keyed by the raw strings read from the ratings file:
      * ``train_dataset`` / ``test_dataset``: ``{user: {movie: rating}}``
        (ratings are kept as the strings read from the file).
      * ``movie_popular``: ``{movie: number of training users who rated it}``.
      * ``movie_sim_matrix``: ``{movie: {other_movie: similarity}}``.
      * ``movie_count``: number of distinct movies in the training set.
    """

    def __init__(self, n_sim_movie=10, n_rec_movie=10):
        """Create an empty recommender.

        Args:
            n_sim_movie: how many most-similar movies are considered for each
                movie the target user has already rated.
            n_rec_movie: how many movies are recommended to a user.
        """
        self.n_sim_movie = n_sim_movie
        self.n_rec_movie = n_rec_movie

        # Train / test split of the ratings, filled by get_dataset().
        self.train_dataset = {}
        self.test_dataset = {}

        # Movie-movie similarity table and its supporting statistics,
        # filled by cal_movie_sim().
        self.movie_sim_matrix = {}
        self.movie_popular = {}
        self.movie_count = 0

        print('Similar movies number = %d' % self.n_sim_movie)
        print('Recommended movies number = %d' % self.n_rec_movie)

    def get_dataset(self, filepath, pivot=0.75, seed=None):
        """Read the ratings file and randomly split its rows into train and test sets.

        Each data row must have exactly four comma-separated fields
        (user, movie, rating, timestamp); a row is sent to the training set
        with probability ``pivot`` and to the test set otherwise.  Rows are
        added to whatever the datasets already contain.

        Args:
            filepath: path of the ratings CSV (its first line is skipped).
            pivot: probability that a row goes to the training set.
            seed: optional seed for a private random generator, making the
                split reproducible.  With ``None`` the module-level
                ``random`` state is used, exactly as before.

        Raises:
            ValueError: if a non-blank row does not have exactly four fields.
        """
        draw = random.random if seed is None else random.Random(seed).random

        train_dataset_len = 0
        test_dataset_len = 0
        for line in self.load_file(filepath):
            # Blank lines (e.g. a trailing newline) carry no rating.
            if not line.strip():
                continue
            user, movie, rating, _timestamp = line.split(',')
            if draw() < pivot:
                self.train_dataset.setdefault(user, {})[movie] = rating
                train_dataset_len += 1
            else:
                self.test_dataset.setdefault(user, {})[movie] = rating
                test_dataset_len += 1
        print('Split training dataset and test dataset success!')
        print('Training dataset = %d' % train_dataset_len)
        print('Testing dataset = %d' % test_dataset_len)

    def load_file(self, filepath):
        """Yield every line of ``filepath`` except the first (header) line.

        Trailing carriage returns / newlines are stripped.  A message is
        printed once the generator has been fully consumed.
        """
        with open(filepath, 'r') as f:
            for i, line in enumerate(f):
                if i == 0:  # skip the header row
                    continue
                yield line.strip('\r\n')
        print('Succeed in laoding file!')

    def cal_movie_sim(self):
        """Build the movie-movie similarity table from the training set.

        Similarity of movies i and j is ``c_ij / sqrt(n_i * n_j)`` where
        ``c_ij`` is the number of training users who rated both and ``n_i``,
        ``n_j`` are the numbers of training users who rated each.

        All tables are rebuilt from scratch, so calling this method again
        (for example after loading more data) does not double-count.
        """
        # Popularity: how many training users rated each movie.
        popularity = {}
        for movies in self.train_dataset.values():
            for movie in movies:
                popularity[movie] = popularity.get(movie, 0) + 1
        self.movie_popular = popularity
        self.movie_count = len(popularity)
        print('Total movies number = %d' % self.movie_count)

        # Co-occurrence counts: users who rated both m1 and m2.
        print('Build co-rated users matrix...')
        sim_matrix = {}
        for movies in self.train_dataset.values():
            for m1 in movies:
                for m2 in movies:
                    if m1 == m2:
                        continue
                    row = sim_matrix.setdefault(m1, {})
                    row[m2] = row.get(m2, 0) + 1
        print('Build co-rated users matrix success!')

        # Normalise the counts into similarities (values only; keys unchanged).
        print('Calculating movie similarity matrix...')
        for m1, related_movies in sim_matrix.items():
            n_i = popularity[m1]
            for m2, count in related_movies.items():
                related_movies[m2] = count / math.sqrt(n_i * popularity[m2])
        self.movie_sim_matrix = sim_matrix
        print('Calculating movie similarity matrix success!')

    def recommend(self, user):
        """Return up to ``n_rec_movie`` ``(movie, score)`` pairs for ``user``.

        For every movie the user rated in the training set, its
        ``n_sim_movie`` most similar movies are looked up; each unseen one
        accumulates ``similarity * rating``.  The result is sorted by score,
        highest first.

        A user absent from the training set, or a rated movie with no
        similarity entries, simply contributes nothing (an empty list is
        returned when there is nothing to recommend).
        """
        k = self.n_sim_movie
        n = self.n_rec_movie
        rank = {}
        watched_movies = self.train_dataset.get(user, {})

        for movie, rating in watched_movies.items():
            # Movies never co-rated with another one have no row in the matrix.
            neighbours = self.movie_sim_matrix.get(movie, {})
            k_sim_movies = sorted(neighbours.items(),
                                  key=itemgetter(1), reverse=True)[:k]
            weight = float(rating)
            for related_movie, w in k_sim_movies:
                if related_movie in watched_movies:
                    continue
                rank[related_movie] = rank.get(related_movie, 0) + w * weight
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[:n]

    def top_movies(self, filepath, user):
        """Print and return the recommended movies for ``user`` with readable titles.

        Args:
            filepath: CSV readable by pandas with at least ``movieId`` and
                ``title`` columns.
            user: user key as stored in the training set.

        Returns:
            DataFrame with columns ``movieId`` and ``title``, in
            recommendation order.  A recommended movie missing from the
            movies file keeps a missing title instead of raising.
        """
        def tidy(title):
            # Left merge leaves NaN for ids that are not in the movies file.
            if not isinstance(title, str):
                return title
            # Drop the last 7 characters (a trailing " (YYYY)" in the sample
            # data) and move a trailing ", The"-style article to the front.
            stem = title[:-7]
            if ',' in title:
                return ' '.join(stem.split(',')[::-1]).strip()
            return stem

        movies = pd.read_csv(filepath)
        rec_movies = pd.DataFrame(self.recommend(user),
                                  columns=['movieId', 'interest'])
        # Ids are strings in the datasets but integers in the movies file.
        rec_movies['movieId'] = rec_movies['movieId'].astype(int)
        top_movies = pd.merge(rec_movies, movies, how='left', on='movieId')
        top_movies['title'] = top_movies['title'].apply(tidy)
        print("For user {}, {} recommended movies are as follows: ".format(
            user, len(top_movies)))
        print(top_movies[['movieId', 'title']])
        return top_movies[['movieId', 'title']]

    def evaluate(self):
        """Recommend for every training user and report precision, recall and coverage.

        * precision = hits / (n_rec_movie * number of training users)
        * recall    = hits / number of test ratings of those users
        * coverage  = distinct recommended movies / ``movie_count``

        A metric whose denominator is zero is reported as 0.0.

        Returns:
            ``(precision, recall, coverage)`` as floats.
        """
        print('Evaluating start...')
        n = self.n_rec_movie
        # Counters for precision and recall.
        hit = 0
        rec_count = 0
        test_count = 0
        # Distinct recommended movies, for coverage.
        all_rec_movies = set()

        for user in self.train_dataset:
            test_movies = self.test_dataset.get(user, {})
            for movie, _score in self.recommend(user):
                if movie in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
            rec_count += n
            test_count += len(test_movies)

        precision = hit / rec_count if rec_count else 0.0
        recall = hit / test_count if test_count else 0.0
        coverage = (len(all_rec_movies) / self.movie_count
                    if self.movie_count else 0.0)
        result = (precision, recall, coverage)
        print('Precision = %.4f\nRecall = %.4f\nCoverage = %.4f' % result)
        return result

In [3]:
if __name__ == '__main__':
    # Driver: load ratings, build item similarities, show recommendations for
    # one user, then print precision / recall / coverage.

    # The train/test split draws from the `random` module; seeding it first
    # makes the split, and therefore the recommendations and metrics below,
    # repeatable across "Restart & Run All".
    random.seed(42)

    movies_file = './input_data/small/movies.csv'
    rating_file = './input_data/small/ratings.csv'
    user = '2'  # user ids are kept as the strings read from the ratings file

    itemCF = ItemBasedCF()
    itemCF.get_dataset(rating_file)
    itemCF.cal_movie_sim()
    itemCF.top_movies(movies_file, user)
    itemCF.evaluate()

Similar movies number = 10
Recommended movies number = 10
Succeed in laoding file!
Split training dataset and test dataset success!
Training dataset = 75690
Testing dataset = 25146
Total movies number = 8795
Build co-rated users matrix...
Build co-rated users matrix success!
Calculating movie similarity matrix...
Calculating movie similarity matrix success!
For user 2, 10 recommended movies are as follows: 
   movieId                     title
0   111759          Edge of Tomorrow
1   109374  The Grand Budapest Hotel
2   122904                  Deadpool
3    59315                  Iron Man
4     2959                Fight Club
5     5418       The Bourne Identity
6    70286                District 9
7   115149                 John Wick
8     2571                The Matrix
9    68954                        Up
Evaluating start...
Precision = 0.2784
Recall = 0.0675
Coverage = 0.0827
