In [1]:
import pandas as pd
import random
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.stem.snowball import SnowballStemmer

In [2]:
class ContentBased():
    # 初始化参数
    def __init__(self):
        # 推荐与用户喜好最相似的10部电影
        self.n_rec_movies = 10
        
        # 训练集与测试集
        self.ratio = 0.7
        self.train = {}
        self.test = {}
        
        # 电影元数据
        self.movies = 0
        self.tags = 0
        self.movie_count = 0
        
        # 电影评分权重计算参数
        self.quantile = 0.95
        self.m = 0
        self.c = 0
        
        # 电影相似度矩阵
        self.cosine_sim = 0
        
        print('Recommended movies number = %d' % self.n_rec_movies)
        
        
    # 载入数据    
    def load_file(self, filepath):
        with open(filepath, 'r') as f:
            for i, line in enumerate(f):
                if i == 0:
                    continue
                yield line.strip('\r\n')
        print("Succeed in loading file!")
        
    
    # 划分训练、测试数据集
    def get_dataset(self, filepath):
        train_len = 0
        test_len  = 0
        for line in self.load_file(filepath):
            user, movieId, rating, timestamp = line.split(',')
            if random.random() < self.ratio:
                self.train.setdefault(user,[])
                self.train[user].append(movieId)
                train_len += 1
            else:
                self.test.setdefault(user, [])
                self.test[user].append(movieId)
                test_len += 1
        print('Succeed in spliting training dataset and testing dataset!')
        print('Training dataset = %d' % train_len)
        print('Testing dataset = %d' % test_len)

    
    # 处理电影title, 
    # “Shining, The (1980)”→“The Shining”
    # "Toy Story (1995)"→“Toy Story"
    def reset_title(self, x):
        if ',' in x:
            return ' '.join(x[:-7].split(',')[::-1]).strip()
        else:
            return x[:-7]
        
    
    # 计算评分权重
    def get_weight(self, x):
        v = x['rate_count']
        r = x['avg_rating']
        return (v / (v + self.m) * r + self.m / (v + self.m) * self.c)
    
    
    # 计算电影评分（被评次数、平均评分、评分权重）
    def movie_metadata(self, movies_csv, ratings_csv):
        print("Preparing movie metadate...")
        avg_rating = dict()
        rate_count = dict()
        self.movies = pd.read_csv(movies_csv)
        ratings = pd.read_csv(ratings_csv)
        self.movie_count = len(self.movies)
        
        self.movies['year'] = self.movies['title'].apply(lambda x: x[:-5:-1])
        self.movies['title'] = self.movies['title'].apply(self.reset_title)
        self.movies['genres'] = self.movies['genres'].apply(lambda x: x.split('|'))
        
        for movieId in set(ratings['movieId']):
            sum_rating = sum(ratings[ratings['movieId'] == movieId]['rating'])
            num_rating = len(ratings[ratings['movieId'] == movieId]['rating'])
            avg_rating[movieId] = round(sum_rating / num_rating, 2)
            rate_count[movieId] = num_rating
            
        r_cnt = pd.DataFrame(rate_count, index=[0]).transpose().reset_index()
        r_cnt.columns = ['movieId', 'rate_count']
        
        r_avg = pd.DataFrame(avg_rating, index=[0]).transpose().reset_index()
        r_avg.columns = ['movieId', 'avg_rating']
        
        r_cnt_avg = pd.merge(r_cnt, r_avg, how='left', on='movieId')
        self.movies = pd.merge(self.movies, r_cnt_avg, how='left', on='movieId')
        
        self.m = self.movies['rate_count'].quantile(self.quantile)
        self.movies = self.movies[self.movies['rate_count'] >= self.m]
        self.c = self.movies['avg_rating'].mean()
        
        self.movies['r_weight'] = self.movies.apply(self.get_weight, axis=1)
        self.movies = self.movies.sort_values('r_weight', ascending=False)
        
        print("Succeed in preparing Movie metadate!")
    
    
    # 提取电影标签
    def tag_extraction(self, tag):
        stemmer = SnowballStemmer('english')
        d = dict()
        for m_id in set(tag['movieId']):
            m_tag = tag[tag['movieId'] == m_id]['tag']
            d[m_id] = str(list(set([stemmer.stem(i) for i in m_tag])))
        return d
    
    
    # 计算电影相似度
    def cal_movie_sim(self, tags_csv):
        self.tags = pd.read_csv(tags_csv)
        s = self.tags[['movieId','tag']]
        m_tags = pd.DataFrame(self.tag_extraction(s), index=[0]).transpose().reset_index()
        m_tags.columns = ['movieId','tag']
        m_tags['tag'] = m_tags['tag'].apply(literal_eval)
        self.movies = self.movies.merge(m_tags, how='left', on='movieId')
        self.movies['tag'] = self.movies['tag'].apply(lambda x: [] if str(x) == 'nan' else x)
        self.movies['description'] = self.movies['genres'] + self.movies['tag']
        self.movies['description'] = self.movies['description'].apply(lambda x: ' '.join(x))
        
        tf = TfidfVectorizer(analyzer='word',ngram_range=(1,2),min_df=0,stop_words='english')
        tfidf_matrix = tf.fit_transform(self.movies['description'])
        self.cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)    
        
        
    # 根据用户历史行为筛选出相似电影合集，按照评分权重排序选出前10部电影
    def recommend(self, user):
        n = self.n_rec_movies
        rec = set()
        columns = self.movies['movieId']
        index = self.movies['movieId']
        m_sim = pd.DataFrame(self.cosine_sim,columns=columns,index=index)
        watched = [int(i) for i in self.train[user]]
        for movie in watched:
            if movie in m_sim.columns:
                top_5 = list(m_sim.sort_values(movie,ascending=False).index[1:6])
                for i in top_5:
                    if i not in watched:
                        rec.add(i)
        rec_df = pd.DataFrame(list(rec), columns=['movieId'])
        rec_movies = rec_df.merge(self.movies,how='left',on='movieId')
        rec_movies = rec_movies.sort_values('avg_rating',ascending=False)
        top_movies = rec_movies[['movieId','title']][:n]
        return top_movies

    
    # 产生推荐并通过准确率、召回率和覆盖率进行评估
    def evaluate(self):
        print("Evaluation start...")
        n = self.n_rec_movies
        # 准确率和召回率
        hit = 0
        rec_count = 0
        test_count = 0
        # 覆盖率
        all_rec_movies = set()
        
        for user in self.train:
            test_movies = self.test[user]
            top_movies = self.recommend(user)
            rec_movies = list(set(top_movies['movieId']))
            for movie in rec_movies:
                if str(movie) in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
            rec_count += n
            test_count += len(test_movies)
            
        precision = hit / rec_count
        recall    = hit / test_count
        coverate  = len(all_rec_movies) / self.movie_count
        
        result = (precision, recall, coverate)
        print('Precision = %.4f\nRecall = %.4f\nCoverage = %.4f' % result)
    

In [3]:
if __name__ == '__main__':
    movies_file = './input_data/small/movies.csv'
    ratings_file = './input_data/small/ratings.csv'
    tags_file = './input_data/small/tags.csv'
    user = '3'
    engine = ContentBased()
    engine.get_dataset(ratings_file)
    engine.movie_metadata(movies_file,ratings_file)
    engine.cal_movie_sim(tags_file)
    print("for user {:}, 10 movies are recommended as follows:".format(user))
    print(engine.recommend(user))
    engine.evaluate()

Recommended movies number = 10
Succeed in loading file!
Succeed in spliting training dataset and testing dataset!
Training dataset = 70489
Testing dataset = 30347
Preparing movie metadate...
Succeed in preparing Movie metadate!
for user 3, 10 movies are recommended as follows:
    movieId                                         title
16    81845                             The King's Speech
6       778                                 Trainspotting
17      953                         It's a Wonderful Life
5     81834  Harry Potter and the Deathly Hallows: Part 1
14     2997                          Being John Malkovich
1      4034                                       Traffic
12    59315                                      Iron Man
8      2797                                           Big
15     6870                                  Mystic River
13     8784                                  Garden State
Evaluation start...
Precision = 0.1441
Recall = 0.0290
Coverage = 0.0239
