In [1]:
# Disable notebook autosave for this session (an interval of 0 turns it off).
%autosave 0

Autosave disabled


In [2]:
import sys
import math
import random
from operator import itemgetter

In [10]:
class UserBasedCF():
    """User-based collaborative filtering recommender with offline evaluation.

    Typical workflow: ``generate_dataset`` -> ``calc_user_sim`` ->
    ``recommend`` / ``evaluate``.

    User and movie ids are kept as the strings read from the ratings file.
    """

    def __init__(self):
        """Create an empty recommender and print its two tuning parameters."""
        # user -> {movie: rating}; filled by generate_dataset
        self.trainset = {}
        self.testset = {}

        # number of nearest neighbours consulted / number of movies returned
        self.n_sim_user = 20
        self.n_rec_movie = 10

        # user -> {other user: similarity}; filled by calc_user_sim
        self.user_sim_mat = {}
        # movie -> number of training users who rated it
        self.movie_popular = {}
        self.movie_count = 0

        print('similar user number = %d' % self.n_sim_user)
        print('recommended movie number = %d' % self.n_rec_movie)

    @staticmethod
    def loadfile(filename):
        """Lazily yield the lines of ``filename`` with line endings removed.

        Progress is printed every 100000 lines and once more when the whole
        file has been consumed.
        """
        # `with` closes the handle even when the consumer stops iterating
        # early or raises while processing a line.
        with open(filename, 'r') as fp:
            for i, line in enumerate(fp):
                yield line.strip('\r\n')
                if i > 0 and i % 100000 == 0:
                    print('loading %s(%s)' % (filename, i))
        print('load %s success' % filename)

    def generate_dataset(self, filename, pivot=0.7, seed=None):
        """Read ratings from ``filename`` and split them into train/test sets.

        Each non-blank line must hold four tab-separated fields:
        user, movie, integer rating and one extra field that is ignored.
        Rows are added to ``self.trainset`` / ``self.testset``; existing
        entries are kept.

        Args:
            filename: path of the ratings file.
            pivot: probability that a row goes to the training set.
            seed: optional seed for a private RNG so the split is
                reproducible; ``None`` keeps using the global ``random``
                state.

        Raises:
            ValueError: if a non-blank line does not have exactly four
                tab-separated fields or the rating is not an integer.
        """
        trainset_len = 0
        testset_len = 0
        draw = random.random if seed is None else random.Random(seed).random

        for line in self.loadfile(filename):
            if not line.strip():
                # tolerate blank lines (e.g. a trailing empty line)
                continue
            user, movie, rating, _ = line.split('\t')
            if draw() < pivot:
                self.trainset.setdefault(user, {})[movie] = int(rating)
                trainset_len += 1
            else:
                self.testset.setdefault(user, {})[movie] = int(rating)
                testset_len += 1

        print('分离训练集和测试集成功')
        print('train set = %s' % trainset_len)
        print('test set = %s' % testset_len)

    def calc_user_sim(self):
        """Build the user-user similarity matrix from ``self.trainset``.

        similarity(u, v) = |movies(u) & movies(v)| /
                           sqrt(|movies(u)| * |movies(v)|)

        Also fills ``self.movie_popular`` and ``self.movie_count``. Users
        sharing no movie with anybody get no row in ``self.user_sim_mat``.
        """
        # Start from a clean slate so calling this twice (e.g. re-running a
        # notebook cell) does not add raw counts onto normalised similarities
        # or double the popularity counters.
        self.user_sim_mat.clear()
        self.movie_popular.clear()

        print('building movie-users inverse table ...')
        movie2users = dict()

        for user, movies in self.trainset.items():
            for movie in movies:
                movie2users.setdefault(movie, set()).add(user)
                self.movie_popular[movie] = self.movie_popular.get(movie, 0) + 1
        print('build movie-users inverse table success')

        self.movie_count = len(movie2users)
        print('total movie number = %d' % self.movie_count)

        usersim_mat = self.user_sim_mat
        print('building user co-rated movies matrix ...')

        for users in movie2users.values():
            for u in users:
                for v in users:
                    if u == v:
                        continue
                    row = usersim_mat.setdefault(u, {})
                    row[v] = row.get(v, 0) + 1
        print('build user co-rated movies matrix success')

        print('calculating user similarity matrix ...')
        simfactor_count = 0
        PRINT_STEP = 2000000
        for u, related_users in usersim_mat.items():
            n_u = len(self.trainset[u])
            # only values of existing keys are replaced, which is safe while
            # iterating over the dict
            for v, count in related_users.items():
                related_users[v] = count / math.sqrt(n_u * len(self.trainset[v]))
                simfactor_count += 1
                if simfactor_count % PRINT_STEP == 0:
                    print('calculating user similarity factor(%d)' % simfactor_count)
        print('calculate user similarity matrix(similarity factor) success')
        print('Total similarity factor number = %d' % simfactor_count)

    def recommend(self, user):
        """Return up to ``n_rec_movie`` ``(movie, score)`` pairs for ``user``.

        A movie's score is the sum of ``similarity * rating`` over the
        ``n_sim_user`` most similar users; movies the user already has in the
        training set are excluded. The list is sorted by descending score and
        is empty when the user has no neighbours.

        Raises:
            KeyError: if ``user`` is not in the training set.
        """
        K = self.n_sim_user
        N = self.n_rec_movie
        rank = dict()
        watched_movies = self.trainset[user]

        # A training user who shares no movie with anyone has no row in the
        # similarity matrix; treat that as "no neighbours" instead of failing.
        neighbours = sorted(self.user_sim_mat.get(user, {}).items(),
                            key=itemgetter(1),
                            reverse=True)[0:K]
        for v, wuv in neighbours:
            for movie, rating in self.trainset[v].items():
                if movie in watched_movies:
                    continue
                rank[movie] = rank.get(movie, 0) + wuv * rating
        return sorted(rank.items(),
                      key=itemgetter(1),
                      reverse=True)[0:N]

    def evaluate(self):
        """Print precision, recall, coverage and popularity of the recommender.

        Recommendations are generated for every training user and compared
        with that user's test-set movies. A metric whose denominator is zero
        is reported as 0 instead of raising ``ZeroDivisionError``.
        """
        print('Evaluation start ...')

        N = self.n_rec_movie
        hit = 0
        rec_count = 0
        test_count = 0
        all_rec_movies = set()
        popular_sum = 0

        for i, user in enumerate(self.trainset):
            if i > 0 and i % 500 == 0:
                print('recommended for %d users' % i)
            test_movies = self.testset.get(user, {})
            rec_movies = self.recommend(user)

            for movie, _ in rec_movies:
                if movie in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
                popular_sum += math.log(1 + self.movie_popular[movie])
            # the denominator counts N slots per user even when fewer movies
            # were actually returned
            rec_count += N
            test_count += len(test_movies)

        precision = hit / (1.0 * rec_count) if rec_count else 0.0
        recall = hit / (1.0 * test_count) if test_count else 0.0
        coverage = len(all_rec_movies) / (1.0 * self.movie_count) if self.movie_count else 0.0
        popularity = popular_sum / (1.0 * rec_count) if rec_count else 0.0

        print('precision = %.4f \t recall = %.4f \t coverage = %.4f \t popularity = %.4f' % (precision, recall, coverage, popularity))
        

In [11]:
# Ratings file passed to generate_dataset, which splits each line on tabs.
ratingfile = 'u.data'

# Build the recommender; the constructor prints the neighbour count and list length.
usercf = UserBasedCF()

similar user number = 20
recommended movie number = 10


In [12]:
# generate_dataset draws from the global `random` state to assign each rating
# to train or test. Seed it right before the call so the split, and every
# metric computed from it, is the same on each run of this cell.
random.seed(0)
usercf.generate_dataset(ratingfile, pivot=0.7)

load u.data success
分离训练集和测试集成功
train set = 70097
test set = 29903


In [13]:
# Build the user-user similarity matrix and movie popularity counts from the
# training split; recommend() and evaluate() read both.
usercf.calc_user_sim()

building movie-users inverse table ...
build movie-users inverse table success
total movie number = 1631
building user co-rated movies matrix ...
build user co-rated movies matrix success
calculating user similarity matrix ...
calculate user similarity matrix(similarity factor) success
Total similarity factor number = 808140


In [14]:
# Recommend for every training user, compare against that user's test-set
# movies, and print precision, recall, coverage and popularity.
usercf.evaluate()

Evaluation start ...
recommended for 500 users
precision = 0.3888 	 recall = 0.1226 	 coverage = 0.2109 	 popularity = 5.2920


In [25]:
# (movie id, score) recommendations for user '186', highest score first.
usercf.recommend('186')

[('50', 15.274370411916424),
 ('121', 14.840822341851513),
 ('181', 13.83940152147533),
 ('222', 12.211569195544458),
 ('98', 11.896459703280282),
 ('234', 11.133083591936712),
 ('64', 11.110588745076612),
 ('597', 10.932243501114469),
 ('748', 10.829978114871599),
 ('118', 10.732301023154479)]