In [5]:
import sys, util

def metrics(trainSet, testSet, recoSet):
    hit = 0
    all = 0
    print recoSet
    print
    print testSet
    for userID in trainSet:
        testItems = testSet.get(userID, { })
        recoItems = recoSet.get(userID, { })

        r = []
        for itemID in recoItems:
            if itemID in testItems:
                print itemID
                sys.exit()
        
    return 1.0 * hit / all
 
def calcItemSimilarity(trainSet):
    import math
 
    counter   = dict()
    simMatrix = dict()    
 
    for userID, items in trainSet.items():
        for i in items:
            counter.setdefault(i, 0)
            counter[i] += 1
 
            simItems = simMatrix.setdefault(i, dict())
 
            for j in items:
                if j != i:
                    simItems.setdefault(j, 0)
                    simItems[j] += 1
 
    for i, simItems in simMatrix.items():
        for j, simIJ in simItems.items():
            simMatrix[i][j] /= math.sqrt(counter[i] * counter[j])
 
    return simMatrix
 
def calcIUFItemSimilarity(trainSet):
    import math
 
    counter   = dict()
    simMatrix = dict()    
 
    for userID, items in trainSet.items():
        for i in items:
            counter.setdefault(i, 0)
            counter[i] += 1
 
            simItems = simMatrix.setdefault(i, dict())
 
            for j in items:
                if j != i:
                    simItems.setdefault(j, 0)
                    simItems[j] += 1 / math.log(1 + len(items), math.e)
 
    for i, simItems in simMatrix.items():
        for j, simIJ in simItems.items():
            simMatrix[i][j] /= math.sqrt(counter[i] * counter[j])
 
    return simMatrix
 
def calcNormalizedItemSimilarity(trainSet):
    import math
 
    counter   = dict()
    simMatrix = dict()    
 
    for userID, items in trainSet.items():
        for i in items:
            counter.setdefault(i, 0)
            counter[i] += 1
 
            simItems = simMatrix.setdefault(i, dict())
 
            for j in items:
                if j != i:
                    simItems.setdefault(j, 0)
                    simItems[j] += 1
 
    for i, simItems in simMatrix.items():
        maxVal = 0
 
        for j, simIJ in simItems.items():
            simItems[j] = simIJ / math.sqrt(counter[i] * counter[j])
            if simItems[j] > maxVal:
                maxVal = simItems[j]
 
        for j, simIJ in simItems.items():
            simItems[j] = simIJ / maxVal
 
    return simMatrix
 
def calcRecommendation(trainSet, userID, simMatrix, K, N):
    import operator
 
    rank = dict()
 
    items = trainSet.get(userID)
    if items == None:
        return rank
 
    for i, rating in items.items():
        simItems = simMatrix.get(i)
        if simItems == None:
            continue
 
        for j, simIJ in sorted(simItems.items(), key = operator.itemgetter(1), reverse = True)[:K]:
            if not j in items:
                rank.setdefault(j, 0)
                rank[j] += rating * simIJ
 
    if len(rank) <= N:
        return rank
 
    ret = dict()
    for itemID, rating in sorted(rank.items(), key = operator.itemgetter(1), reverse = True)[:N]:
        ret[itemID] = rating
    return ret
 
if __name__ == "__main__":
    trainSet = util.loadDataSet("C:/Users/Fizik/Downloads/coursera/nir/nir/movielens/u1.base", "\t")
    testSet  = util.loadDataSet("C:/Users/Fizik/Downloads/coursera/nir/nir/movielens/u1.test", "\t")
    
    simMatrix = calcItemSimilarity(trainSet)
 
    K = 5   # k-nearest neighbors
    N = 40  # n-top recommendations
 
    for K in [ 5, 20, 40, 80, 160 ]:
        recSet = { }
        for userID in trainSet:
            recSet[userID] = calcRecommendation(trainSet, userID, simMatrix, K, N)
        metrics(trainSet, testSet, recSet)
 
        print "n = %s, k = %s, recall = %.4lf, precision = %.4lf, coverage = %.4lf, popularity = %.4lf" %(
            N, K, 
            util.recall(trainSet, testSet, recSet),
            util.precision(trainSet, testSet, recSet),
            util.coverage(trainSet, testSet, recSet),
            util.popularity(trainSet, testSet, recSet)
        )
    print
 
    # inverse user frequence
    print 'inverse user frequence'
    simMatrix = calcIUFItemSimilarity(trainSet)
 
    K = 5   # k-nearest neighbors
    N = 40  # n-top recommendations
 
    for K in [ 5, 20, 40, 80, 160 ]:
        recSet = { }
        for userID in trainSet:
            recSet[userID] = calcRecommendation(trainSet, userID, simMatrix, K, N)
 
        print "n = %s, k = %s, recall = %.4lf, precision = %.4lf, coverage = %.4lf, popularity = %.4lf" %(
            N, K, 
            util.recall(trainSet, testSet, recSet),
            util.precision(trainSet, testSet, recSet),
            util.coverage(trainSet, testSet, recSet),
            util.popularity(trainSet, testSet, recSet)
        )
    print
    
    print 'Normalized Similarity'
    simMatrix = calcNormalizedItemSimilarity(trainSet)
 
    K = 5   # k-nearest neighbors
    N = 40  # n-top recommendations
 
    for K in [ 5, 20, 40, 80, 160 ]:
        recSet = { }
        for userID in trainSet:
            recSet[userID] = calcRecommendation(trainSet, userID, simMatrix, K, N)
 
        print "n = %s, k = %s, recall = %.4lf, precision = %.4lf, coverage = %.4lf, popularity = %.4lf" %(
            N, K, 
            util.recall(trainSet, testSet, recSet),
            util.precision(trainSet, testSet, recSet),
            util.coverage(trainSet, testSet, recSet),
            util.popularity(trainSet, testSet, recSet)
        )
    print

{1: {385: 7.305497494138949, 258: 5.110949341016381, 393: 13.210317413661526, 14: 5.374746532502104, 529: 6.796091666926262, 403: 7.805422623349312, 405: 11.907114664870111, 151: 4.314244002682448, 218: 4.927790797464725, 285: 5.442275714907135, 161: 5.251298808562925, 550: 9.161708126531925, 423: 7.329626058251614, 170: 11.671052818789551, 174: 76.59533634734602, 559: 4.221132789801766, 566: 4.632444217278627, 183: 6.606166346602855, 56: 12.898673583218256, 60: 5.239477674094101, 61: 6.33158298835021, 196: 4.339779643869865, 69: 17.838201741193423, 70: 4.771494755397323, 202: 9.429455558717008, 588: 4.249363784101274, 82: 10.78177255822932, 212: 6.110267220441655, 474: 7.442262740186463, 475: 5.99491385229882, 96: 22.63813431629321, 97: 5.346207382040807, 98: 7.698403248936467, 100: 15.498903318096557, 357: 8.434881388033839, 235: 5.860891169651576, 210: 12.052958922255675, 302: 6.519792231368392, 117: 8.191117977992661, 121: 22.392151776288514}, 2: {896: 1.7820842224272613, 257: 3.20

SystemExit: 

To exit: use 'exit', 'quit', or Ctrl-D.


In [5]:
'''
@author: Lockvictor
'''
import sys, random, math
from operator import itemgetter


random.seed(0)


class ItemBasedCF():
    ''' TopN recommendation - ItemBasedCF '''
    def __init__(self):
        self.trainset = {}
        self.testset = {}

        self.n_sim_movie = 20
        self.n_rec_movie = 10

        self.movie_sim_mat = {}
        self.movie_popular = {}
        self.movie_count = 0

        print >> sys.stderr, 'Similar movie number = %d' % self.n_sim_movie
        print >> sys.stderr, 'Recommended movie number = %d' % self.n_rec_movie


    @staticmethod
    def loadfile(filename):
        ''' load a file, return a generator. '''
        fp = open(filename, 'r')
        for i, line in enumerate(fp):
            yield line.strip('\r\n')
            if i % 100000 == 0:
                print >> sys.stderr, 'loading %s(%s)' % (filename, i)
        fp.close()
        print >> sys.stderr, 'load %s succ' % filename


    def generate_dataset(self, filename, pivot=0.7):
        ''' load rating data and split it to training set and test set '''
        trainset_len = 0
        testset_len = 0

        for line in self.loadfile(filename):
            user, movie, rating, _ = line.split('::')
            # split the data by pivot
            if (random.random() < pivot):
                self.trainset.setdefault(user, {})
                self.trainset[user][movie] = int(round(float(rating)))
                trainset_len += 1
            else:
                self.testset.setdefault(user, {})
                self.testset[user][movie] = int(round(float(rating)))
                testset_len += 1

        print >> sys.stderr, 'split training set and test set succ'
        print >> sys.stderr, 'train set = %s' % trainset_len
        print >> sys.stderr, 'test set = %s' % testset_len


    def calc_movie_sim(self):
        ''' calculate movie similarity matrix '''
        print >> sys.stderr, 'counting movies number and popularity...'

        for user, movies in self.trainset.iteritems():
            for movie in movies:
                # count item popularity 
                if movie not in self.movie_popular:
                    self.movie_popular[movie] = 0
                self.movie_popular[movie] += 1

        print >> sys.stderr, 'count movies number and popularity succ'

        # save the total number of movies
        self.movie_count = len(self.movie_popular)
        print >> sys.stderr, 'total movie number = %d' % self.movie_count

        # count co-rated users between items
        itemsim_mat = self.movie_sim_mat
        print >> sys.stderr, 'building co-rated users matrix...'

        for user, movies in self.trainset.iteritems():
            for m1 in movies:
                for m2 in movies:
                    if m1 == m2: continue
                    itemsim_mat.setdefault(m1,{})
                    itemsim_mat[m1].setdefault(m2,0)
                    itemsim_mat[m1][m2] += 1

        print >> sys.stderr, 'build co-rated users matrix succ'

        # calculate similarity matrix 
        print >> sys.stderr, 'calculating movie similarity matrix...'
        simfactor_count = 0
        PRINT_STEP = 2000000

        for m1, related_movies in itemsim_mat.iteritems():
            for m2, count in related_movies.iteritems():
                itemsim_mat[m1][m2] = count / math.sqrt(
                        self.movie_popular[m1] * self.movie_popular[m2])
                simfactor_count += 1
                if simfactor_count % PRINT_STEP == 0:
                    print >> sys.stderr, 'calculating movie similarity factor(%d)' % simfactor_count

        print >> sys.stderr, 'calculate movie similarity matrix(similarity factor) succ'
        print >> sys.stderr, 'Total similarity factor number = %d' %simfactor_count


    def recommend(self, user):
        ''' Find K similar movies and recommend N movies. '''
        K = self.n_sim_movie
        N = self.n_rec_movie
        rank = {}
        watched_movies = self.trainset[user]

        for movie, rating in watched_movies.iteritems():
            for related_movie, w in sorted(self.movie_sim_mat[movie].items(),
                    key=itemgetter(1), reverse=True)[:K]:
                if related_movie in watched_movies:
                    continue
                rank.setdefault(related_movie, 0)
                rank[related_movie] += w * rating
        # return the N best movies
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[:N]


    def evaluate(self):
        ''' return precision, recall, coverage and popularity '''
        print >> sys.stderr, 'Evaluation start...'

        N = self.n_rec_movie
        #  varables for precision and recall 
        hit = 0
        rec_count = 0
        test_count = 0
        # varables for coverage
        all_rec_movies = set()
        # varables for popularity
        popular_sum = 0

        for i, user in enumerate(self.trainset):
            if i % 500 == 0:
                print >> sys.stderr, 'recommended for %d users' % i
            test_movies = self.testset.get(user, {})
            rec_movies = self.recommend(user)
            for movie, w in rec_movies:
                if movie in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
                popular_sum += math.log(1 + self.movie_popular[movie])
            rec_count += N
            test_count += len(test_movies)

        precision = hit / (1.0 * rec_count)
        recall = hit / (1.0 * test_count)
        coverage = len(all_rec_movies) / (1.0 * self.movie_count)
        popularity = popular_sum / (1.0 * rec_count)

        print >> sys.stderr, 'precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f' \
                % (precision, recall, coverage, popularity)


if __name__ == '__main__':
    ratingfile = 'C:/Users/Fizik/Downloads/coursera/nir/nir/movielens/ratings.dat'
    itemcf = ItemBasedCF()
    itemcf.generate_dataset(ratingfile)
    itemcf.calc_movie_sim()
    itemcf.evaluate()

Similar movie number = 20
Recommended movie number = 10
loading C:/Users/Fizik/Downloads/coursera/nir/nir/movielens/ratings.dat(0)


ValueError: invalid literal for int() with base 10: '4.5'