In [9]:
import sys, util
import metrics
import numpy as np


def metric(trainSet, testSet, recoSet,k):
    pres = 0
    ndcg = 0
    count  = 0
    for userID in trainSet:
        testItems = testSet.get(userID, { })
        recoItems = recoSet.get(userID, { })
        
        r = []
        for itemID, rat in recoItems.iteritems():
            if itemID in testItems:
                rating = testItems.get(itemID)
                r.append(0 if rat-0.49 < rating else 1)
        if len(r) > 1:
            count += 1
            pres += metrics.precision_at_k(r,min(k, len(r)))
        ndcg += metrics.ndcg_at_k(r,k)
    if count == 0:
        count = 1
    return 1.0*pres/count,1.0*ndcg/count
 
def calcItemSimilarity(trainSet):
    import math
 
    counter   = dict()
    simMatrix = dict()    
 
    for userID, items in trainSet.items():
        for i in items:
            counter.setdefault(i, 0)
            counter[i] += 1
 
            simItems = simMatrix.setdefault(i, dict())
 
            for j in items:
                if j != i:
                    simItems.setdefault(j, 0)
                    simItems[j] += 1
 
    for i, simItems in simMatrix.items():
        for j, simIJ in simItems.items():
            simMatrix[i][j] /= math.sqrt(counter[i] * counter[j])
 
    return simMatrix
 
def calcIUFItemSimilarity(trainSet):
    import math
 
    counter   = dict()
    simMatrix = dict()    
 
    for userID, items in trainSet.items():
        for i in items:
            counter.setdefault(i, 0)
            counter[i] += 1
 
            simItems = simMatrix.setdefault(i, dict())
 
            for j in items:
                if j != i:
                    simItems.setdefault(j, 0)
                    simItems[j] += 1 / math.log(1 + len(items), math.e)
 
    for i, simItems in simMatrix.items():
        for j, simIJ in simItems.items():
            simMatrix[i][j] /= math.sqrt(counter[i] * counter[j])
 
    return simMatrix
 
def calcNormalizedItemSimilarity(trainSet):
    import math
 
    counter   = dict()
    simMatrix = dict()    
 
    for userID, items in trainSet.items():
        for i in items:
            counter.setdefault(i, 0)
            counter[i] += 1
 
            simItems = simMatrix.setdefault(i, dict())
 
            for j in items:
                if j != i:
                    simItems.setdefault(j, 0)
                    simItems[j] += 1
 
    for i, simItems in simMatrix.items():
        maxVal = 0
 
        for j, simIJ in simItems.items():
            simItems[j] = simIJ / math.sqrt(counter[i] * counter[j])
            if simItems[j] > maxVal:
                maxVal = simItems[j]
 
        for j, simIJ in simItems.items():
            simItems[j] = simIJ / maxVal
 
    return simMatrix
 
def calcRecommendation(trainSet, userID, simMatrix, K, N):
    import operator
 
    rank = dict()
 
    items = trainSet.get(userID)
    if items == None:
        return rank
 
    for i, rating in items.items():
        simItems = simMatrix.get(i)
        if simItems == None:
            continue
 
        for j, simIJ in sorted(simItems.items(), key = operator.itemgetter(1), reverse = True)[:K]:
            if not j in items:
                rank.setdefault(j, 0)
                rank[j] += rating * simIJ
 
    if len(rank) <= N:
        return rank
 
    ret = dict()
    for itemID, rating in sorted(rank.items(), key = operator.itemgetter(1), reverse = True)[:N]:
        ret[itemID] = rating
    return ret
 
if __name__ == "__main__":
    trainSet = util.loadDataSet("/home/clay/nir/movielens/ml-100k/u1.base", "\t")
    testSet  = util.loadDataSet("/home/clay/nir/movielens/ml-100k/u1.test", "\t")
    
    simMatrix = calcItemSimilarity(trainSet)
 
    K = 5   # k-nearest neighbors
    N = 40  # n-top recommendations
    for N in [1, 3, 5, 10, 40]:
        for K in [ 5, 20, 40, 80, 160 ]:
            recSet = { }
            for userID in trainSet:
                recSet[userID] = calcRecommendation(trainSet, userID, simMatrix, K, N)
            pres, ndcg=metric(trainSet, testSet, recSet,K)

            print "n = %s, k = %s, recall = %.4lf, precision = %.4lf, coverage = %.4lf, popularity = %.4lf, p_at_k = %.4lf, ndcg = %.4lf" %(
                N, K, 
                util.recall(trainSet, testSet, recSet),
                util.precision(trainSet, testSet, recSet),
                util.coverage(trainSet, testSet, recSet),
                util.popularity(trainSet, testSet, recSet),
                pres, ndcg
            )
        print
    print
    # inverse user frequence
    print 'inverse user frequence'
    simMatrix = calcIUFItemSimilarity(trainSet)
 
    K = 5   # k-nearest neighbors
    N = 40  # n-top recommendations
 
    for N in [1, 3, 5, 10, 40]:
        for K in [ 5, 20, 40, 80, 160 ]:
            recSet = { }
            for userID in trainSet:
                recSet[userID] = calcRecommendation(trainSet, userID, simMatrix, K, N)
            pres, ndcg=metric(trainSet, testSet, recSet,K)

            print "n = %s, k = %s, recall = %.4lf, precision = %.4lf, coverage = %.4lf, popularity = %.4lf, p_at_k = %.4lf, ndcg = %.4lf" %(
                N, K, 
                util.recall(trainSet, testSet, recSet),
                util.precision(trainSet, testSet, recSet),
                util.coverage(trainSet, testSet, recSet),
                util.popularity(trainSet, testSet, recSet),
                pres, ndcg
            )
        print
    print
    
    print 'Normalized Similarity'
    simMatrix = calcNormalizedItemSimilarity(trainSet)
 
    K = 5   # k-nearest neighbors
    N = 40  # n-top recommendations
 
    for N in [1, 3, 5, 10, 40]:
        for K in [ 5, 20, 40, 80, 160 ]:
            recSet = { }
            for userID in trainSet:
                recSet[userID] = calcRecommendation(trainSet, userID, simMatrix, K, N)
            pres, ndcg=metric(trainSet, testSet, recSet,K)

            print "n = %s, k = %s, recall = %.4lf, precision = %.4lf, coverage = %.4lf, popularity = %.4lf, p_at_k = %.4lf, ndcg = %.4lf" %(
                N, K, 
                util.recall(trainSet, testSet, recSet),
                util.precision(trainSet, testSet, recSet),
                util.coverage(trainSet, testSet, recSet),
                util.popularity(trainSet, testSet, recSet),
                pres, ndcg
            )
        print

n = 1, k = 5, recall = 0.0137, precision = 0.2906, coverage = 0.0418, popularity = 5.6771, p_at_k = 0.0000, ndcg = 273.0000
n = 1, k = 20, recall = 0.0138, precision = 0.2916, coverage = 0.0436, popularity = 5.6805, p_at_k = 0.0000, ndcg = 274.0000
n = 1, k = 40, recall = 0.0129, precision = 0.2725, coverage = 0.0442, popularity = 5.6986, p_at_k = 0.0000, ndcg = 256.0000
n = 1, k = 80, recall = 0.0132, precision = 0.2800, coverage = 0.0497, popularity = 5.7069, p_at_k = 0.0000, ndcg = 264.0000
n = 1, k = 160, recall = 0.0140, precision = 0.2959, coverage = 0.0521, popularity = 5.7178, p_at_k = 0.0000, ndcg = 279.0000

n = 3, k = 5, recall = 0.0357, precision = 0.2524, coverage = 0.0842, popularity = 5.5956, p_at_k = 0.9829, ndcg = 1.5641
n = 3, k = 20, recall = 0.0355, precision = 0.2506, coverage = 0.0758, popularity = 5.6261, p_at_k = 1.0000, ndcg = 1.5489
n = 3, k = 40, recall = 0.0340, precision = 0.2400, coverage = 0.0788, popularity = 5.6252, p_at_k = 1.0000, ndcg = 1.6239
n = 3,

In [6]:
'''
@author: Lockvictor
'''
import sys, random, math
from operator import itemgetter


random.seed(0)


class ItemBasedCF():
    ''' TopN recommendation - ItemBasedCF '''
    def __init__(self):
        self.trainset = {}
        self.testset = {}

        self.n_sim_movie = 20
        self.n_rec_movie = 10

        self.movie_sim_mat = {}
        self.movie_popular = {}
        self.movie_count = 0

        print >> sys.stderr, 'Similar movie number = %d' % self.n_sim_movie
        print >> sys.stderr, 'Recommended movie number = %d' % self.n_rec_movie


    @staticmethod
    def loadfile(filename):
        ''' load a file, return a generator. '''
        fp = open(filename, 'r')
        for i, line in enumerate(fp):
            yield line.strip('\r\n')
            if i % 100000 == 0:
                print >> sys.stderr, 'loading %s(%s)' % (filename, i)
        fp.close()
        print >> sys.stderr, 'load %s succ' % filename


    def generate_dataset(self, filename, pivot=0.7):
        ''' load rating data and split it to training set and test set '''
        trainset_len = 0
        testset_len = 0

        for line in self.loadfile(filename):
            user, movie, rating, _ = line.split('::')
            # split the data by pivot
            if (random.random() < pivot):
                self.trainset.setdefault(user, {})
                self.trainset[user][movie] = int(round(float(rating)))
                trainset_len += 1
            else:
                self.testset.setdefault(user, {})
                self.testset[user][movie] = int(round(float(rating)))
                testset_len += 1

        print >> sys.stderr, 'split training set and test set succ'
        print >> sys.stderr, 'train set = %s' % trainset_len
        print >> sys.stderr, 'test set = %s' % testset_len


    def calc_movie_sim(self):
        ''' calculate movie similarity matrix '''
        print >> sys.stderr, 'counting movies number and popularity...'

        for user, movies in self.trainset.iteritems():
            for movie in movies:
                # count item popularity 
                if movie not in self.movie_popular:
                    self.movie_popular[movie] = 0
                self.movie_popular[movie] += 1

        print >> sys.stderr, 'count movies number and popularity succ'

        # save the total number of movies
        self.movie_count = len(self.movie_popular)
        print >> sys.stderr, 'total movie number = %d' % self.movie_count

        # count co-rated users between items
        itemsim_mat = self.movie_sim_mat
        print >> sys.stderr, 'building co-rated users matrix...'

        for user, movies in self.trainset.iteritems():
            for m1 in movies:
                for m2 in movies:
                    if m1 == m2: continue
                    itemsim_mat.setdefault(m1,{})
                    itemsim_mat[m1].setdefault(m2,0)
                    itemsim_mat[m1][m2] += 1

        print >> sys.stderr, 'build co-rated users matrix succ'

        # calculate similarity matrix 
        print >> sys.stderr, 'calculating movie similarity matrix...'
        simfactor_count = 0
        PRINT_STEP = 2000000

        for m1, related_movies in itemsim_mat.iteritems():
            for m2, count in related_movies.iteritems():
                itemsim_mat[m1][m2] = count / math.sqrt(
                        self.movie_popular[m1] * self.movie_popular[m2])
                simfactor_count += 1
                if simfactor_count % PRINT_STEP == 0:
                    print >> sys.stderr, 'calculating movie similarity factor(%d)' % simfactor_count

        print >> sys.stderr, 'calculate movie similarity matrix(similarity factor) succ'
        print >> sys.stderr, 'Total similarity factor number = %d' %simfactor_count


    def recommend(self, user):
        ''' Find K similar movies and recommend N movies. '''
        K = self.n_sim_movie
        N = self.n_rec_movie
        rank = {}
        watched_movies = self.trainset[user]

        for movie, rating in watched_movies.iteritems():
            for related_movie, w in sorted(self.movie_sim_mat[movie].items(),
                    key=itemgetter(1), reverse=True)[:K]:
                if related_movie in watched_movies:
                    continue
                rank.setdefault(related_movie, 0)
                rank[related_movie] += w * rating
        # return the N best movies
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[:N]


    def evaluate(self):
        ''' return precision, recall, coverage and popularity '''
        print >> sys.stderr, 'Evaluation start...'

        N = self.n_rec_movie
        #  varables for precision and recall 
        hit = 0
        rec_count = 0
        test_count = 0
        # varables for coverage
        all_rec_movies = set()
        # varables for popularity
        popular_sum = 0

        for i, user in enumerate(self.trainset):
            if i % 500 == 0:
                print >> sys.stderr, 'recommended for %d users' % i
            test_movies = self.testset.get(user, {})
            rec_movies = self.recommend(user)
            for movie, w in rec_movies:
                if movie in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
                popular_sum += math.log(1 + self.movie_popular[movie])
            rec_count += N
            test_count += len(test_movies)

        precision = hit / (1.0 * rec_count)
        recall = hit / (1.0 * test_count)
        coverage = len(all_rec_movies) / (1.0 * self.movie_count)
        popularity = popular_sum / (1.0 * rec_count)

        print >> sys.stderr, 'precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f' \
                % (precision, recall, coverage, popularity)


if __name__ == '__main__':
    ratingfile = 'C:/Users/Fizik/Downloads/coursera/nir/nir/movielens/ratings.dat'
    itemcf = ItemBasedCF()
    itemcf.generate_dataset(ratingfile)
    itemcf.calc_movie_sim()
    itemcf.evaluate()

Similar movie number = 20
Recommended movie number = 10
loading C:/Users/Fizik/Downloads/coursera/nir/nir/movielens/ratings.dat(0)
loading C:/Users/Fizik/Downloads/coursera/nir/nir/movielens/ratings.dat(100000)
loading C:/Users/Fizik/Downloads/coursera/nir/nir/movielens/ratings.dat(200000)
loading C:/Users/Fizik/Downloads/coursera/nir/nir/movielens/ratings.dat(300000)
loading C:/Users/Fizik/Downloads/coursera/nir/nir/movielens/ratings.dat(400000)
loading C:/Users/Fizik/Downloads/coursera/nir/nir/movielens/ratings.dat(500000)
loading C:/Users/Fizik/Downloads/coursera/nir/nir/movielens/ratings.dat(600000)
loading C:/Users/Fizik/Downloads/coursera/nir/nir/movielens/ratings.dat(700000)
loading C:/Users/Fizik/Downloads/coursera/nir/nir/movielens/ratings.dat(800000)
loading C:/Users/Fizik/Downloads/coursera/nir/nir/movielens/ratings.dat(900000)
loading C:/Users/Fizik/Downloads/coursera/nir/nir/movielens/ratings.dat(1000000)
loading C:/Users/Fizik/Downloads/coursera/nir/nir/movielens/ratings.

KeyboardInterrupt: 

In [1]:
import sys, util
import metrics
import numpy as np


def metric(trainSet, testSet, recoSet,k):
    pres = 0
    ndcg = 0
    count  = 0
    for userID in trainSet:
        testItems = testSet.get(userID, { })
        recoItems = recoSet.get(userID, { })
        
        r = []
        for itemID, rat in recoItems.iteritems():
            if itemID in testItems:
                rating = testItems.get(itemID)
                r.append(rating)
        if len(r) > 1:
            count += 1
            pres += metrics.precision_at_k(r,min(k, len(r)))
        ndcg += metrics.ndcg_at_k(r,k)
    if count == 0:
        count = 1
    return 1.0*pres/count,1.0*ndcg/count
 
def calcItemSimilarity(trainSet):
    import math
 
    counter   = dict()
    simMatrix = dict()    
 
    for userID, items in trainSet.items():
        for i in items:
            counter.setdefault(i, 0)
            counter[i] += 1
 
            simItems = simMatrix.setdefault(i, dict())
 
            for j in items:
                if j != i:
                    simItems.setdefault(j, 0)
                    simItems[j] += 1
 
    for i, simItems in simMatrix.items():
        for j, simIJ in simItems.items():
            simMatrix[i][j] /= math.sqrt(counter[i] * counter[j])
 
    return simMatrix
 
def calcIUFItemSimilarity(trainSet):
    import math
 
    counter   = dict()
    simMatrix = dict()    
 
    for userID, items in trainSet.items():
        for i in items:
            counter.setdefault(i, 0)
            counter[i] += 1
 
            simItems = simMatrix.setdefault(i, dict())
 
            for j in items:
                if j != i:
                    simItems.setdefault(j, 0)
                    simItems[j] += 1 / math.log(1 + len(items), math.e)
 
    for i, simItems in simMatrix.items():
        for j, simIJ in simItems.items():
            simMatrix[i][j] /= math.sqrt(counter[i] * counter[j])
 
    return simMatrix
 
def calcNormalizedItemSimilarity(trainSet):
    import math
 
    counter   = dict()
    simMatrix = dict()    
 
    for userID, items in trainSet.items():
        for i in items:
            counter.setdefault(i, 0)
            counter[i] += 1
 
            simItems = simMatrix.setdefault(i, dict())
 
            for j in items:
                if j != i:
                    simItems.setdefault(j, 0)
                    simItems[j] += 1
 
    for i, simItems in simMatrix.items():
        maxVal = 0
 
        for j, simIJ in simItems.items():
            simItems[j] = simIJ / math.sqrt(counter[i] * counter[j])
            if simItems[j] > maxVal:
                maxVal = simItems[j]
 
        for j, simIJ in simItems.items():
            simItems[j] = simIJ / maxVal
 
    return simMatrix
 
def calcRecommendation(trainSet, userID, simMatrix, K, N):
    import operator
 
    rank = dict()
 
    items = trainSet.get(userID)
    if items == None:
        return rank
 
    for i, rating in items.items():
        simItems = simMatrix.get(i)
        if simItems == None:
            continue
 
        for j, simIJ in sorted(simItems.items(), key = operator.itemgetter(1), reverse = True)[:K]:
            if not j in items:
                rank.setdefault(j, 0)
                rank[j] += rating * simIJ
 
    if len(rank) <= N:
        return rank
 
    ret = dict()
    for itemID, rating in sorted(rank.items(), key = operator.itemgetter(1), reverse = True)[:N]:
        ret[itemID] = rating
    return ret
 
if __name__ == "__main__":
    trainSet = util.loadDataSet("/home/clay/nir/movielens/ml-100k/u1.base", "\t")
    testSet  = util.loadDataSet("/home/clay/nir/movielens/ml-100k/u1.test", "\t")
    
    simMatrix = calcItemSimilarity(trainSet)
 
    K = 5   # k-nearest neighbors
    N = 40  # n-top recommendations
    for N in [1, 3, 5, 10, 40]:
        for K in [ 5, 20, 40, 80, 160 ]:
            recSet = { }
            for userID in trainSet:
                recSet[userID] = calcRecommendation(trainSet, userID, simMatrix, K, N)
            pres, ndcg=metric(trainSet, testSet, recSet,K)

            print "n = %s, k = %s, recall = %.4lf, precision = %.4lf, coverage = %.4lf, popularity = %.4lf, p_at_k = %.4lf, ndcg = %.4lf" %(
                N, K, 
                util.recall(trainSet, testSet, recSet),
                util.precision(trainSet, testSet, recSet),
                util.coverage(trainSet, testSet, recSet),
                util.popularity(trainSet, testSet, recSet),
                pres, ndcg
            )
        print
    print
    # inverse user frequence
    print 'inverse user frequence'
    simMatrix = calcIUFItemSimilarity(trainSet)
 
    K = 5   # k-nearest neighbors
    N = 40  # n-top recommendations
 
    for N in [1, 3, 5, 10, 40]:
        for K in [ 5, 20, 40, 80, 160 ]:
            recSet = { }
            for userID in trainSet:
                recSet[userID] = calcRecommendation(trainSet, userID, simMatrix, K, N)
            pres, ndcg=metric(trainSet, testSet, recSet,K)

            print "n = %s, k = %s, recall = %.4lf, precision = %.4lf, coverage = %.4lf, popularity = %.4lf, p_at_k = %.4lf, ndcg = %.4lf" %(
                N, K, 
                util.recall(trainSet, testSet, recSet),
                util.precision(trainSet, testSet, recSet),
                util.coverage(trainSet, testSet, recSet),
                util.popularity(trainSet, testSet, recSet),
                pres, ndcg
            )
        print
    print
    
    print 'Normalized Similarity'
    simMatrix = calcNormalizedItemSimilarity(trainSet)
 
    K = 5   # k-nearest neighbors
    N = 40  # n-top recommendations
 
    for N in [1, 3, 5, 10, 40]:
        for K in [ 5, 20, 40, 80, 160 ]:
            recSet = { }
            for userID in trainSet:
                recSet[userID] = calcRecommendation(trainSet, userID, simMatrix, K, N)
            pres, ndcg=metric(trainSet, testSet, recSet,K)

            print "n = %s, k = %s, recall = %.4lf, precision = %.4lf, coverage = %.4lf, popularity = %.4lf, p_at_k = %.4lf, ndcg = %.4lf" %(
                N, K, 
                util.recall(trainSet, testSet, recSet),
                util.precision(trainSet, testSet, recSet),
                util.coverage(trainSet, testSet, recSet),
                util.popularity(trainSet, testSet, recSet),
                pres, ndcg
            )
        print

n = 1, k = 5, recall = 0.0137, precision = 0.2906, coverage = 0.0418, popularity = 5.6771, p_at_k = 0.0000, ndcg = 274.0000
n = 1, k = 20, recall = 0.0138, precision = 0.2916, coverage = 0.0436, popularity = 5.6805, p_at_k = 0.0000, ndcg = 275.0000
n = 1, k = 40, recall = 0.0129, precision = 0.2725, coverage = 0.0442, popularity = 5.6986, p_at_k = 0.0000, ndcg = 257.0000
n = 1, k = 80, recall = 0.0132, precision = 0.2800, coverage = 0.0497, popularity = 5.7069, p_at_k = 0.0000, ndcg = 264.0000
n = 1, k = 160, recall = 0.0140, precision = 0.2959, coverage = 0.0521, popularity = 5.7178, p_at_k = 0.0000, ndcg = 279.0000

n = 3, k = 5, recall = 0.0357, precision = 0.2524, coverage = 0.0842, popularity = 5.5956, p_at_k = 1.0000, ndcg = 1.5681
n = 3, k = 20, recall = 0.0355, precision = 0.2506, coverage = 0.0758, popularity = 5.6261, p_at_k = 1.0000, ndcg = 1.5430
n = 3, k = 40, recall = 0.0340, precision = 0.2400, coverage = 0.0788, popularity = 5.6252, p_at_k = 1.0000, ndcg = 1.6189
n = 3,

In [28]:
import sys, util
import metrics
import numpy as np


def metric(trainSet, testSet, recoSet,k):
    pres = 0
    ndcg = 0
    count  = 0
    for userID in trainSet:
        testItems = testSet.get(userID, { })
        recoItems = recoSet.get(userID, { })
        print testItems
        print
        print recoItems
        sys.exit()
        r = []
        for itemID, rat in recoItems.iteritems():
            if itemID in testItems:
                rating = testItems.get(itemID)
                r.append(rating)
        if len(r) > 1:
            count += 1
            pres += metrics.precision_at_k(r,min(k, len(r)))
        ndcg += metrics.ndcg_at_k(r,k)
    if count == 0:
        count = 1
    return 1.0*pres/count,1.0*ndcg/count
 
def calcItemSimilarity(trainSet):
    import math
 
    counter   = dict()
    simMatrix = dict()    
 
    for userID, items in trainSet.items():
        for i in items:
            counter.setdefault(i, 0)
            counter[i] += 1
 
            simItems = simMatrix.setdefault(i, dict())
 
            for j in items:
                if j != i:
                    simItems.setdefault(j, 0)
                    simItems[j] += 1
 
    for i, simItems in simMatrix.items():
        for j, simIJ in simItems.items():
            simMatrix[i][j] /= math.sqrt(counter[i] * counter[j])
 
    return simMatrix
 
def calcIUFItemSimilarity(trainSet):
    import math
 
    counter   = dict()
    simMatrix = dict()    
 
    for userID, items in trainSet.items():
        for i in items:
            counter.setdefault(i, 0)
            counter[i] += 1
 
            simItems = simMatrix.setdefault(i, dict())
 
            for j in items:
                if j != i:
                    simItems.setdefault(j, 0)
                    simItems[j] += 1 / math.log(1 + len(items), math.e)
 
    for i, simItems in simMatrix.items():
        for j, simIJ in simItems.items():
            simMatrix[i][j] /= math.sqrt(counter[i] * counter[j])
 
    return simMatrix
 
def calcNormalizedItemSimilarity(trainSet):
    import math
 
    counter   = dict()
    simMatrix = dict()    
 
    for userID, items in trainSet.items():
        for i in items:
            counter.setdefault(i, 0)
            counter[i] += 1
 
            simItems = simMatrix.setdefault(i, dict())
 
            for j in items:
                if j != i:
                    simItems.setdefault(j, 0)
                    simItems[j] += 1
 
    for i, simItems in simMatrix.items():
        maxVal = 0
 
        for j, simIJ in simItems.items():
            simItems[j] = simIJ / math.sqrt(counter[i] * counter[j])
            if simItems[j] > maxVal:
                maxVal = simItems[j]
 
        for j, simIJ in simItems.items():
            simItems[j] = simIJ / maxVal
 
    return simMatrix
 
def calcRecommendation(trainSet, userID, simMatrix, K, N):
    import operator
 
    rank = dict()
 
    items = trainSet.get(userID)
    if items == None:
        return rank
 
    for i, rating in items.items():
        simItems = simMatrix.get(i)
        if simItems == None:
            continue
 
        for j, simIJ in sorted(simItems.items(), key = operator.itemgetter(1), reverse = True)[:K]:
            if not j in items:
                rank.setdefault(j, 0)
                rank[j] += rating * simIJ
 
    if len(rank) <= N:
        return rank
 
    ret = dict()
    for itemID, rating in sorted(rank.items(), key = operator.itemgetter(1), reverse = True)[:N]:
        ret[itemID] = rating
    return ret
 
if __name__ == "__main__":
    trainSet = util.loadDataSet("C:/Users/Fizik/Downloads/coursera/nir/nir/movielens/u1.base", "\t")
    testSet  = util.loadDataSet("C:/Users/Fizik/Downloads/coursera/nir/nir/movielens/u1.test", "\t")
    
    simMatrix = calcItemSimilarity(trainSet)
 
    K = 5   # k-nearest neighbors
    N = 40  # n-top recommendations
    for N in [5, 10, 40]:
        for K in [ 5, 20, 40, 80, 160 ]:
            recSet = { }
            for userID in trainSet:
                recSet[userID] = calcRecommendation(trainSet, userID, simMatrix, K, N)
            
            print "n = %s, k = %s, recall = %.4lf, precision = %.4lf, coverage = %.4lf, popularity = %.4lf" %(
                N, K, 
                util.recall(trainSet, testSet, recSet),
                util.precision(trainSet, testSet, recSet),
                util.coverage(trainSet, testSet, recSet),
                util.popularity(trainSet, testSet, recSet)
            )
        print
    print
    # inverse user frequence
    print 'inverse user frequence'
    simMatrix = calcIUFItemSimilarity(trainSet)
 
    K = 5   # k-nearest neighbors
    N = 40  # n-top recommendations
 
    for N in [1, 3, 5, 10, 40]:
        for K in [ 5, 20, 40, 80, 160 ]:
            recSet = { }
            for userID in trainSet:
                recSet[userID] = calcRecommendation(trainSet, userID, simMatrix, K, N)
            
            print "n = %s, k = %s, recall = %.4lf, precision = %.4lf, coverage = %.4lf, popularity = %.4lf" %(
                N, K, 
                util.recall(trainSet, testSet, recSet),
                util.precision(trainSet, testSet, recSet),
                util.coverage(trainSet, testSet, recSet),
                util.popularity(trainSet, testSet, recSet)
            )
        print
    print
    
    print 'Normalized Similarity'
    simMatrix = calcNormalizedItemSimilarity(trainSet)
 
    K = 5   # k-nearest neighbors
    N = 40  # n-top recommendations
 
    for N in [1, 3, 5, 10, 40]:
        for K in [ 5, 20, 40, 80, 160 ]:
            recSet = { }
            for userID in trainSet:
                recSet[userID] = calcRecommendation(trainSet, userID, simMatrix, K, N)

            print "n = %s, k = %s, recall = %.4lf, precision = %.4lf, coverage = %.4lf, popularity = %.4lf" %(
                N, K, 
                util.recall(trainSet, testSet, recSet),
                util.precision(trainSet, testSet, recSet),
                util.coverage(trainSet, testSet, recSet),
                util.popularity(trainSet, testSet, recSet)
            )
        print
    print recSet

n = 5, k = 5, recall = 0.0558, precision = 0.2365, coverage = 0.1285, popularity = 5.5340
n = 5, k = 20, recall = 0.0554, precision = 0.2350, coverage = 0.1012, popularity = 5.5869
n = 5, k = 40, recall = 0.0538, precision = 0.2282, coverage = 0.0976, popularity = 5.5803
n = 5, k = 80, recall = 0.0543, precision = 0.2303, coverage = 0.0982, popularity = 5.5852
n = 5, k = 160, recall = 0.0566, precision = 0.2401, coverage = 0.1018, popularity = 5.5933

n = 10, k = 5, recall = 0.0973, precision = 0.2064, coverage = 0.2370, popularity = 5.3876
n = 10, k = 20, recall = 0.0990, precision = 0.2101, coverage = 0.1473, popularity = 5.5068
n = 10, k = 40, recall = 0.0968, precision = 0.2053, coverage = 0.1345, popularity = 5.5006


KeyboardInterrupt: 

In [3]:
dict = {6: 5, 10: 3, 12: 5, 14: 5, 17: 3, 20: 4, 23: 4, 24: 3, 27: 2, 31: 3, 33: 4, 36: 2, 39: 4, 44: 5, 47: 4, 49: 3, 51: 4, 53: 3, 54: 3, 56: 4, 60: 5, 61: 4, 62: 3, 64: 5, 65: 4, 67: 3, 69: 3, 70: 3, 72: 4, 73: 3, 74: 1, 76: 4, 78: 1, 80: 4, 81: 5, 82: 5, 84: 4, 85: 3, 86: 5, 90: 4, 91: 5, 92: 3, 96: 5, 97: 3, 98: 4, 100: 5, 102: 2, 103: 1, 104: 1, 107: 4, 108: 5, 112: 1, 113: 5, 114: 5, 117: 3, 118: 3, 120: 1, 121: 4, 125: 3, 128: 4, 129: 5, 130: 3, 132: 4, 134: 4, 140: 1, 143: 1, 145: 2, 148: 2, 150: 5, 151: 4, 154: 5, 155: 2, 157: 4, 159: 3, 160: 4, 161: 4, 163: 4, 164: 3, 170: 5, 171: 5, 174: 5, 175: 5, 177: 5, 180: 3, 183: 5, 184: 4, 185: 4, 186: 4, 188: 3, 189: 3, 190: 5, 193: 4, 196: 5, 200: 3, 201: 3, 202: 5, 206: 4, 208: 5, 209: 4, 210: 4, 212: 4, 213: 2, 214: 4, 215: 3, 218: 3, 219: 1, 221: 5, 222: 4, 224: 5, 225: 2, 226: 3, 227: 4, 228: 5, 229: 4, 230: 4, 232: 3, 233: 2, 235: 5, 236: 4, 241: 4, 242: 5, 243: 1, 248: 4, 250: 4, 252: 2, 253: 5, 254: 1, 255: 2, 258: 5, 259: 1, 260: 1, 262: 3, 264: 2, 265: 4, 266: 1, 267: 4, 272: 3}

In [5]:
import numpy as np
np.mean(np.array(dict.values()))

3.5401459854014599

In [7]:
np.array(dict.values())

array([5, 1, 1, 5, 2, 4, 3, 4, 5, 5, 3, 3, 4, 4, 3, 2, 3, 4, 2, 3, 4, 5, 4,
       3, 4, 3, 3, 4, 1, 5, 4, 3, 5, 4, 3, 3, 3, 4, 3, 1, 4, 1, 4, 5, 5, 4,
       3, 5, 4, 5, 3, 5, 3, 4, 5, 2, 1, 1, 4, 5, 1, 5, 5, 3, 3, 1, 4, 3, 4,
       5, 3, 4, 4, 1, 1, 2, 2, 5, 4, 5, 2, 4, 3, 4, 4, 4, 3, 5, 5, 5, 5, 5,
       3, 5, 4, 4, 4, 3, 3, 5, 4, 5, 3, 3, 5, 4, 5, 4, 4, 4, 2, 4, 3, 3, 1,
       5, 4, 5, 2, 3, 4, 5, 4, 4, 3, 2, 5, 4, 4, 5, 1, 4, 4, 2, 5, 1, 2])

In [21]:
x = np.eye(4)
x[0][1] = 11
x

array([[  1.,  11.,   0.,   0.],
       [  0.,   1.,   0.,   0.],
       [  0.,   0.,   1.,   0.],
       [  0.,   0.,   0.,   1.]])

In [23]:
x[0]

array([  1.,  11.,   0.,   0.])

In [10]:
x[np.nonzero(x)]

array([ 1.])

In [9]:
x

array([ 0.,  1.,  0.])

In [42]:
import sys, util
import metrics
import numpy as np


def metric(trainSet, testSet, recoSet,k):
    pres = 0
    ndcg = 0
    count  = 0
    for userID in trainSet:
        testItems = testSet.get(userID, { })
        recoItems = recoSet.get(userID, { })
        print testItems
        print
        print recoItems
        sys.exit()
        r = []
        for itemID, rat in recoItems.iteritems():
            if itemID in testItems:
                rating = testItems.get(itemID)
                r.append(rating)
        if len(r) > 1:
            count += 1
            pres += metrics.precision_at_k(r,min(k, len(r)))
        ndcg += metrics.ndcg_at_k(r,k)
    if count == 0:
        count = 1
    return 1.0*pres/count,1.0*ndcg/count
 
def calcItemSimilarity(trainSet):
    import math
 
    counter   = dict()
    simMatrix = dict()    
 
    for userID, items in trainSet.items():
        for i in items:
            counter.setdefault(i, 0)
            counter[i] += 1
 
            simItems = simMatrix.setdefault(i, dict())
 
            for j in items:
                if j != i:
                    simItems.setdefault(j, 0)
                    simItems[j] += 1
    print simMatrix
    print simItems
    sys.exit()
    for i, simItems in simMatrix.items():
        for j, simIJ in simItems.items():
            simMatrix[i][j] /= math.sqrt(counter[i] * counter[j])
 
    return simMatrix
 
def calcIUFItemSimilarity(trainSet):
    import math
 
    counter   = dict()
    simMatrix = dict()    
 
    for userID, items in trainSet.items():
        for i in items:
            counter.setdefault(i, 0)
            counter[i] += 1
 
            simItems = simMatrix.setdefault(i, dict())
 
            for j in items:
                if j != i:
                    simItems.setdefault(j, 0)
                    simItems[j] += 1 / math.log(1 + len(items), math.e)
 
    for i, simItems in simMatrix.items():
        for j, simIJ in simItems.items():
            simMatrix[i][j] /= math.sqrt(counter[i] * counter[j])
 
    return simMatrix
 
def calcNormalizedItemSimilarity(trainSet):
    import math
 
    counter   = dict()
    simMatrix = dict()    
 
    for userID, items in trainSet.items():
        for i in items:
            counter.setdefault(i, 0)
            counter[i] += 1
 
            simItems = simMatrix.setdefault(i, dict())
 
            for j in items:
                if j != i:
                    simItems.setdefault(j, 0)
                    simItems[j] += 1

    for i, simItems in simMatrix.items():
        maxVal = 0
 
        for j, simIJ in simItems.items():
            
            simItems[j] = simIJ / math.sqrt(counter[i] * counter[j])
            print simItems
            sys.exit()
            if simItems[j] > maxVal:
                maxVal = simItems[j]
        print maxVal
        sys.exit()
        for j, simIJ in simItems.items():
            simItems[j] = simIJ / maxVal
 
    return simMatrix
 
def calcRecommendation(trainSet, userID, simMatrix, K, N):
    import operator
 
    rank = dict()
 
    items = trainSet.get(userID)
    if items == None:
        return rank
 
    for i, rating in items.items():
        simItems = simMatrix.get(i)
        if simItems == None:
            continue
 
        for j, simIJ in sorted(simItems.items(), key = operator.itemgetter(1), reverse = True)[:K]:
            if not j in items:
                rank.setdefault(j, 0)
                rank[j] += rating * simIJ
 
    if len(rank) <= N:
        return rank
 
    ret = dict()
    for itemID, rating in sorted(rank.items(), key = operator.itemgetter(1), reverse = True)[:N]:
        ret[itemID] = rating
    return ret
 
if __name__ == "__main__":
    trainSet = util.loadDataSet("C:/Users/Fizik/Downloads/coursera/nir/nir/movielens/u1.base", "\t")
    testSet  = util.loadDataSet("C:/Users/Fizik/Downloads/coursera/nir/nir/movielens/u1.test", "\t")
    
    
    print 'Normalized Similarity'
    simMatrix = calcNormalizedItemSimilarity(trainSet)
 
    K = 5   # k-nearest neighbors
    N = 40  # n-top recommendations
 
    for N in [1, 3, 5, 10, 40]:
        for K in [ 5, 20, 40, 80, 160 ]:
            recSet = { }
            for userID in trainSet:
                recSet[userID] = calcRecommendation(trainSet, userID, simMatrix, K, N)
            
            print recSet
            sys.exit()

            print "n = %s, k = %s, recall = %.4lf, precision = %.4lf, coverage = %.4lf, popularity = %.4lf" %(
                N, K, 
                util.recall(trainSet, testSet, recSet),
                util.precision(trainSet, testSet, recSet),
                util.coverage(trainSet, testSet, recSet),
                util.popularity(trainSet, testSet, recSet)
            )
        print
    print recSet

Normalized Similarity
{2: 0.3690096157031852, 3: 60, 4: 106, 5: 38, 6: 9, 7: 203, 8: 110, 9: 146, 10: 44, 11: 112, 12: 114, 13: 92, 14: 78, 15: 162, 16: 22, 17: 43, 18: 2, 19: 31, 20: 34, 21: 51, 22: 141, 23: 77, 24: 99, 25: 160, 26: 30, 27: 28, 28: 151, 29: 60, 30: 15, 31: 81, 32: 34, 33: 45, 34: 2, 35: 5, 36: 5, 37: 3, 38: 62, 39: 39, 40: 26, 41: 17, 42: 70, 43: 21, 44: 35, 45: 33, 46: 12, 47: 62, 48: 46, 49: 46, 50: 278, 51: 36, 52: 36, 53: 61, 54: 46, 55: 66, 56: 171, 57: 14, 58: 87, 59: 26, 60: 24, 61: 19, 62: 64, 63: 55, 64: 133, 65: 63, 66: 79, 67: 52, 68: 70, 69: 157, 70: 114, 71: 120, 72: 74, 73: 70, 74: 1, 75: 2, 76: 20, 77: 67, 78: 18, 79: 167, 80: 38, 81: 57, 82: 135, 83: 87, 84: 13, 85: 33, 86: 66, 87: 79, 88: 104, 89: 136, 90: 50, 91: 81, 92: 42, 93: 59, 94: 71, 95: 137, 96: 153, 97: 125, 98: 168, 99: 94, 100: 225, 101: 44, 102: 27, 103: 7, 104: 2, 105: 48, 106: 39, 107: 20, 108: 36, 109: 74, 110: 14, 111: 153, 112: 13, 113: 5, 114: 32, 115: 8, 116: 49, 117: 214, 118: 170

SystemExit: 

To exit: use 'exit', 'quit', or Ctrl-D.
