In [1]:
import codecs
from math import sqrt

In [2]:
class recommender:
    """Neighbour-based recommender configured with a similarity metric.

    The similarity methods selected by ``metric`` (``pearson`` and
    ``cos_similarity``) are looked up on the instance when it is constructed.
    """

    def __init__(self, data, k=1, metric='cos', n=5):
        """Initialise the recommender.

        data   -- training data; kept only when it is a dict (including dict
                  subclasses), otherwise an empty dict is used
        k      -- the k used by the k-nearest-neighbour step
        metric -- which similarity calculation to use: 'pearson' or 'cos'
        n      -- number of recommendations to return

        Raises ValueError when ``metric`` is not one of the supported names.
        """
        self.k = k
        self.n = n
        self.username2id = {}
        self.userid2name = {}
        self.productid2name = {}
        # Remember which metric was requested and bind the matching method.
        self.metric = metric
        if metric == 'pearson':
            self.fn = self.pearson
        elif metric == 'cos':
            self.fn = self.cos_similarity
        else:
            # Previously self.fn was silently left undefined, which only
            # surfaced later as an unrelated AttributeError.
            raise ValueError(
                "unknown metric %r; expected 'pearson' or 'cos'" % (metric,))
        # Always define self.data so later lookups never hit a missing
        # attribute; anything that is not a dict is ignored.
        self.data = data if isinstance(data, dict) else {}



In [3]:
class recommender(recommender):
    def pearson(self, rating1, rating2):
        """Return the Pearson correlation of two rating dicts.

        rating1, rating2 -- dicts mapping an item key to a numeric rating.

        Only keys present in both dicts contribute. The value is computed
        with the single-pass formula from the running sums. Returns 0 when
        the dicts share no key or when the denominator is zero (one side has
        no spread over the shared keys).
        """
        sum_xy = 0
        sum_x = 0
        sum_y = 0
        sum_x2 = 0
        sum_y2 = 0
        n = 0
        for key in rating1:
            if key in rating2:
                n += 1
                x = rating1[key]
                y = rating2[key]
                sum_xy += x * y
                sum_x += x
                sum_y += y
                sum_x2 += x ** 2
                sum_y2 += y ** 2
        if n == 0:
            return 0
        # Mathematically these terms are >= 0, but with float ratings the
        # subtraction can round to a tiny negative number, which would make
        # sqrt() raise ValueError. Clamp at zero so that case falls through
        # to the zero-denominator branch instead.
        spread_x = max(sum_x2 - sum_x ** 2 / n, 0)
        spread_y = max(sum_y2 - sum_y ** 2 / n, 0)
        denominator = sqrt(spread_x) * sqrt(spread_y)
        if denominator == 0:
            return 0
        return (sum_xy - (sum_x * sum_y) / n) / denominator
    

In [4]:
class recommender(recommender):
    def cos_similarity(self, rating1, rating2):
        """Return the cosine similarity of two rating dicts.

        rating1, rating2 -- dicts mapping an item key to a numeric rating.

        Both the dot product and the two vector lengths are taken over the
        keys the dicts have in common. Returns 0 when the product of the
        lengths is zero (which includes the case of no shared key).
        """
        dot = 0
        squares_1 = 0
        squares_2 = 0
        for key, x in rating1.items():
            if key not in rating2:
                continue
            y = rating2[key]
            squares_1 += x * x
            squares_2 += y * y
            dot += x * y

        length_product = sqrt(squares_1) * sqrt(squares_2)
        if length_product == 0:
            return 0
        return dot / length_product
        
#     def cos_distance(self, rating1, rating2):
#         return 1 - self.cos_similarity(rating1, rating2)

In [5]:
class recommender(recommender):
    def convertProductID2name(self, id):
        """Return the stored name for product ``id``, or ``id`` itself if unknown."""
        return self.productid2name.get(id, id)

In [6]:
class recommender(recommender):
    def loadBookDB(self, path=''):
        """Load the BX data set; ``path`` is prefixed to each file name.

        Reads three ';'-separated files whose fields may be wrapped in
        double quotes:

        BX-Book-Ratings.csv -- user; book; rating  -> self.data[user][book]
        BX-Books.csv        -- isbn; title; author -> self.productid2name
        BX-Users.csv        -- userid; location; age
                               -> self.userid2name and self.username2id

        self.data is reset before loading. Prints the total number of lines
        read across the three files. Each file is closed even if a line
        fails to parse.
        """
        self.data = {}
        i = 0  # running count of lines read from all three files

        # Ratings: one nested dict of {book: rating} per user.
        with codecs.open(path + "BX-Book-Ratings.csv", 'r', 'utf8') as f:
            for line in f:
                i += 1
                fields = line.split(';')
                user = fields[0].strip('"')
                book = fields[1].strip('"')
                rating = int(fields[2].strip().strip('"'))
                self.data.setdefault(user, {})[book] = rating

        # Books: map each ISBN to "<title> by <author>".
        with codecs.open(path + "BX-Books.csv", 'r', 'utf8') as f:
            for line in f:
                i += 1
                fields = line.split(';')
                isbn = fields[0].strip('"')
                title = fields[1].strip('"')
                author = fields[2].strip().strip('"')
                self.productid2name[isbn] = title + ' by ' + author

        # Users: map id -> description and location -> id.
        with codecs.open(path + "BX-Users.csv", 'r', 'utf8') as f:
            for line in f:
                i += 1
                fields = line.split(';')
                userid = fields[0].strip('"')
                location = fields[1].strip('"')
                # The age is the third field, so it exists as soon as there
                # are more than two fields. The previous "> 3" guard skipped
                # it on every ordinary three-field line.
                if len(fields) > 2:
                    age = fields[2].strip().strip('"')
                else:
                    age = 'NULL'
                if age != 'NULL':
                    value = location + '  (age: ' + age + ')'
                else:
                    value = location
                self.userid2name[userid] = value
                self.username2id[location] = userid
        print(i)

In [7]:
class recommender(recommender):

    def computeNearestNeighbor(self, username):
        """Rank every other user by the configured metric against ``username``.

        Returns a list of ``(user, score)`` tuples, where ``score`` is
        ``self.fn`` applied to the two users' ratings, ordered from the
        highest score to the lowest.
        """
        scored = [(other, self.fn(self.data[username], self.data[other]))
                  for other in self.data
                  if other != username]
        # Highest score first.
        return sorted(scored, key=lambda pair: pair[1], reverse=True)

In [17]:
class recommender(recommender):
    def recommend(self, user):
        """Return up to ``self.n`` recommendations for ``user``.

        Takes the (at most) ``self.k`` top-ranked users from
        ``computeNearestNeighbor``, weights each by its share of their total
        score, and sums the weighted ratings of every item ``user`` has not
        rated. The result is a list of ``(name, score)`` tuples sorted by
        descending score, with item ids converted through
        ``convertProductID2name``.

        Returns an empty list when the selected neighbours' scores sum to
        zero, which includes the case of no neighbours at all.
        """
        recommendations = {}
        # Users ordered by nearness to ``user``.
        nearest = self.computeNearestNeighbor(user)
        userRatings = self.data[user]

        # Never look at more neighbours than exist: indexing nearest[i] for
        # i in range(self.k) used to raise IndexError when fewer than k other
        # users were loaded.
        neighbours = nearest[:max(self.k, 0)]

        totalDistance = 0.0
        for _, score in neighbours:
            totalDistance += score

        if totalDistance == 0.0:
            return []

        # Accumulate each neighbour's ratings, scaled by its slice of the pie.
        for name, score in neighbours:
            weight = score / totalDistance
            neighborRatings = self.data[name]
            for item in neighborRatings:
                if item in userRatings:
                    continue
                contribution = neighborRatings[item] * weight
                if item not in recommendations:
                    recommendations[item] = contribution
                else:
                    recommendations[item] = recommendations[item] + contribution

        # Turn the dict into (display name, score) pairs.
        recommendations = [(self.convertProductID2name(item), score)
                           for (item, score) in recommendations.items()]
        # Best score first, then keep only the first n.
        recommendations.sort(key=lambda pair: pair[1], reverse=True)
        return recommendations[:self.n]

In [50]:
# Cosine-similarity recommender that blends the 3 top-ranked users and
# returns 3 suggestions; the ratings are then loaded from ./BX-Dump/.
instance = recommender(data={}, k=3, metric='cos', n=3)
instance.loadBookDB(path='./BX-Dump/')

1700018


In [51]:
# Recommendations for user '197502'; the bare expression is displayed as the cell output.
instance.recommend('197502')

[('0752524593', 3.333333333333333),
 ('Unadulterated Cat by Terry Pratchett', 3.333333333333333),
 ("The World of the Dark Crystal: The Collector's Edition by Brian Froud",
  3.333333333333333)]