In [5]:
# -*- coding:utf-8 -*-

"""
代码2-1  实例1：搭建你的第一个推荐系统-电影推荐系统
从中随机选择100个与用户进行计算
"""
import os
import json
import random
import math
from tqdm import tqdm 

In [6]:
class FirstRec:
    """
        初始化函数
            filePath: 原始文件路径
            seed：产生随机数的种子
            k：选取的近邻用户个数
            nitems：为每个用户推荐的电影数
    """
    def __init__(self,file_path,seed,k,n_items):
        self.file_path = file_path
        self.users_100 = self.__select_100_users()
        self.seed = seed
        self.k = k
        self.n_items = n_items
        self.train,self.test = self._load_and_split_data()    # 切分训练集和测试集 
        # 单下划线开头的表示的是protected类型的变量，即只能允许其本身与子类进行访问
        # 双下划线的表示的是私有类型的变量，只能是允许这个类本身进行访问了，连子类也不可以

    # 获取所有用户并随机选取100个
    def __select_100_users(self):
        print("随机选取100个用户！")
        if os.path.exists("data/train.json") and os.path.exists("data/test.json"):
            return list()  # 检查路径，否则返回空
        else:
            users = set()
            # 获取所有用户
            for file in tqdm(os.listdir(self.file_path)):
                one_path = "{}/{}".format(self.file_path, file)
#                 print("{}".format(one_path))
                # 数据形式为：文件第一行为电影ID, 后面的数据分为三列，分别为用户ID, 评分值, 时间
                with open(one_path, "r") as fp:
                    for line in fp.readlines():
                        if line.strip().endswith(":"):  # 第一行忽略
                            continue
                        userID, _ , _ = line.split(",")
                        users.add(userID)   # 去除userID
            # 随机选取100个
            users_100 = random.sample(list(users),100)  # 随机选取100个
            print(users_100)
            return users_100

    # 加载数据，并拆分为训练集和测试集
    def _load_and_split_data(self):
        train = dict()
        test = dict()
        if os.path.exists("data/train.json") and os.path.exists("data/test.json"):
            print("从文件中加载训练集和测试集")
            train = json.load(open("data/train.json"))
            test = json.load(open("data/test.json"))
            print("从文件中加载数据完成")
        else:
            # 设置产生随机数的种子，保证每次实验产生的随机结果一致
            random.seed(self.seed)
            for file in tqdm(os.listdir(self.file_path)):
                one_path = "{}/{}".format(self.file_path, file)   # 文件具体路径
#                 print("{}".format(one_path))
                with open(one_path,"r") as fp:
                    movieID = fp.readline().split(":")[0]
                    for line in fp.readlines():
                        if line.endswith(":"):
                            continue
                        userID, rate, _ = line.split(",")
                        # 判断用户是否在所选择的100个用户中
                        if userID in self.users_100:
                            if random.randint(1,20) == 1:  # 49： 1 切分训练集和测试集 
                                test.setdefault(userID, {})[movieID] = int(rate)
                            else:
                                train.setdefault(userID, {})[movieID] = int(rate)
            print("加载数据到 data/train.json 和 data/test.json")
            json.dump(train,open("data/train.json","w"))
            json.dump(test,open("data/test.json","w"))
            print("加载数据完成")
        return train,test

    """
        计算皮尔逊相关系数
            rating1：用户1的评分记录，形式如{"movieid1":rate1,"movieid2":rate2,...}
            rating2：用户1的评分记录，形式如{"movieid1":rate1,"movieid2":rate2,...}
    """
    def pearson(self,rating1,rating2):  # 近似计算皮尔森相关系数
        sum_xy = 0
        sum_x = 0
        sum_y = 0
        sum_x2 = 0
        sum_y2 = 0
        num = 0
        for key in rating1.keys():
            if key in rating2.keys():
                num += 1
                x = rating1[key]
                y = rating2[key]
                sum_xy += x * y
                sum_x += x
                sum_y += y
                sum_x2 += math.pow(x,2)
                sum_y2 += math.pow(y,2)
        if num == 0:
            return  0
        # 皮尔逊相关系数分母
        denominator = math.sqrt( sum_x2 - math.pow(sum_x,2) / num) * math.sqrt( sum_y2 - math.pow(sum_y,2) / num )
        if denominator == 0:
            return  0
        else:
            return ( sum_xy - ( sum_x * sum_y ) / num ) / denominator

    """
        用户userID进行电影推荐
            userID：用户ID
    """
    def recommend(self,userID):
        neighborUser = dict()
        for user in self.train.keys():
            if userID != user:
                distance = self.pearson(self.train[userID],self.train[user])
                neighborUser[user]=distance   # 得到该userID与训练集中每个用户的距离 
        # 字典排序
        newNU = sorted(neighborUser.items(),key = lambda k:k[1] ,reverse=True)  # 降序排序

        movies = dict()
        for (sim_user,sim) in newNU[:self.k]:  # 取前k个 
            for movieID in self.train[sim_user].keys():
                movies.setdefault(movieID,0)
                movies[movieID] += sim * self.train[sim_user][movieID]  
        newMovies = sorted(movies.items(), key = lambda  k:k[1], reverse=True)
        return newMovies

    """
        推荐系统效果评估函数
            num: 随机抽取 num 个用户计算准确率
    """
    def evaluate(self,num=30):
        print("开始计算准确率")
        precisions = list()
        random.seed(10)
        for userID in random.sample(self.test.keys(),num):
            hit = 0
            result = self.recommend(userID)[:self.n_items]
            for (item, rate) in result:
                if item in self.test[userID]:
                    hit += 1
            precisions.append(hit/self.n_items)  # 元素个数为num个
        return  sum(precisions) / precisions.__len__()

In [7]:
# main函数，程序的入口
if __name__ == "__main__":
    file_path = "../data/netflix/training_set"
    seed = 30
    k = 15
    n_items =20
    f_rec = FirstRec(file_path, seed, k, n_items)
    

  0%|                                                                                        | 0/17770 [00:00<?, ?it/s]

随机选取100个用户！


100%|███████████████████████████████████████████████████████████████████████████| 17770/17770 [02:30<00:00, 117.77it/s]
  0%|                                                                                        | 0/17770 [00:00<?, ?it/s]

['437082', '249762', '1302196', '2506851', '1604982', '342326', '342296', '162804', '1985084', '2350329', '2625415', '2367733', '250865', '1049731', '2010413', '835731', '1862165', '1193529', '391666', '668364', '1467578', '1468142', '757762', '816573', '143634', '614126', '763064', '912010', '1481817', '2569850', '398293', '2135939', '2095334', '522324', '1579863', '1387224', '2170076', '828759', '2118465', '1623359', '1843812', '583557', '40149', '1347243', '2624988', '1096183', '603781', '41561', '1529631', '2292139', '231965', '1320133', '1342847', '1409611', '1731131', '3096', '1837949', '15893', '213782', '949016', '1430551', '1532491', '2028847', '1977019', '2438114', '2261600', '1368667', '91476', '1426473', '1029173', '181620', '177338', '2189241', '1395436', '1925055', '772917', '250613', '1651443', '1688232', '965678', '1466125', '420036', '1801898', '1177689', '2262869', '2563198', '1078516', '813700', '143426', '342659', '2074048', '770031', '912488', '2082527', '690064', 

100%|████████████████████████████████████████████████████████████████████████████| 17770/17770 [05:49<00:00, 50.81it/s]

加载数据到 data/train.json 和 data/test.json
加载数据完成





In [8]:
 # 计算用户 1261880 和 2183135 的皮尔逊相关系数
r = f_rec.pearson(f_rec.train["1261880"],f_rec.train["2183135"])
print("1261880 和 2183135的皮尔逊相关系数为：{}".format(r))
# 为用户 1261880 进行电影推荐
result = f_rec.recommend("1261880")
print("为用户ID为1261880的推荐结果：{}".format(result))
print("算法的推荐准确率为: {}".format(f_rec.evaluate()))

1261880 和 2183135的皮尔逊相关系数为：0
为用户ID为1261880的推荐结果：[('13050', 23.18643530474817), ('12232', 17.4279894234192), ('2152', 16.234161538717597), ('6287', 15.745829364577617), ('6206', 15.704151445182093), ('1145', 14.877998166276516), ('11149', 14.567124304293266), ('6037', 14.289029855357564), ('17324', 13.919647479510926), ('483', 13.847094776993192), ('11607', 13.765113445777637), ('5496', 12.904206385983125), ('11521', 12.68501733522172), ('12161', 12.484547469859162), ('1542', 12.193541031475835), ('4745', 12.121747723966495), ('6030', 12.076890205019215), ('7617', 11.853775553681269), ('12582', 11.822907165191587), ('14454', 11.746147351069524), ('10583', 11.739118326341123), ('3106', 11.699185960700294), ('16139', 11.618724494048914), ('1962', 11.467653673748869), ('2452', 11.421383691597883), ('14103', 11.412090756622108), ('10947', 11.124081579118181), ('357', 11.103113578440977), ('10550', 11.022483496344979), ('2122', 10.996759575368145), ('1865', 10.901293609101437), ('6386', 10.8