In [1]:
# 模型保存和加载
import os
import pickle

class ModelManager():
    """Pickle-based persistence helper for trained model artifacts.

    Both methods are static and are meant to be called on the class itself,
    e.g. ``ModelManager.save(path, obj)`` / ``ModelManager.load(path)``.
    """

    @staticmethod
    def save(filename, *data):
        """Pickle every object in ``data`` into ``filename``, one after another.

        :param filename: destination path; its parent directory is created
            when it does not exist yet
        :param data: objects to serialise, written in the order given
        """
        parent = os.path.dirname(filename)
        if parent:
            # A missing target directory would otherwise make open() fail
            # only after the (possibly expensive) model was already computed.
            os.makedirs(parent, exist_ok=True)
        with open(filename, "wb") as f:
            for d in data:
                pickle.dump(d, f)

    @staticmethod
    def load(filename, num=1):
        """Read ``num`` consecutively pickled objects from ``filename``.

        :param filename: path of a file previously written by ``save``
        :param num: how many objects to read back
        :return: list with the ``num`` loaded objects, in the order they
            were saved
        :raises EOFError: if the file holds fewer than ``num`` objects
        """
        # NOTE(security): unpickling can execute arbitrary code; only load
        # files that this program (or a trusted source) produced.
        with open(filename, "rb") as f:
            return [pickle.load(f) for _ in range(num)]

In [3]:
# 数据加载
import random
import pandas as pd

def LoadMovieLensData(filepath, train_rate):
    """Load a MovieLens ratings file and split it into train and test tables.

    Only the UserID and MovieID columns are kept (implicit feedback). Every
    row is sent to the training set when a freshly drawn ``random.random()``
    is below ``train_rate`` and to the test set otherwise. The ``random``
    module is re-seeded with a fixed value first, so the split is the same on
    every call.

    :param filepath: path to a header-less, ``::``-separated file whose
        columns are UserID, MovieID, Rating, TimeStamp
    :param train_rate: threshold compared against ``random.random()``; the
        share of rows that ends up in the training set
    :return: tuple ``(train, test)``; each element is the result of
        ``PreProcessData`` applied to the ``[user, item]`` pairs of that split
    """
    ratings = pd.read_table(
        filepath,
        sep="::",
        header=None,
        names=["UserID", "MovieID", "Rating", "TimeStamp"],
        engine="python",
    )
    # For simplicity only the first two columns are used.
    ratings = ratings[["UserID", "MovieID"]]

    train = []
    test = []
    random.seed(3)
    # Walk the two columns in row order instead of DataFrame.iterrows():
    # iterrows builds a Series per row, which is very slow on large files.
    # One random draw per row, in the same order, keeps the split unchanged.
    users = ratings["UserID"].tolist()
    items = ratings["MovieID"].tolist()
    for raw_user, raw_item in zip(users, items):
        user = int(raw_user)
        item = int(raw_item)
        if random.random() < train_rate:
            train.append([user, item])
        else:
            test.append([user, item])
    return PreProcessData(train), PreProcessData(test)

def PreProcessData(originData):
    """
    Group (user, item) pairs into a User-Item table shaped like:
        {"User1": {MovieID1, MovieID2, MovieID3, ...},
         "User2": {MovieID12, MovieID5, MovieID8, ...},
         ...
        }

    :param originData: iterable of two-element (user, item) records
    :return: dict mapping each user to the set of its items
    """
    user_items = {}
    for record in originData:
        uid, iid = record
        if uid not in user_items:
            user_items[uid] = set()
        user_items[uid].add(iid)
    return user_items

In [5]:
import math
from collections import defaultdict
from operator import itemgetter

class UserCF_Model(object):
    """User-based collaborative filtering on implicit feedback.

    ``trainData`` maps every user to the set of items that user gave positive
    feedback on. ``similarity`` selects how the co-occurrence between two
    users is weighted: ``"cosine"`` counts every shared item as 1, ``"iif"``
    weights a shared item by ``1 / log(1 + number of users of that item)``.
    """

    _SUPPORTED_SIMILARITIES = ("cosine", "iif")

    def __init__(self, trainData, similarity="cosine"):
        self._trainData = trainData
        self._similarity = similarity
        self._userSimMatrix = dict()  # user similarity matrix: u -> {v: sim}

    def similarity(self):
        """(Re)build the user similarity matrix from the training data.

        The matrix is built from scratch on every call and then swapped in,
        so calling this method more than once yields the same result.

        :raises ValueError: if the configured similarity is not supported
        """
        if self._similarity not in self._SUPPORTED_SIMILARITIES:
            # Previously an unknown name silently produced an empty model.
            raise ValueError(
                "unsupported similarity %r, expected one of %r"
                % (self._similarity, self._SUPPORTED_SIMILARITIES)
            )

        # Build the item -> users inverted table.
        item_user = dict()
        for user, items in self._trainData.items():
            for item in items:
                item_user.setdefault(item, set()).add(user)

        # Build the co-occurrence matrix C, where C[u][v] is the (weighted)
        # number of items that both u and v like.
        use_iif = self._similarity == "iif"
        sim_matrix = dict()
        for users in item_user.values():
            if len(users) < 2:
                continue  # an item with a single user creates no user pair
            # The weight only depends on the item, so compute it once.
            weight = 1. / math.log(1 + len(users)) if use_iif else 1
            for u in users:
                row = sim_matrix.get(u)
                if row is None:
                    row = sim_matrix[u] = defaultdict(int)
                for v in users:
                    if u != v:
                        row[v] += weight

        # Normalise: sim(u, v) = C[u][v] / sqrt(|N(u)| * |N(v)|).
        # Only values of existing keys are replaced, so iterating is safe.
        for u, related_user in sim_matrix.items():
            nu = len(self._trainData[u])
            for v, cuv in related_user.items():
                nv = len(self._trainData[v])
                related_user[v] = cuv / math.sqrt(nu * nv)

        self._userSimMatrix = sim_matrix

    def recommend(self, user, N, K):
        """Recommend up to ``N`` items for ``user``.

        The interest of user u in item i is
            p(u, i) = sum over v of W_uv * R_vi
        where W_uv is the similarity between u and v and R_vi is the interest
        of v in i. With single-behaviour implicit feedback R_vi = 1, so the
        score of an item is the summed similarity of those of the K most
        similar users who like the item; items the user already gave
        feedback on are skipped.

        :param user: the user to recommend for
        :param N: number of items to recommend
        :param K: number of most similar users to consider
        :return: dict ``{item: score}`` with at most ``N`` entries, ordered by
            descending score; empty when the user has no known neighbours
        """
        # Items the user already has positive feedback on (none if unseen).
        related_items = self._trainData.get(user, set())
        # Users without any neighbour have no row in the similarity matrix.
        neighbours = self._userSimMatrix.get(user, {})

        # Sort the other users by similarity, descending, and keep the top K.
        top_k = sorted(neighbours.items(), key=itemgetter(1), reverse=True)[:K]

        recommends = dict()
        for v, sim in top_k:
            # .get guards against a similarity matrix (e.g. one loaded from
            # disk) that mentions users absent from the training data.
            for item in self._trainData.get(v, ()):
                if item in related_items:
                    continue
                recommends[item] = recommends.get(item, 0.) + sim

        # Order candidates by score, descending, and return the first N.
        ranked = sorted(recommends.items(), key=itemgetter(1), reverse=True)
        return dict(ranked[:N])

    def train(self):
        """Load the cached similarity matrix, or compute and cache it.

        NOTE(review): the cache path is fixed and the loaded matrix is not
        checked against the current training data or similarity mode.
        """
        try:
            print("start load user similarity matrix")
            self._userSimMatrix = ModelManager.load("./TrainedModels/usercf.pkl")[0]
        except Exception as e:
            # Exception (not BaseException) so that KeyboardInterrupt and
            # SystemExit still propagate instead of triggering a retrain.
            print("Exception occurs: " + str(e))
            print("load user similarity matrix failed, start train...")
            self.similarity()
            # save user similarity matrix
            ModelManager.save("./TrainedModels/usercf.pkl", self._userSimMatrix)

In [8]:

if __name__ == "__main__":
    # ------------------------------------------------------------------
    # UserCF: user-based collaborative filtering demo
    # ------------------------------------------------------------------
    train, test = LoadMovieLensData("./dataset/ml-1m/ratings.dat", 0.8)
    print("train data size: %d, test data size: %d" % (len(train), len(test)))
    UserCF = UserCF_Model(train)
    UserCF.train()

    # Spot check for a single user, e.g.:
    #   print(UserCF.recommend(list(test.keys())[0], 5, 80))

    # For the first five test users: take the 80 most similar users and
    # recommend 5 of the items those users like.
    print("start recommend ...")
    cnt = 0
    for cnt, user in enumerate(test, start=1):
        print(UserCF.recommend(user, 5, 80))
        if cnt == 5:
            break

    # ---------------------------- Toy example ----------------------------
    # train = dict({'A':['a','b','d'], 'B':['a','c','d'], 'C':['b','e'], 'D':['c','d','e']})
    # test = dict({'C':['a']})
    # UserCF = usercf.UserCF(train)
    # UserCF.train()
    # print(UserCF.recommend('C', 5, 80))

train data size: 6040, test data size: 6030
start load user similarity matrix
Exception occurs: [Errno 2] No such file or directory: './TrainedModels/usercf.pkl'
load user similarity matrix failed, start train...
start recommend ...
{1: 11.555051809227317, 1210: 11.12726189948913, 595: 10.781958439643553, 1196: 10.612460693387929, 588: 9.559359361236249}
{110: 14.734830149382915, 733: 14.7136640075951, 1580: 14.59826271990206, 2916: 14.51919170679107, 1608: 14.403510110831407}
{1210: 11.614724814123607, 2628: 10.931244856050824, 480: 9.520842419523715, 2916: 9.378815313780072, 592: 8.717850533026743}
{589: 14.97874451581327, 2571: 14.753687137218197, 457: 13.043504366757148, 110: 12.938731080757192, 1387: 12.34911973650181}
{296: 15.52917768919492, 589: 14.08153064484636, 2396: 13.889729349962476, 1673: 13.042314831935716, 527: 12.157151618911644}
