In [1]:
import pandas as pd
import math
from operator import itemgetter
import random
import numpy as np
from scipy.sparse import csc_matrix, linalg, eye

In [2]:
data = pd.read_csv('ml-1m/ratings.dat',sep='::')
data.columns=['user_id','movie_id','rating','timestamp']
data.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,661,3,978302109
1,1,914,3,978301968
2,1,3408,4,978300275
3,1,2355,5,978824291
4,1,1197,3,978302268


In [3]:
def SplitData(data,M,k,seed=2019):
    test=[]
    train = []
    random.seed(seed) 
    def func_appl(x,M,k):
        if random.randint(0,M-1) == k:
            test.append([x['user_id'],x['movie_id']])
        else:
            train.append([x['user_id'],x['movie_id']])
    data.apply(lambda x:func_appl(x,M,k),axis=1)
    
    def convert_dict(data):
        data_dict = dict()
        for user,item in data:
            if user not in data_dict:
                data_dict[user] = set()
            data_dict[user].add(item)
        data_dict={k:list(data_dict[k])for k in data_dict}
        return data_dict
    return convert_dict(train),convert_dict(test)

## 评价指标
1. 召回率
2. 精确度
3. 覆盖率
4. 流行度

In [4]:
class Eval():
    def __init__(self, train, test, GetRecommendation,N):
        self.train = train
        self.test = test
        self.GetRecommendation = GetRecommendation
        self.N =N

    def Recall(self):
        hit = 0
        all = 0
        for user,items in self.test.items():
            tu = items
            rank = self.GetRecommendation(user, self.N)
            for item,pui in rank:
                if item in tu:
                    hit += 1
            all += len(tu)
        return round(hit / (all * 1.0), 2)

    def Precision(self):
        hit = 0
        all = 0
        for user in self.test.keys():
            tu = set(self.test[user])
            rank = self.GetRecommendation(user, self.N)
            for item,pui in rank:
                if item in tu:
                    hit += 1
            all += self.N
        return round(hit / (all * 1.0), 2)
    
    def Coverage(self):
        recommend_items = set()
        all_items = set()
        for user in self.test.keys():
            for item in self.train[user]:
                all_items.add(item)
            rank = self.GetRecommendation(user, self.N)
            for item,pui in rank:
                recommend_items.add(item)
        return round(len(recommend_items) / (len(all_items) * 1.0),2)

    def Popularity(self):
        item_pop = dict()
        for user, items in self.train.items():
            for item in items:
                if item not in item_pop:
                    item_pop[item] = 0
                else:
                    item_pop[item] += 1
        ret = 0
        n = 0
        for user in self.test.keys():
            rank =self. GetRecommendation(user, self.N)
            for item,pui in rank:
                ret += math.log(1 + item_pop[item])
                n += 1
        ret /= n * 1.0
        return round(ret,2)

    def eval(self):
        metric = {'Precision': self.Precision(),
                  'Recall': self.Recall(),
                  'Coverage': self.Coverage(),
                  'Popularity': self.Popularity()}
        print('Metric:', metric)
        return metric

In [9]:
def PersonalRank(train,alpha,N):
    
#     user_list = {u:i for i,u in enumerate(train.keys())}
#     item_list = {u:i+len(user_list)for i,u in enumerate(id2item)}
    
    
    item_users = dict()
    for user,items in train.items():
        for item in items:
            if item not in item_users:
                item_users[item]=list()
            item_users[item].append(user)
            
    G=dict()
    for user,items in train.items():
        for item in items:
            if user not in G:
                G[user]=dict()
            G[user][item]=1

    for item,users in item_users.items():
        for user in users:
            if item not in G:
                G[item]=dict()
            G[item][user]=1
            
    
    def GetRecommendation(user,n):
        items=[]
        for u,item in train.items():
            if user!=u:
                items.extend(item)           
        allitems=list(set(items))
        rank =dict()
        rank = {x:0 for x in G.keys()}
        rank[user]=1
        
        for step in range(N):
            tmp = {x:0 for x in G.keys()}
            for i,ri in G.items():
                for j, wij in ri.items():
                    if j not in tmp:
                        tmp[j]=0
                    tmp[j]+=alpha*rank[i]/(1.0*len(ri))
            tmp[user]+=1-alpha
            rank=tmp
        rank = sorted(rank.items(),key=itemgetter(1),reverse=True)
        res=[]
        i=0
        for i,score in rank:
            if i in allitems:
                res.append((i,score))
            i+=1
            if i>n:
                break
        return res
    return GetRecommendation
            
            

            

In [None]:
M=8
alpha=0.8
N=10
metrics = {'Precision': 0, 'Recall': 0, 
           'Coverage': 0, 'Popularity': 0}
for i in range(M):
    train,test = SplitData(data,M,i)
    GetRecommendation=PersonalRank(train,alpha,N)
    e = Eval(train,test,GetRecommendation,10)
    metric=e.eval()
    metrics={k: metrics[k]+metric[k] for k in metrics}
metrics={k: metrics[k]/M for k in metrics}
metrics


Metric: {'Precision': 0.0, 'Recall': 0.0, 'Coverage': 0.99, 'Popularity': 4.41}


## 二分图转换为矩阵 

In [None]:
def PersonalRankII(train, alpha, N):
    '''
    :params: train, 训练数据
    :params: alpha, 继续随机游走的概率
    :params: N, 推荐TopN物品的个数
    :return: GetRecommendation, 获取推荐结果的接口
    ''' 
    
    # 构建索引
    items = []
    for user in train:
        items.extend(train[user])
    id2item = list(set(items))
    users = {u: i for i, u in enumerate(train.keys())}
    items = {u: i+len(users) for i, u in enumerate(id2item)}
    
    # 计算转移矩阵（注意！！！要按照出度进行归一化）
    item_user = {}
    for user in train:
        for item in train[user]:
            if item not in item_user:
                item_user[item] = []
            item_user[item].append(user)
            
    data, row, col = [], [], []
    for u in train:
        for v in train[u]:
            data.append(1 / len(train[u]))
            row.append(users[u])
            col.append(items[v])
    for u in item_user:
        for v in item_user[u]:
            data.append(1 / len(item_user[u]))
            row.append(items[u])
            col.append(users[v])
            
    M = csc_matrix((data, (row, col)), shape=(len(data), len(data)))
    
    # 获取接口函数
    def GetRecommendation(user):
        seen_items = set(train[user])
        # 解矩阵方程 r = (1-a)r0 + a(M.T)r
        r0 = [0] * len(data)
        r0[users[user]] = 1
        r0 = csc_matrix(r0)
        r = (1 - alpha) * linalg.inv(eye(len(data)) - alpha * M.T) * r0
        r = r.T.toarray()[0][len(users):]
        idx = np.argsort(-r)[:N]
        recs = [(id2item[ii], r[ii]) for ii in idx]
        return recs
    
    return GetRecommendation