In [1]:
import pandas as pd
import math
import random
from operator import itemgetter
import numpy as np

## 加载数据集 

In [2]:
data = pd.read_csv('ml-1m/ratings.dat',sep='::')
data.columns=['user_id','movie_id','rating','timestamp']
data.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,661,3,978302109
1,1,914,3,978301968
2,1,3408,4,978300275
3,1,2355,5,978824291
4,1,1197,3,978302268


In [3]:
def SplitData(data,M,k,seed=2019):
    test=[]
    train = []
    random.seed(seed) 
    def func_appl(x,M,k):
        if random.randint(0,M-1) == k:
            test.append([x['user_id'],x['movie_id']])
        else:
            train.append([x['user_id'],x['movie_id']])
    data.apply(lambda x:func_appl(x,M,k),axis=1)
    
    def convert_dict(data):
        data_dict = dict()
        for user,item in data:
            if user not in data_dict:
                data_dict[user] = set()
            data_dict[user].add(item)
        data_dict={k:list(data_dict[k])for k in data_dict}
        return data_dict
    return convert_dict(train),convert_dict(test)

## 评价指标
1. 召回率
2. 精确度
3. 覆盖率
4. 流行度

In [4]:
class Eval():
    def __init__(self, train, test, GetRecommendation,N):
        self.train = train
        self.test = test
        self.GetRecommendation = GetRecommendation
        self.N =N

    def Recall(self):
        hit = 0
        all = 0
        for user,items in self.test.items():
            tu = items
            rank = self.GetRecommendation(user, self.N)
            for item,pui in rank:
                if item in tu:
                    hit += 1
            all += len(tu)
        return round(hit / (all * 1.0), 2)

    def Precision(self):
        hit = 0
        all = 0
        for user in self.test.keys():
            tu = set(self.test[user])
            rank = self.GetRecommendation(user, self.N)
            for item,pui in rank:
                if item in tu:
                    hit += 1
            all += self.N
        return round(hit / (all * 1.0), 2)
    
    def Coverage(self):
        recommend_items = set()
        all_items = set()
        for user in self.test.keys():
            for item in self.train[user]:
                all_items.add(item)
            rank = self.GetRecommendation(user, self.N)
            for item,pui in rank:
                recommend_items.add(item)
        return round(len(recommend_items) / (len(all_items) * 1.0),2)

    def Popularity(self):
        item_pop = dict()
        for user, items in self.train.items():
            for item in items:
                if item not in item_pop:
                    item_pop[item] = 0
                else:
                    item_pop[item] += 1
        ret = 0
        n = 0
        for user in self.test.keys():
            rank =self. GetRecommendation(user, self.N)
            for item,pui in rank:
                ret += math.log(1 + item_pop[item])
                n += 1
        ret /= n * 1.0
        return round(ret,2)

    def eval(self):
        metric = {'Precision': self.Precision(),
                  'Recall': self.Recall(),
                  'Coverage': self.Coverage(),
                  'Popularity': self.Popularity()}
        print('Metric:', metric)
        return metric

In [5]:
def LFM(train,F,N,alpha,lr):
    all_items=dict()
    for user,items in train.items():
        for item in items:
            if item not in all_items:
                all_items[item]=0
            all_items[item]+=1
    all_items = list(all_items.items())
    item_list = [x[0] for x in all_items]
    pop_list = [x[1] for x in all_items]
    
    def RandomSelectNegativeSample(train):
        ret = dict()
        for user,items in train.items():
            if user not in ret:
                ret[user]=dict()
            for i in items:
                ret[user][i]=1
            n=0
            selected_items = np.random.choice(item_list,len(items)*3,pop_list)
            for i in selected_items:
                if i not in items:
                    ret[user][i]=0
                n+=1
                if n>len(items):
                    break
        return ret
    P=dict()
    Q=dict()
#     P = np.random.random((len(train.keys()),F)
#     Q = np.random.random((F,len(item_list)))                     
    for user in train:
        P[user] = np.random.random(F)
    for item in item_list:
        Q[item] = np.random.random(F)
    for step in range(0,N):
        samples = RandomSelectNegativeSample(train)
        for user,items in samples.items():
            for item, rui in items.items():
                eui = rui-(P[user]*Q[item]).sum()
                P[user]+=alpha*(eui*Q[item]-lr*P[user])
                Q[item]+=alpha*(eui*P[user]-lr*Q[item])
        alpha *= 0.9
    
    def GetRecommendation(user,n):
        rank=dict()
        ru=train[user]
        for u,puf in P.items():
            for i,qif in Q.items():
                if i not in ru:
                    if i not in rank:
                        rank[i]=0
                    rank[i]+=(puf*qif).sum()
        return sorted(rank.items(),key=itemgetter(1),reverse=True)[0:n]
    return GetRecommendation
            
        

In [None]:
F=100
N=100
alpha=0.02
lr =0.01
M=8
metrics = {'Precision': 0, 'Recall': 0, 
           'Coverage': 0, 'Popularity': 0}
for i in range(M):
    train,test = SplitData(data,M,i)
    GetRecommendation = LFM(train,F,N,alpha,lr)
    e = Eval(train,test,GetRecommendation,10)
    metric=e.eval()
    metrics={k: metrics[k]+metric[k] for k in metrics}
metrics={k: metrics[k]/M for k in metrics}
metrics

In [15]:
e = Eval(train,test,GetRecommendation,10)
e.eval()

Metric: {'Precision': 0.1, 'Recall': 0.05, 'Coverage': 0.02, 'Popularity': 5.39}


{'Precision': 0.1, 'Recall': 0.05, 'Coverage': 0.02, 'Popularity': 5.39}