In [1]:
import pandas as pd
import math
from sklearn.model_selection import train_test_split
import random
from operator import itemgetter

## 加载数据


In [2]:
data = pd.read_csv('ml-1m/ratings.dat',sep='::')
data.columns=['user_id','movie_id','rating','timestamp']
data =data.head(100000)
data.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,661,3,978302109
1,1,914,3,978301968
2,1,3408,4,978300275
3,1,2355,5,978824291
4,1,1197,3,978302268


## 分割测试集和训练集
M:划分的数目，最后需要取M折的平均
k：本次是第几次划分，k~(0,M)
seed:random的种子数，对于不同的k应设置成一样的

In [3]:
def SplitData(M,k,seed=2019):
    test=[]
    train = []
    random.seed(seed) 
    def func_appl(x,M,k):
        if random.randint(0,M-1) == k:
            test.append([x['user_id'],x['movie_id']])
        else:
            train.append([x['user_id'],x['movie_id']])
    data.apply(lambda x:func_appl(x,M,k),axis=1)
    return train,test        

In [4]:
def convert_dict(data):
    data_dict = {}
    for user,item in data:
        if user not in data_dict:
            data_dict[user] = set()
        data_dict[user].add(item)
    data_dict={k:list(data_dict[k])for k in data_dict}
    return data_dict

## 测评指标
1. Recall
2. Precision
3. Coverage
4. Popularity

In [5]:
class Eval():
    def __init__(self, train, test, GetRecommendation,N):
        self.train = train
        self.test = test
        self.GetRecommendation = GetRecommendation
        self.N =N

    def Recall(self):
        hit = 0
        all = 0
        for user,items in self.test.items():
            tu = items
            rank = self.GetRecommendation(user, self.N)
            for item,pui in rank:
                if item in tu:
                    hit += 1
            all += len(tu)
        return round(hit / (all * 1.0), 2)

    def Precision(self):
        hit = 0
        all = 0
        for user in self.test.keys():
            tu = set(self.test[user])
            rank = self.GetRecommendation(user, self.N)
            for item,pui in rank:
                if item in tu:
                    hit += 1
            all += self.N
        return round(hit / (all * 1.0), 2)
    
    def Coverage(self):
        recommend_items = set()
        all_items = set()
        for user in self.test.keys():
            for item in self.train[user]:
                all_items.add(item)
            rank = self.GetRecommendation(user, self.N)
            for item,pui in rank:
                recommend_items.add(item)
        return round(len(recommend_items) / (len(all_items) * 1.0),2)

    def Popularity(self):
        item_pop = dict()
        for user, items in self.train.items():
            for item in items:
                if item not in item_pop:
                    item_pop[item] = 0
                else:
                    item_pop[item] += 1
        ret = 0
        n = 0
        for user in self.test.keys():
            rank =self. GetRecommendation(user, self.N)
            for item,pui in rank:
                ret += math.log(1 + item_pop[item])
                n += 1
        ret /= n * 1.0
        return round(ret,2)

    def eval(self):
        metric = {'Precision': self.Precision(),
                  'Recall': self.Recall(),
                  'Coverage': self.Coverage(),
                  'Popularity': self.Popularity()}
        print('Metric:', metric)
        return metric

## 基于邻域的算法

1. 基于用户的协同过滤
2. 基于物品的协同过滤

In [6]:
def UserCF(train,K):    
    item_user = dict()
    for user,items in train.items():
        for item in items:
            if item not in item_user:
                item_user[item]=set()
            item_user[item].add(user)
    N=dict()
    C=dict()
    for item,users in item_user.items():
        for u in users:
            if u not in N:
                N[u]=0
            N[u]+=1
            if u not in C:
                C[u]=dict()
            for v in users:
                if u==v:
                    continue               
                if v not in C[u]:
                    C[u][v]=0
                C[u][v]+=1
    Sim=dict()
    for u,related_user in C.items():
        if u not in Sim:
            Sim[u]=dict()
        for v,cuv in related_user.items():           
            Sim[u][v]=cuv/math.sqrt(N[u]*N[v])
    print(Sim)        
    def GetRecommendation(user,n):
        rank = dict()
        interacted_items = set(train[user])
        for v,wuv in sorted(Sim[user].items(),key=itemgetter(1),reverse=True)[0:K]:
            for i in train[v]:
                if i not in interacted_items:
                    if i not in rank:
                        rank[i]=0
                    rank[i] += wuv
        rank = sorted(rank.items(),key=itemgetter(1),reverse=True)[0:n]
        return rank
        
    return GetRecommendation  

## 用户相似度计算改进 

In [7]:
def UserCFII(train,K):
    
    item_user = dict()
    for user,items in train.items():
        for item in items:
            if item not in item_user:
                item_user[item]=set()
            item_user[item].add(user)
    N=dict()
    C=dict()
    for item,users in item_user.items():
        for u in users:
            if u not in N:
                N[u]=0
            N[u]+=1
            if u not in C:
                C[u]=dict()
            for v in users:
                if u==v:
                    continue    
                if v not in C[u]:
                    C[u][v]=0
                C[u][v]+=1/math.log(1+len(users))
    Sim=dict()
    for u,related_user in C.items():
        for v,cuv in related_user.items():
            if u not in Sim:
                Sim[u]=dict()
            Sim[u][v]=cuv/math.sqrt(N[u]*N[v])
            
    def GetRecommendation(user,n):
        rank = dict()
        interacted_items = set(train[user])
        for v,wuv in sorted(Sim[user].items(),key=itemgetter(1),reverse=True)[0:K]:
            for i in train[v]:
                if i not in interacted_items:
                    if i not in rank:
                        rank[i]=0
                    rank[i] += wuv
        rank = sorted(rank.items(),key=itemgetter(1),reverse=True)[0:n]
        return rank
        
    return GetRecommendation  

## UserCF实验 

In [8]:
M=8
K=80
N=10
metrics = {'Precision': 0, 'Recall': 0, 
           'Coverage': 0, 'Popularity': 0}
for i in range(M):
    train,test = SplitData(M,i)
    train = convert_dict(train)
    test = convert_dict(test)
#     GetRecommendation = UserCF(train,K)
    GetRecommendation2 = UserCFII(train,K)
#     eval = Eval(train,test,GetRecommendation,N)
    eval2 = Eval(train,test,GetRecommendation2,N)
    metric = eval2.eval()
    metrics = {k: metrics[k]+metric[k] for k in metrics}

metrics = {k: metrics[k] / M for k in metrics}

Metric: {'Precision': 0.21, 'Recall': 0.11, 'Coverage': 0.08, 'Popularity': 5.27}
Metric: {'Precision': 0.2, 'Recall': 0.11, 'Coverage': 0.08, 'Popularity': 5.27}
Metric: {'Precision': 0.21, 'Recall': 0.11, 'Coverage': 0.08, 'Popularity': 5.28}
Metric: {'Precision': 0.2, 'Recall': 0.11, 'Coverage': 0.07, 'Popularity': 5.28}
Metric: {'Precision': 0.21, 'Recall': 0.11, 'Coverage': 0.08, 'Popularity': 5.26}
Metric: {'Precision': 0.2, 'Recall': 0.11, 'Coverage': 0.08, 'Popularity': 5.27}
Metric: {'Precision': 0.21, 'Recall': 0.11, 'Coverage': 0.07, 'Popularity': 5.27}
Metric: {'Precision': 0.19, 'Recall': 0.1, 'Coverage': 0.08, 'Popularity': 5.27}


## 基于物品的协同过滤 

In [9]:
def ItemCF(train,K):
    N=dict()
    C=dict()
    for user,items in train.items():
        for i in items:
            if i not in N:
                N[i]=0
            N[i]+=1
            if i not in C:
                C[i]=dict()
            for j in items:
                if j==i:
                    continue
                if j not in C[i]:
                    C[i][j]=0
                C[i][j]+=1
    W=dict()
    for i,related_items in C.items():
        if i not in W:
            W[i]=dict()
        for j,cij in related_items.items():
            W[i][j]=cij/math.sqrt(N[i]*N[j])
    def GetRecommendation(user,n):
        rank =dict()
        ru=set(train[user])
        for i,ri in W.items():
            for j,wij in sorted(W[i].items(),key=itemgetter(1),reverse=True)[0:K]:
                if j not in ru:
                    if j not in rank:
                            rank[j]=0     
                    rank[j]+=wij
        rank = sorted(rank.items(),key=itemgetter(1),reverse=True)[0:n]
        return rank
    return GetRecommendation
        

## 物品相似度改进计算 

In [10]:
def ItemCFII(train,K):
    N=dict()
    C=dict()
    for user,items in train.items():
        for i in items:
            if i not in N:
                N[i]=0
            N[i]+=1
            if i not in C:
                C[i]=dict()
            for j in items:
                if j==i:
                    continue
                if j not in C[i]:
                    C[i][j]=0
                C[i][j]+=1/math.log(1+len(items)*1.0)
    W=dict()
    for i,related_items in C.items():
        if i not in W:
            W[i]=dict()
        for j,cij in related_items.items():
            W[i][j]=cij/math.sqrt(N[i]*N[j])
    def GetRecommendation(user,n):
        rank=dict()
        ru=set(train[user])
        for i,rui in W.items():
            for j,wij in sorted(W[i].items(),key=itemgetter(1),reverse=True)[0:K]:
                if j not in ru:
                    if j not in rank:
                        rank[j]=0
                    rank[j]+=wij
        return sorted(rank.items(),key=itemgetter(1),reverse=True)[0:n]
    return GetRecommendation

In [11]:
M=8
N=10
metrics = {'Precision': 0, 'Recall': 0, 
           'Coverage': 0, 'Popularity': 0}
for K in range(10,90,10):
    train,test = SplitData(M,0)
    train = convert_dict(train)
    test = convert_dict(test)
    GetRecommendation = ItemCF(train,K)   
    eval = Eval(train,test,GetRecommendation,N)
    metric = eval.eval()
    metrics = {k: metrics[k]+metric[k] for k in metrics}
metrics = {k: metrics[k] / M for k in metrics}
metrics

Metric: {'Precision': 0.0, 'Recall': 0.0, 'Coverage': 0.01, 'Popularity': 0.39}
Metric: {'Precision': 0.01, 'Recall': 0.0, 'Coverage': 0.01, 'Popularity': 1.16}
Metric: {'Precision': 0.02, 'Recall': 0.01, 'Coverage': 0.01, 'Popularity': 2.62}
Metric: {'Precision': 0.04, 'Recall': 0.02, 'Coverage': 0.01, 'Popularity': 4.54}
Metric: {'Precision': 0.05, 'Recall': 0.03, 'Coverage': 0.02, 'Popularity': 4.73}
Metric: {'Precision': 0.05, 'Recall': 0.03, 'Coverage': 0.02, 'Popularity': 4.77}
Metric: {'Precision': 0.06, 'Recall': 0.03, 'Coverage': 0.02, 'Popularity': 4.76}
Metric: {'Precision': 0.06, 'Recall': 0.03, 'Coverage': 0.02, 'Popularity': 4.76}


{'Precision': 0.036250000000000004,
 'Recall': 0.01875,
 'Coverage': 0.015000000000000001,
 'Popularity': 3.4662499999999996}

In [12]:
M=8
N=10
metrics = {'Precision': 0, 'Recall': 0, 
           'Coverage': 0, 'Popularity': 0}
GetRecommendation2 = ItemCFII(train,80)
eval2 = Eval(train,test,GetRecommendation2,N)
metric = eval2.eval()
metrics = {k: metrics[k]+metric[k] for k in metrics}
metrics = {k: metrics[k] / M for k in metrics}
metrics

Metric: {'Precision': 0.09, 'Recall': 0.05, 'Coverage': 0.02, 'Popularity': 5.15}


{'Precision': 0.01125,
 'Recall': 0.00625,
 'Coverage': 0.0025,
 'Popularity': 0.64375}

## 物品相似度计算归一化 

In [15]:
def ItemCF_Norm(train,K):
    N=dict()
    C=dict()
    for user,items in train.items():
        for i in items:
            if i not in N:
                N[i]=0
            N[i]+=1
            if i not in C:
                C[i]=dict()
            for j in items:
                if j==i:
                    continue
                if j not in C[i]:
                    C[i][j]=0
                C[i][j]+=1/math.log(1+len(items)*1.0)
    W=dict()
    for i,related_items in C.items():
        if i not in W:
            W[i]=dict()
        for j,cij in related_items.items():
            W[i][j]=cij/math.sqrt(N[i]*N[j])
    for i in W:
        s=0
        for j in W[i]:
            s+=W[i][j]
        if s!=0:
            for j in W[i]:
                W[i][j]/=s
    def GetRecommendation(user,n):
        rank = dict()
        ru =train[user]
        for i,rui in W.items():
            for j ,wij in sorted(W[i].items(),key=itemgetter(1),reverse=True)[0:K]:
                if j not in ru:
                    if j not in rank:
                        rank[j]=0
                    rank[j]+=wij
        return sorted(rank.items(),key=itemgetter(1),reverse=True)[0:n]
    return GetRecommendation

In [16]:
M=8
N=10
metrics = {'Precision': 0, 'Recall': 0, 
           'Coverage': 0, 'Popularity': 0}
GetRecommendation3 = ItemCF_Norm(train,K)
eval3 = Eval(train,test,GetRecommendation3,N)
metric = eval3.eval()
metrics = {k: metrics[k]+metric[k] for k in metrics}
metrics = {k: metrics[k] / M for k in metrics}

Metric: {'Precision': 0.0, 'Recall': 0.0, 'Coverage': 0.01, 'Popularity': 1.21}
