In [2]:
import pandas as pd
import random 
import math
import operator

In [3]:
# 1. Define the shared data structures (module-level state used by every function below).
# records:    {user: {item: [tag, ...]}} -- filled by data_load
# user_tags:  {user: {tag: count}}       -- filled by initstate
# tag_items:  {tag: {item: count}}       -- filled by initstate
# user_items: {user: {item: count}}      -- filled by initstate
records = {}
user_tags = {}
tag_items = {}
user_items = {}

# Train / test partitions of records, same nested layout; filled by train_test_split.
train_data = {}
test_data = {}

In [4]:
def data_load(path):
    """Load tab-separated tagging records from ``path`` into the global ``records``.

    The file is read with ``pandas.read_csv(path, sep='\\t')`` and must provide
    the columns ``userID``, ``bookmarkID`` and ``tagID``. Each row appends its
    tag to ``records[userID][bookmarkID]``, giving the layout::

        {u1: {item1: [tag1, tag2, ...], item2: [tag1, tag2, ...], ...},
         u2: {item1: [tag1, tag2, ...], item2: [tag1, tag2, ...], ...},
         ...
         un: {item1: [tag1, tag2, ...], item2: [tag1, tag2, ...], ...}}

    Entries already present in ``records`` are kept; rows are added on top of
    them. Progress and summary counts are printed to stdout.

    Parameters
    ----------
    path : str
        Location of the tab-separated file.
    """
    print('Data begin to load...')
    df = pd.read_csv(path,sep='\t')
    # Walk the three columns in lockstep. The previous per-cell lookup
    # (df[col][i]) paid a label lookup for every cell and only worked when the
    # index labels were exactly 0..n-1.
    rows = zip(df['userID'].tolist(), df['bookmarkID'].tolist(), df['tagID'].tolist())
    for uid, itemid, tid in rows:
        records.setdefault(uid,{}).setdefault(itemid,[]).append(tid)
    print('数据加载完毕\n')
    print('数据集大小：%d'%len(df))
    print('打标签的人数为:%d'%len(records))

def train_test_split(ratio,seed=100):
    """Partition the global ``records`` into ``train_data`` and ``test_data``.

    Every (user, item) pair is assigned as a whole: one ``random.random()``
    draw is made per pair, and the pair (with a copy of its tag list) goes to
    ``test_data`` when the draw is below ``ratio``, otherwise to ``train_data``.

    Both target dicts are emptied first, so each call yields a fresh split
    of ``records`` rather than stacking onto the result of an earlier call.

    Parameters
    ----------
    ratio : float
        Threshold compared against each draw; the expected share of pairs
        sent to ``test_data``.
    seed : int, optional
        Seed passed to ``random.seed`` so the split is repeatable.
    """
    random.seed(seed)
    # Clear in place (not rebind) so every other reference to these dicts sees
    # the new split. Without this, a second call duplicated tag lists and could
    # leave the same pair in both partitions.
    train_data.clear()
    test_data.clear()
    for u, items in records.items():
        for i, tags in items.items():
            target = test_data if random.random()<ratio else train_data
            # Copy the tag list so the partitions never alias records.
            target.setdefault(u,{})[i] = list(tags)
                    
def AddValueToMat(mat,index,item,value=1):
    """Accumulate ``value`` into the nested counter ``mat[index][item]``.

    ``mat`` is a dict of dicts shaped ``{index: {item: total}}``. A missing
    ``index`` row is created on demand; a missing ``item`` starts at ``value``
    and an existing one is increased by ``value``. ``mat`` is modified in
    place and nothing is returned.
    """
    row = mat.setdefault(index, {})
    if item in row:
        row[item] += value
    else:
        row[item] = value
def initstate():
    """Rebuild the global co-occurrence counters from ``train_data``.

    For every (user, item, tag) triple found in ``train_data`` the following
    counters are increased by one through ``AddValueToMat``:

    * ``user_tags[user][tag]``
    * ``tag_items[tag][item]``
    * ``user_items[user][item]``

    The three counters are emptied first, so calling this again after a new
    split rebuilds them instead of stacking counts onto stale entries.
    A completion message and the counter sizes are printed to stdout.
    """
    # Clear in place so other references to these dicts stay valid.
    for mat in (user_tags, tag_items, user_items):
        mat.clear()
    for u, items in train_data.items():
        for item, tags in items.items():
            for tag in tags:
                AddValueToMat(user_tags,u,tag)
                AddValueToMat(tag_items,tag,item)
                AddValueToMat(user_items,u,item)
    print('user_tags,tag_items,user_items初始化完成\n')
    print('user_tags size:%d,tag_items size:%d,user_items size:%d'%(len(user_tags),len(tag_items),len(user_items)))

def recommend(user,N):
    """Return up to ``N`` items recommended for ``user``, best first.

    The score of an item is accumulated over every tag the user has used::

        score(item) += wut * wti / user_tagged / item_tagged_count

    where ``wut = user_tags[user][tag]``, ``wti = tag_items[tag][item]``,
    ``user_tagged`` is the sum of all of the user's tag counts and
    ``item_tagged_count`` is the sum of all item counts for that tag.
    Items already present in ``user_items[user]`` are skipped.

    Parameters
    ----------
    user : hashable
        Key into ``user_tags`` / ``user_items``.
    N : int
        Length of the slice taken from the ranked list.

    Returns
    -------
    list of (item, score) tuples
        Sorted by descending score. Empty when ``user`` has no entry in
        ``user_tags``.
    """
    # A user with no tag history has nothing to score against.
    if user not in user_tags:
        return []
    recommend_item = dict() # scores keyed by item
    tagged_item = user_items.get(user, {})
    # Total number of tag assignments made by the user; constant for the call,
    # so compute it once instead of once per tag.
    user_tagged = sum(user_tags[user].values())
    for tag,wut in user_tags[user].items():
        # Total assignments of this tag; constant per tag, so compute it once
        # instead of once per item (which made each tag quadratic).
        item_tagged_count = sum(tag_items[tag].values())
        for item,wti in tag_items[tag].items():
            if item in tagged_item:
                continue
            recommend_item[item] = recommend_item.get(item, 0) + wut*wti/user_tagged/item_tagged_count
    return sorted(recommend_item.items(),key=operator.itemgetter(1),reverse=True)[0:N]

def precisionAndRecall(N):
    """Compute precision and recall of ``recommend`` at list length ``N``.

    Only users that appear in both ``test_data`` and ``train_data`` are
    evaluated. For each of them:

    * ``hit`` grows by the number of recommended items found in the user's
      test items,
    * the recall denominator grows by the number of the user's test items,
    * the precision denominator grows by ``N`` (even when ``recommend``
      returns fewer than ``N`` items).

    A one-line summary of the three totals is printed to stdout.

    Returns
    -------
    (precision, recall) : tuple of float
        ``hit`` divided by each denominator. A denominator of zero (no
        evaluable user, or ``N == 0``) yields ``0.0`` for that metric
        instead of raising ``ZeroDivisionError``.
    """
    hit = 0
    h_recall = 0
    h_precision = 0
    for user,items in test_data.items():
        # Users without training data cannot be given recommendations.
        if user not in train_data:
            continue
        rank = recommend(user,N)
        hit += sum(1 for item,_ in rank if item in items)
        h_recall += len(items)
        h_precision += N
    print('一共推荐%d个,一共命中%d个,用户设置tag的总数%d个'%(h_precision,hit,h_recall))
    precision = hit/h_precision if h_precision else 0.0
    recall = hit/h_recall if h_recall else 0.0
    return precision,recall

def testRecommend():
    print('推荐结果评估')
    print('%3s %10s %10s'%('N','准确率','召回率'))
    for n in [5,10,20,40,60,80,100]:
        precision,recall = precisionAndRecall(n)
        print('%3d %10.3f%% %10.3f%%'%(n,precision*100,recall*100))

In [5]:
# Run the whole pipeline: load -> split -> build counters -> evaluate.
# NOTE(review): DATA_PATH is an absolute path on the author's machine; point it
# at your own copy of the dataset before running.
DATA_PATH = 'C:/Users/18280/Desktop/RS/Recommended_System-master/L2/delicious-2k/user_taggedbookmarks-timestamps.dat'
TEST_RATIO = 0.2  # threshold handed to train_test_split

data_load(DATA_PATH)
train_test_split(TEST_RATIO)
initstate()
testRecommend()

Data begin to load...
数据加载完毕

数据集大小：437593
打标签的人数为:1867
user_tags,tag_items,user_items初始化完成

user_tags size:1860,tag_items size:36884,user_items size:1860
推荐结果评估
  N        准确率        召回率
一共推荐8930个,一共命中64个,用户设置tag的总数20861个
  5      0.717%      0.307%
一共推荐17860个,一共命中94个,用户设置tag的总数20861个
 10      0.526%      0.451%
一共推荐35720个,一共命中147个,用户设置tag的总数20861个
 20      0.412%      0.705%
一共推荐71440个,一共命中209个,用户设置tag的总数20861个
 40      0.293%      1.002%
一共推荐107160个,一共命中262个,用户设置tag的总数20861个
 60      0.244%      1.256%
一共推荐142880个,一共命中322个,用户设置tag的总数20861个
 80      0.225%      1.544%
一共推荐178600个,一共命中382个,用户设置tag的总数20861个
100      0.214%      1.831%
