In [18]:
import pandas as pd
import random
import math
import operator 

# Module-level containers shared by the functions in this cell.
# Raw tagging log: user -> item -> list of tags (filled by data_load).
records = dict()

# Counters filled by initstate() from the training split.
user_tags = {}    # user -> tag  -> number of (item, tag) entries
tag_items = {}    # tag  -> item -> number of entries
user_items = {}   # user -> item -> number of tag entries on that item
tag_users = {}    # tag  -> user -> number of entries

# Train / test partitions of records: user -> item -> list of tags
# (filled by train_test_split).
train_data = {}
test_data = {}

# Load the tagging data
def data_load(file_path):
    """Read a tab-separated tagging log and rebuild ``records`` from it.

    The file needs a header row with the columns ``userID``, ``bookmarkID``
    and ``tagID``. After the call the module-level ``records`` dict maps
    user -> item -> list of tags, with tags kept in file order (duplicates
    are not removed). Keys and tags are the plain Python values produced by
    ``Series.tolist()``.

    ``records`` is emptied once the file has been read, so calling this
    function again (e.g. re-running the notebook cell) replaces the previous
    contents instead of appending every tag a second time.

    Args:
        file_path: Location of the file; passed straight to ``pd.read_csv``.
    """
    print('数据开始加载\n')
    df = pd.read_csv(file_path,sep = '\t')
    # Reset only after the read succeeded so a bad path keeps the old data.
    records.clear()
    # Walk the three columns in lockstep. Looking rows up one at a time with
    # df['userID'][i] goes through pandas label indexing for every cell.
    rows = zip(df['userID'].tolist(), df['bookmarkID'].tolist(), df['tagID'].tolist())
    for u, item, tag in rows:
        records.setdefault(u, {}).setdefault(item, []).append(tag)
    print('数据加载完毕\n')
    print('数据集大小:%d,打标签的用户数量为:%d'%(len(df),len(records)))
    
# Split records into a training set and a test set
def train_test_split(ratio,seed=100):
    """Partition ``records`` into ``train_data`` and ``test_data``.

    Each (user, item) entry of ``records`` is sent to ``test_data`` when a
    fresh ``random.random()`` draw is below ``ratio`` and to ``train_data``
    otherwise; its tag list is copied, not shared. Draws happen in the
    iteration order of ``records``, one per (user, item), so a given
    ``seed`` reproduces the same split.

    Both target dicts are emptied in place first. Without that, a second call
    would leave the earlier split behind: the same item could sit in both
    sets, or have its tag list duplicated.

    Args:
        ratio: Probability of sending an entry to the test set.
        seed: Value passed to ``random.seed`` before drawing.
    """
    random.seed(seed)
    train_data.clear()
    test_data.clear()
    for u, items in records.items():
        for i, tags in items.items():
            target = test_data if random.random() < ratio else train_data
            # The targets were just emptied, so (u, i) cannot exist yet.
            target.setdefault(u, {})[i] = list(tags)
# Accumulate a value into a nested-dict "matrix"
def addValueToMat(mat,index,item,value=1):
    """Add ``value`` to ``mat[index][item]``, creating missing entries.

    ``mat`` is a dict of dicts. A missing row is created empty; a missing
    cell is initialised to ``value``; an existing cell is incremented by it.
    """
    row = mat.setdefault(index, {})
    if item in row:
        row[item] += value
    else:
        row[item] = value

# Initialisation: build the lookup tables from the training split
def initstate():
    """Rebuild ``user_tags``, ``tag_items``, ``user_items`` and ``tag_users``.

    Every (user, item, tag) entry in ``train_data`` adds 1 to
    ``user_tags[user][tag]``, ``tag_items[tag][item]``,
    ``user_items[user][item]`` and ``tag_users[tag][user]``.

    The four tables are emptied in place first; otherwise running this twice
    (e.g. re-running the notebook cell) would double every count. A summary
    of the table sizes is printed at the end.
    """
    for table in (user_tags, tag_items, user_items, tag_users):
        table.clear()
    for u, items in train_data.items():
        for i, tags in items.items():
            for t in tags:
                addValueToMat(user_tags,u,t)
                addValueToMat(tag_items,t,i)
                addValueToMat(user_items,u,i)
                addValueToMat(tag_users,t,u)
    print('数据初始化完成\n')
    print('usr_tags size:%d,tag_items size:%d,user_itmes size:%d,tag_users size:%d'%(len(user_tags),len(tag_items),len(user_items),len(tag_users)))
    
def recommend(user,N):
    """Return the top-``N`` items for ``user`` as ``(item, score)`` pairs.

    For every tag ``t`` the user has used (weight ``wut`` from
    ``user_tags``) and every item ``i`` carrying that tag (weight ``wti``
    from ``tag_items``), ``wut*wti/IDF`` is added to the item's score, where
    ``IDF = log(1 + number of users in tag_users[t])``. Items already present
    in ``user_items[user]`` are skipped.

    A user with no entry in the training tables gets an empty list rather
    than a ``KeyError``.

    Args:
        user: User id to recommend for.
        N: Maximum number of pairs to return.

    Returns:
        List of ``(item, score)`` tuples sorted by descending score.
    """
    recommend_items = {}
    tagged_items = user_items.get(user, {})
    for t,wut in user_tags.get(user, {}).items():
        # Dampen tags used by many users; the +1 keeps the log above zero.
        IDF = math.log(len(tag_users[t])+1)
        for i,wti in tag_items[t].items():
            if i in tagged_items:
                continue
            recommend_items[i] = recommend_items.get(i, 0) + wut*wti/IDF
    return sorted(recommend_items.items(),key=operator.itemgetter(1),reverse=True)[0:N]

def precisionAndRecall(N):
    """Evaluate top-``N`` recommendations against ``test_data``.

    Only users that also appear in ``train_data`` are scored. A hit is a
    recommended item that is one of the user's test items. The precision
    denominator grows by ``N`` per scored user (even if ``recommend``
    returned fewer items) and the recall denominator by the number of the
    user's test items. The totals are printed before returning.

    Args:
        N: Number of recommendations requested per user.

    Returns:
        ``(precision, recall)`` as floats. A metric whose denominator is zero
        (no scored users, or ``N == 0``) is reported as 0.0 instead of
        raising ``ZeroDivisionError``.
    """
    hit = 0
    h_precision = 0
    h_recall = 0
    for u,items in test_data.items():
        if u not in train_data:
            continue
        rank = recommend(u,N)
        hit += sum(1 for item,_ in rank if item in items)
        h_precision += N
        h_recall    += len(items)
    print('一共命中了：%d,一共推荐了：%d,用户实际打标签的个数:%d'%(hit,h_precision,h_recall))
    precision = hit/h_precision if h_precision else 0.0
    recall = hit/h_recall if h_recall else 0.0
    return precision,recall

def testRecommend():
    print('推荐结果评估\n')
    print('%3s %10s %10s'%('N','准确率','召回率'))
    for n in [5,10,20,40,60,80,100]:
        precision,recall = precisionAndRecall(n)
        print('%3d %10.3f%% %10.3f%%'%(n,precision*100,recall*100))

In [19]:
# End-to-end run: load -> split -> build tables -> evaluate.
# NOTE(review): the path is absolute and specific to one machine; adjust it
# to wherever the delicious-2k data lives before running.
DATA_FILE = 'C:/Users/18280/Desktop/RS/Recommended_System-master/L2/delicious-2k/user_taggedbookmarks-timestamps.dat'
TEST_RATIO = 0.2  # test-set ratio handed to train_test_split

data_load(DATA_FILE)
train_test_split(TEST_RATIO)
initstate()
testRecommend()

数据开始加载

数据加载完毕

数据集大小:437593,打标签的用户数量为:1867
数据初始化完成

usr_tags size:1860,tag_items size:36884,user_itmes size:1860,tag_users size:36884
推荐结果评估

  N        准确率        召回率
一共命中了：90,一共推荐了：8930,用户实际打标签的个数:20861
  5      1.008%      0.431%
一共命中了：136,一共推荐了：17860,用户实际打标签的个数:20861
 10      0.761%      0.652%
一共命中了：196,一共推荐了：35720,用户实际打标签的个数:20861
 20      0.549%      0.940%
一共命中了：287,一共推荐了：71440,用户实际打标签的个数:20861
 40      0.402%      1.376%
一共命中了：352,一共推荐了：107160,用户实际打标签的个数:20861
 60      0.328%      1.687%
一共命中了：424,一共推荐了：142880,用户实际打标签的个数:20861
 80      0.297%      2.033%
一共命中了：481,一共推荐了：178600,用户实际打标签的个数:20861
100      0.269%      2.306%
