In [1]:
import numpy as np
import pandas as pd
import math
import operator

### 读取数据

In [2]:
# Load the tab-separated tagging log; the preview below shows the columns
# userID, bookmarkID, tagID and timestamp.
file_path = "./data/user_taggedbookmarks-timestamps.dat"
data = pd.read_csv(file_path, sep='\t')
# Bare last expression so the notebook renders the first three rows.
data.head(3)

Unnamed: 0,userID,bookmarkID,tagID,timestamp
0,8,1,1,1289255362000
1,8,2,1,1289255159000
2,8,7,1,1289238901000


### 数据结构定义与加载

In [3]:
# Nested tagging history: {user_id: {item_id: [tag_id, tag_id, ...]}}
records = {}

# Walk the first three columns positionally as plain (user, item, tag) tuples
# and append each tag to that user's list for the item.
for u_id, i_id, t_id in data.iloc[:, :3].itertuples(index=False, name=None):
    records.setdefault(u_id, {}).setdefault(i_id, []).append(t_id)
print(f"设置tag的人数为{len(records)}.")
print("数据加载完成\n")

设置tag的人数为1867.
数据加载完成



### 划分训练集与测试集

In [4]:
def train_test_split(test_ratio, seed=0):
    """Split every user's tagged items into the global train/test dictionaries.

    For each user in the module-level ``records`` mapping,
    ``round(n_items * test_ratio)`` items are drawn without replacement and
    copied into ``test_data``; the remaining items are copied into
    ``train_data``. A user whose draw is empty gets no ``test_data`` entry.

    Args:
        test_ratio: Fraction of each user's items to hold out for testing.
        seed: Value passed to ``np.random.seed`` so the split is reproducible.

    Side effects:
        Adds entries to the module-level ``train_data`` and ``test_data`` dicts
        (they are not cleared first) and prints ``len(train_data)`` and
        ``len(test_data)``, i.e. the number of *users* present in each split.
    """
    np.random.seed(seed)
    for u, user_records in records.items():
        ls_items = list(user_records)
        n_test = round(len(ls_items) * test_ratio)
        # Sampling a fixed count per user keeps the realised ratio close to test_ratio.
        test_items = np.random.choice(ls_items, n_test, replace=False)
        # Hash-based membership instead of scanning the NumPy array once per item.
        held_out = set(test_items.tolist())
        # Test split
        for i in test_items:
            test_data.setdefault(u, {})[i] = user_records[i].copy()
        # Training split, in the user's original item order
        for i in ls_items:
            if i not in held_out:
                train_data.setdefault(u, {})[i] = user_records[i].copy()
    print("训练集样本数 %d, 测试集样本数 %d" % (len(train_data),len(test_data)))
    
# Train / test containers, filled in place by train_test_split
# (20% of each user's items are held out for testing).
train_data, test_data = {}, {}
train_test_split(0.2)

训练集样本数 1867, 测试集样本数 1812


In [5]:
# Helper used to build the training-set count tables
def addValueToMat(mat, index, item, value=1):
    """Accumulate ``value`` into the nested counter ``mat[index][item]``.

    Missing rows and cells are created on demand: the first write for a cell
    stores ``value`` as-is, later writes add to the stored number.
    ``mat`` is modified in place and nothing is returned.
    """
    row = mat.setdefault(index, {})
    if item in row:
        row[item] += value
    else:
        row[item] = value

# Count tables, each shaped {outer_id: {inner_id: number of training tag events}}
user_tags = {}    # user -> tag
tag_items = {}    # tag  -> item
user_items = {}   # user -> item
tag_users = {}    # tag  -> user
item_tags = {}    # item -> tag

# Populate every table from the training split only.
for u_id, tagged_items in train_data.items():
    for i_id, item_tag_ids in tagged_items.items():
        for t_id in item_tag_ids:
            addValueToMat(user_tags, u_id, t_id)
            addValueToMat(tag_items, t_id, i_id)
            addValueToMat(user_items, u_id, i_id)
            addValueToMat(tag_users, t_id, u_id)
            addValueToMat(item_tags, i_id, t_id)
print('生成数据集完成.')

生成数据集完成.


### 推荐算法

In [6]:
def simple_tag_based(wut, wti, nut=None, nti=None, ntu=None):
    """Plain count product for one (tag, item) pair: ``wut * wti``.

    ``nut``, ``nti`` and ``ntu`` are accepted only so that all scoring
    functions share one call signature; they are ignored here.
    """
    contribution = wut * wti
    return contribution

def norm_tag_based(wut, wti, nut=None, nti=None, ntu=None):
    """Normalised count product: ``wut / nut * wti / nti``.

    ``nut`` and ``nti`` must be supplied (non-zero numbers) despite their
    ``None`` defaults; ``ntu`` is ignored. The expression is evaluated
    strictly left to right.
    """
    user_share = wut / nut
    weighted = user_share * wti
    return weighted / nti

def tag_based_tfidf(wut, wti, nut=None, nti=None, ntu=None):
    """TF-IDF style score for one (tag, item) pair: ``wut / log(1 + ntu) * wti``.

    Args:
        wut: Weight of the tag for the user.
        wti: Weight of the tag for the item.
        nut, nti: Ignored; present so all scoring functions share a signature.
        ntu: Count used in the damping term ``log(1 + ntu)``. Must be >= 1;
            ``ntu == 0`` makes the denominator zero and raises ZeroDivisionError.

    Returns:
        The score as a Python ``float``.
    """
    # math.log avoids NumPy's per-call ufunc overhead on plain Python scalars;
    # this function runs once per (tag, item) pair in the innermost scoring loop.
    # The value matches np.log up to floating-point rounding.
    return wut / math.log(1 + ntu) * wti

def recommend(user, N, method='simple'):
    """Return the top-``N`` ``(item, score)`` pairs recommended to ``user``.

    Every item that shares at least one tag with the user, and that the user
    has not tagged already, is scored by summing the chosen scoring function
    over the shared tags. ``method`` is one of ``'simple'``, ``'norm'`` or
    ``'tfidf'``. The result is sorted by descending score.
    """
    scorers = {'simple': simple_tag_based, 'norm': norm_tag_based, 'tfidf': tag_based_tfidf}
    score_fn = scorers[method]
    already_tagged = user_items[user].keys()
    nut = len(user_tags[user])  # number of distinct tags the user has used
    scores = {}
    for tag, wut in user_tags[user].items():
        nti = len(tag_items[tag])  # number of distinct items carrying this tag
        ntu = len(tag_users[tag])  # number of distinct users who used this tag
        for item, wti in tag_items[tag].items():
            if item in already_tagged:
                # Never recommend something the user already has.
                continue
            contribution = score_fn(wut, wti, nut, nti, ntu)
            if item in scores:
                scores[item] = scores[item] + contribution
            else:
                scores[item] = contribution
    ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[0:N]

### 评估函数及算法测试

In [7]:
# Evaluate top-N recommendations against the held-out test set
def precisionAndRecall(N, method):
    """Return ``(precision, recall)`` of top-``N`` recommendations.

    Only users present in both ``test_data`` and ``train_data`` are evaluated.
    A hit is a recommended item that appears among the user's test items.

    * precision = hits / (N * number of evaluated users); the denominator
      counts ``N`` per user even if ``recommend`` returns fewer items.
    * recall = hits / total number of test items of the evaluated users.

    Either ratio is reported as ``0.0`` when its denominator is zero (no user
    evaluated, or ``N == 0``) instead of raising ZeroDivisionError.
    """
    hit = 0
    h_recall = 0
    h_precision = 0
    for user, items in test_data.items():
        if user not in train_data:
            # No training history, so nothing to base a recommendation on.
            continue
        rank = recommend(user, N, method)
        hit += sum(1 for item, _ in rank if item in items)
        h_recall += len(items)
        h_precision += N
    precision = hit / h_precision if h_precision else 0.0
    recall = hit / h_recall if h_recall else 0.0
    return precision, recall

# 使用测试集，对推荐结果进行评估
def testRecommend(method='simple'):
    print("推荐结果评估")
    print("%3s %10s %10s" % ('N',"精确率",'召回率'))
    for n in [5,10,20,40,60,80,100]:
        precision,recall = precisionAndRecall(n, method)
        print("%3d %10.3f%% %10.3f%%" % (n, precision * 100, recall * 100))

In [8]:
%%time
# Evaluate the plain count-product scorer ('simple') at every cutoff N.
testRecommend(method='simple')

推荐结果评估
  N        精确率        召回率
  5      0.784%      0.339%
 10      0.701%      0.606%
 20      0.535%      0.925%
 40      0.374%      1.292%
 60      0.305%      1.583%
 80      0.272%      1.879%
100      0.250%      2.160%
Wall time: 3min 31s


In [9]:
%%time
# Evaluate the normalised scorer ('norm') at every cutoff N.
testRecommend(method='norm')

推荐结果评估
  N        精确率        召回率
  5      0.751%      0.324%
 10      0.585%      0.506%
 20      0.392%      0.677%
 40      0.316%      1.092%
 60      0.276%      1.431%
 80      0.249%      1.722%
100      0.220%      1.898%
Wall time: 4min 26s


In [10]:
%%time
# Evaluate the TF-IDF style scorer ('tfidf') at every cutoff N.
testRecommend(method='tfidf')

推荐结果评估
  N        精确率        召回率
  5      0.894%      0.386%
 10      0.751%      0.649%
 20      0.574%      0.992%
 40      0.407%      1.407%
 60      0.341%      1.769%
 80      0.296%      2.046%
100      0.267%      2.308%
Wall time: 16min 23s


显然，三种方法的推荐效果都不理想（N=100 时召回率也只有约 2%）。我认为主要问题出在数据集上：每个商品被打标签的次数太少，同一标签下各商品的热门程度缺乏区分度。此外，基于标签的打分本身也是最简单的推荐算法之一，这对效果也有一定影响。

### 尝试改进推荐分数的计算方式（结果计算反而更慢）

In [54]:
# Pre-convert each per-user / per-item tag-count dict to a pandas Series
# (indexed by tag id) so that two of them can be multiplied with alignment.
dse_user_tags = {user: pd.Series(tag_counts) for user, tag_counts in user_tags.items()}
dse_item_tags = {item: pd.Series(tag_counts) for item, tag_counts in item_tags.items()}

In [83]:
def fast_recommend(user, N):
    """Return the top-``N`` ``(item, score)`` pairs for ``user`` using pandas Series.

    Candidates are all items carrying at least one of the user's tags, minus
    the items the user has already tagged. Each candidate's score is the sum of
    the element-wise product of the user's and the item's tag-count Series;
    pandas aligns them on tag id and ``sum`` skips the unmatched (NaN) entries,
    so the score is a float. Candidates with equal scores come out in set
    iteration order, which is unspecified.
    """
    # Grow one candidate set in place rather than copying it for every tag.
    candidates = set()
    for tag in user_tags[user]:
        candidates.update(tag_items[tag])
    candidates -= set(user_items[user])
    # The user's Series is the same for every candidate; look it up once.
    user_vec = dse_user_tags[user]
    recommend_items = {
        item: (user_vec * dse_item_tags[item]).sum() for item in candidates
    }
    return sorted(recommend_items.items(), key=operator.itemgetter(1), reverse=True)[0:N]

In [80]:
%%time
# Baseline timing: dict-based scorer, top 10 for user 8.
recommend(8, 10)

Wall time: 11.9 ms


[(1526, 77),
 (1416, 61),
 (4087, 53),
 (14657, 52),
 (918, 51),
 (4639, 50),
 (12968, 50),
 (3203, 45),
 (4638, 39),
 (4535, 38)]

In [84]:
%%time
# Same query through the pandas-Series variant, for comparison with recommend(8, 10).
fast_recommend(8, 10)

Wall time: 8.36 s


[(1526, 77.0),
 (1416, 61.0),
 (4087, 53.0),
 (14657, 52.0),
 (918, 51.0),
 (4639, 50.0),
 (12968, 50.0),
 (3203, 45.0),
 (4638, 39.0),
 (4535, 38.0)]

还想过另一种方法：对用户 u 的每个标签 t（即 `user_tags[u].keys()` 中的每一项），找出被打上标签 t 次数最多的 m 个商品，只对这些商品计算推荐 score，从而减少计算量。但由于数据集太小，很多标签在每个商品上只被打过一两次，缺少区分度，因此放弃了这一方案。

In [10]:
# For the first five tags, show the five items tagged with them most often.
for t in list(tag_items)[:5]:
    top_items = sorted(tag_items[t].items(), key=operator.itemgetter(1), reverse=True)[:5]
    print(f'tag {t}: ', top_items)

tag 1:  [(1526, 3), (1, 2), (7, 2), (918, 2), (4087, 2)]
tag 6:  [(2484, 3), (7, 1), (2544, 1), (4256, 1), (4602, 1)]
tag 7:  [(15505, 2), (7, 1), (2, 1), (654, 1), (1187, 1)]
tag 8:  [(11253, 2), (12046, 2), (8, 1), (44682, 1), (63318, 1)]
tag 9:  [(3603, 3), (64471, 2), (3607, 2), (3608, 2), (3609, 2)]
