In [15]:
import os
import sys

# Add the parent of the current working directory to the module search path.
# NOTE(review): presumably this is what lets the project-local `metrics`
# package imported further down resolve -- confirm against the repo layout.
_parent_dir = os.path.dirname(os.path.abspath('.'))

# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

## 数据准备

In [16]:
import numpy as np

# User-item rating matrix: one row per user, one column per item.
# A rating of 0 is treated further down as "this user has not rated the item".
data = np.array([
    [2.5, 3.5, 3.0, 3.5, 2.5, 3.0],
    [3.0, 3.5, 1.5, 5.0, 3.5, 3.0],
    [2.5, 3.0, 0.0, 3.5, 0.0, 4.0],
    [0.0, 3.5, 3.0, 0.0, 4.0, 4.0],
    [3.0, 4.0, 2.0, 3.0, 2.0, 3.0],
    [3.0, 4.0, 0.0, 5.0, 3.5, 3.0],
    [0.0, 4.5, 0.0, 4.0, 1.0, 0.0],
])

# Rows are users, columns are items.
n_users = data.shape[0]
n_items = data.shape[1]

In [18]:
from metrics.pairwise.euclidean_distances import euclidean_distances

# Items are the columns of `data`, so transpose to get one row per item
# before computing the pairwise item-to-item distance matrix.
dist_mat = euclidean_distances(np.transpose(data))

# Turn a distance d into a similarity 1 / (1 + d): the larger the distance,
# the smaller the similarity.
sim_mat = np.divide(1.0, dist_mat + 1.0)

因为需要同时存储物品索引与相似度，这里使用字典来存储：键为物品索引，值为按相似度从高到低排列的 (物品索引, 相似度) 列表。

In [42]:
# Build, for every item, the list of the other items ordered from most to
# least similar, together with the corresponding similarity values.

# Sort once. A stable sort keeps tied similarities in ascending item-index
# order, so the result is deterministic.
_order = np.argsort(-sim_mat, axis=1, kind='stable')

# Remove each item from its own neighbour list by index rather than by
# position: if another item has exactly the same similarity as the item
# itself, the item is not guaranteed to sort into column 0.
_not_self = _order != np.arange(n_items)[:, np.newaxis]
sim_idx_mat = _order[_not_self].reshape(n_items, n_items - 1)

# Gather the similarities in the same order as the indices so that every
# (index, value) pair is consistent by construction.
sim_val_mat = np.take_along_axis(sim_mat, sim_idx_mat, axis=1)

# item index -> [(neighbour index, similarity), ...], most similar first.
# Values are converted to plain int/float so the printed dictionary does not
# depend on how the installed NumPy version renders its scalar types.
sim_dict = {
    row: [(int(item_idx), float(sim_val))
          for item_idx, sim_val in zip(sim_idx_mat[row], sim_val_mat[row])]
    for row in range(n_items)
}

print(sim_dict)

{0: [(5, 0.1886378647726465), (4, 0.16736577623297264), (3, 0.1639607805437114), (2, 0.15954492995986427), (1, 0.14285714285714285)], 1: [(3, 0.1951941016011038), (5, 0.16952084719853724), (4, 0.16202097927744855), (0, 0.14285714285714285), (2, 0.12002728245132872)], 2: [(4, 0.1886378647726465), (0, 0.15954492995986427), (5, 0.1560469703786189), (1, 0.12002728245132872), (3, 0.1030561550871519)], 3: [(1, 0.1951941016011038), (0, 0.1639607805437114), (5, 0.13579648178933995), (4, 0.13133048602716904), (2, 0.1030561550871519)], 4: [(2, 0.1886378647726465), (5, 0.1876127897984334), (0, 0.16736577623297264), (1, 0.16202097927744855), (3, 0.13133048602716904)], 5: [(0, 0.1886378647726465), (4, 0.1876127897984334), (1, 0.16952084719853724), (2, 0.1560469703786189), (3, 0.13579648178933995)]}


现在仍然要为最后一个用户做推荐，但不再需要回到原始数据中寻找相似用户，而是根据该用户自己已评分的物品，去找出与之最相似的候选物品。首先提取该用户已有的历史数据，以及该用户未曾接触过的物品（候选推荐）：

In [39]:
i = 6    # row index of the last user

# Split the item indices by whether this user has a non-zero rating for them.
item_ids = np.arange(n_items)
rated_mask = data[i] != 0

seen_items = item_ids[rated_mask]     # items the user has already rated
cand_items = item_ids[~rated_mask]    # unrated items: recommendation candidates

print(seen_items, cand_items)

[1 3 4] [0 2 5]


In [45]:
# Predict a score for every candidate item as the similarity-weighted average
# of the user's existing ratings.
scores = {}     # candidate item -> sum of (similarity * rating)
tol_sim = {}    # candidate item -> sum of similarities (the denominator)

# Walk over the items the user has already rated (i.e. the non-candidates).
for rated_item, rating in enumerate(data[i]):
    if rated_item not in cand_items:
        # Look at the neighbours of this rated item and keep only candidates.
        for neighbour, weight in sim_dict[rated_item]:
            if neighbour not in cand_items:
                continue
            scores[neighbour] = scores.get(neighbour, 0) + weight * rating
            tol_sim[neighbour] = tol_sim.get(neighbour, 0) + weight

# Normalise each weighted sum by its total similarity.
# Note: the pairs come out in the order the candidates were first encountered
# in the loop above; the list is not sorted by score.
ranking = [
    (item, score / tol_sim[item])
    for item, score in scores.items()
]

In [46]:
# Display the (item, predicted score) pairs for the candidate items.
ranking

[(5, 3.0301303824299635), (0, 3.0917681109601265), (2, 2.771256307962468)]