In [1]:
import os
os.sys.path.append(os.path.dirname(os.path.abspath('.')))

## 数据准备

In [2]:
import numpy as np

data = np.array([[2.5, 3.5, 3, 3.5, 2.5, 3],
                 [3, 3.5, 1.5, 5, 3.5, 3],
                 [2.5, 3, 0, 3.5, 0, 4],
                 [0, 3.5, 3, 0, 4, 4],
                 [3, 4, 2, 3, 2, 3],
                 [3, 4, 0, 5, 3.5, 3],
                 [0, 4.5, 0, 4, 1, 0]])
n_users, n_items = data.shape

有了user-item数据后，可以计算两两user之间的相似度：

In [3]:
from metrics.pairwise.euclidean_distances import euclidean_distances

dist_mat=euclidean_distances(data)    # 两两用户之间的距离矩阵
sim_mat=1/(1+dist_mat)    # 将距离转化成相似度

指定一个用户$user_{i}$，首先找到跟其最相似的前$k$个用户：

In [4]:
i = 6    # 最后一个用户
k = 3    # 使用最相似的前3个用户
top_k_sim = sim_mat[i][sim_mat[i] != 1].argsort(
)[-1:-k-1:-1]    # 首先排除相似度为1的用户，然后取前k个最相似的用户

推荐的本质就是为用户推荐其未曾见过或用过的东西，所以找出指定用户未评分的物品，然后计算相似用户对该物品的加权评分：

In [5]:
cand_items_mask = (data[i] == 0)    # 提取未评价物品的布尔索引
cand_items = np.arange(len(data[i]))[cand_items_mask]    # 候选推荐物品的索引

# 相似用户对候选物品的评分矩阵，形状为(top_users,cand_items)
scores = data[top_k_sim, :][:, cand_items_mask]
# 对已评分用户相似度的求和，作为分母
denominator = np.sum(
    sim_mat[i, top_k_sim], axis=0)

scores = np.sum(
    scores * sim_mat[i, top_k_sim].reshape(-1, 1), axis=0)    # 以相似度加权并求和
scores = scores/denominator    # 除以相似度的累加

idx = np.argsort(scores)[::-1]    # 按分数排序后的索引
scores = scores[idx]
cand_items = cand_items[idx]

print(scores, cand_items)

[2.834951   2.         3.33009799]
[3.33009799 2.834951   2.        ] [5 0 2]


封装测试：

In [6]:
def CF(data, i, k=5):
    '''
    i: 用户idx
    k: 使用前k个最相似的用户
    '''
    dist_mat = euclidean_distances(data)    # 两两row之间的距离矩阵
    sim_mat = 1/(1+dist_mat)    # 将距离转化成相似度

    top_k_sim = sim_mat[i][sim_mat[i] != 1].argsort()[-1:-k-1:-1]

    cand_items_msak = (data[i] == 0)
    cand_items = np.arange(len(data[i]))[cand_items_msak]

    # 相似用户对候选物品的评分矩阵，形状为(top_users,cand_items)
    scores = data[top_k_sim, :][:, cand_items_msak]
    # 对已评分用户相似度的求和，作为分母
    denominator = np.sum(
        sim_mat[i, top_k_sim], axis=0)

    scores = np.sum(
        scores * sim_mat[i, top_k_sim].reshape(-1, 1), axis=0)    # 以相似度加权并求和
    scores = scores/denominator    # 除以相似度的累加

    idx = np.argsort(scores)[::-1]    # 按分数排序后的索引
    scores = scores[idx]
    cand_items = cand_items[idx]

    return [(item, score) for item, score in zip(cand_items, scores)]


CF(data, 6, 3)

[(5, 3.3300979931640846), (0, 2.834951003417958), (2, 2.0)]

如果需要针对物品推荐用户，将data矩阵转置即可。

In [7]:
data_T = data.T
CF(data_T, 2, 2)

[(6, 4.0), (5, 3.7848875039392977), (2, 2.8924437519696484)]