In [14]:
import math

import pandas as pd
import numpy as np
import os
from tqdm import tqdm

In [15]:
data = pd.read_csv('../datasets/ml-100k/u1.base.OCCF', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
test = pd.read_csv('../datasets/ml-100k/u1.test.OCCF', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
data.head()
# test.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,5,874965758
1,1,3,4,878542960
2,1,7,4,875071561
3,1,9,5,878543541
4,1,13,5,875071805


In [16]:
# 初始化
userNum = 943
itemNum = 1682
R = np.zeros((userNum, itemNum))

# 将u2.base的数据存入矩阵
for row in data.itertuples():
    userID, itemID, rating = row[1] - 1, row[2] - 1, row[3]
    R[userID, itemID] = 1

print(R)

testRank = {}
for row in test.itertuples():
    userID, itemID, rating = row[1] - 1, row[2] - 1, row[3]
    testRank.setdefault(userID, [])
    testRank[userID].append(itemID)
print(testRank)

[[1. 0. 1. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]
{0: [5, 11, 13, 19, 22, 32, 38, 43, 46, 50, 55, 59, 60, 63, 64, 71, 75, 79, 80, 81, 83, 85, 89, 90, 95, 97, 99, 106, 107, 112, 113, 120, 127, 128, 131, 133, 149, 150, 153, 156, 159, 160, 162, 169, 170, 173, 174, 176, 182, 183, 184, 185, 189, 192, 195, 201, 205, 207, 208, 209, 211, 213, 220, 221, 223, 226, 227, 228, 229, 234, 235, 240, 241, 247, 249, 252, 257, 264, 266], 1: [12, 49, 250, 256, 278, 291, 296, 298, 300, 302, 312, 315], 2: [317, 326, 327, 330, 347], 3: [49, 259, 287, 293, 302, 353, 356, 360], 4: [0, 16, 23, 39, 41, 61, 88, 99, 108, 152, 172, 208, 210, 221, 226, 266, 381, 384, 390, 421, 422, 427, 432, 434, 435], 5: [13, 18, 22, 31, 55, 58, 80, 86, 97, 99, 123, 130, 132, 133, 134, 135, 174, 179, 182, 186, 194, 196, 198, 207, 208, 210, 212, 220, 237, 268, 274, 303, 317, 356, 431, 462, 465, 466, 468, 474, 477, 478, 479, 480,

In [17]:
# user average
rank = {}
mu = np.average(R)
for i in tqdm(range(len(R))):
    for j in range(len(R[i])):
        if R[i, j] == 0:
            rank.setdefault(i, [])
            rank[i].append([j, np.sum(R[:, j]) / userNum - mu])

for u in tqdm(range(userNum)):
    rank[u] = sorted(rank[u], key=lambda u: u[1], reverse=True)

100%|██████████| 943/943 [00:05<00:00, 180.08it/s]
100%|██████████| 943/943 [00:00<00:00, 3567.52it/s]


In [18]:
def pre(rank, testRank, k):
    pre_u = 0
    for userID, testItemIDs in testRank.items():
        pre_u += len(set([x[0] for x in rank[userID][:k]]).intersection(testItemIDs)) / k
    print("Pre@" + str(k) + '=' + str(pre_u / len(testRank.items())))


pre(rank, testRank, 5)

Pre@5=0.23377192982456205


In [19]:
def rec(rank, testRank, k):
    rec_u = 0
    for userID, testItemIDs in testRank.items():
        rec_u += len(set([x[0] for x in rank[userID][:k]]).intersection(testItemIDs)) / len(testItemIDs)
    print("Rec@" + str(k) + '=' + str(rec_u / len(testRank.items())))


rec(rank, testRank, 5)

Rec@5=0.05712433087638161


In [20]:
def F1(rank, testRank, k):
    f1_u = 0
    for userID, testItemIDs in testRank.items():
        p1 = len(set([x[0] for x in rank[userID][:k]]).intersection(testItemIDs)) / k
        r1 = len(set([x[0] for x in rank[userID][:k]]).intersection(testItemIDs)) / len(testItemIDs)
        if p1 + r1 == 0:
            continue
        f1_u += 2 * (p1 * r1) / (p1 + r1)
    print("F1@" + str(k) + '=' + str(f1_u / len(testRank.items())))


F1(rank, testRank, 5)

F1@5=0.0774722624073191


In [21]:
def dcg(scores):
    res = 0
    for i, r in enumerate(scores):
        res += (2 ** (r) - 1) / math.log2(i + 2)
    return res


def NDCG(rank, testRank, k):
    res = 0
    for userID, testItemIDs in testRank.items():
        arr = [1 if item[0] in testItemIDs else 0 for item in rank[userID][:k]]
        arr2 = np.ones(len(set([x[0] for x in rank[userID][:k]]).intersection(testItemIDs)))
        res += dcg(arr) / dcg(arr2) if dcg(arr2) != 0 else 0
    print("NDCG@" + str(k) + '=' + str(res / len(testRank.items())))


NDCG(rank, testRank, 5)

NDCG@5=0.4562984362372017


In [22]:
def oneCall(rank, testRank, k):
    onecall_u = 0
    for userID, testItemIDs in testRank.items():
        p1 = len(set([x[0] for x in rank[userID][:k]]).intersection(testItemIDs))
        onecall_u += 1 if p1 >= 1 else 0
    print("1-call@" + str(k) + '=' + str(onecall_u / len(testRank.items())))


oneCall(rank, testRank, 5)

1-call@5=0.5877192982456141


In [23]:
def MRR(rank, testRank):
    mrr_u = 0
    for userID, testItemIDs in testRank.items():
        arr = [1 if item[0] in testItemIDs else 0 for item in rank[userID][:]]

        mrr_u += 1 / (arr.index(1) + 1) if 1 in arr else 0
    print("MRR=" + str(mrr_u / len(testRank.items())))


MRR(rank, testRank)

MRR=0.4656607532775578


In [24]:
def MAP(rank, testRank):
    map_u = 0
    for userID, testItemIDs in testRank.items():
        arr = [1 if item[0] in testItemIDs else 0 for item in rank[userID][:]]

        map_u += 1 / (arr.index(1) + 1) if 1 in arr else 0
    print("MAP=" + str(map_u / len(testRank.items())))


MRR(rank, testRank)

MRR=0.4656607532775578
