In [1]:
# VERSION 2.0
# https://github.com/wubinzzu/NeuRec/blob/master/evaluator/backend/python/metric.py
"""
@author: Zhongchuan Sun
"""
import numpy as np
import sys


def hit(rank, ground_truth):
    # HR is equal to Recall when dataset is loo split.
    last_idx = sys.maxsize
    for idx, item in enumerate(rank):
        if item in ground_truth:
            last_idx = idx
            break
    result = np.zeros(len(rank), dtype=np.float32)
    result[last_idx:] = 1.0
    return result


def precision(rank, ground_truth):
    # Precision is meaningless when dataset is loo split.
    hits = [1 if item in ground_truth else 0 for item in rank]
    result = np.cumsum(hits, dtype=np.float32)/np.arange(1, len(rank)+1)
    return result


def recall(rank, ground_truth):
    # Recall is equal to HR when dataset is loo split.
    hits = [1 if item in ground_truth else 0 for item in rank]
    result = np.cumsum(hits, dtype=np.float32) / len(ground_truth)
    return result


def map(rank, ground_truth):
    pre = precision(rank, ground_truth)
    pre = [pre[idx] if item in ground_truth else 0 for idx, item in enumerate(rank)]
    sum_pre = np.cumsum(pre, dtype=np.float32)
    # relevant_num = np.cumsum([1 if item in ground_truth else 0 for item in rank])
    relevant_num = np.cumsum([min(idx+1, len(ground_truth)) for idx, _ in enumerate(rank)])
    result = [p/r_num if r_num!=0 else 0 for p, r_num in zip(sum_pre, relevant_num)]
    return result


def ndcg(rank, ground_truth):
    len_rank = len(rank)
    idcg_len = min(len(ground_truth), len_rank)
    idcg = np.cumsum(1.0 / np.log2(np.arange(2, len_rank + 2)))
    idcg[idcg_len:] = idcg[idcg_len - 1]

    dcg = np.cumsum([1.0/np.log2(idx+2) if item in ground_truth else 0.0 for idx, item in enumerate(rank)])
    result = dcg/idcg
    return result


def mrr(rank, ground_truth):
    # MRR is equal to MAP when dataset is loo split.
    last_idx = sys.maxsize
    for idx, item in enumerate(rank):
        if item in ground_truth:
            last_idx = idx
            break
    result = np.zeros(len(rank), dtype=np.float32)
    result[last_idx:] = 1.0/(last_idx+1)
    return result


metric_dict = {"Precision": precision,
               "Recall": recall,
               "MAP": map,
               "NDCG": ndcg,
               "MRR": mrr}

In [2]:
import pandas as pd
import numpy as np

df_pred = pd.read_csv('./pred.csv')
df_test = pd.read_csv('./test.csv')

df_pred_new = pd.merge(
    df_pred, 
    df_test.loc[:, ['user_id', 'item_id', 'relevance']], on=['user_id', 'item_id'], how='left'
)

df_pred_new = df_pred_new[df_pred_new.user_id.isin(df_test.user_id)]

df_pred_new.fillna(0, inplace=True)

df_pred_new.loc[df_pred_new.relevance != 0, 'relevance'] = 1

df_pred_new.relevance.value_counts()

0.0    51516
1.0     3144
Name: relevance, dtype: int64

In [3]:
recommendations = {}

for user, item_id, score in zip(df_pred_new.user_id, df_pred_new.item_id, df_pred_new.score):

    if user in recommendations:
        recommendations[user] += [item_id]
    else:
        recommendations[user] = [item_id, score] 

In [4]:
gt_recommendations = {}

for user, item_id, score in zip(df_test.user_id, df_test.item_id, df_test.relevance):
    
#     if score > 0:

    if user in gt_recommendations:
        gt_recommendations[user][item_id] = score
    else:
        gt_recommendations[user] = {item_id: score}

In [5]:
np.mean(
    [
        precision(
            rank=df_pred_new[df_pred_new.user_id == user].item_id.values, 
            ground_truth=df_test[df_test.user_id == user].item_id.values
        )[19]
        for user in df_pred_new.user_id.unique()
    ]
)

0.057519209659714604

In [6]:
np.mean(
    [
        recall(
            rank=df_pred_new[df_pred_new.user_id == user].item_id.values, 
            ground_truth=df_test[df_test.user_id == user].item_id.values
        )[19]
        for user in df_pred_new.user_id.unique()
    ]
)

0.096321315

In [7]:
np.mean(
    [
        hit(
            rank=df_pred_new[df_pred_new.user_id == user].item_id.values, 
            ground_truth=df_test[df_test.user_id == user].item_id.values
        )[19]
        for user in df_pred_new.user_id.unique()
    ]
)

0.47457007

In [8]:
np.mean(
    [
        map(
            rank=df_pred_new[df_pred_new.user_id == user].item_id.values, 
            ground_truth=df_test[df_test.user_id == user].item_id.values
        )[19]
        for user in df_pred_new.user_id.unique()
    ]
)

0.0028783712115732244

In [9]:
np.mean(
    [
        mrr(
            rank=df_pred_new[df_pred_new.user_id == user].item_id.values, 
            ground_truth=df_test[df_test.user_id == user].item_id.values
        )[19]
        for user in df_pred_new.user_id.unique()
    ]
)

0.18632413

In [10]:
np.mean(
    [
        ndcg(
            rank=df_pred_new[df_pred_new.user_id == user].item_id.values, 
            ground_truth=df_test[df_test.user_id == user].item_id.values
        )[19]
        for user in df_pred_new.user_id.unique()
    ]
)

0.09326437896736814