In [3]:
import pandas as pd

### Copy from repository (commit 14e765b on 26 Aug 2020)

https://github.com/MaurizioFD/RecSys2019_DeepLearning_Evaluation/blob/c173c9688c12e8b2866ac5f707e555faf811996f/Base/Evaluation/Evaluator.py

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""

@author: Maurizio Ferrari Dacrema, Massimo Quadrana
"""


import numpy as np
import unittest
import scipy.sparse as sps


class _Metrics_Object(object):
    """
    Common base for metrics that need an object (and therefore a state) to be computed.

    Subclasses are expected to override add_recommendations, get_metric_value and
    merge_with_other: the versions defined here only raise NotImplementedError.
    """

    def __init__(self):
        # The base class itself keeps no state
        pass

    def __str__(self):
        # Current metric value rendered with four decimal places
        metric_value = self.get_metric_value()
        return "{:.4f}".format(metric_value)

    def add_recommendations(self, recommended_items_ids):
        """Update the metric state with one recommendation list (to be overridden)."""
        raise NotImplementedError

    def get_metric_value(self):
        """Return the current value of the metric (to be overridden)."""
        raise NotImplementedError

    def merge_with_other(self, other_metric_object):
        """Fold the state of another metric object into this one (to be overridden)."""
        raise NotImplementedError


####################################################################################################################
###############                 ACCURACY METRICS
####################################################################################################################


class MAP(_Metrics_Object):
    """
    Mean Average Precision, defined as the mean of the AveragePrecision over all users

    State:
        cumulative_AP: running sum of the per-user average precision
        n_users: number of users added so far
    """

    def __init__(self):
        super(MAP, self).__init__()
        self.cumulative_AP = 0.0
        self.n_users = 0

    def add_recommendations(self, is_relevant, pos_items):
        """
        Add the average precision of one user.

        :param is_relevant: per-position relevance flags of the user's recommendation list
        :param pos_items: the user's relevant items, forwarded to average_precision
        """
        self.cumulative_AP += average_precision(is_relevant, pos_items)
        self.n_users += 1

    def get_metric_value(self):
        """
        Return cumulative_AP divided by n_users.
        The value is undefined (division by zero) until at least one user has been added.
        """
        return self.cumulative_AP/self.n_users

    def merge_with_other(self, other_metric_object):
        """
        Add the accumulated state of another MAP object to this one.

        :param other_metric_object: MAP instance whose cumulative_AP and n_users are added
        """
        # isinstance, not "is": the argument is an instance and can never be the MAP class itself
        assert isinstance(other_metric_object, MAP), "MAP: attempting to merge with a metric object of different type"

        self.cumulative_AP += other_metric_object.cumulative_AP
        self.n_users += other_metric_object.n_users



def average_precision(is_relevant, pos_items):
    """
    Average precision of a single ranked recommendation list.

    :param is_relevant: array with one flag per recommended position, truthy where the item is a hit
    :param pos_items: array of the relevant items; only its length is used
    :return: sum of precision@k over the hit positions divided by
             min(number of relevant items, list length); 0.0 for an empty list
    """
    if len(is_relevant) == 0:
        return 0.0

    list_length = is_relevant.shape[0]

    # precision@k at every position k, kept only where position k is a hit
    hits_so_far = np.cumsum(is_relevant, dtype=np.float32)
    positions = 1 + np.arange(list_length)
    precision_at_hits = is_relevant * hits_so_far / positions

    normalizer = np.min([pos_items.shape[0], list_length])
    a_p = np.sum(precision_at_hits) / normalizer

    assert 0 <= a_p <= 1, a_p
    return a_p


class MRR(_Metrics_Object):
    """
    Mean Reciprocal Rank, defined as the mean of the Reciprocal Rank over all users

    State:
        cumulative_RR: running sum of the per-user reciprocal rank
        n_users: number of users added so far
    """

    def __init__(self):
        super(MRR, self).__init__()
        self.cumulative_RR = 0.0
        self.n_users = 0

    def add_recommendations(self, is_relevant):
        """
        Add the reciprocal rank of one user.

        :param is_relevant: per-position relevance flags of the user's recommendation list
        """
        self.cumulative_RR += rr(is_relevant)
        self.n_users += 1

    def get_metric_value(self):
        """
        Return cumulative_RR divided by n_users.
        The value is undefined (division by zero) until at least one user has been added.
        """
        return self.cumulative_RR/self.n_users

    def merge_with_other(self, other_metric_object):
        """
        Add the accumulated state of another MRR object to this one.

        :param other_metric_object: MRR instance whose cumulative_RR and n_users are added
        """
        # The check must be an isinstance test against MRR: the attributes read below
        # (cumulative_RR) only exist on MRR objects
        assert isinstance(other_metric_object, MRR), "MRR: attempting to merge with a metric object of different type"

        self.cumulative_RR += other_metric_object.cumulative_RR
        self.n_users += other_metric_object.n_users


def roc_auc(is_relevant):
    """
    AUC of one ranked list: fraction of (hit, miss) pairs in which the hit is ranked before the miss.

    :param is_relevant: boolean array with one flag per recommended position
    :return: 1.0 when the list contains no miss, 0.0 when it contains misses but no hit,
             otherwise the fraction of correctly ordered pairs
    """
    positions = np.arange(len(is_relevant))
    hit_positions = positions[is_relevant]
    miss_positions = positions[~is_relevant]

    # Nothing can be ranked wrongly when there is no miss
    if len(miss_positions) == 0:
        return 1.0

    auc_score = 0.0

    if len(hit_positions) > 0:
        # For every hit, count the misses placed after it
        for hit_position in hit_positions:
            auc_score += np.sum(hit_position < miss_positions, dtype=np.float32)

        n_pairs = hit_positions.shape[0] * miss_positions.shape[0]
        auc_score /= n_pairs

    assert 0 <= auc_score <= 1, auc_score
    return auc_score



def arhr(is_relevant):
    """
    Average reciprocal hit-rank (ARHR): sum of 1/position over all hit positions.
    As opposed to MRR, every relevant item contributes and not just the first one.

    See pag 17 of
    http://glaros.dtc.umn.edu/gkhome/fetch/papers/itemrsTOIS04.pdf
    https://emunix.emich.edu/~sverdlik/COSC562/ItemBasedTopTen.pdf

    :param is_relevant: array with one flag per recommended position
    :return: the ARHR score of the list
    """
    # 1-based positions as floats: 1.0, 2.0, ..., len(is_relevant)
    positions = np.arange(1, len(is_relevant)+1, 1.0, dtype=np.float64)
    reciprocal_positions = 1/positions

    # The dot product keeps only the reciprocal ranks of the hits
    arhr_score = is_relevant.dot(reciprocal_positions)

    assert not np.isnan(arhr_score), "ARHR is NaN"
    return arhr_score


def precision(is_relevant):
    """
    Fraction of the recommended positions that are hits.

    :param is_relevant: array with one flag per recommended position
    :return: number of hits divided by the list length, 0.0 for an empty list
    """
    list_length = len(is_relevant)

    precision_score = 0.0 if list_length == 0 else np.sum(is_relevant, dtype=np.float32) / list_length

    assert 0 <= precision_score <= 1, precision_score
    return precision_score


def precision_recall_min_denominator(is_relevant, n_test_items):
    """
    Number of hits divided by the smaller of the number of test items and the list length.

    :param is_relevant: array with one flag per recommended position
    :param n_test_items: number of test items of the user
    :return: hits / min(n_test_items, len(is_relevant)), 0.0 for an empty list
    """
    list_length = len(is_relevant)

    if list_length == 0:
        precision_score = 0.0
    else:
        n_hits = np.sum(is_relevant, dtype=np.float32)
        precision_score = n_hits / min(n_test_items, list_length)

    assert 0 <= precision_score <= 1, precision_score
    return precision_score



def recall(is_relevant, pos_items):
    """
    Number of hits in the list divided by the number of relevant items.

    :param is_relevant: array with one flag per recommended position
    :param pos_items: array of the relevant items; only its length is used
    :return: hits / pos_items.shape[0]
    """
    n_hits = np.sum(is_relevant, dtype=np.float32)
    n_relevant_items = pos_items.shape[0]
    recall_score = n_hits / n_relevant_items

    assert 0 <= recall_score <= 1, recall_score
    return recall_score


def rr(is_relevant):
    """
    Reciprocal rank of the FIRST relevant item in the ranked list.

    :param is_relevant: boolean array with one flag per recommended position
    :return: 1 / (1-based position of the first hit), or 0.0 when the list has no hit
    """
    # 1-based positions of the hits, in ranking order
    hit_positions = np.arange(1, len(is_relevant) + 1)[is_relevant]

    if len(hit_positions) == 0:
        return 0.0

    return 1. / hit_positions[0]




def ndcg(ranked_list, pos_items, relevance=None, at=None):
    """
    Normalized DCG of a ranked list with respect to the relevant items.

    :param ranked_list: recommended item ids, best first
    :param pos_items: array of the relevant item ids
    :param relevance: relevance of each entry of pos_items; defaults to np.ones_like(pos_items)
    :param at: number of leading recommendations taken into account (None = whole list)
    :return: DCG of the first "at" recommendations divided by the DCG of all the relevances
             sorted in decreasing order; 0.0 when the recommendations have zero DCG
    """
    if relevance is None:
        relevance = np.ones_like(pos_items)
    assert len(relevance) == pos_items.shape[0]

    # item id -> relevance lookup for the relevant items
    relevance_by_item = dict(zip(pos_items, relevance))

    # Relevance found at each of the first "at" positions, 0.0 for items missing from the lookup
    top_items = ranked_list[:at]
    gains_at_rank = np.asarray([relevance_by_item.get(item, 0.0) for item in top_items], dtype=np.float32)

    # Ideal DCG: every relevance, sorted in decreasing order (note: not truncated to "at")
    best_dcg = dcg(np.sort(relevance)[::-1])

    # DCG actually achieved by the recommended items
    achieved_dcg = dcg(gains_at_rank)

    if achieved_dcg == 0.0:
        return 0.0

    return achieved_dcg / best_dcg


def dcg(scores):
    """
    Discounted cumulative gain of an array of relevance scores given in ranking order.

    :param scores: array of relevance scores, position 0 being the top of the ranking
    :return: float32 sum over positions i (0-based) of (2**score_i - 1) / ln(i + 2)
    """
    gains = np.power(2, scores) - 1
    # Natural-log discount: ln(2), ln(3), ... for positions 0, 1, ...
    discounts = np.log(np.arange(scores.shape[0], dtype=np.float32) + 2)
    return np.sum(np.divide(gains, discounts), dtype=np.float32)


### Data preparation

In [4]:
# Predictions and test interactions
# (presumably the predictions are already ordered by rank within each user - TODO confirm)
df_pred = pd.read_csv('./preds_full.csv')
df_test = pd.read_csv('./test.csv')

# Left-join the test relevance onto every predicted (user_id, item_id) pair:
# pairs that are not in the test set end up with a missing relevance
df_pred_new = df_pred.merge(
    df_test[['user_id', 'item_id', 'relevance']],
    on=['user_id', 'item_id'],
    how='left',
)

# Keep only the users that appear in the test set, then replace every missing value with 0
df_pred_new = df_pred_new[df_pred_new.user_id.isin(df_test.user_id)].fillna(0)

df_pred_new.relevance.value_counts()

0.0    21307932
4.5       25887
5.0       12924
Name: relevance, dtype: int64

In [5]:
# Number of leading recommendations per user that the metrics are computed on
cutoff = 20

The per-user evaluation loop below follows the code of the repository's Evaluator:
https://github.com/MaurizioFD/RecSys2019_DeepLearning_Evaluation/blob/master/Base/Evaluation/Evaluator.py#L307

In [7]:
# Per-user metric values (averaged afterwards) and the stateful MRR / MAP accumulators
precision_list = []
roc_auc_list = []
recall_list = []
hitrate_list = []
ndcg_list = []
mrr_obj = MRR()
map_obj = MAP()

# Split both frames by user once: selecting each user with a boolean mask would
# re-scan the whole prediction and test frames on every iteration
test_by_user = df_test.groupby('user_id')

# groupby(sort=True) yields the users in ascending order and keeps the rows of each user
# in their original order, so the first `cutoff` rows are the same as with a per-user mask
for user, user_predictions in df_pred_new.groupby('user_id', sort=True):
    user_test = test_by_user.get_group(user)

    is_relevant_current_cutoff = user_predictions.relevance.astype(bool).values[0:cutoff]
    recommended_items_current_cutoff = user_predictions.item_id.astype(int).values[0:cutoff]
    relevant_items = user_test.item_id.values
    relevant_items_score = user_test.relevance.values

    precision_list.append(precision(is_relevant_current_cutoff))
    roc_auc_list.append(roc_auc(is_relevant_current_cutoff))
    recall_list.append(recall(is_relevant_current_cutoff, relevant_items))
    # NOTE(review): this stores the number of hits, not a 0/1 "at least one hit" flag -
    # confirm that this is the intended definition of hit rate
    hitrate_list.append(is_relevant_current_cutoff.sum())

    ndcg_list.append(
        ndcg(
            recommended_items_current_cutoff, relevant_items,
            relevance=relevant_items_score, at=cutoff
        )
    )

    mrr_obj.add_recommendations(is_relevant_current_cutoff)
    map_obj.add_recommendations(is_relevant_current_cutoff, relevant_items)


Aggregate the per-user values: average the lists, or call `get_metric_value()` on the stateful metric objects (MRR, MAP):
https://github.com/MaurizioFD/RecSys2019_DeepLearning_Evaluation/blob/master/Base/Evaluation/Evaluator.py#L255

### Evaluation metrics

In [8]:
# Empty frame; the following cells add one row per metric with .at
results = pd.DataFrame()

In [9]:
# Mean of the per-user precision; the row label is built from `cutoff` so it always matches the list length used
results.at["Precision@{}".format(cutoff), "value"] = np.mean(precision_list)

In [10]:
# Mean of the per-user recall; the row label is built from `cutoff` so it always matches the list length used
results.at["Recall@{}".format(cutoff), "value"] = np.mean(recall_list)

In [11]:
# Mean of the per-user AUC; the row label is built from `cutoff` so it always matches the list length used
results.at["AUC@{}".format(cutoff), "value"] = np.mean(roc_auc_list)

In [12]:
# Mean of the per-user hit counts; the row label is built from `cutoff` so it always matches the list length used
results.at["HitRate@{}".format(cutoff), "value"] = np.mean(hitrate_list)

In [13]:
# Value accumulated by the MRR object; the row label is built from `cutoff` so it always matches the list length used
results.at["MRR@{}".format(cutoff), "value"] = mrr_obj.get_metric_value()

In [14]:
# Value accumulated by the MAP object; the row label is built from `cutoff` so it always matches the list length used
results.at["MAP@{}".format(cutoff), "value"] = map_obj.get_metric_value()

In [15]:
# Mean of the per-user NDCG; the row label is built from `cutoff` so it always matches the list length used
results.at["NDCG@{}".format(cutoff), "value"] = np.mean(ndcg_list)