In [1]:
import recbole
recbole.__version__

'0.2.1'

In [2]:
from recbole.evaluator.metrics import *

In [3]:
import pandas as pd

df_pred = pd.read_csv('./pred.csv')
df_test = pd.read_csv('./test.csv')

df_pred_new = pd.merge(
    df_pred, 
    df_test.loc[:, ['user_id', 'item_id', 'relevance']], on=['user_id', 'item_id'], how='left'
)

df_pred_new = df_pred_new[df_pred_new.user_id.isin(df_test.user_id)]

df_pred_new.fillna(0, inplace=True)

df_pred_new.relevance.value_counts()

0.0    51516
4.5     1840
5.0     1304
Name: relevance, dtype: int64

In [4]:
df_pred_new.head()

Unnamed: 0,user_id,item_id,score,relevance
40,3,590,0.280305,0.0
41,3,1136,0.263707,0.0
42,3,920,0.231234,0.0
43,3,1246,0.188023,0.0
44,3,318,0.187111,0.0


In [5]:
recommendations = {}

for user, item_id in zip(df_pred_new.user_id, df_pred_new.relevance):
    if user in recommendations:
        recommendations[user] += [item_id]
    else:
        recommendations[user] = [item_id]

In [6]:
pos_len = []

for user in recommendations:
    pos_len.append(df_test[df_test.user_id == user].shape[0])

In [7]:
pos_len = np.array(pos_len)

In [8]:
pos_idx = np.array(list(recommendations.values()))

In [9]:
pos_idx

array([[0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 5. , ..., 0. , 5. , 0. ],
       ...,
       [4.5, 4.5, 4.5, ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [5. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [10]:
pos_len

array([  1,  15,  15, ..., 172,   9,  82])

Just averaging all user scores:
https://github.com/RUCAIBox/RecBole/blob/master/recbole/evaluator/evaluators.py#L140

In [11]:
np.mean(precision_(pos_idx, pos_len)[:,-1])

0.27076472740578117

In [12]:
np.mean(recall_(pos_idx, pos_len)[:,-1])

0.45203482337707523

In [13]:
np.mean(ndcg_(pos_idx, pos_len)[:,-1])

0.09326437896736814

In [14]:
np.mean(map_(pos_idx, pos_len)[:,-1])

0.8754284992014949

In [15]:
np.mean(mrr_(pos_idx, pos_len)[:,-1])

0.16001231669037094

In [16]:
np.mean(hit_(pos_idx, pos_len)[:,-1])

0.47457006952067327

In [26]:
auc_(
    np.array((df_pred_new['relevance'] > 0).astype(int).tolist()),
    np.array(df_pred_new['score'].tolist())
)

0.5305776379264664

Select only GT from top K by scores:

In [19]:
pos_len_ = []

for user in sorted(recommendations.keys()):
    pos_len_.append(df_pred_new[
        (df_pred_new.user_id == user) & (df_pred_new.relevance > 0)
    ].shape[0])

In [22]:
df_pred_new['rank'] = list(range(20)) * len(df_pred_new.user_id.unique())

In [23]:
pos_rank_sum = []
for user in sorted(recommendations.keys()):
    pos_rank_sum.append(sum(
        df_pred_new[
            (df_pred_new.user_id == user) & (df_pred_new.relevance > 0)
        ]['rank']))

In [24]:
gauc_(
    user_len_list=np.array([20] * len(recommendations)), 
    pos_len_list=np.array(pos_len_), 
    pos_rank_sum=np.array(pos_rank_sum)
)

No positive samples in some users, true positive value should be meaningless, these users have been removed from GAUC calculation


0.6688682890157615