This notebook is intended to show the implementation of the metrics listed in Task 0 of the RecSys advanced course ([course page](https://kb.epam.com/display/EPMCBDCCDS/RecSys+course)).

In [None]:
import numpy as np
import pandas as pd

# Data loading

Data to get reference scores is generated with the following in mind:

* Each row represents a single user
* Column `prediction` represents predictions (list of items recommended)
* Column `ground_truth` represents ground truth data (items that user really bought)

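Download the data from [this link](https://kb.epam.com/download/attachments/789625167/recsys_task0_dataset.parquet?version=1&modificationDate=1625582742133&api=v2) and put the file `recsys_task0_dataset.parquet` into the notebook's working directory (it is read by relative path below).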

In [None]:
# The parquet file is looked up relative to the current working directory.
df = pd.read_parquet(path='recsys_task0_dataset.parquet')
# Preview the first five users (5 is the default row count of head()).
df.head()

In [None]:
# One entry per user (row of df); each entry is itself an array of items.
y_true, y_pred = (
    df['ground_truth'].values,  # ground truths per user
    df['prediction'].values,    # predictions per user
)

# HitRate@k
$y_{true}, y_{pred}$ – 2D arrays $(users, predictions)$ (or list of lists)

In [None]:
def hit_rate_at_k(y_true, y_pred, k=5):
    """Share of users with at least one ground-truth item in their top-k predictions.

    Parameters
    ----------
    y_true : sequence of sequences
        Ground-truth items, one inner sequence per user. Both NumPy arrays of
        arrays and plain lists of lists are accepted.
    y_pred : sequence of sequences
        Predicted items, one inner sequence per user; only the first ``k``
        entries of each inner sequence are looked at. Must have the same
        number of users as ``y_true``.
    k : int, default 5
        Cut-off applied to every user's prediction list.

    Returns
    -------
    numpy.float64
        Mean over users of the 0/1 "hit" indicator.

    Raises
    ------
    AssertionError
        If ``y_true`` and ``y_pred`` contain a different number of users.
    """
    # Same guard as in map_at_k / NDCG_at_k: the two inputs are paired per user.
    assert len(y_true) == len(y_pred)
    # Iterate with zip/len rather than ``.shape`` so lists of lists work too.
    return np.mean([
        int(np.isin(user_pred[:k], user_true).any())
        for user_true, user_pred in zip(y_true, y_pred)
    ])

In [None]:
# Hit rate over all users with a cut-off of 3 predictions
print('HitRate@3: %.3f' % (hit_rate_at_k(y_true, y_pred, k=3),))

In [None]:
# Hit rate over all users with a cut-off of 5 predictions
print('HitRate@5: %.3f' % (hit_rate_at_k(y_true, y_pred, k=5),))

# MAP@k
For MAP@k: $y_{true}, y_{pred}$ – 2D arrays $(users,predictions)$ (or list of lists)

For AP@k, Precision@k:  $y_{true}, y_{pred}$ – 1D arrays

In [None]:
def precision_at_k(y_true, y_pred, k=5):
    """Precision@k for a single user.

    Counts the distinct items that occur both in the first ``k`` predictions
    and in the ground truth, then divides that count by ``k``.

    Parameters
    ----------
    y_true : 1D sequence
        Ground-truth items of the user.
    y_pred : 1D sequence
        Predicted items of the user; only the first ``k`` are used.
    k : int, default 5
        Cut-off and denominator.

    Returns
    -------
    float
    """
    top_k_items = set(y_pred[:k])
    relevant_items = set(y_true)
    # Sets are used, so a repeated prediction is counted once.
    n_relevant_in_top_k = len(top_k_items.intersection(relevant_items))
    return n_relevant_in_top_k / k


def average_precision_at_k(y_true, y_pred, k=5):
    """AP@k for a single user.

    Walks through the first ``k`` predictions; at every position whose item
    is in ``y_true`` it adds (hits so far) / (position, 1-based) to a running
    sum. The sum is then divided by the number of hits found.

    Parameters
    ----------
    y_true : 1D sequence
        Ground-truth items of the user.
    y_pred : 1D sequence
        Predicted items of the user; only the first ``k`` are used.
    k : int, default 5
        Cut-off on the prediction list.

    Returns
    -------
    float or int
        The averaged precision, or ``0`` when none of the first ``k``
        predictions is in ``y_true``.
    """
    n_hits = 0
    precision_sum = 0
    for rank, item in enumerate(y_pred[:k], start=1):
        if item not in y_true:
            continue
        n_hits += 1
        # Precision at this rank = hits so far / number of items seen so far.
        precision_sum += n_hits / rank
    return precision_sum / n_hits if n_hits else 0


def map_at_k(y_true, y_pred, k=5):
    """MAP@k: mean of ``average_precision_at_k`` over all users.

    Parameters
    ----------
    y_true : sequence of sequences
        Ground-truth items, one inner sequence per user.
    y_pred : sequence of sequences
        Predicted items, one inner sequence per user.
    k : int, default 5
        Cut-off forwarded to ``average_precision_at_k``.

    Returns
    -------
    numpy.float64

    Raises
    ------
    AssertionError
        If ``y_true`` and ``y_pred`` have different lengths.
    """
    assert len(y_true) == len(y_pred)
    per_user_ap = []
    for user_idx in range(len(y_true)):
        per_user_ap.append(
            average_precision_at_k(y_true[user_idx], y_pred[user_idx], k=k)
        )
    return np.mean(per_user_ap)

In [None]:
# Mean average precision over all users with a cut-off of 3 predictions
print('MAP@3: %.3f' % (map_at_k(y_true, y_pred, k=3),))

In [None]:
# Mean average precision over all users with a cut-off of 5 predictions
print('MAP@5: %.3f' % (map_at_k(y_true, y_pred, k=5),))

# NDCG@k
For NDCG@k: $y_{true}, y_{pred}$ – 2D arrays $(users,predictions)$ (or list of lists)

For the per-user `ndcg_at_k`: $y_{true}, y_{pred}$ – 1D arrays

In [None]:
def ndcg_at_k(y_true, y_pred, k=5):
    """NDCG@k for a single user with binary relevance.

    Every position ``i`` (0-based) carries the discount ``1 / log2(i + 2)``.
    The DCG adds the discounts of those of the first ``k`` predictions that
    are in ``y_true``. The normaliser adds the discounts of all ``k``
    positions, i.e. it does not depend on how many items ``y_true`` holds.

    Parameters
    ----------
    y_true : 1D sequence
        Ground-truth items of the user.
    y_pred : 1D sequence
        Predicted items of the user; only the first ``k`` are used.
    k : int, default 5
        Cut-off on the prediction list and number of positions in the
        normaliser.

    Returns
    -------
    numpy.float64
    """
    def discount(position):
        return 1 / np.log2(position + 2)

    # Normaliser: the gain collected if each of the k positions were a hit.
    max_dcg = 0
    for position in range(k):
        max_dcg += discount(position)

    achieved_dcg = 0
    for position, item in enumerate(y_pred[:k]):
        if item in y_true:
            achieved_dcg += discount(position)

    return achieved_dcg / max_dcg


def NDCG_at_k(y_true, y_pred, k=5):
    """Mean of the per-user ``ndcg_at_k`` over all users.

    Parameters
    ----------
    y_true : sequence of sequences
        Ground-truth items, one inner sequence per user.
    y_pred : sequence of sequences
        Predicted items, one inner sequence per user.
    k : int, default 5
        Cut-off forwarded to ``ndcg_at_k``.

    Returns
    -------
    numpy.float64

    Raises
    ------
    AssertionError
        If ``y_true`` and ``y_pred`` have different lengths.
    """
    n_users = len(y_true)
    assert n_users == len(y_pred)
    per_user_ndcg = []
    for user_idx in range(n_users):
        user_score = ndcg_at_k(y_true[user_idx], y_pred[user_idx], k=k)
        per_user_ndcg.append(user_score)
    return np.mean(per_user_ndcg)

In [None]:
# Mean NDCG over all users with a cut-off of 3 predictions
print('NDCG@3: %.3f' % (NDCG_at_k(y_true, y_pred, k=3),))

In [None]:
# Mean NDCG over all users with a cut-off of 5 predictions
print('NDCG@5: %.3f' % (NDCG_at_k(y_true, y_pred, k=5),))