In [1]:
import pandas as pd
import numpy as np


In [2]:
path_to_firstdf = '/Users/Artem_Boltaev/Documents/EPAM Projects/6. RecSys_course/source_code/recsys_course_epam/data/raw/recsys_task0_dataset.parquet'

df = pd.read_parquet(path_to_firstdf, engine='pyarrow')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1188 entries, 1 to 6040
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ground_truth  1188 non-null   object
 1   prediction    1188 non-null   object
dtypes: object(2)
memory usage: 27.8+ KB


In [4]:
df.head()

Unnamed: 0_level_0,ground_truth,prediction
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[1193, 3408, 2355, 1287, 2804, 594, 919, 595, ...","[2858, 260, 1196, 1198, 593, 2028, 318, 527, 2..."
2,"[1357, 3068, 1537, 2194, 648, 2268, 3468, 1210...","[2858, 260, 1196, 1198, 593, 2028, 318, 527, 2..."
3,"[3421, 1394, 104, 2735, 1210, 1079, 1615, 1291...","[2858, 260, 1196, 1198, 593, 2028, 318, 527, 2..."
4,"[3468, 2951, 1214, 1036, 260, 2028, 480, 1198,...","[2858, 260, 1196, 1198, 593, 2028, 318, 527, 2..."
5,"[2987, 2333, 1175, 2337, 1535, 1392, 866, 2770...","[2858, 260, 1196, 1198, 593, 2028, 318, 527, 2..."


In [5]:
y_true = df['ground_truth'].apply(pd.Series).to_numpy()
y_pred = df['prediction'].apply(pd.Series).to_numpy()

# 1.HitRate@k

In [6]:

def hit_rate_at_k_score(y_true, y_pred, k: int):
    """
    Calculate Hit Rate at k.

    Parameters
    ----------
    y_true : array-like of shape (n_samples, n_items)
        Ground truth (correct) target values.

    y_pred : array-like of shape (n_samples, n_items)
        Estimated target values.

    k : int
        Number of recommendations to take into account.

    Returns
    -------
    score : float
        A non-negative floating point value (the best value is 1.0).
    """
    
    hits = [np.in1d(pred[:k], fact).any() for fact, pred in zip(y_true, y_pred)]
    return np.mean(hits)

Expected scores:

HitRate@3: 0.418

HitRate@5: 0.492

In [7]:
hit_rate_at_k_score(y_true, y_pred, 3)

0.4175084175084175

In [8]:
hit_rate_at_k_score(y_true, y_pred, 5)

0.49158249158249157

# 2.MAP@k

In [9]:
def mean_average_precision_at_k_score(y_true, y_pred, k):
    """
    Calculate mean average precision at k (MAP@k).

    Parameters
    ----------
    y_true : array-like of shape (n_samples, n_items)
        Ground truth (correct) target values.

    y_pred : array-like of shape (n_samples, n_items)
        Estimated target values.

    k : int
        Maximum number of recommendations to take into account.

    Returns
    -------
    score : float
        A non-negative floating point value (the best value is 1.0).
    """

    def _average_precision_at_k_score(fact, pred):
        hits = np.in1d(pred[:k], fact)
        precision_at_k_list = [hits[:i].sum() / i for i in range(1, len(hits) + 1) if hits[i - 1]]
        if precision_at_k_list:
            return np.mean(precision_at_k_list)
        else:
            return 0.0
    
    point_scores_list = [_average_precision_at_k_score(fact, pred) for fact, pred in zip(y_true, y_pred)]
    return np.average(point_scores_list)

EXPECTED VALUES

MAP@3: 0.325

MAP@5: 0.333

In [10]:
mean_average_precision_at_k_score(y_true, y_pred, 3)

0.3247053872053872

In [11]:
mean_average_precision_at_k_score(y_true, y_pred, 5)

0.3326155069210625

# 3.NDCG@k

In [58]:
#NDCG@k

def dcg_score_at_k(y_true, y_pred, k, gains="linear"):
    """Discounted cumulative gain (DCG) at rank k

    Warning - Custom function
    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_pred : array-like, shape = [n_samples]
        Predicted scores.
    k : int
        Maximum number of recommendations to take into account.
    gains : str
        Whether gains should be "exponential" (default) or "linear".
    Returns
    -------
    DCG @k : float
    """
    y_true = y_true[~np.isnan(y_true)]
    y_pred = y_pred[~np.isnan(y_pred)]

    y_true = y_pred[:k]

    if gains == "exponential":
        gains = 2 ** y_true - 1
    elif gains == "linear":
        gains = y_true
    else:
        raise ValueError("Invalid gains option.")

    # highest rank is 1 so +2 instead of +1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gains / discounts)


def ndcg_score_at_k(y_true, y_pred, k, gains="linear"):
    """Normalized discounted cumulative gain (NDCG) at rank k
    Parameters
    ----------
    y_true : array-like of shape (n_samples, n_items)
        Ground truth (correct) target values.

    y_pred : array-like of shape (n_samples, n_items)
        Estimated target values.
        
    k : int
        Maximum number of recommendations to take into account.
    gains : str
        Whether gains should be "exponential" (default) or "linear".
    Returns
    -------
    NDCG @k : float
    """

    best_scores_list = []
    for i in range(len(y_true)-1):
        d = dcg_score_at_k(y_true[i], y_true[i], k, gains)
        best_scores_list.append(d)

    actual_scores_list = []
    for i in range(len(y_true)-1):
        d = dcg_score_at_k(y_true[i], y_pred[i], k, gains)
        actual_scores_list.append(d)

    res = [actual / best for actual, best in zip(actual_scores_list, best_scores_list)]

    return sum(res)/len(res)


EXPECTED Values

NDCG@3: 0.238

NDCG@5: 0.223

In [59]:
ndcg_score_at_k(y_true, y_pred, 3, gains="linear")


0.8192328866035623

In [60]:
ndcg_score_at_k(y_true, y_pred, 5, gains="linear")

0.7930310056305412

In [66]:
#normalizing scores to 1

y_true_norm = []

for i in range(len(y_true)):
    y_true_norm.append([x/x for x in y_true[i]])


y_pred_norm = []

for i in range(len(y_pred)):
    y_pred_norm_row = []
    for el in y_pred[i]:
        if el in y_true[i]:
            score = 1
        else:
            score = 0
        y_pred_norm_row.append(score)

    y_pred_norm.append(y_pred_norm_row)

y_pred_norm = np.array(y_pred_norm)
y_true_norm = np.array(y_true_norm)


y_true_norm = y_true_norm.astype(int)
y_pred_norm = y_pred_norm.astype(int)

In [67]:
ndcg_score_at_k(y_true_norm, y_pred_norm, 3, gains="linear")

0.23692442876000566

In [70]:
ndcg_score_at_k(y_true_norm, y_pred_norm, 5, gains="linear")

0.2206615472274326