In [48]:
import pandas as pd
from recommenders.evaluation.python_evaluation import (
    rmse,
    mae,
    rsquared,
    exp_var,
    map_at_k,
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
    get_top_k_items,
    auc,
    logloss
)

In [49]:
df_true = pd.DataFrame(
        {
            "UserId": [1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
            "MovieId": [1, 2, 3, 1, 4, 5, 6, 7, 2, 5, 6, 8, 9, 10, 11, 12, 13, 14],
            "Rating": [5, 4, 3, 5, 5, 3, 3, 1, 5, 5, 5, 4, 4, 3, 3, 3, 2, 1],
        }
    )
df_pred = pd.DataFrame(
    {
        "UserId": [1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
        "MovieId": [3, 10, 12, 10, 3, 5, 11, 13, 4, 10, 7, 13, 1, 3, 5, 2, 11, 14],
        "Rating": [14, 13, 12, 14, 13, 12, 11, 10, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5]
    }
)

In [26]:
df_true

Unnamed: 0,UserId,MovieId,Rating
0,1,1,5
1,1,2,4
2,1,3,3
3,2,1,5
4,2,4,5
5,2,5,3
6,2,6,3
7,2,7,1
8,3,2,5
9,3,5,5


In [27]:
df_pred

Unnamed: 0,UserId,MovieId,Rating
0,1,3,14
1,1,10,13
2,1,12,12
3,2,10,14
4,2,3,13
5,2,5,12
6,2,11,11
7,2,13,10
8,3,4,14
9,3,10,13


Use Case: Use Precision@K when you want to evaluate the accuracy of the top K recommendations. It is useful in scenarios where the user is likely to look at only the top K results, such as search engines.

In [28]:
eval_precision = precision_at_k(
    df_true, df_pred, col_user="UserId",col_item="MovieId", col_prediction="Rating", k=3
)
eval_precision

0.3333333333333333

Use Case: Use Recall@K when you want to evaluate how well the system retrieves all relevant items within the top K results. It is useful in scenarios where missing a relevant item is costly, such as in medical diagnosis or legal document retrieval.

In [29]:
eval_recall = recall_at_k(
    df_true, df_pred, col_user="UserId",col_item="MovieId", col_prediction="Rating", k=3
)
eval_recall

0.2111111111111111

In [None]:
# Precision@3, Recall@3

# User 1:
# Actual relevant items: [1, 2, 3]
# Predicted top 3 items: [3, 10, 12]

# Precision@3: Among the top 3 predicted items, the relevant item is 3.
# So, Precision@3 = 1/3 = 0.33

# Recall@3: There are 3 actual relevant items (1, 2, 3), and only one of them (3) appears in the top 3 predictions.
# So, Recall@3 = 1/3 = 0.33

# User 2:
# Actual relevant items: [1, 4, 5, 6, 7]
# Predicted top 3 items: [10, 3, 5]

# Precision@3: Among the top 3 predicted items, the relevant item is 5.
# So, Precision@3 = 1/3 = 0.33

# Recall@3: There are 5 actual relevant items, and only one of them (5) appears in the top 3 predictions.
# So, Recall@3 = 1/5 = 0.20

# User 3:
# Actual relevant items: [2, 5, 6, 8, 9, 10, 11, 12, 13, 14]
# Predicted top 3 items: [4, 10, 7]

# Precision@3: Among the top 3 predicted items, the relevant item is 10.
# So, Precision@3 = 1/3 = 0.33

# Recall@3: There are 10 actual relevant items, and only one of them (10) appears in the top 3 predictions.
# So, Recall@3 = 1/10 = 0.10

# Overall for precision 0.33 + 0.33 + 0.33 = 0.33333
# Overall for recall 0.33 + 0.2 + 0.1 = 0.2111

Use Case: It is useful in information retrieval tasks where both precision and the order of results matter.

In [17]:
eval_map = map_at_k(
    df_true, df_pred, col_user="UserId",col_item="MovieId", col_prediction="Rating", k=3
)
eval_map

0.2037037037037037

In [None]:
# MAP@3

# User 1:
# Actual relevant items: [1, 2, 3]
# Predicted top 3 items: [3, 10, 12]
# Rank 1: Predicted item is 3, which is relevant.
# Precision at rank 1 = 1/1 = 1.0
# Rank 2: Predicted item is 10, which is not relevant.
# Precision at rank 2 = 1/2 = 0.5 (cumulative precision remains 1.0)
# Rank 3: Predicted item is 12, which is not relevant.
# Precision at rank 3 = 1/3 = 0.33 (cumulative precision remains 1.0)
# Average Precision = (1.0) / 3 = 0.33

# User 2:
# Actual relevant items: [1, 4, 5, 6, 7]
# Predicted top 3 items: [10, 3, 5]
# Rank 1: Predicted item is 10, which is not relevant.
# Precision at rank 1 = 0
# Rank 2: Predicted item is 3, which is not relevant.
# Precision at rank 2 = 0
# Rank 3: Predicted item is 5, which is relevant.
# Precision at rank 3 = 1/3 = 0.33
# Average Precision = (0.33) / 3 = 0.11

# User 3:
# Actual relevant items: [2, 5, 6, 8, 9, 10, 11, 12, 13, 14]
# Predicted top 3 items: [4, 10, 7]
# Rank 1: Predicted item is 4, which is not relevant.
# Precision at rank 1 = 0
# Rank 2: Predicted item is 10, which is relevant.
# Precision at rank 2 = 1/2 = 0.5
# Rank 3: Predicted item is 7, which is not relevant.
# Precision at rank 3 = 0.5
# Average Precision = (0.5) / 3 = 0.17

# Overall 0.33 + 0.11 + 0.17 = 0.2037037037037037

Use Case: Use NDCG@K when you want to evaluate the ranking quality of the system, taking into account the position of the relevant items. It is useful in scenarios where the order of results is important, such as search engines or recommendation systems.

In [19]:
eval_ndcg = ndcg_at_k(
    df_true, df_pred, col_user="UserId",col_item="MovieId", col_rating="Rating", col_prediction="Rating", k=3
)
eval_ndcg

0.3333333333333333

While MAP@K also considers the position of relevant items, it does so less aggressively and is more focused on the overall precision across multiple queries. Use MAP@K when you want a balanced measure of precision and ranking quality across different queries.

In [None]:
# NDCG@3

# User 1:
# Actual relevant items: [1, 2, 3]
# Predicted top 3 items: [3, 10, 12]
# Rank 1: Predicted item is 3, which is relevant.
# DCG = 1 / log2(1 + 1) = 1.0
# Rank 2: Predicted item is 10, which is not relevant.
# DCG remains 1.0
# Rank 3: Predicted item is 12, which is not relevant.
# DCG remains 1.0
# Ideal DCG (IDCG) for top 3 relevant items [1, 2, 3]:
# IDCG = 1 / log2(1 + 1) + 1 / log2(2 + 1) + 1 / log2(3 + 1) = 1 + 0.63 + 0.5 = 2.13
# NDCG@3 = DCG / IDCG = 1.0 / 2.13 = 0.47

# User 2:
# Actual relevant items: [1, 4, 5, 6, 7]
# Predicted top 3 items: [10, 3, 5]
# Rank 1: Predicted item is 10, which is not relevant.
# DCG = 0
# Rank 2: Predicted item is 3, which is not relevant.
# DCG remains 0
# Rank 3: Predicted item is 5, which is relevant.
# DCG = 1 / log2(3 + 1) = 0.5
# Ideal DCG (IDCG) for top 3 relevant items [1, 4, 5]:
# IDCG = 1 / log2(1 + 1) + 1 / log2(2 + 1) + 1 / log2(3 + 1) = 1 + 0.63 + 0.5 = 2.13
# NDCG@3 = DCG / IDCG = 0.5 / 2.13 = 0.23

# User 3:
# Actual relevant items: [2, 5, 6, 8, 9, 10, 11, 12, 13, 14]
# Predicted top 3 items: [4, 10, 7]
# Rank 1: Predicted item is 4, which is not relevant.
# DCG = 0
# Rank 2: Predicted item is 10, which is relevant.
# DCG = 1 / log2(2 + 1) = 0.63
# Rank 3: Predicted item is 7, which is not relevant.
# DCG remains 0.63
# Ideal DCG (IDCG) for top 3 relevant items [2, 5, 6]:
# IDCG = 1 / log2(1 + 1) + 1 / log2(2 + 1) + 1 / log2(3 + 1) = 1 + 0.63 + 0.5 = 2.13
# NDCG@3 = DCG / IDCG = 0.63 / 2.13 = 0.30

# Overall 0.47 + 0.23 + 0.3 = 0.3333333

## My custom way for calculating metrics

Extra step need to do

To compute ranking metrics, we need predictions on all user, item pairs. We remove though the items already watched by the user, since we choose not to recommend them again. 

In [21]:
import pandas as pd
import numpy as np

def precision_at_k(true_items, predicted_items, k=3):
    predicted_at_k = predicted_items[:k]
    relevant_at_k = set(predicted_at_k).intersection(set(true_items))
    return len(relevant_at_k) / k

def recall_at_k(true_items, predicted_items, k=3):
    predicted_at_k = predicted_items[:k]
    relevant_at_k = set(predicted_at_k).intersection(set(true_items))
    return len(relevant_at_k) / len(true_items) if true_items else 0.0

def map_at_k(true_items, predicted_items, k=3):
    if not true_items:
        return 0.0
    avg_precision = 0.0
    relevant_count = 0
    for i in range(1, k + 1):
        if predicted_items[i - 1] in true_items:
            relevant_count += 1
            avg_precision += relevant_count / i
    return avg_precision / min(len(true_items), k)

def dcg_at_k(true_items, predicted_items, k=3):
    dcg = 0.0
    for i in range(k):
        if predicted_items[i] in true_items:
            dcg += 1 / np.log2(i + 2)
    return dcg

def ndcg_at_k(true_items, predicted_items, k=3):
    idcg = dcg_at_k(true_items, true_items[:k], k)
    dcg = dcg_at_k(true_items, predicted_items, k)
    return dcg / idcg if idcg > 0 else 0.0

results = []

for user in df_true[COL_USER].unique():
    true_items = df_true[df_true[COL_USER] == user][COL_ITEM].tolist()
    predicted_items = df_pred[df_pred[COL_USER] == user][COL_ITEM].tolist()

    precision_3 = precision_at_k(true_items, predicted_items, k=3)
    recall_3 = recall_at_k(true_items, predicted_items, k=3)
    map_3 = map_at_k(true_items, predicted_items, k=3)
    ndcg_3 = ndcg_at_k(true_items, predicted_items, k=3)
    
    results.append({
        "User": user,
        "Precision@3": precision_3,
        "Recall@3": recall_3,
        "MAP@3": map_3,
        "NDCG@3": ndcg_3
    })

df_results = pd.DataFrame(results)

average_precision_3 = df_results["Precision@3"].mean()
average_recall_3 = df_results["Recall@3"].mean()
average_map_3 = df_results["MAP@3"].mean()
average_ndcg_3 = df_results["NDCG@3"].mean()

print(f"Average Precision@3: {average_precision_3}")
print(f"Average Recall@3: {average_recall_3}")
print(f"Average MAP@3: {average_map_3}")
print(f"Average NDCG@3: {average_ndcg_3}")

Average Precision@3: 0.3333333333333333
Average Recall@3: 0.2111111111111111
Average MAP@3: 0.2037037037037037
Average NDCG@3: 0.3333333333333333


## Binary Metrics

In [None]:
# Not so studied the implicit feedback metrics

In [50]:
# Convert the original rating to 0 and 1.
df_true_bin = df_true.copy()
df_true_bin["Rating"] = df_true_bin["Rating"].apply(lambda x: 1 if x > 3 else 0)

df_true_bin

Unnamed: 0,UserId,MovieId,Rating
0,1,1,1
1,1,2,1
2,1,3,0
3,2,1,1
4,2,4,1
5,2,5,0
6,2,6,0
7,2,7,0
8,3,2,1
9,3,5,1


In [51]:
from sklearn.preprocessing import minmax_scale

# Convert the predicted ratings into a [0, 1] scale.
df_pred_bin = df_pred.copy()
df_pred_bin["Rating"] = minmax_scale(df_pred_bin["Rating"].astype(float))

df_pred_bin

Unnamed: 0,UserId,MovieId,Rating
0,1,3,1.0
1,1,10,0.888889
2,1,12,0.777778
3,2,10,1.0
4,2,3,0.888889
5,2,5,0.777778
6,2,11,0.666667
7,2,13,0.555556
8,3,4,1.0
9,3,10,0.888889


In [52]:
# Calculate the AUC metric
auc_score = auc(
    df_true_bin,
    df_pred_bin,
    col_user="UserId",
    col_item="MovieId", 
    col_rating="Rating", 
    col_prediction="Rating"
)

print(f"The auc score is {auc_score}")

The auc score is 0.33333333333333337


In [53]:
from sklearn.metrics import roc_auc_score
# Calculate the AUC
auc = roc_auc_score(df_true_bin['Rating'], df_pred_bin['Rating'])
auc

0.9135802469135803