In [1]:
import numpy as np
import pandas as pd
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import MinMaxScaler
from collections import defaultdict

from helpers import load_dataset

In [2]:
# Load datasets
df_articles, df_clicks, article_embeddings = load_dataset()

In [3]:
# Create user profiles based on article clicks
user_profiles = df_clicks.groupby('user_id')['click_article_id'].apply(list).reset_index()
article_category_map = df_articles.set_index("article_id")["category_id"].to_dict()
user_profiles["categories"] = user_profiles["click_article_id"].apply(
    lambda x: [article_category_map[article_id] for article_id in x]
)
user_profiles

Unnamed: 0,user_id,click_article_id,categories
0,0,"[157541, 68866, 96755, 313996, 160158, 233470,...","[281, 136, 209, 431, 281, 375, 186, 186]"
1,1,"[327984, 183176, 235840, 96663, 59758, 160474,...","[435, 301, 375, 209, 123, 281, 43, 375, 412, 2..."
2,2,"[119592, 30970, 30760, 209122]","[247, 26, 26, 332]"
3,3,"[236444, 234318, 233688, 237452, 235745, 12096...","[375, 375, 375, 375, 375, 249, 375, 375, 375, ..."
4,4,"[336499, 271261, 48915, 44488, 195887, 195084,...","[437, 399, 92, 81, 317, 317, 132]"
...,...,...,...
322892,322892,"[42567, 39894]","[67, 66]"
322893,322893,"[50644, 36162]","[99, 43]"
322894,322894,"[36162, 168401]","[43, 297]"
322895,322895,"[289197, 63746]","[418, 133]"


In [4]:
# Merge datasets to get user-article-category information
df_merged = df_clicks.merge(df_articles, left_on='click_article_id', right_on='article_id')
df_user_item = df_merged[['user_id', 'article_id', 'category_id']]

In [5]:
# Create user-article-category interaction counts
interaction_counts = df_user_item.groupby(['user_id', 'article_id']).size()

In [7]:
# Convert series to dataframe and reset index
user_rating_matrix = interaction_counts.to_frame().reset_index()
user_rating_matrix.rename(columns={0: 'rating'}, inplace=True)

In [8]:
##### Normalize ratings #####
scaler = MinMaxScaler(feature_range=(0, 1))
user_rating_matrix["rating_norm"] = scaler.fit_transform(
    np.array(user_rating_matrix["rating"]).reshape(-1, 1)
)

In [9]:
# Filter out zero normalized ratings
X = user_rating_matrix[user_rating_matrix["rating_norm"] != 0.0]
X

Unnamed: 0,user_id,article_id,rating,rating_norm
48,5,36399,2,0.03125
111,5,284664,2,0.03125
129,5,348128,2,0.03125
152,6,233717,2,0.03125
477,17,74722,2,0.03125
...,...,...,...,...
2946563,321180,199393,2,0.03125
2946564,321180,199437,2,0.03125
2947399,321498,342473,2,0.03125
2949080,322199,87194,2,0.03125


In [12]:

##### Prepare dataset for Surprise library #####
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(X[["user_id", "article_id", "rating_norm"]], reader)
trainset, testset = train_test_split(data, test_size=0.25)
print("Number of interactions: ", len(X))

Number of interactions:  33875


In [14]:
# Train the SVD model and evaluate #####
svd = SVD()
svd.fit(trainset)
predictions = svd.test(testset)

In [16]:
# Perform cross-validation with additional metrics
def compute_metrics(predictions):
    """Compute various evaluation metrics from the predictions."""
    # Calculate RMSE and MAE
    metrics = {
        'rmse': accuracy.rmse(predictions, verbose=False),
        'mae': accuracy.mae(predictions, verbose=False),
        'ndcg_5': ndcg_at_k(predictions, k=5),
        'ndcg_10': ndcg_at_k(predictions, k=10),
        'mean_mrr': mean_reciprocal_rank(predictions)
    }
    return metrics


In [17]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions (list of Prediction objects): The list of predictions, as
        returned by the test method of an algorithm.
        n (int): The number of recommendation to output for each user. Default is 10.

    Returns:
        dict: A dictionary where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, category id, rating estimation), ...] of size n.
    """
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, article_category_map[iid], est))

    # Then sort the predictions for each user and retrieve the n highest ones
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[2], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


In [18]:
top_recommendations = get_top_n(predictions)
top_recommendations

defaultdict(list,
            {32408: [(156619, 281, 0.06789351588475254),
              (162655, 281, 0.005523867626126135)],
             101925: [(162718, 281, 0.20456678693288666),
              (160621, 281, 0.03183442757224015),
              (161979, 281, 0.03183442757224015),
              (156210, 281, 0.03183442757224015),
              (160549, 281, 0.03183442757224015),
              (157465, 281, 0.0293196213200731)],
             51476: [(156560, 281, 0.0008749639291446773)],
             125977: [(133160, 254, 0.02720855208648739)],
             4837: [(207374, 331, 0.07694993845284531),
              (32041, 26, 0.0064004259709563305),
              (202338, 327, 0),
              (74722, 142, 0)],
             29487: [(233470, 375, 0.025400763977540902)],
             82987: [(161178, 281, 0.02800318333379823)],
             29654: [(168623, 297, 0.03424863900109672)],
             21828: [(332623, 436, 0.034605506573250414)],
             1753: [(96210, 209, 0.0311705

In [57]:
def ndcg_at_k(predictions, k=5, relevance_threshold=0.1):
    from collections import defaultdict
    import math
    
    user_est_true = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    
    ndcg = 0.0
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        
        print(f"\nUser: {uid}")
        print(f"Sorted ratings (estimated, true): {user_ratings}")
        
        dcg = 0.0
        idcg = 0.0
        
        relevant_ratings = [true_r for est, true_r in user_ratings if true_r > relevance_threshold]
        for i in range(min(len(relevant_ratings), k)):
            idcg += 1.0 / math.log2(i + 2)
        
        for i, (est, true_r) in enumerate(user_ratings[:k]):
            if true_r > relevance_threshold:
                dcg += 1.0 / math.log2(i + 2)
            print(f"Rank: {i+1}, Estimated Rating: {est}, True Rating: {true_r}, DCG: {dcg}")

        ndcg_value = dcg / idcg if idcg > 0 else 0
        ndcg += ndcg_value
        print(f"User NDCG: {ndcg_value}, IDCG: {idcg}")

    final_ndcg = ndcg / len(user_est_true)
    print(f"\nAverage NDCG@{k}: {final_ndcg}")
    return final_ndcg

def mean_reciprocal_rank(predictions, relevance_threshold=0.1):
    from collections import defaultdict
    
    user_est_true = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    
    mrr = 0.0
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        
        print(f"\nUser: {uid}")
        print(f"Sorted ratings (estimated, true): {user_ratings}")
        
        for rank, (est, true_r) in enumerate(user_ratings):
            if true_r > relevance_threshold:
                mrr += 1.0 / (rank + 1)
                print(f"First relevant rank: {rank+1}, MRR contribution: {1.0/(rank+1)}")
                break

    final_mrr = mrr / len(user_est_true)
    print(f"\nMean Reciprocal Rank: {final_mrr}")
    return final_mrr

In [58]:
compute_metrics(predictions)


User: 32408
Sorted ratings (estimated, true): [(0.06789351588475254, 0.03125), (0.005523867626126135, 0.03125)]
Rank: 1, Estimated Rating: 0.06789351588475254, True Rating: 0.03125, DCG: 0.0
Rank: 2, Estimated Rating: 0.005523867626126135, True Rating: 0.03125, DCG: 0.0
User NDCG: 0, IDCG: 0.0

User: 101925
Sorted ratings (estimated, true): [(0.20456678693288666, 0.03125), (0.03183442757224015, 0.03125), (0.03183442757224015, 0.03125), (0.03183442757224015, 0.03125), (0.03183442757224015, 0.03125), (0.0293196213200731, 0.03125)]
Rank: 1, Estimated Rating: 0.20456678693288666, True Rating: 0.03125, DCG: 0.0
Rank: 2, Estimated Rating: 0.03183442757224015, True Rating: 0.03125, DCG: 0.0
Rank: 3, Estimated Rating: 0.03183442757224015, True Rating: 0.03125, DCG: 0.0
Rank: 4, Estimated Rating: 0.03183442757224015, True Rating: 0.03125, DCG: 0.0
Rank: 5, Estimated Rating: 0.03183442757224015, True Rating: 0.03125, DCG: 0.0
User NDCG: 0, IDCG: 0.0

User: 51476
Sorted ratings (estimated, true)

{'rmse': 0.04795554405211701,
 'mae': 0.026633823168403736,
 'ndcg_5': 0.004538515271707681,
 'ndcg_10': 0.004599839273743407,
 'mean_mrr': 0.004341900584385554}

In [26]:
predictions_all = svd.test(data.build_full_trainset().build_testset())
all_recommendations = get_top_n(predictions_all)


In [45]:
def sort_users_by_highest_score(user_scores):
    user_max_scores = []

    for user, scores in user_scores.items():
        if scores:  # Check if the list is non-empty
            highest_score = max(scores, key=lambda x: x[2])[2]
            user_max_scores.append((user, highest_score))

    sorted_users = sorted(user_max_scores, key=lambda x: x[1], reverse=True)
    return sorted_users

In [46]:
sorted_users_by_highest_score = sort_users_by_highest_score(all_recommendations)