# Réalisez une application de recommandation de contenu
## Filtrage collaboratif

Il existe plusieurs méthodes de filtrage collaboratif ; j'en ai testé plusieurs (SAR et LightGCN, décrits dans la librairie "[recommenders](https://github.com/recommenders-team/recommenders/tree/main/examples/02_model_collaborative_filtering)"), mais elles sont en général plus adaptées à un système utilisant des notations. Les résultats obtenus, ainsi que le temps passé à les utiliser et à tenter de les optimiser, m'ont fait basculer sur une solution plus simple : le SVD.

L'analyse en valeurs singulières (SVD pour "Singular Value Decomposition" en anglais) est une technique de factorisation qui décompose une matrice en trois matrices distinctes.

- (U) décrit les caractéristiques essentielles des lignes de la matrice originale,
- (Σ) Sigma contient les valeurs d'importance de chaque caractéristique,
- (V^T) décrit les caractéristiques essentielles des colonnes de la matrice originale.


### Importation des librairies

In [1]:
import numpy as np
import pandas as pd
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import MinMaxScaler
from collections import defaultdict

from helpers import load_dataset

#### Récupération des datasets & préparation des données

In [2]:
# Load the datasets through the project helper. It returns three objects;
# only the articles and clicks frames are used here, the third is discarded.
df_articles, df_clicks, _ = load_dataset()

Je prépare un profil par utilisateur pour connaître tous les articles visités ainsi que leurs catégories.

In [3]:
# Build one profile row per user: the list of clicked article ids plus the
# category id of each of those articles (same order as the clicks).
article_category_map = df_articles.set_index("article_id")["category_id"].to_dict()


def _categories_of(article_ids):
    """Translate a list of article ids into the matching list of category ids."""
    return [article_category_map[article_id] for article_id in article_ids]


clicks_by_user = df_clicks.groupby('user_id')['click_article_id']
user_profiles = clicks_by_user.apply(list).reset_index()
user_profiles["categories"] = user_profiles["click_article_id"].apply(_categories_of)
user_profiles

Unnamed: 0,user_id,click_article_id,categories
0,0,"[157541, 68866, 96755, 313996, 160158, 233470,...","[281, 136, 209, 431, 281, 375, 186, 186]"
1,1,"[327984, 183176, 235840, 96663, 59758, 160474,...","[435, 301, 375, 209, 123, 281, 43, 375, 412, 2..."
2,2,"[119592, 30970, 30760, 209122]","[247, 26, 26, 332]"
3,3,"[236444, 234318, 233688, 237452, 235745, 12096...","[375, 375, 375, 375, 375, 249, 375, 375, 375, ..."
4,4,"[336499, 271261, 48915, 44488, 195887, 195084,...","[437, 399, 92, 81, 317, 317, 132]"
...,...,...,...
322892,322892,"[42567, 39894]","[67, 66]"
322893,322893,"[50644, 36162]","[99, 43]"
322894,322894,"[36162, 168401]","[43, 297]"
322895,322895,"[289197, 63746]","[418, 133]"


In [20]:
# Attach the article metadata to every click, then keep only the columns
# needed to describe a (user, article, category) interaction.
interaction_columns = ['user_id', 'article_id', 'category_id']
df_merged = df_clicks.merge(
    df_articles,
    how='inner',
    left_on='click_article_id',
    right_on='article_id',
)
df_user_item = df_merged[interaction_columns]
df_user_item.head()

Unnamed: 0,user_id,article_id,category_id
0,93863,96210,209
1,294036,96210,209
2,77136,96210,209
3,28126,96210,209
4,237725,96210,209


In [22]:
# Count how many times each user clicked each article; this click count is
# used as the implicit "rating" of the (user, article) pair.
interaction_counts = df_user_item.groupby(['user_id', 'article_id']).size()

# Turn the counts into a flat frame with columns user_id, article_id, rating
user_rating_matrix = interaction_counts.to_frame(name='rating').reset_index()

In [23]:
# Rescale the raw click counts to the [0, 1] interval (min-max scaling).
# The scaler expects a 2-D input, hence the reshape to a single column.
scaler = MinMaxScaler(feature_range=(0, 1))
rating_column = np.array(user_rating_matrix["rating"]).reshape(-1, 1)
user_rating_matrix["rating_norm"] = scaler.fit_transform(rating_column)

In [8]:
# Keep only the interactions whose normalised rating is non-zero.
# NOTE(review): the min-max scaling above maps the smallest click count to 0,
# so this drops every pair clicked only the minimum number of times - confirm
# that discarding those interactions is intended.
has_nonzero_rating = user_rating_matrix["rating_norm"] != 0.0
X = user_rating_matrix[has_nonzero_rating]
X

Unnamed: 0,user_id,article_id,rating,rating_norm
48,5,36399,2,0.03125
111,5,284664,2,0.03125
129,5,348128,2,0.03125
152,6,233717,2,0.03125
477,17,74722,2,0.03125
...,...,...,...,...
2946563,321180,199393,2,0.03125
2946564,321180,199437,2,0.03125
2947399,321498,342473,2,0.03125
2949080,322199,87194,2,0.03125


In [9]:
# Prepare the dataset for the Surprise library: ratings are the normalised
# click counts, declared on a 0-1 scale.
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(X[["user_id", "article_id", "rating_norm"]], reader)
# Hold out 25% of the interactions for evaluation. The seed is fixed so the
# split (and every metric computed from it) is reproducible between runs.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)
print("Number of interactions: ", len(X))

Number of interactions:  33875


#### Entraînement du modèle

In [10]:
# Train the SVD model on the training split, then predict the held-out split.
# A fixed seed makes the random initialisation repeatable, so the reported
# metrics can be reproduced from one run to the next.
svd = SVD(random_state=42)
svd.fit(trainset)
predictions = svd.test(testset)

In [11]:
# Gather rating-error and ranking metrics for one list of predictions
def compute_metrics(predictions):
    """Return a dict of evaluation metrics computed from the predictions.

    Args:
        predictions: The list of predictions to evaluate; it is passed as-is
            to the Surprise accuracy helpers and to the ranking metrics
            ``ndcg_at_k`` and ``mean_reciprocal_rank``.

    Returns:
        dict: Keys ``rmse``, ``mae``, ``ndcg_5``, ``ndcg_10`` and ``mean_mrr``.
    """
    # Rating-error metrics first, then the ranking metrics
    rmse_value = accuracy.rmse(predictions, verbose=False)
    mae_value = accuracy.mae(predictions, verbose=False)
    ndcg_values = {
        f'ndcg_{cutoff}': ndcg_at_k(predictions, k=cutoff) for cutoff in (5, 10)
    }
    mrr_value = mean_reciprocal_rank(predictions)

    return {
        'rmse': rmse_value,
        'mae': mae_value,
        **ndcg_values,
        'mean_mrr': mrr_value,
    }


In [12]:
def get_top_n(predictions, n=10):
    """Build the top-N recommendations of every user from a set of predictions.

    Args:
        predictions (list of Prediction objects): The list of predictions, as
        returned by the test method of an algorithm.
        n (int): Maximum number of recommendations kept per user. Default is 10.

    Returns:
        dict: Maps each user (raw) id to a list of at most n tuples
        (raw item id, category id, rating estimation), sorted by decreasing
        rating estimation.
    """
    # Group every prediction under its user, attaching the article category
    recommendations = defaultdict(list)
    for user_id, item_id, _true_rating, estimate, _details in predictions:
        category_id = article_category_map[item_id]
        recommendations[user_id].append((item_id, category_id, estimate))

    # Rank each user's candidates by estimated rating and keep the n best
    for user_id in recommendations:
        ranked = sorted(
            recommendations[user_id], key=lambda rec: rec[2], reverse=True
        )
        recommendations[user_id] = ranked[:n]

    return recommendations


In [13]:
# Build the per-user top-N lists from the held-out test predictions and
# display them as the cell output.
top_recommendations = get_top_n(predictions)
top_recommendations

defaultdict(list,
            {135817: [(141004, 265, 0.02571568939562426)],
             21877: [(158536, 281, 0.038344819717323855),
              (160974, 281, 0.0377590264057165)],
             211443: [(352901, 442, 0.02848018915340627)],
             54863: [(74589, 141, 0.007396874541337119)],
             47460: [(96210, 209, 0.06322617931364995)],
             211204: [(313996, 431, 0.030953432443986465)],
             11256: [(235300, 375, 0.18820029264988963),
              (237665, 375, 0.04031010859611636)],
             60722: [(160974, 281, 0.03318258629024314),
              (235230, 375, 0.01202946081197263)],
             118122: [(96210, 209, 0.032111969252138925),
              (15653, 7, 0.032018176409357675)],
             1556: [(123757, 250, 0.09555022034263905),
              (284547, 412, 0.08334511048084778),
              (233688, 375, 0)],
             46126: [(29766, 26, 0.012514425718876253), (205846, 331, 0)],
             97767: [(166581, 289, 0.0638093

In [14]:
def ndcg_at_k(predictions, k=5, relevance_threshold=0.1, verbose=True):
    """Return the mean binary-relevance NDCG@k over all users.

    Predictions are grouped per user and ranked by estimated rating, highest
    first. An item is relevant when its true rating is strictly greater than
    ``relevance_threshold``; every relevant item has a gain of 1. The ideal
    DCG assumes all of the user's relevant items (capped at ``k``) are ranked
    first. Users without any relevant item contribute 0 to the mean.

    Args:
        predictions: Iterable of 5-tuples ``(uid, iid, true_r, est, details)``,
            such as the list returned by the test method of an algorithm.
        k (int): Ranking cut-off. Default is 5.
        relevance_threshold (float): True ratings strictly above this value
            are counted as relevant. Default is 0.1.
        verbose (bool): When True (default) print the per-user trace and the
            final average; pass False to compute the metric silently.

    Returns:
        float: The NDCG@k averaged over users, or ``0.0`` when
        ``predictions`` is empty.
    """
    from collections import defaultdict
    import math

    # Collect the (estimated, true) rating pairs of each user
    user_est_true = defaultdict(list)
    for uid, _iid, true_r, est, _details in predictions:
        user_est_true[uid].append((est, true_r))

    # Without any user there is nothing to average (avoids a division by zero)
    if not user_est_true:
        if verbose:
            print(f"\nAverage NDCG@{k}: 0.0")
        return 0.0

    ndcg = 0.0
    for uid, user_ratings in user_est_true.items():
        # Rank the user's items by decreasing estimated rating
        user_ratings.sort(key=lambda pair: pair[0], reverse=True)

        if verbose:
            print(f"\nUser: {uid}")
            print(f"Sorted ratings (estimated, true): {user_ratings}")

        # Ideal DCG: every relevant item (at most k) placed at the top ranks
        relevant_count = sum(
            1 for _est, true_r in user_ratings if true_r > relevance_threshold
        )
        idcg = sum(
            (1.0 / math.log2(i + 2) for i in range(min(relevant_count, k))),
            0.0,
        )

        # Actual DCG of the model ranking, restricted to the first k items
        dcg = 0.0
        for i, (est, true_r) in enumerate(user_ratings[:k]):
            if true_r > relevance_threshold:
                dcg += 1.0 / math.log2(i + 2)
            if verbose:
                print(f"Rank: {i+1}, Estimated Rating: {est}, True Rating: {true_r}, DCG: {dcg}")

        ndcg_value = dcg / idcg if idcg > 0 else 0
        ndcg += ndcg_value
        if verbose:
            print(f"User NDCG: {ndcg_value}, IDCG: {idcg}")

    final_ndcg = ndcg / len(user_est_true)
    if verbose:
        print(f"\nAverage NDCG@{k}: {final_ndcg}")
    return final_ndcg

def mean_reciprocal_rank(predictions, relevance_threshold=0.1, verbose=True):
    """Return the mean reciprocal rank (MRR) over all users.

    Predictions are grouped per user and ranked by estimated rating, highest
    first. Each user contributes ``1 / rank`` of the first item whose true
    rating is strictly greater than ``relevance_threshold``; a user without
    any such item contributes 0.

    Args:
        predictions: Iterable of 5-tuples ``(uid, iid, true_r, est, details)``,
            such as the list returned by the test method of an algorithm.
        relevance_threshold (float): True ratings strictly above this value
            are counted as relevant. Default is 0.1.
        verbose (bool): When True (default) print the per-user trace and the
            final value; pass False to compute the metric silently.

    Returns:
        float: The reciprocal rank averaged over users, or ``0.0`` when
        ``predictions`` is empty.
    """
    from collections import defaultdict

    # Collect the (estimated, true) rating pairs of each user
    user_est_true = defaultdict(list)
    for uid, _iid, true_r, est, _details in predictions:
        user_est_true[uid].append((est, true_r))

    # Without any user there is nothing to average (avoids a division by zero)
    if not user_est_true:
        if verbose:
            print("\nMean Reciprocal Rank: 0.0")
        return 0.0

    mrr = 0.0
    for uid, user_ratings in user_est_true.items():
        # Rank the user's items by decreasing estimated rating
        user_ratings.sort(key=lambda pair: pair[0], reverse=True)

        if verbose:
            print(f"\nUser: {uid}")
            print(f"Sorted ratings (estimated, true): {user_ratings}")

        # Only the first relevant item of the ranking counts
        for rank, (est, true_r) in enumerate(user_ratings):
            if true_r > relevance_threshold:
                mrr += 1.0 / (rank + 1)
                if verbose:
                    print(f"First relevant rank: {rank+1}, MRR contribution: {1.0/(rank+1)}")
                break

    final_mrr = mrr / len(user_est_true)
    if verbose:
        print(f"\nMean Reciprocal Rank: {final_mrr}")
    return final_mrr

#### Évaluation des prédictions

In [15]:
# Evaluate the held-out test predictions; the returned metrics dict
# (rmse, mae, ndcg_5, ndcg_10, mean_mrr) is displayed as the cell output.
compute_metrics(predictions)


User: 135817
Sorted ratings (estimated, true): [(0.02571568939562426, 0.03125)]
Rank: 1, Estimated Rating: 0.02571568939562426, True Rating: 0.03125, DCG: 0.0
User NDCG: 0, IDCG: 0.0

User: 21877
Sorted ratings (estimated, true): [(0.038344819717323855, 0.0625), (0.0377590264057165, 0.0625)]
Rank: 1, Estimated Rating: 0.038344819717323855, True Rating: 0.0625, DCG: 0.0
Rank: 2, Estimated Rating: 0.0377590264057165, True Rating: 0.0625, DCG: 0.0
User NDCG: 0, IDCG: 0.0

User: 211443
Sorted ratings (estimated, true): [(0.02848018915340627, 0.03125)]
Rank: 1, Estimated Rating: 0.02848018915340627, True Rating: 0.03125, DCG: 0.0
User NDCG: 0, IDCG: 0.0

User: 54863
Sorted ratings (estimated, true): [(0.007396874541337119, 0.09375)]
Rank: 1, Estimated Rating: 0.007396874541337119, True Rating: 0.09375, DCG: 0.0
User NDCG: 0, IDCG: 0.0

User: 47460
Sorted ratings (estimated, true): [(0.06322617931364995, 0.03125)]
Rank: 1, Estimated Rating: 0.06322617931364995, True Rating: 0.03125, DCG: 0.

{'rmse': 0.04603807779089857,
 'mae': 0.0261331039826696,
 'ndcg_5': 0.004892732682655729,
 'ndcg_10': 0.0050173935244281044,
 'mean_mrr': 0.0047346978447196306}

In [16]:
# Score every (user, article) rating of the full dataset with the trained
# model, then keep the top-N articles per user.
# NOTE(review): build_testset() yields the ratings already present in the
# trainset, so these "recommendations" rank articles the users have already
# clicked - confirm this is intended rather than scoring unseen articles.
full_trainset = data.build_full_trainset()
known_ratings = full_trainset.build_testset()
predictions_all = svd.test(known_ratings)
all_recommendations = get_top_n(predictions_all)

In [17]:
def sort_users_by_highest_score(user_scores):
    """Rank users by the best score found among their recommendations.

    Args:
        user_scores (dict): Maps a user id to a list of tuples whose third
            element (index 2) is the score.

    Returns:
        list: ``(user, highest_score)`` tuples sorted by decreasing score.
        Users whose list is empty are left out.
    """
    # Best score of each user; users with no entries are skipped
    best_per_user = [
        (user, max(entry[2] for entry in entries))
        for user, entries in user_scores.items()
        if entries
    ]

    best_per_user.sort(key=lambda pair: pair[1], reverse=True)
    return best_per_user

In [18]:
# Rank users by the highest estimated rating among their recommendations
sorted_users_by_highest_score = sort_users_by_highest_score(all_recommendations)

In [25]:
# Display the (user id, highest estimated rating) pairs, best first
sorted_users_by_highest_score

[(16280, 0.540128537044658),
 (2223, 0.4273752494695664),
 (19882, 0.3535821074985867),
 (33937, 0.34555815519747596),
 (15400, 0.3300515315795116),
 (40109, 0.3173566574472557),
 (16429, 0.31057909136603395),
 (45885, 0.30849429537547834),
 (82693, 0.30822880115941736),
 (109356, 0.30773117574319553),
 (145898, 0.30063470315218277),
 (13808, 0.29995840230556686),
 (4669, 0.29813539286058377),
 (52826, 0.29804121215399876),
 (82474, 0.29342736716988466),
 (210540, 0.2921850854826805),
 (12862, 0.2894044974638172),
 (43102, 0.28858938478414636),
 (58135, 0.2845176501376114),
 (17205, 0.2824887510115515),
 (7166, 0.28091123982020605),
 (283378, 0.2778225323776269),
 (200485, 0.2746726052003372),
 (27379, 0.2720315251193901),
 (5890, 0.27109876320335363),
 (12285, 0.2708992290871859),
 (64343, 0.2678242889702733),
 (885, 0.26221223694274576),
 (118110, 0.25855459012686516),
 (57719, 0.25836372616985803),
 (83594, 0.2578061847827968),
 (38696, 0.25753788411913003),
 (8783, 0.25416308105487