# Content-based recommendation

In [6]:
# Libraries
# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import os
import shutil

#loading embeddings
import pickle

#sklearn utils
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.neighbors import NearestNeighbors

In [7]:
# 1.1 Load article metadata
meta = pd.read_csv("data/archive/articles_metadata.csv")
# Sort by article_id and renumber rows 0..n-1 so that meta row i can be paired
# with embeddings row i (assumes the embeddings file is ordered by article_id
# — TODO confirm against the dataset documentation).
meta = meta.sort_values("article_id").reset_index(drop=True)

# 1.2 Load embeddings (shape ≈ [n_articles, 250])
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only open embedding files that come from a trusted source.
with open("data/archive/articles_embeddings.pickle", "rb") as f:
    embeddings = pickle.load(f)

# 1.3 Sanity check: the rest of the notebook uses a meta row number as an
# embeddings row number, so a size mismatch would silently pair articles with
# the wrong vectors.
if len(meta) != len(embeddings):
    raise ValueError(
        f"meta has {len(meta)} rows but embeddings has {len(embeddings)} rows; "
        "they must describe the same articles in the same order"
    )

In [16]:
# Preview the first five metadata rows
display(meta.iloc[:5])
# First 3 embedding vectors, first 5 dimensions
print(embeddings[0:3, 0:5])

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count
0,0,0,1513144419000,0,168
1,1,1,1405341936000,0,189
2,2,1,1408667706000,0,250
3,3,1,1408468313000,0,230
4,4,1,1407071171000,0,162


[[-0.16118301 -0.95723313 -0.13794445  0.05085534  0.83005524]
 [-0.52321565 -0.974058    0.73860806  0.15523443  0.626294  ]
 [-0.61961854 -0.9729604  -0.20736018 -0.12886102  0.04474759]]


In [20]:
"""# 2.1 Build the index
nn = NearestNeighbors(n_neighbors=10, metric="cosine", algorithm="auto")
nn.fit(embeddings)

# 2.2 Helper to get similar items
def get_similar_articles(article_id, k=5):
    # Find row index for this article_id
    idx = meta.index[meta["article_id"] == article_id].tolist()
    if not idx:
        raise ValueError("Article ID not found")
    idx = idx[0]
    # Query nearest neighbors (includes itself at distance 0)
    distances, indices = nn.kneighbors([embeddings[idx]], n_neighbors=k+1)
    # Exclude the first (itself)
    sim_idxs = indices[0][1:]
    return meta.iloc[sim_idxs][["article_id", "title", "category"]] #error on names"""

'# 2.1 Build the index\nnn = NearestNeighbors(n_neighbors=10, metric="cosine", algorithm="auto")\nnn.fit(embeddings)\n\n# 2.2 Helper to get similar items\ndef get_similar_articles(article_id, k=5):\n    # Find row index for this article_id\n    idx = meta.index[meta["article_id"] == article_id].tolist()\n    if not idx:\n        raise ValueError("Article ID not found")\n    idx = idx[0]\n    # Query nearest neighbors (includes itself at distance 0)\n    distances, indices = nn.kneighbors([embeddings[idx]], n_neighbors=k+1)\n    # Exclude the first (itself)\n    sim_idxs = indices[0][1:]\n    return meta.iloc[sim_idxs][["article_id", "title", "category"]] #error on names'

In [23]:
# 1. Metadata preprocessing
# ======================================
# Convert the Unix timestamp (milliseconds) into a datetime column
meta["created_at"] = pd.to_datetime(meta["created_at_ts"], unit="ms", origin="unix")
# (Optional) Shorter column names for readability
meta = meta.rename(columns={"category_id": "category", "publisher_id": "publisher"})

# 2. Fit the nearest-neighbour index on the embeddings
# ======================================
# Cosine distance is used, so similarity can later be reported as 1 - distance.
nn = NearestNeighbors(n_neighbors=10, metric="cosine", algorithm="auto").fit(embeddings)

# 3. Item-to-item recommendation function
# ======================================
def get_similar_articles(article_id, k=5):
    """
    Return up to k articles whose embeddings are closest to the given article.

    Parameters
    ----------
    article_id : value looked up in meta["article_id"]
    k          : number of similar articles wanted

    Returns
    -------
    pandas.DataFrame
        Columns "article_id", "category", "publisher", "words_count",
        "created_at" and "similarity" (1 - cosine distance), ordered as the
        neighbour search returned them, with a fresh 0..n-1 index. The pivot
        article itself is never part of the result. Fewer than k rows are
        returned when the index holds fewer than k other articles.

    Raises
    ------
    ValueError
        If article_id is not present in meta.
    """
    # 3.1 Locate the pivot article; its meta index label is used as its
    # embeddings row number, as everywhere else in this notebook.
    matches = meta.index[meta["article_id"] == article_id].tolist()
    if not matches:
        raise ValueError(f"Article ID {article_id} non trouvé dans meta")
    idx = matches[0]

    # 3.2 Ask for one extra neighbour to make room for the pivot, but never
    # more than the number of indexed vectors (kneighbors rejects that).
    n_query = min(k + 1, len(embeddings))
    distances, indices = nn.kneighbors(
        [embeddings[idx]],
        n_neighbors=n_query
    )

    # 3.3 Drop the pivot by row number rather than by position: with duplicate
    # embeddings or distance ties the pivot is not guaranteed to come first.
    keep = indices[0] != idx
    neigh_idxs = indices[0][keep][:k]
    neigh_dists = distances[0][keep][:k]

    result = meta.iloc[neigh_idxs][[
        "article_id", "category", "publisher", "words_count", "created_at"
    ]].copy()

    # 3.4 Similarity column (1 - cosine distance)
    result["similarity"] = 1 - neigh_dists

    return result.reset_index(drop=True)

# 4. Usage example
# ======================================
print(get_similar_articles(article_id=157541, k=5))


   article_id  category  publisher  words_count          created_at  \
0      157514       281          0          300 2017-12-22 09:50:19   
1      159284       281          0          245 2018-02-06 14:56:49   
2      157555       281          0          313 2017-08-08 15:46:41   
3      162369       281          0          314 2017-10-01 12:04:18   
4      156355       281          0          283 2017-10-07 17:23:14   

   similarity  
0    0.926801  
1    0.922801  
2    0.921066  
3    0.918505  
4    0.916194  


In [26]:
def build_user_profile(user_clicks, agg="mean", weights=None):
    """
    Build a user profile vector by aggregating the embeddings of clicked articles.

    Parameters
    ----------
    user_clicks : list or array of article_ids the user interacted with.
        IDs that do not appear in meta["article_id"] are ignored.
    agg : {"mean", "weighted"}
        "mean"     -> plain average over the distinct matched articles.
        "weighted" -> weighted average using ``weights``.
    weights : sequence of numbers, optional
        Required when agg="weighted": one weight per entry of ``user_clicks``
        (e.g. recency or click count). Weights given to a repeated article_id
        are summed.

    Returns
    -------
    numpy.ndarray
        The aggregated embedding (one value per embedding dimension).

    Raises
    ------
    ValueError
        If none of the IDs is found in meta, if ``agg`` is not supported, or
        if ``weights`` is missing, of the wrong length, or sums to zero.
    """
    clicks = list(user_clicks)

    # Map article IDs to embedding rows (meta's index labels are used as
    # embeddings row numbers throughout this notebook).
    mask = meta["article_id"].isin(clicks)
    idxs = meta.index[mask].tolist()
    if not idxs:
        # Averaging zero rows would yield a NaN vector that only fails later,
        # inside the neighbour search, with a much less helpful message.
        raise ValueError("None of the provided article IDs were found in meta")
    user_embs = embeddings[idxs]

    if agg == "mean":
        return user_embs.mean(axis=0)

    if agg == "weighted":
        if weights is None:
            raise ValueError('agg="weighted" requires the weights argument')
        weights = list(weights)
        if len(weights) != len(clicks):
            raise ValueError("weights must have the same length as user_clicks")

        # Accumulate the weight per article_id, then line the weights up with
        # the matched meta rows (isin does not preserve the click order).
        weight_by_id = {}
        for clicked_id, weight in zip(clicks, weights):
            weight_by_id[clicked_id] = weight_by_id.get(clicked_id, 0.0) + float(weight)
        matched_ids = meta.loc[mask, "article_id"].tolist()
        row_weights = np.array([weight_by_id[matched_id] for matched_id in matched_ids])

        total = row_weights.sum()
        if total == 0:
            raise ValueError("weights of the matched articles sum to zero")
        return (user_embs * row_weights[:, None]).sum(axis=0) / total

    # Previously an unknown method fell through and returned None.
    raise ValueError(f"Unsupported aggregation method: {agg!r}")


In [29]:
def recommend_for_user(user_clicks, k=10, exclude_seen=True):
    """
    Return up to k articles closest to a user's profile vector.

    Parameters
    ----------
    user_clicks  : iterable of article_id values read/clicked by the user
    k            : number of recommendations wanted
    exclude_seen : when True (default), articles listed in ``user_clicks`` are
                   removed from the results; pass False to get the raw
                   nearest neighbours of the profile vector.

    Returns
    -------
    pandas.DataFrame
        Columns "article_id", "category", "publisher", "words_count",
        "created_at" and "similarity" (1 - cosine distance), in the order
        returned by the neighbour search, with a fresh 0..n-1 index. Fewer
        than k rows are returned when the index is too small.
    """
    # Materialise once: the IDs are needed both for the profile and the filter.
    clicks = list(user_clicks)

    # 1. Build the user profile
    user_vec = build_user_profile(clicks, agg="mean")

    # 2. Rows of already-read articles (meta index labels are used as
    #    embeddings row numbers throughout this notebook)
    if exclude_seen:
        seen_rows = meta.index[meta["article_id"].isin(clicks)].to_numpy()
    else:
        seen_rows = np.array([], dtype=int)

    # 3. Query enough neighbours to still have k left after removing the read
    #    articles, capped at the number of indexed vectors (kneighbors rejects
    #    larger requests).
    n_query = min(k + len(seen_rows), len(embeddings))
    distances, indices = nn.kneighbors([user_vec], n_neighbors=n_query)

    keep = ~np.isin(indices[0], seen_rows)
    neigh_idxs = indices[0][keep][:k]
    neigh_dists = distances[0][keep][:k]

    # 4. Fetch the matching metadata
    recs = meta.iloc[neigh_idxs][[
        "article_id", "category", "publisher", "words_count", "created_at"
    ]].copy()

    # 5. Similarity score (1 - cosine distance)
    recs["similarity"] = 1 - neigh_dists

    # 6. Clean 0..n-1 index
    return recs.reset_index(drop=True)


In [27]:
"""def recommend_for_user(user_clicks, k=10):
    user_vec = build_user_profile(user_clicks)
    dists, idxs = nn.kneighbors([user_vec], n_neighbors=k)
    recs = meta.iloc[idxs[0]][["article_id", "title", "category"]] #error on names
    return recs"""

In [31]:
# Example: user 42 has read articles 157541, 280367 and 71301
print(recommend_for_user(user_clicks=[157541, 280367, 71301], k=5))

   article_id  category  publisher  words_count          created_at  \
0      157970       281          0          235 2017-09-07 10:38:44   
1       71420       136          0          299 2017-12-27 19:39:20   
2      280719       412          0          223 2015-11-05 07:25:59   
3      158910       281          0          322 2017-01-12 10:17:20   
4       68685       136          0          309 2017-03-14 15:20:19   

   similarity  
0    0.852583  
1    0.841069  
2    0.839749  
3    0.835680  
4    0.835100  


In [None]:
"""# Only recommend articles from last 7 days
meta["pub_date"] = pd.to_datetime(meta["pub_date"])
recent_mask = meta["pub_date"] >= (pd.Timestamp.now() - pd.Timedelta(days=7))

def recommend_for_user_filtered(user_clicks, k=10, category=None, recent_days=None):
    user_vec = build_user_profile(user_clicks)
    dists, idxs = nn.kneighbors([user_vec], n_neighbors=500)  # larger pool
    candidates = meta.iloc[idxs[0]].copy()
    if category:
        candidates = candidates[candidates["category"] == category]
    if recent_days:
        cutoff = pd.Timestamp.now() - pd.Timedelta(days=recent_days)
        candidates = candidates[candidates["pub_date"] >= cutoff]
    return candidates.head(k)[["article_id","title","category","pub_date"]] #error on names
    """

In [47]:
def recommend_for_user_filtered(user_clicks, k=10, category=None, recent_days=None,
                                pool_size=500, exclude_seen=True):
    """
    Recommend up to k articles closest to a user's profile vector, with
    optional filtering on category ID and freshness.

    Parameters
    ----------
    user_clicks  : iterable of article_id values read/clicked by the user
    k            : number of recommendations wanted
    category     : category ID to keep, or None for no category filter
    recent_days  : keep only articles created within this many days of the
                   newest "created_at" in meta, or None for no freshness filter
    pool_size    : number of nearest neighbours fetched before filtering
                   (default 500); raise it when strict filters leave fewer
                   than k results
    exclude_seen : when True (default), articles listed in ``user_clicks`` are
                   removed from the candidates

    Returns
    -------
    pandas.DataFrame
        Columns "article_id", "category", "publisher", "words_count",
        "created_at" and "similarity" (1 - cosine distance), sorted by
        decreasing similarity, with a fresh 0..n-1 index. May hold fewer than
        k rows when the filters discard most of the candidate pool.
    """
    # Materialise once: the IDs are needed both for the profile and the filter.
    clicks = list(user_clicks)

    # 1. User profile
    user_vec = build_user_profile(clicks, agg="mean")

    # 2. Fetch a candidate pool wider than k so something survives the filters.
    #    The pool is at least large enough to serve k after dropping the read
    #    articles, and never larger than the index (kneighbors rejects that).
    n_query = min(max(pool_size, k + len(clicks)), len(embeddings))
    dists, idxs = nn.kneighbors([user_vec], n_neighbors=n_query)
    neigh_idxs = idxs[0]
    neigh_dists = dists[0]

    # 3. Candidate DataFrame with the similarity score
    candidates = meta.iloc[neigh_idxs].copy()
    candidates["similarity"] = 1 - neigh_dists

    # 4. Drop articles the user has already read
    if exclude_seen:
        candidates = candidates[~candidates["article_id"].isin(clicks)]

    # 5. Category filter (numeric ID)
    if category is not None:
        candidates = candidates[candidates["category"] == category]

    # 6. Freshness filter, measured from the newest article in meta rather
    #    than from today's date
    if recent_days is not None:
        max_date = meta["created_at"].max()
        cutoff = max_date - pd.Timedelta(days=recent_days)
        candidates = candidates[candidates["created_at"] >= cutoff]

    # 7. Keep the k best candidates and the useful columns
    return (
        candidates
        .sort_values("similarity", ascending=False)
        .head(k)
        .reset_index(drop=True)[
            ["article_id", "category", "publisher", "words_count", "created_at", "similarity"]
        ]
    )

In [103]:
# Example: no category filter (category=None); keep only articles created
# within 700 days of the newest article in meta
recs = recommend_for_user_filtered(
    user_clicks=[157541, 280367, 71301],
    k=5,
    category=None,
    recent_days=700
)
print(recs)

   article_id  category  publisher  words_count          created_at  \
0      157970       281          0          235 2017-09-07 10:38:44   
1       71420       136          0          299 2017-12-27 19:39:20   
2      158910       281          0          322 2017-01-12 10:17:20   
3       68685       136          0          309 2017-03-14 15:20:19   
4      289209       420          0          280 2017-09-20 10:30:57   

   similarity  
0    0.852583  
1    0.841069  
2    0.835680  
3    0.835100  
4    0.834944  


Given the list of articles a user has read, the recommender returns a list of articles they might like.

This approach is still incomplete, since the recommendations rely only on the similarity of the content embeddings.

Next, we will move on to collaborative filtering, where recommendations are built from users' behavior.