# Hybrid recommendation

In [122]:
#librairies
# classic Librairies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import os
import shutil

#loading embeddings
import pickle

#sklearn utils
#from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.neighbors import NearestNeighbors

#surprise
from surprise import Dataset, Reader, SVD, KNNBasic
from surprise.model_selection import train_test_split #really different from sklearn's ? 
from surprise import accuracy

In [123]:
# Pandas display settings for cleaner cell output:
# show up to 50 columns and let a printed row span up to 1000 characters.
for _option_name, _option_value in (
    ("display.max_columns", 50),
    ("display.width", 1000),
):
    pd.set_option(_option_name, _option_value)

In [124]:
# Load article metadata
data_articles = pd.read_csv("data/archive/articles_metadata.csv")
# Sort by article_id and rebuild a 0..n-1 index so that row i of data_articles
# is meant to line up with row i of the embeddings array.
# NOTE(review): the pickle is a bare array with no ids, so this alignment is an
# assumption that cannot be verified from the file itself — confirm with the data source.
data_articles = data_articles.sort_values("article_id").reset_index(drop=True)

# Load embeddings (shape ≈ [n_articles, 250])
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("data/archive/articles_embeddings.pickle", "rb") as f:
    embeddings = pickle.load(f)
# Report the container type, then print a short summary adapted to that type
print("Type :", type(embeddings))

# dict: show the first keys, plus the first id / vector shape when those keys exist
if isinstance(embeddings, dict):
    print("Clés :", list(embeddings.keys())[:10])
    if "ids" in embeddings:
        print("Premier id :", embeddings["ids"][0])
    if "vectors" in embeddings:
        print("Shape des vecteurs :", embeddings["vectors"].shape)

# list / tuple: show the length and the type (and shape, if any) of the first element
elif isinstance(embeddings, (list, tuple)):
    print("Longueur :", len(embeddings))
    print("Exemple élément[0] :", type(embeddings[0]))
    if hasattr(embeddings[0], "shape"):
        print("Shape :", embeddings[0].shape)

elif hasattr(embeddings, "shape"):  # probably an np.ndarray
    print("Array numpy avec shape :", embeddings.shape)

# anything else: print the first 500 characters of its string form
else:
    print("Contenu :", str(embeddings)[:500])


Type : <class 'numpy.ndarray'>
Array numpy avec shape : (364047, 250)


In [125]:
# Quick look at the metadata frame: dimensions, column names, then the first rows.
print("Shape de data_articles :", data_articles.shape)
print(data_articles.columns.tolist())
data_articles.head()

Shape de data_articles : (364047, 5)
['article_id', 'category_id', 'created_at_ts', 'publisher_id', 'words_count']


Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count
0,0,0,1513144419000,0,168
1,1,1,1405341936000,0,189
2,2,1,1408667706000,0,250
3,3,1,1408468313000,0,230
4,4,1,1407071171000,0,162


In [126]:
# Keep only the article_id column: a slim position -> article_id lookup table
# (used further down by build_user_profile2 and recommend_hybrid).
data_articles_shorter = data_articles[['article_id']]
data_articles_shorter

Unnamed: 0,article_id
0,0
1,1
2,2
3,3
4,4
...,...
364042,364042
364043,364043
364044,364044
364045,364045


In [127]:
# Persist the id-only lookup table to disk.
# Fix: the frame is named `data_articles_shorter` (with an "s"); the previous
# spelling `data_article_shorter` raised NameError on a fresh kernel.
# The output path is intentionally left unchanged.
data_articles_shorter.to_csv("data/data_article_shorter", index=False)

In [128]:
# Optional column renaming — DISABLED: everything below is wrapped in a bare string
# literal, so none of it runs; the cell only echoes the string back as its output.
# NOTE(review): before re-enabling, remove the stray `d` after `})` (syntax error),
# and confirm the timestamp unit — created_at_ts values such as 1513144419000 look
# like milliseconds, not the seconds that unit="s" assumes.
"""# 2.2. Renommage et transformation de colonnes
data_articles = data_articles.rename(columns={
    "category_id": "category",
    "publisher_id": "publisher",
    "created_at_ts": "created_at"
})d

# Convertir created_at en datetime
data_articles["created_at"] = pd.to_datetime(data_articles["created_at"], unit="s", origin="unix")

# Garder uniquement les colonnes utiles
data_articles = data_articles[["article_id", "category", "publisher", "words_count", "created_at"]]

print("Après renommage :", data_articles.shape)
data_articles.head()
"""

'# 2.2. Renommage et transformation de colonnes\ndata_articles = data_articles.rename(columns={\n    "category_id": "category",\n    "publisher_id": "publisher",\n    "created_at_ts": "created_at"\n})d\n\n# Convertir created_at en datetime\ndata_articles["created_at"] = pd.to_datetime(data_articles["created_at"], unit="s", origin="unix")\n\n# Garder uniquement les colonnes utiles\ndata_articles = data_articles[["article_id", "category", "publisher", "words_count", "created_at"]]\n\nprint("Après renommage :", data_articles.shape)\ndata_articles.head()\n'

In [129]:
# Embedding reduction
# 3.1. Load the pickled embeddings (numpy ndarray)
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("data/archive/articles_embeddings.pickle", "rb") as f_in:
    embeddings = pickle.load(f_in)

# The reduction must happen BEFORE the consistency check, so that the reduced
# matrix is the one compared against data_articles.

from sklearn.decomposition import PCA

N_PCA_COMPONENTS = 25  # dimensionality of the reduced embeddings
# Fixed seed: depending on the solver scikit-learn selects, PCA can be randomized.
# Pinning random_state keeps new_embed (and the index and pickles derived from it)
# reproducible across "Restart & Run All".
PCA_RANDOM_STATE = 42

pca = PCA(n_components=N_PCA_COMPONENTS, random_state=PCA_RANDOM_STATE)
new_embed = pca.fit_transform(embeddings)
# Share of variance captured by each retained component
print(pca.explained_variance_ratio_)

[0.09248701 0.07410172 0.06653015 0.04882732 0.04229694 0.03987891
 0.03714959 0.03283883 0.03228257 0.02882201 0.02742614 0.02508928
 0.02424399 0.02221346 0.02075701 0.02004806 0.01905697 0.01874528
 0.01748906 0.01640469 0.01548621 0.0151256  0.01399427 0.013067
 0.01237998]


In [130]:
"""# Embedding checker
# 3.1. Charger le pickle embeddings (numpy ndarray)
with open("data/archive/articles_embeddings.pickle", "rb") as f_in:
    embeddings = pickle.load(f_in)"""

# 3.2. Inspect the reduced embedding matrix (output of the PCA step)
print("Type embeddings :", type(new_embed))
print("Shape embeddings :", new_embed.shape)
# new_embed is expected to have one row per article and one column per PCA component

# 3.3. Check that the reduced embeddings and data_articles have the same number of rows.
#      Rows are ASSUMED to be in the same order as the rows of data_articles;
#      only the row count can be verified here.
n_data_articles = data_articles.shape[0]
# Fix: count the rows of new_embed — the matrix that is actually indexed and used to
# build user profiles — rather than the raw `embeddings` array.
n_emb  = new_embed.shape[0]
print(f"Articles dans data_articles : {n_data_articles}, Lignes embeddings : {n_emb}")

if n_data_articles != n_emb:
    raise ValueError("Le nombre de lignes dans data_articles et dans embeddings ne correspond pas !"
                     " VÉRIFIE L’ORDRE DES ARTICLES.")


Type embeddings : <class 'numpy.ndarray'>
Shape embeddings : (364047, 25)
Articles dans data_articles : 364047, Lignes embeddings : 364047


In [131]:
import time

In [148]:
# 4.1. Instantiate NearestNeighbors
# CF_RADIUS is set to the total number of articles, so by default a query returns
# every article as a candidate (full scan) before the CF/CB blending.
CF_RADIUS = n_data_articles  # number of neighbours to re-score for the blending
new_nn_index = NearestNeighbors(n_neighbors=CF_RADIUS, metric="cosine", algorithm="auto")

# 4.2. Fit the index on the whole (PCA-reduced) embedding matrix
# NOTE(review): the recorded fit time is ~0.01 s, which suggests no tree is built
# (presumably a brute-force index for the cosine metric) — the real cost is paid at query time.
t0 = time.time()
new_nn_index.fit(new_embed)# fitted on the reduced matrix (previously: embeddings)
t1 = time.time()-t0
print("Index CB prêt (NearestNeighbors).")
print("trained in : ", t1, " secs")

Index CB prêt (NearestNeighbors).
trained in :  0.011813163757324219  secs


In [150]:
# Display the index's parameters (metric, n_neighbors, ...) as a sanity check.
new_nn_index.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'cosine',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 364047,
 'p': 2,
 'radius': 1.0}

In [134]:
# 5.1. Load the pickled Surprise model (the collaborative-filtering algorithm)
CF_MODEL_PATH = os.path.join("models_in_progress", "cf_model.pkl")

# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(CF_MODEL_PATH, "rb") as f_in:
    cf_algo = pickle.load(f_in)

# 5.2. Check the type of the loaded object; downstream code calls cf_algo.predict(...)
print("Type cf_algo :", type(cf_algo))
# Expected: cf_algo.__class__ is surprise.prediction_algorithms.matrix_factorization.SVD

Type cf_algo : <class 'surprise.prediction_algorithms.matrix_factorization.SVD'>


I don't need to reload the dataset for CF, since the model trained on it is already pickled.

I can move on to rebuilding my functions.

In [168]:
# Save the PCA-reduced embedding matrix (new_embed) to new_embed.pkl
# in the current working directory, using the highest pickle protocol.
with open("new_embed.pkl", "wb") as f:
    pickle.dump(new_embed, f, protocol=pickle.HIGHEST_PROTOCOL)

In [152]:
# Save the NearestNeighbors index fitted on new_embed to new_nn_index.pkl
# in the current working directory, using the highest pickle protocol.
with open("new_nn_index.pkl", "wb") as f:
    pickle.dump(new_nn_index, f, protocol=pickle.HIGHEST_PROTOCOL)

In [136]:
"""
#re-pickle files
with open("articles_embeddings.pickle", "wb") as f:
    pickle.dump(embeddings, f, protocol=pickle.HIGHEST_PROTOCOL)

with open("nn_index.pkl", "wb") as f:
    pickle.dump(nn_index, f, protocol=pickle.HIGHEST_PROTOCOL)

with open("cf_model.pkl", "wb") as f:
    pickle.dump(cf_algo, f, protocol=pickle.HIGHEST_PROTOCOL)
"""

"""
#check if files are pickled properly
import binascii

with open(CF_MODEL_PATH, "rb") as f:
    head_model = f.read(8)
#print(CF_MODEL_PATH, binascii.hexlify(head_model).decode())

with open("data/archive/articles_embeddings.pickle", "rb") as f:
    head_embedd = f.read(8)
#print(binascii.hexlify(head_embedd).decode())

NN_INDEX_PATH = os.getcwd()
PARENT = os.path.abspath(os.path.join(NN_INDEX_PATH, os.pardir))
PICKLE = os.path.join(PARENT, "pickles")
print(PICKLE)

path_to_nn = os.path.join(PICKLE, "nn_index.pkl")
with open(path_to_nn, "rb") as f:
    head_index = f.read(8)
#print(binascii.hexlify(head_index).decode())

print("model:", binascii.hexlify(head_model).decode(), "\n\n",
     "embedding", binascii.hexlify(head_embedd).decode(), "\n\n",
     "index:", binascii.hexlify(head_index).decode() )
     
"""

'\n#check if files are pickled properly\nimport binascii\n\nwith open(CF_MODEL_PATH, "rb") as f:\n    head_model = f.read(8)\n#print(CF_MODEL_PATH, binascii.hexlify(head_model).decode())\n\nwith open("data/archive/articles_embeddings.pickle", "rb") as f:\n    head_embedd = f.read(8)\n#print(binascii.hexlify(head_embedd).decode())\n\nNN_INDEX_PATH = os.getcwd()\nPARENT = os.path.abspath(os.path.join(NN_INDEX_PATH, os.pardir))\nPICKLE = os.path.join(PARENT, "pickles")\nprint(PICKLE)\n\npath_to_nn = os.path.join(PICKLE, "nn_index.pkl")\nwith open(path_to_nn, "rb") as f:\n    head_index = f.read(8)\n#print(binascii.hexlify(head_index).decode())\n\nprint("model:", binascii.hexlify(head_model).decode(), "\n\n",\n     "embedding", binascii.hexlify(head_embedd).decode(), "\n\n",\n     "index:", binascii.hexlify(head_index).decode() )\n     \n'

In [137]:
def build_user_profile(user_clicks, articles=None, vectors=None):
    """
    Build a content-based user profile: the mean embedding of the clicked articles.

    Parameters
    ----------
    user_clicks : list of article_id already seen (e.g. [157541, 280367, 71301]).
        Repeated ids count once.
    articles : optional DataFrame with an "article_id" column whose ROW ORDER
        matches the rows of `vectors`. Defaults to the notebook-level `data_articles`.
    vectors : optional 2-D array, one embedding row per article.
        Defaults to the notebook-level `new_embed`.

    Returns
    -------
    1-D numpy array of length vectors.shape[1]. A zero vector is returned when
    there are no clicks (cold-start user) or when none of the clicked ids is
    present in `articles`.
    """
    if vectors is None:
        vectors = new_embed
    n_dims = vectors.shape[1]

    if len(user_clicks) == 0:
        # Cold-start user: zero vector
        return np.zeros(n_dims)

    if articles is None:
        articles = data_articles

    # Use row POSITIONS (not index labels) of the clicked articles: the embedding
    # matrix is addressed by position, so this stays correct even if the frame's
    # index is not a fresh 0..n-1 range.
    clicked_mask = articles["article_id"].isin(user_clicks).to_numpy()
    row_positions = np.flatnonzero(clicked_mask)
    if row_positions.size == 0:
        # No match (the user clicked on articles that are not in `articles`)
        return np.zeros(n_dims)

    return vectors[row_positions].mean(axis=0)

In [138]:
def build_user_profile2(user_clicks, articles=None, vectors=None):
    """
    Build a content-based user profile from the slim id-only lookup table:
    the mean embedding of the clicked articles.

    Parameters
    ----------
    user_clicks : list of article_id already seen (e.g. [157541, 280367, 71301]).
        Repeated ids count once.
    articles : optional DataFrame with an "article_id" column whose ROW ORDER
        matches the rows of `vectors`. Defaults to the notebook-level
        `data_articles_shorter`.
    vectors : optional 2-D array, one embedding row per article.
        Defaults to the notebook-level `new_embed`.

    Returns
    -------
    1-D numpy array of length vectors.shape[1]. A zero vector is returned when
    there are no clicks (cold-start user) or when none of the clicked ids is
    present in `articles`.
    """
    if vectors is None:
        vectors = new_embed
    n_dims = vectors.shape[1]

    if len(user_clicks) == 0:
        # Cold-start user: zero vector
        return np.zeros(n_dims)

    if articles is None:
        articles = data_articles_shorter

    # Select by row POSITION rather than by index label: the embedding matrix is
    # positional, so label-based lookup is only right for a fresh 0..n-1 index.
    clicked_mask = articles["article_id"].isin(user_clicks).to_numpy()
    row_positions = np.flatnonzero(clicked_mask)
    if row_positions.size == 0:
        # No match (the user clicked on articles that are not in `articles`)
        return np.zeros(n_dims)

    return vectors[row_positions].mean(axis=0)



In [139]:
def normalize_minmax(array):
    """
    Rescale `array` into [0, 1] with plain min-max scaling.

    Parameters
    ----------
    array : numpy array (or any object exposing .min()/.max() and arithmetic);
        plain sequences such as lists are converted with np.asarray.

    Returns
    -------
    - (array - min) / (max - min) when max > min;
    - a constant array of 0.5 (float) when all values are equal;
    - an empty float array when the input is empty (previously a ValueError,
      since min/max of a zero-size array is undefined).
    """
    # Accept plain Python sequences; leave array-likes that already have .min() untouched
    values = array if hasattr(array, "min") else np.asarray(array)

    if np.size(values) == 0:
        return np.asarray(values, dtype=float)

    mn = values.min()
    mx = values.max()
    if mx > mn:
        return (values - mn) / (mx - mn)
    # Degenerate range: every score is identical, so none should be favoured
    return np.full_like(values, 0.5, dtype=float)



In [140]:
def score_cf_for_candidates(user_id, candidate_ids):
    """
    Raw collaborative-filtering scores of one user for a list of candidate articles.

    user_id : int
    candidate_ids : list of int (article_id)
    Returns a numpy array holding cf_algo.predict(user_id, iid).est for each
    candidate, in the same order as candidate_ids.
    """
    # r_ui is left as None because the true rating is unknown
    estimates = [
        cf_algo.predict(uid=user_id, iid=article_id, r_ui=None, verbose=False).est
        for article_id in candidate_ids
    ]
    return np.array(estimates)



In [141]:
"""
with open("models_in_progress/nn_index.pkl", 'rb') as file:
        nn = pickle.load(file)
        print("Data loaded successfully!")
        print(nn)
"""

Data loaded successfully!
NearestNeighbors(metric='cosine', n_neighbors=10)


In [164]:
def recommend_hybrid(user_id, 
                     user_clicks, 
                     new_nn_index, 
                     k=10, 
                     alpha=0.5, 
                     total_candidates=CF_RADIUS):
    """
    Return a pandas DataFrame with the k articles recommended for user_id,
    blending a content-based score (cosine similarity to the user's profile)
    with a collaborative-filtering score (prediction of the pickled CF model).

    Parameters
    ----------
    user_id          : int
    user_clicks      : list of article_id already clicked
    new_nn_index     : fitted NearestNeighbors index (cosine metric) over the
                       article embeddings
    k                : number of articles to return
    alpha            : weight of the CF score (0 <= alpha <= 1),
                       e.g. 0.5 for 50% CF / 50% CB
    total_candidates : size of the initial content-based candidate pool; it is
                       capped at the number of samples the index was fitted on

    Returns
    -------
    DataFrame with columns [article_id, score_hybrid], sorted by decreasing
    score_hybrid, with at most k rows.

    Notes
    -----
    - score_cf is min-max normalised over the candidate pool; score_cb is the raw
      cosine similarity (1 - cosine distance) and is not rescaled.
    - Articles present in user_clicks are NOT filtered out of the result.
    """
    # ----- 1. Content-based profile of the user -----
    # Same dimensionality as the embeddings used to build the profile.
    user_vec = np.asarray(build_user_profile2(user_clicks))

    # ----- 2. Initial candidate pool via nearest neighbours on the embeddings -----
    # kneighbors raises if asked for more neighbours than fitted samples, so cap the request.
    n_indexed = getattr(new_nn_index, "n_samples_fit_", None)
    n_candidates = total_candidates if n_indexed is None else min(total_candidates, n_indexed)

    distances, indices = new_nn_index.kneighbors(user_vec.reshape(1, -1), n_neighbors=n_candidates)
    cand_idxs = indices[0]                # row positions in data_articles_shorter / embeddings
    sims_cb = 1.0 - distances[0]          # cosine similarity = 1 - cosine distance

    # Article ids of the candidates
    candidate_ids = data_articles_shorter.iloc[cand_idxs]["article_id"].tolist()

    # ----- 3. Raw candidate table -----
    df_cand = pd.DataFrame({
        "article_id": candidate_ids,
        "score_cb": sims_cb
    })

    # ----- 4. Raw CF score (cf_algo.predict) -----
    # (The former debug prints were removed: indexing candidate_ids[2] raised
    #  IndexError whenever fewer than three candidates were requested.)
    df_cand["score_cf_raw"] = score_cf_for_candidates(user_id, candidate_ids)

    # ----- 5. Normalise the CF score into [0, 1] -----
    df_cand["score_cf"] = normalize_minmax(df_cand["score_cf_raw"].values)

    # ----- 6. Hybrid score -----
    df_cand["score_hybrid"] = alpha * df_cand["score_cf"] + (1 - alpha) * df_cand["score_cb"]

    # ----- 7. Sort by score_hybrid and keep the top-k -----
    # The left merge currently adds no column (the lookup table only holds
    # article_id); it is kept as the place to join article metadata.
    topk = (
        df_cand
        .sort_values("score_hybrid", ascending=False)
        .head(k)
        .merge(
            data_articles_shorter,
            on="article_id",
            how="left"
        )
    )

    # ----- 8. Columns returned -----
    return topk[["article_id", "score_hybrid"]].reset_index(drop=True)



In [166]:
# Example: user 10 has clicked on the articles below
test_user_id    = 10
test_user_clicks = [101, 204, 305, 408, 509]  # history of clicked article ids

# Call the hybrid recommender over the full catalogue (every article is a candidate)
recs = recommend_hybrid(
    user_id=test_user_id,
    user_clicks=test_user_clicks,
    new_nn_index = new_nn_index,
    k=5,
    alpha=0.6,            # 60% CF / 40% CB
    total_candidates=n_data_articles
)

print("Recommandations hybrides pour user", test_user_id)
recs

user_id type : <class 'int'>
candidate_ids type : <class 'int'>
Recommandations hybrides pour user 10


Unnamed: 0,article_id,score_hybrid
0,1661,0.853442
1,1975,0.849989
2,4866,0.820095
3,984,0.812999
4,3449,0.810835


nb de clics par user sur un article / pondéré par l'activité du user lors de sa session (nb total de clics par session)

par user's session  : \
nb total de click \
nb de click par article

temps passé sur chaque article \
nb de click sur un même lien

pour azure function deploy 

préparer une liste de user_id \
préparer des historiques différents

préparer sur papier les plans d'architecture imaginée/souhaitée/mise en place \
qu'est-ce que je déploie en blob / qu'est-ce que je déploie en Azure Function