## DÉPENDANCES À EXÉCUTER ##

In [3]:
import scipy.sparse as sp
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight
from typing import Any, Dict, List, Optional

class CFModel:
    """Collaborative-filtering recommender wrapping implicit's ALS model.

    The interaction matrix is BM25-weighted before training. The raw
    (unweighted) user x item matrix is kept on the instance so that
    ``recommend`` can filter out items the user already interacted with.

    The code targets the implicit >= 0.5 API: ``fit`` receives a
    user x item matrix and ``recommend`` returns an ``(ids, scores)`` pair.

    NOTE: a model that was fitted on the item x user matrix instead has its
    ``user_factors`` / ``item_factors`` swapped; such a model must be
    retrained before ``recommend`` returns meaningful results.
    """

    def __init__(self, factors: int = 128, regularization: float = 0.01, iterations: int = 40, alpha: float = 40.0, K1: float = 100, B: float = 0.8):
        """Store the hyper-parameters and build an untrained ALS model.

        Args:
            factors: Number of latent factors passed to ALS.
            regularization: Regularization strength passed to ALS.
            iterations: Number of ALS iterations.
            alpha: Stored on the instance; no method of this class reads it.
            K1: BM25 ``K1`` parameter used by ``fit``.
            B: BM25 ``B`` parameter used by ``fit``.
        """
        self.factors = factors
        self.reg = regularization
        self.iter = iterations
        self.alpha = alpha
        self.K1 = K1
        self.B = B
        self.model = AlternatingLeastSquares(factors=self.factors, regularization=self.reg, iterations=self.iter)
        # Raw user x item interactions; populated by fit().
        self.user_items: Optional[sp.csr_matrix] = None

    def fit(self, interaction_matrix: sp.csr_matrix):
        """Train the ALS model on a BM25-weighted copy of the interactions.

        Args:
            interaction_matrix: Sparse user x item matrix (rows are users).
                The unweighted CSR version is stored in ``self.user_items``.
        """
        self.user_items = interaction_matrix.tocsr()
        # BM25 is applied row-wise on the item x user view, then the result is
        # transposed back: implicit >= 0.5 trains on user x item, so that
        # model.user_factors really holds one row per user.
        weighted_item_user = bm25_weight(self.user_items.T, K1=self.K1, B=self.B)
        self.model.fit(weighted_item_user.T.tocsr())

    def recommend(self, user_id: Any, user_map: Dict[Any, int], video_map: Dict[Any, int], interaction_matrix: Optional[sp.csr_matrix] = None,N: int = 10) -> List[Any]:
        """Return up to ``N`` video ids recommended for ``user_id``.

        Args:
            user_id: External user identifier.
            user_map: Maps external user ids to row indices.
            video_map: Maps external video ids to column indices.
            interaction_matrix: Optional user x item matrix used to filter
                already-seen items; defaults to the matrix stored by ``fit``.
            N: Maximum number of recommendations.

        Returns:
            Video ids in the order returned by the model, or an empty list
            when the user is unknown or no interaction matrix is available.
        """
        uidx = user_map.get(user_id)
        if uidx is None:
            return []
        if interaction_matrix is not None:
            user_items = interaction_matrix.tocsr()
        elif self.user_items is not None:
            user_items = self.user_items
        else:
            return []
        # implicit >= 0.5 expects exactly one row of user_items per requested
        # user id; handing it the whole matrix raises a ValueError when
        # filter_already_liked_items is enabled.
        ids, _scores = self.model.recommend(
            uidx,
            user_items[uidx],
            N=N,
            filter_already_liked_items=True,
        )
        inv_video_map = {v: k for k, v in video_map.items()}
        return [inv_video_map[i] for i in ids]


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sp
from scipy import sparse
import joblib

class ContentModel:
    """Content-based recommender using TF-IDF features of video metadata.

    ``fit`` builds one TF-IDF row per video (aligned with ``video_map``) and
    one profile per user (interaction-weighted mean of the TF-IDF rows of the
    videos in that user's row of the interaction matrix). ``recommend`` ranks
    videos by cosine similarity between a user profile and the video rows.
    """

    def __init__(self, max_features: int = 10000, ngram_range=(1, 2), stop_words="english"):
        """Store the vectorizer settings and create an unfitted vectorizer.

        Args:
            max_features: Vocabulary size limit for the TF-IDF vectorizer.
            ngram_range: N-gram range for the TF-IDF vectorizer.
            stop_words: Stop-word setting for the TF-IDF vectorizer.
        """
        self.max_features = max_features
        self.ngram_range = ngram_range
        self.stop_words = stop_words
        self.tfidf = TfidfVectorizer(max_features=self.max_features, ngram_range=self.ngram_range, stop_words=self.stop_words)
        # All of the following are populated by fit() (or lazily from disk by
        # recommend()).
        self.video_ids = None
        self.tfidf_matrix = None
        self.user_profiles = None
        self.user_map = None
        self.vid_map = None

    def fit(self, metadata_df: pd.DataFrame, interaction_matrix: sp.csr_matrix, user_map: dict, video_map: dict, text_field: str = "feat"):
        """Fit the vectorizer, build user profiles and persist everything.

        Args:
            metadata_df: One row per video; must contain a ``video_id`` column
                and the ``text_field`` column.
            interaction_matrix: Sparse user x video matrix whose columns
                follow ``video_map``.
            user_map: Maps external user ids to row indices.
            video_map: Maps external video ids to column indices.
            text_field: Metadata column holding the text (or token list).

        Raises:
            KeyError: If a video of ``video_map`` has no row in
                ``metadata_df``.
        """
        def to_text(x):
            # Token sequences are joined into one string; missing values
            # become an empty document.
            if isinstance(x, (list, tuple, np.ndarray)):
                return " ".join(str(tok) for tok in x)
            if pd.isna(x):
                return ""
            return str(x)
        corpus = metadata_df[text_field].apply(to_text).tolist()
        tfidf_full = self.tfidf.fit_transform(corpus)
        all_video_ids = metadata_df["video_id"].tolist()
        # Re-order the TF-IDF rows so that row i describes the video whose
        # index in video_map is i.
        ordered_videos = [None] * len(video_map)
        for vid, idx in video_map.items():
            ordered_videos[idx] = vid
        id2row = {v: i for i, v in enumerate(all_video_ids)}
        rows = [id2row[vid] for vid in ordered_videos]
        tfidf_aligned = tfidf_full[rows, :]
        self.tfidf_matrix = tfidf_aligned
        self.video_ids = ordered_videos
        # Row-normalise the interactions so each profile is a weighted mean;
        # the epsilon avoids a division by zero for users with no interaction.
        um = interaction_matrix.astype("float32")
        row_sums = np.array(um.sum(axis=1)).flatten() + 1e-9
        um = um.multiply(1.0 / row_sums[:, None])
        self.user_profiles = um.dot(self.tfidf_matrix).toarray()
        # The artifacts are written below; make sure the target folder exists
        # instead of failing on a fresh checkout.
        os.makedirs("models", exist_ok=True)
        sparse.save_npz("models/tfidf_matrix.npz", self.tfidf_matrix)
        joblib.dump(self.tfidf, "models/tfidf_vectorizer.pkl")
        joblib.dump(self.user_profiles, "models/user_profiles.npy")
        joblib.dump(user_map, "models/user_map_content.pkl")
        joblib.dump(video_map, "models/video_map_content.pkl")
        self.user_map = user_map
        self.vid_map = video_map
        print("ContentModel: models and profiles saved under models/")

    def recommend(self, user_id, N: int = 10) -> list:
        """Return up to ``N`` video ids most similar to the user's profile.

        When the instance holds no profiles, the artifacts written by ``fit``
        are loaded from ``models/`` first.

        Args:
            user_id: External user identifier.
            N: Maximum number of recommendations; values larger than the
                number of videos are clamped.

        Returns:
            Video ids sorted by decreasing cosine similarity, or an empty
            list for an unknown user or a non-positive ``N``.
        """
        if self.user_profiles is None:
            self.tfidf_matrix = sparse.load_npz("models/tfidf_matrix.npz")
            self.user_profiles = joblib.load("models/user_profiles.npy")
            self.tfidf = joblib.load("models/tfidf_vectorizer.pkl")
            self.user_map = joblib.load("models/user_map_content.pkl")
            self.vid_map = joblib.load("models/video_map_content.pkl")
        inv_vid_map = {v: k for k, v in self.vid_map.items()}
        uidx = self.user_map.get(user_id)
        if uidx is None:
            return []
        profile = self.user_profiles[uidx].reshape(1, -1)
        sims = cosine_similarity(profile, self.tfidf_matrix).flatten()
        n_items = sims.shape[0]
        k = min(N, n_items)
        if k <= 0:
            return []
        if k < n_items:
            best = np.argpartition(-sims, k)[:k]
        else:
            # argpartition rejects kth == n_items; every video is a candidate
            # here, so a plain sort below is enough.
            best = np.arange(n_items)
        best = best[np.argsort(-sims[best])]
        return [inv_vid_map[i] for i in best]


# 04 Génération des recommandations
Ce notebook utilise les modèles entraînés précédemment (CF, Content-Based et Hybrid) pour produire des listes de recommandations sur le jeu de test.

**Étapes principales :**
1. Chargement des données de test (`small_matrix.csv`).
2. Chargement des modèles et des mappings.
3. Génération des recommandations pour chaque utilisateur :
   - **CF-only** via `CF_model.pkl`
   - **Content-only** via `Content-Based_model.pkl`
   - **Hybrid + Popularité** (combinaison pondérée CF & CB + fallback populaire)
4. Export des résultats au format CSV (`submission_cf.csv`, `submission_content.csv`, `submission_hybrid.csv`).

## 1. Préliminaires : imports et chargements

On importe les librairies nécessaires et on charge le jeu de test ainsi que les mappings.

In [None]:
import pandas as pd
import joblib
import scipy.sparse as sp
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the test split; it is only used here to list the users to score.
test = pd.read_csv("../data/small_matrix.csv")
users = pd.unique(test["user_id"])

# Load the collaborative-filtering artifacts
def load_cf_artifacts(
    model_path="models/CF_model.pkl",
    user_map_path="features/user_map.pkl",
    video_map_path="features/video_map.pkl",
    matrix_path="features/interaction_matrix.npz",
):
    """Load the CF model, its id mappings and the interaction matrix.

    The defaults point at the locations used by this notebook; pass other
    paths to load artifacts stored elsewhere.

    Args:
        model_path: Pickled CF model.
        user_map_path: Pickled user-id mapping.
        video_map_path: Pickled video-id mapping.
        matrix_path: Sparse interaction matrix saved in ``.npz`` format.

    Returns:
        A ``(model, user_map, video_map, interaction_matrix)`` tuple; the
        matrix is converted to CSR.
    """
    # NOTE: joblib.load unpickles arbitrary objects - only load trusted files.
    model = joblib.load(model_path)
    user_map = joblib.load(user_map_path)
    video_map = joblib.load(video_map_path)
    interaction_matrix = sp.load_npz(matrix_path).tocsr()
    return model, user_map, video_map, interaction_matrix

# Load the content-based artifacts
def load_cb_artifacts(model_path="models/Content-Based_model.pkl"):
    """Load and return the pickled content-based model.

    Args:
        model_path: Location of the pickled model; defaults to the path used
            by this notebook.
    """
    # NOTE: joblib.load unpickles arbitrary objects - only load trusted files.
    return joblib.load(model_path)

# Popularity fallback

def load_popular_list(big_csv="../data/big_matrix.csv"):
    """
    Build the list of video ids sorted by popularity.

    Popularity is the number of rows (interactions) in which a video id
    appears in the ``video_id`` column of ``big_csv``; the most frequent
    video comes first.
    """
    interactions = pd.read_csv(big_csv, usecols=["video_id"])
    counts_by_video = interactions["video_id"].value_counts()
    ranked_ids = counts_by_video.index
    return ranked_ids.tolist()

## 2. Génération CF-only
Pour chaque utilisateur :
- Récupérer l'index via `user_map`
- Appeler `model.recommend(...)`
- Stocker la liste de `video_id`, le rang et le score

In [9]:
cf_model, cf_user_map, cf_video_map, cf_matrix = load_cf_artifacts()
inv_cf_map = {v: k for k, v in cf_video_map.items()}

# load_cf_artifacts() already returns a CSR matrix.
user_items = cf_matrix
inv_video_map = inv_cf_map
recs = []
# Accept either a wrapper exposing the ALS model as `.model` or the bare model.
als = getattr(cf_model, "model", cf_model)

# Depending on the matrix orientation the model was trained on, the per-user
# vectors live in `user_factors` (fitted on user x item) or in `item_factors`
# (fitted on item x user). Calling als.recommend() on the latter would return
# user indices that get mapped as video ids, so the factors are resolved by
# row count and scored directly. When both shapes match (square matrix) the
# standard layout is assumed.
cf_user_vecs, cf_item_vecs = als.user_factors, als.item_factors
n_users_cf = user_items.shape[0]
if cf_user_vecs.shape[0] != n_users_cf and cf_item_vecs.shape[0] == n_users_cf:
    cf_user_vecs, cf_item_vecs = cf_item_vecs, cf_user_vecs

TOP_N = 10
for u in users:
    uidx = cf_user_map.get(u)
    if uidx is None:
        continue
    # Plain dot-product scores; already-seen videos are NOT filtered out.
    user_scores = cf_item_vecs.dot(cf_user_vecs[uidx])
    k = min(TOP_N, user_scores.shape[0])
    if k == 0:
        continue
    ids = np.argpartition(-user_scores, k - 1)[:k]
    ids = ids[np.argsort(-user_scores[ids])]
    for rank, vidx in enumerate(ids, start=1):
        recs.append({
            'user_id': u,
            'video_id': inv_video_map[vidx],
            'rank': rank,
            'score': user_scores[vidx]
        })
# Explicit columns keep the CSV header even when no user could be scored.
pd.DataFrame(recs, columns=['user_id', 'video_id', 'rank', 'score']).to_csv("submission_cf.csv", index=False)
print("Submission saved to submission_cf.csv")

Submission saved to submission_cf.csv


## 3. Génération Content-only

Pour chaque utilisateur :
- Appeler `cb_model.recommend(user_id, N)`
- Stocker les résultats

In [10]:
# Content-only recommendations: ask the content-based model for the top 10
# videos of every test user and store one row per (user, rank).
cb_model = load_cb_artifacts()

cb_recs = []
for u in users:
    recs = cb_model.recommend(u, N=10)
    cb_recs.extend(
        {'user_id': u, 'video_id': vid, 'rank': rank}
        for rank, vid in enumerate(recs, start=1)
    )

cb_frame = pd.DataFrame(cb_recs)
cb_frame.to_csv("submission_content.csv", index=False)
print("Content-only recommendations saved to submission_content.csv")

Content-only recommendations saved to submission_content.csv


## 4. Génération Hybrid + Popularité

On combine les scores CF et CB avec un poids `alpha` puis on complète par une liste de fallback basée sur la popularité.

In [13]:
cf_model = joblib.load("models/CF_model.pkl")
cb_model = joblib.load("models/Content-Based_model.pkl")
user_items_cf = cf_model.user_items
user_map_cf  = joblib.load("features/user_map.pkl")
video_map_cf = joblib.load("features/video_map.pkl")
inv_video_map = {col: vid for vid, col in video_map_cf.items()}
pop_list = load_popular_list()
N     = 10      # length of the final list per user
alpha = 0.7     # weight of the CF score; the CB score gets (1 - alpha)
CF_K  = (5 * N)  # number of CF candidates kept per user
CB_K  = (5 * N)  # number of CB candidates kept per user

# Depending on the matrix orientation the ALS model was trained on, the
# per-user vectors live in `item_factors` (fitted on item x user) or in
# `user_factors` (fitted on user x item). Start from the former layout and
# switch only when the row counts show the model uses the latter.
user_factors = cf_model.model.item_factors
item_factors = cf_model.model.user_factors
n_users_cf = user_items_cf.shape[0]
if user_factors.shape[0] != n_users_cf and item_factors.shape[0] == n_users_cf:
    user_factors, item_factors = item_factors, user_factors


def _top_indices(values, k):
    """Return the indices of the ``k`` largest finite entries, best first."""
    # Masked (-inf) entries are dropped, and k is clamped implicitly, so this
    # never asks argpartition for a position past the end of the array.
    finite = np.flatnonzero(np.isfinite(values))
    if k <= 0 or finite.size == 0:
        return finite[:0]
    if finite.size > k:
        finite = finite[np.argpartition(-values[finite], k - 1)[:k]]
    return finite[np.argsort(-values[finite])]


recs = []
for u in users:
    # video index -> blended score. Assumes cb_model's TF-IDF rows follow the
    # same video indexing as video_map_cf - TODO confirm.
    scores = {}
    if u in user_map_cf:
        uidx = user_map_cf[u]
        sc_cf = item_factors.dot(user_factors[uidx])
        # Mask videos the user already interacted with.
        sc_cf[user_items_cf[uidx].indices] = -np.inf
        for idx in _top_indices(sc_cf, CF_K):
            scores[idx] = scores.get(idx, 0.0) + alpha * sc_cf[idx]
    if u in cb_model.user_map:
        uidx_cb = cb_model.user_map[u]
        up      = cb_model.user_profiles[uidx_cb].reshape(1, -1)
        sc_cb   = cosine_similarity(up, cb_model.tfidf_matrix).flatten()
        for idx in _top_indices(sc_cb, CB_K):
            scores[idx] = scores.get(idx, 0.0) + (1 - alpha) * sc_cb[idx]
    ranked = sorted(scores, key=scores.get, reverse=True)
    vids = [inv_video_map[i] for i in ranked[:N]]
    if len(vids) < N:
        # Complete short lists with the most popular videos not yet present.
        chosen = set(vids)
        for p in pop_list:
            if p not in chosen:
                vids.append(p)
                chosen.add(p)
            if len(vids) == N:
                break
    for rank, vid in enumerate(vids, start=1):
        # Popularity fallbacks have no blended score and are written as 0.0.
        score = scores.get(video_map_cf.get(vid, None), 0.0)
        recs.append({
            "user_id":  u,
            "video_id": vid,
            "rank":     rank,
            "score":    float(score)
        })
# Explicit columns keep the CSV header even when there is nothing to write.
pd.DataFrame(recs, columns=["user_id", "video_id", "rank", "score"]).to_csv("submission_hybrid.csv", index=False)
print("Hybrid+pop submission saved to submission_hybrid.csv")

Hybrid+pop submission saved to submission_hybrid.csv


## Conclusion

Les fichiers CSV générés (`submission_cf.csv`, `submission_content.csv`, `submission_hybrid.csv`) sont utilisés pour l'évaluation dans 05_Evaluate.ipynb.