In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Load the dataset and count the missing values in each column
#    (this cell only reports the counts; it does not drop or fill anything).
df = pd.read_csv("data/final_data.csv")
df.isna().sum()

title               0
partner             0
rating              0
reviews             0
metadata            0
link                0
category            0
scraped_at          0
source_domain       0
num_ratings         0
duration_hours      0
level_enc           0
title_clean         0
popularity_score    0
id                  0
dtype: int64

In [42]:
def recommend_courses(
    course_title,
    df,
    cosine_sim,
    top_n=10
):
    """Recommend courses similar to an exactly-titled course, re-ranked by popularity.

    A pool of the most similar courses is selected first, then ordered by
    ``0.7 * similarity + 0.3 * normalised popularity``.

    Parameters
    ----------
    course_title : str
        Title to look up; must equal a value of ``df["title"]`` exactly.
        When several rows share the title, the first one is used.
    df : pandas.DataFrame
        Course table. Needs the ``title`` and ``popularity_score`` columns plus
        every column listed in the return value.
    cosine_sim : array-like of shape (len(df), len(df))
        Pairwise similarity matrix. ``cosine_sim[i][j]`` is read as the
        similarity between the rows at *positions* ``i`` and ``j`` of ``df``.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        Up to ``top_n`` rows sorted by descending ``final_score``, or the
        string ``"❌ Course not found"`` when no row has that title.
    """
    title_matches = (df["title"] == course_title).to_numpy()
    if not title_matches.any():
        return "❌ Course not found"

    # Work with the row *position*: cosine_sim and df.iloc are positional, while
    # df.index labels are not guaranteed to be 0..n-1 (e.g. after filtering).
    query_pos = int(title_matches.argmax())

    # Exclude the query course by position instead of assuming it sorts first;
    # a duplicate course with similarity 1.0 could otherwise take its place.
    ranked = sorted(
        (
            (pos, score)
            for pos, score in enumerate(cosine_sim[query_pos])
            if pos != query_pos
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Keep a pool larger than top_n so that popularity can promote a course
    # that pure similarity would have ranked just outside the top_n.
    ranked = ranked[:top_n + 9]

    course_positions = [pos for pos, _ in ranked]
    recs = df.iloc[course_positions].copy()

    # A plain list cannot be multiplied by a float; use a Series aligned on recs.
    similarity = pd.Series(
        [score for _, score in ranked], index=recs.index, dtype="float64"
    )

    popularity = recs["popularity_score"]
    max_popularity = popularity.max()
    if max_popularity > 0:
        popularity_norm = popularity / max_popularity
    else:
        # No positive popularity in the pool (or empty pool): contribute 0
        # rather than producing NaN/inf from a division by zero.
        popularity_norm = popularity * 0.0

    # Final score = similarity + popularity
    recs["final_score"] = (0.7 * similarity + 0.3 * popularity_norm).to_numpy()

    return recs.sort_values(
        by="final_score",
        ascending=False
    ).head(top_n)[
        ["title", "category", "rating", "num_ratings",
         "duration_hours", "source_domain", "final_score"]
    ]


In [45]:
def recommend_courses(
    course_query,
    df,
    cosine_sim,
    top_n=10
):
    """Recommend courses similar to the first course whose title contains a query.

    The query is matched as a case-insensitive literal substring of
    ``df["title"]``. A pool of the most similar courses to the first match is
    selected, then ordered by ``0.7 * similarity + 0.3 * normalised popularity``.

    Parameters
    ----------
    course_query : str
        Text to search for in the titles. It is treated literally, so
        characters such as ``+``, ``(`` or ``.`` have no special meaning.
    df : pandas.DataFrame
        Course table. Needs the ``title`` and ``popularity_score`` columns plus
        every column listed in the return value.
    cosine_sim : array-like of shape (len(df), len(df))
        Pairwise similarity matrix. ``cosine_sim[i][j]`` is read as the
        similarity between the rows at *positions* ``i`` and ``j`` of ``df``.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        Up to ``top_n`` rows sorted by descending ``final_score``, or the
        string ``"❌ Course not found"`` when no title contains the query.
    """
    # regex=False: a query such as "C++" is not a valid regular expression and
    # would otherwise raise instead of matching the title.
    title_matches = df["title"].str.contains(
        course_query, case=False, na=False, regex=False
    ).to_numpy()

    if not title_matches.any():
        return "❌ Course not found"

    # Work with the row *position*: cosine_sim and df.iloc are positional, while
    # df.index labels are not guaranteed to be 0..n-1 (e.g. after filtering).
    query_pos = int(title_matches.argmax())

    # Exclude the query course by position instead of assuming it sorts first;
    # a duplicate course with similarity 1.0 could otherwise take its place.
    ranked = sorted(
        (
            (pos, score)
            for pos, score in enumerate(cosine_sim[query_pos])
            if pos != query_pos
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Keep a pool larger than top_n so that popularity can promote a course
    # that pure similarity would have ranked just outside the top_n.
    ranked = ranked[:top_n + 9]

    course_positions = [pos for pos, _ in ranked]
    recs = df.iloc[course_positions].copy()

    # A plain list cannot be multiplied by a float; use a Series aligned on recs.
    similarity = pd.Series(
        [score for _, score in ranked], index=recs.index, dtype="float64"
    )
    # The small epsilon keeps the division defined when the pool's maximum
    # popularity is 0.
    popularity_norm = recs["popularity_score"] / (
        recs["popularity_score"].max() + 1e-6
    )

    recs["final_score"] = (0.7 * similarity + 0.3 * popularity_norm).to_numpy()

    return recs.sort_values(
        by="final_score",
        ascending=False
    ).head(top_n)[
        ["title", "category", "rating", "num_ratings",
         "duration_hours", "source_domain", "final_score"]
    ]


In [None]:
# Example query: ask for the 5 best recommendations for one course title.
# NOTE(review): `cosine_sim` is not defined in any cell visible here — it must
# already exist in the kernel (presumably a pairwise similarity matrix built
# from `df` with the imported TfidfVectorizer / cosine_similarity; confirm).
recommend_courses(
    "Hands-On Machine Learning Project: Build, Train and Deploy Models",
    df,
    cosine_sim,
    top_n=5
)
