# Système de Recommandation de Cours
Ce notebook démontre le moteur de recommandation de cours utilisant TF-IDF et la similarité cosinus.

In [26]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the processed course catalogue (path is relative to the notebook's
# working directory) and report how many rows were loaded.
df = pd.read_csv(filepath_or_buffer="../processed_data/final_courses_shuffled.csv")
print("Charg√© {} cours".format(len(df)))
# Trailing expression: the first rows are rendered as the cell output.
df.head()

Charg√© 1855 cours


Unnamed: 0,title,partner,rating,reviews,metadata,link,category,scraped_at,source_domain,num_ratings,duration_hours,level_enc,title_clean,popularity_score,id,price
0,Mastering Artificial Intelligence,Selfcode Academy,4.4,416 reviews,All Levels ¬∑ Course,https://www.udemy.com/course/mastering-artific...,General,2026-02-02T20:24:47.830773,udemy,416,12.5,0,mastering artificial intelligence,26.545579,582,$9.99
1,Capstone Project: Digital Marketing and Growth...,IBM,0.0,0 reviews,Advanced ¬∑ Course ¬∑ 1 - 3 Months,https://www.coursera.org/learn/capstone-projec...,Digital Marketing,2026-01-29T02:34:11.841793,coursera,0,20.0,3,capstone project digital marketing and growth ...,0.0,1244,Inscrivez-vous gratuitement
2,The Art of Visual Storytelling,University of Colorado Boulder,4.6,135 reviews,Beginner ¬∑ Specialization ¬∑ 3 - 6 Months,https://www.coursera.org/specializations/the-a...,Video Editing,2026-01-28T23:15:44.255610,coursera,135,20.0,1,the art of visual storytelling,22.598212,1094,Inscrivez-vous gratuitement
3,Meta Social Media Marketing,Meta,4.8,23K reviews,Beginner ¬∑ Professional Certificate ¬∑ 3 - 6 Mo...,https://www.coursera.org/professional-certific...,Digital Marketing,2026-01-28T23:13:38.659416,coursera,23000,20.0,1,meta social media marketing,48.207806,916,Inscrivez-vous gratuitement
4,Python for OOP - The A to Z OOP Python Program...,Packt,0.0,0 reviews,Intermediate ¬∑ Course ¬∑ 1 - 3 Months,https://www.coursera.org/learn/packt-python-fo...,Data Science,2026-02-02T19:39:49.373141,coursera,0,20.0,2,python for oop the a to z oop python programm...,0.0,70,Inscrivez-vous gratuitement


In [27]:
# Data-quality overview: null count per column, then the frame's shape
# and the list of column names.
print("Valeurs manquantes :", df.isna().sum(), sep="\n")
print("\nTaille du jeu de donn√©es : {}".format(df.shape))
print("\nColonnes : {}".format(list(df.columns)))

Valeurs manquantes :
title               0
partner             0
rating              0
reviews             0
metadata            0
link                0
category            0
scraped_at          0
source_domain       0
num_ratings         0
duration_hours      0
level_enc           0
title_clean         0
popularity_score    0
id                  0
price               0
dtype: int64

Taille du jeu de donn√©es : (1855, 16)

Colonnes : ['title', 'partner', 'rating', 'reviews', 'metadata', 'link', 'category', 'scraped_at', 'source_domain', 'num_ratings', 'duration_hours', 'level_enc', 'title_clean', 'popularity_score', 'id', 'price']


In [28]:
# Build the text that feeds TF-IDF: cleaned title and category joined by a
# single space, lower-cased. Missing values become empty strings first so the
# concatenation never yields NaN.
df['combined_text'] = (
    df['title_clean'].fillna('') + ' ' + df['category'].fillna('')
).str.lower()
print("Exemple de texte combin√© :", df['combined_text'].head(), sep="\n")

Exemple de texte combin√© :
0            mastering artificial intelligence general
1    capstone project digital marketing and growth ...
2         the art of visual storytelling video editing
3        meta social media marketing digital marketing
4    python for oop  the a to z oop python programm...
Name: combined_text, dtype: object


In [29]:
# Vectorise the combined text with TF-IDF:
#   - English stop words removed, unigrams and bigrams kept
#   - min_df=2 / max_df=0.95 bound the document frequency of retained terms
#   - max_features=5000 caps the vocabulary size
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    max_features=5000,
)

tfidf_matrix = tfidf.fit_transform(df['combined_text'])
print("Taille de la matrice TF-IDF : {}".format(tfidf_matrix.shape))
print("Taille du vocabulaire : {}".format(len(tfidf.vocabulary_)))

Taille de la matrice TF-IDF : (1855, 1730)
Taille du vocabulaire : 1730


In [30]:
# Pairwise cosine similarity between all course vectors. With a single
# argument the matrix is compared against itself, giving a square result.
cosine_sim = cosine_similarity(tfidf_matrix)
print("Taille de la matrice de similarit√© : {}".format(cosine_sim.shape))

Taille de la matrice de similarit√© : (1855, 1855)


In [14]:
def recommend_courses(course_query, df, cosine_sim, top_n=10):
    """Recommend courses similar to the first course whose title matches a query.

    The source course is the first row of ``df`` whose ``title`` contains
    ``course_query`` (case-insensitive, literal substring). A pool of its most
    similar courses is then re-ranked by a blended score and the best
    ``top_n`` are returned.

    Parameters
    ----------
    course_query : str
        Text searched for in the ``title`` column. It is matched literally,
        so queries such as ``"C++"`` are safe.
    df : pandas.DataFrame
        Course table. Must have a ``title`` column; row *positions* must line
        up with the rows/columns of ``cosine_sim``. Any index is accepted.
    cosine_sim : array-like, 2-D
        Square similarity matrix; entry ``[i, j]`` is the similarity between
        the courses at positions ``i`` and ``j`` of ``df``.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        The recommendations sorted by ``final_score`` (descending), restricted
        to the display columns that exist in ``df``. When no title matches,
        a "not found" message string is returned instead of a frame.

    Notes
    -----
    ``final_score`` is ``0.7 * similarity + 0.3 * popularity`` where popularity
    is ``popularity_score`` divided by its maximum within the candidate pool;
    when ``df`` has no ``popularity_score`` column it is the similarity alone.
    The name of the matched course is printed as a side effect.
    """
    # Literal match (regex=False): a user query is plain text, and regex
    # metacharacters in it would otherwise raise or match the wrong titles.
    title_hits = df["title"].str.contains(course_query, case=False, na=False, regex=False)
    hit_positions = np.flatnonzero(title_hits.to_numpy())

    if hit_positions.size == 0:
        return "‚ùå Cours non trouv√©"

    # Use the row *position* of the first match: both ``df.iloc`` and the
    # similarity matrix are positional, whereas index labels need not be 0..n-1.
    idx = int(hit_positions[0])
    print(f"Cours trouv√© : {df.iloc[idx]['title']}\n")

    # Rank every course by similarity to the source (stable sort: ties keep
    # their original order).
    similarities = np.asarray(cosine_sim)[idx]
    ranked = np.argsort(-similarities, kind="stable")

    # Drop the source course explicitly rather than assuming it is ranked
    # first: its self-similarity can be 0 (empty TF-IDF row) or tied with an
    # exact duplicate, in which case it is not at position 0.
    ranked = ranked[ranked != idx]

    # Candidate pool: top_n plus a margin of 9 extra courses, so that the
    # popularity re-ranking below has something to choose from.
    candidate_positions = ranked[:max(top_n + 9, 0)]

    recs = df.iloc[candidate_positions].copy()
    similarity_values = similarities[candidate_positions]

    # Final score: 70% similarity + 30% popularity (normalised by the pool
    # maximum; the epsilon avoids a division by zero when all scores are 0).
    if 'popularity_score' in recs.columns:
        popularity_normalized = recs["popularity_score"].values / (recs["popularity_score"].max() + 1e-6)
        recs["final_score"] = 0.7 * similarity_values + 0.3 * popularity_normalized
    else:
        recs["final_score"] = similarity_values

    # Display columns, limited to those actually present in the frame.
    result_cols = ["title", "category", "rating", "num_ratings",
                   "duration_hours", "source_domain", "final_score"]
    result_cols = [col for col in result_cols if col in recs.columns]

    return recs.sort_values(
        by="final_score",
        ascending=False
    ).head(top_n)[result_cols]

In [16]:
# Smoke test of the recommender: print a banner, then run example 1.
print("=" * 80, "RECOMMANDATIONS DE COURS", "=" * 80, sep="\n")

# Example 1: Machine Learning
print("\nüìö Recommandations pour : 'Machine Learning'\n")
recs = recommend_courses(
    course_query="Machine Learning",
    df=df,
    cosine_sim=cosine_sim,
    top_n=5,
)
# Trailing expression: the recommendation table is rendered as the cell output.
recs

COURSE RECOMMENDATIONS

üìö Recommendations for: 'Machine Learning'

Found course: Mathematics for Machine Learning



Unnamed: 0,title,category,rating,num_ratings,duration_hours,source_domain,final_score
1315,Machine Learning,Machine Learning,4.9,38000,20.0,coursera,0.848631
1103,Machine Learning,Machine Learning,4.6,16000,20.0,coursera,0.807164
775,Machine Learning for All,Machine Learning,4.7,3500,5.0,coursera,0.771318
1476,Mathematics for Machine Learning and Data Science,Machine Learning,4.6,3100,20.0,coursera,0.765448
256,The Nuts and Bolts of Machine Learning,Machine Learning,4.8,597,20.0,coursera,0.726808


In [20]:
# Example 2: Python
print("\nüìö Recommandations pour : 'Python'\n")
recs = recommend_courses(
    course_query="Python",
    df=df,
    cosine_sim=cosine_sim,
    top_n=5,
)
# Trailing expression: the recommendation table is rendered as the cell output.
recs


üìö Recommendations for: 'Python'

Found course: Python for OOP - The A to Z OOP Python Programming Course



Unnamed: 0,title,category,rating,num_ratings,duration_hours,source_domain,final_score
1753,Python OOP - Object Oriented Programming for B...,Data Science,4.6,4890,19.0,udemy,0.633082
1735,Learn Python: The Complete Python Programming ...,Data Science,4.3,5713,14.0,udemy,0.587327
1001,Python 3 Programming,Data Science,4.8,23000,40.0,coursera,0.559207
614,Python For Everybody : Python Programming MADE...,Data Science,4.9,15295,5.0,udemy,0.524629
83,Python Programming - Foundation Course for Dat...,Data Science,4.6,1055,4.5,udemy,0.503815


In [21]:
# Example 3: Data Science
print("\nüìö Recommandations pour : 'Data Science'\n")
recs = recommend_courses(
    course_query="Data Science",
    df=df,
    cosine_sim=cosine_sim,
    top_n=5,
)
# Trailing expression: the recommendation table is rendered as the cell output.
recs


üìö Recommendations for: 'Data Science'

Found course: How to Start a Career in Data Science



Unnamed: 0,title,category,rating,num_ratings,duration_hours,source_domain,final_score
550,What is Data Science?,Data Science,4.7,77000,5.0,coursera,0.846345
1403,Data Science,Data Science,4.5,51000,20.0,coursera,0.823136
1205,Executive Data Science,Data Science,4.5,11000,20.0,coursera,0.784103
915,Genomic Data Science,Data Science,4.5,6800,20.0,coursera,0.771865
427,Introduction to Data Science,Data Science,4.6,102000,20.0,coursera,0.747003


In [None]:
# Example 4: Web Development
# NOTE(review): the printed label says 'Web Development' while the search
# term actually passed to the recommender is "Web" — confirm which is intended.
print("\nüìö Recommandations pour : 'Web Development'\n")
recs = recommend_courses(
    course_query="Web",
    df=df,
    cosine_sim=cosine_sim,
    top_n=5,
)
# Trailing expression: the recommendation table is rendered as the cell output.
recs


üìö Recommendations for: 'Web Development'

Found course: HTML, CSS, and Javascript for Web Developers

                                                  title          category  \
647        HTML, CSS, and Javascript for Web Developers   Web Development   
1849            Introduction to HTML, CSS, & JavaScript   Web Development   
1652  Introduction to Web Development with HTML, CSS...   Web Development   
1070                              The Singer Songwriter  Music Production   
734                                    The DIY Musician  Music Production   

      rating  num_ratings  duration_hours source_domain  final_score  
647      4.7        17000             5.0      coursera     0.900858  
1849     4.4          789             5.0      coursera     0.666890  
1652     4.6         2300            20.0      coursera     0.605320  
1070     4.8         4300            20.0      coursera     0.558924  
734      4.7         3600            20.0      coursera     0.547971  


In [23]:
def evaluate_recommendations(course_query, df, cosine_sim, top_n=10):
    """Print quality metrics for the similarity-only recommendations of a course.

    The source course is the first row of ``df`` whose ``title`` contains
    ``course_query`` (case-insensitive, literal substring). Its ``top_n`` most
    similar courses according to ``cosine_sim`` (the source itself excluded)
    are evaluated with four metrics, which are printed:

    * average similarity between the source and the recommendations;
    * diversity: ``1 -`` mean pairwise similarity among the recommendations;
    * category consistency: share of recommendations whose ``category``
      equals the source's;
    * novelty: ``1 -`` mean ``popularity_score`` of the recommendations
      divided by the maximum ``popularity_score`` of the whole frame.

    Parameters
    ----------
    course_query : str
        Text searched for (literally) in the ``title`` column.
    df : pandas.DataFrame
        Course table with ``title`` and ``category`` columns; row positions
        must line up with ``cosine_sim``. ``popularity_score`` is optional.
    cosine_sim : array-like, 2-D
        Square similarity matrix aligned with the row positions of ``df``.
        It is also the source of the pairwise similarities used for the
        diversity metric, so the function needs no notebook-level state.
    top_n : int, default 10
        Number of recommendations to evaluate.

    Returns
    -------
    pandas.DataFrame or None
        The evaluated recommendations (``title``, ``category`` and, when
        present, ``popularity_score``), or ``None`` when no title matches.
    """
    print(f"\nüìä √âvaluation des recommandations pour : '{course_query}'")

    # Locate the source course. Literal match so that regex metacharacters
    # in the query cannot raise or mis-match.
    title_hits = df["title"].str.contains(course_query, case=False, na=False, regex=False)
    hit_positions = np.flatnonzero(title_hits.to_numpy())
    if hit_positions.size == 0:
        print("‚ùå Cours non trouv√©")
        return

    # Row *position* of the first match: ``df.iloc`` and the similarity
    # matrix are positional, whereas index labels need not be 0..n-1.
    idx = int(hit_positions[0])
    source_category = df.iloc[idx]['category']

    # Rank all courses by similarity to the source (stable: ties keep their
    # original order) and remove the source explicitly instead of assuming
    # it is ranked first.
    sim_matrix = np.asarray(cosine_sim)
    similarities = sim_matrix[idx]
    ranked = np.argsort(-similarities, kind="stable")
    ranked = ranked[ranked != idx]
    rec_indices = ranked[:max(top_n, 0)]
    rec_df = df.iloc[rec_indices]
    # May be smaller than top_n when the catalogue has too few courses.
    n_recs = len(rec_indices)

    # --- Metrics ---

    # 1. Average similarity with the source
    avg_sim = float(np.mean(similarities[rec_indices])) if n_recs else 0.0

    # 2. Diversity (1 - mean pairwise similarity among the recommendations).
    #    Only the strict upper triangle is averaged, which skips the diagonal
    #    (self-similarity) and counts each pair once.
    if n_recs > 1:
        intra_sim_matrix = sim_matrix[np.ix_(rec_indices, rec_indices)]
        upper_tri_indices = np.triu_indices(n_recs, k=1)
        avg_intra_sim = np.mean(intra_sim_matrix[upper_tri_indices])
        diversity = 1 - avg_intra_sim
    else:
        diversity = 0.0

    # 3. Category consistency, relative to the number of recommendations
    #    actually produced (avoids a wrong ratio or a division by zero).
    same_category_count = rec_df[rec_df['category'] == source_category].shape[0]
    category_coverage = same_category_count / n_recs if n_recs else 0.0

    # 4. Novelty (inverse popularity): the lower the value, the more popular
    #    the recommended items. Normalised by the global maximum.
    if 'popularity_score' in rec_df.columns and n_recs:
        avg_popularity = rec_df['popularity_score'].mean()
        max_pop = df['popularity_score'].max()
        novelty = 1 - (avg_popularity / (max_pop + 1e-6))
    else:
        novelty = 0.0

    print(f"   ‚Ä¢ Similarit√© Moyenne avec la source : {avg_sim:.4f} (Plus haut = plus pertinent)")
    print(f"   ‚Ä¢ Diversit√© des r√©sultats           : {diversity:.4f} (Plus haut = r√©sultats vari√©s)")
    print(f"   ‚Ä¢ Coh√©rence de Cat√©gorie            : {category_coverage:.1%} ({same_category_count}/{n_recs} m√™me cat√©gorie)")
    print(f"   ‚Ä¢ Nouveaut√© (Novelty)               : {novelty:.4f} (Plus haut = moins populaire/connu)")

    # Return only the columns that exist, consistent with the optional
    # handling of ``popularity_score`` above.
    output_cols = [col for col in ('title', 'category', 'popularity_score') if col in rec_df.columns]
    return rec_df[output_cols]

# Try the evaluation on a first example; the returned frame is rendered as
# the cell output.
evaluate_recommendations(course_query="Machine Learning", df=df, cosine_sim=cosine_sim)


üìä √âvaluation des recommandations pour : 'Machine Learning'
   ‚Ä¢ Similarit√© Moyenne avec la source : 0.7585 (Plus haut = plus pertinent)
   ‚Ä¢ Diversit√© des r√©sultats           : 0.1448 (Plus haut = r√©sultats vari√©s)
   ‚Ä¢ Coh√©rence de Cat√©gorie            : 100.0% (10/10 m√™me cat√©gorie)
   ‚Ä¢ Nouveaut√© (Novelty)               : 0.5099 (Plus haut = moins populaire/connu)


Unnamed: 0,title,category,popularity_score
1476,Mathematics for Machine Learning and Data Science,Machine Learning,36.981608
256,The Nuts and Bolts of Machine Learning,Machine Learning,30.689236
775,Machine Learning for All,Machine Learning,38.355778
1103,Machine Learning,Machine Learning,44.52987
1315,Machine Learning,Machine Learning,51.672302
1759,Machine Learning,Machine Learning,7.48599
919,Introduction to Embedded Machine Learning,Machine Learning,31.789134
1712,IBM Machine Learning,Machine Learning,37.669248
997,Machine Learning with Apache Spark,Machine Learning,21.352195
870,Java in Machine Learning,Machine Learning,0.0


In [24]:
# Evaluate the recommendations produced for the "Python" query.
evaluate_recommendations(course_query="Python", df=df, cosine_sim=cosine_sim)


üìä √âvaluation des recommandations pour : 'Python'
   ‚Ä¢ Similarit√© Moyenne avec la source : 0.4626 (Plus haut = plus pertinent)
   ‚Ä¢ Diversit√© des r√©sultats           : 0.5553 (Plus haut = r√©sultats vari√©s)
   ‚Ä¢ Coh√©rence de Cat√©gorie            : 100.0% (10/10 m√™me cat√©gorie)
   ‚Ä¢ Nouveaut√© (Novelty)               : 0.5760 (Plus haut = moins populaire/connu)


Unnamed: 0,title,category,popularity_score
1753,Python OOP - Object Oriented Programming for B...,Data Science,39.077699
1552,Learn Python FAST! Full Fast Track Python Prog...,Data Science,12.176043
1735,Learn Python: The Complete Python Programming ...,Data Science,37.197901
1458,"Python Programming Bundle: Intro to Python, Pa...",Data Science,11.029282
16,Python Programming with Red Hat,Data Science,0.0
1406,Data-Oriented Python Programming and Debugging,Data Science,12.182817
1355,Python Programming Master Course,Data Science,30.029841
83,Python Programming - Foundation Course for Dat...,Data Science,32.02632
1001,Python 3 Programming,Data Science,48.207806
246,Python Basics Course,Data Science,38.073756


In [25]:
# Evaluate the recommendations produced for the "Web" query.
evaluate_recommendations(course_query="Web", df=df, cosine_sim=cosine_sim)


üìä √âvaluation des recommandations pour : 'Web'
   ‚Ä¢ Similarit√© Moyenne avec la source : 0.5379 (Plus haut = plus pertinent)
   ‚Ä¢ Diversit√© des r√©sultats           : 0.4029 (Plus haut = r√©sultats vari√©s)
   ‚Ä¢ Coh√©rence de Cat√©gorie            : 0.0% (0/10 m√™me cat√©gorie)
   ‚Ä¢ Nouveaut√© (Novelty)               : 0.5891 (Plus haut = moins populaire/connu)


Unnamed: 0,title,category,popularity_score
647,"HTML, CSS, and Javascript for Web Developers",Web Development,45.782829
1849,"Introduction to HTML, CSS, & JavaScript",Web Development,29.356945
1652,"Introduction to Web Development with HTML, CSS...",Web Development,35.609056
1516,"Build a Portfolio Website with HTML, CSS, and ...",Web Development,20.930398
335,"Full-Stack Web Development: PHP, HTML, CSS & J...",Web Development,14.109724
578,"Javascript, HTML and CSS | HTML CSS Javascript...",Computer Science,24.184479
1698,"Web Development with HTML5, CSS, and JavaScript",Web Development,0.0
260,"Become a Certified Web Developer: HTML, CSS an...",Computer Science,35.744292
1233,"Web Development Mastery: HTML, CSS & JavaScrip...",Web Development,11.18208
73,Web Development | HTML | CSS | JavaScript | jQ...,Web Development,35.041352
