# Course Recommendation System
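This notebook builds a content-based course recommender: each course's cleaned title and category are vectorized with TF-IDF, courses are compared by cosine similarity, and the closest matches are re-ranked by blending similarity with a popularity score.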

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the preprocessed course catalogue into a DataFrame and preview it
df = pd.read_csv(filepath_or_buffer="../processed_data/final_courses_shuffled.csv")
print(f"Loaded {df.shape[0]} courses")
df.head(5)

Loaded 1855 courses


Unnamed: 0,title,partner,rating,reviews,metadata,link,category,scraped_at,source_domain,num_ratings,duration_hours,level_enc,title_clean,popularity_score,id,price
0,Mastering Artificial Intelligence,Selfcode Academy,4.4,416 reviews,All Levels ¬∑ Course,https://www.udemy.com/course/mastering-artific...,General,2026-02-02T20:24:47.830773,udemy,416,12.5,0,mastering artificial intelligence,26.545579,582,$9.99
1,Capstone Project: Digital Marketing and Growth...,IBM,0.0,0 reviews,Advanced ¬∑ Course ¬∑ 1 - 3 Months,https://www.coursera.org/learn/capstone-projec...,Digital Marketing,2026-01-29T02:34:11.841793,coursera,0,20.0,3,capstone project digital marketing and growth ...,0.0,1244,Inscrivez-vous gratuitement
2,The Art of Visual Storytelling,University of Colorado Boulder,4.6,135 reviews,Beginner ¬∑ Specialization ¬∑ 3 - 6 Months,https://www.coursera.org/specializations/the-a...,Video Editing,2026-01-28T23:15:44.255610,coursera,135,20.0,1,the art of visual storytelling,22.598212,1094,Inscrivez-vous gratuitement
3,Meta Social Media Marketing,Meta,4.8,23K reviews,Beginner ¬∑ Professional Certificate ¬∑ 3 - 6 Mo...,https://www.coursera.org/professional-certific...,Digital Marketing,2026-01-28T23:13:38.659416,coursera,23000,20.0,1,meta social media marketing,48.207806,916,Inscrivez-vous gratuitement
4,Python for OOP - The A to Z OOP Python Program...,Packt,0.0,0 reviews,Intermediate ¬∑ Course ¬∑ 1 - 3 Months,https://www.coursera.org/learn/packt-python-fo...,Data Science,2026-02-02T19:39:49.373141,coursera,0,20.0,2,python for oop the a to z oop python programm...,0.0,70,Inscrivez-vous gratuitement


In [2]:
# Data-quality check: per-column count of missing entries, then the frame's dimensions
print("Missing values:", df.isnull().sum(), sep="\n")
print(f"\nDataset shape: {df.shape}")

Missing values:
title               0
partner             0
rating              0
reviews             0
metadata            0
link                0
category            0
scraped_at          0
source_domain       0
num_ratings         0
duration_hours      0
level_enc           0
title_clean         0
popularity_score    0
id                  0
price               0
dtype: int64

Dataset shape: (1855, 16)


In [3]:
# Build the text that TF-IDF will vectorize: the cleaned title followed by the
# category, separated by a single space and lower-cased. Missing values in either
# column are treated as empty strings.
df['combined_text'] = (
    df['title_clean'].fillna('') + ' ' + df['category'].fillna('')
).str.lower()
print("Sample combined text:", df['combined_text'].head(), sep="\n")

Sample combined text:
0            mastering artificial intelligence general
1    capstone project digital marketing and growth ...
2         the art of visual storytelling video editing
3        meta social media marketing digital marketing
4    python for oop  the a to z oop python programm...
Name: combined_text, dtype: object


In [4]:
# Vectorize the combined text with TF-IDF
tfidf = TfidfVectorizer(
    stop_words='english',  # drop common English words
    ngram_range=(1, 2),    # single words and two-word phrases
    min_df=2,              # ignore terms that appear in fewer than 2 courses
    max_df=0.95,           # ignore terms that appear in more than 95% of courses
    max_features=5000      # upper bound on vocabulary size
)

tfidf_matrix = tfidf.fit_transform(df['combined_text'])
print(
    f"TF-IDF Matrix shape: {tfidf_matrix.shape}",
    f"Vocabulary size: {len(tfidf.vocabulary_)}",
    sep="\n",
)

TF-IDF Matrix shape: (1855, 1730)
Vocabulary size: 1730


In [5]:
# Pairwise cosine similarity between every pair of courses. With a single
# argument, cosine_similarity compares the rows of the matrix against themselves.
cosine_sim = cosine_similarity(tfidf_matrix)
print(f"Similarity matrix shape: {cosine_sim.shape}")

Similarity matrix shape: (1855, 1855)


In [None]:
def recommend_courses(
    course_query,
    df,
    cosine_sim,
    top_n=10
):
    """Recommend courses similar to the first course whose title contains a query.

    The first row of ``df`` whose ``title`` contains ``course_query`` (literal,
    case-insensitive substring match) is used as the reference course. All other
    courses are ranked by their similarity to it, a pool of the ``top_n + 9``
    most similar ones is kept, and that pool is re-ranked by a blended score.

    Parameters
    ----------
    course_query : str
        Text to look for inside the ``title`` column. It is matched literally,
        not as a regular expression.
    df : pandas.DataFrame
        Course table. Must contain a ``title`` column. Row *positions* must line
        up with the rows/columns of ``cosine_sim``.
    cosine_sim : 2-D array-like
        Square similarity matrix; ``cosine_sim[i][j]`` is the similarity between
        the courses at positions ``i`` and ``j`` of ``df``.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        Up to ``top_n`` rows sorted by ``final_score`` (descending), restricted
        to whichever of ``title``, ``category``, ``rating``, ``num_ratings``,
        ``duration_hours``, ``source_domain`` and ``final_score`` exist.
        If no title contains the query, a "not found" message string is
        returned instead.

    Notes
    -----
    When ``df`` has a ``popularity_score`` column,
    ``final_score = 0.7 * similarity + 0.3 * popularity / max(popularity in pool)``;
    otherwise ``final_score`` is the similarity itself. Prints the title of the
    matched reference course as a side effect.
    """
    # Find matching courses. regex=False treats the query as plain text, so
    # queries containing regex metacharacters (e.g. "C++") cannot raise or
    # match unintended titles.
    title_hits = df["title"].str.contains(
        course_query, case=False, na=False, regex=False
    )
    hit_positions = title_hits.to_numpy(dtype=bool).nonzero()[0]

    if len(hit_positions) == 0:
        return "‚ùå Course not found"

    # Use the first match. Keep its row *position* (not its index label):
    # both df.iloc and the similarity matrix are addressed by position, and the
    # two only coincide when df happens to carry a default 0..n-1 index.
    idx = int(hit_positions[0])
    print(f"Found course: {df.iloc[idx]['title']}\n")

    # Rank every other course by similarity. The reference course is removed by
    # position rather than by assuming it sorts first: a duplicate or tied
    # course would otherwise be dropped in its place.
    sim_scores = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[idx])
        if pos != idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Keep top_n plus 9 extra candidates so the popularity re-rank below can
    # promote courses from just outside the pure-similarity top_n.
    sim_scores = sim_scores[:top_n + 9]

    course_indices = [pos for pos, _ in sim_scores]
    recs = df.iloc[course_indices].copy()

    # Start from raw similarity; blend in popularity when it is available.
    recs["final_score"] = [score for _, score in sim_scores]
    if 'popularity_score' in recs.columns:
        popularity = recs["popularity_score"]
        # Both operands are columns of recs, so the arithmetic is element-wise.
        # The small epsilon avoids division by zero when every candidate has
        # zero popularity.
        recs["final_score"] = (
            0.7 * recs["final_score"] +
            0.3 * (popularity / (popularity.max() + 1e-6))
        )

    # Return top recommendations
    result_cols = ["title", "category", "rating", "num_ratings",
                   "duration_hours", "source_domain", "final_score"]

    # Only include columns that exist
    result_cols = [col for col in result_cols if col in recs.columns]

    return recs.sort_values(
        by="final_score",
        ascending=False
    ).head(top_n)[result_cols]

In [None]:
# Try the recommender on a few sample queries
print("=" * 80, "COURSE RECOMMENDATIONS", "=" * 80, sep="\n")

# Example 1: Machine Learning
print("\nüìö Recommendations for: 'Machine Learning'\n")
recs = recommend_courses(course_query="Machine Learning", df=df, cosine_sim=cosine_sim, top_n=5)
print(recs)

In [None]:
# Example 2: Python
print("\nüìö Recommendations for: 'Python'\n")
recs = recommend_courses(course_query="Python", df=df, cosine_sim=cosine_sim, top_n=5)
print(recs)

In [None]:
# Example 3: Data Science
print("\nüìö Recommendations for: 'Data Science'\n")
recs = recommend_courses(course_query="Data Science", df=df, cosine_sim=cosine_sim, top_n=5)
print(recs)