In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/udemy_courses.csv')
df

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39.0,2017-03-09T16:34:20Z,Business Finance
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5,2016-12-19T19:26:30Z,Business Finance
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3.0,2017-05-30T20:07:24Z,Business Finance
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2.0,2016-12-13T14:57:18Z,Business Finance
...,...,...,...,...,...,...,...,...,...,...,...,...
3673,775618,Learn jQuery from Scratch - Master of JavaScri...,https://www.udemy.com/easy-jquery-for-beginner...,True,100,1040,14,21,All Levels,2.0,2016-06-14T17:36:46Z,Web Development
3674,1088178,How To Design A WordPress Website With No Codi...,https://www.udemy.com/how-to-make-a-wordpress-...,True,25,306,3,42,Beginner Level,3.5,2017-03-10T22:24:30Z,Web Development
3675,635248,Learn and Build using Polymer,https://www.udemy.com/learn-and-build-using-po...,True,40,513,169,48,All Levels,3.5,2015-12-30T16:41:42Z,Web Development
3676,905096,CSS Animations: Create Amazing Effects on Your...,https://www.udemy.com/css-animations-create-am...,True,50,300,31,38,All Levels,3.0,2016-08-11T19:06:15Z,Web Development


In [3]:
df.isnull().sum()

course_id              0
course_title           0
url                    0
is_paid                0
price                  0
num_subscribers        0
num_reviews            0
num_lectures           0
level                  0
content_duration       0
published_timestamp    0
subject                0
dtype: int64

In [7]:
for i in df.columns:
    print(f"{i}: {df[i].nunique()}")

course_id: 3672
course_title: 3663
url: 3672
is_paid: 2
price: 38
num_subscribers: 2197
num_reviews: 511
num_lectures: 229
level: 4
content_duration: 105
published_timestamp: 3672
subject: 4


In [13]:
df['level'].value_counts()

level
all levels            1925
beginner level        1268
intermediate level     421
expert level            58
Name: count, dtype: int64

In [9]:
df.isna().sum()

course_id              0
course_title           0
url                    0
is_paid                0
price                  0
num_subscribers        0
num_reviews            0
num_lectures           0
level                  0
content_duration       0
published_timestamp    0
subject                0
dtype: int64

In [10]:
df.drop_duplicates(inplace=True)

In [11]:
# Standardize text
df["course_title"] = df["course_title"].str.lower().str.strip()
df["subject"] = df["subject"].str.lower().str.strip()
df["level"] = df["level"].str.lower().str.strip()

In [12]:
# Convert numeric columns
df["price"] = pd.to_numeric(df["price"], errors="coerce").fillna(0)
df["is_paid"] = df["is_paid"].astype(bool)

In [14]:
VALID_LEVELS = {
    "all levels",
    "beginner level",
    "intermediate level",
    "expert level"
}

In [15]:
df["semantic_text"] = (df["course_title"] + " " + df["subject"])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words="english",ngram_range=(1, 2),min_df=2)

tfidf_matrix = tfidf.fit_transform(df["semantic_text"])


In [17]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [31]:
def recommend_courses(course_query,subject_query,level="all levels",is_paid=None,min_price=None,max_price=None,top_n=5):
    course_query = course_query.lower()
    subject_query = subject_query.lower()
    level = level.lower()

    if level not in VALID_LEVELS:
        return "Invalid level selection."

    # Step 1: Find base course using title + subject
    base_matches = df[
        df["course_title"].str.contains(course_query) &
        df["subject"].str.contains(subject_query)
    ]

    if base_matches.empty:
        return "No course found matching title and subject."

    base_index = base_matches.index[0]

    # Step 2: Similarity ranking
    similarity_scores = list(enumerate(cosine_sim[base_index]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    ranked_indices = [i[0] for i in similarity_scores[1:]]

    results = df.iloc[ranked_indices]

    # Step 3: Level filter
    if level != "all levels":
        results = results[results["level"] == level]

    # Step 4: Paid / price filters
    if is_paid is not None:
        results = results[results["is_paid"] == is_paid]

    if is_paid:
        if min_price is not None:
            results = results[results["price"] >= min_price]
        if max_price is not None:
            results = results[results["price"] <= max_price]

    # Step 5: Final output
    return results.head(top_n)

In [38]:
recommend_courses(
    course_query="",
    subject_query="web development",
    level="all levels",
    is_paid=False,
)


Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,semantic_text
3107,370752,how to make a wordpress website 2017,https://www.udemy.com/wordpress-4-for-beginners/,False,0,12293,235,18,all levels,3.0,2015-01-14T22:25:58Z,web development,how to make a wordpress website 2017 web devel...
3274,856968,how to make a wordpress website 2017,https://www.udemy.com/how-to-wordpress/,False,0,2437,137,10,beginner level,0.566667,2016-06-07T00:02:04Z,web development,how to make a wordpress website 2017 web devel...
3322,1041426,how to build and make a wordpress website from...,https://www.udemy.com/how-to-build-and-make-a-...,False,0,7055,166,24,all levels,2.0,2017-01-24T04:59:20Z,web development,how to build and make a wordpress website from...
2718,1068590,wordpress website mastery 2017,https://www.udemy.com/wordpress-website-master...,False,0,5181,64,22,beginner level,2.5,2017-01-18T17:01:40Z,web development,wordpress website mastery 2017 web development
2719,1203894,how to make an ecommerce website with wordpres...,https://www.udemy.com/how-to-make-an-ecommerce...,False,0,3493,43,26,beginner level,4.5,2017-05-09T22:08:11Z,web development,how to make an ecommerce website with wordpres...


In [39]:
def parse_user_query(query):
    query = query.lower()

    # Level detection
    if "beginner" in query:
        level = "beginner level"
    elif "intermediate" in query:
        level = "intermediate level"
    elif "expert" in query or "advanced" in query:
        level = "expert level"
    else:
        level = "all levels"

    # Paid / Free detection
    if "free" in query:
        is_paid = False
    elif "paid" in query:
        is_paid = True
    else:
        is_paid = None

    # Price range detection
    prices = list(map(int, re.findall(r"\d+", query)))
    min_price, max_price = None, None
    if len(prices) >= 2:
        min_price, max_price = prices[0], prices[1]

    # Clean semantic query (remove noise words)
    noise_words = [
        "beginner", "intermediate", "expert", "advanced",
        "course", "courses", "paid", "free",
        "between", "under", "above", "level"
    ]

    semantic_query = query
    for word in noise_words:
        semantic_query = semantic_query.replace(word, "")

    semantic_query = semantic_query.strip()

    return semantic_query, level, is_paid, min_price, max_price


In [133]:
import re

def recommend_from_nlp_query(user_query, min_match_percent=50, top_n=10 ):
    semantic_query, level, is_paid, min_price, max_price = parse_user_query(user_query)

    # Convert query into TF-IDF vector
    query_vector = tfidf.transform([semantic_query])

    # Compute similarity with all courses
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix)[0]

    # Attach similarity to dataframe
    results = df.copy()
    results["match_percent"] = similarity_scores * 100

    # Apply confidence threshold
    results = results[results["match_percent"] >= min_match_percent]

    # Apply level filter
    if level != "all levels":
        results = results[results["level"] == level]

    # Apply paid / price filters
    if is_paid is not None:
        results = results[results["is_paid"] == is_paid]

    if is_paid:
        if min_price is not None:
            results = results[results["price"] >= min_price]
        if max_price is not None:
            results = results[results["price"] <= max_price]

    # Rank by confidence
    results = results.sort_values(
        by="match_percent", ascending=False
    )

    return results[[
        "course_title",
        "subject",
        "level",
        "price",
        "is_paid",
        "match_percent"
    ]]


In [149]:
recommend_from_nlp_query(
    "html, css course "
)


Unnamed: 0,course_title,subject,level,price,is_paid,match_percent
3022,html css: essential steps to learn html css,web development,beginner level,145,True,73.000757
2916,html css javascript: most popular ways to code...,web development,all levels,170,True,65.02239
2846,introductory to html and css,web development,beginner level,50,True,62.731107
3067,learn and earn with html & css,web development,all levels,20,True,62.045664
2491,html tutorial: html & css for beginners,web development,all levels,20,True,59.517717
3342,html and css foundations,web development,beginner level,0,False,58.732074
3496,all in one html css and jquery,web development,all levels,20,True,56.912998
3621,html/css bootcamp,web development,beginner level,40,True,53.803858
3356,learn html and css from scratch,web development,beginner level,0,False,53.65391
2605,html and css crash course for beginners,web development,beginner level,50,True,53.383064


In [158]:
import google.generativeai as genai
import json


  from .autonotebook import tqdm as notebook_tqdm


In [216]:
genai.configure(api_key="AIzaSyAmW8yxscyklTRBPrw1QuQHJIBmIXMBc2g")

model = genai.GenerativeModel("gemini-2.5-flash")


In [None]:
SYSTEM_PROMPT = """
You are an AI assistant helping an online course recommendation system.

Extract structured filters from the user's query.
Only return valid JSON.
Do NOT explain anything.

Dataset fields:
- course_title
- subject
- level: ["all levels", "beginner level", "intermediate level", "expert level"]
- is_paid: true / false / null
- min_price: number or null
- max_price: number or null
- keywords: list of important semantic keywords

Return JSON in this format:
{
  "keywords": [],
  "level": "",
  "is_paid": null,
  "min_price": null,
  "max_price": null
}
"""


In [184]:
import re
import json

def safe_json_parse(text):
    """
    Extracts the first valid JSON object from Gemini response.
    """
    try:
        # Direct parse attempt
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Regex fallback: extract {...}
    match = re.search(r"\{.*\}", text, re.DOTALL)
    if match:
        try:
            return json.loads(match.group())
        except json.JSONDecodeError:
            pass

    # Absolute fallback
    return {
        "keywords": [],
        "level": "all levels",
        "is_paid": None,
        "min_price": None,
        "max_price": None
    }


In [185]:
def parse_query_with_gemini(user_query):
    response = model.generate_content(
        [
            SYSTEM_PROMPT,
            f"User query: {user_query}"
        ]
    )

    parsed = safe_json_parse(response.text)

    # Guardrails
    if "keywords" not in parsed:
        parsed["keywords"] = []

    if parsed.get("level") not in {
        "all levels",
        "beginner level",
        "intermediate level",
        "expert level"
    }:
        parsed["level"] = "all levels"

    return parsed


In [186]:
def build_semantic_query(parsed):
    if parsed["keywords"]:
        return " ".join(parsed["keywords"])
    return ""


In [219]:
def recommend_with_gemini(
    user_query,
    min_match_percent=50,
    top_n=10
):
    parsed = parse_query_with_gemini(user_query)

    semantic_query = build_semantic_query(parsed)

    query_vector = tfidf.transform([semantic_query])
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix)[0]

    results = df.copy()
    results["match_percent"] = similarity_scores * 100

    # Confidence threshold
    results = results[results["match_percent"] >= min_match_percent]

    # Level filter
    if parsed["level"] != "all levels":
        results = results[results["level"] == parsed["level"]]

    # Paid / price filter
    if parsed["is_paid"] is not None:
        results = results[results["is_paid"] == parsed["is_paid"]]

    if parsed["is_paid"]:
        if parsed["min_price"] is not None:
            results = results[results["price"] >= parsed["min_price"]]
        if parsed["max_price"] is not None:
            results = results[results["price"] <= parsed["max_price"]]

    return results.sort_values(
        by="match_percent", ascending=False
    )[[
        "course_title",
        "subject",
        "level",
        "price",
        "is_paid",
        "match_percent"
    ]].head(top_n) if not results.empty else print("No courses found matching the criteria.")


In [224]:
recommend_with_gemini(
    "python and dijango course prive range between 10 and 150"
)


Unnamed: 0,course_title,subject,level,price,is_paid,match_percent
3194,learn python django - a hands-on course,web development,beginner level,50,True,66.436978
2528,learn python and django: payment processing,web development,all levels,70,True,53.533542
