In [None]:
!pip install gradio sentence-transformers pandas scikit-learn nltk

Collecting gradio
  Downloading gradio-5.27.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.9.1 (from gradio)
  Downloading gradio_client-1.9.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (

In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import nltk
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK 'punkt' tokenizer data at cell-execution time.
# NOTE(review): nothing in the visible cells calls an nltk tokenizer, so this
# download (and the nltk import) looks unnecessary -- confirm before removing.
nltk.download('punkt')

class RealSemanticCourseRecommender:
    """Recommend courses by semantic similarity between a query and course text.

    Course text (Title + Description + Subject) is embedded with a
    SentenceTransformer model; a query is embedded the same way and courses
    are ranked by cosine similarity. The course table, the embeddings and a
    local copy of the model are cached under ``cache_dir``.

    Attributes are populated by ``load_and_preprocess()``:
        embedding_model: the SentenceTransformer instance (or None).
        courses_df: DataFrame of courses (or None).
        course_embeddings: float32 array with one row per course (or None).
    """

    # Hub identifier of the embedding model used for courses and queries.
    MODEL_NAME = 'all-MiniLM-L6-v2'

    def __init__(self, csv_path='Cleaned_Courses_Data.csv', cache_dir='cached_embeddings'):
        """Store configuration only; nothing is read or downloaded here.

        Args:
            csv_path: path of the course CSV read when no cache exists.
            cache_dir: directory holding the cached table, embeddings and model.
        """
        self.csv_path = csv_path
        self.cache_dir = cache_dir
        self.embedding_model = None
        self.courses_df = None
        self.course_embeddings = None

    def _model_cache_path(self):
        """Return the directory where the model copy is saved inside the cache."""
        return os.path.join(self.cache_dir, 'sentence_model')

    def load_and_preprocess(self):
        """Load course data, embeddings, and model, using cache if available.

        If both ``courses.pkl`` and ``embeddings.npy`` exist in ``cache_dir``
        they are loaded as-is; otherwise the CSV is read, the combined text is
        encoded, and the results are written to the cache.
        """
        os.makedirs(self.cache_dir, exist_ok=True)

        courses_cache = os.path.join(self.cache_dir, 'courses.pkl')
        embeddings_cache = os.path.join(self.cache_dir, 'embeddings.npy')
        model_cache = self._model_cache_path()

        if os.path.exists(courses_cache) and os.path.exists(embeddings_cache):
            print("📦 Loading cached courses and embeddings...")
            # SECURITY: pickle.load can execute arbitrary code embedded in the
            # file. Only point cache_dir at a directory you trust.
            with open(courses_cache, 'rb') as f:
                self.courses_df = pickle.load(f)
            self.course_embeddings = np.load(embeddings_cache)
            print("✅ Cache loaded.")

            if os.path.exists(model_cache):
                print("📦 Loading SentenceTransformer model from local cache...")
                self.embedding_model = SentenceTransformer(model_cache)
            else:
                print("🔵 Downloading model...")
                self.embedding_model = SentenceTransformer(self.MODEL_NAME)
                print("💾 Saving model locally...")
                self.embedding_model.save(model_cache)

        else:
            print("📂 No cache found. Preprocessing courses...")
            self.courses_df = pd.read_csv(self.csv_path)

            # Missing fields become empty strings so concatenation never yields NaN.
            self.courses_df['combined_text'] = (
                self.courses_df['Title'].fillna('') + ' ' +
                self.courses_df['Description'].fillna('') + ' ' +
                self.courses_df['Subject'].fillna('')
            )

            print("🔵 Loading SentenceTransformer model...")
            self.embedding_model = SentenceTransformer(self.MODEL_NAME)
            print("💾 Saving model locally...")
            self.embedding_model.save(model_cache)

            print("🔵 Encoding course texts...")
            self.course_embeddings = self.embedding_model.encode(
                self.courses_df['combined_text'].tolist(),
                show_progress_bar=True,
                batch_size=64,
                convert_to_numpy=True,
                normalize_embeddings=True
            ).astype(np.float32)

            print("💾 Saving course metadata and embeddings...")
            with open(courses_cache, 'wb') as f:
                pickle.dump(self.courses_df, f)
            np.save(embeddings_cache, self.course_embeddings)

            print("✅ Preprocessing complete and cached.")

    def _ensure_model_loaded(self):
        """Load the embedding model on first use if it is not already set.

        Prefers the local copy under ``cache_dir``; when that copy is missing
        the model is downloaded by name instead of failing on a bad path.
        """
        if self.embedding_model is not None:
            return

        model_cache = self._model_cache_path()
        if os.path.exists(model_cache):
            print("📦 Loading SentenceTransformer model from cache...")
            self.embedding_model = SentenceTransformer(model_cache)
        else:
            print("🔵 Downloading model...")
            self.embedding_model = SentenceTransformer(self.MODEL_NAME)

    @staticmethod
    def _as_bool_mask(condition):
        """Convert a boolean Series (possibly containing NA) to a plain bool array."""
        return np.asarray(condition.fillna(False), dtype=bool)

    def semantic_search(self, query, top_n=5, grad_level=None, graded_pref=None):
        """Perform fast semantic search, returning at most one course per Subject.

        Args:
            query: free-text interest to match against the course embeddings.
            top_n: maximum number of results; values <= 0 yield an empty list.
            grad_level: 'G' keeps catalog numbers > 5000, 'U' keeps <= 5000;
                any other value disables the filter. Rows whose catalog number
                is not numeric are dropped when this filter is active.
            graded_pref: 'G' keeps rows whose 'Graded' column is "yes", 'N'
                keeps "no" (case/whitespace-insensitive); ignored when the
                column is absent or for any other value.

        Returns:
            A list of dicts with keys 'Catalog Number', 'Title', 'Subject',
            'Description' and 'Similarity Score', best match first.

        Raises:
            RuntimeError: if course data has not been loaded yet.
        """
        if self.courses_df is None or self.course_embeddings is None:
            raise RuntimeError(
                "Course data is not loaded; call load_and_preprocess() first."
            )
        if top_n <= 0:
            return []

        self._ensure_model_loaded()

        query_embedding = self.embedding_model.encode(
            [query],
            normalize_embeddings=True
        ).astype(np.float32)

        similarity_scores = cosine_similarity(query_embedding, self.course_embeddings).flatten()

        # Filters are tracked as a positional boolean mask so that row
        # positions line up with similarity_scores regardless of the
        # DataFrame's index labels (and without copying the frame per query).
        keep = np.ones(len(self.courses_df), dtype=bool)

        # Apply Graduate/Undergraduate filter
        if grad_level in ('G', 'U'):
            catalog_numbers = pd.to_numeric(self.courses_df['Catalog Number'], errors='coerce')
            # NOTE(review): a course numbered exactly 5000 is classified as
            # undergraduate here -- confirm the intended boundary.
            if grad_level == 'G':
                keep &= self._as_bool_mask(catalog_numbers > 5000)
            else:
                keep &= self._as_bool_mask(catalog_numbers <= 5000)

        # Apply Graded/Non-Graded filter
        if 'Graded' in self.courses_df.columns and graded_pref in ('G', 'N'):
            graded_flag = self.courses_df['Graded'].str.strip().str.lower()
            wanted = 'yes' if graded_pref == 'G' else 'no'
            keep &= self._as_bool_mask(graded_flag == wanted)

        matching_positions = np.flatnonzero(keep)

        if matching_positions.size == 0:
            print("\n⚠️ No matching courses found after filters.")
            return []

        # Sort all candidates by similarity, highest first.
        order = np.argsort(similarity_scores[matching_positions])[::-1]
        ranked_positions = matching_positions[order]

        # Pick top N without Subject duplicates
        seen_subjects = set()
        results = []

        for pos in ranked_positions:
            course = self.courses_df.iloc[pos]
            subject = course['Subject']

            if subject in seen_subjects:
                continue

            seen_subjects.add(subject)
            results.append({
                'Catalog Number': course['Catalog Number'],
                'Title': course['Title'],
                'Subject': subject,
                'Description': course['Description'],
                'Similarity Score': similarity_scores[pos]
            })

            if len(results) >= top_n:
                break

        return results

    def print_recommendations(self, recommendations):
        """Print each recommendation with a description truncated to 200 chars.

        Args:
            recommendations: list of result dicts as returned by
                ``semantic_search``; an empty list prints a "none found" line.
        """
        if not recommendations:
            print("❌ No recommendations found.")
            return

        print("\n🎯 Recommended Courses:")
        for i, rec in enumerate(recommendations, 1):
            # A missing description (NaN) is shown as an empty string.
            description = rec['Description'] if pd.notna(rec['Description']) else ""
            short_desc = description[:200] + ("..." if len(description) > 200 else "")
            print(f"\n{i}. {rec['Catalog Number']} - {rec['Title']}")
            print(f"   Subject: {rec['Subject']}")
            print(f"   Description: {short_desc}")
            print(f"   Semantic Similarity Score: {rec['Similarity Score']:.4f}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import gradio as gr

# Build the recommender once at module level so the Gradio callback below can
# reuse it for every request. load_and_preprocess() reads the cached table and
# embeddings if present, otherwise rebuilds them from the CSV (slow first run).
model = RealSemanticCourseRecommender()
model.load_and_preprocess()

def recommend_courses(query, course_level, graded_pref):
    """Gradio callback: run a semantic search and render the hits as HTML cards.

    Args:
        query: free-text interest typed by the user.
        course_level: 'G' or 'U'; anything else (including None) disables the
            graduate/undergraduate filter.
        graded_pref: 'G' or 'N'; anything else (including None) disables the
            graded/non-graded filter.

    Returns:
        An HTML string with one collapsible ``<details>`` card per course, or a
        short plain message when the query is blank or nothing matches.
    """
    import html  # local import keeps this cell self-contained

    # A blank query would be ranked against an empty string and return
    # arbitrary courses, so ask for input instead.
    if query is None or not str(query).strip():
        return "⚠️ Please enter an interest to search."

    if course_level not in ['G', 'U']:
        course_level = None
    if graded_pref not in ['G', 'N']:
        graded_pref = None

    results = model.semantic_search(
        query=query,
        top_n=5,
        grad_level=course_level,
        graded_pref=graded_pref
    )

    if not results:
        return "❌ No matching courses found."

    cards = []
    for i, rec in enumerate(results, 1):
        description = str(rec['Description']) if pd.notna(rec['Description']) else ""
        # Course text comes from the data file and may contain '<', '&' or
        # markup; escape it before embedding in HTML, then restore line breaks.
        full_description = html.escape(description).replace("\n", "<br>")
        catalog_number = html.escape(str(rec['Catalog Number']))
        title = html.escape(str(rec['Title']))
        subject = html.escape(str(rec['Subject']))
        cards.append(f"""
<details style="background-color: #f9f9f9; padding: 10px; margin-bottom: 10px; border-radius: 10px; box-shadow: 0px 0px 5px #ccc;">
  <summary style="font-size: 18px; font-weight: bold;">{i}. {catalog_number} - {title} <span style='color: gray; font-size: 14px;'>[{subject}]</span></summary>
  <div style="margin-top: 10px;">
    <b>Similarity Score:</b> {rec['Similarity Score']:.4f}<br><br>
    <b>Description:</b><br> {full_description}
  </div>
</details>
""")

    return "".join(cards)

# Build the Gradio Blocks UI: a query box, two optional filter radios, and
# Search / Clear buttons wired to an HTML results area.
with gr.Blocks(title="Cornell Course Recommender") as demo:
    # Page heading and short instructions.
    gr.Markdown("<h1 style='text-align: center;'>📚 Find Your Perfect Course at Cornell</h1>")
    gr.Markdown("<p style='text-align: center;'>Enter your interests below and we'll match you to the best available courses!</p>")

    with gr.Row():
        query_input = gr.Textbox(label="🎯 Your Interest", placeholder="e.g., product management, consulting, data science")

    # Optional filters; an unselected radio reaches recommend_courses as a value
    # outside ['G', 'U'] / ['G', 'N'], which that function maps to "no filter".
    with gr.Row():
        course_level = gr.Radio(["G", "U"], label="🎓 Graduate (G) or Undergraduate (U)?", info="Optional")
        graded_pref = gr.Radio(["G", "N"], label="📝 Graded (G) or Non-Graded (N)?", info="Optional")

    with gr.Row():
        search_button = gr.Button("🔍 Find Courses")
        clear_button = gr.Button("🧹 Clear")

    # recommend_courses returns an HTML string that is rendered here.
    results_output = gr.HTML()

    # "Find Courses": pass the three inputs to recommend_courses and show its HTML.
    search_button.click(
        recommend_courses,
        inputs=[query_input, course_level, graded_pref],
        outputs=[results_output]
    )

    # "Clear": reset all four components; the tuple order matches `outputs`.
    clear_button.click(
        fn=lambda: ("", None, None, ""),  # reset fields
        inputs=[],
        outputs=[query_input, course_level, graded_pref, results_output]
    )

# Launch app. share=True requests a public gradio.live URL in addition to the
# local server, so anyone with the link can reach this notebook's app.
demo.launch(share=True)


📂 No cache found. Preprocessing courses...
🔵 Loading SentenceTransformer model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

💾 Saving model locally...
🔵 Encoding course texts...


Batches:   0%|          | 0/358 [00:00<?, ?it/s]

💾 Saving course metadata and embeddings...
✅ Preprocessing complete and cached.
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b7edd502aa31523dfc.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


