<a href="https://colab.research.google.com/github/BhanuPrakashSamoju/DatabricksGenAITraining/blob/main/assignments/assignment_02/assignment_2_course_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -r /content/requirements.txt



In [2]:
# --- Load Environment Variables for Azure Credentials ---
from dotenv import load_dotenv
load_dotenv("/content/env")

True

In [3]:
# --- 1. SETUP AND DEPENDENCIES ---
# Ensure all packages are installed
# !pip install langchain langgraph langchain_openai chromadb pandas python-dotenv requests beautifulsoup4

import os
import pandas as pd
import requests
from typing import List, Tuple, TypedDict, Optional

from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from langgraph.graph import StateGraph, END

In [4]:
# --- 2. DATA LOADING AND PREPARATION ---
print("--- 2. Loading and Preparing Data ---")
DATASET_URL = "https://raw.githubusercontent.com/Bluedata-Consulting/GAAPB01-training-code-base/refs/heads/main/Assignments/assignment2dataset.csv"

try:
    courses_df = pd.read_csv(DATASET_URL)
    # print(courses_df)
    courses_df['combined_text'] = courses_df['title'] + ": " + courses_df['description']
    courses_df.dropna(subset=['course_id', 'combined_text'], inplace=True)
    print(f"Successfully loaded and prepared {len(courses_df)} courses.")
    # Create a string representation of the course catalog for the LLM prompt
    course_catalog_str = "\n".join([f"- {row['course_id']}: {row['title']}" for _, row in courses_df.iterrows()])

except requests.exceptions.RequestException as e:
    print(f"Error fetching data: {e}")
    exit()



--- 2. Loading and Preparing Data ---
Successfully loaded and prepared 25 courses.


In [5]:
course_catalog_str

'- C001: Foundations of Machine Learning\n- C002: Deep Learning with TensorFlow and Keras\n- C003: Natural Language Processing Fundamentals\n- C004: Computer Vision and Image Processing\n- C005: Reinforcement Learning Basics\n- C006: Data Engineering on AWS\n- C007: Cloud Computing with Azure\n- C008: DevOps Practices and CI/CD\n- C009: Containerization with Docker and Kubernetes\n- C010: APIs and Microservices Architecture\n- C011: Big Data Analytics with Spark\n- C012: SQL for Data Analysis\n- C013: NoSQL Databases and MongoDB\n- C014: Data Visualization with Tableau\n- C015: Business Intelligence with Power BI\n- C016: Python Programming for Data Science\n- C017: R Programming and Statistical Analysis\n- C018: Product Management Essentials\n- C019: Agile and Scrum Mastery\n- C020: User Experience (UX) Design Principles\n- C021: Cybersecurity Fundamentals\n- C022: Internet of Things (IoT) Development\n- C023: Blockchain Technology and Smart Contracts\n- C024: Augmented and Virtual Re

In [6]:
print(courses_df.head())

  course_id                                     title  \
0      C001           Foundations of Machine Learning   
1      C002   Deep Learning with TensorFlow and Keras   
2      C003  Natural Language Processing Fundamentals   
3      C004      Computer Vision and Image Processing   
4      C005             Reinforcement Learning Basics   

                                         description  \
0  Understand foundational machine learning algor...   
1  Explore neural network architectures using Ten...   
2  Dive into NLP techniques for processing and un...   
3  Learn the principles of computer vision and im...   
4  Get introduced to reinforcement learning parad...   

                                       combined_text  
0  Foundations of Machine Learning: Understand fo...  
1  Deep Learning with TensorFlow and Keras: Explo...  
2  Natural Language Processing Fundamentals: Dive...  
3  Computer Vision and Image Processing: Learn th...  
4  Reinforcement Learning Basics: Get introdu

In [7]:
# --- 3. INITIALIZE MODELS AND VECTOR STORES ---
print("\n--- 3. Initializing Models and Vector Stores ---")
os.environ.pop("OPENAI_API_BASE", None)

# --- Your existing code (with the 'model' parameter uncommented) ---
embedding_model_name = "text-embedding-3-small"

# Initialize Azure OpenAI Embedding Model
embedding_model = AzureOpenAIEmbeddings(
    azure_endpoint=os.environ["AZURE_OPENAI_EMBEDDING_ENDPOINT"],
    # It's good practice to still specify the model name
    model=embedding_model_name,
    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME"], # Make sure this env var is set
    api_key=os.environ["AZURE_OPENAI_EMBEDDING_KEY"],
    api_version=os.environ["AZURE_OPENAI_EMBEDDING_API_VERSION"],
)



--- 3. Initializing Models and Vector Stores ---


In [8]:
embedding_model.embed_query("Hi how are you?")

[0.011693759821355343,
 -0.043644730001688004,
 -0.016767684370279312,
 0.05978378653526306,
 -0.02491418831050396,
 -0.02193782664835453,
 0.011398689821362495,
 0.02227138541638851,
 -0.03515183925628662,
 -0.06209303066134453,
 -0.023143766447901726,
 -0.0058532943949103355,
 -0.03340707719326019,
 -0.005590297281742096,
 0.008486475795507431,
 0.023836540058255196,
 -0.003138327971100807,
 0.024785896763205528,
 -0.00911510270088911,
 0.038795314729213715,
 0.031380072236061096,
 0.00788991991430521,
 -0.009018884971737862,
 -0.001393565209582448,
 0.02418292686343193,
 0.015164041891694069,
 0.013060063123703003,
 -0.006167608313262463,
 0.011443591676652431,
 -0.0072356341406702995,
 0.048134930431842804,
 -0.03099519945681095,
 0.017268020659685135,
 0.01819171942770481,
 -0.014381464570760727,
 0.013444937765598297,
 -0.011668101884424686,
 0.02817278914153576,
 0.00558708980679512,
 -0.06799443811178207,
 -0.011713003739714622,
 -0.02912214584648609,
 0.022399676963686943,
 0.

In [9]:
# Initialize Azure OpenAI Chat Model (GPT-4o mini)
llm = AzureChatOpenAI(
    azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
    openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    openai_api_type=os.getenv("OPENAI_API_TYPE"),
    temperature=0.2,
    # streaming=False,
)

In [10]:
llm.invoke("Hi!")

AIMessage(content='Hello! How can I assist you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 9, 'total_tokens': 19, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_efad92c60b', 'id': 'chatcmpl-CKpVWIJDnCwV80o2NpIEYD4NaxXlI', 'service_tier': None, 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}], 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe

In [11]:



# --- Vector Store for COURSES ---
print("Initializing course catalog vector store...")
course_documents = [
    Document(
        page_content=row['combined_text'],
        metadata={'course_id': row['course_id'], 'title': row['title']}
    ) for _, row in courses_df.iterrows()
]
course_vector_store = Chroma.from_documents(
    documents=course_documents,
    embedding=embedding_model,
    collection_name="courses"
)
course_retriever = course_vector_store.as_retriever(search_kwargs={"k": 10})

# --- Vector Store for USER PREFERENCES ---
print("Initializing user preference vector store...")
# This DB will store extracted user interests over time
preference_vector_store = Chroma(
    collection_name="user_preferences",
    embedding_function=embedding_model
)

Initializing course catalog vector store...
Initializing user preference vector store...


  preference_vector_store = Chroma(


In [12]:
# --- 4. BUILDING THE ENHANCED RECOMMENDATION GRAPH ---
print("\n--- 4. Building the Enhanced LangGraph Recommendation Graph ---")

# Pydantic model for structured output from the LLM
class RecognizedPreferences(BaseModel):
    preferences: List[str] = Field(description="A list of key skills, topics, or technologies the user is interested in.")

class CompletedCourses(BaseModel):
    completed_course_ids: List[str] = Field(description="A list of course_ids that the user has completed based on the query.")

# Define the state for the graph
class RecommendationState(TypedDict):
    user_id: str
    profile: str
    completed_ids: List[str]
    historical_preferences: Optional[List[str]]
    synthesized_query: str
    final_recommendations: List[Tuple[str, float]]

# Define the nodes of the graph
def extract_completed_courses_node(state: RecommendationState) -> RecommendationState:
    """Uses an LLM to extract completed course IDs from the user's natural language query."""
    print(">> Node: extract_completed_courses_node")
    profile = state["profile"]

    structured_llm = llm.with_structured_output(CompletedCourses)

    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are an expert at reading a user query and identifying which courses they have completed from a provided catalog. Only return IDs for courses explicitly mentioned as completed. If none are mentioned, return an empty list."),
        ("human", "From the user's query below, identify the `course_id` for any courses they have completed.\n\n## Course Catalog:\n{catalog}\n\n## User Query:\n'{profile}'")
    ])

    chain = prompt | structured_llm
    result = chain.invoke({"profile": profile, "catalog": course_catalog_str})

    completed_ids = result.completed_course_ids
    print(f"   LLM extracted completed courses: {completed_ids}")

    return {**state, "completed_ids": completed_ids}

# Define the nodes of the graph
def fetch_user_history_node(state: RecommendationState) -> RecommendationState:
    """Fetches the user's past preferences from the preference vector store."""
    print(">> Node: fetch_user_history_node")
    user_id = state["user_id"]
    # In Chroma, we can filter by metadata to get user-specific documents
    try:
        # A simple way to get all docs for a user is to do a broad search and filter
        # For larger scale, a proper metadata query would be better.
        results = preference_vector_store.get(where={"user_id": user_id}, include=["documents"])
        history = [doc for doc in results['documents']] if results else []
        print(f"   Found {len(history)} historical preferences for user '{user_id}'.")
        return {**state, "historical_preferences": history}
    except Exception as e:
        print(f"   Could not retrieve history for user '{user_id}': {e}")
        return {**state, "historical_preferences": []}


def synthesize_and_recognize_node(state: RecommendationState) -> RecommendationState:
    """
    1. Recognizes preferences from the current query and indexes them.
    2. Synthesizes a new query using the current request AND historical data.
    """
    print(">> Node: synthesize_and_recognize_node")
    user_id = state["user_id"]
    profile = state["profile"]
    history = state.get("historical_preferences", [])

    # Part 1: Recognize and Index Preferences from CURRENT query using .with_structured_output

    # Create an LLM instance that is bound to the Pydantic output structure
    structured_llm = llm.with_structured_output(RecognizedPreferences)

    # Create a simpler prompt, as formatting instructions are handled automatically
    recognition_prompt = ChatPromptTemplate.from_messages([
        ("system", "You are an expert at extracting key skills, topics, and technologies from user queries."),
        ("human", "Extract the key topics from the following user query: '{profile}'")
    ])

    # Create and invoke the chain
    chain = recognition_prompt | structured_llm
    recognized = chain.invoke({"profile": profile})

    new_preferences = recognized.preferences
    if new_preferences:
        print(f"   Recognized new preferences: {new_preferences}")
        # Create and add documents to the preference store
        pref_docs = [Document(page_content=pref, metadata={"user_id": user_id}) for pref in new_preferences]
        preference_vector_store.add_documents(pref_docs)
        print(f"   Indexed {len(pref_docs)} new preferences for user '{user_id}'.")

    # Part 2: Synthesize a search query using current AND historical data
    history_str = ", ".join(history) if history else "None"
    synthesis_prompt = f"""
    A user has provided the following current interest: "{profile}"

    Based on their history, they have also previously shown interest in: {history_str}.

    Synthesize a concise, single paragraph that combines their current interest with their historical preferences to describe the ideal topics for their next course. Focus on keywords and concepts for a semantic search.
    """
    response = llm.invoke(synthesis_prompt)
    synthesized_query = response.content
    print(f"   Synthesized Query (with history): {synthesized_query}")

    return {**state, "synthesized_query": synthesized_query}

def retrieve_courses_node(state: RecommendationState) -> RecommendationState:
    """Retrieves similar courses from the course catalog using the synthesized query."""
    print(">> Node: retrieve_courses_node")
    query = state["synthesized_query"]
    docs_with_scores = course_vector_store.similarity_search_with_relevance_scores(query, k=10)

    # Filter and rank
    completed_ids = state["completed_ids"]
    recommendations = []
    for doc, score in docs_with_scores:
        course_id = doc.metadata.get('course_id')
        if course_id not in completed_ids:
            recommendations.append((course_id, score))

    top_5 = sorted(recommendations, key=lambda x: x[1], reverse=True)[:5]
    print(f"   Retrieved and filtered top {len(top_5)} recommendations.")
    return {**state, "final_recommendations": top_5}

# Define and compile the new graph structure
workflow = StateGraph(RecommendationState)
workflow.add_node("extract_completed_courses", extract_completed_courses_node)
workflow.add_node("fetch_user_history", fetch_user_history_node)
workflow.add_node("synthesize_and_recognize", synthesize_and_recognize_node)
workflow.add_node("retrieve_and_rank", retrieve_courses_node)

workflow.set_entry_point("extract_completed_courses")
workflow.add_edge("extract_completed_courses", "fetch_user_history")
workflow.add_edge("fetch_user_history", "synthesize_and_recognize")
workflow.add_edge("synthesize_and_recognize", "retrieve_and_rank")
workflow.add_edge("retrieve_and_rank", END)

app = workflow.compile()
print("Enhanced LangGraph application compiled successfully.")



--- 4. Building the Enhanced LangGraph Recommendation Graph ---
Enhanced LangGraph application compiled successfully.


In [13]:
# --- 5. THE MAIN RECOMMENDATION FUNCTION (SIMPLIFIED) ---
def recommend_courses(user_id: str, profile: str) -> List[Tuple[str, float]]:
    """Runs the full recommendation pipeline, including LLM-based extraction of completed courses."""
    inputs = {
        "user_id": user_id,
        "profile": profile,
    }
    final_state = app.invoke(inputs)
    return final_state.get("final_recommendations", [])

In [15]:
# --- 6. MODULAR EVALUATION REPORT ---

def run_evaluation_query(user_id: str, query: str):
    """
    A modular function to run a single evaluation query through the pipeline and print results.
    Includes an LLM evaluation of the top recommendations.
    """
    print("\n" + "="*50)
    print(f"EVALUATING QUERY FOR USER '{user_id}'")
    print(f"User Query: {query}")
    print("="*50)

    # Get recommendations (completed courses are extracted automatically)
    recommendations = recommend_courses(user_id, query)

    print("\nTop 5 Recommendations:")
    if recommendations:
        for course_id, score in recommendations:
            course_details = courses_df[courses_df['course_id'] == course_id].iloc[0]
            print(f"  - [ID: {course_id}] {course_details['title']} (Score: {score:.4f})")
    else:
        print("  No recommendations found.")

    # --- LLM Evaluation of Recommendations ---
    print("\n--- LLM Evaluation of Recommendations ---")
    if recommendations:
        recommendation_str = "\n".join([f"- [ID: {cid}] {courses_df[courses_df['course_id'] == cid].iloc[0]['title']}" for cid, score in recommendations])

        evaluation_prompt = ChatPromptTemplate.from_messages([
            ("system", "You are an expert in course recommendations. Based on the user's query and the provided course catalog, evaluate if the given top 5 recommendations are relevant and appropriate. Provide a brief explanation for your assessment."),
            ("human", "## Course Catalog:\n{catalog}\n\n## User Query:\n'{query}'\n\n## Top 5 Recommendations:\n{recommendations}\n\nEvaluate the relevance of these recommendations:")
        ])

        evaluation_chain = evaluation_prompt | llm
        evaluation_result = evaluation_chain.invoke({"catalog": course_catalog_str, "query": query, "recommendations": recommendation_str})

        print(evaluation_result.content)
    else:
        print("No recommendations to evaluate.")

    print("="*50)

# --- Main Execution ---

print("\n--- 6. Evaluation Report ---")


evaluation_queries = [
    {
        "user_id" : "user_1",
        "query": "I've completed the 'Python Programming for Data Science' course and enjoy data visualization. What should I take next?"
    },
    {
        "user_id" : "user_2",
        "query": "I know Azure basics and want to manage containers and build CI/CD pipelines. Recommend courses.",
    },
    {
        "user_id" : "user_3",
        "query": "My background is in ML fundamentals; I'd like to specialize in neural networks and production workflows.",
    },
    {
        "user_id" : "user_2",
        "query": "I want to learn to build and deploy microservices with Kubernetes—what courses fit best?",
    },
    {
        "user_id" : "user_4",
        "query": "I'm interested in blockchain and smart contracts but have no prior experience. Which courses do you suggest?",
    }
]

for item in evaluation_queries:
    run_evaluation_query(
        user_id=item["user_id"],
        query=item["query"],
    )


--- 6. Evaluation Report ---

EVALUATING QUERY FOR USER 'user_1'
User Query: I've completed the 'Python Programming for Data Science' course and enjoy data visualization. What should I take next?
>> Node: extract_completed_courses_node
   LLM extracted completed courses: ['C016']
>> Node: fetch_user_history_node
   Found 4 historical preferences for user 'user_1'.
>> Node: synthesize_and_recognize_node
   Recognized new preferences: ['Python Programming', 'Data Science', 'Data Visualization', 'Course Recommendations']
   Indexed 4 new preferences for user 'user_1'.
   Synthesized Query (with history): Given your completion of the "Python Programming for Data Science" course and your enthusiasm for data visualization, the ideal next step would be to explore advanced data visualization techniques using Python libraries such as Matplotlib, Seaborn, or Plotly. Additionally, consider courses that delve into data storytelling, interactive dashboards, or machine learning applications in data