In [91]:
import os
import re
import json
import langchain

In [92]:
from langchain_community.document_loaders import JSONLoader
loader = JSONLoader(
    file_path='temp.json',
    jq_schema='.[]',
    text_content=False
)

docs = loader.load()


for index,doc in enumerate(docs):
    temp = json.loads(doc.page_content)
    data = ""
    del doc.metadata['source']
    del doc.metadata['seq_num']    


    for key, value in temp.items():
        if key != "description" and key != "course_link":
            if isinstance(value, list):
                # Join string representations of list items with spaces
                value = ' '.join(str(v).lower() for v in value)
            elif isinstance(value, str):
                value = value.lower()
            doc.metadata[key] = value
    
    for key, value in temp.items():
        if key == "description":
            if value is not None:
                # Replace newlines with spaces for uniformity
                desc = value.replace('\n', ' ')
                # Remove unnecessary '*' and '-' characters
                desc = re.sub(r'[\*\-]', '', desc)
                # Remove extra spaces
                desc = re.sub(r'\s+', ' ', desc).strip()
                # Split on bullet points (•)
                bullet_parts = [part.strip() for part in re.split(r'•', desc) if part.strip()]
                sentences = []
                for part in bullet_parts:
                    # Instead of splitting on every period, only split on periods that are not part of ordered lists (e.g., "a.", "1.")
                    # We'll use a regex to split on periods that are NOT preceded by a single letter/number and a space
                    # This will keep "a. " or "1. " together
                    sub_sentences = re.split(r'(?<!\b[a-zA-Z0-9])\.(?!\d)', part)
                    for s in sub_sentences:
                        s = s.strip()
                        if s:
                            # Ensure each sentence ends with a period
                            if not s.endswith('.'):
                                s += '.'
                            # Convert to lowercase and remove non-alphabetic characters (except spaces)
                            s = s.lower()
                            s = re.sub(r'[^a-z\s]', '', s)
                            # Remove extra spaces again after removing non-alphabetic chars
                            s = re.sub(r'\s+', ' ', s).strip()
                            if s:  # Only add non-empty sentences
                                sentences.append(s)
                # Reconstruct with each sentence on a new line
                updated_description = '\n'.join(sentences)
                data += f"{key} : {updated_description}"
            else:
                data += f"{key} : "
    del doc.metadata["class_time"]
    doc.metadata["index"]: index
    doc.page_content = data
    


In [121]:
print(docs[1])

page_content='description : brief history of cancer
thoughts on the metabolic and genetic basis of cancer since early s how genetic basis of cancer became the mainstay of understanding cancer
oncogenes and tumor suppressor genes
clonal origin of cancer
stem cells versus cancer stem cells
immunology of cancer
epigenetics of cancer
proteomics transcriptomics metabolomics of cancer
role of mitochondria in cancer
reemergence of metabolic basis of cancer
gwas in cancer
cancer as an evolutionary process
application of game theory in cancer
yeast as a model to understand cancer
bioenergetics of cancer rate versus efficiency
nongenetic heterogeneity in cancer' metadata={'course_code': 'bb703', 'course_name': 'cancer genetics and metabolism', 'department': 'biosciences & bioengineering', 'instructors': '', 'tags': '', 'credits': 6, 'prerequisites': '', 'is_running': False, 'venue': '', 'duration': '', 'slot': ''}


In [94]:
from langchain_huggingface import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
)

In [97]:
vector_store.delete_collection()

In [98]:
from langchain.vectorstores import Chroma

vector_store = Chroma(
    embedding_function = embedding_model,
    persist_directory='course_vector_database',
    collection_name = 'sample'
)

In [99]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 150, chunk_overlap=15)
chunks = text_splitter.split_documents(docs)

In [100]:
len(chunks)

1163

In [101]:
vector_store.add_documents(chunks)

['795c7cc9-7ce7-46cc-ba0d-054ca4e97e69',
 'a5e6f561-84a5-4058-bb0a-6573168997fb',
 '5cc904ad-8258-4e20-b089-9e08008a29d4',
 '7dbd908b-5157-45a5-91bd-93375542a2c9',
 '411cf054-86c5-409d-b618-9c07450a36bd',
 '623282a9-ab0e-4737-813f-5d21f929400c',
 'd7ffd1ef-134e-4cf4-a478-4247d913a954',
 '71e1e334-dfad-4a63-a18f-ddb4cf002c0b',
 '4ffaf003-cc89-41e7-b4a3-267cb3c31722',
 '5141be53-3d15-4c97-9948-acdaa0b7f725',
 '9f6bd92b-f519-48c9-a2ad-4a3ff1e8337f',
 '7b857e4e-2258-46d3-9706-0f055168dad1',
 '6a8c5053-c5e9-43c2-9b1e-9a7c9a61b5e2',
 'bb88f057-5125-481b-b835-a715b6b163fc',
 '646dc607-d48f-4a2d-8a77-8930d079f5d5',
 '693afa94-a8b6-43c3-aa59-1e79bdbabb92',
 '030bd816-8d97-41e6-a0a8-e1f612d8bb00',
 '8baf0b1a-0a00-4301-ad5a-d1a482972a7b',
 'fd5b81ec-cf8e-489d-b721-b858f277d511',
 '3633c566-dfcb-4345-bf74-5c494de5bc9f',
 '0c998a65-1d33-4daf-abfd-86c3f27f4691',
 '61b4f5de-283b-4882-8ce1-b1c7fd337f39',
 'ffc942a4-71fe-48a6-8f1c-2df40a4b2a36',
 '9d25e0d2-856c-43b2-9a64-b1bcd53346c2',
 '16204c8c-bc88-

In [102]:
query = "Is there any course taught by Roop Mallik in slot 6?"

# CORRECTED: The key 'instructors' now matches the document metadata
filter = {
    "$and": [
        {"instructors": {"$eq": "roop mallik"}},
        {"slot": {"$eq": "6"}}
    ]
}
# This part is correct
retriever = vector_store.as_retriever(search_kwargs={"filter": filter})

# CORRECTED: Pass the original string query here, not the filter dictionary
results = retriever.get_relevant_documents(query)

# This part is correct
for result in results:
    print(result, '\n')

page_content='c lens properties numerical aperture magnification aberrations corrections d resolution rayleigh criterion airy disc abbes principle lightmatter' metadata={'is_running': True, 'tags': 'theory', 'duration': 'fullsemester', 'instructors': 'roop mallik', 'slot': '6', 'credits': 6, 'department': 'biosciences & bioengineering', 'prerequisites': '', 'course_code': 'bb706', 'course_name': 'fundamental and practical aspects of microscopy', 'venue': 'lt 104'} 

page_content='description : optics for microscopy a wave nature of light birefringence interference diffraction b optical elements lenses filters polarizers c lens' metadata={'is_running': True, 'course_name': 'fundamental and practical aspects of microscopy', 'tags': 'theory', 'slot': '6', 'department': 'biosciences & bioengineering', 'instructors': 'roop mallik', 'venue': 'lt 104', 'prerequisites': '', 'credits': 6, 'course_code': 'bb706', 'duration': 'fullsemester'} 

page_content='molecule techniques introduction b atom

In [103]:
import os
from dotenv import load_dotenv

# This line loads the variables from your .env file into the environment
load_dotenv()

# You can now access your key using os.getenv()
my_api_key = os.getenv("GOOGLE_API_KEY")

# You can verify it's loaded (optional)
if my_api_key:
    print("✅ API Key loaded successfully!")
    # Your LangChain or other code that needs the key can now run.
    # It will often find the key automatically from the environment.
else:
    print("❌ Could not load API Key. Check your .env file and path.")

# Example: Using it with LangChain
# from langchain_google_genai import ChatGoogleGenerativeAI
# llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash") # This automatically uses the loaded key

✅ API Key loaded successfully!


In [104]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

class CourseMetaDataSearch(BaseModel):
    """
    A tool to find courses using specific metadata filters like instructor name, 
    course code, or slot number.
    """
    course_code : str = Field(default = None, description="The code for the course, e.g., 'AE103', 'CS419")
    instructors : str = Field(default = None, description="The name of the instructor teaching the course")
    slot: str = Field(default=None, description="The schedule slot for the course, e.g., '4'")


router_prompt = ChatPromptTemplate(
    [
        ("system", "You are an expert at routing a user's query. "
            "If the query contains specific criteria like an instructor's name, "
            "course code, or slot number, extract them. Otherwise, do not extract anything."),
        ("human", "{query}")
    ]
)

gemini_router_llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0)

llm_with_tool = gemini_router_llm.with_structured_output(CourseMetaDataSearch)
pre_retrieval_router_chain = router_prompt | llm_with_tool

In [None]:
query = "Find me a couse offered by professor Prabhu Ramachandran."

def retrieve_documents(query : str):
    router_decision = pre_retrieval_router_chain.invoke({"query":{query}})
    
    search_filter = {key: value for key,value in router_decision.dict().items() if value is not None}

retrieve_documents(query)

In [None]:
from langchain_core.output_parsers import StrOutputParser

gemini_generator_llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.5) 

In [None]:
# This error occurs because Chroma's filtering expects the "filter" argument to use a single operator (like "$and", "$or", "$eq", etc.) at the top level,
# buut the code is passing a dictionary of field-value pairs directly (e.g., {'instructors': ..., 'slot': ..., 'department': ...}).
# Chroma expects: {"$and": [{"instrctors": ...}, {"slot": ...}, {"department": ...}]}
# or a single field: {"instructors": ...}
# but NOT multiple fields at the top level.

# The error message:
# ValueError: Expected where to have exactly one operator, got {'instructors': 'roop mallik', 'slot': '6', 'department': 'bioscience and engineering'} in query.
# means that the filter dict is not wrapped in an operator.

# To fix this, wrap the filter in a "$and" operator if there are multiple fields.

import os
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# --- 1. SETUP ---
load_dotenv()

gemini_router_llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0)
gemini_generator_llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.1) 

# --- 3. PRE-RETRIEVAL ROUTER SETUP ---
class CourseMetadataSearch(BaseModel):
    """A tool to find courses using filters like instructor, course code, slot, or department."""
    course_code: str = Field(default=None, description="The code for the course, e.g., 'BB706'")
    instructors: str = Field(default=None, description="The name of the instructor")
    slot: str = Field(default=None, description="The schedule slot for the course, e.g., '6'")
    department: str = Field(default=None, description="The department offering the course, e.g., 'Physics', 'Mathematics', or 'Bioscience and Engineering'")

    def __init__(self, **data):
        super().__init__(**data)
        if self.course_code is not None:
            self.course_code = self.course_code.lower()
        if self.instructors is not None:
            self.instructors = self.instructors.lower()
        if self.slot is not None:
            self.slot = self.slot.lower()
        if self.department is not None:
            self.department = self.department.lower()

llm_with_tool = gemini_router_llm.with_structured_output(CourseMetadataSearch)
router_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an expert at routing a user's query. If the query contains specific criteria like an instructor's name, course code, department, or slot number, extract them."),
    ("human", "{query}")
])
pre_retrieval_router_chain = router_prompt | llm_with_tool

# --- 4. FINAL ANSWER GENERATION CHAIN ---
final_prompt_template = """
You are a helpful university course assistant. Answer the user's question based ONLY on the following context.
If the context is empty, say you don't have information on the topic.

CONTEXT:
{context}

QUESTION:
{question}

YOUR ANSWER:
"""
final_prompt = ChatPromptTemplate.from_template(final_prompt_template)
final_rag_chain = final_prompt | gemini_generator_llm | StrOutputParser()

# --- 5. THE COMPLETE APPLICATION LOGIC ---
def ask_course_bot(query: str):
    """
    This function orchestrates the entire RAG pipeline.
    """
    print(f"\n🤔 Query: '{query}'")

    # Step 1: Call the pre-retrieval router
    router_decision = pre_retrieval_router_chain.invoke({"query": query})
    search_filter = {key: value for key, value in router_decision.dict().items() if value is not None}

    # # Step 2: Retrieve documents based on the router's decision
    if search_filter:
        print(f"➡️ Router Decision: METADATA search. Filter: {search_filter}")
        # --- FIX: Wrap filter in $and if multiple fields ---
        if len(search_filter) == 1:
            filter_arg = search_filter
        else:
            filter_arg = {"$and": [{k: v} for k, v in search_filter.items()]}
        retriever = vector_store.as_retriever(search_kwargs={"filter": filter_arg})
    else:
        print("➡️ Router Decision: SEMANTIC search.")
        retriever = vector_store.as_retriever()
    # Custom filtering: retrieve all docs for slot 2, then filter by fuzzy instructor match (score > 80)
    # The error is in this block:
    #     if score < 80:
    #         filtered_docs.append(doc)
    # It should be "if score >= 80" to keep docs with a good fuzzy match.
    # Also, you should not call retriever.get_relevant_documents(query) again after filtering,
    # as that will overwrite your filtered_docs. Use only the filtered_docs as retrieved_docs.
    # Here is the corrected code:
    
    retrieved_docs = retriever.get_relevant_documents(query)

    # Step 3: Generate the final answer
    context_str = "\n\n---\n\n".join(
        [f"Content: {doc.page_content}\nMetadata: {doc.metadata}" for doc in retrieved_docs]
    )
    
    # print("💬 Generating final answer...")
    answer = final_rag_chain.invoke({
        "context": context_str,
        "question": query
    })

    print("\n✅ Answer:\n", answer)

# --- 6. LET'S RUN IT! ---

# Test a metadata-based query
ask_course_bot("What course is taught by Roop Mallik in slot 6 in bioscience & bioengineering department?")

In [112]:
slot_filter = {"slot": {"$eq": "6"}}
retriever = vector_store.as_retriever(search_kwargs={"filter": slot_filter})
candidate_docs = retriever.get_relevant_documents(query)
print(candidate_docs)

[Document(metadata={'tags': 'theory', 'instructors': 'shivasubramanian gopalakrishnan', 'course_name': 'galerkin methods for fluid dynamics', 'venue': 'cc 101', 'duration': 'fullsemester', 'department': 'mechanical engineering', 'slot': '6', 'is_running': True, 'credits': 6, 'prerequisites': '', 'course_code': 'me757'}, page_content='the implementation of the methods'), Document(metadata={'venue': 'f 24', 'credits': 4, 'duration': 'firsthalf', 'course_name': 'liquid material processing', 'is_running': True, 'department': 'mechanical engineering', 'instructors': 'prashant prabhakar date', 'slot': '6', 'prerequisites': '', 'course_code': 'me788', 'tags': 'theory'}, page_content='smart manufacturing tutorials analytical and numerical problems simulation examples casestudies'), Document(metadata={'credits': 6, 'duration': 'fullsemester', 'is_running': True, 'department': 'mechanical engineering', 'prerequisites': '', 'course_code': 'me757', 'instructors': 'shivasubramanian gopalakrishnan',

In [None]:
# query = "Is there any course taught by Roop Mallik in slot 6?"

# The first part works because it matches the exact string "roop mallik" in the "instructors" metadata.
# The second part may not return results if there are no documents with slot "2" and the query matches nothing,
# or if the query string is not relevant to the slot filter.
# To debug, let's print the number of results and inspect the candidate_docs.

# Filter for instructor "roop mallik" and slot "6"
filter = {
    "$and": [
        {"instructors": {"$eq": "roop mallik"}},
        {"slot": {"$eq": "6"}}
    ]
}
retriever = vector_store.as_retriever(search_kwargs={"filter": filter})
results = retriever.get_relevant_documents(query)
print(f"Results for instructor 'roop mallik' and slot '6': {len(results)}")
for result in results:
    print(result, '\n')

# Filter for slot "2"
slot_filter = {"slot": {"$eq": "2"}}
retriever = vector_store.as_retriever(search_kwargs={"filter": slot_filter})
candidate_docs = retriever.get_relevant_documents(query)
print(f"Results for slot '2': {len(candidate_docs)}")
for doc in candidate_docs:
    print(doc, '\n')

# If you are not getting results for slot "2", check:
# 1. Whether any documents have slot "2" in their metadata.
# 2. Whether the query string is relevant to those documents.
# 3. Try printing all documents with slot "2" regardless of query to debug:
all_slot2 = vector_store.similarity_search("", filter=slot_filter)
print(f"All docs with slot '2' (no query): {len(all_slot2)}")
for doc in all_slot2:
    print(doc, '\n')

page_content='c lens properties numerical aperture magnification aberrations corrections d resolution rayleigh criterion airy disc abbes principle lightmatter' metadata={'tags': 'theory', 'course_name': 'fundamental and practical aspects of microscopy', 'slot': '6', 'course_code': 'bb706', 'duration': 'fullsemester', 'venue': 'lt 104', 'instructors': 'roop mallik', 'credits': 6, 'department': 'biosciences & bioengineering', 'prerequisites': '', 'is_running': True} 

page_content='description : optics for microscopy a wave nature of light birefringence interference diffraction b optical elements lenses filters polarizers c lens' metadata={'prerequisites': '', 'course_name': 'fundamental and practical aspects of microscopy', 'is_running': True, 'tags': 'theory', 'slot': '6', 'instructors': 'roop mallik', 'course_code': 'bb706', 'duration': 'fullsemester', 'department': 'biosciences & bioengineering', 'credits': 6, 'venue': 'lt 104'} 

page_content='molecule techniques introduction b atom

In [132]:
from fuzzywuzzy import fuzz

print(fuzz.token_sort_ratio("yeramalli chandra shekher ","chandra shekher yeramalli"))

100
