In [1]:
!pip install langchain langgraph langchain-community langchain-text-splitters langchain-groq langchain-huggingface langchain-chroma pymupdf arxiv sentence-transformers

In [3]:
from google.colab import userdata
import os
os.environ['GROQ_API_KEY'] = userdata.get('GROQ_API_KEY')
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

In [4]:
from typing import TypedDict, Literal, List
class AgentState(TypedDict):
    messages: List[str]
    query: str
    retriever_choice: str
    retrieved_docs: List[str]
    final_answer: str

In [5]:
from langchain_groq import ChatGroq
llm = ChatGroq(model_name="openai/gpt-oss-120b", temperature=0)

In [6]:
from langchain.document_loaders import WebBaseLoader
loader = WebBaseLoader([
    "https://python.langchain.com/docs/introduction/",
    "https://python.langchain.com/docs/tutorials/rag/",
    "https://python.langchain.com/docs/how_to/",
])
docs = loader.load()



In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)
splits = text_splitter.split_documents(docs)

In [10]:
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [11]:
from langchain_chroma import Chroma
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory="./chroma_db"
)

In [12]:
# 1. Standard Similarity Retriever
similarity_retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4}
)

In [13]:
from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever

# 2. Parent Document Retriever
store = InMemoryStore()
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

parent_doc_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

In [14]:
from langchain.retrievers.multi_query import MultiQueryRetriever
# 3. Multi-Query Retriever
multiquery_retriever = MultiQueryRetriever.from_llm(
    retriever=similarity_retriever,
    llm=llm
)

In [15]:
from langchain.retrievers.self_query.base import SelfQueryRetriever

# 4. Self-Query Retriever (simplified version)
selfquery_retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 4, "lambda_mult": 0.25}
)

In [16]:
# 5. HyDE-style retriever (hypothetical document embeddings simulation)
hyde_retriever = vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.5, "k": 3}
)

In [17]:
def parent_doc_retrieve(state: AgentState) -> AgentState:
    """Retrieve using Parent Document Retriever for comprehensive context"""
    query = state["query"]
    docs = parent_doc_retriever.get_relevant_documents(query)
    retrieved_content = [doc.page_content for doc in docs]

    state["retrieved_docs"] = retrieved_content
    state["retriever_choice"] = "parent_document"
    return state

def multiquery_retrieve(state: AgentState) -> AgentState:
    """Retrieve using Multi-Query for query expansion"""
    query = state["query"]
    docs = multiquery_retriever.get_relevant_documents(query)
    retrieved_content = [doc.page_content for doc in docs]

    state["retrieved_docs"] = retrieved_content
    state["retriever_choice"] = "multi_query"
    return state

def selfquery_retrieve(state: AgentState) -> AgentState:
    """Retrieve using Self-Query with MMR for diversity"""
    query = state["query"]
    docs = selfquery_retriever.get_relevant_documents(query)
    retrieved_content = [doc.page_content for doc in docs]

    state["retrieved_docs"] = retrieved_content
    state["retriever_choice"] = "self_query"
    return state

def hyde_retrieve(state: AgentState) -> AgentState:
    """Retrieve using HyDE-style approach with score threshold"""
    query = state["query"]
    docs = hyde_retriever.get_relevant_documents(query)
    retrieved_content = [doc.page_content for doc in docs]

    state["retrieved_docs"] = retrieved_content
    state["retriever_choice"] = "hyde"
    return state

def similarity_retrieve(state: AgentState) -> AgentState:
    """Default similarity retriever"""
    query = state["query"]
    docs = similarity_retriever.get_relevant_documents(query)
    retrieved_content = [doc.page_content for doc in docs]

    state["retrieved_docs"] = retrieved_content
    state["retriever_choice"] = "similarity"
    return state

In [18]:
from langgraph.types import Command

def supervisor_route(state: AgentState) -> Command[Literal["parent_doc_retrieve", "multiquery_retrieve", "selfquery_retrieve", "hyde_retrieve", "similarity_retrieve"]]:
    """
    Supervisor agent that routes queries to appropriate retrievers based on query characteristics
    """
    query = state["query"].lower()

    # Routing logic based on query analysis
    if any(word in query for word in ["comprehensive", "detailed", "complete", "full context"]):
        return Command(goto="parent_doc_retrieve")

    elif any(word in query for word in ["variations", "different ways", "alternatives", "multiple"]):
        return Command(goto="multiquery_retrieve")

    elif any(word in query for word in ["diverse", "variety", "different perspectives", "broad"]):
        return Command(goto="selfquery_retrieve")

    elif any(word in query for word in ["hypothetical", "what if", "suppose", "imagine"]):
        return Command(goto="hyde_retrieve")

    else:
        return Command(goto="similarity_retrieve")

In [19]:
def generate_answer(state: AgentState) -> AgentState:
    """Generate final answer using retrieved documents"""
    query = state["query"]
    docs = state["retrieved_docs"]
    retriever_used = state["retriever_choice"]

    context = "\n\n".join(docs[:3])  # Use top 3 documents

    prompt = f"""Based on the following context, answer the question comprehensively.

Context from {retriever_used} retriever:
{context}

Question: {query}

Answer:"""

    response = llm.invoke(prompt)
    state["final_answer"] = response.content
    return state

In [20]:
from langgraph.graph import StateGraph, START, END
workflow = StateGraph(AgentState)

workflow.add_node("supervisor", supervisor_route)
workflow.add_node("parent_doc_retrieve", parent_doc_retrieve)
workflow.add_node("multiquery_retrieve", multiquery_retrieve)
workflow.add_node("selfquery_retrieve", selfquery_retrieve)
workflow.add_node("hyde_retrieve", hyde_retrieve)
workflow.add_node("similarity_retrieve", similarity_retrieve)
workflow.add_node("generate_answer", generate_answer)

workflow.add_edge(START, "supervisor")

workflow.add_edge("parent_doc_retrieve", "generate_answer")
workflow.add_edge("multiquery_retrieve", "generate_answer")
workflow.add_edge("selfquery_retrieve", "generate_answer")
workflow.add_edge("hyde_retrieve", "generate_answer")
workflow.add_edge("similarity_retrieve", "generate_answer")

workflow.add_edge("generate_answer", END)

app = workflow.compile()

In [25]:
def run_rag_query(query: str):
    """Run a query through the multi-retriever RAG system"""
    initial_state = AgentState(
        messages=[],
        query=query,
        retriever_choice="",
        retrieved_docs=[],
        final_answer=""
    )

    result = app.invoke(initial_state)

    print(f"Query: {query}")
    print(f"Retriever Used: {result['retriever_choice']}")
    print(f"Answer: {result['final_answer']}")
    print("-" * 80)

    return result

test_queries = [
    "What is LangChain and how does it work?",  # Should use similarity
    "Give me comprehensive details about RAG implementation",  # Should use parent_doc
    "What are different ways to implement retrieval?",  # Should use multiquery
    "Show me diverse approaches to document processing",  # Should use selfquery
    "What if I wanted to create hypothetical documents?",  # Should use hyde
]

for query in test_queries:
    run_rag_query(query)

  docs = similarity_retriever.get_relevant_documents(query)


Query: What is LangChain and how does it work?
Retriever Used: similarity
Answer: **LangChain – a high‑level framework for building LLM‑powered applications**

---

## 1. What is LangChain?

LangChain is an open‑source Python (and now also JavaScript/TypeScript) framework that makes it easier to **design, develop, and deploy applications that use large language models (LLMs)** such as OpenAI’s GPT‑4, Anthropic’s Claude, Llama‑2, Gemini, etc.  

Instead of treating an LLM as a single “chat‑completion” endpoint, LangChain treats it as **one component in a larger data‑flow pipeline** and provides reusable building blocks for:

| Stage | What it does | Typical LangChain component |
|-------|--------------|-----------------------------|
| **Prompt creation** | Assemble dynamic, context‑aware prompts | PromptTemplate, FewShotPromptTemplate |
| **LLM invocation** | Call the model (chat, completion, embeddings) | LLM wrappers (OpenAI, AzureOpenAI, HuggingFace, etc.) |
| **Memory / State** | Ke

  self.vectorstore.similarity_search_with_relevance_scores(


Query: Show me diverse approaches to document processing
Retriever Used: self_query
Answer: Below is a **catalog of the most common (and a few emerging) ways to process documents**, grouped by the stage of the pipeline they belong to and the type of technique they use.  
Feel free to cherry‑pick the pieces that fit your use‑case, mix‑and‑match them, or use the whole end‑to‑end flow as a starter template.

---

## 1️⃣  Pre‑ingestion – Getting the raw material into a usable form  

| Approach | What it does | Typical tools / libraries | When to use it |
|----------|--------------|---------------------------|----------------|
| **File‑type parsers** | Convert PDFs, DOCX, HTML, emails, scans (OCR) → plain text or structured JSON | `pdfminer`, `PyMuPDF`, `docx2txt`, `BeautifulSoup`, `pdfplumber`, `tesseract‑ocr`, `unstructured.io` | You have heterogeneous sources; you need a single “text” representation. |
| **Language detection & normalization** | Detect language, transliterate, normalize 

In [26]:
def interactive_rag():
    """Interactive function to test the RAG system"""
    while True:
        query = input("\nEnter your query (or 'quit' to exit): ")
        if query.lower() == 'quit':
            break

        run_rag_query(query)

interactive_rag()


Enter your query (or 'quit' to exit): "What is LangChain and how does it work?"
Query: "What is LangChain and how does it work?"
Retriever Used: similarity
Answer: **LangChain – a high‑level framework for building LLM‑powered applications**

---

## 1. What is LangChain?

LangChain is an open‑source Python (and now also JavaScript/TypeScript) framework that makes it easier to **design, develop, and deploy applications that use large language models (LLMs)** such as OpenAI’s GPT‑4, Anthropic’s Claude, LLaMA, Gemini, etc.  

Instead of writing ad‑hoc code that stitches together prompt strings, API calls, and post‑processing, LangChain provides **building blocks (components) and patterns** that let you:

| Goal | LangChain component that solves it |
|------|------------------------------------|
| **Prompt engineering** – reusable, templated prompts | `PromptTemplate`, `FewShotPromptTemplate`, `ChatPromptTemplate` |
| **LLM invocation** – abstract over different providers | `LLM` wrappers