# INSTALLING DEPENDENCIES

In [1]:
!pip install -q "langchain-experimental==0.3.4" "langchain-google-genai==2.1.5" \
"langchain-community==0.3.25" "langchain==0.3.25" "langchain-core==0.3.65" \
"pydantic-settings==2.9.1" "docling==2.37.0" "qdrant-client" "langchain-qdrant" \
"langchain-text-splitters==0.3.8" "transformers==4.52.4"

# Setting Up the Environment and Access Secrets

In [2]:
from google.colab import userdata
import os
import time
import json

# LangChain and document processing imports
from langchain_core.documents import Document
from docling.document_converter import DocumentConverter
from langchain_experimental.text_splitter import SemanticChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_qdrant import Qdrant
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser,JsonOutputParser
from langchain.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field

# Retrieve the API key from Colab's Secret Manager and publish it as the
# GOOGLE_API_KEY environment variable for the rest of the notebook.
try:
    os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')
    print("SUCCESS: GOOGLE_API_KEY has been loaded from Colab Secrets.")
except Exception as e:
    # Deliberately best-effort: report the problem and let the cell finish
    # instead of raising, so the user sees the hint below.
    print("ERROR: Could not find the secret 'GOOGLE_API_KEY'. Please add it via the 🔑 icon.")
    # Show the underlying error as well: this handler catches every exception,
    # so the cause may be something other than a missing secret.
    print(f"       Details: {type(e).__name__}: {e}")

SUCCESS: GOOGLE_API_KEY has been loaded from Colab Secrets.


## Questions the notebook will automatically ask the RAG agent.

In [3]:
# Fixed question set used for accuracy testing; every experiment run answers
# exactly these questions, in this order.
GOLDEN_QUESTIONS = [
    "What BLEU score did the Transformer (big) model achieve on the WMT 2014 English-to-German translation task?",
    "What is the main architectural component that the Transformer model uses to replace recurrence and convolutions?",
    "According to the paper, what are the three primary advantages of self-attention over recurrent layers?",
    "How many parallel attention layers, or 'heads', were used in the base Transformer model?",
    "Does this paper present results for using the Transformer model on image classification tasks?",
]

# Confirm how many questions were registered.
print("\nLoaded {} golden questions for accuracy testing.".format(len(GOLDEN_QUESTIONS)))


Loaded 5 golden questions for accuracy testing.


# The Automated "LLM-as-a-Judge" Evaluator Function

In [4]:
# This function uses a powerful LLM to automatically score the RAG agent's answers.


# Structured verdict requested from the LLM judge for a single answer.
# The class is handed to JsonOutputParser as `pydantic_object`, which derives
# the format instructions inserted into the judge prompt.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose — a docstring may be carried into the generated schema and therefore
# into the prompt text; confirm before adding one.
class Evaluation(BaseModel):
    # Binary verdict: 0 = incorrect, 1 = correct (per the field description).
    score: int = Field(description="The score, either 0 for incorrect or 1 for correct.")
    # Short justification from the judge; the scoring loop in
    # evaluate_answers_with_llm only reads "score", not this field.
    reason: str = Field(description="A brief justification for the given score.")

async def evaluate_answers_with_llm(qa_pair_list: list):
    """
    Score generated answers with an LLM judge and return the fraction judged correct.

    Every item of ``qa_pair_list`` is passed unchanged to the evaluation prompt, so
    it must provide the ``context``, ``question`` and ``answer`` keys the template
    uses. Items are judged one at a time. An item contributes 0 when its evaluation
    raises (model call, JSON parsing, missing key, ...) or when the parsed score is
    anything other than 0 or 1.

    Args:
        qa_pair_list: List of dicts with ``context``, ``question`` and ``answer``.

    Returns:
        float: Accuracy between 0.0 and 1.0; 0.0 when ``qa_pair_list`` is empty.
    """
    # Guard first: an empty list would otherwise reach the final division and
    # raise ZeroDivisionError (and an LLM client would be built for nothing).
    if not qa_pair_list:
        print("🤖 No answers to evaluate; reporting accuracy as 0.00%.")
        return 0.0

    print("🤖 Initializing LLM-as-a-Judge for automated scoring...")

    # temperature=0 keeps the judge as repeatable as the model allows.
    judge_llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0)

    # Parses the model reply as JSON and supplies the format instructions
    # derived from the Evaluation schema.
    output_parser = JsonOutputParser(pydantic_object=Evaluation)

    evaluation_prompt_template = """
    You are an impartial evaluator. Your task is to judge if the AI's answer is correct based ONLY on the provided Context.

    **Context:**
    {context}
    ---
    **Question:**
    {question}
    ---
    **Generated Answer:**
    {answer}
    ---
    Based on the context, is the Generated Answer a correct response to the Question?
    {format_instructions}
    """

    prompt = ChatPromptTemplate.from_template(
        template=evaluation_prompt_template,
        partial_variables={"format_instructions": output_parser.get_format_instructions()},
    )

    evaluator_chain = prompt | judge_llm | output_parser

    total_score = 0
    for i, qa_pair in enumerate(qa_pair_list):
        print(f"  -> Evaluating answer {i+1}/{len(qa_pair_list)}...")
        try:
            # Use ainvoke for the async call
            result = await evaluator_chain.ainvoke(qa_pair)
            score = result.get("score", 0)
            # The judge is asked for 0 or 1. Reject anything else so a stray
            # value such as 2 or "1" cannot inflate (or corrupt) the accuracy.
            if score not in (0, 1):
                print(f"    -> WARNING: Unexpected score {score!r} for one answer, scoring it 0.")
                score = 0
            total_score += score
        except Exception as e:
            # Best-effort: a failed evaluation simply contributes nothing.
            print(f"    -> WARNING: Evaluation failed for one answer, scoring it 0. Error: {e}")

    accuracy = total_score / len(qa_pair_list)
    print(f"🤖 Judge finished. Final Accuracy: {accuracy:.2%}")
    return accuracy

# The complete experiment function

In [5]:
# Prerequisite: upload 'test_document.pdf' to the Colab session storage before running the cells below.

In [6]:
async def run_rag_experiment(file_path, chunking_strategy, embedding_model_name):
    """
    Run one RAG configuration end to end and collect its metrics.

    Steps: convert the document to markdown with Docling, split it with the
    selected chunker, index the chunks in an in-memory Qdrant collection, then
    answer every question in ``GOLDEN_QUESTIONS``. For each question the context
    is retrieved once and that exact text is both given to the answering LLM and
    stored for the judge.

    Args:
        file_path: Path of the document to ingest.
        chunking_strategy: ``"semantic"`` selects ``SemanticChunker``; any other
            value selects ``RecursiveCharacterTextSplitter`` (1000 chars, 200 overlap).
        embedding_model_name: ``"google"`` selects ``models/text-embedding-004``;
            any other value selects ``BAAI/bge-small-en-v1.5``.

    Returns:
        dict with keys ``config_name``, ``ingestion_latency`` (formatted string in
        seconds), ``chunk_count`` and ``qa_pairs`` (list of dicts holding
        ``question``, ``answer`` and ``context``).
    """
    # NOTE(review): declared async, but every call below is synchronous and will
    # block the event loop while it runs.
    print(f"\n{'='*30}\n--- Running Test: [{chunking_strategy.capitalize()} + {embedding_model_name.upper()}] ---\n{'='*30}")

    ingestion_start_time = time.time()
    print("1. Loading document...")
    converter = DocumentConverter()
    result = converter.convert(file_path)
    full_text = result.document.export_to_markdown()
    # The whole document becomes a single Document; the splitter does the chunking.
    docs = [Document(page_content=full_text, metadata={"source": os.path.basename(file_path)})]

    print("2. Initializing embeddings and splitter...")
    if embedding_model_name == "google":
        embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
    else:
        embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-small-en-v1.5", encode_kwargs={'normalize_embeddings': True})

    if chunking_strategy == "semantic":
        text_splitter = SemanticChunker(embeddings, breakpoint_threshold_type="percentile")
    else:
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

    print("3. Chunking document...")
    chunks = text_splitter.split_documents(docs)
    # NOTE(review): the timer stops here, before the vector store is built, so
    # the reported latency covers conversion + chunking but not indexing.
    ingestion_latency = time.time() - ingestion_start_time
    print(f"   -> Ingestion complete in {ingestion_latency:.2f}s. {len(chunks)} chunks created.")

    print("4. Setting up in-memory RAG chain...")
    vectorstore = Qdrant.from_documents(chunks, embeddings, location=":memory:", collection_name="test_collection")
    retriever = vectorstore.as_retriever()
    llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")
    prompt = ChatPromptTemplate.from_template("Answer the question based only on the following context:\n\n{context}\n\nQuestion: {question}")
    # The chain deliberately has no retriever step: context is retrieved once per
    # question in the loop below and passed in explicitly, so the generator and
    # the judge see the same text and retrieval is not run twice.
    answer_chain = prompt | llm | StrOutputParser()

    print("5. Generating answers for accuracy testing...")
    qa_pairs_for_evaluation = []
    for question in GOLDEN_QUESTIONS:
        # Retrieve the context for the question (single retrieval per question).
        retrieved_docs = retriever.invoke(question)
        context_text = "\n---\n".join(doc.page_content for doc in retrieved_docs)

        # Generate the answer from exactly that context.
        answer = answer_chain.invoke({"context": context_text, "question": question})

        # Store all three parts for the judge
        qa_pairs_for_evaluation.append({"question": question, "answer": answer, "context": context_text})

    print("   -> Answer generation complete.")

    return {
        "config_name": f"{chunking_strategy.capitalize()} ({embedding_model_name.upper()})",
        "ingestion_latency": f"{ingestion_latency:.2f}s",
        "chunk_count": len(chunks),
        "qa_pairs": qa_pairs_for_evaluation
    }

# Executing All Experiments

In [7]:
import asyncio

async def main():
    """
    Run both experiment configurations, score each with the LLM judge, and
    print the findings as a markdown table.

    All answer generation finishes before any judging starts. If the test
    document is missing, an error hint is printed and nothing else runs.
    """
    TEST_DOCUMENT_PATH = "test_document.pdf"

    # Guard clause: without the document there is nothing to run.
    if not os.path.exists(TEST_DOCUMENT_PATH):
        print("\nERROR: 'test_document.pdf' not found.")
        print("Please upload the file to the Colab session storage (📁 icon).")
        return

    # One entry per configuration, in run order:
    # (announcement to print, chunking strategy, embedding model).
    experiment_plan = [
        ("Running 'Fastest Baseline' configuration...", "recursive", "google"),
        ("\nRunning 'Highest Quality' configuration...", "semantic", "google"),
    ]

    # Phase 1: generate answers for every configuration.
    experiment_outputs = []
    for announcement, chunking, embedding in experiment_plan:
        print(announcement)
        experiment_outputs.append(await run_rag_experiment(TEST_DOCUMENT_PATH, chunking, embedding))

    # Phase 2: judge each result set and attach its formatted accuracy.
    report_rows = []
    for outcome in experiment_outputs:
        if not outcome:
            continue
        judged_accuracy = await evaluate_answers_with_llm(outcome['qa_pairs'])
        outcome['accuracy'] = f"{judged_accuracy:.0%}"
        report_rows.append(outcome)

    # Phase 3: print the findings table.
    print(f"\n\n{'='*25} FINAL REPORT DATA {'='*25}")
    print("The final FINDINGS table is shown below :")
    print("=================================================================\n")
    print("| Configuration | Ingestion Latency | Chunk Count | Retrieval Accuracy (Auto-Scored) |")
    print("| :--- | :--- | :--- | :--- |")
    for row in report_rows:
        print(f"| **{row['config_name']}** | {row['ingestion_latency']} | {row['chunk_count']} | **{row['accuracy']}** |")


In [8]:
# Run the main asynchronous function.
# Top-level `await` is used here instead of asyncio.run(); this relies on the
# notebook kernel accepting `await` at cell level (in a plain Python script
# this line would be a SyntaxError).
await main()

Running 'Fastest Baseline' configuration...

--- Running Test: [Recursive + GOOGLE] ---
1. Loading document...
2. Initializing embeddings and splitter...
3. Chunking document...
   -> Ingestion complete in 35.26s. 70 chunks created.
4. Setting up in-memory RAG chain...
5. Generating answers for accuracy testing...
   -> Answer generation complete.

Running 'Highest Quality' configuration...

--- Running Test: [Semantic + GOOGLE] ---
1. Loading document...
2. Initializing embeddings and splitter...
3. Chunking document...
   -> Ingestion complete in 38.81s. 21 chunks created.
4. Setting up in-memory RAG chain...
5. Generating answers for accuracy testing...
   -> Answer generation complete.
🤖 Initializing LLM-as-a-Judge for automated scoring...
  -> Evaluating answer 1/5...
  -> Evaluating answer 2/5...
  -> Evaluating answer 3/5...
  -> Evaluating answer 4/5...
  -> Evaluating answer 5/5...
🤖 Judge finished. Final Accuracy: 80.00%
🤖 Initializing LLM-as-a-Judge for automated scoring...
