# Installing Requirements

In [1]:
!pip install google-generativeai sentence-transformers faiss-cpu pypdf python-docx ipywidgets

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting pypdf
  Downloading pypdf-5.9.0-py3-none-any.whl.metadata (7.1 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvi

In [2]:
!pip install python-pptx python-docx PyMuPDF pandas numpy scikit-learn markdown langchain-text-splitters

Collecting python-pptx
  Downloading python_pptx-1.0.2-py3-none-any.whl.metadata (2.5 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting XlsxWriter>=0.5.7 (from python-pptx)
  Downloading xlsxwriter-3.2.5-py3-none-any.whl.metadata (2.7 kB)
Downloading python_pptx-1.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.8/472.8 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m86.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xlsxwriter-3.2.5-py3-none-any.whl (172 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.3/172.3 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: XlsxWriter, PyMuPDF, python-pptx
Successfully installed PyMuPDF-1.26.3 XlsxWriter-3.2.5 python-pptx-1

**Google AI Studio API Key**

In [3]:
import google.generativeai as genai
from google.colab import userdata

# Read the Gemini key from the Colab secret named GOOGLE_API_KEY and hand it
# to the google.generativeai client. Failures are reported, not raised.
try:
    gemini_api_key = userdata.get('GOOGLE_API_KEY')
    genai.configure(api_key=gemini_api_key)
except userdata.SecretNotFoundError as missing_secret:
    print(f"Secret not found: {missing_secret}")
except Exception as config_error:
    print(f"An error occurred: {config_error}")
else:
    print("Google Gemini API configured successfully!")

Google Gemini API configured successfully!


# Document Parser

In [4]:
import io
import pandas as pd
import pypdf
import docx
import pptx

class DocumentParser:
    """
    Converts the raw bytes of an uploaded document into plain text.

    Supports: PDF, DOCX, PPTX, TXT, MD, CSV. The format is chosen from the
    file name's extension, matched case-insensitively. Every ``_parse_*``
    helper takes the file's bytes and returns a string.
    """

    def _parse_pdf(self, file_content):
        """Extracts text from a PDF file's content, pages separated by newlines."""
        pdf_reader = pypdf.PdfReader(io.BytesIO(file_content))
        # `or ""` guards pages for which extract_text() yields nothing.
        # Joining with a newline keeps the last word of one page from fusing
        # with the first word of the next.
        return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)

    def _parse_docx(self, file_content):
        """Extracts text from a DOCX file's content, one paragraph per line."""
        doc = docx.Document(io.BytesIO(file_content))
        return "".join(para.text + "\n" for para in doc.paragraphs)

    def _parse_pptx(self, file_content):
        """Extracts text from a PPTX file's content, one shape per line."""
        prs = pptx.Presentation(io.BytesIO(file_content))
        text = ""
        for slide in prs.slides:
            for shape in slide.shapes:
                # Only shapes exposing a `text` attribute contribute.
                if hasattr(shape, "text"):
                    text += shape.text + "\n"
        return text

    def _parse_txt(self, file_content):
        """Decodes and returns text from a TXT/MD file's content.

        Decoding is UTF-8 with a leading byte-order mark stripped. Bytes that
        are not valid UTF-8 become U+FFFD instead of raising, so one oddly
        encoded file does not abort the whole ingestion run.
        """
        return file_content.decode('utf-8-sig', errors='replace')

    def _parse_csv(self, file_content):
        """Parses a CSV file's content into a string representation.

        Falls back to plain-text decoding when pandas cannot parse the bytes.
        """
        try:
            df = pd.read_csv(io.BytesIO(file_content))
            return df.to_string()
        except Exception:
            # Best effort: a malformed CSV is still useful as raw text.
            return self._parse_txt(file_content)

    def parse(self, file_content, file_name):
        """Dispatches to the parser matching the extension of ``file_name``.

        Args:
            file_content (bytes): Raw bytes of the uploaded file.
            file_name (str): Name of the file; only its extension is inspected.

        Returns:
            str | None: The extracted text, or None (after printing a notice)
            when the extension is not supported.
        """
        # Compare on a lower-cased copy so "Report.PDF" is treated like "report.pdf".
        normalized_name = file_name.lower()
        if normalized_name.endswith('.pdf'):
            return self._parse_pdf(file_content)
        elif normalized_name.endswith('.docx'):
            # DOCX is a zipped binary format and must not go through the text decoder.
            return self._parse_docx(file_content)
        elif normalized_name.endswith('.pptx'):
            return self._parse_pptx(file_content)
        elif normalized_name.endswith(('.txt', '.md')):
            return self._parse_txt(file_content)
        elif normalized_name.endswith('.csv'):
            return self._parse_csv(file_content)
        else:
            print(f"Unsupported file format: {file_name}")
            return None

# Ingestion Agent

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


class IngestionAgent:
    """Turns uploaded files into source-tagged text chunks.

    Each file is parsed with a DocumentParser and split by a
    RecursiveCharacterTextSplitter (chunk_size=1000, chunk_overlap=100).
    """

    def __init__(self):
        self.parser = DocumentParser()
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

    def ingest(self, mcp_message):
        """Handles an ingest request and returns an INGEST_RESPONSE message.

        Args:
            mcp_message (dict): Message whose ``payload["files"]`` maps file
                names to file contents and which carries a ``trace_id``.

        Returns:
            dict: Response message whose ``payload["chunks"]`` is a list of
            ``{"text": ..., "source": file_name}`` dictionaries.
        """
        collected = []
        for name, content in mcp_message["payload"]["files"].items():
            print(f"  -> Processing file: {name}")
            extracted = self.parser.parse(content, name)
            # Unsupported or empty documents contribute no chunks.
            if not extracted:
                continue
            for piece in self.text_splitter.split_text(extracted):
                collected.append({"text": piece, "source": name})

        response = {
            "sender": "IngestionAgent",
            "receiver": "WorkflowCoordinator",
            "type": "INGEST_RESPONSE",
            "trace_id": mcp_message["trace_id"],
            "payload": {"chunks": collected},
        }
        return response

# Retrieval Agent

In [6]:
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

class RetrievalAgent:
    """Embeds text chunks, indexes them with FAISS and answers similarity queries."""

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.embedding_model = SentenceTransformer(model_name)
        self.vector_store = None  # FAISS index; None until build_index() receives chunks
        self.chunk_data = []      # chunk dicts, positionally aligned with the index rows

    def build_index(self, chunks):
        """(Re)builds the vector index from ``chunks``.

        Args:
            chunks (list[dict]): Chunk dictionaries; each must have a ``text`` key.

        An empty ``chunks`` clears any previously built index, so later
        queries cannot be answered from documents of an earlier run.
        """
        if not chunks:
            self.vector_store = None
            self.chunk_data = []
            return
        self.chunk_data = chunks
        texts_to_embed = [chunk['text'] for chunk in self.chunk_data]
        embeddings = np.asarray(self.embedding_model.encode(texts_to_embed), dtype='float32')
        self.vector_store = faiss.IndexFlatL2(embeddings.shape[1])
        self.vector_store.add(embeddings)

    def retrieve(self, mcp_message, top_k=3):
        """Finds the chunks closest to the query and returns a RETRIEVAL_RESPONSE.

        Args:
            mcp_message (dict): Message carrying ``payload["query"]`` and ``trace_id``.
            top_k (int): Maximum number of chunks to return.

        Returns:
            dict: Always a well-formed response message. Its
            ``payload["retrieved_context"]`` is an empty list when no index
            has been built, so callers can subscript the result unconditionally.
        """
        query = mcp_message["payload"]["query"]
        retrieved_chunks = []

        if self.vector_store is not None:
            # Never ask for more neighbours than there are vectors: FAISS pads
            # the missing results with -1, which would index chunk_data[-1].
            k = min(top_k, self.vector_store.ntotal)
            if k > 0:
                query_embedding = np.asarray(self.embedding_model.encode([query]), dtype='float32')
                _, indices = self.vector_store.search(query_embedding, k)
                # Defensive filter against any remaining -1 placeholder labels.
                retrieved_chunks = [self.chunk_data[i] for i in indices[0] if i >= 0]

        return {
            "sender": "RetrievalAgent",
            "receiver": "WorkflowCoordinator",
            "type": "RETRIEVAL_RESPONSE",
            "trace_id": mcp_message["trace_id"],
            "payload": {
                "retrieved_context": retrieved_chunks,
                "query": query
            }
        }

# LLMResponse Agent

In [7]:
class LLMResponseAgent:
    """Asks a Gemini model to answer a query from retrieved, source-tagged context."""

    def __init__(self, model_name='gemini-2.5-flash-lite'):
        self.model = genai.GenerativeModel(model_name)
        # Filled in with str.format(context=..., question=...).
        self.prompt_template = """
You are a helpful and precise Q&A assistant. Your task is to answer the user's question based *only* on the provided context.

Do not use any information outside of the provided context paragraphs.

If the context does not contain the answer, you must state that you cannot find the answer in the provided documents.

For each piece of information you use, you must cite the source document. The source for each context paragraph is provided in the format [Source: file_name.ext].

Here is the context retrieved from the documents:
---
{context}
---

Here is the user's question:
---
{question}
---

Your final answer should be comprehensive and directly based on the text above.
"""

    def generate_response(self, mcp_message):
        """Builds the prompt, calls the model and returns a FINAL_RESPONSE message.

        Args:
            mcp_message (dict): Message with ``payload["query"]``,
                ``payload["context"]`` (list of ``{"source", "text"}`` dicts)
                and ``trace_id``.

        Returns:
            dict: Response message whose ``payload["final_answer"]`` is the
            model's text, a fixed apology when the context is empty, or an
            error description when the model call raises.
        """
        payload = mcp_message["payload"]
        query = payload["query"]
        context = payload["context"]

        if context:
            # One "[Source: ...]" header per chunk, chunks separated by blank lines.
            sections = []
            for chunk in context:
                sections.append(f"[Source: {chunk['source']}]\n{chunk['text']}")
            final_prompt = self.prompt_template.format(
                context="\n\n".join(sections),
                question=query,
            )
            try:
                final_answer = self.model.generate_content(final_prompt).text
            except Exception as model_error:
                final_answer = f"An error occurred: {model_error}"
        else:
            final_answer = "I'm sorry, but I couldn't find any relevant information..."

        reply = {
            "sender": "LLMResponseAgent",
            "receiver": "WorkflowCoordinator",
            "type": "FINAL_RESPONSE",
            "trace_id": mcp_message["trace_id"],
            "payload": {"final_answer": final_answer},
        }
        return reply

# Workflow Coordinator Using Model Context Protocol (MCP)

In [8]:
import uuid
import time

class WorkflowCoordinator:
    """
    Drives the ingestion -> indexing -> retrieval -> answer pipeline and
    reports each step's progress through a caller-supplied callback.
    """

    def __init__(self):
        self.ingestion_agent = IngestionAgent()
        self.retrieval_agent = RetrievalAgent()
        self.llm_agent = LLMResponseAgent()
        print("WorkflowCoordinator initialized with all agents.")

    @staticmethod
    def _request(receiver, message_type, trace_id, payload):
        """Builds an outgoing message addressed from the coordinator to ``receiver``."""
        return {
            "sender": "WorkflowCoordinator",
            "receiver": receiver,
            "type": message_type,
            "trace_id": trace_id,
            "payload": payload,
        }

    def run_workflow(self, files_dict, query, status_callback):
        """
        Runs the whole pipeline for one question over the given files.

        Args:
            files_dict (dict): Uploaded files, keyed by file name.
            query (str): The user's question.
            status_callback (function): Called as ``(step, status[, details])``
                to report the trace id (step 0) and the progress of steps 1-4.

        Returns:
            The ``final_answer`` taken from the LLM agent's response payload.
        """
        trace_id = f"rag-{uuid.uuid4()}"
        status_callback(0, "trace", trace_id)

        # Step 1: parse and chunk the uploaded files.
        status_callback(1, "in_progress")
        ingest_reply = self.ingestion_agent.ingest(
            self._request("IngestionAgent", "INGEST_REQUEST", trace_id, {"files": files_dict})
        )
        ingested_chunks = ingest_reply["payload"]["chunks"]
        status_callback(1, "complete", f"{len(ingested_chunks)} chunks created.")

        # Step 2: embed the chunks and build the vector index.
        status_callback(2, "in_progress")
        self.retrieval_agent.build_index(ingested_chunks)
        status_callback(2, "complete")

        # Step 3: look up the chunks most relevant to the query.
        status_callback(3, "in_progress")
        retrieve_reply = self.retrieval_agent.retrieve(
            self._request("RetrievalAgent", "RETRIEVAL_REQUEST", trace_id, {"query": query})
        )
        retrieved_context = retrieve_reply["payload"]["retrieved_context"]
        status_callback(3, "complete", f"{len(retrieved_context)} relevant chunks found.")

        # Step 4: let the LLM answer from the retrieved context.
        status_callback(4, "in_progress")
        llm_reply = self.llm_agent.generate_response(
            self._request(
                "LLMResponseAgent",
                "GENERATE_RESPONSE_REQUEST",
                trace_id,
                {"query": query, "context": retrieved_context},
            )
        )
        status_callback(4, "complete")

        return llm_reply["payload"]["final_answer"]

# UI

In [10]:
import ipywidgets as widgets
from IPython.display import display, clear_output

# Question/answer pairs collected during this kernel session, in the order asked.
conversation_history = []

# --- Shared styling and status icons ---
ICON_PENDING = "⚪"
ICON_IN_PROGRESS = "⏳"
ICON_COMPLETE = "✅"
style = dict(description_width='initial')

# --- Upload section ---
header = widgets.HTML("<h2> Agentic RAG Chatbot</h2>")
uploader = widgets.FileUpload(multiple=True, description='Upload File', button_style='primary', style=style)
upload_info = widgets.HTML(
    value="<p><i><small>Accepted formats: .pdf, .docx, .pptx, .txt, .csv, .md</small></i></p>"
)
upload_status = widgets.HTML(value="")

# --- Query row: text box plus the two action buttons ---
query_box = widgets.Text(
    placeholder='Ask a question...',
    description='Your Query:',
    style=style,
    layout=widgets.Layout(width='auto', flex='1 1 auto'),
)
submit_button = widgets.Button(description='Ask', button_style='success', icon='question')
history_button = widgets.Button(description="Previous Conversations", icon="history")
action_buttons = widgets.HBox([submit_button, history_button])

# --- Processing status panel: one line per pipeline step, hidden until a run starts ---
STEP_DESCRIPTIONS = {
    1: "IngestionAgent: Parsing Document",
    2: "RetrievalAgent: Building Index",
    3: "RetrievalAgent: Finding Chunks",
    4: "LLMResponseAgent: Generating Answer",
}
status_header = widgets.HTML("<h4>Processing Status:</h4>")
status_trace = widgets.HTML()
status_step1, status_step2, status_step3, status_step4 = (
    widgets.HTML(value=f"{ICON_PENDING} {STEP_DESCRIPTIONS[step_number]}")
    for step_number in (1, 2, 3, 4)
)
status_box = widgets.VBox(
    [status_header, status_trace, status_step1, status_step2, status_step3, status_step4],
    layout=widgets.Layout(margin='10px 0 0 0', display='none'),
)

# --- Answer output and conversation history (history hidden until requested) ---
output_area = widgets.Output(
    layout=dict(border='1px solid black', padding='10px', margin='10px 0 0 0')
)
history_accordion = widgets.Accordion(children=[], layout=widgets.Layout(display='none'))


# --- UI Event Handlers ---
def reset_status_icons():
    """Shows the status panel, hides the history and marks all four steps as pending."""
    status_box.layout.display = 'flex'
    history_accordion.layout.display = 'none'
    status_trace.value = ""
    # Build every label before assigning so the four widgets change together.
    pending_labels = [f"{ICON_PENDING} {STEP_DESCRIPTIONS[number]}" for number in (1, 2, 3, 4)]
    step_widgets = (status_step1, status_step2, status_step3, status_step4)
    for step_widget, label in zip(step_widgets, pending_labels):
        step_widget.value = label

def on_upload_change(change):
    """Shows the names of the currently uploaded files below the upload button.

    Args:
        change (dict): Traitlets change notification; ``change['new']`` is the
            uploader's new value. Both value shapes are accepted: the
            ipywidgets 7 dict keyed by file name and the ipywidgets 8 tuple
            of per-file dicts carrying a ``name`` entry.
    """
    from html import escape  # local import keeps this cell self-contained

    uploaded = change['new']
    if uploaded:
        if isinstance(uploaded, dict):
            names = list(uploaded)
        else:
            names = [item['name'] for item in uploaded]
        # File names are user-controlled text going into an HTML widget.
        filenames = ", ".join(f"'{escape(str(name))}'" for name in names)
        upload_status.value = f"<p><b>File ready:</b> {filenames}</p>"
    else:
        upload_status.value = ""
uploader.observe(on_upload_change, names='value')

def update_status_ui(step, status, details=""):
    """Status callback for the workflow: updates the trace line or one step line.

    Args:
        step (int): Step number (1-4); ignored when ``status`` is "trace".
        status (str): "trace" sets the trace id line; "in_progress" shows the
            hourglass icon; any other value shows the completed icon.
        details (str): Trace id, or optional text appended in italics.
    """
    if status == "trace":
        status_trace.value = f"<b>Trace ID:</b> {details}"
        return

    widget = {1: status_step1, 2: status_step2, 3: status_step3, 4: status_step4}.get(step)
    base_text = STEP_DESCRIPTIONS.get(step)
    # Unknown step numbers are silently ignored.
    if not (widget and base_text):
        return

    if status == "in_progress":
        icon = ICON_IN_PROGRESS
    else:
        icon = ICON_COMPLETE
    suffix = f'<i>({details})</i>' if details else ''
    widget.value = f"{icon} {base_text} {suffix}"

def on_history_button_clicked(b):
    """Populates and displays the conversation history accordion, newest first.

    Args:
        b: The clicked button (unused; required by the on_click signature).
    """
    from html import escape  # local import keeps this cell self-contained

    with output_area:
        clear_output()
        status_box.layout.display = 'none'
        if not conversation_history:
            history_accordion.children = [widgets.HTML("No previous conversations yet.")]
        else:
            total = len(conversation_history)
            # Queries and model answers are free text; escape them so characters
            # such as "<" or "&" are shown literally instead of parsed as markup.
            history_accordion.children = [
                widgets.HTML(
                    f"<b>Question:</b><p>{escape(str(conv['query']))}</p><hr>"
                    f"<b>Answer:</b><p>{escape(str(conv['answer']))}</p>"
                )
                for conv in reversed(conversation_history)
            ]
            # Position 0 holds the most recent conversation, so numbering counts down.
            for position in range(total):
                history_accordion.set_title(position, f"Conversation #{total - position}")
        history_accordion.layout.display = 'flex'

def _uploaded_files_as_dict(upload_value):
    """Normalises a FileUpload value to ``{file_name: content_bytes}``.

    Accepts the ipywidgets 7 shape (dict keyed by file name, each entry holding
    ``content``) and the ipywidgets 8 shape (tuple of per-file dicts with
    ``name`` and a memoryview ``content``).
    """
    if isinstance(upload_value, dict):
        return {name: info['content'] for name, info in upload_value.items()}
    # bytes() copies the memoryview so the parsers can call .decode() on it.
    return {item['name']: bytes(item['content']) for item in upload_value}


def on_button_clicked(b):
    """Main function to run the workflow.

    Validates the inputs, runs a fresh WorkflowCoordinator over the uploaded
    files, records the question/answer pair in ``conversation_history`` and
    prints the answer (or the error) in the output area.

    Args:
        b: The clicked button (unused; required by the on_click signature).
    """
    with output_area:
        clear_output()
        reset_status_icons()

        # A query made only of whitespace is treated as missing.
        user_query = query_box.value.strip()
        if not uploader.value or not user_query:
            output_area.append_stdout("Error: Please upload documents and enter a question.\n")
            return

        files_dict = _uploaded_files_as_dict(uploader.value)

        try:
            coordinator = WorkflowCoordinator()
            final_answer = coordinator.run_workflow(files_dict, user_query, status_callback=update_status_ui)

            conversation_history.append({'query': user_query, 'answer': final_answer})

            clear_output()
            output_area.append_stdout("\n--- Answer ---\n")
            output_area.append_stdout(final_answer)
        except Exception as e:
            # Show the failure in the output area rather than a raw traceback.
            clear_output()
            output_area.append_stdout(f"An unexpected error occurred: {e}\n")

# Connect each button to its click handler.
history_button.on_click(on_history_button_clicked)
submit_button.on_click(on_button_clicked)

# Stack the sections top to bottom: upload, query row, status, answer, history.
ui = widgets.VBox(
    children=(
        header,
        uploader,
        upload_info,
        upload_status,
        widgets.HTML("<hr>"),
        widgets.HBox([query_box, action_buttons]),
        status_box,
        output_area,
        history_accordion,
    )
)

display(ui)

VBox(children=(HTML(value='<h2> Agentic RAG Chatbot</h2>'), FileUpload(value={}, button_style='primary', descr…