In [None]:
# ==============================================================================
# Part 1: Installations
# ==============================================================================
print("⏳ Installing all dependencies... This may take several minutes.")
!pip install --upgrade gradio gradio_client websockets -q
!pip install llama-index==0.10.34 pypdf==4.2.0 pymupdf==1.24.1 -q
!pip install llama-cpp-python==0.2.73 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu122 -q
!pip install llama-index-llms-llama-cpp==0.1.3 llama-index-embeddings-huggingface==0.2.0 -q
print("✅ Installations complete.")


# ==============================================================================
# Part 2: Setup and Model Loading
# ==============================================================================
import gradio as gr
import os
import json
import time
import fitz  # PyMuPDF
from pypdf import PdfReader
from llama_index.core import Document, VectorStoreIndex, Settings
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from typing import List, Dict
from PIL import Image

# Download and load the open-source models
model_path = "/content/mistral-7b-instruct-v0.2.Q4_K_M.gguf"
if not os.path.exists(model_path):
    print("⏳ Downloading Mistral 7B model... (approx. 4.1 GB)")
    !wget https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf -O {model_path}

print("⏳ Loading LLM and Embedding Model...")
Settings.llm = LlamaCPP(
    model_path=model_path, temperature=0.1, max_new_tokens=512,
    context_window=3900, model_kwargs={"n_gpu_layers": 35}, verbose=False
)
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
print("✅ Models loaded successfully.")


# ==============================================================================
# Part 3: Backend Pipeline Logic
# ==============================================================================
class DocumentIntelligencePipeline:
    def __init__(self):
        self.index = None; self.file_name = ""
        self.doc_types_in_file = []; self.known_doc_types = ["Lender Fee Sheet", "Payslip", "Contract", "Other"]

    def segment_and_index_document(self, pdf_path: str):
        start_time = time.time()
        self.file_name = os.path.basename(pdf_path)
        reader = PdfReader(pdf_path)
        pages = [{"text": page.extract_text()} for page in reader.pages]
        if not pages: raise ValueError("Could not extract pages.")

        documents_to_index = []
        doc_types_found = set()
        for i, page in enumerate(pages):
            doc_type = "Document" # Simplified segmentation for robustness
            doc_types_found.add(doc_type)
            metadata = {"page_number": i + 1, "doc_type": doc_type, "file_name": self.file_name}
            doc = Document(text=page['text'], metadata=metadata)
            documents_to_index.append(doc)

        self.index = VectorStoreIndex.from_documents(documents_to_index)

        processing_time = time.time() - start_time
        self.doc_types_in_file = list(doc_types_found)
        stats = {
            "File": self.file_name, "Pages": len(pages), "Chunks": len(documents_to_index),
            "Types": ", ".join(self.doc_types_in_file), "Time": f"{processing_time:.2f}s"
        }
        return stats

    def query(self, user_query: str, filter_type: str, auto_route: bool, top_k: int):
        if not self.index: return {"answer": "Please process a document first.", "sources": ""}
        retriever_args = {"similarity_top_k": int(top_k)}
        doc_type_to_search = filter_type
        if auto_route and filter_type == "All":
            prompt = f"[INST] You are a query router. Classify the user's query into one of these categories: {self.doc_types_in_file}. Query: '{user_query}' [/INST]"
            predicted = Settings.llm.complete(prompt).text.strip()
            print(f"🎯 Auto-routing to: {predicted}")
            if predicted in self.doc_types_in_file: doc_type_to_search = predicted
        if doc_type_to_search != "All":
             retriever_args["filters"] = MetadataFilters(filters=[ExactMatchFilter(key="doc_type", value=doc_type_to_search)])

        retriever = self.index.as_retriever(**retriever_args)
        nodes = retriever.retrieve(user_query)
        context = "\n\n".join([n.get_text() for n in nodes])
        prompt = f"[INST] Use context to answer. If unsure, say so.\nContext:\n{context}\n\nQuestion: {user_query} [/INST]"
        response = Settings.llm.complete(prompt)
        sources = "--- SOURCES ---\n" + "\n".join([f"**Source {i+1} (Page: {n.metadata['page_number']})**:\n```{n.get_text().strip()}```" for i, n in enumerate(nodes)])

⏳ Installing all dependencies... This may take several minutes.
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.8/30.8 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25h✅ Installations complete.


  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /usr/local/lib/python3.12/dist-
[nltk_data]     packages/llama_index/core/_static/nltk_cache...
[nltk_data]   Package punkt_tab is already up-to-date!


⏳ Loading LLM and Embedding Model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


✅ Models loaded successfully.


In [None]:
# ==============================================================================
# Step 1: Installations
# ==============================================================================
print("⏳ Installing all dependencies... This may take several minutes.")

# Install Gradio and the custom PDF component for the viewer
!pip install --upgrade gradio gradio_client websockets -q
!pip install gradio_pdf -q

# Install libraries for the backend AI pipeline
!pip install llama-index==0.10.34 pypdf==4.2.0 pymupdf==1.24.1 -q
!pip install llama-cpp-python==0.2.73 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu122 -q
!pip install llama-index-llms-llama-cpp==0.1.3 llama-index-embeddings-huggingface==0.2.0 -q
print("✅ Installations complete. Please go to 'Runtime > Restart session' before proceeding to Step 2.")
# ==============================================================================
# Step 2: Setup and Model Loading
# ==============================================================================
import gradio as gr
import os
import json
import time
import fitz  # PyMuPDF for PDF preview
from pypdf import PdfReader
from llama_index.core import Document, VectorStoreIndex, Settings
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from typing import List, Dict
from PIL import Image

# Download and load the open-source models
model_path = "/content/mistral-7b-instruct-v0.2.Q4_K_M.gguf"
if not os.path.exists(model_path):
    print("⏳ Downloading Mistral 7B model... (approx. 4.1 GB)")
    !wget https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf -O {model_path}

print("⏳ Loading LLM and Embedding Model...")
Settings.llm = LlamaCPP(
    model_path=model_path, temperature=0.1, max_new_tokens=512,
    context_window=3900, model_kwargs={"n_gpu_layers": 35}, verbose=False
)
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
print("✅ Models loaded successfully.")

# ==============================================================================
# Step 3: Define the Backend Pipeline Logic
# ==============================================================================
class DocumentIntelligencePipeline:
    def __init__(self):
        self.index = None; self.file_name = ""
        self.doc_types_in_file = []; self.known_doc_types = ["Lender Fee Sheet", "Payslip", "Contract", "Other"]

    def process_and_index_document(self, pdf_path: str):
        start_time = time.time()
        self.file_name = os.path.basename(pdf_path)
        reader = PdfReader(pdf_path)
        pages = [{"text": page.extract_text()} for page in reader.pages]
        if not pages: raise ValueError("Could not extract pages from PDF.")

        documents_to_index = []
        doc_types_found = set()
        for i, page in enumerate(pages):
            # A more advanced pipeline would use an LLM to classify each doc type
            doc_type = "Document"
            doc_types_found.add(doc_type)
            metadata = {"page_number": i + 1, "doc_type": doc_type, "file_name": self.file_name}
            doc = Document(text=page['text'], metadata=metadata)
            documents_to_index.append(doc)

        self.index = VectorStoreIndex.from_documents(documents_to_index)

        processing_time = time.time() - start_time
        self.doc_types_in_file = list(doc_types_found)
        stats = {
            "File": self.file_name, "Pages": len(pages), "Chunks": len(documents_to_index),
            "Types": ", ".join(self.doc_types_in_file), "Time": f"{processing_time:.2f}s"
        }
        return stats

    def query(self, user_query: str, filter_type: str, auto_route: bool, top_k: int):
        if not self.index: return {"answer": "Please process a document first.", "sources": ""}

        retriever_args = {"similarity_top_k": int(top_k)}
        doc_type_to_search = filter_type

        if auto_route and filter_type == "All":
            prompt = f"[INST] You are a query router. Classify the user's query into one of these categories: {self.doc_types_in_file}. Query: '{user_query}' [/INST]"
            predicted = Settings.llm.complete(prompt).text.strip()
            print(f"🎯 Auto-routing to: {predicted}")
            if predicted in self.doc_types_in_file: doc_type_to_search = predicted

        if doc_type_to_search != "All":
             retriever_args["filters"] = MetadataFilters(filters=[ExactMatchFilter(key="doc_type", value=doc_type_to_search)])

        retriever = self.index.as_retriever(**retriever_args)
        nodes = retriever.retrieve(user_query)
        context = "\n\n".join([n.get_text() for n in nodes])
        prompt = f"[INST] Use the provided context to answer the question. If the answer is not in the context, say so.\n\nContext:\n{context}\n\nQuestion: {user_query} [/INST]"
        response = Settings.llm.complete(prompt)
        sources = "--- SOURCES ---\n" + "\n".join([f"**Source {i+1} (Page: {n.metadata['page_number']})**:\n```{n.get_text().strip()}```" for i, n in enumerate(nodes)])

# ==============================================================================
# Step 4: Build the Gradio UI and Event Handlers
# ==============================================================================
def process_pdf_and_update_ui(file, progress=gr.Progress()):
    if file is None:
        return None, "⚠️ Please upload a PDF file.", [], gr.update(interactive=False), gr.update(choices=["All"], value="All")

    # Generate a preview of the first page
    try:
        doc = fitz.open(file.name)
        page = doc[0]
        pix = page.get_pixmap(dpi=150)
        preview_image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        doc.close()
    except Exception as e:
        print(f"Error generating preview: {e}")
        preview_image = None

    progress(0, desc="Starting...")
    try:
        progress(0.1, desc="📄 Processing and indexing PDF...")
        stats = pipeline.process_and_index_document(file.name)
        info_text = "\n".join([f"- **{key}:** {value}" for key, value in stats.items()])
        info_text = f"✅ **Successfully Processed:**\n{info_text}"
        progress(1.0, desc="✅ Ready.")
        return preview_image, info_text, [], gr.update(interactive=True), gr.update(choices=["All"] + pipeline.doc_types_in_file, value="All")
    except Exception as e:
        return None, f"❌ Error: {e}", None, gr.update(interactive=False), gr.update(choices=["All"], value="All")

def chat_handler(message, history, filter_type, auto_route, top_k):
    if not pipeline.index:
        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": "Please process a document first."})
        yield history, "", ""
        return

    history.append({"role": "user", "content": message})
    history.append({"role": "assistant", "content": "Thinking..."})
    yield history, "", ""

    result = pipeline.query(message, filter_type, auto_route, top_k)
    history[-1]['content'] = result["answer"]
    yield history, result["sources"], ""

with gr.Blocks(theme=gr.themes.Soft(primary_hue="sky"), title="Enhanced Document Q&A") as app:
    gr.Markdown("# 🚀 Enhanced Document Q&A System")
    gr.Markdown("An end-to-end pipeline for intelligent document analysis using an open-source LLM.")

    with gr.Row(equal_height=False):
        # Column 1: File Uploader & Preview
        with gr.Column(scale=4, min_width=400):
            # Using gr.File for upload, and gr.Image for preview
            file_uploader = gr.File(label="📄 Upload PDF File", file_types=[".pdf"])
            process_btn = gr.Button("⚙️ Process Document", variant="primary")
            pdf_preview = gr.Image(label="PDF Preview (First Page)", height=650)

        # Column 2: Info & Settings Panel
        with gr.Column(scale=2, min_width=300):
            gr.Markdown("### 📊 Document Info")
            info_output = gr.Markdown(value="Please upload a PDF file to begin.")

            gr.Markdown("### ⚙️ Settings")
            doc_filter = gr.Dropdown(choices=["All"], value="All", label="🏷️ Document Type Filter")
            auto_route = gr.Checkbox(value=True, label="🎯 Auto-Route Queries")
            chunks_to_retrieve = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="📊 Chunks to Retrieve")

        # Column 3: Chat Panel
        with gr.Column(scale=4, min_width=400):
            gr.Markdown("### 💬 Ask Questions")
            chatbot = gr.Chatbot(label="Chat", height=500, type="messages", avatar_images=(None, "https://i.imgur.com/C7MMFf1.png"))
            source_display = gr.Markdown(label="Sources Used for Answer")
            msg_box = gr.Textbox(label="Your Question", placeholder="Ask a question...", interactive=False, container=False)

    # Event Handlers
    process_btn.click(
        process_pdf_and_update_ui,
        inputs=[file_uploader],
        outputs=[pdf_preview, info_output, chatbot, msg_box, doc_filter]
    )

    msg_box.submit(
        chat_handler,
        inputs=[msg_box, chatbot, doc_filter, auto_route, chunks_to_retrieve],
        outputs=[chatbot, source_display, msg_box]
    )

### **Step 5: Launch the Application**

# ==============================================================================
# Step 5: Launch the Application
# ==============================================================================
test_file_path = "/content/Test Blob File.pdf"
if not os.path.exists(test_file_path):
    print("⏳ Downloading test file 'Test Blob File.pdf'...")
    !wget -q https://storage.googleapis.com/generativeai-downloads/data/Test%20Blob%20File.pdf -O "/content/Test Blob File.pdf"
print("\n🚀 Launching Gradio App... Please upload the 'Test Blob File.pdf' in the UI to begin.")
app.launch(debug=True, share=True)

⏳ Installing all dependencies... This may take several minutes.
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.4/325.4 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.8/30.8 MB[0m [31m56.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m63.9 MB/s[0m eta [

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# ==============================================================================
# Step 1: Install All Dependencies
# ==============================================================================
# After this cell finishes, please restart the Colab runtime before proceeding.
print("⏳ Installing dependencies... This may take several minutes.")
!pip install gradio==4.31.5 llama-index==0.10.34 pypdf==4.2.0 -q
!pip install llama-cpp-python==0.2.73 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu122 -q
!pip install llama-index-llms-llama-cpp==0.1.3 llama-index-embeddings-huggingface==0.2.0 -q
print("✅ Installations complete. Please go to Runtime > Restart session.")

⏳ Installing dependencies... This may take several minutes.
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
yfinance 0.2.65 requires websockets>=13.0, but you have websockets 11.0.3 which is incompatible.
google-adk 1.13.0 requires websockets<16.0.0,>=15.0.1, but you have websockets 11.0.3 which is incompatible.
dataproc-spark-connect 0.8.3 requires websockets>=14.0, but you have websockets 11.0.3 which is incompatible.
google-genai 1.34.0 requires websockets<15.1.0,>=13.0.0, but you have websockets 11.0.3 which is incompatible.[0m[31m
[0mTraceback (most recent call last):
  File "/usr/local/bin/pip3", line 4, in <module>
    from pip._internal.cli.main import main
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/main.py", line 11, in <module>
    from pip._internal.cli.autocompletion import autocomplete
  File "/usr/local/lib/python3.1

In [None]:
# ==============================================================================
# Step 2: Setup, Download, and Load Models
# ==============================================================================
import gradio as gr
import os
import json
from pypdf import PdfReader
from llama_index.core import Document, VectorStoreIndex, Settings
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from typing import List, Dict

# Download the open-source model (Mistral 7B) if it doesn't exist
model_path = "/content/mistral-7b-instruct-v0.2.Q4_K_M.gguf"
if not os.path.exists(model_path):
    print("⏳ Downloading Mistral 7B model... (approx. 4.1 GB)")
    !wget https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf -O {model_path}
    print("✅ Model download complete.")

# Initialize the LLM and Embedding Model
try:
    print("⏳ Loading LLM and Embedding Model...")
    Settings.llm = LlamaCPP(
        model_path=model_path,
        temperature=0.1,
        max_new_tokens=512,
        context_window=3900,
        model_kwargs={"n_gpu_layers": 35}, # Offload layers to GPU
        verbose=False
    )
    Settings.embed_model = HuggingFaceEmbedding(
        model_name="BAAI/bge-small-en-v1.5"
    )
    print("✅ LLM and Embedding Model loaded successfully.")
except Exception as e:
    print(f"❌ Error loading models: {e}")

In [None]:
# ==============================================================================
# Step 3: Define the Backend Pipeline Logic
# ==============================================================================
class DocumentIntelligencePipeline:
    """Encapsulates the entire document processing and RAG pipeline."""

    def __init__(self):
        self.index = None
        self.doc_metadata = []
        self.doc_types = ["Lender Fee Sheet", "Payslip", "Contract", "Other"]

    def _classify_and_segment_page(self, prev_text, curr_text, prev_doc_type):
        """Uses LLM to get both boundary detection and classification in one call."""
        prompt = f"""
        [INST] You are a document analysis expert. Your task is to analyze two consecutive pages from a PDF and determine if the current page starts a new document.
        The previous page was part of a '{prev_doc_type}' document.

        Here is the text from the end of the previous page:
        ---
        {prev_text[-500:]}
        ---

        Here is the text from the start of the current page:
        ---
        {curr_text[:500]}
        ---

        Analyze the content. Provide a JSON object with two keys:
        1. "is_new_doc": A string, either "Yes" or "No".
        2. "doc_type": If "Yes", classify the new document from this list: {self.doc_types}. If "No", use the previous type '{prev_doc_type}'.

        Your response must be only the JSON object. [/INST]
        """
        response = Settings.llm.complete(prompt)
        try:
            json_str = response.text.strip().split('```json')[-1].split('```')[0].strip()
            return json.loads(json_str)
        except (json.JSONDecodeError, IndexError):
            is_new = "yes" in response.text.lower()
            return {"is_new_doc": "Yes" if is_new else "No", "doc_type": prev_doc_type}

    def segment_and_index_document(self, pdf_path: str):
        """Loads a PDF, segments it, and creates a metadata-aware vector index."""
        print("⏳ Starting document segmentation...")
        self.doc_metadata = [] # Reset for new file
        reader = PdfReader(pdf_path)
        pages = [{"page_num": i, "text": page.extract_text()} for i, page in enumerate(reader.pages)]

        if not pages:
            raise ValueError("Could not extract any pages from the PDF.")

        # Process first page separately
        first_page_type_prompt = f"[INST] Classify this document. Choose from: {self.doc_types}. Text: {pages[0]['text'][:1000]} [/INST]"
        current_doc_type = Settings.llm.complete(first_page_type_prompt).text.strip()

        doc_counter = 0
        documents_to_index = []

        # Loop through all pages
        for i, page in enumerate(pages):
            is_new_doc_flag = "No"
            if i > 0:
                analysis = self._classify_and_segment_page(pages[i-1]['text'], page['text'], current_doc_type)
                if analysis['is_new_doc'] == "Yes":
                    doc_counter += 1
                    current_doc_type = analysis['doc_type']
                    is_new_doc_flag = "Yes"

            # Store metadata for UI display
            self.doc_metadata.append({
                "Page": i + 1, "Is New Doc?": "Yes" if i == 0 else is_new_doc_flag,
                "Document Type": current_doc_type, "Doc ID": doc_counter
            })

            # Create a LlamaIndex Document with rich metadata
            doc = Document(text=page['text'], metadata={"page_number": i + 1, "doc_id": doc_counter, "doc_type": current_doc_type})
            documents_to_index.append(doc)

        print(f"✅ Segmentation complete. Found {doc_counter + 1} logical documents.")

        print("⏳ Creating vector index...")
        self.index = VectorStoreIndex.from_documents(documents_to_index)
        print("✅ Vector index created successfully.")
        return self.doc_metadata

    def query(self, user_query: str) -> Dict:
        """Routes a query and returns the answer and sources."""
        if not self.index:
            return {"answer": "Please process a document first.", "sources": ""}

        prompt = f"[INST] Classify this query: '{user_query}'. Choose from: {self.doc_types}. [/INST]"
        predicted_doc_type = Settings.llm.complete(prompt).text.strip()
        print(f"🎯 Query routed to: {predicted_doc_type}")

        filters = MetadataFilters(filters=[ExactMatchFilter(key="doc_type", value=predicted_doc_type)])
        retriever = self.index.as_retriever(similarity_top_k=3, filters=filters)
        response_nodes = retriever.retrieve(user_query)

        context = "\n\n".join([node.get_text() for node in response_nodes])
        synthesis_prompt = f"[INST] You are an expert Q&A assistant. Use the provided context to answer the user's question. If the answer is not in the context, state that.\n\nContext:\n---\n{context}\n---\nQuestion: {user_query} [/INST]"
        response = Settings.llm.complete(synthesis_prompt)

        sources = ""
        if response_nodes:
            sources = "--- SOURCES ---\n"
            for i, node in enumerate(response_nodes):
                sources += f"**Source {i+1} (Page: {node.metadata['page_number']}, Score: {node.score:.2f})**:\n```{node.get_text()[:250].strip()}...```\n\n"

        return {"answer": response.text.strip(), "sources": sources}

In [None]:
# ==============================================================================
# Step 4: Build the Gradio User Interface (Corrected)
# ==============================================================================

# Instantiate the pipeline
pipeline = DocumentIntelligencePipeline()

# Define the handler functions that the UI will call
def process_document_and_update_ui(file, progress=gr.Progress()):
    if file is None:
        # Return updates for all outputs, including making the message box non-interactive
        return "Please upload a file.", None, gr.update(interactive=False)

    # Use a try-except block for robust error handling
    try:
        progress(0, desc="Starting...")
        progress(0.2, desc="Segmenting document and building index...")
        segmentation_results = pipeline.segment_and_index_document(file.name)
        progress(1.0, desc="✅ Document processed. Ready for questions.")

        # Return a success message, the dataframe, and make the message box interactive
        return "✅ Document processed. Ready for questions.", segmentation_results, gr.update(interactive=True)
    except Exception as e:
        error_message = f"❌ Error processing file: {e}"
        return error_message, None, gr.update(interactive=False)

def chat_handler(message, history):
    # Append the user's message to the history immediately
    history.append([message, None])
    yield history, "", ""  # Update chatbot, clear sources, clear textbox

    # Get the response and sources from the pipeline
    result = pipeline.query(message)

    # Stream the bot's response into the history
    bot_response = ""
    for char in result["answer"]:
        bot_response += char
        history[-1][1] = bot_response
        yield history, result["sources"], ""

# Build the Gradio App
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), title="Document Intelligence System") as app:
    gr.Markdown("# 🤖 End-to-End Document Intelligence System")
    gr.Markdown("Upload a multi-document PDF, see it segmented, and ask questions.")

    with gr.Row():
        with gr.Column(scale=2):
            file_uploader = gr.File(label="Upload your PDF Blob File", file_types=[".pdf"])
            process_btn = gr.Button("⚙️ Process Document", variant="primary")
            gr.Markdown("### Segmentation Results")
            # CORRECTED: Removed 'interactive=False' to fix the TypeError
            segmentation_display = gr.DataFrame(headers=["Page", "Is New Doc?", "Document Type", "Doc ID"])

        with gr.Column(scale=3):
            chatbot = gr.Chatbot(label="Chat with your documents", height=550)
            # CORRECTED: Removed 'interactive=False' to fix the TypeError
            msg_box = gr.Textbox(label="Your Question", placeholder="e.g., What is the net pay on the payslip?", scale=7)
            source_display = gr.Markdown(label="Sources Used for Answer")

    # Wire up the UI components to the backend logic
    process_btn.click(
        process_document_and_update_ui,
        inputs=[file_uploader],
        outputs=[chatbot, segmentation_display, msg_box]
    )

    msg_box.submit(
        chat_handler,
        inputs=[msg_box, chatbot],
        outputs=[chatbot, source_display, msg_box]
    )

In [None]:
# ==============================================================================
# Step 5: Launch the App
# ==============================================================================

# Download the test file if it doesn't exist, so the user has something to upload.
test_file_path = "/content/drive/MyDrive/OUTAMATION/Test Blob File.pdf"
if not os.path.exists(test_file_path):
    print("⏳ Downloading test file 'Test Blob File.pdf'...")
    !wget -q https://storage.googleapis.com/generativeai-downloads/data/Test%20Blob%20File.pdf -O "/content/Test Blob File.pdf"
    print("✅ Test file downloaded. Please upload it in the Gradio UI below.")

print("\n🚀 Launching Gradio App...")
app.launch(debug=True, share=True)

In [None]:
# ==============================================================================
# FINAL CORRECTED CODE - Upgrade All Dependencies and Run Minimal Test
# ==============================================================================

# 1. Upgrade Gradio and ALL related dependencies
# ==============================================================================
print("⏳ Upgrading Gradio and all dependencies...")
# The key fix is upgrading gradio, gradio_client, AND websockets together.
!pip install --upgrade gradio gradio_client websockets
!pip install pypdf pandas -q
print("✅ Installations complete.")

# 2. Imports
# ==============================================================================
import gradio as gr
import pandas as pd
import time
# ==============================================================================
# Step 3: Dummy Backend Functions (Corrected for 'messages' format)
# ==============================================================================
def dummy_process_pdf(file, progress=gr.Progress()):
    """A placeholder function that simulates processing a PDF."""
    if file is None:
        return "Please upload a file.", None, gr.update(interactive=False)
    progress(0, desc="Starting...")
    time.sleep(1)
    progress(0.5, desc="Analyzing document...")
    time.sleep(1)
    fake_data = pd.DataFrame({
        "Page": [1, 2, 3], "Is New Doc?": ["Yes", "No", "Yes"],
        "Document Type": ["Contract", "Contract", "Payslip"], "Doc ID": [0, 0, 1]
    })
    progress(1.0, desc="✅ Ready.")
    return [], fake_data, gr.update(interactive=True) # Return empty list for chatbot

def dummy_chat_handler(message, history):
    """
    A placeholder function that echoes the user's message using the 'messages' format.
    History is now a list of dictionaries: [{"role": "user", "content": "..."}]
    """
    # Append the user's message to the history
    history.append({"role": "user", "content": message})
    # Append a placeholder for the bot's response
    history.append({"role": "assistant", "content": ""})

    response = f"This is a test response to your message: '{message}'"

    # Stream the bot's response into the last message
    bot_message = ""
    for char in response:
        bot_message += char
        history[-1]["content"] = bot_message
        time.sleep(0.05)
        yield history, f"Source for '{message}' would appear here.", ""

# ==============================================================================
# Step 4: Gradio Interface (Corrected for 'messages' format)
# ==============================================================================
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), title="Minimal UI Test") as app:
    gr.Markdown("# 🤖 Minimal UI Test")
    gr.Markdown("This is a test to see if the Gradio interface launches without errors after upgrading.")

    with gr.Row():
        with gr.Column(scale=2):
            file_uploader = gr.File(label="Upload your PDF Blob File", file_types=[".pdf"])
            process_btn = gr.Button("⚙️ Process Document", variant="primary")
            gr.Markdown("### Segmentation Results")
            segmentation_display = gr.DataFrame()

        with gr.Column(scale=3):
            # CORRECTED: Added type='messages'
            chatbot = gr.Chatbot(label="Chat with your documents", height=550, type='messages')
            msg_box = gr.Textbox(label="Your Question", placeholder="Ask a question...", scale=7)
            source_display = gr.Markdown(label="Sources Used for Answer")

    # Wire up the UI components to the DUMMY backend logic
    process_btn.click(
        dummy_process_pdf,
        inputs=[file_uploader],
        outputs=[chatbot, segmentation_display, msg_box]
    )

    msg_box.submit(
        dummy_chat_handler,
        inputs=[msg_box, chatbot],
        outputs=[chatbot, source_display, msg_box]
    )
# 5. Launch the App
# ==============================================================================
print("\n🚀 Launching Minimal Gradio App Test...")
app.launch(debug=True, share=True)

In [None]:
# ==============================================================================
# Part 1: Installations
# ==============================================================================
print("⏳ Installing all dependencies... This may take several minutes.")
# Install Gradio and the custom PDF component
!pip install --upgrade gradio gradio_client websockets -q
# Install all necessary backend libraries
!pip install llama-index==0.10.34 pypdf==4.2.0 -q
!pip install llama-cpp-python==0.2.73 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu122 -q
!pip install llama-index-llms-llama-cpp==0.1.3 llama-index-embeddings-huggingface==0.2.0 -q
print("✅ Installations complete.")


# ==============================================================================
# Part 2: Setup and Model Loading
# ==============================================================================
import gradio as gr
import os
import json
import time
from pypdf import PdfReader
from llama_index.core import Document, VectorStoreIndex, Settings
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from typing import List, Dict

# Download and load the open-source models
model_path = "/content/mistral-7b-instruct-v0.2.Q4_K_M.gguf"
if not os.path.exists(model_path):
    print("⏳ Downloading Mistral 7B model... (approx. 4.1 GB)")
    !wget https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf -O {model_path}

print("⏳ Loading LLM and Embedding Model...")
Settings.llm = LlamaCPP(
    model_path=model_path, temperature=0.1, max_new_tokens=512,
    context_window=3900, model_kwargs={"n_gpu_layers": 35}, verbose=False
)
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
print("✅ Models loaded successfully.")


# ==============================================================================
# Part 3: Backend Pipeline Logic
# ==============================================================================
class DocumentIntelligencePipeline:
    """Encapsulates the entire document processing and RAG pipeline."""

    def __init__(self):
        self.index = None
        self.doc_metadata = []
        self.doc_types_in_file = []
        self.known_doc_types = ["Lender Fee Sheet", "Payslip", "Contract", "Other"]
        self.file_name = ""

    def segment_and_index_document(self, pdf_path: str):
        """Loads a PDF, segments it, and creates a metadata-aware vector index."""
        start_time = time.time()
        print("⏳ Starting document segmentation...")
        self.file_name = os.path.basename(pdf_path)
        self.doc_metadata = []
        reader = PdfReader(pdf_path)
        pages = [{"text": page.extract_text()} for page in reader.pages]
        if not pages: raise ValueError("Could not extract pages from PDF.")

        # Simplified segmentation for this demo's robustness
        documents_to_index = []
        doc_types_found = set()
        for i, page in enumerate(pages):
            doc_type = "Document"  # A real implementation would use the LLM classifier here
            doc_types_found.add(doc_type)
            metadata = {"page_number": i + 1, "doc_type": doc_type, "file_name": self.file_name}
            doc = Document(text=page['text'], metadata=metadata)
            documents_to_index.append(doc)

        print("⏳ Creating vector index...")
        self.index = VectorStoreIndex.from_documents(documents_to_index)

        processing_time = time.time() - start_time
        self.doc_types_in_file = list(doc_types_found)

        # Create summary stats
        stats = {
            "File": self.file_name,
            "Pages": len(pages),
            "Chunks Created": len(documents_to_index),
            "Documents Found": len(doc_types_found),
            "Types": ", ".join(self.doc_types_in_file),
            "Time": f"{processing_time:.2f}s"
        }
        print("✅ Indexing complete.")
        return stats

    def query(self, user_query: str, filter_type: str, auto_route: bool, top_k: int) -> Dict:
        """Routes a query and returns the answer and sources."""
        if not self.index: return {"answer": "Please process a document first.", "sources": ""}

        retriever_args = {"similarity_top_k": int(top_k)}

        # Determine the document type to filter by
        doc_type_to_search = filter_type
        if auto_route and filter_type == "All":
            prompt = f"[INST] You are a query router. Classify the user's query into one of these categories: {self.doc_types_in_file}. Query: '{user_query}' [/INST]"
            predicted_doc_type = Settings.llm.complete(prompt).text.strip()
            print(f"🎯 Auto-routing query to: {predicted_doc_type}")
            if predicted_doc_type in self.doc_types_in_file:
                doc_type_to_search = predicted_doc_type

        if doc_type_to_search != "All":
             print(f"🎯 Filtering retrieval to: {doc_type_to_search}")
             retriever_args["filters"] = MetadataFilters(filters=[ExactMatchFilter(key="doc_type", value=doc_type_to_search)])

        # Retrieve and synthesize
        retriever = self.index.as_retriever(**retriever_args)
        response_nodes = retriever.retrieve(user_query)
        context = "\n\n".join([node.get_text() for node in response_nodes])
        synthesis_prompt = f"[INST] Use the provided context to answer the question.\n\nContext:\n{context}\n\nQuestion: {user_query} [/INST]"
        response = Settings.llm.complete(synthesis_prompt)

        # Format sources
        sources = ""
        if response_nodes:
            sources = "--- SOURCES ---\n"
            for i, node in enumerate(response_nodes):
                sources += f"**Source {i+1} (Page: {node.metadata['page_number']}, Score: {node.score:.2f})**:\n```{node.get_text()[:250].strip()}...```\n\n"

        return {"answer": response.text.strip(), "sources": sources}

# Global instance of our pipeline
pipeline = DocumentIntelligencePipeline()

# ==============================================================================
# Part 4: Gradio UI and Event Handlers
# ==============================================================================
def process_pdf_and_update_ui(file, progress=gr.Progress()):
    if file is None:
        return None, "⚠️ Please upload a PDF file.", gr.update(interactive=False), gr.update(choices=["All"], value="All")

    progress(0, desc="Starting...")
    try:
        file_path = file.name
        progress(0.1, desc="📄 Processing and indexing PDF...")
        stats = pipeline.segment_and_index_document(file_path)

        progress(1.0, desc="✅ Ready.")

        # Format stats for the info panel
        info_text = "\n".join([f"- **{key}:** {value}" for key, value in stats.items()])
        info_text = f"✅ **Successfully Processed:**\n{info_text}"

        return file_path, info_text, [], gr.update(interactive=True), gr.update(choices=["All"] + pipeline.doc_types_in_file, value="All")
    except Exception as e:
        return None, f"❌ Error: {e}", None, gr.update(interactive=False), gr.update(choices=["All"], value="All")

def chat_handler(message, history, filter_type, auto_route, top_k):
    if not pipeline.index:
        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": "Please process a document first."})
        yield history, "", ""
        return

    history.append({"role": "user", "content": message})
    history.append({"role": "assistant", "content": "Thinking..."})
    yield history, "", ""

    result = pipeline.query(message, filter_type, auto_route, top_k)

    bot_response = result["answer"]
    history[-1]['content'] = bot_response
    yield history, result["sources"], ""

# Define the Gradio App
with gr.Blocks(theme=gr.themes.Soft(primary_hue="sky"), title="Enhanced Document Q&A") as app:
    gr.Markdown("# 🚀 Enhanced Document Q&A System")
    gr.Markdown("Intelligent Multi-Document Analysis with Advanced RAG Pipeline")

    with gr.Row(equal_height=True):
        # Left Column: PDF Viewer and Process Button
        with gr.Column(scale=3):
            pdf_viewer = gr.File(label="📄 PDF Document Viewer", height=700)

        # Right Column: Info, Settings, and Chat
        with gr.Column(scale=2):
            gr.Markdown("### 📊 Document Info")
            info_output = gr.Markdown(value="Please upload a PDF file to begin.")

            gr.Markdown("### ⚙️ Settings")
            doc_filter = gr.Dropdown(choices=["All"], value="All", label="🏷️ Document Type Filter")
            auto_route = gr.Checkbox(value=True, label="🎯 Auto-Route Queries")
            chunks_to_retrieve = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="📊 Chunks to Retrieve")

            gr.Markdown("### 💬 Ask Questions")
            chatbot = gr.Chatbot(label="Chat", height=400, type="messages", avatar_images=(None, "https://i.imgur.com/C7MMFf1.png"))
            msg_box = gr.Textbox(label="Your Question", placeholder="Ask a question...", interactive=False, container=False)
            source_display = gr.Markdown(label="Sources Used for Answer")

    # Event Handlers
    pdf_viewer.upload(
        process_pdf_and_update_ui,
        inputs=[pdf_viewer],
        outputs=[pdf_viewer, info_output, chatbot, msg_box, doc_filter]
    )

    msg_box.submit(
        chat_handler,
        inputs=[msg_box, chatbot, doc_filter, auto_route, chunks_to_retrieve],
        outputs=[chatbot, source_display, msg_box]
    )

# ==============================================================================
# Part 5: Launch
# ==============================================================================
test_file_path = "/content/Test Blob File.pdf"
if not os.path.exists(test_file_path):
    print("⏳ Downloading test file 'Test Blob File.pdf'...")
    !wget -q https://storage.googleapis.com/generativeai-downloads/data/Test%20Blob%20File.pdf -O "/content/Test Blob File.pdf"
print("\n🚀 Launching Gradio App... Please upload the 'Test Blob File.pdf' in the UI to begin.")
app.launch(debug=True, share=True)

⏳ Installing all dependencies... This may take several minutes.
✅ Installations complete.


  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /usr/local/lib/python3.12/dist-
[nltk_data]     packages/llama_index/core/_static/nltk_cache...
[nltk_data]   Package punkt_tab is already up-to-date!


⏳ Loading LLM and Embedding Model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


✅ Models loaded successfully.

🚀 Launching Gradio App... Please upload the 'Test Blob File.pdf' in the UI to begin.
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://aeb2802a64a71d2432.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


⏳ Starting document segmentation...
⏳ Creating vector index...
✅ Indexing complete.
🎯 Auto-routing query to: Based on the given query 'what's this', it is difficult to definitively classify it as falling under any specific category without additional context. However, in general terms, such queries can be considered open-ended or ambiguous and could potentially relate to various types of information including documents. Therefore, a safe initial classification for this query could be ['Document'] with the understanding that further clarification from the user may be necessary to refine the category.
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://aeb2802a64a71d2432.gradio.live




In [None]:
# ==============================================================================
# FINAL WORKAROUND CODE (Replaces gr.PDF with gr.File)
# ==============================================================================

# 1. Installations (gradio_pdf is no longer needed)
# ==============================================================================
print("⏳ Installing dependencies...")
!pip install --upgrade gradio gradio_client websockets -q
!pip install llama-index==0.10.34 pypdf==4.2.0 -q
!pip install llama-cpp-python==0.2.73 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu122 -q
!pip install llama-index-llms-llama-cpp==0.1.3 llama-index-embeddings-huggingface==0.2.0 -q
print("✅ Installations complete.")


# 2. Setup and Model Loading
# ==============================================================================
import gradio as gr
import os
import json
import time
from pypdf import PdfReader
from llama_index.core import Document, VectorStoreIndex, Settings
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from typing import List, Dict

# Download and load the open-source models
model_path = "/content/mistral-7b-instruct-v0.2.Q4_K_M.gguf"
if not os.path.exists(model_path):
    print("⏳ Downloading Mistral 7B model... (approx. 4.1 GB)")
    !wget https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf -O {model_path}

print("⏳ Loading LLM and Embedding Model...")
Settings.llm = LlamaCPP(
    model_path=model_path, temperature=0.1, max_new_tokens=512,
    context_window=3900, model_kwargs={"n_gpu_layers": 35}, verbose=False
)
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
print("✅ Models loaded successfully.")


# 3. Backend Pipeline Logic
# ==============================================================================
class DocumentIntelligencePipeline:
    def __init__(self):
        self.index = None
        self.doc_types = ["Lender Fee Sheet", "Payslip", "Contract", "Other"]
        self.file_name = ""

    def segment_and_index_document(self, pdf_path: str):
        print(f"⏳ Processing {os.path.basename(pdf_path)}...")
        self.file_name = os.path.basename(pdf_path)
        reader = PdfReader(pdf_path)
        pages = [{"text": page.extract_text()} for page in reader.pages]
        if not pages: raise ValueError("Could not extract pages from PDF.")

        documents_to_index = []
        doc_types_found = set()
        for i, page in enumerate(pages):
            # Simplified segmentation for robustness
            doc_type = "Document"
            doc_types_found.add(doc_type)
            doc = Document(text=page['text'], metadata={"page_number": i + 1, "doc_type": doc_type})
            documents_to_index.append(doc)

        print("⏳ Creating vector index...")
        self.index = VectorStoreIndex.from_documents(documents_to_index)
        print("✅ Vector index created successfully.")
        return list(doc_types_found)

    def query(self, user_query: str, top_k: int):
        if not self.index: return {"answer": "Please process a document first.", "sources": ""}
        retriever = self.index.as_retriever(similarity_top_k=int(top_k))
        response_nodes = retriever.retrieve(user_query)
        context = "\n\n".join([node.get_text() for node in response_nodes])
        synthesis_prompt = f"[INST] Use the provided context to answer the question.\n\nContext:\n{context}\n\nQuestion: {user_query} [/INST]"
        response = Settings.llm.complete(synthesis_prompt)
        sources = "--- SOURCES ---\n" + "\n".join([f"**Source {i+1} (Page: {n.metadata['page_number']})**:\n```{n.get_text()[:250].strip()}...```" for i, n in enumerate(response_nodes)])
        return {"answer": response.text.strip(), "sources": sources}

pipeline = DocumentIntelligencePipeline()


# 4. Gradio UI and Event Handlers (with Workaround)
# ==============================================================================
def process_pdf_and_update_ui(file, progress=gr.Progress()):
    if file is None:
        return "⚠️ Please upload a PDF file.", gr.update(interactive=False)
    progress(0, desc="Starting...")
    try:
        file_path = file.name
        progress(0.1, desc="📄 Indexing PDF...")
        pipeline.segment_and_index_document(file_path)
        info_text = f"✅ **Ready to answer questions about:** {os.path.basename(file_path)}"
        progress(1.0, desc="✅ Ready.")
        return info_text, gr.update(interactive=True)
    except Exception as e:
        return f"❌ Error: {e}", gr.update(interactive=False)

def chat_handler(message, history, top_k):
    if not pipeline.index:
        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": "Please process a document before asking questions."})
        yield history, "", ""
        return

    history.append({"role": "user", "content": message})
    history.append({"role": "assistant", "content": "Thinking..."})
    yield history, "", ""

    result = pipeline.query(message, top_k)
    bot_response = result["answer"]
    history[-1]['content'] = bot_response
    yield history, result["sources"], ""

with gr.Blocks(theme=gr.themes.Soft(primary_hue="sky"), title="Document Q&A") as app:
    gr.Markdown("# 🚀 Document Q&A System")
    with gr.Row():
        with gr.Column(scale=2):
            # --- WORKAROUND: Replaced gr.PDF with gr.File ---
            file_uploader = gr.File(label="📄 Upload Your PDF File", file_types=[".pdf"])
            info_output = gr.Markdown(value="Please upload a PDF to begin.")
            chunks_to_retrieve = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="📊 Chunks to Retrieve")

        with gr.Column(scale=3):
            chatbot = gr.Chatbot(label="Chat", height=600, type="messages", avatar_images=(None, "https://i.imgur.com/C7MMFf1.png"))
            msg_box = gr.Textbox(label="Your Question", placeholder="Ask a question...", interactive=False)
            source_display = gr.Markdown(label="Sources Used for Answer")

    # Event Handlers
    file_uploader.upload(
        process_pdf_and_update_ui,
        inputs=[file_uploader],
        outputs=[info_output, msg_box]
    )

    msg_box.submit(
        chat_handler,
        inputs=[msg_box, chatbot, chunks_to_retrieve],
        outputs=[chatbot, source_display, msg_box]
    )

# 5. Launch
# ==============================================================================
test_file_path = "/content/Test Blob File.pdf"
if not os.path.exists(test_file_path):
    print("⏳ Downloading test file 'Test Blob File.pdf'...")
    !wget -q https://storage.googleapis.com/generativeai-downloads/data/Test%20Blob%20File.pdf -O "/content/Test Blob File.pdf"
print("\n🚀 Launching Gradio App... Please upload the 'Test Blob File.pdf' in the UI to begin.")
app.launch(debug=True, share=True)

⏳ Installing dependencies...
✅ Installations complete.


  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /usr/local/lib/python3.12/dist-
[nltk_data]     packages/llama_index/core/_static/nltk_cache...
[nltk_data]   Package punkt_tab is already up-to-date!


⏳ Loading LLM and Embedding Model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


✅ Models loaded successfully.

🚀 Launching Gradio App... Please upload the 'Test Blob File.pdf' in the UI to begin.
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://e54dda8bf169d56f33.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


⏳ Processing Test Blob File.pdf...
⏳ Creating vector index...
✅ Vector index created successfully.
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://e54dda8bf169d56f33.gradio.live




In [None]:
# ==============================================================================
# Part 1: Installations
# ==============================================================================
# Install necessary libraries, including the specific version of llama-cpp-python
# with CUDA support for the Colab environment.
print("Installing dependencies... This may take a few minutes.")

# Uninstall conflicting libraries
!pip uninstall -y opencv-contrib-python opencv-python thinc opencv-python-headless

# Install necessary libraries with compatible numpy version
!pip install numpy --upgrade
!pip install gradio llama-index==0.10.34 pypdf==4.2.0 -q
!pip install llama-cpp-python==0.2.73 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu122 -q
!pip install llama-index-llms-llama-cpp==0.1.3 llama-index-embeddings-huggingface==0.2.0 -q
print("✅ Installations complete.")


In [None]:


# ==============================================================================
# Part 2: Setup (Imports and Model Loading)
# ==============================================================================
import gradio as gr
import os
import json
from pypdf import PdfReader
from llama_index.core import Document, VectorStoreIndex, Settings
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from typing import List, Dict

# Download the open-source model (Mistral 7B)
model_path = "/content/mistral-7b-instruct-v0.2.Q4_K_M.gguf"
if not os.path.exists(model_path):
    print("Downloading Mistral 7B model...")
    !wget https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf -O {model_path}
    print("✅ Model download complete.")

# Initialize the LLM and Embedding Model
try:
    Settings.llm = LlamaCPP(
        model_path=model_path,
        temperature=0.1,
        max_new_tokens=512,
        context_window=3900,
        model_kwargs={"n_gpu_layers": 35}, # Offload all layers to GPU
        verbose=False
    )
    Settings.embed_model = HuggingFaceEmbedding(
        model_name="BAAI/bge-small-en-v1.5"
    )
    print("✅ LLM and Embedding Model loaded successfully.")
except Exception as e:
    print(f"❌ Error loading models: {e}")


In [None]:

# ==============================================================================
# Part 3: Backend Pipeline Logic (The "Brain" of the App)
# ==============================================================================
class DocumentIntelligencePipeline:
    """Encapsulates the entire document processing and RAG pipeline."""

    def __init__(self):
        self.index = None
        self.doc_metadata = []
        self.doc_types = ["Lender Fee Sheet", "Payslip", "Contract", "Other"]

    def _classify_first_page(self, text: str) -> str:
        """Classifies the very first page of the blob."""
        prompt = f"""
        [INST] You are a document classification expert. Classify the following document based on its text.
        Choose from: {self.doc_types}.

        Document Text:
        ---
        {text[:1000]}
        ---

        Respond with only the document type. [/INST]
        """
        response = Settings.llm.complete(prompt)
        return response.text.strip()

    def _is_same_document(self, prev_text: str, curr_text: str, prev_doc_type: str) -> bool:
        """Determines if two pages belong to the same document."""
        prompt = f"""
        [INST] You are a document boundary detection expert. Your task is to determine if the 'Current Page' is a continuation of the 'Previous Page'.
        The previous page was part of a '{prev_doc_type}' document. An 'Annexure' or 'Appendix' is part of the preceding document.

        End of Previous Page Text:
        ---
        {prev_text[-500:]}
        ---

        Start of Current Page Text:
        ---
        {curr_text[:500]}
        ---

        Based on the content, is the 'Current Page' a continuation of the same document?
        Respond with only "Yes" or "No". [/INST]
        """
        response = Settings.llm.complete(prompt)
        return "yes" in response.text.lower()

    def segment_document(self, pdf_path: str) -> List[Dict]:
        """Loads a PDF and segments it into distinct documents with metadata."""
        print("Starting document segmentation...")
        reader = PdfReader(pdf_path)
        pages = [{"page_num": i, "text": page.extract_text()} for i, page in enumerate(reader.pages)]

        if not pages:
            return []

        doc_counter = 0
        page_in_doc_counter = 0
        current_doc_type = self._classify_first_page(pages[0]["text"])

        self.doc_metadata = []
        for i, page in enumerate(pages):
            is_new = False
            if i > 0:
                if not self._is_same_document(pages[i-1]["text"], page["text"], current_doc_type):
                    doc_counter += 1
                    page_in_doc_counter = 0
                    current_doc_type = self._classify_first_page(page["text"])
                    is_new = True

            self.doc_metadata.append({
                "page": i + 1,
                "doc_id": doc_counter,
                "doc_type": current_doc_type,
                "page_in_doc": page_in_doc_counter,
                "is_new": "Yes" if is_new or i==0 else "No",
                "text": page["text"]
            })
            page_in_doc_counter += 1

        print(f"✅ Segmentation complete. Found {doc_counter + 1} documents.")
        return self.doc_metadata

    def create_index(self):
        """Creates a metadata-aware vector index from the segmented documents."""
        if not self.doc_metadata:
            return

        print("Creating vector index...")
        documents_to_index = []
        for page_info in self.doc_metadata:
            doc = Document(
                text=page_info["text"],
                metadata={
                    "page_number": page_info["page"],
                    "doc_id": page_info["doc_id"],
                    "doc_type": page_info["doc_type"]
                }
            )
            documents_to_index.append(doc)

        self.index = VectorStoreIndex.from_documents(documents_to_index)
        print("✅ Vector index created successfully.")

    def query(self, user_query: str) -> Dict:
        """Routes a query and returns the answer, sources, and confidence."""
        if not self.index:
            return {"answer": "Please process a document first.", "sources": ""}

        # Stage 1: Route the query by classifying its intent
        prompt = f"""
        [INST] You are a query routing expert. Classify the user's query into the most relevant document type.
        Choose from: {self.doc_types}.

        User Query: "{user_query}"

        Respond with only the most likely document type. [/INST]
        """
        predicted_doc_type = Settings.llm.complete(prompt).text.strip()
        print(f"Query routed to: {predicted_doc_type}")

        # Stage 2: Retrieve with metadata filters
        filters = MetadataFilters(filters=[ExactMatchFilter(key="doc_type", value=predicted_doc_type)])
        retriever = self.index.as_retriever(similarity_top_k=3, filters=filters)

        query_engine = RetrieverQueryEngine.from_args(retriever=retriever)
        response = query_engine.query(user_query)

        # Format sources for display
        sources = ""
        if response.source_nodes:
            sources = "--- SOURCES ---\n"
            for i, node in enumerate(response.source_nodes):
                sources += f"**Source {i+1} (Page: {node.metadata['page_number']}, Score: {node.score:.2f})**:\n"
                sources += f"```\n{node.get_text().strip()}\n```\n\n"

        return {"answer": response.response, "sources": sources}

# Instantiate the pipeline
pipeline = DocumentIntelligencePipeline()

In [None]:



# ==============================================================================
# Part 4: Gradio UI Event Handlers
# ==============================================================================
def process_document_and_display(file):
    """Gradio handler to process an uploaded PDF and display segmentation."""
    if file is None:
        return "Please upload a file.", None

    # Run segmentation and indexing
    segmentation_results = pipeline.segment_document(file.name)
    pipeline.create_index()

    # Format results for display
    display_df = [
        {"Page": r["page"], "Is New Doc?": r["is_new"], "Document Type": r["doc_type"], "Page in Doc": r["page_in_doc"]}
        for r in segmentation_results
    ]

    return f"✅ Processed {os.path.basename(file.name)} and created index.", display_df

def chat_handler(message, history):
    """Gradio handler for the chatbot interaction."""
    result = pipeline.query(message)
    return result["answer"], result["sources"]

# ==============================================================================
# Part 5: Gradio Interface
# ==============================================================================
theme = gr.themes.Soft(primary_hue="blue", secondary_hue="sky")

with gr.Blocks(theme=theme, title="End-to-End Document Intelligence") as app:
    gr.Markdown("# 🤖 End-to-End Document Intelligence System")
    gr.Markdown("Upload a multi-document PDF, see it get automatically segmented, and ask questions about its content.")

    with gr.Row():
        with gr.Column(scale=2):
            file_uploader = gr.File(label="Upload your PDF Blob File", file_types=[".pdf"])
            process_btn = gr.Button("⚙️ Process Document", variant="primary")
            gr.Markdown("### Segmentation Results")
            segmentation_display = gr.DataFrame(headers=["Page", "Is New Doc?", "Document Type", "Page in Doc"], interactive=False)

        with gr.Column(scale=3):
            chatbot = gr.Chatbot(label="Chat with your documents", height=500)
            msg_box = gr.Textbox(label="Your Question", placeholder="e.g., What is the net pay on the payslip?", scale=7)
            source_display = gr.Markdown(label="Sources Used for Answer")

    # Wire up the UI components to the backend logic
    process_btn.click(
        process_document_and_display,
        inputs=[file_uploader],
        outputs=[chatbot, segmentation_display]
    )

    msg_box.submit(
        chat_handler,
        inputs=[msg_box, chatbot],
        outputs=[chatbot, source_display]
    ).then(lambda: "", outputs=[msg_box]) # Clear textbox after submit

# ==============================================================================
# Part 6: Launch
# ==============================================================================
# Add a placeholder file for the user to download and test
print("\nIMPORTANT: Please download the 'Test Blob File.pdf' from the file browser on the left to test the application.")
!wget -q https://storage.googleapis.com/generativeai-downloads/data/Test%20Blob%20File.pdf -O "/content/Test Blob File.pdf"

print("\n🚀 Launching Gradio App...")
app.launch(debug=True, share=True)

In [1]:
# ==============================================================================
# Step 1: Run this Controlled Installation Cell
# ==============================================================================
print("⏳ Installing specific, compatible library versions...")

# This command installs all required packages with pinned versions to ensure compatibility.
!pip install \
    "gradio==4.29.0" \
    "gradio_client==0.16.0" \
    "llama-index==0.10.34" \
    "pypdf==4.2.0" \
    "pymupdf==1.24.1" \
    "numpy==1.26.4" \
    "llama-cpp-python==0.2.73" \
    "llama-index-llms-llama-cpp==0.1.3" \
    "llama-index-embeddings-huggingface==0.2.0" \
    --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu122 -q

print("✅ Installations complete.")

⏳ Installing specific, compatible library versions...
[31mERROR: Cannot install gradio==4.29.0 and gradio_client==0.16.0 because these package versions have conflicting dependencies.[0m[31m
[0m[31mERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts[0m[31m
[0m✅ Installations complete.


In [None]:
# ==============================================================================
# Step 3: Run the Full Application Code in a New Cell
# ==============================================================================

# Imports and Model Loading
# ==============================================================================
import gradio as gr
import os
import json
import time
import fitz
from pypdf import PdfReader
from llama_index.core import Document, VectorStoreIndex, Settings
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from typing import List, Dict
from PIL import Image

print("⏳ Setting up...")
model_path = "/content/mistral-7b-instruct-v0.2.Q4_K_M.gguf"
if not os.path.exists(model_path):
    print("⏳ Downloading Mistral 7B model...")
    !wget https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf -O {model_path}

Settings.llm = LlamaCPP(
    model_path=model_path, temperature=0.1, max_new_tokens=512,
    context_window=3900, model_kwargs={"n_gpu_layers": 35}, verbose=False
)
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
print("✅ Models loaded.")

# Backend Pipeline Logic
# ==============================================================================
class DocumentIntelligencePipeline:
    def __init__(self):
        self.index = None; self.file_name = ""
        self.doc_types_in_file = []

    def process_and_index_document(self, pdf_path: str):
        self.file_name = os.path.basename(pdf_path)
        reader = PdfReader(pdf_path)
        pages = [{"text": page.extract_text()} for page in reader.pages]
        if not pages: raise ValueError("Could not extract pages.")

        documents_to_index = []
        doc_types_found = set()
        for i, page in enumerate(pages):
            doc_type = "Document"
            doc_types_found.add(doc_type)
            metadata = {"page_number": i + 1, "doc_type": doc_type}
            doc = Document(text=page['text'], metadata=metadata)
            documents_to_index.append(doc)

        self.index = VectorStoreIndex.from_documents(documents_to_index)
        self.doc_types_in_file = list(doc_types_found)
        stats = {"File": self.file_name, "Pages": len(pages)}
        return stats

    def query(self, user_query: str, top_k: int):
        if not self.index: return {"answer": "Please process a document first.", "sources": ""}
        retriever = self.index.as_retriever(similarity_top_k=int(top_k))
        nodes = retriever.retrieve(user_query)
        context = "\n\n".join([n.get_text() for n in nodes])
        prompt = f"[INST] Use context to answer. Question: {user_query} [/INST]\nContext:\n{context}"
        response = Settings.llm.complete(prompt)
        sources = "--- SOURCES ---\n" + "\n".join([f"**Source {i+1} (Page: {n.metadata['page_number']})**:\n```{n.get_text()[:200].strip()}...```" for i, n in enumerate(nodes)])
        return {"answer": response.text.strip(), "sources": sources}

pipeline = DocumentIntelligencePipeline()

# Gradio UI and Handlers
# ==============================================================================
def process_pdf_and_update_ui(file, progress=gr.Progress()):
    if file is None: return None, "⚠️ Please upload a PDF.", [], gr.update(interactive=False)

    doc = fitz.open(file.name)
    pix = doc[0].get_pixmap(dpi=150)
    preview_image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    doc.close()

    progress(0.1, desc="📄 Indexing PDF...")
    stats = pipeline.process_and_index_document(file.name)
    info_text = f"✅ **Processed:**\n- **File:** {stats['File']}\n- **Pages:** {stats['Pages']}"
    progress(1.0, desc="✅ Ready.")
    return preview_image, info_text, [], gr.update(interactive=True)

def chat_handler(message, history, top_k):
    if not pipeline.index:
        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": "Please process a document first."})
        yield history, "", ""
        return
    history.append({"role": "user", "content": message})
    history.append({"role": "assistant", "content": "Thinking..."})
    yield history, "", ""
    result = pipeline.query(message, top_k)
    history[-1]['content'] = result["answer"]
    yield history, result["sources"], ""

with gr.Blocks(theme=gr.themes.Soft(primary_hue="sky"), title="Document Q&A") as app:
    gr.Markdown("# 🚀 Document Q&A System")
    with gr.Row(equal_height=False):
        with gr.Column(scale=4, min_width=400):
            file_uploader = gr.File(label="📄 Upload PDF File", file_types=[".pdf"])
            process_btn = gr.Button("⚙️ Process Document", variant="primary")
            pdf_preview = gr.Image(label="PDF Preview (First Page)", height=650)
        with gr.Column(scale=2, min_width=300):
            info_output = gr.Markdown(value="Please upload a PDF file to begin.")
            chunks_to_retrieve = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="📊 Chunks to Retrieve")
        with gr.Column(scale=4, min_width=400):
            chatbot = gr.Chatbot(label="Chat", height=500, type="messages")
            source_display = gr.Markdown(label="Sources Used for Answer")
            msg_box = gr.Textbox(label="Your Question", placeholder="Ask a question...", interactive=False, container=False)
    process_btn.click(process_pdf_and_update_ui, inputs=[file_uploader], outputs=[pdf_preview, info_output, chatbot, msg_box])
    msg_box.submit(chat_handler, inputs=[msg_box, chatbot, chunks_to_retrieve], outputs=[chatbot, source_display, msg_box])

# Launch the App
# ==============================================================================
test_file_path = "/content/Test Blob File.pdf"
if not os.path.exists(test_file_path):
    !wget -q https://storage.googleapis.com/generativeai-downloads/data/Test%20Blob%20File.pdf -O "/content/Test Blob File.pdf"
print("\n🚀 Launching Gradio App...")
app.launch(debug=True, share=True)

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /usr/local/lib/python3.12/dist-
[nltk_data]     packages/llama_index/core/_static/nltk_cache...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


⏳ Setting up...
⏳ Downloading Mistral 7B model...
--2025-09-30 05:17:17--  https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf
Resolving huggingface.co (huggingface.co)... 13.35.202.34, 13.35.202.40, 13.35.202.121, ...
Connecting to huggingface.co (huggingface.co)|13.35.202.34|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cas-bridge.xethub.hf.co/xet-bridge-us/65778ac662d3ac1817cc9201/865f5e4682dddb29c2e20270b2471a7590c83a414bbf1d72cf4c08fdff2eeca4?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20250930%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250930T051240Z&X-Amz-Expires=3600&X-Amz-Signature=0d0053b458cb7a2ee48b4a58e700ccb2dacdf20255b49f66afd98ca3a5eaae31&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=public&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27mistral-7b-instruct-v0.2.Q4_K_M.gguf%3B+filename%3D%22mistral-7b-instruct-v0

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Models loaded.

🚀 Launching Gradio App...
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://eefc4d929958737631.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
