In [2]:
!pip install -q torch

In [3]:
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

CUDA available: False


In [1]:
# Check CUDA version first
!nvcc --version

# Install llama-cpp-python with CUDA 12.x support
!pip install --no-cache-dir llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu123

# install remaining libraries
!pip install llama-index
!pip install pymupdf
!pip install llama-index-llms-llama-cpp
!pip install llama-index-embeddings-huggingface

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Looking in indexes: https://pypi.org/simple, https://abetlen.github.io/llama-cpp-python/whl/cu123
Collecting llama-index-llms-llama-cpp
  Using cached llama_index_llms_llama_cpp-0.4.0-py3-none-any.whl.metadata (4.5 kB)
Collecting llama-cpp-python<0.4.0,>=0.3.0 (from llama-index-llms-llama-cpp)
  Using cached llama_cpp_python-0.3.14.tar.gz (51.0 MB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Using cached llama_index_llms_llama_cpp-0.4.0-py3-none-any.whl (7.5 kB)
Building wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created 

In [4]:
from llama_cpp import Llama
import os

# Download Mistral model if not already present
model_path = "/content/mistral.gguf"
if not os.path.exists(model_path):
    !wget https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf -O {model_path}
    print(f"Model downloaded to {model_path}")

# Verify file exists and check size
if os.path.exists(model_path):
    print(f"Model file exists. Size: {os.path.getsize(model_path) / (1024 * 1024):.2f} MB")
else:
    print("Model file not found!")


--2025-07-20 22:19:45--  https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf
Resolving huggingface.co (huggingface.co)... 18.164.174.23, 18.164.174.118, 18.164.174.55, ...
Connecting to huggingface.co (huggingface.co)|18.164.174.23|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.hf.co/repos/72/62/726219e98582d16c24a66629a4dec1b0761b91c918e15dea2625b4293c134a92/3e0039fd0273fcbebb49228943b17831aadd55cbcbf56f0af00499be2040ccf9?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27mistral-7b-instruct-v0.2.Q4_K_M.gguf%3B+filename%3D%22mistral-7b-instruct-v0.2.Q4_K_M.gguf%22%3B&Expires=1753053586&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc1MzA1MzU4Nn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzcyLzYyLzcyNjIxOWU5ODU4MmQxNmMyNGE2NjYyOWE0ZGVjMWIwNzYxYjkxYzkxOGUxNWRlYTI2MjViNDI5M2MxMzRhOTIvM2UwMDM5ZmQwMjczZmNiZWJ

In [16]:
import fitz  # PyMuPDF

# Define document paths
doc_paths = {
    "Unknown 1": "/content/sample_bank_statement.pdf",
    "Unknown 2": "/content/payslip_sample_image.pdf",
    "Unknown 3": "/content/appraisal_report.pdf"
}

# Extract text from all PDFs
doc_texts = {}

for i, (doc_type, path) in enumerate(doc_paths.items()):
    doc = fitz.open(path)
    text = "\n".join([page.get_text() for page in doc])
    doc_texts[f"Doc-{i+1}"] = text  # Temporarily label them "Unknown"
    print(f"Extracted {len(text.split())} words from {path}.")

Extracted 287 words from /content/sample_bank_statement.pdf.
Extracted 82 words from /content/payslip_sample_image.pdf.
Extracted 6470 words from /content/appraisal_report.pdf.


In [17]:
doc_texts.keys()

dict_keys(['Doc-1', 'Doc-2', 'Doc-3'])

In [18]:
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.core import Document

# Load Mistral model with optimized generic parameters
llm = LlamaCPP(
    model_path="/content/mistral.gguf",
    temperature=0.0,  # Zero temperature for deterministic classification
    max_new_tokens=30,  # We only need a single category name
    context_window=4096,  # Increased context to handle our sampling approach
)


llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /content/mistral.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.

In [19]:
def prepare_document_for_classification(text):
    # Instead of truncating to first 500 chars, create a better representation

    # Get first, middle, and last portions
    doc_length = len(text)
    first_part = text[:min(500, doc_length)]

    middle_start = max(0, doc_length//2 - 250)
    middle_part = text[middle_start:middle_start + min(500, doc_length - middle_start)]

    last_start = max(0, doc_length - 500)
    last_part = text[last_start:]

    # Extract any structural elements (headings, tables, etc.)
    # This is a simplified version - could use regex for better extraction
    #possible_headers = [line.strip() for line in text.split('\n')
    #                   if line.strip() and len(line.strip()) < 50
    #                   and line.strip().isupper()]
    #headers = possible_headers[:10]  # Take first 10 potential headers

    return {
        "first_part": first_part,
        "middle_part": middle_part,
        "last_part": last_part,
        "total_length": doc_length,
        #"potential_headers": "\n".join(headers)
    }

In [20]:
def classify_document(text):
    doc_info = prepare_document_for_classification(text)

    prompt = f"""You are a document classification expert. Classify this document into one of these categories:
    - Bank Statement
    - Pay Slip
    - Appraisal Report
    - Unknown

    Here's information extracted from the document:

    DOCUMENT START EXCERPT:
    {doc_info['first_part']}
    DOCUMENT START EXCERPT END

    DOCUMENT MIDDLE EXCERPT:
    {doc_info['middle_part']}
    DOCUMENT MIDDLE EXCERPT END

    DOCUMENT END EXCERPT:
    {doc_info['last_part']}
    DOCUMENT END EXCERPT END

    Total document length: {doc_info['total_length']} characters

    IMPORTANT INSTRUCTION: Your response must be EXACTLY ONE of these four options:
    Bank Statement
    Pay Slip
    Appraisal Report
    Unknown

    Do not include any explanation, reasoning, or additional text. Respond with ONLY the category name.
    """

    response = llm.complete(prompt)
    raw_response = response.text.strip()

    # Post-process to extract just the category name
    categories = ["Bank Statement", "Pay Slip", "Appraisal Report", "Unknown"]

    # First check if the response exactly matches one of our categories
    if raw_response in categories:
        return raw_response

    # If not, look for the category within the response
    for category in categories:
        if category.lower() in raw_response.lower():
            return category

    # If still no match, return the closest match
    import re
    words = re.findall(r'\b\w+\b', raw_response.lower())
    if "bank" in words or "statement" in words:
        return "Bank Statement"
    elif "pay" in words or "slip" in words or "salary" in words:
        return "Pay Slip"
    elif "appraisal" in words or "property" in words:
        return "Appraisal Report"
    else:
        return "Unknown"

In [None]:
classified_docs = {}
for doc_id, text in doc_texts.items():
    doc_type = classify_document(text)
    classified_docs[doc_id] = {"text": text, "doc_type": doc_type}
    print(f"{doc_id} classified as: {doc_type}")

In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Load embedding model
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [None]:
classified_docs.keys()

In [None]:
index_map = {}

for doc_id, data in classified_docs.items():
    doc_type = data["doc_type"]

    if doc_type == "Unknown":
        continue  # Skip unknown documents

    document = Document(text=data["text"], metadata={"doc_type": doc_type})

    if doc_type not in index_map:
        index_map[doc_type] = VectorStoreIndex.from_documents([document], embed_model=embed_model)
    else:
        index_map[doc_type].insert(document)

    print(f"Indexed {doc_id} as {doc_type}.")


In [None]:
index_map

In [None]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.response_synthesizers import CompactAndRefine
import re

def route_query(query):
    # Check which document type the query is related to
    prompt = f"""
    Classify the following question into one of these categories:
    - 'Bank Statement'
    - 'Pay Slip'
    - 'Appraisal Report'

    If it does not match any, respond with 'Unknown'.

    IMPORTANT INSTRUCTION: Your response must be EXACTLY ONE of these four options:
    Bank Statement
    Pay Slip
    Appraisal Report
    Unknown

    Do not include any explanation, reasoning, or additional text. Respond with ONLY the category name.

    Query: {query}
    """

    doc_type = llm.complete(prompt).text.strip()

    raw_response = doc_type

    # Post-process to extract just the category name
    categories = ["Bank Statement", "Pay Slip", "Appraisal Report", "Unknown"]

    # First check if the response exactly matches one of our categories
    if raw_response in categories:
        doc_type = raw_response

    # If not, look for the category within the response
    for category in categories:
        if category.lower() in raw_response.lower():
            doc_type = category

    # If still no match, return the closest match
    words = re.findall(r'\b\w+\b', raw_response.lower())
    if "bank" in words or "statement" in words:
        doc_type = "Bank Statement"
    elif "pay" in words or "slip" in words or "salary" in words:
        doc_type = "Pay Slip"
    elif "appraisal" in words or "property" in words:
        doc_type = "Appraisal Report"
    else:
        doc_type = "Unknown"

    if doc_type not in index_map:
        return "Could not determine document type."

    # Retrieve from the correct index
    retriever = index_map[doc_type].as_retriever(similarity_top_k=2)

    # Create a response synthesizer with the Mistral model
    response_synthesizer = CompactAndRefine(
        llm=llm,
        verbose=True
    )

    # Create the query engine with our explicit components
    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=response_synthesizer
    )

    response = query_engine.query(query)
    return f"📄 **Document Type:** {doc_type}\n🔍 **Answer:** {response}"

# Test different queries
print(route_query("What is my net salary?"))
print(route_query("What is the appraised value of the house?"))
print(route_query("What was my last deposit?"))
