In [18]:
# Cell 2 - imports & config
import os, io, json, re, math
from typing import List, Tuple, Dict, Any
import fitz                # PyMuPDF
from PIL import Image
from pdf2image import convert_from_path
import pytesseract

# Embedding model
from sentence_transformers import SentenceTransformer

# Vector DB
import chromadb
from chromadb.config import Settings

# Ollama (local LLM client)
import ollama

# --------- CONFIG ---------
POPLER_PATH = r"C:\Users\lenovo\Downloads\Release-25.12.0-0\poppler-25.12.0\Library\bin"   # <-- change to your poppler bin (Windows)
TESSERACT_CMD = r"C:\Program Files\Tesseract-OCR\tesseract.exe"  # <-- change if needed
pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD

EMBED_MODEL_NAME = "sentence-transformers/paraphrase-mpnet-base-v2"   # small CPU embedding model
EMBED_BATCH = 64

# Ollama models (change if you pulled different ones)
CLASSIFIER_MODEL = "mistral-nemo:12b"
EXTRACTION_MODEL = "mistral-nemo:12b"

# Chroma persistent directory
CHROMA_DIR = "chroma_store"

# Chunk size for text chunks used to create embeddings
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200


In [19]:
# Cell 3 - text extraction (auto-detect)
def extract_text_auto(pdf_path: str, ocr_dpi=300, ocr_lang="eng") -> List[str]:
    """
    Returns: list of page texts (one element per page).
    Uses PyMuPDF for machine text and falls back to OCR per-page.
    """
    doc = fitz.open(pdf_path)
    pages_text = []
    for i, page in enumerate(doc, start=1):
        try:
            page_text = page.get_text().strip()
        except Exception:
            page_text = ""
        if page_text and len(page_text) > 40:
            pages_text.append(page_text)
        else:
            # Try PyMuPDF pixmap -> PIL image -> pytesseract
            try:
                pix = page.get_pixmap(dpi=ocr_dpi)
                img = Image.open(io.BytesIO(pix.tobytes()))
            except Exception:
                # fallback to pdf2image (slower)
                images = convert_from_path(pdf_path, dpi=ocr_dpi, poppler_path=POPLER_PATH)
                img = images[i-1] if i-1 < len(images) else images[0]
            ocr_text = pytesseract.image_to_string(img, lang=ocr_lang)
            pages_text.append(ocr_text)
    return pages_text

def join_pages(pages: List[str]) -> str:
    txt = "\n".join(pages)
    return " ".join(txt.split())


In [20]:
# Cell 4 - chunking long text into overlapping chunks
def chunk_text(text: str, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP) -> List[str]:
    text = text.strip()
    if len(text) <= chunk_size:
        return [text]
    chunks = []
    start = 0
    L = len(text)
    while start < L:
        end = min(start + chunk_size, L)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        start += (chunk_size - overlap)
    return chunks


In [21]:
# Cell 5 - load SentenceTransformer once
print("Loading embedding model:", EMBED_MODEL_NAME)
embed_model = SentenceTransformer(EMBED_MODEL_NAME)
print("Embedding model loaded.")


Loading embedding model: sentence-transformers/paraphrase-mpnet-base-v2
Embedding model loaded.


In [22]:
def get_extraction_prompt(category, text):
    """
    Returns a strict JSON extraction prompt for the given document category.
    Bank Statements and Salary Slips have been intentionally removed.
    """

    # ============================================================
    # 1) HOSPITAL INSURANCE CLAIM FORM
    # ============================================================
    if category == "Claim Document":
        return f"""
You are an expert in extracting structured fields from HOSPITAL INSURANCE CLAIM FORMS.
Extract ONLY values explicitly visible — NO hallucination.

CRITICAL RULES:
1. Missing → return "Not mentioned".
2. Preserve original formatting of dates, amounts, and labels.
3. Checkboxes: ✔ / ✓ / X / Yes / Selected = mark as present.
4. Keep OCR errors minimal — extract only clear text.
5. AI Summary must be max 2 sentences.

FIELDS TO EXTRACT:
- Primary Insured Name
- Policy Number
- TPA / Company ID
- Employee / Member ID
- Patient Name
- Insurance Company Name
- Hospital Name
- Hospital Type
- Admission Date
- Discharge Date
- Injury / Illness Type
- Claim Type
- Claim Amount
- Billing Breakdown:
    - Pre Hospitalization
    - Hospitalization
    - Post Hospitalization
    - Pharmacy Bills
    - Ambulance Charges
    - Other Charges
- Submitted Documents Checklist (list)
- City / Location
- AI Summary

DOCUMENT:
{text}

Return STRICT JSON ONLY:
{{
  "Primary Insured Name": "",
  "Policy Number": "",
  "TPA / Company ID": "",
  "Employee / Member ID": "",
  "Patient Name": "",
  "Insurance Company Name": "",
  "Hospital Name": "",
  "Hospital Type": "",
  "Admission Date": "",
  "Discharge Date": "",
  "Injury / Illness Type": "",
  "Claim Type": "",
  "Claim Amount": "",
  "Billing Breakdown": {{
    "Pre Hospitalization": "",
    "Hospitalization": "",
    "Post Hospitalization": "",
    "Pharmacy Bills": "",
    "Ambulance Charges": "",
    "Other Charges": ""
  }},
  "Submitted Documents Checklist": [],
  "City / Location": "",
  "AI Summary": ""
}}
"""

    # ============================================================
    # 2) HEALTH INSURANCE POLICY
    # ============================================================
    if category == "Health Insurance Policy":
        return f"""
You are an expert in HEALTH INSURANCE POLICY extraction.

RULES:
- Extract ONLY explicit text from document.
- Missing → "Not mentioned".
- Preserve formatting.

FIELDS:
- Policy Holder Name
- Policy Number
- Insurance Company
- TPA Name
- Sum Insured
- Coverage Type
- Policy Start Date
- Policy End Date
- UIN / Product Code
- AI Summary (max 2 sentences)

DOCUMENT:
{text}

Return STRICT JSON ONLY:
{{
 "Policy Holder Name": "",
 "Policy Number": "",
 "Insurance Company": "",
 "TPA Name": "",
 "Sum Insured": "",
 "Coverage Type": "",
 "Policy Start Date": "",
 "Policy End Date": "",
 "UIN / Product Code": "",
 "AI Summary": ""
}}
"""

    # ============================================================
    # 3) MOTOR INSURANCE POLICY
    # ============================================================
    if category == "Motor Insurance Policy":
        return f"""
You extract structured fields from MOTOR INSURANCE POLICY documents.

RULES:
- No hallucination.
- Preserve original formatting.

FIELDS:
- Policy Holder Name
- Policy Number
- Insurance Company
- Coverage Type
- Vehicle Model
- Registration Number
- Engine / Chassis Number
- Policy Start Date
- Policy End Date
- IDV (if present)
- UIN
- AI Summary

DOCUMENT:
{text}

Return STRICT JSON ONLY:
{{
 "Policy Holder Name": "",
 "Policy Number": "",
 "Insurance Company": "",
 "Coverage Type": "",
 "Vehicle Model": "",
 "Registration Number": "",
 "Engine / Chassis Number": "",
 "Policy Start Date": "",
 "Policy End Date": "",
 "IDV": "",
 "UIN": "",
 "AI Summary": ""
}}
"""

    # ============================================================
    # 4) LIFE INSURANCE POLICY
    # ============================================================
    if category == "Life Insurance Policy":
        return f"""
You extract structured fields from LIFE INSURANCE POLICY documents.

RULES:
- No guessing. Missing → "Not mentioned".
- Preserve formatting.
- AI Summary: max 2 sentences.

FIELDS:
- Policy Holder Name
- Insured Person / Life Assured
- Policy Number
- Insurance Company
- Plan / Policy Type
- Sum Assured
- Annual / Monthly Premium
- Policy Term
- Premium Paying Term
- Nominee Name
- Policy Start Date
- Policy End Date / Maturity Date
- UIN / Product Code
- Benefit Type
- AI Summary

DOCUMENT:
{text}

Return STRICT JSON ONLY:
{{
 "Policy Holder Name": "",
 "Insured Person / Life Assured": "",
 "Policy Number": "",
 "Insurance Company": "",
 "Plan / Policy Type": "",
 "Sum Assured": "",
 "Annual / Monthly Premium": "",
 "Policy Term": "",
 "Premium Paying Term": "",
 "Nominee Name": "",
 "Policy Start Date": "",
 "Policy End Date / Maturity Date": "",
 "UIN / Product Code": "",
 "Benefit Type": "",
 "AI Summary": ""
}}
"""

    # ============================================================
    # 5) IRCTC TRAIN TICKET
    # ============================================================
    if category == "IRCTC Ticket":
        return f"""
You extract structured fields from IRCTC train tickets.

FIELDS:
- Passenger Name
- Train Number
- Train Name
- Date of Journey
- Boarding Station
- Destination Station
- Class
- PNR
- Booking Status
- Current Status
- Coach / Seat
- Fare
- AI Summary

DOCUMENT:
{text}

Return STRICT JSON ONLY:
{{
 "Passenger Name": "",
 "Train Number": "",
 "Train Name": "",
 "Date of Journey": "",
 "Boarding Station": "",
 "Destination Station": "",
 "Class": "",
 "PNR": "",
 "Booking Status": "",
 "Current Status": "",
 "Coach / Seat": "",
 "Fare": "",
 "AI Summary": ""
}}
"""

    # ============================================================
    # 6) INVOICE
    # ============================================================
    if category == "Invoice":
        return f"""
You extract structured fields from INVOICE documents.

FIELDS:
- Invoice Number
- Invoice Date
- Vendor Name
- Customer Name
- Items (name, quantity, price, amount)
- Tax Amount
- Total Amount
- Payment Terms
- AI Summary

DOCUMENT:
{text}

Return STRICT JSON ONLY:
{{
 "Invoice Number": "",
 "Invoice Date": "",
 "Vendor Name": "",
 "Customer Name": "",
 "Items": [],
 "Tax Amount": "",
 "Total Amount": "",
 "Payment Terms": "",
 "AI Summary": ""
}}
"""

    # ============================================================
    # 7) LEGAL NOTICE  ✅ FIXED & PROPERLY INDENTED
    # ============================================================
    if category == "Legal Notice":
        return f"""
You are an expert in extracting structured information from LEGAL NOTICE DOCUMENTS.

RULES:
- Extract ONLY what is explicitly written.
- No hallucination. Missing → "Not mentioned".
- AI Summary = 2 clear sentences.

FIELDS:
- Document Title
- Client / Complainant
- Accused / Respondent
- Advocate
- Issue Description
- Contract Amount
- Amount Pending
- Payment Deadline
- Interest Rate
- Notice Date
- Legal Actions Mentioned
- AI Summary

DOCUMENT:
{text}

Return STRICT JSON ONLY:
{{
  "Document Title": "",
  "Client / Complainant": "",
  "Accused / Respondent": "",
  "Advocate": "",
  "Issue Description": "",
  "Contract Amount": "",
  "Amount Pending": "",
  "Payment Deadline": "",
  "Interest Rate": "",
  "Notice Date": "",
  "Legal Actions Mentioned": "",
  "AI Summary": ""
}}
"""

    # ============================================================
    # FALLBACK
    # ============================================================
    return f"""
Extract ONLY explicit metadata from this document.

DOCUMENT:
{text}

Return STRICT JSON ONLY:
{{
 "Document Type": "",
 "Key Fields": "",
 "AI Summary": ""
}}
"""


In [23]:
# Use this replace for your upsert_pdf_to_chroma function
def upsert_pdf_to_chroma(pdf_path: str, collection_name: str, clear_existing=False):
    """
    Extract text, chunk, embed, and upsert to Chroma collection.
    - If clear_existing=True, will delete any existing items with same collection_name.
    """
    pages = extract_text_auto(pdf_path)
    full_text = " ".join(pages)
    chunks = chunk_text(full_text)

    client = get_chroma_client()
    # create or get collection
    try:
        col = client.get_collection(collection_name)
    except Exception:
        col = client.create_collection(collection_name)

    ids = [f"{os.path.basename(pdf_path)}:::chunk::{i}" for i in range(len(chunks))]
    metadatas = [{"source": pdf_path, "chunk_index": i} for i in range(len(chunks))]

    # compute embeddings in batches and convert to plain python lists
    embeddings = []
    for i in range(0, len(chunks), EMBED_BATCH):
        batch = chunks[i:i+EMBED_BATCH]
        embs = embed_model.encode(batch, show_progress_bar=False, convert_to_numpy=True)
        # convert each row to native python list for chroma safety
        for row in embs:
            embeddings.append(row.tolist() if hasattr(row, "tolist") else list(row))

    if len(embeddings) != len(chunks):
        raise RuntimeError(f"Embeddings length mismatch: {len(embeddings)} vs chunks {len(chunks)}")

    # Optionally remove any previous chunks for this file to avoid duplicates
    if clear_existing:
        try:
            # naive delete by ids if present
            existing = col.get(include=["ids", "metadatas"])
            # collect ids that start with this filename
            to_delete = [eid for eid in existing.get("ids", []) if eid.startswith(os.path.basename(pdf_path) + ":::chunk::")]
            if to_delete:
                col.delete(ids=to_delete)
        except Exception:
            pass

    # Add (or upsert)
    col.add(ids=ids, documents=chunks, metadatas=metadatas, embeddings=embeddings)
    print(f"Inserted {len(chunks)} chunks into collection '{collection_name}'")
    return col


In [24]:
# Cell 7 - retrieval & simple embedding-based classifier
def retrieve_similar_chunks(query: str, collection_name: str, top_k=5):
    client = get_chroma_client()
    col = client.get_collection(collection_name)

    # FIXED embedding call
    q_emb = embed_model([query])[0]
    try:
        q_emb = q_emb.tolist()
    except:
        pass

    results = col.query(
        query_embeddings=[q_emb],
        n_results=top_k
    )

    docs = results.get("documents", [[]])[0]
    metas = results.get("metadatas", [[]])[0]
    distances = results.get("distances", [[]])[0] if "distances" in results else [None] * len(docs)

    # Prevent None errors
    safe_docs = [(doc if isinstance(doc, str) else "") for doc in docs]

    return list(zip(safe_docs, metas, distances))


# Small helper: quick category candidates by checking common category keywords inside top chunks
# Small helper: quick category candidates by checking common category keywords inside top chunks
CATEGORY_KEYWORDS = {
    "Claim Document": [
        "claim", "admission", "discharge", "diagnosis", "tpa",
        "claim amount", "hospital", "pre-authorization"
    ],

    "Health Insurance Policy": [
        "policy", "sum insured", "uin", "health insurance",
        "coverage", "policy schedule"
    ],

    "Motor Insurance Policy": [
        "motor", "vehicle", "engine", "chassis", "registration",
        "idv", "two wheeler", "four wheeler"
    ],

    "Life Insurance Policy": [
        "life assured", "sum assured", "premium", "maturity",
        "death benefit", "survival benefit", "policy term"
    ],

    "Hospital Bill": [
        "bill", "invoice", "hospital charges", "room rent",
        "doctor fee", "pharmacy", "investigation"
    ],

    "Payment Receipt": [
        "receipt", "amount paid", "paid on", "transaction id",
        "payment received", "cash received"
    ],

    "KYC / Identity Document": [
        "passport", "aadhar", "aadhaar", "identity", "dob",
        "pan", "voter id", "driving license"
    ],

    "IRCTC Ticket": [
        "pnr", "train", "railway", "departure", "arrival",
        "berth", "coach", "irctc", "journey"
    ],

    "Invoice": [
        "invoice", "gst", "total payable", "unit price", "quantity",
        "tax invoice", "hsn", "igst", "cgst", "sgst"
    ],

    "Legal Notice": [
        "legal notice", "advocate", "lawyer", "contractor",
        "agreement", "pending amount", "breach", "serve notice",
        "defaulter", "obligation", "due amount", "legal action"
    ]
}

def embedding_candidate_category(chunks_with_meta):
    scores = {}

    for doc, meta, dist in chunks_with_meta:
        if not doc:
            continue

        snippet = str(doc).lower()

        for category, keywords in CATEGORY_KEYWORDS.items():
            for k in keywords:
                if k in snippet:
                    scores[category] = scores.get(category, 0) + 1

    if not scores:
        return []

    valid_chunks = max(1, sum(1 for doc,_,_ in chunks_with_meta if doc))

    for c in scores:
        scores[c] /= valid_chunks

    return sorted(scores.items(), key=lambda x: x[1], reverse=True)


In [25]:
def llm_classify_with_context(text: str, model=CLASSIFIER_MODEL, top_k_snippets: List[str] = None):
    """
    Uses an LLM to classify the document into one category.
    Matches EXACTLY the categories supported in get_extraction_prompt().
    """

    ctx = "\n\n".join(top_k_snippets) if top_k_snippets else text[:3000]

    prompt = f"""
You are a highly accurate professional document classifier.
Classify the document into exactly ONE category from this list:

- Claim Document
- Health Insurance Policy
- Motor Insurance Policy
- Life Insurance Policy
- Hospital Bill
- Payment Receipt
- KYC / Identity Document
- IRCTC Ticket
- Invoice
- Legal Notice
- Other

RULES:
1. Use ONLY explicit text visible in the context.
2. If unsure or ambiguous → return "Other".
3. Output ONLY valid JSON.
4. Confidence MUST be between 0 and 1.

Return JSON only in this exact format:
{{ "category": "", "confidence": 0.0 }}

CONTEXT:
{ctx}
"""

    try:
        resp = ollama.chat(model=model, messages=[{"role":"user","content":prompt}])
        content = resp["message"]["content"].strip()
    except Exception as e:
        print("⚠️ ERROR: LLM classifier failed:", e)
        return {"category": "Other", "confidence": 0.0}

    # strict json
    try:
        parsed = json.loads(content)
        if "category" in parsed and "confidence" in parsed:
            return parsed
    except:
        pass

    m = re.search(r"\{[\s\S]*?\}", content)
    if m:
        try:
            obj = json.loads(m.group(0))
            if "category" in obj and "confidence" in obj:
                return obj
        except:
            pass

    return {"category": "Other", "confidence": 0.3}


In [26]:
# Cell 9 - improved prompt router (only key templates shown; add more as needed)
'''def get_extraction_prompt(category: str, text: str) -> str:
    if category == "Claim Document":
        return f"""
You are an expert at extracting structured fields from insurance CLAIM forms (health, motor).
Extract EXACT fields and RETURN ONLY JSON with the following keys:
{{ 
 "Policy Holder Name":"", "Claimant Name":"", "Patient Name":"",
 "Policy Number":"", "Insurance Company":"", "TPA Name":"",
 "Hospital Name":"", "Admission Date":"", "Discharge Date":"",
 "Diagnosis / Reason":"", "Claim Type":"", "Claim Amount":"",
 "UIN":"", "Documents Submitted": [], "AI Summary":""
}}

Rules:
- Use "Not mentioned" for missing fields.
- Extract dates/amounts exactly as written.
- Documents Submitted should be a list of submitted/checked items found in text.
- AI Summary: 1-2 sentences stating claimant, policy no (if found), claim type, and amount (if found).

Context Text:
{text}
"""
    if category == "Health Insurance Policy":
        return f"""
[Health policy extraction prompt omitted for brevity — implement similar strict JSON keys]
"""
    if category == "Motor Insurance Policy":
        return f"""
[Motor policy extraction prompt omitted for brevity — implement similar strict JSON keys]
"""
    # generic fallback:
    return f"""
Extract a short structured JSON summary from the document. Return ONLY JSON.
Document:
{text}
"""
'''

'def get_extraction_prompt(category: str, text: str) -> str:\n    if category == "Claim Document":\n        return f"""\nYou are an expert at extracting structured fields from insurance CLAIM forms (health, motor).\nExtract EXACT fields and RETURN ONLY JSON with the following keys:\n{{ \n "Policy Holder Name":"", "Claimant Name":"", "Patient Name":"",\n "Policy Number":"", "Insurance Company":"", "TPA Name":"",\n "Hospital Name":"", "Admission Date":"", "Discharge Date":"",\n "Diagnosis / Reason":"", "Claim Type":"", "Claim Amount":"",\n "UIN":"", "Documents Submitted": [], "AI Summary":""\n}}\n\nRules:\n- Use "Not mentioned" for missing fields.\n- Extract dates/amounts exactly as written.\n- Documents Submitted should be a list of submitted/checked items found in text.\n- AI Summary: 1-2 sentences stating claimant, policy no (if found), claim type, and amount (if found).\n\nContext Text:\n{text}\n"""\n    if category == "Health Insurance Policy":\n        return f"""\n[Health policy e

In [27]:
def extract_fields_via_llm(text: str, category: str, model=EXTRACTION_MODEL, max_chars=6000, rag_k=8, collection_name=None):
    """
    Use RAG: retrieve top rag_k chunks and pass them as 'CONTEXT CHUNKS' to LLM extractor.
    If collection_name is provided, we retrieve relevant chunks from Chroma.
    """
    # If a collection is available, retrieve top chunks relevant to the doc head
    context_block = ""
    if collection_name:
        # use the first 2000 chars as a query to retrieve relevant passages
        q = text[:2000]
        chunks = retrieve_similar_chunks(q, collection_name, top_k=rag_k)
        snippets = [doc for doc,meta,d in chunks if doc]
        if snippets:
            context_block = "\n\n---CONTEXT CHUNKS---\n\n" + "\n\n".join(snippets)

    # Build prompt using context_block + truncated text (fallback)
    body_text = text[:max_chars]
    prompt = get_extraction_prompt(category, body_text + ("\n\n" + context_block if context_block else ""))

    try:
        resp = ollama.chat(model=model, messages=[{"role":"user","content":prompt}])
        content = resp["message"]["content"].strip()
    except Exception as e:
        print("⚠️ ERROR: extraction LLM failed:", e)
        return {"AI Summary": "", "Note": f"LLM call failed: {e}"}

    # remove triple backticks
    content = re.sub(r"```(?:json)?", "", content).strip()

    # try parse JSON robustly
    try:
        return json.loads(content)
    except:
        m = re.search(r"\{[\s\S]*\}", content)
        if m:
            jtxt = m.group(0)
            jtxt = re.sub(r",\s*}", "}", jtxt)
            jtxt = re.sub(r",\s*]", "]", jtxt)
            try:
                return json.loads(jtxt)
            except Exception:
                pass

    return {"AI Summary": content[:2000], "Note": "Failed to parse JSON"}


In [28]:
# Cell 11 - universal weighted confidence scorer
def compute_confidence_from_dict(extracted: dict, category: str = None) -> float:
    """
    Computes confidence score dynamically based on category fields.
    Prevents incorrect penalties for fields that are not applicable 
    to a given document type.
    """

    if not isinstance(extracted, dict) or len(extracted) == 0:
        return 0.0

    # ----------------------
    # CATEGORY-SPECIFIC KEYS
    # ----------------------

    CATEGORY_HIGH = {
        "Claim Document": {
            "Primary Insured Name", "Policy Number", "Patient Name",
            "Hospital Name", "Admission Date", "Discharge Date",
            "Claim Amount", "Claim Type"
        },
        "Health Insurance Policy": {
            "Policy Holder Name", "Policy Number", "Insurance Company",
            "Sum Insured", "Policy Start Date", "Policy End Date"
        },
        "Motor Insurance Policy": {
            "Policy Holder Name", "Policy Number", "Registration Number",
            "Vehicle Model", "Policy Start Date", "Policy End Date"
        },
        "Life Insurance Policy": {
            "Policy Holder Name", "Insured Person / Life Assured",
            "Policy Number", "Sum Assured", "Policy Term"
        },
        "IRCTC Ticket": {
            "Passenger Name", "Train Number", "PNR",
            "Date of Journey", "Boarding Station", "Destination Station"
        },
        "Invoice": {
            "Invoice Number", "Vendor Name", "Customer Name",
            "Invoice Date", "Total Amount"
        },
        "KYC / Identity Document": {
            "Name", "DOB", "Document Number"
        },
        "Hospital Bill": {
            "Hospital Name", "Total Amount", "Patient Name"
        },
        "Payment Receipt": {
            "Receipt Number", "Amount Paid", "Date"
        }
    }

    CATEGORY_MEDIUM = {
        "Claim Document": {"TPA Name", "Injury / Illness Type", "Billing Breakdown"},
        "Health Insurance Policy": {"Coverage Type", "TPA Name"},
        "Motor Insurance Policy": {"Engine / Chassis Number", "Coverage Type", "IDV"},
        "Life Insurance Policy": {"Premium Paying Term", "Nominee Name"},
        "IRCTC Ticket": {"Booking Status", "Current Status", "Coach / Seat"},
        "Invoice": {"Tax Amount", "Payment Terms"},
        "KYC / Identity Document": {"Address", "Gender"},
        "Hospital Bill": {"Room Rent", "Consultation Fee"},
        "Payment Receipt": {"Payment Mode", "Reference Number"},
        "PA Insurance Policy": [
    "personal accident", "accidental death", "pa policy",
    "permanent disability", "td", "ttd", "ppd",
    "education grant", "accident cover"
]

    }

    SKIP = {"AI Summary", "Note"}

    # Get relevant keys for category
    high_keys = CATEGORY_HIGH.get(category, set())
    med_keys  = CATEGORY_MEDIUM.get(category, set())

    total_weight = 0
    score = 0

    def is_valid(v):
        if v is None:
            return False
        s = str(v).strip()
        if not s or s.lower() == "not mentioned":
            return False
        if s in {"--", "-", "n/a", "xxx", "nil"}:
            return False
        if len(s) == 1 and not s.isalnum():
            return False
        return True

    # ----------------------
    # SCORING LOOP
    # ----------------------
    for field, value in extracted.items():
        if field in SKIP:
            continue

        # Detect nested dictionaries (Invoice items, Billing Breakdown, etc.)
        if isinstance(value, dict):
            # score nested fields individually
            for k2, v2 in value.items():
                w = 2 if k2 in med_keys else 1
                total_weight += w
                if is_valid(v2):
                    score += w
            continue

        # Determine weight
        if field in high_keys:
            weight = 3
        elif field in med_keys:
            weight = 2
        else:
            weight = 1  # safe fallback for uncommon fields

        total_weight += weight

        if is_valid(value):
            score += weight

    if total_weight == 0:
        return 0.5  # safe fallback

    conf = score / total_weight
    return round(min(max(conf, 0.0), 1.0), 3)


In [29]:
# ================================================================
# CHROMA CLIENT + EMBEDDING LOADER (REQUIRED)
# ================================================================
import chromadb
from chromadb.utils import embedding_functions

# 1) Define where your Chroma DB will be stored
PERSIST_DIR = "chroma_store"

# 2) Choose a good embedding model
# all-MiniLM-L6-v2 = best lightweight CPU embedding model
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"
embed_model = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBED_MODEL_NAME
)

# ---------------------------------------------------------------
# Function: get chroma client (persistent)
# ---------------------------------------------------------------
def get_chroma_client():
    return chromadb.PersistentClient(path=PERSIST_DIR)


# ---------------------------------------------------------------
# Function: embed texts safely
# ---------------------------------------------------------------
def embed_texts(text_list):
    return embed_model(text_list)


# ---------------------------------------------------------------
# Function: Upsert PDF into Chroma
# (this creates document chunks + embeddings)
# ---------------------------------------------------------------
def upsert_pdf_to_chroma(pdf_path, collection_name):
    client = get_chroma_client()

    try:
        col = client.get_collection(collection_name)
    except:
        col = client.create_collection(
            name=collection_name,
            embedding_function=embed_model
        )

    # Extract text pages
    pages = extract_text_auto(pdf_path)

    docs = []
    metas = []
    ids = []

    # Chunk strategy: simple per-page
    for i, page in enumerate(pages):
        if not page.strip():
            continue

        chunk_id = f"{pdf_path}_page_{i}"
        docs.append(page)
        metas.append({"page": i, "source": pdf_path})
        ids.append(chunk_id)

    # Store in vector DB
    col.upsert(
        documents=docs,
        metadatas=metas,
        ids=ids
    )

    print(f"Vector DB updated → Stored {len(docs)} chunks.")
    return col


In [30]:
# Cell 12 - final combined pipeline (clean + consistent)
def process_pdf_hybrid(pdf_path: str, collection_name: str, top_k=5, embed_threshold=0.30):
    
    # 1) Extract raw text
    pages = extract_text_auto(pdf_path)
    full_text = " ".join(pages)
    print(f"Extracted {len(full_text)} characters")

    # 2) Ensure PDF is in Chroma (create if missing)
    client = get_chroma_client()
    try:
        col = client.get_collection(collection_name)
    except Exception:
        col = upsert_pdf_to_chroma(pdf_path, collection_name)

    # 3) Retrieve chunks for fast embedding-based classification
    q = full_text[:2000]  
    top_chunks = retrieve_similar_chunks(q, collection_name, top_k=top_k)

    # 4) CATEGORY detection using keyword scanning
    candidates = embedding_candidate_category(top_chunks)
    print("Embedding candidates:", candidates)

    # 5) Decide category using embeddings OR LLM fallback
    if candidates and candidates[0][1] >= embed_threshold:
        category = candidates[0][0]
        classifier_conf = float(candidates[0][1])
        print(f"Chroma -> category {category} (score {classifier_conf})")
    else:
        snippets = [doc for doc, meta, dist in top_chunks if doc]
        cls = llm_classify_with_context(
            full_text,
            model=CLASSIFIER_MODEL,
            top_k_snippets=snippets
        )
        category = cls.get("category", "Other")
        classifier_conf = float(cls.get("confidence", 0.0))
        print(f"LLM classifier -> category {category} (conf {classifier_conf})")

    # 6) FIELD EXTRACTION using FULL RAG CONTEXT (Option B)
    extracted = extract_fields_via_llm(
        full_text,
        category,
        model=EXTRACTION_MODEL,
        max_chars=6000,
        collection_name=collection_name  # <--- Full RAG added
    )
    print("Extracted keys:", list(extracted.keys())[:25])

    # 7) Confidence scoring
    fields_conf = compute_confidence_from_dict(extracted)
    combined_conf = round((classifier_conf + fields_conf) / 2, 3)

    # 8) Final Output JSON
    result = {
        "document_path": pdf_path,
        "category": category,
        "classification_confidence": float(classifier_conf),
        "fields_confidence": fields_conf,
        "combined_confidence": combined_conf,
        "extracted": extracted
    }

    return result


In [31]:
# Cell 13 - choose file manually or via dialog and run
from tkinter import Tk
from tkinter.filedialog import askopenfilename
Tk().withdraw()

pdf_path = askopenfilename(title="Select PDF file", filetypes=[("PDF files","*.pdf")])
print("Selected:", pdf_path)

collection_name = "insurance_docs"   # vector DB
res = process_pdf_hybrid(pdf_path, collection_name=collection_name, top_k=5, embed_threshold=0.35)

print(json.dumps(res, indent=2, ensure_ascii=False))

with open("extracted_result.json", "w", encoding="utf-8") as f:
    json.dump(res, f, ensure_ascii=False, indent=2)

print("Saved extracted_result.json")


Selected: C:/Users/lenovo/Desktop/rag with chatboat/PA_CERTIFICATE_6929363934.pdf
Extracted 4813 characters
Embedding candidates: []
LLM classifier -> category Health Insurance Policy (conf 0.9)
Extracted keys: ['Policy Holder Name', 'Policy Number', 'Insurance Company', 'TPA Name', 'Sum Insured', 'Coverage Type', 'Policy Start Date', 'Policy End Date', 'UIN / Product Code', 'AI Summary']
{
  "document_path": "C:/Users/lenovo/Desktop/rag with chatboat/PA_CERTIFICATE_6929363934.pdf",
  "category": "Health Insurance Policy",
  "classification_confidence": 0.9,
  "fields_confidence": 0.889,
  "combined_confidence": 0.895,
  "extracted": {
    "Policy Holder Name": "Indian Railway Catering and Tourism Corporation Limited.",
    "Policy Number": "5002004224P101121449000",
    "Insurance Company": "UNITED INDIA INSURANCE COMPANY LIMITED",
    "TPA Name": "Not mentioned",
    "Sum Insured": "10,00,000/- for Death and Permanent Total Disability; upto 7,50,000 for Permanent Partial Disability; 

In [32]:
import chromadb

client = chromadb.PersistentClient(
    path=r"C:\Users\lenovo\Desktop\rag with chatboat\chroma_store"
)

def reset_collection(collection_name: str):
    try:
        col = client.get_collection(collection_name)

        # get all current items
        data = col.get()
        ids = data.get("ids", [])

        if not ids:
            print("[INFO] Collection is already empty.")
            return

        # delete all by IDs
        col.delete(ids=ids)
        print(f"[OK] Deleted {len(ids)} items from collection '{collection_name}'.")

    except Exception as e:
        print(f"[ERROR] Could not reset collection '{collection_name}'. Reason: {e}")

reset_collection("insurance_docs")


[INFO] Collection is already empty.
