In [1]:
!pip install langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB

In [2]:
from jinja2 import Template
from typing import List, Dict
import json
from pathlib import Path
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import OpenAI

In [4]:
# PromptBuilderAgent: Modular Prompt Generator

from typing import List, Dict
from jinja2 import Template

# ----------------------------
# AGENT CLASS
# ----------------------------
class PromptBuilderAgent:
    """Assembles an extraction prompt from static sections plus caller metadata.

    The prompt is made of a system role (derived from metadata filters), an
    objective (derived from the requested fields) and three fixed sections:
    document context, few-shot examples and output constraints.
    """

    # Text substituted for any metadata filter the caller did not supply, so a
    # partially filled filter dict yields a readable prompt instead of a KeyError.
    MISSING_VALUE = "unspecified"

    def __init__(self):
        # The {{ document_content }} marker is left verbatim by build_prompt();
        # it is not rendered here.
        self.context_template = """### DOCUMENT CONTEXT
The content below was extracted from a PDF. It may be machine-readable or OCR-processed.
If OCR was used and accuracy is low, prefer fallback strategies like synonym search, pattern anchoring, or structured table recovery.

[START OF DOCUMENT CONTENT]
{{ document_content }}
[END OF DOCUMENT CONTENT]
"""

        self.few_shot_examples = """### FEW-SHOT EXAMPLES

#### Input Fields
- Commencement Date: The date when the lease becomes active
- Owner Name: The legal name of the landlord

#### Ideal Output
```json
{
  "Commencement Date": {
    "value": "July 1, 2020",
    "reasoning": "Located under section 'Lease Commencement'; clearly a valid date.",
    "confidence": 0.95
  },
  "Owner Name": {
    "value": null,
    "reasoning": "The term 'Owner' was not found. Related terms like 'Lessor' were present but ambiguous.",
    "confidence": 0.4
  }
}
```
"""

        self.output_constraints = """### OUTPUT FORMAT & GUARDRAILS

- Output must be a valid **JSON** object.
- Each key must match a field name exactly.
- Each value must be a nested object with:
  - "value": string or null
  - "reasoning": string explanation of the extraction or failure
  - "confidence": a float between 0 and 1
- Do not hallucinate or fabricate values.
- Only extract information grounded in the document context.
- If OCR issues occur, state so explicitly in the reasoning.
"""

    def generate_system_role(self, filters: Dict[str, str]) -> str:
        """Return the system-role paragraph for the given metadata filters.

        Args:
            filters: Metadata keyed by "Module Name", "Territory",
                "Contract Type", "Document Type", "Owner", "Market",
                "SubMarket", "Local Market" and "Facility Type".

        Returns:
            The role text. Keys absent from ``filters`` are rendered as
            ``MISSING_VALUE``; supplied keys are rendered unchanged.
        """
        def pick(key: str) -> str:
            return filters.get(key, self.MISSING_VALUE)

        return f"""
You are a domain-specific document intelligence system specialized in analyzing legal documents in the **{pick('Module Name')}** module.
You operate primarily in the **{pick('Territory')}** territory, focusing on **{pick('Contract Type')}** documents such as **{pick('Document Type')}**s.
You are assigned to review documents for the owner **{pick('Owner')}**, within the **{pick('Market')}** market, specifically the **{pick('SubMarket')}** submarket in **{pick('Local Market')}**.
These documents pertain to **{pick('Facility Type')}** facilities and can be machine-readable or OCR-scanned.
"""

    def generate_objective(self, fields: List[Dict[str, str]]) -> str:
        """Return the objective section: fixed instructions plus one line per field.

        Args:
            fields: Dicts with a required "name" and an optional "description".

        Returns:
            Newline-joined text; fields are numbered from 1 in input order.

        Raises:
            KeyError: If a field dict has no "name".
        """
        objective = ["For each of the following fields, your job is to:",
                    "1. Check whether the field is mentioned in the document.",
                    "2. If found, extract the value exactly as written.",
                    "3. If not found, return null and explain why (e.g., not mentioned, ambiguous, OCR failure).",
                    "4. Return reasoning and confidence score per field.",
                    ""]
        for idx, field in enumerate(fields, 1):
            desc = field.get("description", "")
            objective.append(f"{idx}. Field: '{field['name']}' - {desc}")
        return "\n".join(objective)

    def build_prompt(self, filters: Dict[str, str], fields: List[Dict[str, str]]) -> str:
        """Return the full prompt: role, objective, then the three fixed sections."""
        role = self.generate_system_role(filters)
        objective = self.generate_objective(fields)
        return f"""
//‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï SYSTEM ‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï
{role}

//‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï OBJECTIVE ‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï
{objective}

{self.context_template}

{self.few_shot_examples}

{self.output_constraints}
"""

In [5]:
# Example usage: render a prompt for a sample Tampa lease and print it.

# Metadata filters as (key, value) pairs; the keys are the ones the agent reads.
filters_example = dict([
    ("Module Name", "Real estate"),
    ("Territory", "South"),
    ("Owner", "Crown"),
    ("Market", "South East"),
    ("Contract Type", "Lease"),
    ("SubMarket", "Florida"),
    ("Document Type", "Lease"),
    ("Local Market", "Tampa"),
    ("Facility Type", "Easement"),
])

# One {"name", "description"} dict per field to extract.
fields_example = [
    {"name": field_name, "description": field_description}
    for field_name, field_description in (
        ("Commencement Date", "The start date of the lease term."),
        ("Site ID", "The internal or regulatory identifier for the facility."),
        ("Owner Name", "Legal name of the property owner or landlord."),
    )
]

agent = PromptBuilderAgent()
prompt_text = agent.build_prompt(filters=filters_example, fields=fields_example)
print(prompt_text)


//‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï SYSTEM ‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï

You are a domain-specific document intelligence system specialized in analyzing legal documents in the **Real estate** module.
You operate primarily in the **South** territory, focusing on **Lease** documents such as **Lease**s.
You are assigned to review documents for the owner **Crown**, within the **South East** market, specifically the **Florida** submarket in **Tampa**.
These documents pertain to **Easement** facilities and can be machine-readable or OCR-scanned.


//‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï OBJECTIVE ‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï‚Äï
For each of the following fields, your job is to:
1. Check whether the field is mentioned in the document.
2. If found, extract the value exactly as written.
3. If not found, return null and explain why (e.g., not mentioned, ambiguous, OCR failure).
4. Return reasoning and confidence score per field.

1. Field: 'Commencement Date' - The start date of the lease term.
2. Field: 'Site ID' - The interna

In [None]:
graph TD
    A[PDF Upload] --> B[OCR Module]
    B --> C[Cleaned Text]
    C --> D["PromptBuilderAgent (LLM)"]
    D --> E["Generated Prompt (Validated)"]
    E --> F["LLM Extractor Agent (e.g., GPT-4)"]
    F --> G["Structured JSON Output with Reasoning & Confidence"]

In [None]:
# Full Document Intelligence Pipeline with OCR Fallback, PDF Support, Audit Logging, and Pydantic Validation

from typing import List, Dict, Optional
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain.chains import LLMChain
import pytesseract
from pdf2image import convert_from_path
import fitz  # PyMuPDF
from PIL import Image
import re
import os
import json
from pydantic import BaseModel, ValidationError, Field
from datetime import datetime

# ----------------------------
# PromptBuilderAgent (LLM-generated prompt)
# ----------------------------
class PromptBuilderAgent:
    """Asks an LLM to write the extraction prompt (meta-prompting).

    The meta-prompt is filled with document metadata and the requested fields,
    then sent through an ``LLMChain``; the completion is the extraction prompt.
    """

    # Meta-prompt text. The closing instruction names the section headings
    # explicitly: validate_prompt_structure() accepts a generated prompt only if
    # it contains "SYSTEM", "OBJECTIVE", "DOCUMENT CONTEXT", "FEW-SHOT" and
    # "OUTPUT FORMAT", and the original text never asked the model for them.
    META_PROMPT = """
You are a prompt generation agent for legal document analysis.

Given the metadata:
- Module: {module_name}
- Document Type: {document_type}
- Facility Type: {facility_type}
- Market/Submarket: {market} / {submarket}
- Local Market: {local_market}
- Owner: {owner}
- OCR Mode: {has_ocr}

And Fields:
{field_string}

Generate a production-grade prompt that includes:
1. Role declaration
2. Step-by-step objective section (per field)
3. Placeholder for document content
4. At least one realistic few-shot example
5. Output JSON schema with value, reasoning, confidence
6. Guardrails to prevent hallucination or guessing
7. OCR handling instructions if text is noisy

Structure the prompt under these exact uppercase section headings, in this order:
SYSTEM, OBJECTIVE, DOCUMENT CONTEXT, FEW-SHOT EXAMPLES, OUTPUT FORMAT
"""

    def __init__(self, model_name: str = "gpt-4", temperature: float = 0.3):
        """Create the LLM, the prompt template and the chain joining them.

        Args:
            model_name: Model identifier passed to ``OpenAI``.
            temperature: Sampling temperature passed to ``OpenAI``.
        """
        # NOTE(review): "gpt-4" is a chat model; confirm the completion-style
        # ``OpenAI`` wrapper accepts it in the installed LangChain version.
        self.llm = OpenAI(model=model_name, temperature=temperature)
        self.prompt_template = PromptTemplate.from_template(self.META_PROMPT)
        self.chain = LLMChain(prompt=self.prompt_template, llm=self.llm)

    def build_prompt(self, filters: Dict[str, str], fields: List[Dict[str, str]], has_ocr: bool = True) -> str:
        """Run the chain and return the generated extraction prompt.

        Args:
            filters: Document metadata; missing keys are sent as "".
            fields: Dicts with a required "name" and optional "description".
            has_ocr: Rendered into the meta-prompt as "Yes" or "No".

        Returns:
            Whatever ``self.chain.run`` returns for the filled meta-prompt.

        Raises:
            KeyError: If a field dict has no "name".
        """
        field_str = "\n".join(f"- {f['name']}: {f.get('description', '')}" for f in fields)
        return self.chain.run(
            module_name=filters.get("Module Name", ""),
            document_type=filters.get("Document Type", ""),
            facility_type=filters.get("Facility Type", ""),
            market=filters.get("Market", ""),
            submarket=filters.get("SubMarket", ""),
            local_market=filters.get("Local Market", ""),
            owner=filters.get("Owner", ""),
            has_ocr="Yes" if has_ocr else "No",
            field_string=field_str
        )

# ----------------------------
# PromptValidator
# ----------------------------
def validate_prompt_structure(prompt: str) -> bool:
    """Report whether ``prompt`` mentions every required section heading.

    The check is a case-sensitive substring test; it stops at the first
    heading that is absent.
    """
    for heading in ("SYSTEM", "OBJECTIVE", "DOCUMENT CONTEXT", "FEW-SHOT", "OUTPUT FORMAT"):
        if heading not in prompt:
            return False
    return True

# ----------------------------
# Pydantic Schema for LLM Output
# ----------------------------
class FieldExtraction(BaseModel):
    """Validated extraction result for one field: value, reasoning and confidence."""
    # Extracted text, or None (the few-shot examples in this notebook use null
    # for a field that was not found).
    value: Optional[str]
    # Free-text explanation accompanying the value.
    reasoning: str
    # Required; constrained to 0 <= confidence <= 1 by the ge/le bounds.
    confidence: float = Field(..., ge=0, le=1)

# pydantic 2.x refuses a field named ``__root__`` when the class is defined and
# provides ``RootModel`` instead; pydantic 1.x only knows ``__root__``. Pick the
# flavour the installed version supports so the class can be defined either way.
try:
    from pydantic import RootModel  # pydantic >= 2
except ImportError:  # pydantic 1.x
    RootModel = None

if RootModel is not None:
    class ExtractionSchema(RootModel[Dict[str, FieldExtraction]]):
        """Whole LLM response: a mapping of field name to ``FieldExtraction``."""
else:
    class ExtractionSchema(BaseModel):
        """Whole LLM response: a mapping of field name to ``FieldExtraction``."""
        __root__: Dict[str, FieldExtraction]

# ----------------------------
# DocumentProcessor
# ----------------------------
class DocumentProcessor:
    """Runs one PDF through text extraction, prompt generation and LLM extraction."""

    # A word is kept only when its Tesseract confidence exceeds this value.
    OCR_MIN_CONFIDENCE = 60

    def __init__(self, prompt_agent: "PromptBuilderAgent", extractor_model: str = "gpt-4"):
        """Store the prompt agent and create the extractor LLM.

        Args:
            prompt_agent: Object whose ``build_prompt(filters, fields, has_ocr=...)``
                returns the extraction prompt.
            extractor_model: Model identifier passed to ``OpenAI``.
        """
        self.prompt_agent = prompt_agent
        self.extractor_llm = OpenAI(model=extractor_model, temperature=0.2)

    def extract_text_from_pdf(self, pdf_path: str) -> Dict[str, object]:
        """Return the PDF text and whether OCR produced it.

        The embedded text layer is tried first. OCR is used when PyMuPDF fails
        or when the text layer is blank; an image-only scan opens without error
        but yields no text, so falling back only on exceptions skips OCR for it.

        Returns:
            ``{"text": str, "is_ocr": bool}``.
        """
        try:
            # The context manager closes the document even if a page fails.
            with fitz.open(pdf_path) as doc:
                text = "\n\n".join(page.get_text() for page in doc)
        except Exception:
            text = ""
        if text.strip():
            return {"text": text, "is_ocr": False}
        return {"text": self._ocr_pdf(pdf_path), "is_ocr": True}

    def _ocr_pdf(self, pdf_path: str) -> str:
        """Rasterise every page and return the confidently recognised words, space-joined."""
        words = []
        for img in convert_from_path(pdf_path):
            ocr_result = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
            for word, conf in zip(ocr_result['text'], ocr_result['conf']):
                # float() accepts int, float and numeric-string confidences;
                # int() would raise on a value such as "95.5".
                if float(conf) > self.OCR_MIN_CONFIDENCE and word.strip():
                    words.append(word)
        return " ".join(words)

    @staticmethod
    def _strip_code_fences(text: str) -> str:
        """Return ``text`` without a surrounding Markdown code fence, if it has one."""
        fenced = re.match(r"^\s*```(?:json)?\s*(.*?)\s*```\s*$", text, flags=re.DOTALL | re.IGNORECASE)
        return fenced.group(1) if fenced else text

    def run_pipeline(self, pdf_path: str, filters: Dict[str, str], fields: List[Dict[str, str]]) -> Dict:
        """Extract ``fields`` from the PDF and return the validated result.

        Returns:
            The validated extraction as a dict, or
            ``{"error": ..., "raw": <raw LLM output>}`` when the output is not
            valid JSON or fails schema validation.

        Raises:
            ValueError: If the generated prompt fails ``validate_prompt_structure``.
        """
        extracted = self.extract_text_from_pdf(pdf_path)
        document_text = extracted["text"]
        is_ocr = extracted["is_ocr"]

        prompt = self.prompt_agent.build_prompt(filters, fields, has_ocr=is_ocr)

        if not validate_prompt_structure(prompt):
            raise ValueError("Generated prompt does not meet required structure standards.")

        final_input = f"""{prompt}\n\n### DOCUMENT CONTENT\n{document_text}\n"""

        raw_output = self.extractor_llm.invoke(final_input)
        self.log_prompt(prompt, raw_output)

        try:
            # Models often wrap JSON in ```json fences; unwrap before parsing.
            parsed_output = json.loads(self._strip_code_fences(raw_output))
            validated = ExtractionSchema.parse_obj(parsed_output)
            return validated.dict()
        except (json.JSONDecodeError, ValidationError):
            # Retry logic or graceful fallback
            return {"error": "Failed to validate LLM output", "raw": raw_output}

    def log_prompt(self, prompt: str, response: str):
        """Write the prompt and the raw response to timestamped files under audit_logs/."""
        # Microseconds keep two runs within the same second from overwriting each other.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        os.makedirs("audit_logs", exist_ok=True)
        with open(f"audit_logs/prompt_{timestamp}.txt", "w", encoding="utf-8") as f:
            f.write(prompt)
        with open(f"audit_logs/response_{timestamp}.json", "w", encoding="utf-8") as f:
            f.write(response)


# Example usage: run the full pipeline on a sample lease and print the result.
if __name__ == "__main__":
    # Metadata filters as (key, value) pairs.
    filters_example = dict([
        ("Module Name", "Real Estate"),
        ("Territory", "South"),
        ("Owner", "Crown"),
        ("Market", "South East"),
        ("Contract Type", "Lease"),
        ("SubMarket", "Florida"),
        ("Document Type", "Lease"),
        ("Local Market", "Tampa"),
        ("Facility Type", "Easement"),
    ])

    # One {"name", "description"} dict per field to extract.
    fields_example = [
        {"name": field_name, "description": field_description}
        for field_name, field_description in (
            ("Commencement Date", "The start date of the lease term."),
            ("Site ID", "Internal or regulatory identifier for the facility."),
            ("Owner Name", "Legal name of the property owner or landlord."),
        )
    ]

    agent = PromptBuilderAgent()
    processor = DocumentProcessor(prompt_agent=agent)
    output = processor.run_pipeline(
        "example_lease.pdf",
        filters_example,
        fields_example,
    )
    print(json.dumps(output, indent=2))


In [None]:
# Streamlit UI for Document Intelligence with Retry Logic

import streamlit as st
from typing import List, Dict
from dynamic_prompt_generator import PromptBuilderAgent, DocumentProcessor
from pydantic import ValidationError
import json
import os

# Set page config
st.set_page_config(page_title="Lease Document Analyzer", layout="wide")

# Title
st.title("üìÑ Lease Document Intelligence System")

# Sidebar inputs: one text box per metadata filter, pre-filled with a default.
st.sidebar.header("Document Metadata Filters")
filters = {
    "Module Name": st.sidebar.text_input("Module Name", value="Real Estate"),
    "Territory": st.sidebar.text_input("Territory", value="South"),
    "Owner": st.sidebar.text_input("Owner", value="Crown"),
    "Market": st.sidebar.text_input("Market", value="South East"),
    "Contract Type": st.sidebar.text_input("Contract Type", value="Lease"),
    "SubMarket": st.sidebar.text_input("SubMarket", value="Florida"),
    "Document Type": st.sidebar.text_input("Document Type", value="Lease"),
    "Local Market": st.sidebar.text_input("Local Market", value="Tampa"),
    "Facility Type": st.sidebar.text_input("Facility Type", value="Easement")
}

# Field input: only fields with a non-empty name are collected.
st.sidebar.header("Fields to Extract")
num_fields = st.sidebar.number_input("Number of fields", min_value=1, max_value=10, value=3)
fields_input = []
for i in range(int(num_fields)):
    with st.sidebar.expander(f"Field {i+1}"):
        name = st.text_input(f"Field Name {i+1}", key=f"fname_{i}")
        desc = st.text_input(f"Description {i+1}", key=f"fdesc_{i}")
        if name:
            fields_input.append({"name": name, "description": desc})

# File upload
st.header("Upload Lease Document (PDF)")
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

# Process button
if uploaded_file and fields_input:
    if st.button("Run Document Analysis"):
        with st.spinner("üîç Processing document and generating prompt..."):
            # The upload's name is chosen by the client. Keep only its final
            # path component so a name like "../../x.pdf" cannot write outside temp/.
            safe_name = os.path.basename(uploaded_file.name.replace("\\", "/")) or "upload.pdf"
            os.makedirs("temp", exist_ok=True)
            pdf_path = os.path.join("temp", safe_name)
            with open(pdf_path, "wb") as f:
                f.write(uploaded_file.read())

            # Initialize engine
            agent = PromptBuilderAgent()
            processor = DocumentProcessor(prompt_agent=agent)

            # Retry: rerun the whole pipeline until a result without "error"
            # comes back or the attempts are used up.
            max_attempts = 2
            output = {}
            for _attempt in range(max_attempts):
                output = processor.run_pipeline(pdf_path, filters, fields_input)
                if "error" not in output:
                    break

            if "error" in output:
                st.error("‚ùå Failed to extract structured data. See below.")
                st.code(json.dumps(output, indent=2))
            else:
                st.success("‚úÖ Extraction Complete")
                st.json(output)
                st.download_button("Download JSON", data=json.dumps(output, indent=2), file_name="output.json")


In [None]:
# Full Document Intelligence Pipeline with Enhanced Retry, Logging, and Robustness

from typing import List, Dict, Optional
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain.chains import LLMChain
import pytesseract
from pdf2image import convert_from_path
import fitz  # PyMuPDF
from PIL import Image
import re
import os
import json
from pydantic import BaseModel, ValidationError, Field
from datetime import datetime

# ----------------------------
# PromptBuilderAgent (LLM-generated prompt)
# ----------------------------
class PromptBuilderAgent:
    """Asks an LLM to write the extraction prompt (meta-prompting).

    The meta-prompt is filled with document metadata and the requested fields,
    then sent through an ``LLMChain``; the completion is the extraction prompt.
    """

    # Meta-prompt text. The closing instruction names the section headings
    # explicitly: validate_prompt_structure() accepts a generated prompt only if
    # it contains "SYSTEM", "OBJECTIVE", "DOCUMENT CONTEXT", "FEW-SHOT" and
    # "OUTPUT FORMAT", and the original text never asked the model for them.
    META_PROMPT = """
You are a prompt generation agent for legal document analysis.

Given the metadata:
- Module: {module_name}
- Document Type: {document_type}
- Facility Type: {facility_type}
- Market/Submarket: {market} / {submarket}
- Local Market: {local_market}
- Owner: {owner}
- OCR Mode: {has_ocr}

And Fields:
{field_string}

Generate a production-grade prompt that includes:
1. Role declaration
2. Step-by-step objective section (per field)
3. Placeholder for document content
4. At least one realistic few-shot example
5. Output JSON schema with value, reasoning, confidence
6. Guardrails to prevent hallucination or guessing
7. OCR handling instructions if text is noisy

Structure the prompt under these exact uppercase section headings, in this order:
SYSTEM, OBJECTIVE, DOCUMENT CONTEXT, FEW-SHOT EXAMPLES, OUTPUT FORMAT
"""

    def __init__(self, model_name: str = "gpt-4", temperature: float = 0.3):
        """Create the LLM, the prompt template and the chain joining them.

        Args:
            model_name: Model identifier passed to ``OpenAI``.
            temperature: Sampling temperature passed to ``OpenAI``.
        """
        # NOTE(review): "gpt-4" is a chat model; confirm the completion-style
        # ``OpenAI`` wrapper accepts it in the installed LangChain version.
        self.llm = OpenAI(model=model_name, temperature=temperature)
        self.prompt_template = PromptTemplate.from_template(self.META_PROMPT)
        self.chain = LLMChain(prompt=self.prompt_template, llm=self.llm)

    def build_prompt(self, filters: Dict[str, str], fields: List[Dict[str, str]], has_ocr: bool = True) -> str:
        """Run the chain and return the generated extraction prompt.

        Args:
            filters: Document metadata; missing keys are sent as "".
            fields: Dicts with a required "name" and optional "description".
            has_ocr: Rendered into the meta-prompt as "Yes" or "No".

        Returns:
            Whatever ``self.chain.run`` returns for the filled meta-prompt.

        Raises:
            KeyError: If a field dict has no "name".
        """
        field_str = "\n".join(f"- {f['name']}: {f.get('description', '')}" for f in fields)
        return self.chain.run(
            module_name=filters.get("Module Name", ""),
            document_type=filters.get("Document Type", ""),
            facility_type=filters.get("Facility Type", ""),
            market=filters.get("Market", ""),
            submarket=filters.get("SubMarket", ""),
            local_market=filters.get("Local Market", ""),
            owner=filters.get("Owner", ""),
            has_ocr="Yes" if has_ocr else "No",
            field_string=field_str
        )

# ----------------------------
# PromptValidator
# ----------------------------
def validate_prompt_structure(prompt: str) -> bool:
    """Report whether ``prompt`` mentions every required section heading.

    The check is a case-sensitive substring test; it stops at the first
    heading that is absent.
    """
    for heading in ("SYSTEM", "OBJECTIVE", "DOCUMENT CONTEXT", "FEW-SHOT", "OUTPUT FORMAT"):
        if heading not in prompt:
            return False
    return True

# ----------------------------
# Pydantic Schema for LLM Output
# ----------------------------
class FieldExtraction(BaseModel):
    """Validated extraction result for one field: value, reasoning and confidence."""
    # Extracted text, or None (the few-shot examples in this notebook use null
    # for a field that was not found).
    value: Optional[str]
    # Free-text explanation accompanying the value.
    reasoning: str
    # Required; constrained to 0 <= confidence <= 1 by the ge/le bounds.
    confidence: float = Field(..., ge=0, le=1)

# pydantic 2.x refuses a field named ``__root__`` when the class is defined and
# provides ``RootModel`` instead; pydantic 1.x only knows ``__root__``. Pick the
# flavour the installed version supports so the class can be defined either way.
try:
    from pydantic import RootModel  # pydantic >= 2
except ImportError:  # pydantic 1.x
    RootModel = None

if RootModel is not None:
    class ExtractionSchema(RootModel[Dict[str, FieldExtraction]]):
        """Whole LLM response: a mapping of field name to ``FieldExtraction``."""
else:
    class ExtractionSchema(BaseModel):
        """Whole LLM response: a mapping of field name to ``FieldExtraction``."""
        __root__: Dict[str, FieldExtraction]

# ----------------------------
# DocumentProcessor with Enhanced Retry + Logging
# ----------------------------
class DocumentProcessor:
    """Runs one PDF through text extraction, prompt generation and LLM extraction, with retries."""

    # A word is kept only when its Tesseract confidence exceeds this value.
    OCR_MIN_CONFIDENCE = 60

    def __init__(self, prompt_agent: "PromptBuilderAgent", extractor_model: str = "gpt-4"):
        """Store the prompt agent and create the extractor LLM.

        Args:
            prompt_agent: Object whose ``build_prompt(filters, fields, has_ocr=...)``
                returns the extraction prompt.
            extractor_model: Model identifier passed to ``OpenAI``.
        """
        self.prompt_agent = prompt_agent
        self.extractor_llm = OpenAI(model=extractor_model, temperature=0.2)

    def extract_text_from_pdf(self, pdf_path: str) -> Dict[str, object]:
        """Return the PDF text and whether OCR produced it.

        The embedded text layer is tried first. OCR is used when PyMuPDF fails
        or when the text layer is blank; an image-only scan opens without error
        but yields no text, so falling back only on exceptions skips OCR for it.

        Returns:
            ``{"text": str, "is_ocr": bool}``.
        """
        try:
            # The context manager closes the document even if a page fails.
            with fitz.open(pdf_path) as doc:
                text = "\n\n".join(page.get_text() for page in doc)
        except Exception:
            text = ""
        if text.strip():
            return {"text": text, "is_ocr": False}
        return {"text": self._ocr_pdf(pdf_path), "is_ocr": True}

    def _ocr_pdf(self, pdf_path: str) -> str:
        """Rasterise every page and return the confidently recognised words, space-joined."""
        words = []
        for img in convert_from_path(pdf_path):
            ocr_result = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
            for word, conf in zip(ocr_result['text'], ocr_result['conf']):
                # float() accepts int, float and numeric-string confidences;
                # int() would raise on a value such as "95.5".
                if float(conf) > self.OCR_MIN_CONFIDENCE and word.strip():
                    words.append(word)
        return " ".join(words)

    @staticmethod
    def _strip_code_fences(text: str) -> str:
        """Return ``text`` without a surrounding Markdown code fence, if it has one."""
        fenced = re.match(r"^\s*```(?:json)?\s*(.*?)\s*```\s*$", text, flags=re.DOTALL | re.IGNORECASE)
        return fenced.group(1) if fenced else text

    def run_pipeline(self, pdf_path: str, filters: Dict[str, str], fields: List[Dict[str, str]]) -> Dict:
        """Extract ``fields`` from the PDF, retrying up to three times.

        Each attempt regenerates the prompt, calls the extractor and validates
        the output. Any exception inside an attempt is recorded and the next
        attempt starts.

        Returns:
            The validated extraction as a dict, or an error dict with keys
            "error", "retries" (one entry per failed attempt) and "raw_output"
            (the most recent raw LLM output, or None if the extractor was never
            reached).
        """
        extracted = self.extract_text_from_pdf(pdf_path)
        document_text = extracted["text"]
        is_ocr = extracted["is_ocr"]

        max_attempts = 3
        retry_logs = []
        # Initialised up front so the failure report needs no locals() probing.
        raw_output = None

        for attempt in range(max_attempts):
            try:
                prompt = self.prompt_agent.build_prompt(filters, fields, has_ocr=is_ocr)
                if not validate_prompt_structure(prompt):
                    raise ValueError("Invalid prompt structure")

                final_input = f"""{prompt}\n\n### DOCUMENT CONTENT\n{document_text}\n"""
                raw_output = self.extractor_llm.invoke(final_input)
                self.log_attempt(prompt, raw_output, attempt)

                # Models often wrap JSON in ```json fences; unwrap before parsing.
                parsed_output = json.loads(self._strip_code_fences(raw_output))
                validated = ExtractionSchema.parse_obj(parsed_output)
                return validated.dict()

            except Exception as e:
                # Deliberately broad: any failure in an attempt triggers a retry.
                retry_logs.append({"attempt": attempt + 1, "error": str(e)})

        return {
            "error": "Failed to validate LLM output after multiple retries",
            "retries": retry_logs,
            "raw_output": raw_output
        }

    def log_attempt(self, prompt: str, response: str, attempt: int):
        """Write one attempt's prompt and raw response to files under audit_logs/.

        Args:
            prompt: Generated extraction prompt.
            response: Raw extractor output.
            attempt: Zero-based attempt index; file names use ``attempt + 1``.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        os.makedirs("audit_logs", exist_ok=True)
        with open(f"audit_logs/prompt_attempt{attempt+1}_{timestamp}.txt", "w", encoding="utf-8") as f:
            f.write(prompt)
        with open(f"audit_logs/response_attempt{attempt+1}_{timestamp}.json", "w", encoding="utf-8") as f:
            f.write(response)


# Example usage: run the retrying pipeline on a sample lease and print the result.
if __name__ == "__main__":
    # Metadata filters as (key, value) pairs.
    filters_example = dict([
        ("Module Name", "Real Estate"),
        ("Territory", "South"),
        ("Owner", "Crown"),
        ("Market", "South East"),
        ("Contract Type", "Lease"),
        ("SubMarket", "Florida"),
        ("Document Type", "Lease"),
        ("Local Market", "Tampa"),
        ("Facility Type", "Easement"),
    ])

    # One {"name", "description"} dict per field to extract.
    fields_example = [
        {"name": field_name, "description": field_description}
        for field_name, field_description in (
            ("Commencement Date", "The start date of the lease term."),
            ("Site ID", "Internal or regulatory identifier for the facility."),
            ("Owner Name", "Legal name of the property owner or landlord."),
        )
    ]

    agent = PromptBuilderAgent()
    processor = DocumentProcessor(prompt_agent=agent)
    output = processor.run_pipeline(
        "example_lease.pdf",
        filters_example,
        fields_example,
    )
    print(json.dumps(output, indent=2))

In [None]:
# Few-Shot Bootstrap Utilities and Streamlit Toggle Integration

import os
import json
import streamlit as st
from typing import List, Dict
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

# ----------------------------
# Sample Few-Shot Bootstrap Examples
# ----------------------------
def bootstrap_fewshot_examples(output_dir="fewshot_examples"):
    """Write four sample few-shot JSON files into ``output_dir``.

    The directory is created if needed; files with the same names are
    overwritten. Each file holds one field's value, reasoning and confidence.
    """
    def sample(value, reasoning, confidence):
        # Payload shape shared by every example.
        return {"value": value, "reasoning": reasoning, "confidence": confidence}

    examples = {
        "lease_commencement_example.json": {
            "Commencement Date": sample(
                "July 1, 2020",
                "Found under section 'Lease Commencement Date'; clearly formatted.",
                0.96,
            )
        },
        "site_id_extraction_success.json": {
            "Site ID": sample(
                "FL-TMP-8234",
                "Located next to label 'Site Identifier' in tabular header.",
                0.92,
            )
        },
        "owner_name_missing_example.json": {
            "Owner Name": sample(
                None,
                "No match for 'Owner'; other legal parties listed but unrelated.",
                0.43,
            )
        },
        "monthly_fee_success.json": {
            "Monthly Fee": sample(
                "$1,500",
                "Matched to line item labeled 'Monthly Payment Obligation'.",
                0.97,
            )
        },
    }

    os.makedirs(output_dir, exist_ok=True)
    for fname in examples:
        target = os.path.join(output_dir, fname)
        with open(target, "w") as handle:
            handle.write(json.dumps(examples[fname], indent=2))

# ----------------------------
# Auto-Update FAISS Index with New Examples
# ----------------------------
def refresh_faiss_index(example_dir="fewshot_examples"):
    """Build a fresh FAISS index over every ``.json`` file in ``example_dir``.

    Each file becomes one ``Document`` whose content is the raw file text and
    whose metadata records the file name.
    """
    embeddings = OpenAIEmbeddings()
    json_names = [entry for entry in os.listdir(example_dir) if entry.endswith(".json")]
    documents = []
    for json_name in json_names:
        with open(os.path.join(example_dir, json_name), "r") as handle:
            raw_text = handle.read()
        documents.append(Document(page_content=raw_text, metadata={"filename": json_name}))
    return FAISS.from_documents(documents, embeddings)

# ----------------------------
# Streamlit Toggle to Preview Injected Few-Shots
# ----------------------------
def preview_few_shot_examples(query: str, k: int = 2):
    """Show the ``k`` few-shot examples most similar to ``query`` in Streamlit.

    The index is rebuilt on every call, then each hit is rendered as a
    numbered heading followed by its JSON content.
    """
    index = refresh_faiss_index()
    results = index.similarity_search(query, k=k)
    st.subheader("üîç Injected Few-Shot Examples")
    for position, hit in enumerate(results, start=1):
        st.markdown(f"**Example {position} ‚Äî {hit.metadata['filename']}**")
        st.code(hit.page_content, language="json")

# Example usage: write the sample files, then preview matches for a sample query.
if __name__ == "__main__":
    bootstrap_fewshot_examples()
    query_string = "Prompt for: Lease fields: Commencement Date, Site ID, Owner Name"
    st.title("üîß Few-Shot Preview Utility")
    preview_few_shot_examples(query_string)


In [None]:
# Launch from a shell with: streamlit run dynamic_prompt_generator.py
# Streamlit Interface for Few-Shot Editing and Retrieval Logging

import os
import json
import streamlit as st
from datetime import datetime
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document

EXAMPLES_DIR = "fewshot_examples"
LOG_FILE = "retrieval_log.json"

# ----------------------------
# UI Toggle: Preview Few-Shot Examples
# ----------------------------
def preview_few_shot_examples(query: str, k: int = 2):
    """Retrieve, log and display the ``k`` examples most similar to ``query``.

    The index is rebuilt on every call. The retrieval is appended to the
    retrieval log before anything is rendered.
    """
    st.subheader("üìå Injected Few-Shot Examples")
    hits = refresh_faiss_index().similarity_search(query, k=k)
    retrieval_log(query, hits)
    for position, hit in enumerate(hits, start=1):
        st.markdown(f"**Example {position} ‚Äî {hit.metadata['filename']}**")
        st.code(hit.page_content, language="json")

# ----------------------------
# Few-Shot Editor Mode
# ----------------------------
def few_shot_editor():
    """Streamlit panel for editing one few-shot example file as JSON.

    Lists the ``.json`` files in ``EXAMPLES_DIR``, loads the selected one into
    an ACE editor and writes it back, re-indented, when the save button is
    pressed and the edited text parses as JSON.
    """
    st.header("üõ†Ô∏è Few-Shot Editor")
    files = sorted(f for f in os.listdir(EXAMPLES_DIR) if f.endswith(".json"))

    # With no files the selectbox returns None, and joining None into a path
    # raises TypeError; stop before building the path.
    if not files:
        st.info("No few-shot examples found yet.")
        return

    selected_file = st.selectbox("Choose a sample to edit", files)
    if not selected_file:
        return
    filepath = os.path.join(EXAMPLES_DIR, selected_file)

    with open(filepath, "r") as f:
        raw_text = f.read()
    try:
        editor_text = json.dumps(json.loads(raw_text), indent=2)
    except json.JSONDecodeError:
        # Show a corrupt file as-is so it can be repaired in the editor.
        st.warning(f"{selected_file} is not valid JSON; showing raw contents.")
        editor_text = raw_text
    edited = st_ace_editor(editor_text, language="json")

    if st.button("üíæ Save Changes"):
        try:
            parsed = json.loads(edited)
            with open(filepath, "w") as f:
                json.dump(parsed, f, indent=2)
            st.success(f"Saved {selected_file}")
        except json.JSONDecodeError:
            st.error("Invalid JSON ‚Äî please fix and try again")

# ----------------------------
# Editor Widget (ACE)
# ----------------------------
from streamlit_ace import st_ace as st_ace_editor

# ----------------------------
# FAISS Index Refresh
# ----------------------------
def refresh_faiss_index():
    """Build a fresh FAISS index over every ``.json`` file in ``EXAMPLES_DIR``.

    Each file becomes one ``Document`` whose content is the raw file text and
    whose metadata records the file name.
    """
    embeddings = OpenAIEmbeddings()
    json_names = [entry for entry in os.listdir(EXAMPLES_DIR) if entry.endswith(".json")]
    docs = []
    for json_name in json_names:
        with open(os.path.join(EXAMPLES_DIR, json_name), "r") as handle:
            raw_text = handle.read()
        docs.append(Document(page_content=raw_text, metadata={"filename": json_name}))
    return FAISS.from_documents(docs, embeddings)

# ----------------------------
# Retrieval Logging
# ----------------------------
def retrieval_log(query: str, results):
    """Append one retrieval record to the JSON log under ``logs/``.

    Args:
        query: The query string that was searched.
        results: Iterable of objects exposing ``metadata["filename"]``.

    The log file holds a JSON list. If the existing file is not valid JSON, or
    holds something other than a list, the log restarts from an empty list
    instead of failing.
    """
    log_entry = {
        "timestamp": datetime.now().isoformat(),
        "query": query,
        "retrieved": [r.metadata["filename"] for r in results]
    }
    os.makedirs("logs", exist_ok=True)
    path = os.path.join("logs", LOG_FILE)
    existing = []
    if os.path.exists(path):
        with open(path, "r") as f:
            try:
                loaded = json.load(f)
            except ValueError:
                # Covers json.JSONDecodeError and undecodable bytes; a bare
                # except here would also swallow KeyboardInterrupt.
                loaded = []
        # Valid JSON that is not a list cannot be appended to.
        if isinstance(loaded, list):
            existing = loaded
    existing.append(log_entry)
    with open(path, "w") as f:
        json.dump(existing, f, indent=2)

# ----------------------------
# Streamlit Launcher
# ----------------------------
def main():
    """Render the two-tab Streamlit app: example preview and example editor."""
    st.set_page_config(layout="wide")
    st.title("üîé Few-Shot Intelligence Tools")

    preview_tab, editor_tab = st.tabs(["Preview", "Editor"])

    with preview_tab:
        query_text = st.text_input("Query (ex: Lease fields: Site ID, Commencement Date)")
        example_count = st.slider("# of Examples", 1, 5, 2)
        # The button is always rendered; retrieval needs a click and a non-empty query.
        retrieve_clicked = st.button("üîç Retrieve")
        if retrieve_clicked and query_text:
            preview_few_shot_examples(f"Prompt for: {query_text}", k=example_count)

    with editor_tab:
        few_shot_editor()

if __name__ == "__main__":
    # Make sure the examples directory exists before any tab lists it.
    os.makedirs(EXAMPLES_DIR, exist_ok=True)
    main()


In [None]:
# retrieval_pipeline.py
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from typing import List, Dict, Any
import faiss
import os
import json

class ChunkIndexer:
    """Turns document pages into chunks and keeps them in an on-disk FAISS index."""

    def __init__(self, index_path="faiss_index"):
        """Remember where the index lives and create the embedding client."""
        self.index_path = index_path
        self.embeddings = OpenAIEmbeddings()

    def chunk_document(self, doc_id: str, pages: List[str]) -> List[Document]:
        """Return one ``Document`` per non-blank page.

        Args:
            doc_id: Identifier stored in each chunk's metadata.
            pages: Page texts in order; page numbers are 1-based positions.

        Returns:
            Chunks whose metadata holds doc_id, chunk_id (``<doc_id>_<page>``),
            page_num and source. Blank pages are skipped but still counted.
        """
        chunks = []
        for i, page in enumerate(pages):
            content = page.strip()
            if content:
                chunks.append(Document(
                    page_content=content,
                    metadata={
                        "doc_id": doc_id,
                        "chunk_id": f"{doc_id}_{i+1}",
                        "page_num": i + 1,
                        # NOTE(review): labels any multi-line page "ocr"; confirm
                        # this heuristic is intended, since digital text has newlines too.
                        "source": "ocr" if "\n" in content else "digital"
                    }
                ))
        return chunks

    def build_or_update_index(self, doc_id: str, pages: List[str]):
        """Add the document's chunks to the index, creating the index if absent.

        Does nothing when every page is blank, because building an index from
        zero documents fails.
        """
        chunks = self.chunk_document(doc_id, pages)
        if not chunks:
            return
        if not os.path.exists(self.index_path):
            vectorstore = FAISS.from_documents(chunks, self.embeddings)
        else:
            # Current langchain-community refuses to unpickle the docstore
            # without this opt-in. The file was written by this class via
            # save_local; do not point index_path at untrusted data.
            vectorstore = FAISS.load_local(
                self.index_path,
                self.embeddings,
                allow_dangerous_deserialization=True,
            )
            vectorstore.add_documents(chunks)
        vectorstore.save_local(self.index_path)


class ChunkRetriever:
    """Looks chunks up in a saved FAISS index by meaning, by keyword, or both."""

    def __init__(self, index_path="faiss_index"):
        """Load the index stored at ``index_path``."""
        self.embeddings = OpenAIEmbeddings()
        # Current langchain-community refuses to unpickle the docstore without
        # this opt-in. Only load index directories this application wrote.
        self.index = FAISS.load_local(
            index_path,
            self.embeddings,
            allow_dangerous_deserialization=True,
        )

    def semantic_retrieve(self, query: str, k: int = 5) -> List[Document]:
        """Return the ``k`` chunks most similar to ``query``."""
        return self.index.similarity_search(query, k=k)

    def keyword_retrieve(self, keyword: str, k: int = 5) -> List[Document]:
        """Return up to ``k`` chunks whose text contains ``keyword``, ignoring case."""
        # Reads the in-memory docstore's private dict; scans every stored chunk.
        needle = keyword.lower()
        all_docs = self.index.docstore._dict.values()
        filtered = [doc for doc in all_docs if needle in doc.page_content.lower()]
        return filtered[:k]

    def hybrid_retrieve(self, field_name: str, description: str, k: int = 5) -> List[Document]:
        """Return semantic hits for ``description``, topped up with keyword hits for ``field_name``.

        Keyword hits are appended after the semantic ones, skipping chunks
        already present, and the combined list is cut to ``k``.
        """
        results = self.semantic_retrieve(description, k=k)
        fallback = self.keyword_retrieve(field_name, k=k)
        # .get() tolerates chunks indexed without a chunk_id instead of raising KeyError.
        ids = {r.metadata.get('chunk_id') for r in results}
        results.extend([doc for doc in fallback if doc.metadata.get('chunk_id') not in ids])
        return results[:k]


class FieldTaskPlanner:
    """Maps requested fields to retrieval tasks."""

    def plan(self, fields: List[Dict[str, str]]) -> List[Dict[str, Any]]:
        """Return one task per field, in input order.

        Every task uses the "hybrid" strategy with ``top_k`` of 5; a field
        without a description uses its name as the description.
        """
        return [
            {
                "name": spec["name"],
                "description": spec.get("description", spec["name"]),
                "strategy": "hybrid",
                "top_k": 5,
            }
            for spec in fields
        ]


class RetrievalExecutor:
    """Retrieves context for each field task and asks the LLM to extract its value."""

    def __init__(self, retriever: ChunkRetriever, llm):
        """Store the chunk retriever and the LLM (any object exposing ``invoke``)."""
        self.retriever = retriever
        self.llm = llm

    def run_task(self, task: Dict[str, Any]) -> Dict[str, Any]:
        """Extract one field.

        Args:
            task: Mapping with "name" and "description"; optional "top_k"
                (defaults to 5).

        Returns:
            The JSON object produced by the LLM, or
            ``{"value": None, "reason": "Invalid LLM output", "confidence": 0.0}``
            when the reply is not a JSON object.
        """
        chunks = self.retriever.hybrid_retrieve(task["name"], task["description"], task.get("top_k", 5))
        context = "\n\n".join([doc.page_content for doc in chunks])
        prompt = f"""
You are an expert lease document reviewer. Extract the field: '{task['name']}' from the content below.
If present, return the exact value with confidence.
If not found, return null and a reason.

---
{context}
---
Respond in JSON: {{ "value": ..., "reason": ..., "confidence": 0.0-1.0 }}
"""
        response = self.llm.invoke(prompt)
        # Completion models return a plain string; chat models return a message
        # object whose text is in `.content`. Accept both.
        raw = getattr(response, "content", response)
        try:
            parsed = json.loads(raw)
        except (TypeError, ValueError):
            # ValueError covers json.JSONDecodeError; TypeError covers non-text replies.
            parsed = None
        if not isinstance(parsed, dict):
            # Valid JSON that is not an object (e.g. a list) breaks the return contract too.
            return {"value": None, "reason": "Invalid LLM output", "confidence": 0.0}
        return parsed

    def run_all(self, field_tasks: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Run every task in order and return the results keyed by field name."""
        return {task["name"]: self.run_task(task) for task in field_tasks}


In [None]:
# structured_chunker.py
import fitz  # PyMuPDF
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
from typing import List, Dict, Union, Tuple
from dataclasses import dataclass, asdict
import os
import uuid

@dataclass
class TypedChunk:
    """One extracted piece of a PDF page, tagged with the kind of content it holds."""
    # Content category; the extractors in this file emit "narrative_text", "table" or "image".
    type: str  # "narrative_text", "table", "image"
    # Page number as assigned by the extractors (they use the page index + 1).
    page: int
    # Bounding box as a 4-tuple; the sort in get_all_chunks orders by bbox[1].
    bbox: Tuple[float, float, float, float]
    # Text for text/OCR chunks, or a list of table rows for table chunks.
    content: Union[str, List]
    # Free-form extras; the extractors store a "chunk_id" entry here.
    metadata: Dict


class StructuredChunker:
    """Splits a PDF into typed chunks: text blocks, tables and per-page OCR text.

    The PyMuPDF document opened in ``__init__`` stays open for the lifetime of
    the instance. Call ``close()`` when done, or use the instance as a context
    manager, so the file handle is released.
    """

    def __init__(self, pdf_path: str, dpi: int = 300):
        """Open ``pdf_path`` and remember the rasterisation ``dpi`` used for OCR."""
        self.pdf_path = pdf_path
        self.doc = fitz.open(pdf_path)
        self.dpi = dpi
        # Strip only a trailing ".pdf" (any letter case). str.replace would
        # also remove ".pdf" from the middle of a name and miss ".PDF".
        file_name = os.path.basename(pdf_path)
        stem, ext = os.path.splitext(file_name)
        self.chunk_id_prefix = stem if ext.lower() == ".pdf" else file_name

    def close(self) -> None:
        """Release the PyMuPDF document; safe to call more than once."""
        if not getattr(self.doc, "is_closed", False):
            self.doc.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()

    def extract_text_blocks(self) -> List[TypedChunk]:
        """Return one "narrative_text" chunk per non-blank PyMuPDF text block."""
        chunks = []
        for page_index, page in enumerate(self.doc):
            # Each block starts with (x0, y0, x1, y1, text, ...); trailing
            # entries are not used here.
            for block in page.get_text("blocks"):
                x0, y0, x1, y1, text, *_ = block
                stripped = text.strip()
                if not stripped:
                    continue
                chunks.append(
                    TypedChunk(
                        type="narrative_text",
                        page=page_index + 1,
                        bbox=(x0, y0, x1, y1),
                        content=stripped,
                        metadata={"chunk_id": f"{self.chunk_id_prefix}_text_{uuid.uuid4().hex[:6]}"},
                    )
                )
        return chunks

    def extract_tables(self) -> List[TypedChunk]:
        """Return one "table" chunk per pdfplumber table that has more than one row.

        Each chunk carries the table's own bounding box so that the position
        sort in ``get_all_chunks`` places it where it appears on the page
        (using the page bbox would put every table at the top of its page).
        Pages on which extraction fails are skipped with a warning.
        """
        import warnings

        tables = []
        with pdfplumber.open(self.pdf_path) as pdf:
            for page_index, page in enumerate(pdf.pages):
                try:
                    for found in page.find_tables():
                        rows = found.extract()
                        if rows and len(rows) > 1:
                            tables.append(
                                TypedChunk(
                                    type="table",
                                    page=page_index + 1,
                                    bbox=tuple(found.bbox),
                                    content=rows,
                                    metadata={"chunk_id": f"{self.chunk_id_prefix}_table_{uuid.uuid4().hex[:6]}"},
                                )
                            )
                except Exception as exc:
                    # Best effort: one unreadable page must not abort the whole
                    # document, but the failure should be visible.
                    warnings.warn(
                        f"Table extraction failed on page {page_index + 1} of {self.pdf_path}: {exc}"
                    )
                    continue
        return tables

    def extract_images_with_ocr(self) -> List[TypedChunk]:
        """Rasterise every page at ``self.dpi`` and return its OCR text as an "image" chunk.

        Pages whose OCR output is blank produce no chunk. The bbox is the
        full rendered image, in pixels.
        """
        chunks = []
        images = convert_from_path(self.pdf_path, dpi=self.dpi)
        for page_index, img in enumerate(images):
            ocr_text = pytesseract.image_to_string(img).strip()
            if not ocr_text:
                continue
            chunks.append(
                TypedChunk(
                    type="image",
                    page=page_index + 1,
                    bbox=(0, 0, img.size[0], img.size[1]),
                    content=ocr_text,
                    metadata={"chunk_id": f"{self.chunk_id_prefix}_img_{uuid.uuid4().hex[:6]}"},
                )
            )
        return chunks

    def get_all_chunks(self) -> List[Dict]:
        """Run all three extractors and return the chunks as plain dicts.

        Chunks are ordered by page number, then by the top edge of their
        bounding box (``bbox[1]``).
        """
        all_chunks = (
            self.extract_text_blocks()
            + self.extract_tables()
            + self.extract_images_with_ocr()
        )

        # Sort by page + vertical position (y0)
        all_chunks.sort(key=lambda c: (c.page, c.bbox[1]))
        return [asdict(chunk) for chunk in all_chunks]


if __name__ == "__main__":
    import json
    import os
    import sys

    # Inside a Jupyter kernel, sys.argv[1:] holds the kernel's own launch
    # arguments (e.g. "-f <connection-file>.json"), so take the first argument
    # that looks like a PDF path and otherwise fall back to the default.
    pdf_args = [arg for arg in sys.argv[1:] if arg.lower().endswith(".pdf")]
    pdf_file = pdf_args[0] if pdf_args else "example.pdf"

    chunker = StructuredChunker(pdf_file)
    try:
        output = chunker.get_all_chunks()
    finally:
        # Release the PyMuPDF document even when extraction fails.
        chunker.doc.close()

    # Derive the output name from the extension-less path. str.replace would
    # leave a path without a lowercase ".pdf" unchanged, and the JSON would
    # then be written over the input file.
    json_path = os.path.splitext(pdf_file)[0] + "_structured.json"

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

    print(f"✅ Structured chunks saved to {json_path}")


In [None]:
# retrieval_pipeline.py with structured chunk ingestion + type metadata
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from typing import List, Dict, Any
import faiss
import os
import json

class ChunkIndexer:
    """Loads structured chunk JSON and stores the chunks in a persisted FAISS index."""

    def __init__(self, index_path="faiss_index"):
        """Remember where the index lives and create the OpenAI embedding client."""
        self.index_path = index_path
        self.embeddings = OpenAIEmbeddings()

    def load_structured_chunks(self, chunk_file: str) -> List[Document]:
        """Read a structured-chunk JSON file and convert each entry to a Document.

        Table chunks are flattened to text with " | " between cells and one
        row per line; empty (null) cells become empty strings and non-string
        cells are stringified. "image" and "narrative_text" chunks keep their
        content as-is; any other type yields an empty ``page_content``.

        Args:
            chunk_file: Path to a JSON list of chunk dicts with "type", "page",
                "content" and an optional "metadata" mapping.

        Returns:
            One Document per chunk, in file order.
        """
        with open(chunk_file, "r", encoding="utf-8") as f:
            chunks = json.load(f)

        docs = []
        for c in chunks:
            chunk_type = c["type"]
            if chunk_type == "table":
                # Extracted tables contain None for empty cells; str.join
                # raises TypeError on anything that is not a string.
                text = "\n".join(
                    " | ".join("" if cell is None else str(cell) for cell in row)
                    for row in c["content"]
                )
            elif chunk_type in ("image", "narrative_text"):
                text = c["content"]
            else:
                text = ""

            # Tolerate a missing or null "metadata" entry in one place.
            extra = c.get("metadata") or {}
            docs.append(
                Document(
                    page_content=text,
                    metadata={
                        "chunk_id": extra.get("chunk_id"),
                        "type": chunk_type,
                        "page": c["page"],
                        **extra,
                    },
                )
            )
        return docs

    def build_index_from_chunks(self, chunk_json_file: str):
        """Add the chunks from ``chunk_json_file`` to the index at ``self.index_path``.

        Creates the index when the path does not exist yet, otherwise appends
        to the saved one. A file with no chunks leaves the index untouched.
        """
        chunks = self.load_structured_chunks(chunk_json_file)
        if not chunks:
            # FAISS.from_documents cannot build an index out of zero documents.
            return

        if os.path.exists(self.index_path):
            # SECURITY: load_local unpickles the stored docstore, and current
            # langchain-community releases raise unless the caller opts in.
            # Only load indexes that this pipeline wrote itself.
            db = FAISS.load_local(
                self.index_path,
                self.embeddings,
                allow_dangerous_deserialization=True,
            )
            db.add_documents(chunks)
        else:
            db = FAISS.from_documents(chunks, self.embeddings)
        db.save_local(self.index_path)


class ChunkRetriever:
    """Serves semantic, keyword and hybrid chunk lookups from a saved FAISS index."""

    def __init__(self, index_path="faiss_index"):
        """Load the FAISS index stored at ``index_path`` using OpenAI embeddings."""
        self.embeddings = OpenAIEmbeddings()
        # SECURITY: load_local unpickles the stored docstore, and current
        # langchain-community releases raise unless the caller opts in.
        # Only load indexes that this pipeline wrote itself.
        self.index = FAISS.load_local(
            index_path,
            self.embeddings,
            allow_dangerous_deserialization=True,
        )

    def semantic_retrieve(self, query: str, k: int = 5) -> List[Document]:
        """Return up to ``k`` chunks ranked by vector similarity to ``query``."""
        return self.index.similarity_search(query, k=k)

    def keyword_retrieve(self, keyword: str, k: int = 5) -> List[Document]:
        """Return up to ``k`` chunks whose text contains ``keyword`` (case-insensitive).

        An empty or whitespace-only keyword returns no chunks; as a substring
        it would otherwise match practically every document in the store.
        """
        if not keyword or not keyword.strip():
            return []
        needle = keyword.lower()  # lower-cased once instead of once per document
        # NOTE(review): `_dict` is a private attribute of the docstore object.
        all_docs = self.index.docstore._dict.values()
        return [doc for doc in all_docs if needle in doc.page_content.lower()][:k]

    def hybrid_retrieve(self, field_name: str, description: str, k: int = 5) -> List[Document]:
        """Merge semantic hits for ``description`` with keyword hits for ``field_name``.

        Semantic hits come first; keyword hits with a ``chunk_id`` not already
        present are appended, and the merged list is cut to ``k``.
        """
        semantic = self.semantic_retrieve(description, k=k)
        keyword = self.keyword_retrieve(field_name, k=k)
        existing_ids = {d.metadata['chunk_id'] for d in semantic}
        extras = [d for d in keyword if d.metadata['chunk_id'] not in existing_ids]
        return (list(semantic) + extras)[:k]


class FieldTaskPlanner:
    """Builds the list of retrieval tasks for a set of field specifications."""

    def plan(self, fields: List[Dict[str, str]]) -> List[Dict[str, Any]]:
        """Return one hybrid-strategy task per field, in input order.

        "description" defaults to the field's "name"; ``top_k`` is always 5.
        """
        tasks = []
        for spec in fields:
            field_name = spec["name"]
            task = {
                "name": field_name,
                "description": spec.get("description", field_name),
                "strategy": "hybrid",
                "top_k": 5,
            }
            tasks.append(task)
        return tasks


class RetrievalExecutor:
    """Retrieves context for each field task and asks the LLM to extract its value."""

    def __init__(self, retriever: ChunkRetriever, llm):
        """Store the chunk retriever and the LLM (any object exposing ``invoke``)."""
        self.retriever = retriever
        self.llm = llm

    @staticmethod
    def _format_chunk(doc) -> str:
        """Render a chunk as its text preceded by a ``[type=…, page=…, source=…]`` header.

        The prompt asks the model to use chunk metadata, so the metadata has
        to be part of the context. Keys a chunk does not carry are omitted,
        and a chunk with none of them is rendered as bare text.
        """
        meta = doc.metadata
        tags = [f"{key}={meta[key]}" for key in ("type", "page", "source") if meta.get(key) is not None]
        header = f"[{', '.join(tags)}]\n" if tags else ""
        return header + doc.page_content

    def run_task(self, task: Dict[str, Any]) -> Dict[str, Any]:
        """Extract one field and attach the chunks the answer was based on.

        Args:
            task: Mapping with "name" and "description"; optional "top_k"
                (defaults to 5).

        Returns:
            The LLM's JSON object plus an "_aligned_chunks" list describing the
            retrieved chunks, or ``{"value": None, "reason": <error text>,
            "confidence": 0.0}`` when the LLM call or JSON parsing fails.
        """
        chunks = self.retriever.hybrid_retrieve(task["name"], task["description"], task.get("top_k", 5))
        context = "\n\n".join([self._format_chunk(doc) for doc in chunks])
        prompt = f"""
You are an expert lease analyst. Extract the field: '{task['name']}' from the context below.
Use chunk metadata (type, page, source) to improve reasoning. If not found, say why.

---\n{context}\n---

Respond in JSON:
{{"value":..., "reason":..., "confidence": 0.0-1.0}}
"""
        try:
            response = self.llm.invoke(prompt)
            # Completion models return a string; chat models return a message
            # object whose text is in `.content`.
            result = json.loads(getattr(response, "content", response))
            # Read metadata defensively: chunks indexed from plain page text
            # carry "page_num" and no "type", and a missing key must not
            # discard an otherwise valid answer.
            result["_aligned_chunks"] = [
                {
                    "chunk_id": d.metadata.get("chunk_id"),
                    "page_num": d.metadata.get("page", d.metadata.get("page_num")),
                    "doc_id": d.metadata.get("doc_id"),
                    "type": d.metadata.get("type"),
                    "text": d.page_content
                } for d in chunks
            ]
            return result
        except Exception as e:
            return {"value": None, "reason": str(e), "confidence": 0.0}

    def run_all(self, field_tasks: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Run every task in order and return the results keyed by field name."""
        return {task["name"]: self.run_task(task) for task in field_tasks}


In [None]:
# retrieval_pipeline.py — with type filtering + confidence scoring hint
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from typing import List, Dict, Any
import faiss
import os
import json

class ChunkIndexer:
    """Loads structured chunk JSON and stores the chunks in a persisted FAISS index."""

    def __init__(self, index_path="faiss_index"):
        """Remember where the index lives and create the OpenAI embedding client."""
        self.index_path = index_path
        self.embeddings = OpenAIEmbeddings()

    def load_structured_chunks(self, chunk_file: str) -> List[Document]:
        """Read a structured-chunk JSON file and convert each entry to a Document.

        Table chunks are flattened to text with " | " between cells and one
        row per line; empty (null) cells become empty strings and non-string
        cells are stringified. "image" and "narrative_text" chunks keep their
        content as-is; any other type yields an empty ``page_content``.

        Args:
            chunk_file: Path to a JSON list of chunk dicts with "type", "page",
                "content" and an optional "metadata" mapping.

        Returns:
            One Document per chunk, in file order.
        """
        with open(chunk_file, "r", encoding="utf-8") as f:
            chunks = json.load(f)

        docs = []
        for c in chunks:
            chunk_type = c["type"]
            if chunk_type == "table":
                # Extracted tables contain None for empty cells; str.join
                # raises TypeError on anything that is not a string.
                text = "\n".join(
                    " | ".join("" if cell is None else str(cell) for cell in row)
                    for row in c["content"]
                )
            elif chunk_type in ("image", "narrative_text"):
                text = c["content"]
            else:
                text = ""

            # Tolerate a missing or null "metadata" entry in one place.
            extra = c.get("metadata") or {}
            docs.append(
                Document(
                    page_content=text,
                    metadata={
                        "chunk_id": extra.get("chunk_id"),
                        "type": chunk_type,
                        "page": c["page"],
                        **extra,
                    },
                )
            )
        return docs

    def build_index_from_chunks(self, chunk_json_file: str):
        """Add the chunks from ``chunk_json_file`` to the index at ``self.index_path``.

        Creates the index when the path does not exist yet, otherwise appends
        to the saved one. A file with no chunks leaves the index untouched.
        """
        chunks = self.load_structured_chunks(chunk_json_file)
        if not chunks:
            # FAISS.from_documents cannot build an index out of zero documents.
            return

        if os.path.exists(self.index_path):
            # SECURITY: load_local unpickles the stored docstore, and current
            # langchain-community releases raise unless the caller opts in.
            # Only load indexes that this pipeline wrote itself.
            db = FAISS.load_local(
                self.index_path,
                self.embeddings,
                allow_dangerous_deserialization=True,
            )
            db.add_documents(chunks)
        else:
            db = FAISS.from_documents(chunks, self.embeddings)
        db.save_local(self.index_path)


class ChunkRetriever:
    """Serves semantic, keyword and hybrid chunk lookups, optionally limited to one chunk type."""

    def __init__(self, index_path="faiss_index"):
        """Load the FAISS index stored at ``index_path`` using OpenAI embeddings."""
        self.embeddings = OpenAIEmbeddings()
        # SECURITY: load_local unpickles the stored docstore, and current
        # langchain-community releases raise unless the caller opts in.
        # Only load indexes that this pipeline wrote itself.
        self.index = FAISS.load_local(
            index_path,
            self.embeddings,
            allow_dangerous_deserialization=True,
        )

    def semantic_retrieve(self, query: str, k: int = 5, type_filter: str = None) -> List[Document]:
        """Return up to ``k`` chunks ranked by vector similarity to ``query``.

        With ``type_filter`` set, only chunks whose ``type`` metadata equals it
        are returned. The filter is handed to the vector store, which applies
        it to a larger candidate pool (``fetch_k``); filtering the top ``k``
        afterwards would often leave fewer than ``k`` chunks, or none.
        """
        if not type_filter:
            return self.index.similarity_search(query, k=k)
        return self.index.similarity_search(
            query,
            k=k,
            filter={"type": type_filter},
            fetch_k=max(20, 4 * k),  # candidates examined before filtering
        )

    def keyword_retrieve(self, keyword: str, k: int = 5, type_filter: str = None) -> List[Document]:
        """Return up to ``k`` chunks whose text contains ``keyword`` (case-insensitive).

        An empty or whitespace-only keyword returns no chunks; as a substring
        it would otherwise match practically every document in the store.
        ``type_filter`` restricts matches to chunks of that ``type``.
        """
        if not keyword or not keyword.strip():
            return []
        needle = keyword.lower()  # lower-cased once instead of once per document
        # NOTE(review): `_dict` is a private attribute of the docstore object.
        docs = [doc for doc in self.index.docstore._dict.values() if needle in doc.page_content.lower()]
        if type_filter:
            docs = [d for d in docs if d.metadata.get("type") == type_filter]
        return docs[:k]

    def hybrid_retrieve(self, field_name: str, description: str, k: int = 5, type_filter: str = None) -> List[Document]:
        """Merge semantic hits for ``description`` with keyword hits for ``field_name``.

        Semantic hits come first; keyword hits with a ``chunk_id`` not already
        present are appended, and the merged list is cut to ``k``.
        """
        semantic = self.semantic_retrieve(description, k=k, type_filter=type_filter)
        keyword = self.keyword_retrieve(field_name, k=k, type_filter=type_filter)
        existing_ids = {d.metadata['chunk_id'] for d in semantic}
        extras = [d for d in keyword if d.metadata['chunk_id'] not in existing_ids]
        return (list(semantic) + extras)[:k]


class FieldTaskPlanner:
    """Builds the list of retrieval tasks for a set of field specifications."""

    def plan(self, fields: List[Dict[str, str]]) -> List[Dict[str, Any]]:
        """Return one hybrid-strategy task per field, in input order.

        "description" defaults to the field's "name", ``top_k`` is always 5 and
        "type_filter" mirrors the field's optional "preferred_type".
        """
        tasks = []
        for spec in fields:
            field_name = spec["name"]
            tasks.append({
                "name": field_name,
                "description": spec.get("description", field_name),
                "strategy": "hybrid",
                "top_k": 5,
                # optional: limit to "table" | "narrative_text" | etc
                "type_filter": spec.get("preferred_type"),
            })
        return tasks


class RetrievalExecutor:
    """Retrieves context for each field task and asks the LLM to extract its value."""

    def __init__(self, retriever: ChunkRetriever, llm):
        """Store the chunk retriever and the LLM (any object exposing ``invoke``)."""
        self.retriever = retriever
        self.llm = llm

    def run_task(self, task: Dict[str, Any]) -> Dict[str, Any]:
        """Extract one field and attach the chunks the answer was based on.

        Args:
            task: Mapping with "name" and "description"; optional "top_k"
                (defaults to 5) and "type_filter".

        Returns:
            The LLM's JSON object plus an "_aligned_chunks" list describing the
            retrieved chunks, or ``{"value": None, "reason": <error text>,
            "confidence": 0.0}`` when the LLM call or JSON parsing fails.
        """
        chunks = self.retriever.hybrid_retrieve(
            field_name=task["name"],
            description=task["description"],
            k=task.get("top_k", 5),
            type_filter=task.get("type_filter")
        )

        context = "\n\n".join([doc.page_content for doc in chunks])
        # .get() rather than ["type"]: a chunk without "type" metadata should
        # show up as None in the prompt instead of raising before the LLM is
        # even called.
        chunk_types = [d.metadata.get("type") for d in chunks]
        prompt = f"""
You are a lease document reviewer. Extract the field: '{task['name']}' from the provided document content.

You are analyzing chunks of type: {chunk_types}.

If you find a valid value, return it with a high confidence (close to 1.0).
If it's ambiguous or fuzzy, lower the confidence.
If missing, return value=null, reason="...", confidence=0.0

---\n{context}\n---
Respond strictly in JSON:
{{"value": ..., "reason": ..., "confidence": 0.0-1.0}}
"""
        try:
            response = self.llm.invoke(prompt)
            # Completion models return a string; chat models return a message
            # object whose text is in `.content`.
            result = json.loads(getattr(response, "content", response))
            # Read metadata defensively so a missing key cannot discard an
            # otherwise valid answer; "page_num" is accepted as a fallback
            # spelling of "page".
            result["_aligned_chunks"] = [
                {
                    "chunk_id": d.metadata.get("chunk_id"),
                    "page_num": d.metadata.get("page", d.metadata.get("page_num")),
                    "doc_id": d.metadata.get("doc_id"),
                    "type": d.metadata.get("type"),
                    "text": d.page_content
                } for d in chunks
            ]
            return result
        except Exception as e:
            return {"value": None, "reason": str(e), "confidence": 0.0}

    def run_all(self, field_tasks: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Run every task in order and return the results keyed by field name."""
        return {task["name"]: self.run_task(task) for task in field_tasks}


In [None]:
# retrieval_pipeline.py — with field normalization + semantic prompt injection
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from typing import List, Dict, Any
import faiss
import os
import json

class ChunkIndexer:
    """Loads structured chunk JSON and stores the chunks in a persisted FAISS index."""

    def __init__(self, index_path="faiss_index"):
        """Remember where the index lives and create the OpenAI embedding client."""
        self.index_path = index_path
        self.embeddings = OpenAIEmbeddings()

    def load_structured_chunks(self, chunk_file: str) -> List[Document]:
        """Read a structured-chunk JSON file and convert each entry to a Document.

        Table chunks are flattened to text with " | " between cells and one
        row per line; empty (null) cells become empty strings and non-string
        cells are stringified. "image" and "narrative_text" chunks keep their
        content as-is; any other type yields an empty ``page_content``.

        Args:
            chunk_file: Path to a JSON list of chunk dicts with "type", "page",
                "content" and an optional "metadata" mapping.

        Returns:
            One Document per chunk, in file order.
        """
        with open(chunk_file, "r", encoding="utf-8") as f:
            chunks = json.load(f)

        docs = []
        for c in chunks:
            chunk_type = c["type"]
            if chunk_type == "table":
                # Extracted tables contain None for empty cells; str.join
                # raises TypeError on anything that is not a string.
                text = "\n".join(
                    " | ".join("" if cell is None else str(cell) for cell in row)
                    for row in c["content"]
                )
            elif chunk_type in ("image", "narrative_text"):
                text = c["content"]
            else:
                text = ""

            # Tolerate a missing or null "metadata" entry in one place.
            extra = c.get("metadata") or {}
            docs.append(
                Document(
                    page_content=text,
                    metadata={
                        "chunk_id": extra.get("chunk_id"),
                        "type": chunk_type,
                        "page": c["page"],
                        **extra,
                    },
                )
            )
        return docs

    def build_index_from_chunks(self, chunk_json_file: str):
        """Add the chunks from ``chunk_json_file`` to the index at ``self.index_path``.

        Creates the index when the path does not exist yet, otherwise appends
        to the saved one. A file with no chunks leaves the index untouched.
        """
        chunks = self.load_structured_chunks(chunk_json_file)
        if not chunks:
            # FAISS.from_documents cannot build an index out of zero documents.
            return

        if os.path.exists(self.index_path):
            # SECURITY: load_local unpickles the stored docstore, and current
            # langchain-community releases raise unless the caller opts in.
            # Only load indexes that this pipeline wrote itself.
            db = FAISS.load_local(
                self.index_path,
                self.embeddings,
                allow_dangerous_deserialization=True,
            )
            db.add_documents(chunks)
        else:
            db = FAISS.from_documents(chunks, self.embeddings)
        db.save_local(self.index_path)


class ChunkRetriever:
    """Serves semantic, keyword and hybrid chunk lookups, optionally limited to one chunk type."""

    def __init__(self, index_path="faiss_index"):
        """Load the FAISS index stored at ``index_path`` using OpenAI embeddings."""
        self.embeddings = OpenAIEmbeddings()
        # SECURITY: load_local unpickles the stored docstore, and current
        # langchain-community releases raise unless the caller opts in.
        # Only load indexes that this pipeline wrote itself.
        self.index = FAISS.load_local(
            index_path,
            self.embeddings,
            allow_dangerous_deserialization=True,
        )

    def semantic_retrieve(self, query: str, k: int = 5, type_filter: str = None) -> List[Document]:
        """Return up to ``k`` chunks ranked by vector similarity to ``query``.

        With ``type_filter`` set, only chunks whose ``type`` metadata equals it
        are returned. The filter is handed to the vector store, which applies
        it to a larger candidate pool (``fetch_k``); filtering the top ``k``
        afterwards would often leave fewer than ``k`` chunks, or none.
        """
        if not type_filter:
            return self.index.similarity_search(query, k=k)
        return self.index.similarity_search(
            query,
            k=k,
            filter={"type": type_filter},
            fetch_k=max(20, 4 * k),  # candidates examined before filtering
        )

    def keyword_retrieve(self, keyword: str, k: int = 5, type_filter: str = None) -> List[Document]:
        """Return up to ``k`` chunks whose text contains ``keyword`` (case-insensitive).

        An empty or whitespace-only keyword returns no chunks; as a substring
        it would otherwise match practically every document in the store.
        ``type_filter`` restricts matches to chunks of that ``type``.
        """
        if not keyword or not keyword.strip():
            return []
        needle = keyword.lower()  # lower-cased once instead of once per document
        # NOTE(review): `_dict` is a private attribute of the docstore object.
        docs = [doc for doc in self.index.docstore._dict.values() if needle in doc.page_content.lower()]
        if type_filter:
            docs = [d for d in docs if d.metadata.get("type") == type_filter]
        return docs[:k]

    def hybrid_retrieve(self, field_name: str, description: str, k: int = 5, type_filter: str = None, hints: List[str] = None) -> List[Document]:
        """Merge semantic hits for ``description`` (plus hints) with keyword hits for ``field_name``.

        The semantic query is the description followed by the hint terms,
        separated by single spaces. Semantic hits come first; keyword hits
        with a ``chunk_id`` not already present are appended, and the merged
        list is cut to ``k``.

        ``hints`` defaults to None rather than a shared mutable list; None and
        an empty list both mean "no hints" and leave the query equal to the
        description, with no trailing space.
        """
        compound_query = " ".join([description, *(hints or [])])
        semantic = self.semantic_retrieve(compound_query, k=k, type_filter=type_filter)
        keyword = self.keyword_retrieve(field_name, k=k, type_filter=type_filter)
        existing_ids = {d.metadata['chunk_id'] for d in semantic}
        extras = [d for d in keyword if d.metadata['chunk_id'] not in existing_ids]
        return (list(semantic) + extras)[:k]


class FieldTaskPlanner:
    """Builds enriched retrieval tasks from field specifications."""

    def plan(self, fields: List[Dict[str, str]]) -> List[Dict[str, Any]]:
        """Return one hybrid-strategy task per field, in input order.

        Defaults applied per field: "description" falls back to "name",
        "top_k" to 5, "importance" to "medium" and "must_have_units" to an
        empty list. "type_filter" mirrors "preferred_type", and "hints" is the
        field's "synonyms" followed by its "hint_keywords".
        """
        return [
            {
                "name": spec["name"],
                "description": spec.get("description", spec["name"]),
                "strategy": "hybrid",
                "top_k": spec.get("top_k", 5),
                "type_filter": spec.get("preferred_type"),
                "importance": spec.get("importance", "medium"),
                "temporal_scope": spec.get("temporal_scope"),
                "must_have_units": spec.get("must_have_units", []),
                "hints": spec.get("synonyms", []) + spec.get("hint_keywords", []),
            }
            for spec in fields
        ]


class RetrievalExecutor:
    """Retrieves context for each field task and asks the LLM to extract its value."""

    def __init__(self, retriever: ChunkRetriever, llm):
        """Store the chunk retriever and the LLM (any object exposing ``invoke``)."""
        self.retriever = retriever
        self.llm = llm

    def run_task(self, task: Dict[str, Any]) -> Dict[str, Any]:
        """Extract one field and attach the chunks the answer was based on.

        Args:
            task: Mapping with "name" and "description"; optional "top_k"
                (defaults to 5), "type_filter", "hints", "must_have_units"
                and "temporal_scope".

        Returns:
            The LLM's JSON object plus an "_aligned_chunks" list describing the
            retrieved chunks, or ``{"value": None, "reason": <error text>,
            "confidence": 0.0}`` when the LLM call or JSON parsing fails.
        """
        hints = task.get("hints") or []
        chunks = self.retriever.hybrid_retrieve(
            field_name=task["name"],
            description=task["description"],
            k=task.get("top_k", 5),
            type_filter=task.get("type_filter"),
            hints=hints
        )

        # Unique chunk types in first-seen order. A set would order them
        # differently from run to run, making the prompt non-deterministic;
        # chunks without a "type" are left out instead of raising.
        chunk_types = list(dict.fromkeys(
            chunk_type
            for chunk_type in (d.metadata.get("type") for d in chunks)
            if chunk_type
        ))
        unit_hint = ", must include units: " + ", ".join(task.get("must_have_units", [])) if task.get("must_have_units") else ""
        scope_hint = f" (temporal scope: {task.get('temporal_scope')})" if task.get("temporal_scope") else ""

        context = "\n\n".join([doc.page_content for doc in chunks])
        prompt = f"""
You are analyzing a lease document. Your goal is to extract the field: '{task['name']}'.

Field Description: {task['description']}{scope_hint}
Chunk types in use: {', '.join(chunk_types)}{unit_hint}
Hints: {', '.join(hints)}

---\n{context}\n---
Respond strictly in JSON format:
{{"value": ..., "reason": ..., "confidence": 0.0-1.0}}
"""
        try:
            response = self.llm.invoke(prompt)
            # Completion models return a string; chat models return a message
            # object whose text is in `.content`.
            result = json.loads(getattr(response, "content", response))
            # Read metadata defensively so a missing key cannot discard an
            # otherwise valid answer; "page_num" is accepted as a fallback
            # spelling of "page".
            result["_aligned_chunks"] = [
                {
                    "chunk_id": d.metadata.get("chunk_id"),
                    "page_num": d.metadata.get("page", d.metadata.get("page_num")),
                    "doc_id": d.metadata.get("doc_id"),
                    "type": d.metadata.get("type"),
                    "text": d.page_content
                } for d in chunks
            ]
            return result
        except Exception as e:
            return {"value": None, "reason": str(e), "confidence": 0.0}

    def run_all(self, field_tasks: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Run every task in order and return the results keyed by field name."""
        return {task["name"]: self.run_task(task) for task in field_tasks}
