In [None]:
pip install crewai pdfplumber jinja2 weasyprint

In [68]:
import html
import os
import re

import pdfplumber
from crewai import Agent, Task, Crew
from crewai.process import Process
from jinja2 import Environment, StrictUndefined
from weasyprint import HTML

In [111]:
from pydantic import RootModel
from typing import Dict

class PlaceholderMapping(RootModel[Dict[str, str]]):
    """Root model whose payload is a flat ``str -> str`` dictionary.

    In this notebook it is passed as the ``output_json`` schema of the
    schema-inference task; the example there maps a placeholder label such
    as ``"Member Name"`` to a dotted path such as ``"member.full_name"``.
    """

In [113]:
# Prefer a key that is already exported in the environment so a real secret
# never has to be typed into the notebook. The literal is only a placeholder
# fallback and must not be replaced with an actual key.
openai_api_key = os.environ.get("OPENAI_API_KEY") or "<API_KEY>"

# Ensure the variable is present in the process environment.
os.environ["OPENAI_API_KEY"] = openai_api_key

# Upper-case alias holding the value now stored in the environment.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [115]:
# Template PDF that is read and the PDF that is written (relative paths).
INPUT_PDF, OUTPUT_PDF = "sample_input_template.pdf", "final_filled_output.pdf"

# Example data used to fill the template; the top-level keys are the
# namespaces that dotted paths such as "member.full_name" resolve against.
SAMPLE_JSON = dict(
    member=dict(
        full_name="Krishna Kumar S",
        address="Chennai, India",
    ),
    plan=dict(
        type="Gold PPO",
        deductible="$1,500",
        out_of_pocket_max="$6,000",
    ),
    federal=dict(contract_id="FED-2026-XY99"),
)


# =========================
# STEP 1: PDF TEXT EXTRACTION
# =========================

def extract_pdf_text(path: str) -> str:
    """Return the text of every page of the PDF at ``path``, newline-joined.

    A page for which ``extract_text()`` yields a falsy value (``None`` or an
    empty string) contributes an empty string, so the number of joined
    segments always equals the number of pages.
    """
    with pdfplumber.open(path) as document:
        page_texts = [page.extract_text() or "" for page in document.pages]
    return "\n".join(page_texts)



In [117]:
# Agent that finds the [bracketed] placeholders in the document.
placeholder_agent = Agent(
    verbose=True,
    role="Placeholder Extraction Agent",
    backstory="Expert in parsing structured documents",
    goal="Extract all unique placeholders enclosed in square brackets",
)

# Agent that assigns each placeholder a dotted JSON path.
schema_agent = Agent(
    verbose=True,
    role="Schema Inference Agent",
    backstory="Expert in insurance data modeling",
    goal=(
        "Infer canonical JSON paths for placeholders "
        "found in insurance documents. "
        "Use domains like member, plan, federal."
    ),
)


# =========================
# STEP 3: CREWAI TASKS
# =========================

# Task 1: list the placeholders. The {document_text} token is filled in by
# crew.kickoff(inputs={"document_text": ...}); without it the agent is asked
# to read "the document text" but is never actually given any.
extract_placeholders_task = Task(
    description="""
    From the document text below:
    - Extract all unique placeholders enclosed in square brackets
    - Return ONLY a JSON array of placeholder strings

    Document text:
    {document_text}
    """,
    expected_output='["Member Name", "Plan Type"]',
    agent=placeholder_agent
)

# Task 2: map each extracted placeholder to a dotted JSON path. It has no
# {document_text} token of its own and is listed after task 1 in the crew.
infer_schema_task = Task(
    description=(
        "Given the extracted placeholders:\n"
        "- Infer canonical JSON paths\n"
        "- Return ONLY valid JSON\n"
        "- No explanations, no markdown, no prose\n"
        "- DO NOT rewrite document text"
    ),
    expected_output="""
    {
      "Member Name": "member.full_name",
      "Plan Type": "plan.type"
    }
    """,
    agent=schema_agent,
    output_json=PlaceholderMapping   # must be a Pydantic BaseModel subclass
)


In [139]:
import json

def _mapping_from_task_output(task_out):
    """Pull the placeholder mapping out of a task output object.

    Sources are tried from most to least structured: a ``json_dict``
    attribute, then the ``root`` of a ``pydantic`` attribute, then the
    ``raw`` string parsed as JSON.
    """
    # Per the CrewAI docs, a task declared with output_json exposes the
    # parsed result as a plain dict on json_dict (its .json is a string).
    structured = getattr(task_out, "json_dict", None)
    if isinstance(structured, dict):
        return structured

    # A parsed RootModel instance keeps its payload on .root.
    parsed_root = getattr(getattr(task_out, "pydantic", None), "root", None)
    if isinstance(parsed_root, dict):
        return parsed_root

    # Fallback: parse the raw answer. The model may wrap the JSON in a
    # markdown code fence despite the instructions, so unwrap it first.
    raw_text = (getattr(task_out, "raw", None) or "").strip()
    fenced = re.fullmatch(
        r"```(?:json)?\s*(.*?)\s*```", raw_text, flags=re.DOTALL | re.IGNORECASE
    )
    if fenced:
        raw_text = fenced.group(1)
    return json.loads(raw_text)


def infer_placeholder_mapping(document_text: str) -> dict:
    """Run the two-task crew and return the placeholder -> JSON-path mapping.

    Args:
        document_text: Text of the template document; passed to the crew as
            the ``document_text`` input.

    Returns:
        The mapping produced by the last task (the schema-inference task).

    Raises:
        ValueError: If the last task's output is not a JSON object. This
            includes ``json.JSONDecodeError`` (a ``ValueError`` subclass)
            when the raw answer is not valid JSON.
    """
    crew = Crew(
        agents=[placeholder_agent, schema_agent],
        tasks=[extract_placeholders_task, infer_schema_task],
        process=Process.sequential,
        verbose=True
    )

    crew_result = crew.kickoff(inputs={"document_text": document_text})

    # The schema-inference task is last in the task list above.
    task_out = crew_result.tasks_output[-1]

    mapping = _mapping_from_task_output(task_out)

    if not isinstance(mapping, dict):
        raise ValueError("Schema agent did not return valid placeholder mapping")

    return mapping

In [141]:
# A dotted path of plain ASCII identifiers, e.g. "member.full_name".
# Segments may not start with an underscore, which keeps private/dunder
# attribute access out of the generated template expressions.
_JSON_PATH_RE = re.compile(r"[A-Za-z][A-Za-z0-9_]*(?:\.[A-Za-z][A-Za-z0-9_]*)*")


def apply_jinja_substitution(text: str, mapping: dict) -> str:
    """Rewrite ``[Placeholder]`` markers in ``text`` as Jinja expressions.

    Each ``[...]`` span (non-greedy, single line) is looked up in ``mapping``
    by its whitespace-stripped content and replaced with ``{{ <path> }}``.

    A marker is left untouched when it has no mapping entry, or when the
    mapped value is not a string holding a plain dotted identifier path.
    The mapping comes from an LLM, so values are never spliced into a
    template expression unchecked.

    Args:
        text: Document text containing square-bracket placeholders.
        mapping: Placeholder label -> dotted JSON path.

    Returns:
        The text with every safely mappable placeholder converted.
    """
    def replacer(match):
        key = match.group(1).strip()
        path = mapping.get(key)
        if not isinstance(path, str):
            # Unmapped placeholder or non-string value: keep original text.
            return match.group(0)
        path = path.strip()
        if not _JSON_PATH_RE.fullmatch(path):
            # Anything other than a dotted path would inject template code.
            return match.group(0)
        return "{{ " + path + " }}"

    return re.sub(r"\[(.*?)\]", replacer, text)

In [143]:
def render_pdf(jinja_text: str, data: dict, output_path: str):
    """Render a Jinja template with ``data`` and write the result as a PDF.

    Args:
        jinja_text: Template source containing ``{{ ... }}`` expressions.
        data: Values made available to the template; each top-level key
            becomes a template variable.
        output_path: Destination path of the generated PDF.

    Raises:
        jinja2.UndefinedError: If the template references a variable that is
            missing from ``data`` (the environment uses ``StrictUndefined``).
    """
    env = Environment(undefined=StrictUndefined)
    rendered_text = env.from_string(jinja_text).render(**data)

    # The rendered text is plain text, not markup. Escape it so characters
    # such as "<" and "&" are printed literally instead of being parsed as
    # HTML, which would drop or mangle content.
    page_markup = f"<pre>{html.escape(rendered_text, quote=False)}</pre>"
    HTML(string=page_markup).write_pdf(output_path)

In [145]:
def run_pipeline(input_pdf=None, output_pdf=None, data=None):
    """Run the pipeline: extract text, infer mapping, template, render PDF.

    Args:
        input_pdf: Path of the template PDF. Defaults to ``INPUT_PDF``.
        output_pdf: Path of the PDF to write. Defaults to ``OUTPUT_PDF``.
        data: Values used to fill the template. Defaults to ``SAMPLE_JSON``.

    Calling it with no arguments behaves as before and uses the notebook's
    configuration constants.
    """
    # Resolve defaults at call time so edits to the configuration cell take
    # effect without re-running this definition.
    input_pdf = INPUT_PDF if input_pdf is None else input_pdf
    output_pdf = OUTPUT_PDF if output_pdf is None else output_pdf
    data = SAMPLE_JSON if data is None else data

    print("ðŸ”¹ Extracting PDF text...")
    raw_text = extract_pdf_text(input_pdf)

    print("ðŸ”¹ Inferring placeholder mapping via CrewAI...")
    mapping = infer_placeholder_mapping(raw_text)
    print("ðŸ”¹ Inferred Mapping:", mapping)

    print("ðŸ”¹ Applying Jinja substitution...")
    jinja_template = apply_jinja_substitution(raw_text, mapping)

    print("ðŸ”¹ Rendering final PDF...")
    render_pdf(jinja_template, data, output_pdf)

    print(f"âœ… Final PDF generated: {output_pdf}")

In [147]:
# Execute the whole pipeline: extract text, infer mapping, substitute, render.
run_pipeline()

ðŸ”¹ Extracting PDF text...
ðŸ”¹ Inferring placeholder mapping via CrewAI...


Output()

Output()

Output()

Output()

âœ… Final PDF generated: final_filled_output.pdf
