In [8]:
import pdfplumber
import pandas as pd
import os

In [4]:
def extract_full_text(pdf_path: str) -> str:
    """Return the text of all pages of a PDF as a single string.

    The file is opened with ``pdfplumber.open`` and each page is read with
    ``page.extract_text()``. A page whose extraction yields ``None`` or only
    whitespace contributes nothing. The remaining page texts are stripped
    and joined with a single newline.

    Args:
        pdf_path: Path passed unchanged to ``pdfplumber.open``.

    Returns:
        The joined text, or ``""`` when no page produced any text.
    """
    with pdfplumber.open(pdf_path) as document:
        # Extraction happens while the document is still open.
        page_texts = [(pg.extract_text() or "").strip() for pg in document.pages]
    return "\n".join(chunk for chunk in page_texts if chunk).strip()


def normalize_lines(text: str):
    """Split ``text`` on newline characters into stripped, non-blank lines.

    Each piece has its surrounding whitespace removed; pieces that are empty
    after stripping are dropped. Returns a list of strings in original order.
    """
    cleaned = []
    for raw_line in text.split("\n"):
        stripped = raw_line.strip()
        if stripped:
            cleaned.append(stripped)
    return cleaned


def extract_company(lines):
    """Return the first entry of ``lines``, or ``""`` when ``lines`` is empty.

    In the v4 letter format the company name is taken to be the first
    non-empty line of the document.
    """
    if not lines:
        return ""
    return lines[0]


def extract_letter_type(lines):
    """Return the letter type taken from the first "Subject: <type>" line.

    A line qualifies when it starts with "subject:" in any letter case. The
    result is everything after the first colon on that line, stripped of
    surrounding whitespace. Returns ``""`` when no line qualifies.
    """
    subject_line = next(
        (entry for entry in lines if entry.lower().startswith("subject:")),
        None,
    )
    if subject_line is None:
        return ""
    # Only the first colon separates the label from the value, so colons
    # inside the subject text itself are preserved.
    _label, _sep, remainder = subject_line.partition(":")
    return remainder.strip()


def extract_insured(lines):
    """Return the addressee from the first "Dear <Name>," salutation line.

    A line qualifies when it starts with "dear " in any letter case. The
    closing punctuation of the salutation (trailing "," or ":") and any
    whitespace around it are removed, so "Dear Jane Doe,", "Dear Jane Doe:"
    and "Dear Jane Doe ," all yield "Jane Doe".

    Args:
        lines: Sequence of line strings to search, in document order.

    Returns:
        The addressee name, or ``""`` when no salutation line is found.
    """
    for ln in lines:
        if ln.lower().startswith("dear "):
            # Skip the 5-character "Dear " prefix, drop the closing
            # punctuation, then strip again in case whitespace preceded it.
            return ln[5:].strip().rstrip(",:").strip()
    return ""


def _find_tagged_id(tokens, lower, keyword, prefix):
    """Return the first token starting with ``prefix`` that directly follows
    ``keyword`` or the word "number", on a line that mentions ``keyword``.

    ``tokens`` and ``lower`` are parallel lists (original and lower-cased
    words). Returns ``""`` when the line has no such token.
    """
    if keyword not in lower:
        return ""
    # Pair every word with the token that comes right after it.
    for prev_word, candidate in zip(lower, tokens[1:]):
        if prev_word in (keyword, "number") and candidate.upper().startswith(prefix):
            return candidate
    return ""


def extract_policy_claim(lines):
    """Find the policy number ("P-...") and claim number ("C-...") in a letter.

    Each line is split into whitespace-separated tokens after removing all
    commas and periods; colons, semicolons, "#", and round/square brackets
    are additionally trimmed from token edges. On a line containing the word
    "policy" (resp. "claim"), the identifier is the first token starting
    with "P-" (resp. "C-", case-insensitive) that immediately follows the
    keyword or the word "number". Every occurrence on the line is checked,
    so forms such as "policy P-1", "Policy Number: P-1", "claim number C-2"
    and "claim C-2 under policy number P-1" are all recognised.

    Args:
        lines: Sequence of line strings, searched in order.

    Returns:
        ``(policy, claim)``; each is the first identifier found, or ``""``
        when none was found. Scanning stops once both are known.
    """
    edge_punct = ":;#()[]"
    policy, claim = "", ""
    for ln in lines:
        words = ln.replace(",", "").replace(".", "").split()
        # Trim edge punctuation and drop tokens that were punctuation only,
        # so "Number:" matches "number" and "Policy : P-1" stays adjacent.
        tokens = [tok for tok in (w.strip(edge_punct) for w in words) if tok]
        lower = [t.lower() for t in tokens]

        if not policy:
            policy = _find_tagged_id(tokens, lower, "policy", "P-")
        if not claim:
            claim = _find_tagged_id(tokens, lower, "claim", "C-")

        if policy and claim:
            break

    return policy, claim


In [9]:
# Input folder and output file for this run.
PDF_DIR = "letters"   # <-- folder with PDFs
CSV_OUTPUT = "insurance_letters_structured.csv"

# Directory entries whose name ends in ".pdf" (any letter case), sorted by name.
pdf_files = sorted(
    entry for entry in os.listdir(PDF_DIR)
    if entry.lower().endswith(".pdf")
)
print("Found PDFs:", len(pdf_files))

# One dict per letter; insertion order of the keys sets the column order.
records = []
for fname in pdf_files:
    path = os.path.join(PDF_DIR, fname)
    raw_text = extract_full_text(path)
    lines = normalize_lines(raw_text)
    policy_number, claim_number = extract_policy_claim(lines)

    record = {"filename": fname}
    record["company_name"] = extract_company(lines)
    record["letter_type"] = extract_letter_type(lines)
    record["insured_name"] = extract_insured(lines)
    record["policy_number"] = policy_number
    record["claim_number"] = claim_number
    record["raw_text"] = raw_text
    records.append(record)

df = pd.DataFrame(records)
print("Parsed letters:", len(df))
df.head()


Found PDFs: 500
Parsed letters: 500


Unnamed: 0,filename,company_name,letter_type,insured_name,policy_number,claim_number,raw_text
0,insurance_letter_0001.pdf,Acme Insurance Co.,Request for Additional Information,Sophia Garcia,P-2681365,C-5934663,Acme Insurance Co.\n123 Liberty Avenue\nNewark...
1,insurance_letter_0002.pdf,Liberty Shield Insurance,Denial Letter,Aarav Anderson,P-4955706,C-1439566,Liberty Shield Insurance\n123 Liberty Avenue\n...
2,insurance_letter_0003.pdf,Cascade Assurance,Denial Letter,Sophia Brown,P-9752207,C-6722035,"Cascade Assurance\n123 Liberty Avenue\nNewark,..."
3,insurance_letter_0004.pdf,Acme Insurance Co.,Denial Letter,Lucas Khan,P-8258614,C-9967585,Acme Insurance Co.\n123 Liberty Avenue\nNewark...
4,insurance_letter_0005.pdf,Acme Insurance Co.,Coverage Decision,Ananya Wilson,P-3729129,C-1331026,Acme Insurance Co.\n123 Liberty Avenue\nNewark...


In [10]:
# Save the parsed table; index=False keeps the row index out of the file.
df.to_csv(path_or_buf=CSV_OUTPUT, index=False)
print("CSV written to:", CSV_OUTPUT)

CSV written to: insurance_letters_structured.csv
