In [None]:
from google.colab import drive

import os, sys, pathlib, json, re

# Mount Google Drive first: every project path below lives under MyDrive.
drive.mount("/content/drive", force_remount=True)

# Project layout on Drive.
BASE = "/content/drive/MyDrive/Claim_CoPilot"
SRC, DATA_DIR, OUT_DIR = (
    os.path.join(BASE, sub) for sub in ("src", "data", "outputs")
)

# Create any project folder that is missing (no-op when it already exists).
for folder in (SRC, DATA_DIR, OUT_DIR):
    os.makedirs(folder, exist_ok=True)

# Make the modules under src/ importable from later cells.
if SRC not in sys.path:
    sys.path.append(SRC)

# Echo the resolved layout and what is currently in src/ and data/.
for label, value in (
    ("BASE      :", BASE),
    ("SRC       :", SRC),
    ("DATA_DIR  :", DATA_DIR),
    ("OUT_DIR   :", OUT_DIR),
    ("src files :", os.listdir(SRC)),
    ("data files:", os.listdir(DATA_DIR)),
):
    print(label, value)


Mounted at /content/drive
BASE      : /content/drive/MyDrive/Claim_CoPilot
SRC       : /content/drive/MyDrive/Claim_CoPilot/src
DATA_DIR  : /content/drive/MyDrive/Claim_CoPilot/data
OUT_DIR   : /content/drive/MyDrive/Claim_CoPilot/outputs
src files : ['agents', '__init__.py', 'state.py', 'llm_client.py', 'orchestrator.py', '__pycache__']
data files: ['claims.jsonl']


In [3]:
import os
import getpass  # hides the input while typing in terminal / Colab

# Prompt for the OpenAI key only when the environment does not already provide
# one; getpass hides the value while it is typed.
if not os.environ.get("OPENAI_API_KEY"):
    key = getpass.getpass("Enter your OpenAI API key (input hidden): ").strip()
    if key:
        os.environ["OPENAI_API_KEY"] = key
        print("OpenAI key set? ", "OPENAI_API_KEY" in os.environ)
    else:
        # Blank input: leave the variable unset instead of storing "".
        print("No key entered. OPENAI_API_KEY not set.")
else:
    # The leading info glyph had been stored as mis-decoded bytes; restored.
    print("ℹ OPENAI_API_KEY is already set in the environment.")



Enter your OpenAI API key (input hidden): Â·Â·Â·Â·Â·Â·Â·Â·Â·Â·
OpenAI key set?  True


In [None]:
from orchestrator import Orchestrator
from state import ClaimState

# Location of the JSONL claims file inside the Drive data folder.
claims_path = os.path.join(DATA_DIR, "claims.jsonl")

def load_claims(path: str):
    """
    Read a JSON Lines file and return its records as a list.

    Every line is stripped of surrounding whitespace; lines that end up
    empty are skipped and the rest are parsed with json.loads, in file order.
    """
    with open(path, "r", encoding="utf-8") as handle:
        stripped = (raw.strip() for raw in handle)
        return [json.loads(entry) for entry in stripped if entry]

claims = load_claims(claims_path)
print("Loaded", len(claims), "claims from", claims_path)

# Evaluate the agentic pipeline on a fixed-size random subset rather than the
# whole file; seeding the RNG keeps the subset the same from run to run.
import random

EVAL_N = 300  # subset size; capped at the number of loaded claims below
random.seed(123)
sampled_claims = random.sample(claims, k=min(EVAL_N, len(claims)))

print("Using", len(sampled_claims), "claims for AGENTIC vs BASELINE evaluation.")
print("Example sampled record:\n", sampled_claims[0])


Loaded 10000 claims from /content/drive/MyDrive/Claim_CoPilot/data/claims.jsonl
Using 300 claims for AGENTIC vs BASELINE evaluation.
Example sampled record:
 {'id': 'c00858', 'text': 'Mark Garcia submitted a Health claim on 2024-05-25 after an emergency room visit after chest pain. The estimated cost is $3506.00. The patient required an overnight hospital stay.', 'claimant_name': 'Mark Garcia', 'policy_type': 'Health', 'claim_amount': 3506.0, 'incident_date': '2024-05-25', 'priority': 'Medium', 'gold_summary': 'Mark Garcia filed a medium-priority health claim on 2024-05-25 after an emergency room visit after chest pain, with an estimated cost of $3506.00 and the patient required an overnight hospital stay.'}


In [None]:
def run_agentic(text: str, orchestrator=None):
    """
    Run the full agentic pipeline on one claim and return the evaluated fields.

    Args:
        text: Raw claim text handed to the orchestrator's ``run`` method.
        orchestrator: Optional pre-built object exposing ``run(text)``. When
            omitted, a new ``Orchestrator`` is constructed for this call,
            exactly as before. Passing one instance lets a caller avoid
            rebuilding the pipeline for every claim.
            NOTE(review): reusing an instance assumes ``run`` keeps no state
            between claims -- confirm against orchestrator.py.

    Returns:
        dict with ``claimant_name``, ``policy_type``, ``claim_amount`` and
        ``incident_date`` read from ``state.extracted_fields``, ``priority``
        read from ``state.triage``, plus ``summary`` and the whole ``state``
        object returned by the pipeline.
    """
    orc = Orchestrator() if orchestrator is None else orchestrator
    state = orc.run(text)
    extracted = state.extracted_fields
    return {
        "claimant_name": extracted.get("claimant_name"),
        "policy_type": extracted.get("policy_type"),
        "claim_amount": extracted.get("claim_amount"),
        "incident_date": extracted.get("incident_date"),
        "priority": state.triage.get("priority"),
        "summary": state.summary,
        "state": state,
    }

POLICY_TYPES = ["Health", "Auto", "Property"]

def baseline_extract_and_triage(text: str):
    """
    Regex/keyword baseline that extracts claim fields without agents or an LLM.

    Args:
        text: Raw claim text.

    Returns:
        dict with keys ``claimant_name``, ``policy_type``, ``claim_amount``,
        ``incident_date`` and ``priority``. Extraction fields that cannot be
        found are None; ``priority`` is always "High", "Medium" or "Low".
    """
    t_lower = text.lower()

    # Amount like $1234.56 or $1,234.56. The comma-grouped form is tried first
    # so "$3,250" is read as 3250.0 instead of stopping at "3"; the lookahead
    # rejects malformed groupings such as "$1,2345", which then fall back to
    # the plain-digits form.
    amount = None
    m = re.search(
        r"\$([0-9]{1,3}(?:,[0-9]{3})+(?![0-9])(?:\.[0-9]{2})?|[0-9]+(?:\.[0-9]{2})?)",
        text,
    )
    if m:
        # The pattern admits only digits, commas and one ".dd" suffix, so
        # float() cannot fail once the separators are removed.
        amount = float(m.group(1).replace(",", ""))

    # Date like 2024-08-12
    date = None
    m = re.search(r"\b(20[0-9]{2}-[01][0-9]-[0-3][0-9])\b", text)
    if m:
        date = m.group(1)

    # Policy type: first entry of POLICY_TYPES whose lowercase form occurs
    # anywhere in the text (substring match, so list order breaks ties).
    policy = next((p for p in POLICY_TYPES if p.lower() in t_lower), None)

    # Claimant name: two capitalised words directly before a filing verb.
    name = None
    m = re.search(r"([A-Z][a-z]+ [A-Z][a-z]+)\s+(filed|submitted|presents)", text)
    if m:
        name = m.group(1)

    # Priority heuristic: severity keywords win, then a cost threshold.
    # NOTE(review): the eval.py script written by a later cell checks
    # "surgery" where this checks "injury" -- confirm which list is intended.
    if "fracture" in t_lower or "injury" in t_lower or "hospital" in t_lower:
        priority = "High"
    elif amount is not None and amount >= 3000:
        priority = "Medium"
    else:
        priority = "Low"

    return {
        "claimant_name": name,
        "policy_type": policy,
        "claim_amount": amount,
        "incident_date": date,
        "priority": priority,
    }

# Smoke-test both systems on the first loaded claim.
demo_text = claims[0]["text"]
print("Agentic demo:\n", run_agentic(demo_text))
print("\nBaseline demo:\n", baseline_extract_and_triage(demo_text))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


Agentic demo:
 {'claimant_name': 'John Doe', 'policy_type': 'Health', 'claim_amount': 3250.0, 'incident_date': '2024-06-05', 'priority': 'High', 'summary': 'John Doe filed a Health insurance claim on June 5, 2024, for an amount of $3,250 following a collision. He sustained a minor injury as a result of the incident but did not require surgery. The claim has been designated as high priority, indicating the need for prompt attention. The nature of the claim is health-related, reflecting the medical expenses incurred due to the accident.', 'state': <state.ClaimState object at 0x7a512b13fec0>}

Baseline demo:
 {'claimant_name': 'John Doe', 'policy_type': 'Health', 'claim_amount': 3250.0, 'incident_date': '2024-06-05', 'priority': 'High'}


In [None]:
# Fields scored against the gold record, for both systems.
FIELDS = ["claimant_name", "policy_type", "claim_amount", "incident_date", "priority"]

# Per-system, per-field tp/fp/fn counters that feed precision/recall/F1.
# Every (system, field) pair gets its own fresh counter dict.
metrics = {
    system: {field: {"tp": 0, "fp": 0, "fn": 0} for field in FIELDS}
    for system in ("agentic", "baseline")
}

def update_prf_counts(gold, pred, bucket):
    """
    Fold one (gold, pred) pair into a tp/fp/fn counter dict, in place.

    Scoring is exact-match: nothing is counted when both values are None;
    an equal pair is a true positive; otherwise a non-None prediction adds a
    false positive and a non-None gold value adds a false negative (a wrong
    prediction therefore adds one of each).
    """
    has_gold = gold is not None
    has_pred = pred is not None

    if not (has_gold or has_pred):
        return

    if pred == gold:
        bucket["tp"] += 1
        return

    if has_pred:
        bucket["fp"] += 1
    if has_gold:
        bucket["fn"] += 1

def prf(tp, fp, fn):
    """
    Turn tp/fp/fn counts into a (precision, recall, f1) tuple.

    Any ratio whose denominator is zero is reported as 0.0 instead of raising.
    """
    predicted = tp + fp
    actual = tp + fn

    prec = tp / predicted if predicted != 0 else 0.0
    rec = tp / actual if actual != 0 else 0.0

    if prec + rec == 0:
        return prec, rec, 0.0
    return prec, rec, 2 * prec * rec / (prec + rec)

# Per-claim records (gold vs. both predictions) that the save step writes to
# case_studies.jsonl. Initialised here so the cell runs on a fresh kernel; the
# list was previously never defined anywhere in the notebook.
case_studies = []

for rec in sampled_claims:
    text = rec["text"]
    gold = rec  # the dataset record carries the gold labels next to the text

    ag = run_agentic(text)
    bl = baseline_extract_and_triage(text)

    cs = {
        "id": rec["id"],
        "text": text,
        "gold": {f: gold[f] for f in FIELDS},
        "agentic_pred": {f: ag[f] for f in FIELDS},
        "baseline_pred": {f: bl[f] for f in FIELDS},
    }
    case_studies.append(cs)

    # Score every field exactly once per claim. This loop used to appear twice,
    # which doubled each tp/fp/fn count stored in eval_results.json (the
    # P/R/F1 ratios were unaffected, the raw counts were not).
    for field in FIELDS:
        g = gold[field]
        update_prf_counts(g, ag[field], metrics["agentic"][field])
        update_prf_counts(g, bl[field], metrics["baseline"][field])

print("=== Extraction + Triage: P/R/F1 per field ===")
summary_table = {}
for field in FIELDS:
    # Counter dicts for this field; their keys (tp, fp, fn) are exactly the
    # parameter names of prf, so they can be unpacked straight into it.
    ag_m, bl_m = (metrics[system][field] for system in ("agentic", "baseline"))

    ag_p, ag_r, ag_f1 = prf(**ag_m)
    bl_p, bl_r, bl_f1 = prf(**bl_m)

    # Scores first, raw counters last -- this is the row saved to JSON.
    summary_table[field] = dict(
        agentic_precision=ag_p,
        agentic_recall=ag_r,
        agentic_f1=ag_f1,
        baseline_precision=bl_p,
        baseline_recall=bl_r,
        baseline_f1=bl_f1,
        agentic_counts=ag_m,
        baseline_counts=bl_m,
    )

    print(
        f"{field:15s} | Agentic P/R/F1: {ag_p:.2f}/{ag_r:.2f}/{ag_f1:.2f}  "
        f"| Baseline P/R/F1: {bl_p:.2f}/{bl_r:.2f}/{bl_f1:.2f}"
    )

# Persist results under OUT_DIR (created if it does not exist yet).
os.makedirs(OUT_DIR, exist_ok=True)

# Aggregate scores plus the raw counters, as one indented JSON document.
eval_out_path = os.path.join(OUT_DIR, "eval_results.json")
with open(eval_out_path, "w", encoding="utf-8") as f:
    json.dump({"summary": summary_table, "metrics_raw": metrics}, f, indent=2)

# One JSON object per line, one line per evaluated claim.
cases_out_path = os.path.join(OUT_DIR, "case_studies.jsonl")
with open(cases_out_path, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(cs) + "\n" for cs in case_studies)

print("\nSaved eval results to:", eval_out_path)
print("Saved case studies to :", cases_out_path)


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another 

=== Extraction + Triage: P/R/F1 per field ===
claimant_name   | Agentic P/R/F1: 0.93/0.93/0.93  | Baseline P/R/F1: 1.00/1.00/1.00
policy_type     | Agentic P/R/F1: 1.00/1.00/1.00  | Baseline P/R/F1: 1.00/1.00/1.00
claim_amount    | Agentic P/R/F1: 1.00/1.00/1.00  | Baseline P/R/F1: 1.00/1.00/1.00
incident_date   | Agentic P/R/F1: 1.00/1.00/1.00  | Baseline P/R/F1: 1.00/1.00/1.00
priority        | Agentic P/R/F1: 0.45/0.45/0.45  | Baseline P/R/F1: 0.38/0.38/0.38

Saved eval results to: /content/drive/MyDrive/Claim_CoPilot/outputs/eval_results.json
Saved case studies to : /content/drive/MyDrive/Claim_CoPilot/outputs/case_studies.jsonl


In [None]:
%%writefile eval.py
# eval.py - Evaluation script for ClaimCopilot
import json
import random
import re
import sys
from pathlib import Path
from typing import Dict, Any, List

# --- Locate project root and src folder --------------------------------------
# When executed as a script, the project root is the directory that contains
# this file.
if "__file__" in globals():
    BASE = Path(__file__).resolve().parent
else:
    # Notebook / interactive fallback
    BASE = Path.cwd()

# <BASE>/src is appended to sys.path before the import below; presumably that
# is where orchestrator.py is expected to live, so the order matters.
SRC = BASE / "src"
if str(SRC) not in sys.path:
    sys.path.append(str(SRC))

from orchestrator import Orchestrator  # type: ignore

DATA_DIR = BASE / "data"
OUT_DIR = BASE / "outputs"

# Import-time side effect: both folders are created when they are missing.
DATA_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR.mkdir(parents=True, exist_ok=True)

# JSONL dataset that main() loads.
CLAIMS_PATH = DATA_DIR / "claims.jsonl"

# Fields compared against the gold record for both systems.
FIELDS = ["claimant_name", "policy_type", "claim_amount", "incident_date", "priority"]
# Policy labels the keyword baseline can emit; it checks them in this order.
POLICY_TYPES = ["Health", "Auto", "Property"]


def load_claims(path: Path) -> List[Dict[str, Any]]:
    """
    Parse a JSON Lines file into a list of records.

    Each line is stripped; lines that are empty afterwards are ignored and
    the remaining ones are decoded with json.loads, preserving file order.
    """
    with path.open("r", encoding="utf-8") as handle:
        stripped = (raw.strip() for raw in handle)
        return [json.loads(entry) for entry in stripped if entry]


def run_agentic(text: str, orchestrator=None) -> Dict[str, Any]:
    """
    Run the agentic pipeline on one claim and return the evaluated fields.

    Args:
        text: Raw claim text handed to the orchestrator's ``run`` method.
        orchestrator: Optional pre-built object exposing ``run(text)``. When
            omitted, a new ``Orchestrator`` is constructed for this call,
            exactly as before. Passing one instance lets a caller avoid
            rebuilding the pipeline for every claim.
            NOTE(review): reusing an instance assumes ``run`` keeps no state
            between claims -- confirm against orchestrator.py.

    Returns:
        dict with ``claimant_name``, ``policy_type``, ``claim_amount`` and
        ``incident_date`` read from ``state.extracted_fields``, ``priority``
        read from ``state.triage``, and ``summary``.
    """
    orc = Orchestrator() if orchestrator is None else orchestrator
    state = orc.run(text)
    extracted = state.extracted_fields
    return {
        "claimant_name": extracted.get("claimant_name"),
        "policy_type": extracted.get("policy_type"),
        "claim_amount": extracted.get("claim_amount"),
        "incident_date": extracted.get("incident_date"),
        "priority": state.triage.get("priority"),
        "summary": state.summary,
    }


def baseline_extract_and_triage(text: str) -> Dict[str, Any]:
    """
    Regex/keyword baseline that extracts claim fields without agents or an LLM.

    Args:
        text: Raw claim text.

    Returns:
        dict with keys ``claimant_name``, ``policy_type``, ``claim_amount``,
        ``incident_date`` and ``priority``. Extraction fields that cannot be
        found are None; ``priority`` is always "High", "Medium" or "Low".
    """
    t_lower = text.lower()

    # Amount like $1234.56 or $1,234.56. The comma-grouped form is tried first
    # so "$3,250" is read as 3250.0 instead of stopping at "3"; the lookahead
    # rejects malformed groupings such as "$1,2345", which then fall back to
    # the plain-digits form.
    amount = None
    m = re.search(
        r"\$([0-9]{1,3}(?:,[0-9]{3})+(?![0-9])(?:\.[0-9]{2})?|[0-9]+(?:\.[0-9]{2})?)",
        text,
    )
    if m:
        # The pattern admits only digits, commas and one ".dd" suffix, so
        # float() cannot fail once the separators are removed.
        amount = float(m.group(1).replace(",", ""))

    # Date like 2024-08-12
    incident_date = None
    m = re.search(r"\b(20[0-9]{2}-[01][0-9]-[0-3][0-9])\b", text)
    if m:
        incident_date = m.group(1)

    # Policy type: first entry of POLICY_TYPES whose lowercase form occurs
    # anywhere in the text (substring match, so list order breaks ties).
    policy_type = next((p for p in POLICY_TYPES if p.lower() in t_lower), None)

    # Claimant name: two capitalised words directly before a filing verb.
    claimant_name = None
    m = re.search(r"([A-Z][a-z]+ [A-Z][a-z]+)\s+(submitted|filed|presents)", text)
    if m:
        claimant_name = m.group(1)

    # Priority heuristic: severity keywords win, then a cost threshold.
    # NOTE(review): the notebook cell this script was derived from checks
    # "injury" where this checks "surgery" -- confirm which list is intended.
    if "fracture" in t_lower or "surgery" in t_lower or "hospital" in t_lower:
        priority = "High"
    elif amount is not None and amount >= 3000:
        priority = "Medium"
    else:
        priority = "Low"

    return {
        "claimant_name": claimant_name,
        "policy_type": policy_type,
        "claim_amount": amount,
        "incident_date": incident_date,
        "priority": priority,
    }


def update_prf_counts(gold, pred, bucket: Dict[str, int]) -> None:
    """
    Fold one (gold, pred) pair into a tp/fp/fn counter dict, in place.

    Scoring is exact-match: nothing is counted when both values are None;
    an equal pair is a true positive; otherwise a non-None prediction adds a
    false positive and a non-None gold value adds a false negative.
    """
    has_gold = gold is not None
    has_pred = pred is not None

    if not (has_gold or has_pred):
        return

    if pred == gold:
        bucket["tp"] += 1
        return

    if has_pred:
        bucket["fp"] += 1
    if has_gold:
        bucket["fn"] += 1


def prf(tp: int, fp: int, fn: int):
    """
    Turn tp/fp/fn counts into a (precision, recall, f1) tuple.

    Any ratio whose denominator is zero is reported as 0.0 instead of raising.
    """
    predicted = tp + fp
    actual = tp + fn

    prec = tp / predicted if predicted != 0 else 0.0
    rec = tp / actual if actual != 0 else 0.0

    if prec + rec == 0:
        return prec, rec, 0.0
    return prec, rec, 2 * prec * rec / (prec + rec)


def main(eval_n: int = 300, seed: int = 123):
    """
    Evaluate the agentic pipeline against the regex baseline and save results.

    Loads CLAIMS_PATH, draws a seeded random subset of at most ``eval_n``
    claims, runs both systems on each claim, prints per-field P/R/F1, and
    writes ``eval_results.json`` and ``case_studies.jsonl`` into OUT_DIR.

    Args:
        eval_n: Upper bound on the number of claims evaluated.
        seed: Seed passed to ``random.seed`` before sampling.
    """
    print(f"Loading claims from: {CLAIMS_PATH}")
    claims = load_claims(CLAIMS_PATH)
    print(f"Total claims available: {len(claims)}")

    # Seeded subset, never larger than what is available.
    random.seed(seed)
    subset_size = min(eval_n, len(claims))
    sampled_claims = random.sample(claims, subset_size)
    print(f"Evaluating on a sampled subset of {len(sampled_claims)} claims.\n")

    systems = ("agentic", "baseline")
    metrics = {
        system: {field: {"tp": 0, "fp": 0, "fn": 0} for field in FIELDS}
        for system in systems
    }
    case_studies = []

    for rec in sampled_claims:
        text = rec["text"]
        # The record itself holds the gold labels. Agentic runs first, then
        # the baseline, matching the dict's insertion order.
        preds = {
            "agentic": run_agentic(text),
            "baseline": baseline_extract_and_triage(text),
        }

        case_studies.append(
            {
                "id": rec["id"],
                "text": text,
                "gold": {f: rec[f] for f in FIELDS},
                "agentic_pred": {f: preds["agentic"][f] for f in FIELDS},
                "baseline_pred": {f: preds["baseline"][f] for f in FIELDS},
            }
        )

        for field in FIELDS:
            g_val = rec[field]
            for system in systems:
                update_prf_counts(g_val, preds[system][field], metrics[system][field])

    summary_table = {}
    print("=== Extraction + Triage: Precision / Recall / F1 ===")
    for field in FIELDS:
        # Counter keys (tp, fp, fn) are exactly prf's parameter names.
        ag_m, bl_m = (metrics[system][field] for system in systems)

        ag_p, ag_r, ag_f1 = prf(**ag_m)
        bl_p, bl_r, bl_f1 = prf(**bl_m)

        summary_table[field] = dict(
            agentic_precision=ag_p,
            agentic_recall=ag_r,
            agentic_f1=ag_f1,
            baseline_precision=bl_p,
            baseline_recall=bl_r,
            baseline_f1=bl_f1,
            agentic_counts=ag_m,
            baseline_counts=bl_m,
        )

        print(
            f"{field:15s} | Agentic P/R/F1: {ag_p:.2f}/{ag_r:.2f}/{ag_f1:.2f} "
            f"| Baseline P/R/F1: {bl_p:.2f}/{bl_r:.2f}/{bl_f1:.2f}"
        )

    # Aggregate scores plus the raw counters, as one indented JSON document.
    eval_out_path = OUT_DIR / "eval_results.json"
    with eval_out_path.open("w", encoding="utf-8") as f:
        json.dump({"summary": summary_table, "metrics_raw": metrics}, f, indent=2)

    # One JSON object per line, one line per evaluated claim.
    cases_out_path = OUT_DIR / "case_studies.jsonl"
    with cases_out_path.open("w", encoding="utf-8") as f:
        f.writelines(json.dumps(cs) + "\n" for cs in case_studies)

    print("\nSaved eval results to:", eval_out_path.resolve())
    print("Saved case studies to :", cases_out_path.resolve())


if __name__ == "__main__":
    main()


Writing eval.py
