In [1]:
from google.colab import drive
import os, sys, pathlib

# Mount Google Drive so the project tree under MyDrive is reachable.
drive.mount("/content/drive", force_remount=True)

# Project layout: BASE/src holds the modules, BASE/src/agents the agent classes.
BASE = "/content/drive/MyDrive/Claim_CoPilot"
SRC = os.path.join(BASE, "src")
AGENTS_DIR = os.path.join(SRC, "agents")

# For each package directory: create it if missing, then make sure it has an
# __init__.py (append mode creates the file without truncating an existing one).
for package_dir in (SRC, AGENTS_DIR):
    pathlib.Path(package_dir).mkdir(parents=True, exist_ok=True)
    with open(os.path.join(package_dir, "__init__.py"), "a"):
        pass

# Put src on the import path once, so `import state`, `import agents...` resolve.
if SRC not in sys.path:
    sys.path.append(SRC)

# Sanity report of the resolved locations and what currently lives in them.
for label, value in (
    ("BASE       :", BASE),
    ("SRC        :", SRC),
    ("AGENTS_DIR :", AGENTS_DIR),
    ("SRC in sys.path:", SRC in sys.path),
    ("src contents:", os.listdir(SRC)),
    ("agents contents:", os.listdir(AGENTS_DIR)),
):
    print(label, value)


Mounted at /content/drive
BASE       : /content/drive/MyDrive/Claim_CoPilot
SRC        : /content/drive/MyDrive/Claim_CoPilot/src
AGENTS_DIR : /content/drive/MyDrive/Claim_CoPilot/src/agents
SRC in sys.path: True
src contents: ['agents', '__init__.py', '__pycache__', 'generate_dataset.py', 'state.py', 'llm_client.py', 'orchestrator.py']
agents contents: ['__init__.py', '__pycache__', 'validation.py', 'base.py', 'extraction.py', 'triage.py', 'summarization.py']


In [2]:
import os
import getpass

# ---------------------------------------------------------------------------
# Optional credentials for this session.
#
# Each secret is only requested when its environment variable is unset or
# empty, and is read with getpass so it is never echoed into the cell output.
# ---------------------------------------------------------------------------

# --- OpenAI API key ---
if not os.environ.get("OPENAI_API_KEY"):
    use_openai = input("Do you want to enable OpenAI LLM calls? [y/N]: ").strip().lower()
    if use_openai == "y":
        openai_key = getpass.getpass("Enter your OpenAI API key (input hidden): ").strip()
        if openai_key:
            os.environ["OPENAI_API_KEY"] = openai_key
            print(" OPENAI_API_KEY set for this session.")
        else:
            print("Empty key entered. Continuing without OpenAI.")
    else:
        print("ℹ Continuing without OpenAI (LLMClient will run in offline mode).")
else:
    print("ℹ OPENAI_API_KEY already set in environment.")

# --- Hugging Face token ---
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    use_hf = input("Do you want to set a Hugging Face token (for private models)? [y/N]: ").strip().lower()
    if use_hf == "y":
        hf_token = getpass.getpass("Enter your Hugging Face token (input hidden): ").strip()
        if hf_token:
            os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token
            print(" HUGGINGFACEHUB_API_TOKEN set for this session.")
        else:
            print("⚠ Empty token entered. Continuing without HF token.")
    else:
        print("ℹ Continuing without Hugging Face token (public models only).")
else:
    print("ℹ HUGGINGFACEHUB_API_TOKEN already set in environment.")

# Final status. Use the same truthiness test as the guards above: a variable
# that exists but is empty is NOT usable, so it must not be reported as set.
print("OpenAI key set? ", bool(os.environ.get("OPENAI_API_KEY")))
print("HF token set?  ", bool(os.environ.get("HUGGINGFACEHUB_API_TOKEN")))



Do you want to enable OpenAI LLM calls? [y/N]: n
ℹ Continuing without OpenAI (LLMClient will run in offline mode).
Do you want to set a Hugging Face token (for private models)? [y/N]: n
ℹ Continuing without Hugging Face token (public models only).
OpenAI key set?  False
HF token set?   False


In [3]:
!pip -q install "openai" "transformers"
print(" Installed openai and transformers")


 Installed openai and transformers


In [4]:
%%writefile /content/drive/MyDrive/Claim_CoPilot/src/state.py
import json
import time
from typing import Any, Dict, List, Optional

class ClaimState:
    """
    Shared state for one insurance claim as it moves through the agents.

    Attributes:
        raw_texts: Original text(s) for this claim.
        extracted_fields: Structured information extracted from the text.
        triage: Triage decisions (priority, claim_type, ...).
        summary: Final human-readable summary, or None until one is written.
        issues: Problems flagged by agents (missing fields, contradictions, ...).
        trace: Chronological record of what each agent did.
    """

    # Fields that must be present in extracted_fields for the claim to be complete.
    REQUIRED_FIELDS = ("claimant_name", "policy_type", "claim_amount", "incident_date")

    def __init__(self, raw_texts: List[str]):
        """
        Args:
            raw_texts: The claim's source texts. A bare string is treated as a
                single document and None as "no documents". The sequence is
                copied, so later changes to the caller's list do not leak in.
        """
        if raw_texts is None:
            raw_texts = []
        elif isinstance(raw_texts, str):
            # Guard against ClaimState("text"): without this, raw_texts[0]
            # would be the first *character* rather than the document.
            raw_texts = [raw_texts]

        # Original text(s) for this claim (e.g., combined from multiple docs)
        self.raw_texts: List[str] = list(raw_texts)

        # Structured information extracted from text
        self.extracted_fields: Dict[str, Any] = {}

        # Triage decisions (priority, claim_type, etc.)
        self.triage: Dict[str, Any] = {}

        # Final human-readable summary string
        self.summary: Optional[str] = None

        # Any issues flagged by agents (missing fields, contradictions, etc.)
        self.issues: List[str] = []

        # Trace of what each agent did (for explainability)
        self.trace: List[Dict[str, Any]] = []

    @classmethod
    def from_single_text(cls, text: str) -> "ClaimState":
        """
        Convenience constructor when you just have one big text string.
        """
        return cls([text])

    def add_trace(self, agent: str, action: str, info: Optional[Dict[str, Any]] = None) -> None:
        """
        Record what an agent did, when, and what it produced.

        Args:
            agent: Name of the agent reporting.
            action: Short label for the step performed.
            info: Optional payload describing the result; stored as {} if omitted.
        """
        self.trace.append({
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "agent": agent,
            "action": action,
            "info": info or {}
        })

    def is_complete(self) -> bool:
        """
        Decide if the claim is 'ready':
        - Every field in REQUIRED_FIELDS exists in extracted_fields
        - Summary is not None
        """
        has_all_fields = all(f in self.extracted_fields for f in self.REQUIRED_FIELDS)
        return has_all_fields and self.summary is not None

    def to_json(self) -> str:
        """
        Serialize the entire state to pretty JSON (for saving / debugging).

        Values that are not natively JSON-serializable (anything an agent may
        have put in a trace payload or extracted field) are rendered with
        str() instead of raising TypeError, so a debug dump never crashes.
        """
        return json.dumps({
            "raw_texts": self.raw_texts,
            "extracted_fields": self.extracted_fields,
            "triage": self.triage,
            "summary": self.summary,
            "issues": self.issues,
            "trace": self.trace,
        }, indent=2, default=str)


Overwriting /content/drive/MyDrive/Claim_CoPilot/src/state.py


In [5]:
 %%writefile /content/drive/MyDrive/Claim_CoPilot/src/llm_client.py
import os
from typing import Optional
from openai import OpenAI

class LLMClient:
    """
    Simple wrapper around OpenAI chat completion.
    If OPENAI_API_KEY is not set, it falls back to a debug 'echo' behavior.
    """
    def __init__(self, model: str = "gpt-4o-mini"):
        """
        Args:
            model: Model name passed to the chat completions endpoint.

        Whether a key is available is decided once, here: a key exported
        after construction is not picked up by this instance.
        """
        self.model = model
        # Check if we actually have an OpenAI key
        self._has_openai = bool(os.environ.get("OPENAI_API_KEY"))
        self._client: Optional[OpenAI] = OpenAI() if self._has_openai else None

    def chat(self, prompt: str, system: str = "", temperature: float = 0.2) -> str:
        """
        Send a chat-style prompt to the LLM and return the response text.

        Args:
            prompt: User message content.
            system: Optional system message; omitted from the request if empty.
            temperature: Sampling temperature forwarded to the API.

        Returns:
            The text of the first choice. Always a string: when no client is
            configured a debug placeholder is returned, and when the API
            yields no choices or a choice with no text content, "" is returned.
        """
        if not self._has_openai or self._client is None:
            # Fallback when no key is configured
            return f"[LLM disabled] Would have answered based on: {prompt[:200]!r}"

        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})

        resp = self._client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=temperature,
        )

        # Honour the `-> str` contract: callers slice and concatenate the
        # result, which would fail on None or on an empty choices list.
        if not resp.choices:
            return ""
        content = resp.choices[0].message.content
        return content if content is not None else ""


Overwriting /content/drive/MyDrive/Claim_CoPilot/src/llm_client.py


In [6]:
%%writefile /content/drive/MyDrive/Claim_CoPilot/src/agents/base.py
from abc import ABC, abstractmethod
from typing import Optional
from llm_client import LLMClient
from state import ClaimState

class BaseAgent(ABC):
    """
    Common parent of every ClaimCopilot agent.

    An agent carries a display `name`, holds an LLMClient in `self.llm`,
    and exposes a single entry point, run(state), that subclasses must
    implement to update the given ClaimState in place.
    """
    name: str = "BaseAgent"

    def __init__(self, llm: Optional[LLMClient] = None):
        # Fall back to a freshly constructed default client when none is given.
        if not llm:
            llm = LLMClient()
        self.llm = llm

    @abstractmethod
    def run(self, state: ClaimState) -> None:
        """
        Apply this agent's step to `state`.

        Implementations are expected to write their results onto the state
        and record a trace entry describing what they did.
        """
        ...


Overwriting /content/drive/MyDrive/Claim_CoPilot/src/agents/base.py


In [7]:
%%writefile /content/drive/MyDrive/Claim_CoPilot/src/agents/extraction.py
import re
from typing import List, Tuple, Optional, Dict, Any

from transformers import pipeline, Pipeline
from .base import BaseAgent
from state import ClaimState

# ---------------- Date & Amount Helpers ----------------

# Lower-cased English month name -> month number (1 = January ... 12 = December).
MONTH_MAP = {
    month_name: month_number
    for month_number, month_name in enumerate(
        (
            "january",
            "february",
            "march",
            "april",
            "may",
            "june",
            "july",
            "august",
            "september",
            "october",
            "november",
            "december",
        ),
        start=1,
    )
}


def extract_incident_date(text: str) -> Optional[str]:
    """
    Return incident date as 'YYYY-MM-DD' if we can find one, else None.

    Supports:
      - ISO dates: 2024-06-05
      - Natural dates: November 22, 2024  ->  2024-11-22
        (month name matched case-insensitively)

    ISO-style dates take precedence over long-form dates. Within each style
    the first candidate that is a real calendar date wins; impossible dates
    such as 2024-13-45 or 'February 30, 2024' are skipped rather than returned.

    Args:
        text: Free-form claim text. Empty or None yields None.
    """
    # Local import keeps this helper self-contained.
    from datetime import date

    if not text:
        return None

    def _valid_iso(year: int, month: int, day: int) -> Optional[str]:
        # date() raises ValueError for out-of-range month/day combinations.
        try:
            return date(year, month, day).isoformat()
        except ValueError:
            return None

    # 1) ISO-style dates like 2024-06-05
    for m_iso in re.finditer(r"\b(20\d{2})-(\d{2})-(\d{2})\b", text):
        found = _valid_iso(int(m_iso.group(1)), int(m_iso.group(2)), int(m_iso.group(3)))
        if found:
            return found

    # 2) Long-form dates like 'November 22, 2024'.
    #    The trailing \b stops a longer digit run (e.g. '20245') from being
    #    read as the year 2024.
    long_form = re.finditer(
        r"\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2}),\s*(\d{4})\b",
        text,
        flags=re.IGNORECASE,
    )
    for m_long in long_form:
        month_num = MONTH_MAP.get(m_long.group(1).lower())
        if not month_num:
            continue
        found = _valid_iso(int(m_long.group(3)), month_num, int(m_long.group(2)))
        if found:
            return found

    # Nothing found
    return None


def extract_claim_amount(text: str) -> Optional[float]:
    """
    Pull the headline dollar figure out of a claim text.

    Only '$'-prefixed numbers are considered (so bare years such as 2024 are
    never mistaken for money). Thousands separators are dropped before
    parsing, and when several amounts appear the largest one is reported.
    Returns None when no parseable amount is present.
    """

    def _to_number(token: str) -> Optional[float]:
        # A token made only of commas collapses to '' and cannot be parsed.
        try:
            return float(token.replace(",", ""))
        except ValueError:
            return None

    # Matches e.g. $3,750 / $280 / $1200.50 (optional spaces after the '$').
    tokens = (hit.group(1) for hit in re.finditer(r"\$\s*([\d,]+(?:\.\d+)?)", text))
    values = [number for number in map(_to_number, tokens) if number is not None]

    # Largest figure is taken as the main claim amount; None if nothing parsed.
    return max(values, default=None)


class ExtractionAgent(BaseAgent):
    """
    Pulls claimant_name, policy_type, claim_amount and incident_date out of
    the first raw text of a claim, using an NER model for the name and
    keyword / regex helpers for the rest.
    """
    name = "ExtractionAgent"

    # NER pipeline cached on the CLASS so every ExtractionAgent instance
    # reuses one loaded model instead of loading its own copy.
    _ner: Optional[Pipeline] = None

    def _get_ner(self) -> Optional[Pipeline]:
        """
        Lazily create the NER pipeline the first time it's needed.

        The loaded pipeline is stored on ExtractionAgent itself (not on the
        instance), so it is shared by all agents. If loading fails, a warning
        is printed and None is returned; nothing is cached in that case, so a
        later call will try again.
        """
        if self._ner is not None:
            return self._ner
        try:
            loaded = pipeline(
                "ner",
                model="dslim/bert-base-NER",
                aggregation_strategy="simple",
            )
        except Exception as e:
            # Fallback: no NER, just regex-based extraction
            print("[ExtractionAgent] Warning: could not load NER model:", e)
            return None

        # Assign on the class: `self._ner = ...` would create a per-instance
        # attribute and defeat the sharing.
        ExtractionAgent._ner = loaded
        return loaded

    def run(self, state: ClaimState) -> None:
        """
        Extract basic fields (claimant_name, policy_type, claim_amount, incident_date)
        from the first raw text using NER + regex (with graceful fallback).

        Only fields that were actually found are written to
        state.extracted_fields; a trace entry with every intermediate result
        (including None values) is always recorded.
        """
        text = state.raw_texts[0] if state.raw_texts else ""
        text_stripped = text.strip()
        entities: List[Tuple[str, str]] = []

        # ---------------- NER extraction ----------------
        # Skipped entirely for blank text; NER errors are reported but do not
        # abort the remaining keyword/regex extraction.
        ner = self._get_ner()
        if ner is not None and text_stripped:
            try:
                ents = ner(text)
                # ents elements look like: {"word": "...", "entity_group": "PER", ...}
                entities = [(e["word"], e["entity_group"]) for e in ents]
            except Exception as e:
                print("[ExtractionAgent] Warning during NER:", e)

        # ---------------- Policy type from keywords ----------------
        # First keyword (in this fixed order) found as a substring wins.
        policy: Optional[str] = None
        lowered = text.lower()
        for p in ["Health", "Auto", "Property"]:
            if p.lower() in lowered:
                policy = p
                break

        # ---------------- Claimant name from PERSON NER ----------------
        # The first PER entity is taken as the claimant.
        claimant_name: Optional[str] = None
        for word, label in entities:
            if label == "PER":
                # With aggregation_strategy="simple", this is usually full name
                claimant_name = word
                break

        # ---------------- Amount & date using helper functions ----------------
        claim_amount = extract_claim_amount(text)
        incident_date = extract_incident_date(text)

        # ---------------- Update state.extracted_fields ----------------
        if claimant_name:
            state.extracted_fields["claimant_name"] = claimant_name
        if policy:
            state.extracted_fields["policy_type"] = policy
        if claim_amount is not None:
            state.extracted_fields["claim_amount"] = claim_amount
        if incident_date is not None:
            state.extracted_fields["incident_date"] = incident_date

        # ---------------- Trace logging ----------------
        trace_payload: Dict[str, Any] = {
            "entities": entities,
            "policy_guess": policy,
            "claimant_name": claimant_name,
            "claim_amount": claim_amount,
            "incident_date": incident_date,
        }

        state.add_trace(self.name, "extract_fields", trace_payload)


Overwriting /content/drive/MyDrive/Claim_CoPilot/src/agents/extraction.py


In [8]:
%%writefile /content/drive/MyDrive/Claim_CoPilot/src/agents/validation.py
import json
from .base import BaseAgent
from state import ClaimState

class ValidationAgent(BaseAgent):
    name = "ValidationAgent"

    def run(self, state: ClaimState) -> None:
        """
        Run rule-based checks on the claim and request a short QA note
        from the LLM.

        Rule-based checks: every required field must be present in
        state.extracted_fields, and the combined raw text should not mention
        more than one policy type. The LLM note (if any) is appended to
        state.issues first, followed by the rule-based issues; a trace entry
        is recorded at the end.
        """
        extracted = state.extracted_fields
        text = " ".join(state.raw_texts)
        haystack = text.lower()

        # Required fields that extraction did not produce.
        issues = [
            f"Missing field: {field}"
            for field in ("claimant_name", "policy_type", "claim_amount", "incident_date")
            if field not in extracted
        ]

        # Example contradiction: multiple policy types in raw text
        found = [p for p in ("Health", "Auto", "Property") if p.lower() in haystack]
        if len(found) > 1:
            issues.append(f"Multiple policy types mentioned in text: {found}")

        fields_json = json.dumps(extracted)
        prompt = f"""
You are a claims QA checker.

Text:
{text}

Extracted fields:
{fields_json}

1) Note any missing fields (claimant_name, policy_type, claim_amount, incident_date).
2) Note any contradictions.

Reply with 2-4 bullet points.
"""
        note = self.llm.chat(
            prompt,
            system="You are a precise and concise QA checker.",
            temperature=0.2,
        )

        # LLM note goes in ahead of the deterministic findings.
        if note:
            state.issues.append(f"LLM-note: {note}")
        state.issues.extend(issues)

        trace_info = {
            "issues": issues,
            "llm_note_present": bool(note),
        }
        state.add_trace(self.name, "validate_fields", trace_info)


Overwriting /content/drive/MyDrive/Claim_CoPilot/src/agents/validation.py


In [9]:
%%writefile /content/drive/MyDrive/Claim_CoPilot/src/agents/triage.py
from .base import BaseAgent
from state import ClaimState

class TriageAgent(BaseAgent):
    """
    Assigns a coarse priority and a claim type to a claim.

    The rules are driven by two class attributes so a subclass (or a test)
    can tune them without editing run().
    """
    name = "TriageAgent"

    # Lower-case substrings that, when found in the claim text, force "High".
    HIGH_PRIORITY_KEYWORDS = ("fracture", "injur", "hospital")

    # Claims at or above this amount are "Medium" unless already "High".
    MEDIUM_AMOUNT_THRESHOLD = 3000

    def run(self, state: ClaimState) -> None:
        """
        Assign a simple priority (High/Medium/Low) and claim_type.
        Uses both extracted fields and raw text.

        Rules, in order:
          1. Any HIGH_PRIORITY_KEYWORDS substring in the text -> "High".
          2. claim_amount >= MEDIUM_AMOUNT_THRESHOLD          -> "Medium".
          3. Otherwise                                        -> "Low".

        claim_type mirrors the extracted policy_type, or "Unknown" if absent.
        Results are written to state.triage and a trace entry is recorded.
        """
        fields = state.extracted_fields
        text = " ".join(state.raw_texts).lower()

        # A missing or None amount counts as 0.0 for the threshold rule.
        raw_amount = fields.get("claim_amount")
        amount = float(raw_amount) if raw_amount is not None else 0.0

        if any(keyword in text for keyword in self.HIGH_PRIORITY_KEYWORDS):
            priority = "High"
        elif amount >= self.MEDIUM_AMOUNT_THRESHOLD:
            priority = "Medium"
        else:
            priority = "Low"

        claim_type = fields.get("policy_type", "Unknown")

        state.triage["priority"] = priority
        state.triage["claim_type"] = claim_type

        state.add_trace(self.name, "assign_triage", {
            "priority": priority,
            "claim_type": claim_type,
            "amount": amount,
        })


Overwriting /content/drive/MyDrive/Claim_CoPilot/src/agents/triage.py


In [10]:
%%writefile /content/drive/MyDrive/Claim_CoPilot/src/agents/summarization.py
import json
from .base import BaseAgent
from state import ClaimState

class SummarizationAgent(BaseAgent):
    """
    Produces the adjuster-facing summary of a claim via the LLM.
    """
    name = "SummarizationAgent"

    def run(self, state: ClaimState) -> None:
        """
        Build a summarization prompt from the claim text, the extracted
        fields, the triage result and the issues flagged so far, send it to
        the LLM, and store the reply in state.summary.

        The issues are passed explicitly because the prompt asks the model to
        report "any notable issues" and forbids inventing information.
        A trace entry with a preview (first 120 characters) of the summary is
        recorded.
        """
        text = " ".join(state.raw_texts)
        fields = state.extracted_fields
        triage = state.triage
        issues = state.issues

        prompt = f"""
You are an insurance claim summarization assistant.

Original text:
{text}

Extracted fields:
{json.dumps(fields)}

Triage info:
{json.dumps(triage)}

Issues flagged so far:
{json.dumps(issues)}

Write a concise, factual summary (4-6 sentences) so a claim adjuster can quickly understand:
- Who is involved
- What happened and when
- Policy type and claim amount
- Priority and any notable issues

Do not hallucinate information not supported by the text.
"""
        summary = self.llm.chat(
            prompt,
            system="You summarize insurance claims accurately.",
            temperature=0.3,
        )
        state.summary = summary

        state.add_trace(self.name, "summarize_claim", {
            "summary_preview": summary[:120] if summary else ""
        })


Overwriting /content/drive/MyDrive/Claim_CoPilot/src/agents/summarization.py


In [11]:
%%writefile /content/drive/MyDrive/Claim_CoPilot/src/orchestrator.py
from typing import Optional
from llm_client import LLMClient
from state import ClaimState
from agents.extraction import ExtractionAgent
from agents.validation import ValidationAgent
from agents.triage import TriageAgent
from agents.summarization import SummarizationAgent

class Orchestrator:
    """
    Drives a claim through a fixed sequence of agents, stopping as soon as
    the claim state reports itself complete.
    """
    def __init__(self, llm: Optional[LLMClient] = None):
        # One LLM client is shared by every agent in the pipeline.
        self.llm = llm if llm else LLMClient()

        # Execution order: extract -> validate -> triage -> summarize.
        agent_classes = (
            ExtractionAgent,
            ValidationAgent,
            TriageAgent,
            SummarizationAgent,
        )
        self.agents = [agent_cls(self.llm) for agent_cls in agent_classes]

    def run(self, text: str) -> ClaimState:
        """
        Wrap `text` in a new ClaimState, run the agents over it in order,
        and hand back the resulting state.

        After each agent the state is checked; once it is complete the
        remaining agents are skipped.
        """
        state = ClaimState.from_single_text(text)
        for current_agent in self.agents:
            current_agent.run(state)
            if state.is_complete():
                return state
        return state


Overwriting /content/drive/MyDrive/Claim_CoPilot/src/orchestrator.py


In [12]:
from orchestrator import Orchestrator

# End-to-end smoke run of the pipeline on one hand-written claim.
orc = Orchestrator()

demo_text = (
    "John Doe presents with a fracture from a fall on 2024-08-12.\n"
    "Insurance policy: Health. Estimated claim amount: $2275. "
    "Physician notes indicate follow-up needed."
)

state = orc.run(demo_text)

# Show each result section under its heading.
for heading, content in (
    ("=== Extracted fields ===", state.extracted_fields),
    ("\n=== Triage ===", state.triage),
    ("\n=== Summary ===", state.summary),
):
    print(heading)
    print(content)

# One line per agent step, in execution order.
print("\n=== Trace steps ===", len(state.trace))
for entry in state.trace:
    print(entry["timestamp"], "-", entry["agent"], ":", entry["action"])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


=== Extracted fields ===
{'claimant_name': 'John Doe', 'policy_type': 'Health', 'claim_amount': 2275.0, 'incident_date': '2024-08-12'}

=== Triage ===
{'priority': 'High', 'claim_type': 'Health'}

=== Summary ===
[LLM disabled] Would have answered based on: '\nYou are an insurance claim summarization assistant.\n\nOriginal text:\nJohn Doe presents with a fracture from a fall on 2024-08-12.\nInsurance policy: Health. Estimated claim amount: $2275. Physician not'

=== Trace steps === 4
2025-12-02 05:09:32 - ExtractionAgent : extract_fields
2025-12-02 05:09:32 - ValidationAgent : validate_fields
2025-12-02 05:09:32 - TriageAgent : assign_triage
2025-12-02 05:09:32 - SummarizationAgent : summarize_claim
