In [8]:
from google.colab import drive

# Mount Google Drive so the project folder is reachable under /content/drive.
drive.mount("/content/drive", force_remount=True)

import os, json, random, datetime, pathlib

# Project layout on the mounted Drive: <BASE>/data/claims.jsonl
BASE = "/content/drive/MyDrive/Claim_CoPilot"
DATA_DIR = os.path.join(BASE, "data")
claims_path = os.path.join(DATA_DIR, "claims.jsonl")

# Create the data folder (and any missing parents) before anything writes to it.
pathlib.Path(DATA_DIR).mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so the cell output documents where data will land.
for label, location in (
    ("BASE     :", BASE),
    ("DATA_DIR :", DATA_DIR),
    ("Target   :", claims_path),
):
    print(label, location)


Mounted at /content/drive
BASE     : /content/drive/MyDrive/Claim_CoPilot
DATA_DIR : /content/drive/MyDrive/Claim_CoPilot/data
Target   : /content/drive/MyDrive/Claim_CoPilot/data/claims.jsonl


In [9]:
random.seed(42)  # fixed seed so the sampled claims are reproducible

# Reference date that random_date() adds a day offset to.
base_date = datetime.date(2024, 1, 1)

# Lines of business covered by the synthetic data.
policy_types = ["Health", "Auto", "Property"]

# Claimant names are assembled as "<first> <last>" from these two pools.
first_names = [
    "John", "Jane", "Mark", "Priya",
    "Carlos", "Emily", "Ravi", "Anita",
    "Michael", "Sara", "David", "Liu",
]
last_names = [
    "Doe", "Smith", "Lee", "Nair",
    "Rivera", "Chen", "Patel", "Khan",
    "Johnson", "Brown", "Garcia", "Nguyen",
]

# --- Health: what happened / outcome sentence -------------------------------
health_events = [
    "a slip and fall at home",
    "a workplace accident",
    "a sports injury during a local match",
    "a minor surgery following a chronic condition",
    "an emergency room visit after chest pain",
]
health_injuries = [
    "no complications were reported",
    "the patient suffered a minor sprain",
    "the patient sustained a fracture",
    "follow-up physiotherapy sessions were recommended",
    "the patient required an overnight hospital stay",
]

# --- Auto: what happened / outcome sentence ---------------------------------
auto_events = [
    "a rear-end collision at a traffic light",
    "a side-impact accident at an intersection",
    "a low-speed parking lot collision",
    "a highway collision involving multiple vehicles",
    "a single-vehicle skid on a wet road",
]
auto_injuries = [
    "no injuries were reported",
    "the driver reported mild whiplash",
    "the passenger reported minor bruising",
    "no occupants were harmed",
]

# --- Property: what happened / accompanying note ----------------------------
property_events = [
    "water damage from a burst pipe",
    "fire damage in the kitchen area",
    "storm damage to the roof and windows",
    "theft resulting in loss of electronics",
    "flooding in the basement after heavy rain",
]
property_notes = [
    "no injuries occurred",
    "no occupants were present at the time",
    "the tenant was not at home when the incident happened",
]

def random_date():
    """Return a randomly chosen date as an ISO string (YYYY-MM-DD).

    The date is ``base_date`` plus a whole-day offset drawn from 0..365
    inclusive. With ``base_date`` at 2024-01-01 (a leap year) that range covers
    exactly 2024-01-01 through 2024-12-31.
    """
    offset = datetime.timedelta(days=random.randint(0, 365))
    return (base_date + offset).isoformat()

def generate_single_claim(i: int):
    """
    Generate one synthetic claim record with the schema:
    id, text, claimant_name, policy_type, claim_amount, incident_date, priority, gold_summary

    Args:
        i: Sequence number of the claim; rendered as a zero-padded id such as
           ``c00001``.

    Returns:
        dict with the eight keys listed above. ``claim_amount`` is a float
        rounded to 2 decimals, ``incident_date`` is the string returned by
        ``random_date()``, and ``priority`` is "High", "Medium" or "Low".

    Priority rules:
        * High   - the event or injury text mentions "fracture" or "surgery",
                   the sampled severity is "high", or the amount exceeds 5000.
        * Medium - otherwise, an amount of at least 3000.
        * Low    - everything else.

    All sampling uses the module-level ``random`` state, so seed it beforehand
    for reproducible output.
    """
    claim_id = f"c{i:05d}"
    policy_type = random.choice(policy_types)
    first = random.choice(first_names)
    last = random.choice(last_names)
    name = f"{first} {last}"
    date = random_date()

    # Each policy type has its own event/outcome pools and base cost range.
    if policy_type == "Health":
        event = random.choice(health_events)
        injury = random.choice(health_injuries)
        base_amt = random.randint(600, 9000)
    elif policy_type == "Auto":
        event = random.choice(auto_events)
        injury = random.choice(auto_injuries)
        base_amt = random.randint(800, 15000)
    else:  # Property
        event = random.choice(property_events)
        injury = random.choice(property_notes)
        base_amt = random.randint(1000, 20000)

    # Severity scales the base cost down (0.6x), leaves it (1.0x) or up (1.6x).
    severity = random.choice(["low", "medium", "high"])
    multiplier = {"low": 0.6, "medium": 1.0, "high": 1.6}[severity]
    amount = round(base_amt * multiplier, 2)

    # Priority rules (roughly aligned with your triage logic).
    # "surgery" only occurs in the event templates and "fracture" only in the
    # injury templates, so both fields are scanned; checking the injury alone
    # meant the surgery rule could never fire.
    narrative = f"{event} {injury}"
    serious_keyword = "fracture" in narrative or "surgery" in narrative
    if serious_keyword or severity == "high" or amount > 5000:
        priority = "High"
    elif amount >= 3000:
        priority = "Medium"
    else:
        priority = "Low"

    # Natural-ish text
    text = (
        f"{name} submitted a {policy_type} claim on {date} after {event}. "
        f"The estimated cost is ${amount:.2f}. {injury.capitalize()}."
    )

    gold_summary = (
        f"{name} filed a {priority.lower()}-priority {policy_type.lower()} claim on {date} "
        f"after {event}, with an estimated cost of ${amount:.2f} and {injury}."
    )

    return {
        "id": claim_id,
        "text": text,
        "claimant_name": name,
        "policy_type": policy_type,
        "claim_amount": amount,
        "incident_date": date,
        "priority": priority,
        "gold_summary": gold_summary,
    }


In [10]:
# Size of the synthetic dataset to produce.
N_TOTAL = 10000

# Re-seed right before generation so this cell yields the same records on every run.
random.seed(42)

# Claim numbers are 1-based: generate_single_claim is called with 1..N_TOTAL in order.
claims = list(map(generate_single_claim, range(1, N_TOTAL + 1)))

print("Generated", len(claims), "synthetic claims.")

# Mode "w" replaces any previous claims.jsonl; the file holds one JSON object per line.
with open(claims_path, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(rec) + "\n" for rec in claims)

print("Wrote dataset to:", claims_path)


Generated 10000 synthetic claims.
Wrote dataset to: /content/drive/MyDrive/Claim_CoPilot/data/claims.jsonl


In [11]:
# Read the JSONL file back to confirm it round-trips: one parsed dict per line.
with open(claims_path, "r", encoding="utf-8") as f:
    loaded = [json.loads(line) for line in f]

print("Reloaded records:", len(loaded))
print("\nSample 3 records:\n")

# (label, record key) pairs, printed in this order for each sampled claim.
display_fields = (
    ("ID:", "id"),
    ("Name:", "claimant_name"),
    ("Policy:", "policy_type"),
    ("Amount:", "claim_amount"),
    ("Priority:", "priority"),
    ("Text:", "text"),
    ("Gold summary:", "gold_summary"),
)

# Spot-check three records drawn at random from the reloaded data.
for rec in random.sample(loaded, 3):
    for label, key in display_fields:
        print(label, rec[key])
    print("-" * 80)


Reloaded records: 10000

Sample 3 records:

ID: c06233
Name: Ravi Smith
Policy: Property
Amount: 15140.0
Priority: High
Text: Ravi Smith submitted a Property claim on 2024-06-12 after fire damage in the kitchen area. The estimated cost is $15140.00. The tenant was not at home when the incident happened.
Gold summary: Ravi Smith filed a high-priority property claim on 2024-06-12 after fire damage in the kitchen area, with an estimated cost of $15140.00 and the tenant was not at home when the incident happened.
--------------------------------------------------------------------------------
ID: c05119
Name: Emily Chen
Policy: Property
Amount: 24430.4
Priority: High
Text: Emily Chen submitted a Property claim on 2024-05-26 after theft resulting in loss of electronics. The estimated cost is $24430.40. No occupants were present at the time.
Gold summary: Emily Chen filed a high-priority property claim on 2024-05-26 after theft resulting in loss of electronics, with an estimated cost of $244

In [2]:
%%writefile /content/drive/MyDrive/Claim_CoPilot/src/generate_dataset.py
import json
import random
import datetime
from pathlib import Path

# --------------------------------------------------------------------
# Project paths, derived from this file's own location:
#   <project root>/src/generate_dataset.py  ->  <project root>/data/claims.jsonl
# --------------------------------------------------------------------
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DATA_DIR = PROJECT_ROOT.joinpath("data")
OUT_PATH = DATA_DIR.joinpath("claims.jsonl")

# Make sure the output folder exists before main() opens OUT_PATH for writing.
DATA_DIR.mkdir(parents=True, exist_ok=True)

# --------------------------------------------------------------------
# Synthetic data templates
# --------------------------------------------------------------------
random.seed(42)  # fixed seed, applied when the module is loaded

# Reference date that random_date() adds a day offset to.
BASE_DATE = datetime.date(2024, 1, 1)

# Lines of business covered by the synthetic data.
POLICY_TYPES = ["Health", "Auto", "Property"]

# Claimant names are assembled as "<first> <last>" from these two pools.
FIRST_NAMES = [
    "John", "Jane", "Mark", "Priya",
    "Carlos", "Emily", "Ravi", "Anita",
    "Michael", "Sara", "David", "Liu",
]
LAST_NAMES = [
    "Doe", "Smith", "Lee", "Nair",
    "Rivera", "Chen", "Patel", "Khan",
    "Johnson", "Brown", "Garcia", "Nguyen",
]

# --- Health: what happened / outcome sentence ------------------------
HEALTH_EVENTS = [
    "a slip and fall at home",
    "a workplace accident",
    "a sports injury during a local match",
    "a minor surgery following a chronic condition",
    "an emergency room visit after chest pain",
]
HEALTH_INJURIES = [
    "no complications were reported",
    "the patient suffered a minor sprain",
    "the patient sustained a fracture",
    "follow-up physiotherapy sessions were recommended",
    "the patient required an overnight hospital stay",
]

# --- Auto: what happened / outcome sentence --------------------------
AUTO_EVENTS = [
    "a rear-end collision at a traffic light",
    "a side-impact accident at an intersection",
    "a low-speed parking lot collision",
    "a highway collision involving multiple vehicles",
    "a single-vehicle skid on a wet road",
]
AUTO_INJURIES = [
    "no injuries were reported",
    "the driver reported mild whiplash",
    "the passenger reported minor bruising",
    "no occupants were harmed",
]

# --- Property: what happened / accompanying note ---------------------
PROPERTY_EVENTS = [
    "water damage from a burst pipe",
    "fire damage in the kitchen area",
    "storm damage to the roof and windows",
    "theft resulting in loss of electronics",
    "flooding in the basement after heavy rain",
]
PROPERTY_NOTES = [
    "no injuries occurred",
    "no occupants were present at the time",
    "the tenant was not at home when the incident happened",
]


def random_date() -> str:
    """Return a randomly chosen date as an ISO string (YYYY-MM-DD).

    The date is ``BASE_DATE`` plus a whole-day offset drawn from 0..365
    inclusive. With ``BASE_DATE`` at 2024-01-01 (a leap year) that range covers
    exactly 2024-01-01 through 2024-12-31.
    """
    offset = datetime.timedelta(days=random.randint(0, 365))
    return (BASE_DATE + offset).isoformat()


def generate_single_claim(i: int) -> dict:
    """
    Generate one synthetic claim record with fields aligned
    to what the pipeline expects.

    Args:
        i: Sequence number of the claim; rendered as a zero-padded id such as
           ``c00001``.

    Returns:
        dict with keys ``id``, ``text``, ``claimant_name``, ``policy_type``,
        ``claim_amount`` (float rounded to 2 decimals), ``incident_date``
        (string returned by ``random_date()``), ``priority`` ("High", "Medium"
        or "Low") and ``gold_summary``.

    Priority rules:
        * High   - the event or injury text mentions "fracture" or "surgery",
                   the sampled severity is "high", or the amount exceeds 5000.
        * Medium - otherwise, an amount of at least 3000.
        * Low    - everything else.

    All sampling uses the module-level ``random`` state, so seed it beforehand
    for reproducible output.
    """
    claim_id = f"c{i:05d}"
    policy_type = random.choice(POLICY_TYPES)
    first = random.choice(FIRST_NAMES)
    last = random.choice(LAST_NAMES)
    name = f"{first} {last}"
    incident_date = random_date()

    # Each policy type has its own event/outcome pools and base cost range.
    if policy_type == "Health":
        event = random.choice(HEALTH_EVENTS)
        injury = random.choice(HEALTH_INJURIES)
        base_amt = random.randint(600, 9000)
    elif policy_type == "Auto":
        event = random.choice(AUTO_EVENTS)
        injury = random.choice(AUTO_INJURIES)
        base_amt = random.randint(800, 15000)
    else:  # Property
        event = random.choice(PROPERTY_EVENTS)
        injury = random.choice(PROPERTY_NOTES)
        base_amt = random.randint(1000, 20000)

    # Severity scales the base cost down (0.6x), leaves it (1.0x) or up (1.6x).
    severity = random.choice(["low", "medium", "high"])
    multiplier = {"low": 0.6, "medium": 1.0, "high": 1.6}[severity]
    claim_amount = round(base_amt * multiplier, 2)

    # Priority logic intended to mirror TriageAgent (not visible in this file).
    # "surgery" only occurs in the event templates and "fracture" only in the
    # injury templates, so both fields are scanned; checking the injury alone
    # meant the surgery rule could never fire.
    narrative = f"{event} {injury}"
    serious_keyword = "fracture" in narrative or "surgery" in narrative
    if serious_keyword or severity == "high" or claim_amount > 5000:
        priority = "High"
    elif claim_amount >= 3000:
        priority = "Medium"
    else:
        priority = "Low"

    text = (
        f"{name} submitted a {policy_type} claim on {incident_date} after {event}. "
        f"The estimated cost is ${claim_amount:.2f}. {injury.capitalize()}."
    )

    gold_summary = (
        f"{name} filed a {priority.lower()}-priority {policy_type.lower()} claim on "
        f"{incident_date} after {event}, with an estimated cost of ${claim_amount:.2f} "
        f"and {injury}."
    )

    return {
        "id": claim_id,
        "text": text,
        "claimant_name": name,
        "policy_type": policy_type,
        "claim_amount": claim_amount,
        "incident_date": incident_date,
        "priority": priority,
        "gold_summary": gold_summary,
    }


def main(n_total: int = 10000, seed: "int | None" = 42):
    """Generate ``n_total`` synthetic claims and write them to OUT_PATH as JSONL.

    Args:
        n_total: Number of records to write; claim numbers run 1..n_total.
        seed: Value passed to ``random.seed`` before generation so that every
            call produces the same dataset (previously only the first call
            after import was deterministic). Pass ``None`` to leave the current
            RNG state untouched.

    Side effects:
        Overwrites OUT_PATH (one JSON object per line) and prints progress
        messages to stdout.
    """
    if seed is not None:
        random.seed(seed)

    print(f"Generating {n_total} synthetic claims...")
    # Records are streamed straight to disk rather than collected in a list.
    with OUT_PATH.open("w", encoding="utf-8") as f:
        for i in range(1, n_total + 1):
            rec = generate_single_claim(i)
            f.write(json.dumps(rec) + "\n")
    print(f"Wrote dataset to: {OUT_PATH.resolve()}")


if __name__ == "__main__":
    main()


Overwriting /content/drive/MyDrive/Claim_CoPilot/src/generate_dataset.py


In [3]:
# Switch to the project root first: `python -m src.generate_dataset` looks up the
# `src.generate_dataset` module path relative to the working directory.
%cd /content/drive/MyDrive/Claim_CoPilot
# Run the generator file written by the %%writefile cell above as a module.
!python -m src.generate_dataset


/content/drive/MyDrive/Claim_CoPilot
Generating 10000 synthetic claims...
Wrote dataset to: /content/drive/MyDrive/Claim_CoPilot/data/claims.jsonl


In [1]:
import json, random, os

# Absolute location of the generated dataset on the mounted Drive.
path = "/content/drive/MyDrive/Claim_CoPilot/data/claims.jsonl"
print("Exists?", os.path.exists(path))

# Parse every line of the JSONL file into a dict.
with open(path, "r", encoding="utf-8") as f:
    recs = [json.loads(line) for line in f]

# Report the record count and show the first record as a sanity check.
print("Total records:", len(recs))
print("Sample record:")
print(recs[0])


Exists? True
Total records: 10000
Sample record:
{'id': 'c00001', 'text': 'Jane Doe submitted a Property claim on 2024-05-20 after fire damage in the kitchen area. The estimated cost is $8915.20. No injuries occurred.', 'claimant_name': 'Jane Doe', 'policy_type': 'Property', 'claim_amount': 8915.2, 'incident_date': '2024-05-20', 'priority': 'High', 'gold_summary': 'Jane Doe filed a high-priority property claim on 2024-05-20 after fire damage in the kitchen area, with an estimated cost of $8915.20 and no injuries occurred.'}
