In [None]:
# Generate Synthetic Healthcare Claims for Fraud Modeling

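Generates synthetic healthcare claims with a configurable fraud prevalence (default 3%) and, when a Spark session is available, writes them to the Delta table `aethergen.healthcare_synth_v1`, partitioned by `year` and `month`.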


In [None]:
import pandas as pd
import random, datetime as dt

# Lookup values sampled by the claim generator.

# Two-letter state codes used for the member/provider geography field.
STATES = [
    'CA', 'NY', 'TX', 'FL', 'IL', 'PA', 'OH', 'GA', 'NC', 'MI',
    'AZ', 'VA', 'WA', 'MA', 'TN', 'IN', 'MO', 'MD', 'WI',
]

# Provider specialties a claim can be attributed to.
SPECIALTIES = [
    'Internal Medicine',
    'Cardiology',
    'Orthopedics',
    'Dermatology',
    'Pediatrics',
    'Radiology',
    'General Surgery',
    'Emergency',
]

# Place-of-service categories.
POS = [
    'Office',
    'Outpatient',
    'Inpatient',
    'ER',
    'Telehealth',
]

def pick(a):
    """Return one randomly chosen element of the sequence ``a``.

    Delegates to ``random.choice``, so it draws from the module-level
    ``random`` generator and follows whatever seed was last set on it.
    """
    chosen = random.choice(a)
    return chosen

def normal(m, s):
    """Draw one Gaussian sample with mean ``m`` and standard deviation ``s``.

    Delegates to ``random.gauss``, so it draws from the module-level
    ``random`` generator and follows whatever seed was last set on it.
    """
    sample = random.gauss(m, s)
    return sample

def gen_claims(n=200_000, fraud_rate=0.03, seed=42, end_date=None):
    """Generate synthetic healthcare claims for fraud modeling.

    Each row is one claim with amounts, codes, provider/member attributes and
    a ``fraud_flag`` / ``fraud_type`` label. A fraudulent row is assigned one
    of five fraud types; four of them perturb a feature (inflated submitted
    amount, extra procedure codes, inflated provider volume, very short claim
    lag) while ``'other'`` leaves the features untouched.

    Parameters
    ----------
    n : int
        Number of claims to generate.
    fraud_rate : float
        Probability that any single claim is labelled fraudulent.
    seed : int
        Seed passed to ``random.seed``. This reseeds the module-level
        ``random`` generator, which remains seeded after the call.
    end_date : datetime.date, optional
        Most recent possible claim date; claim dates fall within the 365 days
        ending on it. Defaults to today's date, in which case ``claim_date``,
        ``year`` and ``month`` depend on the day the function is run even
        with a fixed ``seed``.

    Returns
    -------
    pandas.DataFrame
        One row per claim. ``claim_id`` values are unique within the frame.
    """
    random.seed(seed)
    # Resolve the reference day once so every row shares it (a per-row
    # today() call could straddle midnight on a long run).
    anchor = dt.date.today() if end_date is None else end_date
    # Claim numbers already handed out; claim_id is the row key and must not repeat.
    used_claim_numbers = set()
    rows = []
    for _ in range(n):
        date = anchor - dt.timedelta(days=int(random.random()*365))
        submitted = max(50, normal(800, 350))
        allowed = max(40, submitted * (0.6 + random.random()*0.3))
        paid = max(0, allowed * (0.7 + random.random()*0.25))
        # Independent draws from ~1M values collide often (about n^2 / 2M
        # colliding pairs, i.e. thousands at the default n). On a collision,
        # probe upward to the next free number rather than redrawing: no extra
        # random values are consumed, so every other field stays identical for
        # a given seed. Probing may yield numbers above 999_999.
        claim_number = random.randrange(1,1_000_000)
        while claim_number in used_claim_numbers:
            claim_number += 1
        used_claim_numbers.add(claim_number)
        base = dict(
            claim_id=f'clm_{claim_number}',
            member_id=f'mbr_{random.randrange(1,1_000_000)}',
            provider_id=f'prv_{random.randrange(1,1_000_000)}',
            claim_date=str(date),
            year=date.year, month=date.month,
            submitted_amount=round(submitted,2), allowed_amount=round(allowed,2), paid_amount=round(paid,2),
            # Coinsurance is derived from the unrounded paid amount plus noise.
            coinsurance=round(paid*0.1 + random.random()*20,2), copay=round(10+random.random()*40,2),
            deductible_remaining=round(random.random()*1000,2),
            procedure_codes=['CPT'+str(random.randrange(100,999))],
            diagnosis_codes=['DX'+str(random.randrange(100,999))],
            place_of_service=pick(POS), provider_specialty=pick(SPECIALTIES), network_status='in' if random.random()<0.8 else 'out',
            claim_lag_days=max(0, int(normal(12,6))), prior_authorization_flag=random.random()<0.7, referral_flag=random.random()<0.4, out_of_state_flag=random.random()<0.1,
            utilization_90d=max(0,int(normal(3,2))), utilization_365d=max(0,int(normal(10,6))), provider_claim_volume_30d=max(1,int(normal(60,30))),
            geo_state=pick(STATES), geo_zip3=str(random.randrange(100,999)), member_age=min(90,max(0,int(normal(44,18)))), member_gender=pick(['M','F','O']),
            narrative='Routine consultation'
        )
        is_fraud = random.random() < fraud_rate
        ftype = None
        if is_fraud:
            ftype = pick(['upcoding','unbundling','phantom_billing','duplicate','other'])
            if ftype=='upcoding':
                # Inflate the billed amount; allowed/paid amounts are left as generated.
                base['submitted_amount']=round(base['submitted_amount']*(1.6+random.random()*0.6),2)
            elif ftype=='unbundling':
                # Append extra procedure codes to the single generated one.
                base['procedure_codes']=base['procedure_codes']+['CPT11010','CPT11011']
            elif ftype=='phantom_billing':
                # Inflate the provider's recent claim volume.
                base['provider_claim_volume_30d']=int(base['provider_claim_volume_30d']*(1.7+random.random()*0.5))
            elif ftype=='duplicate':
                # Replace the lag with a near-zero one.
                base['claim_lag_days']=max(0,int(normal(1,1)))
            # 'other' carries the label only; no feature is modified.
        rows.append({**base, 'fraud_flag': bool(is_fraud), 'fraud_type': ftype})
    return pd.DataFrame(rows)

# Build a 50k-row sample at the target fraud rate, report the realised
# prevalence, and show the first rows as the cell output.
fraud_rate = 0.03
preview = gen_claims(n=50_000, fraud_rate=fraud_rate, seed=42)
print('Prevalence:', preview['fraud_flag'].mean())
preview.head()


In [None]:
# Persist the preview frame to Delta. This needs a Spark session (e.g. on
# Databricks); anywhere else the cell only prints a notice.
try:
    spark
except NameError:
    # No session exists in this namespace: define a placeholder so the
    # check below can run.
    spark = None

if spark is None:
    print("Spark not available in this environment; skip Delta write.")
else:
    sdf = spark.createDataFrame(preview)
    spark.sql("CREATE DATABASE IF NOT EXISTS aethergen")
    # Switches the session's current database so the unqualified table name
    # below resolves inside aethergen.
    spark.sql("USE aethergen")
    # Overwrite the table on every run, partitioned by claim year and month.
    (
        sdf.write
        .format("delta")
        .mode("overwrite")
        .partitionBy("year","month")
        .saveAsTable("healthcare_synth_v1")
    )
    print("Delta table written: aethergen.healthcare_synth_v1")
