In [1]:
import pandas as pd
import numpy as np

from pathlib import Path

# Notebook display settings: never truncate the column list, and allow
# 120 characters of width for text output.
pd.options.display.max_columns = None
pd.options.display.width = 120

In [2]:
# Read the cleaned, merged claims table that data_loader.py writes out.
DATA_PATH = Path("../data/processed/claims_denial_base.csv")

df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# NOTE(review): this preview shows patient-identifying columns (names, address,
# phone, email) — clear the cell output before sharing the notebook.
df.head(n=5)

Unnamed: 0,billing_id,patient_id_x,encounter_id,insurance_provider,payment_method,claim_id,claim_billing_date,billed_amount,paid_amount,claim_status,denial_reason,is_denied,first_name,last_name,dob,age,gender,ethnicity,insurance_type,marital_status,address,city,state,zip,phone,email_x,registration_date,patient_id_y,provider_id,visit_date,visit_type,department_x,reason_for_visit,diagnosis_code,admission_type,discharge_date,length_of_stay,status,readmitted_flag,name,department_y,specialty,npi,inhouse,location,years_experience,contact_info,email_y
0,BILL000001,PAT001464,ENC000001,BCBS,Insurance,CLM000001,06-02-2025 00:00,1971.52,0.0,Denied,Claim Billed to Wrong Payer,1,Traci,Tran,05-11-1948,76,Female,Asian,BCBS,Married,421 Victor Throughway Apt. 136,Long Beach,CA,90802.0,,traci.tran860@example.com,19-01-2025,PAT001464,PRO00345,19-01-2025,Outpatient,Pulmonology,Asthma Follow-up,J45.909,,,,Completed,No,Nancy Barber,Pulmonology,Pulmonologist,8515688546,Yes,CA,7,907 330 0119,nancy.barber@healthcare.org
1,BILL000002,PAT025832,ENC000002,Medicare,Insurance,CLM000002,01-05-2025 00:00,1243.8,736.05,Paid,,0,Stephanie,Williams,04-04-1964,61,Female,Hispanic,Medicare,Married,080 Tran Plains Suite 209,Anaheim,CA,92801.0,932.766.1047,stephanie.williams717@example.com,30-03-2025,PAT025832,PRO00023,30-03-2025,Emergency,Emergency Department,Sudden Breathing Issue,R06.02,Emergency,,,Completed,No,Shawn Jones,Emergency Department,General Practitioner,4923548486,Yes,CA,8,409 806 0735,shawn.jones@healthcare.org
2,BILL000003,PAT055873,ENC000003,BCBS,Insurance,CLM000003,23-02-2025 00:00,4854.11,2676.12,Paid,,0,Katie,Melendez,04-06-2021,4,Female,Asian,BCBS,Single,086 Chandler Points,Seattle,WA,98101.0,445 617 3534,katie.melendez409@example.com,18-01-2025,PAT055873,PRO01197,18-01-2025,Outpatient,Urology,Kidney Stone,N20.0,,,,Completed,No,David Morrison,Urology,Urologist,8599764735,Yes,CA,17,359 740 7814,
3,BILL000004,PAT048558,ENC000004,BCBS,Insurance,CLM000004,20-04-2025 00:00,2638.21,1861.39,Paid,,0,Michael,Martin,10-03-2010,15,Female,Asian,BCBS,Single,607 Jacob Terrace Suite 998,Sacramento,CA,94203.0,305.948.4358,michael.martin299@example.com,21-03-2025,PAT048558,PRO00049,21-03-2025,Outpatient,Emergency Department,Severe Chest Pain,R07.9,,,,Completed,No,Amy Jones,Emergency Department,General Practitioner,5067298284,Yes,CA,20,670 484 5671,amy.jones@healthcare.org
4,BILL000005,PAT018366,ENC000005,BCBS,Selfpay,,,1046.99,1046.99,Paid,,0,Richard,Larson,26-07-1952,72,Female,White,BCBS,Married,4303 David Lodge,Fresno,CA,93701.0,+1 757 412 0,richard.larson934@example.com,24-01-2025,PAT018366,PRO00594,24-01-2025,Inpatients,Gastroenterology,IBS,K58.9,,26-01-2025,2.0,Completed,No,Aaron Rich,Gastroenterology,Gastroenterologist,3057043847,Yes,CA,6,618 532 0882,aaron.rich@healthcare.org


In [3]:
# Dataset dimensions alongside the share of denied (1) vs. non-denied (0) claims.
df.shape, df["is_denied"].value_counts(normalize=True)


((70000, 48),
 is_denied
 0    0.914314
 1    0.085686
 Name: proportion, dtype: float64)

### Length of stay features
Why:

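- Visits without a recorded stay (e.g. outpatient) have no length of stay, so missing values are treated as 0 days
- A binary long-stay flag (more than 5 days) is a simpler signal than the raw duration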

In [4]:
# Rows with no recorded stay (e.g. outpatient visits) get a length of stay of 0.
df["length_of_stay"] = df["length_of_stay"].fillna(value=0)

# Binary indicator: 1 when the stay is longer than 5, otherwise 0.
df["long_stay_flag"] = df["length_of_stay"].gt(5).astype(int)

### Patient risk features
Why:

- Age buckets reduce noise vs raw age
- Readmission is a known denial risk signal

In [5]:
# Group ages into coarse life-stage buckets. Intervals are right-closed:
# [0, 18], (18, 35], (35, 50], (50, 65], (65, 120].
# include_lowest=True keeps infants with age 0 in "child"; without it they
# fall outside the first interval and end up with a missing bucket.
df["age_bucket"] = pd.cut(
    df["age"],
    bins=[0, 18, 35, 50, 65, 120],
    labels=["child", "young_adult", "adult", "senior", "elder"],
    include_lowest=True,
)

# Readmission risk: encode "Yes"/"No" as 1/0.
# The mapping is applied only while the column still holds the raw labels, so
# re-running this cell does not turn already-encoded 0/1 values into NaN
# (and then into 0 for every row).
readmitted = df["readmitted_flag"]
if not pd.api.types.is_numeric_dtype(readmitted):
    readmitted = readmitted.map({"Yes": 1, "No": 0})

# Missing or unrecognized values count as "not readmitted"; the integer cast
# keeps the dtype consistent with the other binary flag columns.
df["readmitted_flag"] = readmitted.fillna(0).astype(int)

### Insurance & claim complexity features
Why:

- Some insurance types (self-pay, unknown coverage) are assumed to carry a higher denial risk
- Whether a diagnosis code is present is a basic claim-completeness signal

In [6]:
# Insurance types treated as high risk for denial.
# NOTE(review): the data preview spells the self-pay payment_method as
# "Selfpay" — confirm that insurance_type really uses "Self-Pay".
high_risk_insurance = ["Self-Pay", "Unknown"]

# 1 when the row's insurance type is in the high-risk list, otherwise 0.
df["high_risk_insurance_flag"] = df["insurance_type"].isin(high_risk_insurance).astype(int)

# 1 when a diagnosis code is recorded on the row, 0 when it is missing.
df["has_diagnosis"] = (~df["diagnosis_code"].isna()).astype(int)

### Provider experience features
Why:

- Provider experience correlates with billing accuracy
- Binary flags generalize better than raw numbers

In [7]:
# Impute missing provider experience with the column median.
df["years_experience"] = df["years_experience"].fillna(value=df["years_experience"].median())

# 1 when the provider has fewer than 5 years of experience, otherwise 0.
df["low_experience_provider"] = df["years_experience"].lt(5).astype(int)

### Target leakage check
Why:

- These columns contain outcome information
- We explicitly confirm and exclude them later

In [8]:
# Outcome-derived columns: these must NEVER be used as model features.
leakage_cols = ["claim_status", "denial_reason", "paid_amount"]

# Preview them to see the outcome information they carry.
df.loc[:, leakage_cols].head()

Unnamed: 0,claim_status,denial_reason,paid_amount
0,Denied,Claim Billed to Wrong Payer,0.0
1,Paid,,736.05
2,Paid,,2676.12
3,Paid,,1861.39
4,Paid,,1046.99


In [9]:
# Model input columns: raw numeric fields, the engineered flags/buckets built
# in the cells above, and the raw categorical fields.
FEATURE_COLUMNS = [
    "billed_amount",
    "length_of_stay",
    "long_stay_flag",
    "age",
    "age_bucket",
    "readmitted_flag",
    "high_risk_insurance_flag",
    "has_diagnosis",
    "years_experience",
    "low_experience_provider",
    "insurance_type",
    "visit_type",
    "department_x",
    "admission_type"
]

TARGET_COLUMN = "is_denied"

# Enforce the leakage rule instead of relying on inspection: fail loudly if an
# outcome-derived column (or the target itself) is ever added to the features.
leaked_features = sorted(set(FEATURE_COLUMNS) & set(leakage_cols))
assert not leaked_features, f"Leakage columns used as features: {leaked_features}"
assert TARGET_COLUMN not in FEATURE_COLUMNS, "Target column must not be listed as a feature"

# Modelling table: the feature columns followed by the target as the last column.
df_model = df[FEATURE_COLUMNS + [TARGET_COLUMN]]

df_model.head()


Unnamed: 0,billed_amount,length_of_stay,long_stay_flag,age,age_bucket,readmitted_flag,high_risk_insurance_flag,has_diagnosis,years_experience,low_experience_provider,insurance_type,visit_type,department_x,admission_type,is_denied
0,1971.52,0.0,0,76,elder,0,0,1,7,0,BCBS,Outpatient,Pulmonology,,1
1,1243.8,0.0,0,61,senior,0,0,1,8,0,Medicare,Emergency,Emergency Department,Emergency,0
2,4854.11,0.0,0,4,child,0,0,1,17,0,BCBS,Outpatient,Urology,,0
3,2638.21,0.0,0,15,child,0,0,1,20,0,BCBS,Outpatient,Emergency Department,,0
4,1046.99,2.0,0,72,elder,0,0,1,6,0,BCBS,Inpatients,Gastroenterology,,0


- This defines the contract for training and inference
- Same list will be reused in feature_pipeline.py

In [10]:
# Persist the modelling table (features + target) as CSV, without the row index.
OUTPUT_PATH = Path("../data/processed/claims_features.csv")

df_model.to_csv(path_or_buf=OUTPUT_PATH, index=False)

print("Feature dataset saved successfully.")


Feature dataset saved successfully.
