# In [36]:
import pandas as pd
import numpy as np
import os

# In [None]:
# Seed NumPy's global random state so the dataset is reproducible across runs.
np.random.seed(42)
n = 1500  # number of simulated bills

# Candidate values and their sampling weights (each weight list sums to 1).
payer_types = [
    'Medicare', 'Medicaid', 'BCBS', 'Aetna',
    'Other_Commercial', 'Workers_Compensation', 'No_Fault', 'Self_Pay',
]
payer_weights = [0.2, 0.05, 0.2, 0.1, 0.2, 0.1, 0.1, 0.05]

balance_amounts = [100, 300, 500, 800, 1000, 1200, 1500, 1800, 2100, 2500]
balance_weights = [0.2, 0.15, 0.1, 0.1, 0.1, 0.1, 0.1, 0.05, 0.05, 0.05]

# The draws happen in this order (payer, balance, group); reordering them
# would change the values produced for the fixed seed.
payer_column = np.random.choice(payer_types, n, p=payer_weights)
balance_column = np.random.choice(balance_amounts, n, p=balance_weights)
group_column = np.random.choice(['A', 'B'], n)  # uniform split between groups

# One row per bill; Bill_id runs from 1 to n inclusive.
data = pd.DataFrame({
    'Bill_id': range(1, n + 1),
    'Payer_type': payer_column,
    'Balance_amount': balance_column,
    'Follow_up_Group': group_column,
})

# Simulate payment status and time based on group and balance
def simulate_payment(row):
    """Simulate whether a bill gets paid and, if so, after how many days.

    The payment probability and the range of days-to-payment depend on the
    row's ``Follow_up_Group`` and ``Balance_amount``:

    * group ``'B'`` with balance below 500: 60% paid, 1-14 days
      (portal follow-up on low balances);
    * group ``'A'`` with balance above 1000: 55% paid, 5-29 days
      (call follow-up on high balances);
    * anything else: 40% paid, 10-39 days.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing
            ``'Follow_up_Group'`` and ``'Balance_amount'``.

    Returns:
        A two-element ``pd.Series``: ``[paid, days]`` where ``paid`` is a
        boolean and ``days`` is an integer, or ``np.nan`` when unpaid.

    Note:
        Draws from NumPy's global random state: one ``rand()`` call, followed
        by one ``randint()`` call only when the bill is paid.
    """
    group = row['Follow_up_Group']

    # Pick (payment probability, min days, exclusive max days) for this row.
    if group == 'B' and row['Balance_amount'] < 500:
        pay_rate, day_low, day_high = 0.6, 1, 15
    elif group == 'A' and row['Balance_amount'] > 1000:
        pay_rate, day_low, day_high = 0.55, 5, 30
    else:
        pay_rate, day_low, day_high = 0.4, 10, 40

    paid = np.random.rand() < pay_rate
    if paid:
        # randint's upper bound is exclusive.
        days = np.random.randint(day_low, day_high)
    else:
        days = np.nan  # unpaid bills have no payment time
    return pd.Series([paid, days])

# Simulate a payment outcome for every bill. The row-wise apply yields a
# two-column frame: column 0 is the paid flag, column 1 the days to payment.
payment_outcome = data.apply(simulate_payment, axis=1)
data['Payment_status'] = payment_outcome[0].map({True:'Paid', False:'Unpaid'})
data['Payment_time_days'] = payment_outcome[1]

follow_up_group = data['Follow_up_Group']

# Cost per collection: group 'A' (calls) draws from 5-10, everything else
# from 1-3, so calls cost more.
data['cost_per_collection'] = follow_up_group.apply(
    lambda grp: np.random.uniform(5, 10) if grp == 'A' else np.random.uniform(1, 3)
)

# Additional engagement metrics: group 'A' gets 1-3 call attempts and group
# 'B' gets 0-4 portal clicks; the other group's metric is fixed at 0.
data['call_attempts'] = follow_up_group.apply(
    lambda grp: np.random.randint(1, 4) if grp == 'A' else 0
)
data['portal_clicks'] = follow_up_group.apply(
    lambda grp: np.random.randint(0, 5) if grp == 'B' else 0
)

# Make sure the output folder exists, then write the dataset without the
# DataFrame index column.
os.makedirs('data', exist_ok=True)
data.to_csv('data/demo_medical_bills.csv', index=False)

print("File saved successfully")