In [1]:
import os
os.makedirs("../data", exist_ok=True)


In [2]:
# Hospital Readmissions: Synthetic Data Generator 

import os
import numpy as np
import pandas as pd
from faker import Faker

# -----------------------------
# Config
N = 10_000                     # number of rows
RANDOM_SEED = 42               # reproducibility
np_random = np.random.default_rng(RANDOM_SEED)
fake = Faker()

# Ensure ../data exists (run this notebook from /healthcare-analytics/notebooks)
os.makedirs("../data", exist_ok=True)

# -----------------------------
# Categorical domains + weights
diagnoses = ["I10","E11","J44","I21","N18","F32","K21","M54","A41","C50"]   # common ICD-10s
diag_w    = np.array([0.13,0.12,0.10,0.08,0.07,0.10,0.10,0.12,0.08,0.10])

admission_types = ["Emergency","Urgent","Elective","Newborn"]
adm_w           = np.array([0.55, 0.20, 0.23, 0.02])

regions = ["Northeast","Midwest","South","West"]
region_w = np.array([0.22, 0.23, 0.33, 0.22])

insurance = ["Private","Medicare","Medicaid","Uninsured"]
ins_w     = np.array([0.45, 0.30, 0.22, 0.03])

# Primary language distribution (roughly US-like)
languages   = ["English","Spanish","Chinese","Vietnamese","Arabic","Tagalog","Other"]
lang_w      = np.array([0.73,     0.15,      0.03,      0.02,        0.02,    0.02,    0.03])


# Generate base columns
patient_ids = np.arange(1, N+1)

# Age: mixture (younger + older cohort)
age = np.clip(
    np_random.normal(38, 10, N).round().astype(int) * (np_random.random(N) < 0.55) +
    np_random.normal(71, 8,  N).round().astype(int) * (np_random.random(N) >= 0.55),
    0, 95
)

gender = np_random.choice(["Female","Male","Other"], size=N, p=[0.49, 0.49, 0.02])
dx     = np_random.choice(diagnoses,      size=N, p=diag_w/diag_w.sum())
adm    = np_random.choice(admission_types,size=N, p=adm_w/adm_w.sum())
region = np_random.choice(regions,        size=N, p=region_w/region_w.sum())
ins    = np_random.choice(insurance,      size=N, p=ins_w/ins_w.sum())
lang   = np_random.choice(languages,      size=N, p=lang_w/lang_w.sum())

# Comorbidities (skewed low, capped)
comorbid = np.clip(np_random.poisson(1.2, N), 0, 8)

# Length of stay (log-normal, adjusted by admission type)
base_los = np_random.lognormal(mean=1.2, sigma=0.6, size=N)   # ~right-skewed
base_los *= np.where(adm=="Emergency", 1.25, 1.0)
base_los *= np.where(adm=="Elective",  0.85, 1.0)
los = np.clip(np.round(base_los, 1), 0.5, 45.0)

# Admission dates (last 18 months)
start_date   = pd.Timestamp.today().normalize() - pd.DateOffset(months=18)
admit_dates  = pd.to_datetime(np_random.integers(0, 18*30, N), unit="D", origin=start_date).normalize()

# Readmission (logistic model)

# Diagnosis risk adjustments
dx_risk = {"I10":0.10,"E11":0.18,"J44":0.25,"I21":0.22,"N18":0.24,
           "F32":0.12,"K21":0.11,"M54":0.10,"A41":0.28,"C50":0.20}
dx_adj = np.vectorize(dx_risk.get)(dx)

# Feature scaling
x_age = (age - 60)/12.0
x_com = (comorbid - 1.5)/1.5
x_los = (los - 3.0)/2.0

# Admission-type bump
adm_bump = np.where(adm=="Emergency", 0.25, np.where(adm=="Urgent", 0.10, -0.05))

# Language effect (non-English modestly higher odds due to comms/follow-up barriers)
lang_bump = np.where(lang=="English", 0.0, 0.15)

def sigmoid(z): 
    return 1.0/(1.0 + np.exp(-z))

# Linear predictor
z = -1.0 + 0.35*x_age + 0.45*x_com + 0.25*x_los + adm_bump + dx_adj + lang_bump
p_readmit = sigmoid(z)
readmitted = (np_random.random(N) < p_readmit).astype(int)

# -----------------------------
# Cost model
dx_cost_mult = {"I10":1.00,"E11":1.10,"J44":1.25,"I21":1.35,"N18":1.30,
                "F32":0.90,"K21":0.95,"M54":0.85,"A41":1.50,"C50":1.40}
reg_cost_mult = {"Northeast":1.08,"Midwest":0.98,"South":0.95,"West":1.05}

dx_m = np.vectorize(dx_cost_mult.get)(dx)
rg_m = np.vectorize(reg_cost_mult.get)(region)

# Base ~ $1200 per day, scales with comorbidities; add multiplicative noise
cost_base = 1200.0 * los * (1 + 0.08*comorbid)
noise = np_random.lognormal(mean=0.0, sigma=0.25, size=N)
cost = np.clip(cost_base * dx_m * rg_m * noise, 300.0, None).round(2)

# -----------------------------
# Assemble DataFrame
df = pd.DataFrame({
    "Patient_ID": patient_ids,
    "Age": age,
    "Gender": gender,
    "Primary_Language": lang,
    "Diagnosis_Code": dx,
    "Admission_Type": adm,
    "Length_of_Stay": los,
    "Readmitted": readmitted,          # 0/1
    "Cost_of_Stay": cost,
    "Hospital_Region": region,
    "Insurance_Type": ins,
    "Comorbidities_Count": comorbid,
    "Admission_Date": admit_dates
})

# -----------------------------
# Save + quick sanity checks
out_path = "../data/hospital_readmissions.csv"
df.to_csv(out_path, index=False)

print("Rows, Cols:", df.shape)
print("Overall readmission rate:", round(df['Readmitted'].mean(), 4))
print("\nReadmission rate by language (top 10 shown):")
print(df.groupby('Primary_Language')['Readmitted'].mean().sort_values(ascending=False).round(4).head(10))
print(f"\nSaved clean dataset to: {out_path}")


Rows, Cols: (10000, 13)
Overall readmission rate: 0.3406

Readmission rate by language (top 10 shown):
Primary_Language
Tagalog       0.3866
Arabic        0.3850
Other         0.3811
Spanish       0.3528
Chinese       0.3450
English       0.3348
Vietnamese    0.3000
Name: Readmitted, dtype: float64

Saved clean dataset to: ../data/hospital_readmissions.csv
