<a href="https://colab.research.google.com/github/Aadhavan-27/vancomycin-dose-prediction/blob/main/Synthetic_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# generate_synthetic_vanco.py
# Requirements: pandas, numpy
# Run: python generate_synthetic_vanco.py

import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# Generates a synthetic vancomycin therapeutic-drug-monitoring dataset:
# random demographics/labs -> Cockcroft-Gault creatinine clearance ->
# one-compartment steady-state trough -> labelled DataFrame written to CSV.
# All constants are illustrative for simulation purposes, not clinical guidance.

np.random.seed(42)  # fixed seed so every run reproduces the same dataset

n = 10000  # number of synthetic patients

# Demographics
age = np.random.randint(18, 90, n)  # integer years in [18, 89]
sex = np.random.choice(["Male", "Female"], n, p=[0.52, 0.48])
weight = np.round(np.random.normal(70, 15, n).clip(40, 140), 1)  # kg

# Renal function (serum creatinine)
serum_creatinine = np.round(np.random.normal(1.0, 0.4, n).clip(0.4, 5.0), 2)  # mg/dL

# Cockcroft-Gault creatinine clearance (mL/min), with the 0.85 factor for females
CLcr = ((140 - age) * weight) / (72 * serum_creatinine)
CLcr = np.where(sex == "Female", CLcr * 0.85, CLcr)
CLcr = np.round(CLcr.clip(5, 200), 2)

# Liver function markers
ALT = np.round(np.random.normal(25, 12, n).clip(5, 300), 1)  # U/L
AST = np.round(np.random.normal(22, 10, n).clip(5, 250), 1)  # U/L
albumin = np.round(np.random.normal(3.8, 0.4, n).clip(1.5, 5.5), 2)  # g/dL

# Clinical flags / comorbidities
ckd_flag = (CLcr < 60).astype(int)              # simple CKD flag
nephrotoxic_med = np.random.choice([0, 1], n, p=[0.85, 0.15])
sepsis_flag = np.random.choice([0, 1], n, p=[0.9, 0.1])

# Dosing information
dose_mg = np.random.choice([500, 750, 1000, 1250, 1500], n, p=[0.2, 0.15, 0.4, 0.15, 0.1])
dosing_interval = np.random.choice([8, 12, 24], n, p=[0.2, 0.6, 0.2])  # hr
infusion_time_hr = np.round(np.random.uniform(0.5, 3.0, n), 2)

# Pharmacokinetic parameter approximations
Vd_L = np.round(0.7 * weight + np.random.normal(0, 2, n), 2)  # L

# Vancomycin clearance (L/hr) as a linear function of creatinine clearance.
# The previous scaling, (0.004 + 0.0009 * (CLcr / 60)) * 60, divided the renal
# term by 60 and then multiplied it back, giving ~0.3 L/hr; CL / Vd then fell
# below the 0.01 clip floor for most patients, so kel was effectively constant
# and renal function barely affected the simulated trough.
# Coefficients follow the linear relationship reported by Matzke et al. (1984):
# CL_vanco [mL/min] = 3.66 + 0.689 * CLcr [mL/min].
VANCO_CL_INTERCEPT_ML_MIN = 3.66
VANCO_CL_SLOPE = 0.689
ML_MIN_TO_L_HR = 60 / 1000  # mL/min -> L/hr

# Concurrent nephrotoxic drugs reduce clearance by 20%. The factor is applied to
# clearance (not to the concentration, where 0.8 would *lower* the trough).
nephrotoxic_factor = np.where(nephrotoxic_med == 1, 0.8, 1.0)

CLvanco_L_hr = (
    (VANCO_CL_INTERCEPT_ML_MIN + VANCO_CL_SLOPE * CLcr) * ML_MIN_TO_L_HR * nephrotoxic_factor
)
kel = (CLvanco_L_hr / Vd_L).clip(0.01, 0.5)           # 1/hr
kel = np.round(kel, 4)

# Time since last dose (simulate near-trough draws): uniform over the last two
# hours of the interval, never earlier than 0.5 h after the dose.
earliest_draw = np.maximum(0.5, dosing_interval - 2)
time_since_last_dose = np.round(np.random.uniform(earliest_draw, dosing_interval), 2)

# Compute steady-state trough approximation using one-compartment infusion formula
tau = dosing_interval
t_inf = infusion_time_hr

# 1 - exp(-x) is evaluated as -expm1(-x) so it stays accurate for small x.
# kel >= 0.01 and t_inf >= 0.5, so the denominator below is never zero.
term_inf = -np.expm1(-kel * t_inf) / (kel * t_inf)
# Peak at end of infusion at steady state: bolus-equivalent peak (dose / Vd),
# corrected for elimination during the infusion, times the accumulation factor.
C_peak_ss = (dose_mg / Vd_L) * term_inf / -np.expm1(-kel * tau)
# NOTE(review): the level is evaluated at the end of the interval (tau), not at
# time_since_last_dose, so that column does not influence the simulated value.
C_trough_ss = C_peak_ss * np.exp(-kel * (tau - t_inf))

# Apply concentration-level modifiers: albumin (binding) and sepsis.
albumin_factor = 1 + (3.5 - albumin) * 0.05            # low albumin slightly increases free drug
# NOTE(review): sepsis is modelled as a direct 20% increase of the trough rather
# than as a change in Vd or clearance - confirm this is the intended effect.
sepsis_factor = np.where(sepsis_flag == 1, 1.2, 1.0)

C_trough_ss = C_trough_ss * albumin_factor * sepsis_factor

# Add measurement noise and clip
C_trough_ss = C_trough_ss + np.random.normal(0, 1.5, n)
C_trough_ss = np.round(np.clip(C_trough_ss, 0.1, 200), 2)

# Categorize (example therapeutic trough proxy): (0, 10], (10, 20], (20, inf)
category = pd.cut(C_trough_ss, bins=[0, 10, 20, np.inf], labels=["Subtherapeutic", "Therapeutic", "Toxic"])

# Timestamps (admission and sample times): admission is a random day in 2022;
# the sample time is the admission time plus time_since_last_dose hours.
start_dates = [datetime(2022, 1, 1) + timedelta(days=int(x)) for x in np.random.randint(0, 365, n)]
admission_time = [sd.isoformat() for sd in start_dates]
sample_time = [(sd + timedelta(hours=float(ts))).isoformat() for sd, ts in zip(start_dates, time_since_last_dose)]

# Assemble DataFrame
df = pd.DataFrame({
    "patient_id": np.arange(1, n+1),
    "age_yrs": age,
    "sex": sex,
    "weight_kg": weight,
    "serum_creatinine_mg_dL": serum_creatinine,
    "creatinine_clearance_mL_min": CLcr,
    "ckd_flag": ckd_flag,
    "nephrotoxic_med_flag": nephrotoxic_med,
    "sepsis_flag": sepsis_flag,
    "alt_U_L": ALT,
    "ast_U_L": AST,
    "albumin_g_dL": albumin,
    "dose_mg": dose_mg,
    "dosing_interval_hr": dosing_interval,
    "infusion_time_hr": infusion_time_hr,
    "time_since_last_dose_hr": time_since_last_dose,
    "Vd_L": Vd_L,
    "kel_1_per_hr": kel,
    "simulated_trough_mg_L": C_trough_ss,
    "category": category,
    "admission_time": admission_time,
    "sample_time": sample_time
})

# Save CSV (written relative to the current working directory)
out_path = "enhanced_synthetic_vancomycin_10000.csv"
df.to_csv(out_path, index=False)
print(f"Saved synthetic dataset to: {out_path}")
print(df.head(8).to_string(index=False))


Saved synthetic dataset to: enhanced_synthetic_vancomycin_10000.csv
 patient_id  age_yrs    sex  weight_kg  serum_creatinine_mg_dL  creatinine_clearance_mL_min  ckd_flag  nephrotoxic_med_flag  sepsis_flag  alt_U_L  ast_U_L  albumin_g_dL  dose_mg  dosing_interval_hr  infusion_time_hr  time_since_last_dose_hr  Vd_L  kel_1_per_hr  simulated_trough_mg_L category      admission_time         sample_time
          1       69 Female       98.2                    1.56                        52.76         1                     1            0     30.2     34.8          4.46     1250                  12              0.79                    11.92 67.49        0.0100                 111.97    Toxic 2022-09-16T00:00:00 2022-09-16T11:55:12
          2       32   Male       91.4                    0.75                       182.80         0                     0            0     23.5     22.9          4.36      500                  12              2.62                    10.09 63.00        0.0100      