We first generate medically structured seed data using clinical rules,
then use generative models (SynthCity) to scale it while preserving distributions.

In [1]:
import numpy as np
import pandas as pd

In [2]:
N = 2000  # seed dataset size

risk_levels = np.random.choice(
    [0, 1, 2],
    size=N,
    p=[0.60, 0.25, 0.15]
)

df = pd.DataFrame({
    "risk_level": risk_levels
})

df["risk_level"].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
risk_level,Unnamed: 1_level_1
0,0.5955
1,0.2655
2,0.139


In [3]:
import random

def bernoulli(p):
    return 1 if random.random() < p else 0

maternal_rows = []

for r in df["risk_level"]:
    if r == 0:   # LOW
        prom_p = bernoulli(0.15)
        fever_p = bernoulli(0.08)
        row = {
            "prom_present": prom_p,
            "chorioamnionitis": bernoulli(0.05),
            "foul_smelling_liquor": bernoulli(0.05),
            "prolonged_labor": bernoulli(0.10),
            "unbooked_pregnancy": bernoulli(0.15),
            "maternal_uti_sti": bernoulli(0.08),
            "meconium_stained_liquor": bernoulli(0.10),
            "cotwin_iud": bernoulli(0.02),
            "pv_examinations_count": random.randint(1,2)
        }
        row["prom_duration_hours"] = random.uniform(2,10) if prom_p else 0
        row["maternal_fever_celsius"] = random.uniform(37.5,38.2) if fever_p else random.uniform(36.5,37.4)

    elif r == 1: # MODERATE
        prom_p = bernoulli(0.40)
        fever_p = bernoulli(0.30)
        row = {
            "prom_present": prom_p,
            "chorioamnionitis": bernoulli(0.25),
            "foul_smelling_liquor": bernoulli(0.20),
            "prolonged_labor": bernoulli(0.35),
            "unbooked_pregnancy": bernoulli(0.40),
            "maternal_uti_sti": bernoulli(0.25),
            "meconium_stained_liquor": bernoulli(0.30),
            "cotwin_iud": bernoulli(0.08),
            "pv_examinations_count": random.randint(2,4)
        }
        row["prom_duration_hours"] = random.uniform(6,18) if prom_p else 0
        row["maternal_fever_celsius"] = random.uniform(38.0,38.8) if fever_p else random.uniform(36.5,37.4)

    else:       # HIGH
        prom_p = bernoulli(0.75)
        fever_p = bernoulli(0.65)
        row = {
            "prom_present": prom_p,
            "chorioamnionitis": bernoulli(0.60),
            "foul_smelling_liquor": bernoulli(0.55),
            "prolonged_labor": bernoulli(0.65),
            "unbooked_pregnancy": bernoulli(0.70),
            "maternal_uti_sti": bernoulli(0.55),
            "meconium_stained_liquor": bernoulli(0.60),
            "cotwin_iud": bernoulli(0.15),
            "pv_examinations_count": random.randint(3,6)
        }
        row["prom_duration_hours"] = random.uniform(12,36) if prom_p else 0
        row["maternal_fever_celsius"] = random.uniform(38.5,40.0) if fever_p else random.uniform(36.5,37.4)

    maternal_rows.append(row)

maternal_df = pd.DataFrame(maternal_rows)
df = pd.concat([df, maternal_df], axis=1)

df.head()


Unnamed: 0,risk_level,prom_present,chorioamnionitis,foul_smelling_liquor,prolonged_labor,unbooked_pregnancy,maternal_uti_sti,meconium_stained_liquor,cotwin_iud,pv_examinations_count,prom_duration_hours,maternal_fever_celsius
0,1,0,0,0,1,0,1,0,0,3,0.0,36.55541
1,2,0,1,0,1,0,0,0,0,4,0.0,36.687441
2,1,1,0,0,0,1,1,0,0,4,11.531519,37.370933
3,0,0,0,0,0,0,0,0,0,1,0.0,36.98452
4,2,1,0,0,1,1,1,1,0,6,25.514795,39.098882


In [4]:
df.groupby("risk_level")["prom_present"].mean()
df.groupby("risk_level")["maternal_fever_celsius"].mean()


Unnamed: 0_level_0,maternal_fever_celsius
risk_level,Unnamed: 1_level_1
0,37.026882
1,37.350209
2,38.383578


In [5]:
def categorical_choice(probs):
    return np.random.choice(len(probs), p=probs)

neonatal_rows = []

for i, row in df.iterrows():
    r = row["risk_level"]

    maternal_push = 0
    if row["maternal_fever_celsius"] > 38 or row["chorioamnionitis"] == 1:
        maternal_push = 0.3

    # ----- Temperature -----
    if r == 0:
        temp = random.uniform(36.5, 37.5)
    elif r == 1:
        temp = random.uniform(36.0, 38.0)
    else:
        temp = random.uniform(35.5, 39.5)

    temp += maternal_push * random.uniform(0.2, 0.6)
    temp = round(temp, 2)

    # ----- Feeding -----
    if r == 0:
        feeding = categorical_choice([0.80, 0.15, 0.05])
    elif r == 1:
        feeding = categorical_choice([0.45, 0.35, 0.20])
    else:
        feeding = categorical_choice([0.20, 0.40, 0.40])

    # ----- Activity -----
    if r == 0:
        activity = categorical_choice([0.80, 0.15, 0.05])
    elif r == 1:
        activity = categorical_choice([0.40, 0.35, 0.25])
    else:
        activity = categorical_choice([0.15, 0.35, 0.50])

    # ----- Respiratory Distress -----
    if r == 0:
        resp = bernoulli(0.08)
    elif r == 1:
        resp = bernoulli(0.35)
    else:
        resp = bernoulli(0.75)

    # ----- Apnea -----
    if r == 0:
        apnea = bernoulli(0.03)
    elif r == 1:
        apnea = bernoulli(0.20)
    else:
        apnea = bernoulli(0.55)

    # ----- Shock -----
    if r == 0:
        shock = bernoulli(0.02)
    elif r == 1:
        shock = bernoulli(0.15)
    else:
        shock = bernoulli(0.45)

    # ----- Heart Rate -----
    if r == 0:
        hr = random.randint(120,160)
    elif r == 1:
        hr = random.randint(110,180)
    else:
        hr = random.randint(90,210)

    if shock or resp:
        hr += random.randint(-15, 20)

    neonatal_rows.append({
        "temperature_celsius": temp,
        "feeding_status": feeding,
        "activity_level": activity,
        "respiratory_distress": resp,
        "heart_rate_bpm": hr,
        "apnea_present": apnea,
        "shock_present": shock
    })

neonatal_df = pd.DataFrame(neonatal_rows)
df = pd.concat([df, neonatal_df], axis=1)

df.head()


Unnamed: 0,risk_level,prom_present,chorioamnionitis,foul_smelling_liquor,prolonged_labor,unbooked_pregnancy,maternal_uti_sti,meconium_stained_liquor,cotwin_iud,pv_examinations_count,prom_duration_hours,maternal_fever_celsius,temperature_celsius,feeding_status,activity_level,respiratory_distress,heart_rate_bpm,apnea_present,shock_present
0,1,0,0,0,1,0,1,0,0,3,0.0,36.55541,37.81,1,0,0,144,1,0
1,2,0,1,0,1,0,0,0,0,4,0.0,36.687441,39.34,2,2,1,142,0,1
2,1,1,0,0,0,1,1,0,0,4,11.531519,37.370933,37.61,2,0,0,148,0,1
3,0,0,0,0,0,0,0,0,0,1,0.0,36.98452,37.43,0,2,1,153,0,0
4,2,1,0,0,1,1,1,1,0,6,25.514795,39.098882,37.42,1,2,1,157,0,1


In [6]:
df.groupby("risk_level")["respiratory_distress"].mean()
df.groupby("risk_level")["feeding_status"].mean()
df.groupby("risk_level")["temperature_celsius"].mean()


Unnamed: 0_level_0,temperature_celsius
risk_level,Unnamed: 1_level_1
0,37.018556
1,37.048795
2,37.682266


In [7]:
hss_rows = []

for i, row in df.iterrows():

    severity = row["risk_level"]
    if row["shock_present"] == 1:
        severity += 1
    if row["apnea_present"] == 1:
        severity += 1
    if row["respiratory_distress"] == 1:
        severity += 1

    if severity == 0:
        p = 0.05
    elif severity == 1:
        p = 0.15
    elif severity == 2:
        p = 0.30
    elif severity == 3:
        p = 0.50
    else:
        p = 0.70

    tlc = bernoulli(p)
    anc = bernoulli(p)
    it_ratio = bernoulli(p + 0.05)
    im_ratio = bernoulli(p + 0.05)
    platelet = bernoulli(p + 0.10)
    neut_deg = bernoulli(p)
    nrbc = bernoulli(p)

    hss_score = tlc + anc + it_ratio + im_ratio + platelet + neut_deg + nrbc

    hss_rows.append({
        "hss_tlc_abnormal": tlc,
        "hss_anc_abnormal": anc,
        "hss_it_ratio_high": it_ratio,
        "hss_im_ratio_high": im_ratio,
        "hss_platelet_low": platelet,
        "hss_neutrophil_degeneration": neut_deg,
        "hss_nrbc_elevated": nrbc,
        "hss_score": hss_score
    })

hss_df = pd.DataFrame(hss_rows)
df = pd.concat([df, hss_df], axis=1)

df[["risk_level","hss_score"]].groupby("risk_level").mean()


Unnamed: 0_level_0,hss_score
risk_level,Unnamed: 1_level_1
0,0.647355
1,2.092279
2,4.453237


In [8]:
def apgar_component(probs):
    return np.random.choice([0,1,2], p=probs)

apgar_rows = []

for i, row in df.iterrows():
    r = row["risk_level"]

    # ---------- Base probabilities (1 min) ----------
    if r == 0:
        base_probs = [0.05, 0.15, 0.80]
    elif r == 1:
        base_probs = [0.20, 0.35, 0.45]
    else:
        base_probs = [0.45, 0.40, 0.15]

    # ---------- Severity penalty ----------
    penalty = 0
    if row["respiratory_distress"] == 1:
        penalty += 0.10
    if row["apnea_present"] == 1:
        penalty += 0.10
    if row["shock_present"] == 1:
        penalty += 0.15

    # Adjust probabilities safely
    p0 = min(base_probs[0] + penalty, 0.70)
    p2 = max(base_probs[2] - penalty, 0.05)
    p1 = 1 - (p0 + p2)
    probs_1min = [p0, p1, p2]

    # ---------- APGAR 1 min ----------
    a1 = apgar_component(probs_1min)
    p1c = apgar_component(probs_1min)
    g1 = apgar_component(probs_1min)
    ac1 = apgar_component(probs_1min)
    r1 = apgar_component(probs_1min)

    apgar1_total = a1 + p1c + g1 + ac1 + r1

    # ---------- APGAR 5 min (improvement) ----------
    def improve(x):
        if row["shock_present"] == 1:
            return x  # no improvement
        return min(x + np.random.choice([0,1], p=[0.4,0.6]), 2)

    a5 = improve(a1)
    p5c = improve(p1c)
    g5 = improve(g1)
    ac5 = improve(ac1)
    r5 = improve(r1)

    apgar5_total = a5 + p5c + g5 + ac5 + r5

    apgar_rows.append({
        "apgar1_appearance": a1,
        "apgar1_pulse": p1c,
        "apgar1_grimace": g1,
        "apgar1_activity": ac1,
        "apgar1_respiration": r1,
        "apgar1_total": apgar1_total,

        "apgar5_appearance": a5,
        "apgar5_pulse": p5c,
        "apgar5_grimace": g5,
        "apgar5_activity": ac5,
        "apgar5_respiration": r5,
        "apgar5_total": apgar5_total
    })

apgar_df = pd.DataFrame(apgar_rows)
df = pd.concat([df, apgar_df], axis=1)

df.groupby("risk_level")[["apgar1_total","apgar5_total"]].mean()


Unnamed: 0_level_0,apgar1_total,apgar5_total
risk_level,Unnamed: 1_level_1,Unnamed: 2_level_1
0,8.65827,9.257767
1,5.335217,6.785311
2,2.18705,3.593525


In [9]:
baseline_rows = []

for i, row in df.iterrows():
    r = row["risk_level"]

    # ----- Gestational age -----
    if r == 0:
        ga = np.random.choice(["<34", "34–36", "≥37"], p=[0.05,0.15,0.80])
    elif r == 1:
        ga = np.random.choice(["<34", "34–36", "≥37"], p=[0.15,0.30,0.55])
    else:
        ga = np.random.choice(["<34", "34–36", "≥37"], p=[0.35,0.35,0.30])

    # ----- Birth weight -----
    if r == 0:
        bw = np.random.choice(["<1500", "1500–2499", "≥2500"], p=[0.05,0.15,0.80])
    elif r == 1:
        bw = np.random.choice(["<1500", "1500–2499", "≥2500"], p=[0.15,0.35,0.50])
    else:
        bw = np.random.choice(["<1500", "1500–2499", "≥2500"], p=[0.40,0.35,0.25])

    # ----- Resuscitation -----
    apgar1 = row["apgar1_total"]
    if apgar1 >= 7:
        resus = bernoulli(0.05)
    elif apgar1 >= 4:
        resus = bernoulli(0.40)
    else:
        resus = bernoulli(0.80)

    # ----- Sex -----
    sex = np.random.choice(["male", "female"], p=[0.52,0.48])

    baseline_rows.append({
        "gestational_age_category": ga,
        "birth_weight_category": bw,
        "resuscitation_required": resus,
        "neonatal_sex": sex
    })

baseline_df = pd.DataFrame(baseline_rows)
df = pd.concat([df, baseline_df], axis=1)


In [10]:
mnrs_list = []

for i, d in df.iterrows():
    s = 0

    # ---- A. Antenatal / Peripartum ----

    if d["prom_present"] == 1 and d["prom_duration_hours"] >= 18:
        s += 3
    if d["chorioamnionitis"] == 1:
        s += 3
    if d["maternal_fever_celsius"] >= 38:
        s += 3

    if d["foul_smelling_liquor"] == 1:
        s += 2
    if d["pv_examinations_count"] >= 3:
        s += 2
    if d["meconium_stained_liquor"] == 1:
        s += 2
    if d["prolonged_labor"] == 1:
        s += 2

    if d["unbooked_pregnancy"] == 1:
        s += 1
    if d["maternal_uti_sti"] == 1:
        s += 1
    if d["cotwin_iud"] == 1:
        s += 1

    # ---- B. Neonatal baseline ----

    if d["gestational_age_category"] == "<34":
        s += 3
    elif d["gestational_age_category"] == "34–36":
        s += 2

    if d["birth_weight_category"] == "<1500":
        s += 3
    elif d["birth_weight_category"] == "1500–2499":
        s += 2

    if d["apgar5_total"] < 7:
        s += 3
    elif d["apgar1_total"] < 7:
        s += 2

    if d["resuscitation_required"] == 1:
        s += 3
    if d["neonatal_sex"] == "male":
        s += 1

    # ---- C. Early clinical (0–72h) ----

    if d["temperature_celsius"] < 36 or d["temperature_celsius"] > 38:
        s += 3

    if d["feeding_status"] >= 1 or d["activity_level"] == 2:
        s += 3

    if d["apnea_present"] == 1 or d["shock_present"] == 1:
        s += 3

    if d["heart_rate_bpm"] > 160:
        s += 2

    if d["respiratory_distress"] == 1:
        s += 2

    mnrs_list.append(s)

df["mnrs_score"] = mnrs_list


In [11]:
df["mnrs_score"].describe()
df.groupby("risk_level")["mnrs_score"].mean()


Unnamed: 0_level_0,mnrs_score
risk_level,Unnamed: 1_level_1
0,3.811083
1,15.286252
2,30.697842


In [12]:
import math

labels = []

for i, row in df.iterrows():

    # sigmoid around MNRS = 12
    base = 1 / (1 + math.exp(-(row["mnrs_score"] - 12) / 4))

    if row["hss_score"] >= 4:
        base += 0.10
    if row["shock_present"] == 1:
        base += 0.10

    prob = min(max(base, 0.05), 0.95)

    label = 1 if random.random() < prob else 0
    labels.append(label)

df["sepsis_label"] = labels


In [13]:
df.groupby("mnrs_score")["sepsis_label"].mean().head(10)
df.groupby("mnrs_score")["sepsis_label"].mean().tail(10)

df.groupby("risk_level")["sepsis_label"].mean()


Unnamed: 0_level_0,sepsis_label
risk_level,Unnamed: 1_level_1
0,0.125945
1,0.65725
2,0.94964


In [14]:
df_final = df.drop(columns=["risk_level"])

In [15]:
!pip install synthcity
from synthcity.plugins import Plugins







In [17]:
# Initialize CTGAN plugin
plugin = Plugins().get(
    "ctgan",
    n_iter=300,          # training iterations
    batch_size=512,
    lr=2e-4,
    generator_n_layers_hidden=2,
    discriminator_n_layers_hidden=2
)

# Define the target column
target_column = "sepsis_label"

# Fit on seed dataset
plugin.fit(
    df_final,
    target_column=target_column
)

[2026-01-29T18:32:04.974232+0000][38752][CRITICAL] load failed: module 'synthcity.plugins.generic.plugin_great' has no attribute 'plugin'
[2026-01-29T18:32:04.975781+0000][38752][CRITICAL] load failed: module 'synthcity.plugins.generic.plugin_great' has no attribute 'plugin'
[2026-01-29T18:32:04.976877+0000][38752][CRITICAL] module plugin_great load failed
[2026-01-29T18:32:04.978870+0000][38752][CRITICAL] module disabled: /usr/local/lib/python3.12/dist-packages/synthcity/plugins/generic/plugin_goggle.py
100%|██████████| 300/300 [08:14<00:00,  1.65s/it]


<synthcity.plugins.generic.plugin_ctgan.CTGANPlugin at 0x7c4cfaebfe30>

In [19]:
N_SYNTH = 20000

synthetic_df = plugin.generate(N_SYNTH)

synthetic_df.dataframe().head()

Unnamed: 0,prom_present,chorioamnionitis,foul_smelling_liquor,prolonged_labor,unbooked_pregnancy,maternal_uti_sti,meconium_stained_liquor,cotwin_iud,pv_examinations_count,prom_duration_hours,...,apgar5_grimace,apgar5_activity,apgar5_respiration,apgar5_total,gestational_age_category,birth_weight_category,resuscitation_required,neonatal_sex,mnrs_score,sepsis_label
0,1,0,0,0,0,0,1,0,2,6.236439,...,2,2,2,9,≥37,≥2500,0,female,7,0
1,0,1,1,0,1,1,1,0,3,0.040409,...,0,2,1,2,34–36,1500–2499,0,male,33,1
2,0,0,0,0,1,0,0,0,2,0.0,...,1,2,2,9,34–36,≥2500,0,male,7,0
3,1,0,0,1,0,0,0,0,2,6.103503,...,2,2,1,9,≥37,≥2500,0,female,8,1
4,1,0,0,0,0,0,0,0,1,5.560599,...,2,1,2,10,≥37,≥2500,0,male,3,0


In [23]:
synthetic_df = synthetic_df.dataframe() # Convert to pandas DataFrame

# Shape check
synthetic_df.shape

# Label distribution
synthetic_df["sepsis_label"].mean()

# MNRS vs sepsis sanity
synthetic_df.groupby("mnrs_score")["sepsis_label"].mean().head()
synthetic_df.groupby("mnrs_score")["sepsis_label"].mean().tail()

# Risk proxies sanity
synthetic_df["temperature_celsius"].describe()
synthetic_df["hss_score"].describe()

Unnamed: 0,hss_score
count,20000.0
mean,1.302
std,1.661124
min,0.0
25%,0.0
50%,1.0
75%,2.0
max,7.0


In [25]:
synthetic_df.to_csv(
    "septoctor_training_dataset_20k.csv",
    index=False
)

print("Saved final dataset:", synthetic_df.shape)

Saved final dataset: (20000, 44)
