## Dataset Generation

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

# -----------------------------
# Setup
# -----------------------------
# Single seeded generator shared by every sampler in this script; because all
# draws come from it, results depend on the order in which it is called.
rng = np.random.default_rng(42)
N = 1000                   # number of synthetic rows to generate (≥500 rows)
TARGET_PREV = 0.35         # target cancer fraction (per-row probability of diagnosis == 1)

# -----------------------------
# Sampling helpers (true truncation)
# -----------------------------
def rtruncnorm(mean, sd, low, high, size):
    """Draw ``size`` samples from Normal(mean, sd) truncated to [low, high].

    True truncation via rejection sampling: out-of-range draws are discarded
    and redrawn, so no probability mass piles up at the bounds. All draws
    come from the module-level ``rng``.

    Args:
        mean: Mean of the untruncated normal.
        sd: Standard deviation of the untruncated normal.
        low: Inclusive lower bound.
        high: Inclusive upper bound.
        size: Number of samples to return.

    Returns:
        1-D float array of length ``size`` with every value in [low, high].

    Raises:
        ValueError: If ``low < high`` does not hold (empty, degenerate or
            NaN interval). No draw could ever be accepted, so the rejection
            loop would otherwise never terminate.
    """
    if not low < high:
        raise ValueError(f"low must be strictly less than high, got low={low}, high={high}")
    out = np.empty(size)
    filled = 0
    while filled < size:
        need = size - filled
        # Oversample 2x per pass so most calls finish in one or two rounds.
        draws = rng.normal(mean, sd, need * 2)
        keep = draws[(draws >= low) & (draws <= high)]
        # Use only as many accepted draws as are still missing.
        k = min(keep.size, need)
        out[filled:filled+k] = keep[:k]
        filled += k
    return out

def rtrunclognorm(median, sigma, low, high, size):
    """Draw ``size`` samples from a log-normal truncated to [low, high].

    The untruncated distribution has the given ``median`` (so its log-scale
    mean is ``log(median)``) and log-scale standard deviation ``sigma``.
    Truncation is done by rejection on the linear scale, so no mass piles up
    at the bounds. All draws come from the module-level ``rng``.

    Args:
        median: Median of the untruncated log-normal; must be positive.
        sigma: Standard deviation on the log scale.
        low: Inclusive lower bound on the linear scale.
        high: Inclusive upper bound on the linear scale.
        size: Number of samples to return.

    Returns:
        1-D float array of length ``size`` with every value in [low, high].

    Raises:
        ValueError: If ``median`` is not positive, if ``low < high`` does not
            hold, or if ``high`` is not positive. In each case no draw could
            ever be accepted and the rejection loop would never terminate.
    """
    if not median > 0:
        raise ValueError(f"median must be positive, got {median}")
    if not low < high:
        raise ValueError(f"low must be strictly less than high, got low={low}, high={high}")
    if not high > 0:
        raise ValueError(f"high must be positive for a log-normal, got {high}")
    mu = np.log(median)
    out = np.empty(size)
    filled = 0
    while filled < size:
        need = size - filled
        # Oversample 2x per pass so most calls finish in one or two rounds.
        draws = rng.lognormal(mean=mu, sigma=sigma, size=need*2)
        keep = draws[(draws >= low) & (draws <= high)]
        # Use only as many accepted draws as are still missing.
        k = min(keep.size, need)
        out[filled:filled+k] = keep[:k]
        filled += k
    return out

def sigmoid(x):
    """Return the logistic function 1 / (1 + e^(-x)), element-wise for arrays."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

# -----------------------------
# 0) Class label (prevalence)
# -----------------------------
# One uniform draw per row; rows falling below TARGET_PREV become cases (1),
# the rest controls (0).
diagnosis = (rng.random(N) < TARGET_PREV).astype(int)
case_idx = diagnosis == 1
ctrl_idx = ~case_idx

# -----------------------------
# 1) Age (class-conditional, truncated normal)
# -----------------------------
age = np.empty(N, dtype=float)
# Controls are sampled before cases so the shared generator is consumed in a
# fixed order. Tuple layout: (row mask, mean, sd); both groups share [18, 90].
for _mask, _mu, _sd in ((ctrl_idx, 45, 12), (case_idx, 58, 10)):
    age[_mask] = rtruncnorm(mean=_mu, sd=_sd, low=18, high=90, size=_mask.sum())

# -----------------------------
# 2) Menopause (age-conditioned Bernoulli)
# -----------------------------
# Probability of being post-menopausal is logistic in age: 0.5 at age 52 with a
# 3-year scale (about 0.73 at exactly 55). The ≈0.85 target for the age ≥ 55
# group is checked, and enforced if needed, by the adjustment below.
p_post = sigmoid((age - 52)/3.0)
menopausal_status = (rng.random(N) < p_post).astype(int)  # 1 = post-menopausal

# nudge P(post | age≥55) toward 0.85 within [0.80,0.90]
mask55 = age >= 55
if mask55.any():
    sub = menopausal_status[mask55]
    n55 = sub.size
    ones = sub.sum()                      # current post-menopausal count among age ≥ 55
    target = int(round(0.85 * n55))       # count corresponding to the 0.85 target
    # Only intervene when the count falls outside the tolerance band; the band
    # edges are truncated to whole counts by int().
    if ones < int(0.80 * n55) or ones > int(0.90 * n55):
        idx55 = np.where(mask55)[0]
        if ones < target:
            # Too few: flip randomly chosen 0s to 1 until the target count is reached.
            cand = idx55[menopausal_status[idx55] == 0]
            need = min(target - ones, cand.size)
            if need > 0:
                flip = rng.choice(cand, size=need, replace=False)
                menopausal_status[flip] = 1
        elif ones > target:
            # Too many: flip randomly chosen 1s to 0 until the target count is reached.
            cand = idx55[menopausal_status[idx55] == 1]
            need = min(ones - target, cand.size)
            if need > 0:
                flip = rng.choice(cand, size=need, replace=False)
                menopausal_status[flip] = 0

# -----------------------------
# 3) BMI (class-conditional, truncated normal)
# -----------------------------
bmi = np.empty(N, dtype=float)
# (row mask, mean, sd) per class; controls first to keep the draw order fixed.
for _mask, _mu, _sd in ((ctrl_idx, 26.5, 5.0), (case_idx, 27.3, 5.2)):
    bmi[_mask] = rtruncnorm(mean=_mu, sd=_sd, low=16, high=45, size=_mask.sum())

# -----------------------------
# 4) Parity (class-conditional Poisson, clipped to 0–8)
# -----------------------------
parity = np.empty(N, dtype=int)
# Values above 8 are clipped to 8 rather than redrawn.
for _mask, _lam in ((ctrl_idx, 2.0), (case_idx, 1.4)):
    parity[_mask] = np.clip(rng.poisson(lam=_lam, size=_mask.sum()), 0, 8)

# -----------------------------
# 5) Family history (class-conditional Bernoulli)
# -----------------------------
family_history = np.empty(N, dtype=int)
# (row mask, probability of a positive family history) per class.
for _mask, _p in ((ctrl_idx, 0.08), (case_idx, 0.22)):
    family_history[_mask] = (rng.random(_mask.sum()) < _p).astype(int)

# -----------------------------
# 6) MHT use (post-menopause only, class-conditional)
# -----------------------------
# Pre-menopausal rows keep the default 0; only post-menopausal rows are drawn.
mht_use = np.zeros(N, dtype=int)
post_ctrl = (menopausal_status == 1) & ctrl_idx
post_case = (menopausal_status == 1) & case_idx
# (row mask, probability of MHT use); controls first to keep the draw order fixed.
for _mask, _p in ((post_ctrl, 0.20), (post_case, 0.25)):
    mht_use[_mask] = (rng.random(_mask.sum()) < _p).astype(int)

# -----------------------------
# 7) BRCA status (class-conditional; FH-aware among cases)
# -----------------------------
brca_status = np.zeros(N, dtype=int)
# Controls: 0.2% carrier rate.
brca_status[ctrl_idx] = (rng.random(ctrl_idx.sum()) < 0.002).astype(int)
# Cases: 18% with a positive family history, 12% without.
p_brca_case = np.where(family_history == 1, 0.18, 0.12)
# N uniforms are drawn here, but only the entries at case rows are used.
draws = rng.random(N)
brca_status[case_idx] = (draws[case_idx] < p_brca_case[case_idx]).astype(int)

# adjust case BRCA prevalence toward 0.15 within [0.12,0.18]
case_ids = np.where(case_idx)[0]
if case_ids.size:
    rate = brca_status[case_ids].mean()
    # Only intervene when the realised rate is outside the tolerance band.
    if not (0.12 <= rate <= 0.18):
        target = 0.15
        pos = case_ids[brca_status[case_ids] == 1]
        neg = case_ids[brca_status[case_ids] == 0]
        target_pos = int(round(target * case_ids.size))  # carrier count matching the 0.15 target
        if target_pos > pos.size and neg.size:
            # Too few carriers: promote randomly chosen negatives.
            flip = rng.choice(neg, size=min(target_pos - pos.size, neg.size), replace=False)
            brca_status[flip] = 1
        elif target_pos < pos.size and pos.size:
            # Too many carriers: demote randomly chosen positives.
            flip = rng.choice(pos, size=min(pos.size - target_pos, pos.size), replace=False)
            brca_status[flip] = 0

# BRCA+ cases skew younger; resample menopause for those
brca_case = (brca_status == 1) & case_idx
n_shift = brca_case.sum()
if n_shift > 0:
    # Shift each BRCA+ case 6–10 years younger (integers' upper bound is
    # exclusive), keeping ages inside [18, 90].
    age[brca_case] = np.clip(age[brca_case] - rng.integers(6, 11, size=n_shift), 18, 90)
    # Redraw menopausal status for the shifted rows from the same age-logistic
    # used for the initial assignment.
    p_post2 = sigmoid((age - 52)/3.0)
    menopausal_status[brca_case] = (rng.random(n_shift) < p_post2[brca_case]).astype(int)
    post_ctrl = ctrl_idx & (menopausal_status == 1)
    post_case = case_idx & (menopausal_status == 1)
    # MHT is post-menopause only. Rows that just became pre-menopausal are not
    # covered by the post_* masks, so clear everything first; otherwise they
    # would keep a stale mht_use == 1 from the earlier draw.
    mht_use[:] = 0
    mht_use[post_ctrl] = (rng.random(post_ctrl.sum()) < 0.20).astype(int)
    mht_use[post_case] = (rng.random(post_case.sum()) < 0.25).astype(int)

# -----------------------------
# 8) Imaging (tumor size) conditional on diagnosis
# -----------------------------
# 0.0 encodes "no mass"; rows with a mass are overwritten below.
tumor_size_cm = np.zeros(N, dtype=float)

# Controls: 70% no mass; else benign simple cysts 1–5 cm
ctrl_ids = np.where(ctrl_idx)[0]
ctrl_has_cyst = rng.random(ctrl_ids.size) < 0.30
cyst_ids = ctrl_ids[ctrl_has_cyst]
tumor_size_cm[cyst_ids] = rtruncnorm(mean=3.0, sd=1.0, low=1.0, high=5.0, size=cyst_ids.size)

# Cases: right-skewed sizes from a log-normal with median 9.5 cm, truncated
# (out-of-range draws are resampled, not clipped) to 0.5–20
case_ids = np.where(case_idx)[0]
tumor_size_cm[case_ids] = rtrunclognorm(median=9.5, sigma=0.5, low=0.5, high=20.0, size=case_ids.size)

# keep zero-mass proportion in [0.60, 0.80]
if ctrl_ids.size:
    zero_rate = (tumor_size_cm[ctrl_ids] == 0).mean()
    if zero_rate < 0.60:
        # Too few no-mass controls: remove cysts from randomly chosen rows
        # until the no-mass count reaches 60% of controls.
        cyst_only = ctrl_ids[tumor_size_cm[ctrl_ids] > 0]
        need = int(np.ceil(0.60*ctrl_ids.size - (tumor_size_cm[ctrl_ids] == 0).sum()))
        if cyst_only.size and need > 0:
            flip = rng.choice(cyst_only, size=min(need, cyst_only.size), replace=False)
            tumor_size_cm[flip] = 0.0
    elif zero_rate > 0.80:
        # Too many no-mass controls: give randomly chosen rows a cyst drawn
        # from the same 1–5 cm distribution until the count drops to 80%.
        zeros = ctrl_ids[tumor_size_cm[ctrl_ids] == 0]
        need = int(np.ceil((tumor_size_cm[ctrl_ids] == 0).sum() - 0.80*ctrl_ids.size))
        if zeros.size and need > 0:
            flip = rng.choice(zeros, size=min(need, zeros.size), replace=False)
            tumor_size_cm[flip] = rtruncnorm(mean=3.0, sd=1.0, low=1.0, high=5.0, size=flip.size)

# -----------------------------
# 9) CA-125 conditional on diagnosis (with healthy-only hemodilution)
# -----------------------------
ca125 = np.zeros(N, dtype=float)

# Healthy baseline by menopause
pre_ctrl = ctrl_idx & (menopausal_status == 0)
post_ctrl = ctrl_idx & (menopausal_status == 1)
# Post-menopausal controls are drawn first, then pre-menopausal ones.
ca125[post_ctrl] = rtrunclognorm(median=14.0, sigma=0.50, low=2, high=2000, size=post_ctrl.sum())
ca125[pre_ctrl]  = rtrunclognorm(median=18.0, sigma=0.55, low=2, high=2000, size=pre_ctrl.sum())

# Healthy age trend
# Multiplier falls by 0.002 per year of age above 45 (rises below 45),
# clipped to [0.85, 1.15].
age_ctrl = age[ctrl_idx]
age_factor = np.clip(1.0 + (-0.002)*(age_ctrl - 45), 0.85, 1.15)
ca125[ctrl_idx] *= age_factor

# Healthy BMI hemodilution (apply last in healthy)
# Multiplier is 0.97 ** (BMI - 25), clipped to [0.75, 1.20].
# NOTE: after these two multipliers a control value can fall below the
# sampling floor of 2 used above.
delta_bmi_ctrl = bmi[ctrl_idx] - 25.0
hemo = np.clip(np.exp(delta_bmi_ctrl * np.log(0.97)), 0.75, 1.20)
ca125[ctrl_idx] *= hemo

# Healthy false-positive tail ≈ 2% >35 U/mL
# ctrl_ids is the control row-index array built in the imaging section.
if ctrl_ids.size:
    target = 0.02
    desired = int(np.floor(target * ctrl_ids.size))  # number of controls allowed above 35
    high = ctrl_ids[ca125[ctrl_ids] > 35]
    low  = ctrl_ids[ca125[ctrl_ids] <= 35]
    if high.size > desired:
        # Too many above 35: pull the surplus down into 20–34.
        drop_sel = rng.choice(high, size=high.size - desired, replace=False)
        ca125[drop_sel] = rng.uniform(20, 34, size=drop_sel.size)
    elif high.size < desired and low.size:
        # Too few above 35: push randomly chosen rows up into 36–60. The
        # selected rows are all ≤ 35, so the maximum always takes the new draw.
        add_sel = rng.choice(low, size=min(desired - high.size, low.size), replace=False)
        ca125[add_sel] = np.maximum(ca125[add_sel], rng.uniform(36, 60, size=add_sel.size))


# Cases: truncated log-normal baseline scaled by a tumor-size multiplier
beta_size = 0.03  # log-scale slope per cm of tumor size, centred at 9 cm
case_base = rtrunclognorm(median=80.0, sigma=0.80, low=2, high=2000, size=case_ids.size)
size_eff = np.exp((tumor_size_cm[case_ids] - 9.0) * beta_size)
ca125[case_ids] = np.clip(size_eff * case_base, 2, 2000)

# Guarantee that at least 35% of cases exceed 200 U/mL
over200 = (ca125[case_ids] > 200).sum()
need = int(np.ceil(0.35*case_ids.size - over200))
if need > 0:
    cand = case_ids[ca125[case_ids] <= 200]
    n_promote = min(need, cand.size)
    # need > 0 here, so n_promote is zero exactly when there are no candidates.
    if n_promote > 0:
        sel = rng.choice(cand, size=n_promote, replace=False)
        ca125[sel] = rng.uniform(210, 600, size=sel.size)

# -----------------------------
# 10) Ultrasound risk: primarily size-driven + moderate CA-125 correlation
# -----------------------------
# CA-125 enters on the log scale relative to the 35 U/mL cutoff; eps keeps the
# logarithm finite if a value is ever zero.
eps = 1e-6
log_ca = np.log(ca125/35.0 + eps)

# Logistic model: intercept, tumor-size weight, log-CA-125 weight.
alpha = -4.2
gamma_size = 0.35
gamma_ca = 0.40
risk_linear = alpha + gamma_size * tumor_size_cm + gamma_ca * log_ca
ultrasound_risk_score = sigmoid(risk_linear)

# Controls with no mass: risk is capped at 0.079
mask_no_mass = ctrl_idx & (tumor_size_cm == 0)
capped_risk = np.minimum(ultrasound_risk_score[mask_no_mass], 0.079)
ultrasound_risk_score[mask_no_mass] = capped_risk

# auto-tune weights to keep corr(size,risk)>0.5 and corr(CA-125,risk) in [0.3,0.5] within cases
def _spearman(a, b):
    a = pd.Series(a); b = pd.Series(b)
    return a.corr(b, method="spearman")

# Up to 6 tuning passes: within cases, push Spearman corr(size, risk) above 0.50
# and corr(CA-125, risk) into [0.30, 0.50] by rescaling the two weights by 10%.
for _ in range(6):
    rc = ultrasound_risk_score[case_idx]
    szc = tumor_size_cm[case_idx]
    cac = ca125[case_idx]
    # With fewer than 3 cases the correlation is treated as undefined (NaN).
    rho_sz = _spearman(szc, rc) if len(rc) > 2 else np.nan
    rho_ca = _spearman(cac, rc) if len(rc) > 2 else np.nan
    ok_sz = bool(rho_sz > 0.50)
    ok_ca = bool(0.30 <= rho_ca <= 0.50)
    if ok_sz and ok_ca:
        break
    # An undefined (NaN) correlation gives no direction to tune in, so it must
    # not trigger an adjustment; previously it inflated gamma_size every pass.
    tune_sz = (not ok_sz) and not np.isnan(rho_sz)
    tune_ca = (not ok_ca) and not np.isnan(rho_ca)
    if not (tune_sz or tune_ca):
        # Nothing left that can be tuned; further passes would change nothing.
        break
    if tune_sz:
        gamma_size *= 1.10
    if tune_ca:
        # Out of band and defined: either below 0.30 (raise) or above 0.50 (lower).
        gamma_ca *= 1.10 if rho_ca < 0.30 else 0.90
    # Recompute the score with the updated weights and re-apply the no-mass cap.
    risk_linear = alpha + gamma_size * tumor_size_cm + gamma_ca * log_ca
    ultrasound_risk_score = 1/(1+np.exp(-risk_linear))
    ultrasound_risk_score[mask_no_mass] = np.minimum(ultrasound_risk_score[mask_no_mass], 0.079)

# Keep within [0, 0.99]
ultrasound_risk_score = np.clip(ultrasound_risk_score, 0.0, 0.99)

# -----------------------------
# 11) Assemble dataframe
# -----------------------------
# Final range enforcement and dtype casts before building the table.
age = np.clip(age, 18, 90).astype(int)  # float ages are truncated to whole years
bmi = np.clip(bmi, 16, 45).round(2)
ca125 = np.clip(ca125, 2, 2000)
# Controls may be 0 (no mass); cases must stay within 0.5–20 cm.
tumor_size_cm[ctrl_idx] = np.clip(tumor_size_cm[ctrl_idx], 0, None)
tumor_size_cm[case_idx] = np.clip(tumor_size_cm[case_idx], 0.5, 20)

# Column order here is the column order of the saved file.
df = pd.DataFrame(
    {
        "age": age.astype(int),
        "menopausal_status": menopausal_status.astype(int),
        "bmi": bmi,
        "parity": parity.astype(int),
        "family_history": family_history.astype(int),
        "mht_use": mht_use.astype(int),
        "brca_status": brca_status.astype(int),
        "ca125": ca125.round(1),
        "ultrasound_risk_score": ultrasound_risk_score.round(3),
        "tumor_size_cm": tumor_size_cm.round(2),
        "diagnosis_label": diagnosis.astype(int),
    }
)

# -----------------------------
# 12) Save
# -----------------------------
csv_path = r"data/synthetic_clinical_dataset.csv"
# to_csv does not create missing parent directories, so make sure the target
# folder exists; this is a no-op when it is already there.
Path(csv_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_path, index=False)  # index=False: do not write the row index as a column
print("Dataset Saved")

Dataset Saved
