In [158]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [160]:
rng = np.random.default_rng(42)

In [162]:
N = 800                          # number of patients (>=500)
CANCER_RATE = 0.35               # >= 0.30
BRCA_RATE_CANCER = 0.22          # conditional brca+ rate among cancer
BRCA_RATE_HEALTHY = 0.10         # conditional brca+ rate among healthy
ULTRASOUND_LOW_MAX = 0.4         # most healthy are below this (some overlap allowed)
ULTRASOUND_HIGH_MIN = 0.6        # most cancer are above this (some overlap allowed)

In [164]:
# Output paths
data_path = "C://Users//cherisma//Downloads"
plots_dir = os.path.join(data_path, "plots")
os.makedirs(plots_dir, exist_ok=True)

csv_path = os.path.join(data_path, "synthetic_clinical_dataset.csv")
pdf_path = os.path.join(data_path, "Synthetic_Clinical_Report.pdf")

In [166]:
def clipped_normal(mean, sd, size=1, min_val=None, max_val=None, rng=rng):
    x = rng.normal(mean, sd, size)
    if min_val is not None:
        x = np.maximum(x, min_val)
    if max_val is not None:
        x = np.minimum(x, max_val)
    return x

In [168]:
def sigmoid(x):
    return 1/(1+np.exp(-x))

In [170]:
diagnosis = (rng.random(N) < CANCER_RATE).astype(int) 

In [172]:
# Age distributions
# Early-stage ovarian cancer patients are often post-menopausal but BRCA+ skew younger.
# We'll generate baseline ages by diagnosis, then adjust for BRCA later.
age_healthy = clipped_normal(mean=53, sd=11, size=(diagnosis == 0).sum(), min_val=18, max_val=90)
age_cancer  = clipped_normal(mean=56, sd=10, size=(diagnosis == 1).sum(), min_val=25, max_val=90)
age = np.empty(N)
age[diagnosis == 0] = age_healthy
age[diagnosis == 1] = age_cancer
age = age.round().astype(int)

In [174]:
# BRCA mutation status conditional on diagnosis
brca = np.zeros(N, dtype=int)
brca[diagnosis == 1] = (rng.random((diagnosis == 1).sum()) < BRCA_RATE_CANCER).astype(int)
brca[diagnosis == 0] = (rng.random((diagnosis == 0).sum()) < BRCA_RATE_HEALTHY).astype(int)

In [176]:
# Apply age adjustment for BRCA+ (skew younger by ~6–8 years with bounds)
brca_young_shift = rng.integers(5, 9, size=N)  # 5..8
age = np.where(brca == 1, np.maximum(18, age - brca_young_shift), age)

In [178]:
# Menopausal status driven largely by age but not strictly (prob rising near 50)
# P(post) = sigmoid((age-49)/4) with mild BRCA adjustment (slightly lower if BRCA+)
p_post = sigmoid((age - 49)/4.0) - brca*0.05
p_post = np.clip(p_post, 0.05, 0.98)
menopausal = (rng.random(N) < p_post).astype(int)

In [180]:
# Family history: more likely if BRCA+, and slightly more if cancer
base_fh = 0.12 + 0.08*diagnosis + 0.35*brca  # range roughly 0.12 to 0.55
base_fh = np.clip(base_fh, 0.05, 0.85)
family_history = (rng.random(N) < base_fh).astype(int)


In [182]:
# BMI: mild shift higher in cancer; ensure plausible bounds
bmi = clipped_normal(mean=26 + 1.0*diagnosis, sd=4.2, size=N, min_val=15, max_val=55)

In [184]:
# Ultrasound risk score: healthy skew low, cancer skew high with overlap
ultra = np.empty(N)
# Healthy: Beta(2.5, 6) scaled, tends to be low
h_idx = np.where(diagnosis == 0)[0]
c_idx = np.where(diagnosis == 1)[0]
ultra[h_idx] = rng.beta(2.5, 6.0, size=len(h_idx))
# Cancer: Beta(6, 2.5) scaled, tends to be high
ultra[c_idx] = rng.beta(6.0, 2.5, size=len(c_idx))

In [186]:
# Enforce soft rule-based thresholds with a bit of noise
ultra = np.clip(ultra + rng.normal(0, 0.03, size=N), 0, 1)
# Slight bump for larger tumors later

# Tumor size (cm): near zero for healthy (some benign masses), >0 for cancer
tumor_size = np.zeros(N)
# Healthy small/near-zero masses: Half-normal around 0 with sigma=0.6, capped at 2.5cm
tumor_size[h_idx] = np.clip(np.abs(rng.normal(0, 0.6, size=len(h_idx))), 0, 2.5)
# Cancer: Normal around 3.8cm (early-stage), sd=1.2, min 0.5, max 10
tumor_size[c_idx] = clipped_normal(mean=3.8, sd=1.2, size=len(c_idx), min_val=0.5, max_val=10)

In [188]:
# CA-125 (U/mL): strong dependence on diagnosis and tumor size
# Healthy baseline: log-normal around median ~16, with ~10–15% > 35
# Cancer baseline: log-normal around median ~220, add linear term with tumor size
# We will build base levels then apply rule-based adjustments.
# Healthy
mu_h, sigma_h = np.log(16), 0.55   # lognormal params
ca125 = np.empty(N)
ca125[h_idx] = rng.lognormal(mean=mu_h, sigma=sigma_h, size=len(h_idx))
# Cancer
mu_c, sigma_c = np.log(220), 0.65  # lognormal params
ca125[c_idx] = rng.lognormal(mean=mu_c, sigma=sigma_c, size=len(c_idx))

In [190]:
# Add correlation with tumor size (both groups but stronger in cancer)
alpha_h, alpha_c = 8.0, 22.0
ca125[h_idx] += alpha_h * tumor_size[h_idx]
ca125[c_idx] += alpha_c * tumor_size[c_idx]

In [192]:
# Rule-based tweaks:
# - Ensure ~80% of cancer cases have CA-125 > 200 (not all, because early-stage may be lower)
cancer_mask = (diagnosis == 1)
current_prop_high = (ca125[cancer_mask] > 200).mean()
if current_prop_high < 0.80:
    # Scale up cancer values slightly to meet the target
    scale = 200 / np.quantile(ca125[cancer_mask], 0.2)  # lift lower quintile
    ca125[cancer_mask] *= np.clip(scale, 1.0, 1.5)

In [194]:
# - Keep healthy mostly below 35 but allow some outliers (~10–15%)
healthy_mask = (diagnosis == 0)
prop_h_gt35 = (ca125[healthy_mask] > 35).mean()
target_h_gt35 = 0.12
if prop_h_gt35 > target_h_gt35 + 0.03:
    # Compress upper tail for healthy
    hi = ca125[healthy_mask] > 35
    ca125[healthy_mask][hi] = 35 + (ca125[healthy_mask][hi] - 35)*0.6

In [196]:
# Ultrasound fine-tuning based on tumor size
ultra += 0.02 * (tumor_size / (tumor_size.max() if tumor_size.max() > 0 else 1))
ultra = np.clip(ultra, 0, 1)

In [198]:
# Symptom score (0–5): higher in cancer; BRCA+ slightly higher vigilance -> small shift
symptom_score = np.zeros(N, dtype=int)
# Healthy: majority 0–2
symp_probs_h = np.array([0.35, 0.33, 0.20, 0.07, 0.04, 0.01])
# Cancer: shifted to 2–5
symp_probs_c = np.array([0.06, 0.12, 0.26, 0.28, 0.18, 0.10])
symp_bins = np.arange(6)
symptom_score[h_idx] = rng.choice(symp_bins, size=len(h_idx), p=symp_probs_h)
symptom_score[c_idx] = rng.choice(symp_bins, size=len(c_idx), p=symp_probs_c)
# BRCA+ small upward bump (cap at 5)
symptom_score = np.minimum(5, symptom_score + (rng.random(N) < (0.06*brca)).astype(int))

In [200]:
# Assemble DataFrame
df = pd.DataFrame({
    "age_years": age,
    "bmi": np.round(bmi, 1),
    "ca125_u_ml": np.round(ca125, 1),
    "tumor_size_cm": np.round(tumor_size, 2),
    "brca_positive": brca,
    "menopausal_post": menopausal,
    "ultrasound_risk": np.round(ultra, 3),
    "family_history": family_history,
    "symptom_score_0_5": symptom_score,
    "diagnosis_cancer": diagnosis
})


In [202]:
# Edge-case sanity: no negatives
for col in ["bmi", "ca125_u_ml", "tumor_size_cm", "ultrasound_risk"]:
    df[col] = df[col].clip(lower=0)

In [204]:
# Basic summary
overall_brca_prev = df["brca_positive"].mean()
cancer_prev = df["diagnosis_cancer"].mean()
healthy_prop_ca125_lt35 = (df.loc[df.diagnosis_cancer == 0, "ca125_u_ml"] < 35).mean()
cancer_prop_ca125_gt200 = (df.loc[df.diagnosis_cancer == 1, "ca125_u_ml"] > 200).mean()

# Save CSV
df.to_csv(csv_path, index=False)

In [206]:
# Plots: distributions and correlations
# 1) Histogram for CA-125 by diagnosis
plt.figure()
bins = np.linspace(0, min(800, df["ca125_u_ml"].quantile(0.99)), 50)
plt.hist(df.loc[df.diagnosis_cancer == 0, "ca125_u_ml"], bins=bins, alpha=0.6, label="Healthy")
plt.hist(df.loc[df.diagnosis_cancer == 1, "ca125_u_ml"], bins=bins, alpha=0.6, label="Cancer")
plt.xlabel("CA-125 (U/mL)")
plt.ylabel("Count")
plt.title("CA-125 Distribution by Diagnosis")
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join(plots_dir, "ca125_by_diagnosis.png"))
plt.close()

In [208]:
# 2) Tumor size histogram by diagnosis
plt.figure()
bins_ts = np.linspace(0, max(10, df["tumor_size_cm"].max()), 50)
plt.hist(df.loc[df.diagnosis_cancer == 0, "tumor_size_cm"], bins=bins_ts, alpha=0.6, label="Healthy")
plt.hist(df.loc[df.diagnosis_cancer == 1, "tumor_size_cm"], bins=bins_ts, alpha=0.6, label="Cancer")
plt.xlabel("Tumor size (cm)")
plt.ylabel("Count")
plt.title("Tumor Size Distribution by Diagnosis")
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join(plots_dir, "tumor_size_by_diagnosis.png"))
plt.close()


In [209]:
# 3) Ultrasound risk score by diagnosis
plt.figure()
bins_u = np.linspace(0, 1, 40)
plt.hist(df.loc[df.diagnosis_cancer == 0, "ultrasound_risk"], bins=bins_u, alpha=0.6, label="Healthy")
plt.hist(df.loc[df.diagnosis_cancer == 1, "ultrasound_risk"], bins=bins_u, alpha=0.6, label="Cancer")
plt.xlabel("Ultrasound risk score (0–1)")
plt.ylabel("Count")
plt.title("Ultrasound Risk Distribution by Diagnosis")
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join(plots_dir, "ultrasound_by_diagnosis.png"))
plt.close()

In [211]:
# 4) Correlation matrix (numeric columns)
numeric_cols = ["age_years","bmi","ca125_u_ml","tumor_size_cm","brca_positive",
                "menopausal_post","ultrasound_risk","family_history",
                "symptom_score_0_5","diagnosis_cancer"]
corr = df[numeric_cols].corr()

plt.figure()
im = plt.imshow(corr, aspect='auto')
plt.colorbar(im, fraction=0.046, pad=0.04)
plt.xticks(range(len(numeric_cols)), numeric_cols, rotation=90)
plt.yticks(range(len(numeric_cols)), numeric_cols)
plt.title("Correlation Matrix")
plt.tight_layout()
plt.savefig(os.path.join(plots_dir, "correlation_matrix.png"))
plt.close()

In [213]:
# 5) Scatter: Tumor size vs CA-125
plt.figure()
plt.scatter(df["tumor_size_cm"], df["ca125_u_ml"], s=8, alpha=0.6)
plt.xlabel("Tumor size (cm)")
plt.ylabel("CA-125 (U/mL)")
plt.title("Tumor Size vs CA-125")
plt.tight_layout()
plt.savefig(os.path.join(plots_dir, "tumor_vs_ca125.png"))
plt.close()

In [216]:
# -----------------------------
# 3) Generate concise PDF report
# -----------------------------
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.lib.units import cm

In [218]:
def add_wrapped_text(c, text, x, y, max_width, leading=12):
    # Simple text wrap for reportlab
    from textwrap import wrap
    lines = []
    for paragraph in text.split("\n"):
        # Estimated characters per line; conservative scaling by font size
        max_chars = int(max_width / 5.2)  # heuristic
        for line in wrap(paragraph, width=max_chars):
            lines.append(line)
        lines.append("")  # paragraph break
    for line in lines:
        c.drawString(x, y, line)
        y -= leading
    return y

In [220]:
c = canvas.Canvas(pdf_path, pagesize=A4)
width, height = A4
margin = 2*cm
cursor_y = height - margin

In [222]:
c.setFont("Helvetica-Bold", 14)
c.drawString(margin, cursor_y, "Synthetic Clinical Dataset Report — Early-stage Ovarian Cancer")
cursor_y -= 18

In [224]:
c.setFont("Helvetica", 10)
intro = (
    f"Records: {N}   |   Cancer prevalence: {cancer_prev:.1%}   |   BRCA+ overall: {overall_brca_prev:.1%}\n"
    f"Healthy CA-125 < 35 U/mL: {healthy_prop_ca125_lt35:.1%}   |   Cancer CA-125 > 200 U/mL: {cancer_prop_ca125_gt200:.1%}\n\n"
    "This synthetic dataset was generated using mixed statistical and rule-based methods to reflect medically plausible "
    "patterns (e.g., higher CA-125 and ultrasound risk in cancer, BRCA-positive skewing to younger ages, and correlation "
    "between tumor size and CA-125). Values are clipped to remove impossible negatives and distributions are tuned to create "
    "overlap reflective of real-world screening scenarios."
)
cursor_y = add_wrapped_text(c, intro, margin, cursor_y, width-2*margin, leading=12) - 6

In [226]:
# Feature rationale summary
c.setFont("Helvetica-Bold", 12)
c.drawString(margin, cursor_y, "Feature Rationale & Generation Logic (Summary)")
cursor_y -= 14
c.setFont("Helvetica", 10)
logic = (
    "- Age: Normal by diagnosis; BRCA+ shifted ~5–8 years younger, then rounded to integer.\n"
    "- Menopausal status: Logistic function of age; slightly lower probability if BRCA+.\n"
    "- BRCA status: 22% among cancer, 10% among healthy (overall within 10–20%).\n"
    "- Tumor size: Near-zero half-normal for healthy; normal(3.8,1.2) cm for cancer (0.5–10 cm).\n"
    "- CA-125: Log-normal by diagnosis plus linear term with tumor size; tuned to get ~80% of cancer >200 U/mL "
    "and ~85–90% of healthy <35 U/mL.\n"
    "- Ultrasound risk: Beta-distributions; increased slightly with tumor size.\n"
    "- Family history: Higher with BRCA+ and cancer.\n"
    "- BMI: Slightly higher mean in cancer.\n"
    "- Symptom score (0–5): Shifted higher for cancer; small BRCA+ bump."
)
cursor_y = add_wrapped_text(c, logic, margin, cursor_y, width-2*margin, leading=12) - 6

In [228]:
# Insert small validation table-like text
c.setFont("Helvetica-Bold", 12)
c.drawString(margin, cursor_y, "Key Validation Checks")
cursor_y -= 14
c.setFont("Helvetica", 10)
val = (
    f"* No negative values for CA-125, tumor size, BMI, or risk score.\n"
    f"* CA-125 vs Tumor size correlation (Pearson): {corr.loc['ca125_u_ml','tumor_size_cm']:.2f}\n"
    f"* Ultrasound risk higher in cancer median: "
    f"{df.loc[df.diagnosis_cancer==1,'ultrasound_risk'].median():.2f} vs healthy "
    f"{df.loc[df.diagnosis_cancer==0,'ultrasound_risk'].median():.2f}\n"
    f"* Age median (BRCA+ vs BRCA-): "
    f"{df.loc[df.brca_positive==1,'age_years'].median()} vs {df.loc[df.brca_positive==0,'age_years'].median()}"
)
cursor_y = add_wrapped_text(c, val, margin, cursor_y, width-2*margin, leading=12) - 6

In [230]:
# Attempt to place one plot (correlation matrix) as an image if space allows
corr_img = os.path.join(plots_dir, "correlation_matrix.png")
if os.path.exists(corr_img):
    img_width = width - 2*margin
    img_height = img_width * 0.75
    if cursor_y - img_height > margin:
        c.drawImage(corr_img, margin, cursor_y - img_height, width=img_width, height=img_height, preserveAspectRatio=True, mask='auto')
        cursor_y -= (img_height + 6)
    else:
        c.showPage()
        cursor_y = height - margin
        c.drawImage(corr_img, margin, cursor_y - img_height, width=img_width, height=img_height, preserveAspectRatio=True, mask='auto')
        cursor_y -= (img_height + 6)

In [232]:
# Close out
c.setFont("Helvetica", 9)
if cursor_y < margin + 20:
    c.showPage()
    cursor_y = height - margin
c.drawString(margin, cursor_y, "Note: Ranges are clinically inspired but synthetic and for ML experimentation only.")
c.save()

In [234]:
# Show a preview of the data to the user as a table
df.head(25)

{
    "csv_path": csv_path,
    "pdf_path": pdf_path,
    "plots_dir": plots_dir,
    "n_records": len(df),
    "cancer_rate": float(cancer_prev),
    "overall_brca_rate": float(overall_brca_prev),
    "healthy_prop_ca125_lt35": float(healthy_prop_ca125_lt35),
    "cancer_prop_ca125_gt200": float(cancer_prop_ca125_gt200)
}

{'csv_path': 'C://Users//cherisma//Downloads\\synthetic_clinical_dataset.csv',
 'pdf_path': 'C://Users//cherisma//Downloads\\Synthetic_Clinical_Report.pdf',
 'plots_dir': 'C://Users//cherisma//Downloads\\plots',
 'n_records': 800,
 'cancer_rate': 0.36125,
 'overall_brca_rate': 0.14875,
 'healthy_prop_ca125_lt35': 0.898238747553816,
 'cancer_prop_ca125_gt200': 0.8442906574394463}