In [5]:
# Install into the environment of the running kernel (explicit magic, not automagic).
%pip install faker



In [6]:
import csv
import random
from faker import Faker

# Seed BOTH random sources so a fresh run reproduces the same dataset:
#  - Faker generates names from its own generator, which random.seed() does
#    not touch, so it needs its own Faker.seed() call;
#  - the stdlib `random` module drives every numeric field.
SEED = 42

fake = Faker()
Faker.seed(SEED)

# Set random seed for reproducibility
random.seed(SEED)

# CSV column names, listed in the order the values appear in each data row.
headers = [
    # identity and pregnancy stage
    "patient_name", "gestational_week",
    # current vital signs
    "body_temperature_c", "systolic_bp_mmHg", "diastolic_bp_mmHg",
    "blood_glucose_mg_dL", "oxygen_saturation_percent", "heart_rate_bpm",
    # urine protein, scored on a 1-10 scale
    "protein_urine_scale",
    # body measurements
    "weight_kg", "height_cm", "bmi",
    # past-history flags: 0 = no, 1 = yes
    "past_history_anemia", "past_history_diabetes", "past_history_preeclampsia",
]

# (low, high) bounds that this notebook treats as the normal range for each
# measurement; units are noted next to each entry.
normal_ranges = dict(
    body_temperature=(36.1, 37.2),          # degrees Celsius
    systolic_bp=(100, 120),                 # mmHg
    diastolic_bp=(60, 80),                  # mmHg
    blood_glucose_fasting=(70, 95),         # mg/dL
    blood_glucose_postprandial=(90, 140),   # mg/dL, 1-2 hours after eating
    oxygen_saturation=(95, 100),            # percent
    heart_rate=(70, 90),                    # beats per minute
    weight_first_trimester=(45, 90),        # kg
    height=(150, 180),                      # cm
)

def generate_patient_data():
    """Build one synthetic patient record.

    Baseline readings are drawn from the module-level ``normal_ranges`` table
    (instead of repeating its numbers here, so the table stays the single
    source of truth) and are then shifted when the matching past-history flag
    is set.  The name comes from the module-level ``fake`` Faker instance;
    every number comes from the stdlib ``random`` module.  The order of the
    ``random`` calls is fixed, so a given seed always yields the same numbers.

    Returns:
        list: 15 values in the same order as ``headers`` -- name, gestational
        week, temperature, systolic and diastolic pressure, glucose, oxygen
        saturation, heart rate, urine-protein score, weight, height, BMI and
        the three 0/1 past-history flags.
    """
    ranges = normal_ranges

    # Patient identity and pregnancy stage
    name = fake.name()
    gestational_week = random.randint(6, 40)
    height = round(random.uniform(*ranges["height"]), 1)

    # Base weight plus a gain of up to 0.5 kg for each gestational week
    base_weight = random.uniform(*ranges["weight_first_trimester"])
    pregnancy_weight_gain = random.uniform(0, 0.5) * gestational_week
    weight = round(base_weight + pregnancy_weight_gain, 1)
    bmi = round(weight / ((height / 100) ** 2), 1)  # kg / m^2

    # Past medical history flags, drawn independently of each other
    past_anemia = 1 if random.random() < 0.15 else 0        # 15% prevalence
    past_diabetes = 1 if random.random() < 0.10 else 0      # 10% prevalence
    past_preeclampsia = 1 if random.random() < 0.08 else 0  # 8% prevalence

    # Baseline readings inside the normal ranges
    temp = round(random.uniform(*ranges["body_temperature"]), 1)
    systolic = random.randint(*ranges["systolic_bp"])
    diastolic = random.randint(*ranges["diastolic_bp"])
    # Both candidates are drawn before one is picked; keeping this shape keeps
    # the sequence of random numbers (and so the seeded dataset) unchanged.
    glucose = random.choice([
        random.randint(*ranges["blood_glucose_fasting"]),
        random.randint(*ranges["blood_glucose_postprandial"]),
    ])
    oxygen = random.randint(*ranges["oxygen_saturation"])
    heart_rate = random.randint(*ranges["heart_rate"])
    protein = random.randint(1, 2)  # normal is very low

    # Shift the readings for each past condition, capped at a fixed limit
    if past_anemia:
        oxygen = max(88, oxygen - random.randint(0, 5))
        heart_rate = min(110, heart_rate + random.randint(5, 15))

    if past_diabetes:
        glucose = min(200, glucose + random.randint(10, 40))

    if past_preeclampsia:
        systolic = min(160, systolic + random.randint(5, 20))
        diastolic = min(110, diastolic + random.randint(5, 15))
        protein = min(10, protein + random.randint(1, 3))

    # Order must match the column order in `headers`
    return [
        name,
        gestational_week,
        temp,
        systolic,
        diastolic,
        glucose,
        oxygen,
        heart_rate,
        protein,
        weight,
        height,
        bmi,
        past_anemia,
        past_diabetes,
        past_preeclampsia,
    ]

# Generate data: the header row first, then one generated record per patient
NUM_PATIENTS = 10000
data = [headers]
data.extend(generate_patient_data() for _ in range(NUM_PATIENTS))

# Write to CSV. The encoding is explicit so the file does not depend on the
# platform's default locale; newline='' is what the csv module expects.
filename = "pregnancy_health_data_with_history.csv"
with open(filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerows(data)

# Summarise the 0/1 history flags. Columns are located by header name rather
# than by negative position, so adding a column later cannot silently shift
# the counts onto the wrong field.
records = data[1:]
print(f"Generated {filename} with {len(records)} rows of data.")
print("Past history prevalence:")
# NOTE(review): the middle label says "Gestational diabetes" while the column
# it counts is past_history_diabetes -- confirm which wording is intended.
for label, column in (
    ("Anemia", "past_history_anemia"),
    ("Gestational diabetes", "past_history_diabetes"),
    ("Preeclampsia", "past_history_preeclampsia"),
):
    column_index = headers.index(column)
    print(f"{label}: {sum(row[column_index] for row in records)} cases")

Generated pregnancy_health_data_with_history.csv with 10000 rows of data.
Past history prevalence:
Anemia: 1559 cases
Gestational diabetes: 1024 cases
Preeclampsia: 752 cases


In [7]:
# Offer the generated CSV as a browser download when running on Google Colab.
# The import is guarded so that the notebook still runs top to bottom in an
# environment where the google.colab package is not installed.
try:
    from google.colab import files
except ImportError:
    print(f"google.colab not available; skipping browser download of {filename}.")
else:
    files.download(filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>