# **Synthetic Healthcare Dataset Generator**
This script generates a synthetic healthcare dataset using the **Faker** library and controlled vocabularies.
It simulates patient demographic and clinical information including:
- Personal details (name, age, gender, ethnicity, contact info)
- Medical data (diagnosis codes, symptoms, treatments, lab results)
- Consent and data source metadata

By default the dataset contains 1000 records (change `NUM_RECORDS` to adjust the size) and is saved to the current working directory as a CSV file named `synthetic_healthcare_dataset.csv`.
This is useful for testing, prototyping, or educational purposes where real patient data cannot be used.

Note: All data is entirely fictional and randomly generated.

In [None]:
import pandas as pd
import random
import faker
from datetime import datetime

# Initialize Faker
# Shared generator instance: generate_record() uses it for names, dates,
# contact details, addresses, and the placeholder marker/lab codes.
# NOTE: neither Faker nor `random` is seeded in this cell, so each run
# produces a different dataset.
fake = faker.Faker()

# Parameters
# Number of rows the dataset-building step generates.
NUM_RECORDS = 1000   # Change this for larger dataset

# Controlled vocabularies: every categorical column is drawn from one of
# the fixed lists below.
GENDERS = ["Male", "Female", "Other"]
ETHNICITIES = ["Asian", "Black", "White", "Hispanic", "Other"]
OCCUPATIONS = [
    "Teacher",
    "Engineer",
    "Doctor",
    "Nurse",
    "Clerk",
    "Unemployed",
    "Retired",
]

# Symptoms that may be reported for each diagnosis code.
SYMPTOMS_MAP = {
    "C34": ["Cough", "Chest pain", "Weight loss"],
    "I10": ["Headache", "Dizziness"],
    "E11": ["Frequent urination", "Thirst", "Fatigue"],
    "J45": ["Wheezing", "Shortness of breath"],
    "K21": ["Heartburn", "Regurgitation"],
    "F32": ["Sadness", "Fatigue", "Sleep problems"],
    "M54": ["Back pain", "Stiffness"],
}

# Example ICD codes; taken from the symptom map (insertion order) so the
# two can never drift apart.
DIAGNOSIS_CODES = list(SYMPTOMS_MAP)

TREATMENTS = ["Medication", "Surgery", "Therapy", "Lifestyle Change"]
RESPONSES = ["Improved", "No Change", "Worsened"]
HOSPITALS = ["H001", "H002", "H003", "H004"]
DATA_SOURCES = ["Hospital EMR", "Clinical Trial", "Insurance Claim"]

def generate_record(i):
    """Build one synthetic patient record.

    Args:
        i: Zero-based record index. The patient ID is ``i + 1`` rendered as
            ``P`` followed by a zero-padded five-digit number.

    Returns:
        dict: Maps each output column name to its value for a single row.
        ``Occupation`` is None for ages 5 and under, ``Survival_Months`` is
        None unless the diagnosis is ``"C34"``, and ``Consent_Date`` is None
        when ``Consent_Flag`` is ``"No"``.
    """
    dob = fake.date_of_birth(minimum_age=0, maximum_age=90)

    # Age in completed years: subtract one when this year's birthday has not
    # happened yet. Plain year subtraction overstates the age for those
    # people and can exceed the requested maximum of 90.
    today = datetime.today()
    birthday_pending = (today.month, today.day) < (dob.month, dob.day)
    age = today.year - dob.year - birthday_pending

    gender = random.choice(GENDERS)
    ethnicity = random.choice(ETHNICITIES)

    # Age-appropriate occupation
    if age <= 5:
        occupation = None
    elif age <= 16:
        occupation = "Student"
    elif age < 65:
        # Working-age patients never get the age-bound labels.
        occupation = random.choice(
            [o for o in OCCUPATIONS if o not in ("Student", "Retired")]
        )
    else:
        occupation = "Retired"

    # Health data
    diagnosis = random.choice(DIAGNOSIS_CODES)
    possible_symptoms = SYMPTOMS_MAP[diagnosis]
    # Report at most two distinct symptoms for the chosen diagnosis.
    symptoms = ", ".join(
        random.sample(possible_symptoms, k=min(2, len(possible_symptoms)))
    )
    genetic_markers = fake.lexify(text="GM????")  # random placeholder code
    family_history = random.choice(["Yes", "No"])
    treatment = random.choice(TREATMENTS)
    response = random.choice(RESPONSES)
    lab_results = fake.lexify(text="LR-????")  # random placeholder code

    # Survival time is only populated for the C34 diagnosis code.
    survival_months = random.randint(1, 120) if diagnosis == "C34" else None
    readmission = random.choice(["Yes", "No"])

    # Consent
    consent_flag = random.choice(["Yes", "No"])
    if consent_flag == "Yes":
        # Clamp to the date of birth so a young patient's consent can never
        # be dated before they were born.
        consent_date = max(fake.date_this_decade(), dob)
    else:
        consent_date = None

    return {
        "Patient_ID": f"P{i+1:05d}",
        "Full_Name": fake.name(),
        "DOB": dob,
        "Age": age,
        "Gender": gender,
        "Ethnicity": ethnicity,
        "Zip_Code": fake.postcode(),
        "Occupation": occupation,
        "Contact_Number": fake.phone_number(),
        "Email": fake.email(),
        # Faker addresses are multi-line; flatten so each record stays on
        # one readable CSV line.
        "Address": fake.address().replace("\n", ", "),
        "Diagnosis_Code": diagnosis,
        "Symptoms": symptoms,
        "Genetic_Markers": genetic_markers,
        "Family_History": family_history,
        "Treatment_Type": treatment,
        "Treatment_Response": response,
        "Lab_Results": lab_results,
        "Hospital_ID": random.choice(HOSPITALS),
        "Consent_Flag": consent_flag,
        "Consent_Date": consent_date,
        "Data_Source": random.choice(DATA_SOURCES),
        "Disease_Progression": random.choice(["Stable", "Improving", "Worsening"]),
        "Survival_Months": survival_months,
        "Readmission": readmission
    }

# Build one row per index in [0, NUM_RECORDS) and collect them in a table
records = list(map(generate_record, range(NUM_RECORDS)))
df = pd.DataFrame(records)

# Write the table to disk without the DataFrame index column
output_csv = "synthetic_healthcare_dataset.csv"
df.to_csv(output_csv, index=False)

print(f"Dataset generated with {NUM_RECORDS} records and saved to {output_csv}")



Dataset generated with 1000 records and saved to synthetic_healthcare_dataset.csv


In [None]:
# Installs the Faker package that the code cell above imports; in a fresh
# environment, run this cell before that one.
%pip install faker

