In [1]:
import pandas as pd
import numpy as np
import os
import string

import random
from faker import Faker

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

# Set seaborn styles
# NOTE(review): the set_palette("Set2") call below replaces the colour cycle that
# set_theme installed from palette="pastel", so plots end up using "Set2".
sns.set_theme(style="whitegrid", palette="pastel")
sns.set_palette("Set2")


# Clear annoying package and version warnings
# This installs a blanket 'ignore' filter: every warning category is silenced for
# the rest of the session, including ones that may point at real problems.
import warnings
warnings.filterwarnings('ignore')


# Get current working directory
# NOTE(review): `pwd` is not referenced by any of the cells visible here; the file
# reads/writes below use bare relative filenames instead.
pwd = os.getcwd()  # This makes it so you can move the file and not have to constantly rename the entire filepath

In [9]:
# Create a Faker instance
fake = Faker()

# Seed every random source the generator uses (Faker, the stdlib `random` module
# and NumPy) so that re-running this cell draws the same random sequence.
SEED = 42
Faker.seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Number of records to generate
num_records = 1000

# Categorical pools. Each `*_weights` list is paired by position with the list
# above it and is passed to random.choices() as relative weights.
genders = ["Male", "Female", "Other"]
gender_weights = [0.44, 0.51, 0.05]
blood_types = ["O+", "A+", "B+", "AB+", "O-", "A-", "B-", "AB-"]
# NOTE(review): these weights sum to 109.3, not 100. random.choices() only uses
# their ratios, so this runs fine, but the individual figures should be checked
# against whatever source they were taken from.
blood_type_weights = [37.5, 27.1, 26.3, 5.4, 6.6, 4.3, 1.5, 0.6]
ethnicities = ["White", "Black", "Asian", "Hispanic", "Native American", "Pacific Islander", "Other"]
occupations = [
    "Engineer", "Teacher", "Doctor", "Nurse", "Retail Worker", 
    "Manager", "Unemployed", "Student", "Artist", "Sales Representative",
    "Software Developer", "Accountant", "Construction Worker", 
    "Electrician", "Chef", "Lawyer", "Pharmacist", "Graphic Designer",
    "Data Analyst", "Marketing Specialist", "Mechanic", "Writer",
    "Scientist", "Consultant", "Architect", "Social Worker", 
    "Veterinarian", "Web Developer", "Researcher", "Real Estate Agent",
    "Insurance Agent", "Project Manager", "Public Relations Specialist",
    "Customer Service Representative", "Financial Analyst", 
    "Business Analyst", "IT Support", "Physiotherapist", "Sales Manager",
    "Security Guard", "Data Scientist", "Product Manager", 
    "Research Scientist", "Events Planner", "Recruiter", "Delivery Driver"
]

insurance_plans = ['HMO Plan', 'Premium Health Insurance', 'Comprehensive Care Plan',
       'Catastrophic Health Plan', 'Medicare Advantage',
       'Family Health Coverage', 'Individual Health Insurance',
       'High Deductible Health Plan', 'Basic Health Plan',
       'Short-term Health Insurance', 'PPO Plan', 'Employer Group Plan']

medical_conditions = ['Sleep Apnea', 'Diabetes', 'Obesity', 'Depression',
       'Multiple Sclerosis', 'Cholesterol', 'Asthma', 'Osteoporosis',
       'COPD', 'Autoimmune Disorder', 'Heart Disease', 'Kidney Disease',
       'Anxiety', "Parkinson's Disease", 'Arthritis', 'Fibroids',
       'Migraine', 'Anemia', 'Fibromyalgia', 'Thyroid Issues',
       'Irritable Bowel Syndrome', 'Hypertension', 'Epilepsy',
       'Celiac Disease', 'Liver Disease', 'Chronic Pain', 'Gout']

# Every allergen is listed exactly once: these pools are sampled without
# replacement, so a repeated entry (previously 'Latex') could show up twice in a
# single patient's allergy list.
allergens = ['Fish', 'Shellfish', 'Peanuts', 'Tree Nuts', 'Milk', 'Eggs', 'Wheat',
    'Soy', 'Dairy', 'Latex', 'Penicillin', 'Amoxicillin', 'Sulfa Drugs',
    'Ibuprofen', 'Aspirin', 'Cephalosporins', 'Bee Stings', 'Dust Mites',
    'Mold', 'Pollen', 'Pet Dander', 'Sulfites', 'Sesame',
    'Soy Sauce', 'Corn', 'Gluten', 'Mustard', 'Chickpeas', 'Rye', 'None']

medications = ['Fluticasone', 'Atorvastatin', 'Zoloft', 'Omeprazole',
       'Simvastatin', 'Prozac', 'Clopidogrel', 'Amlodipine', 'Furosemide',
       'Risperidone', 'Metformin', 'Warfarin', 'Tramadol',
       'Sertraline', 'Montelukast', 'Lisinopril', 'Gabapentin',
       'Levothyroxine', 'Trazodone', 'Citalopram', 'Insulin',
       'Duloxetine', 'Losartan', 'Albuterol', 'None']



def calculate_bmi(weight, height_cm):
    """Return the body-mass index for a weight and a height given in centimetres.

    The height is converted to metres, the weight is divided by the squared
    height, and the result is rounded to two decimal places.
    """
    metres = height_cm / 100
    bmi = weight / metres ** 2
    return round(bmi, 2)

def generate_blood_pressure(conditions):
    """Draw a random blood-pressure reading formatted as "<systolic>/<diastolic> mmHg".

    When 'Hypertension' is found in `conditions` the reading is drawn from
    140-180 over 90-120; otherwise from 90-139 over 60-89 (bounds inclusive).
    """
    if 'Hypertension' in conditions:
        systolic_bounds, diastolic_bounds = (140, 180), (90, 120)
    else:
        systolic_bounds, diastolic_bounds = (90, 139), (60, 89)

    # Systolic is drawn first, then diastolic.
    systolic = random.randint(*systolic_bounds)
    diastolic = random.randint(*diastolic_bounds)
    return f"{systolic}/{diastolic} mmHg"

def generate_health_metrics(conditions):
    """Build a dict of randomly drawn lab values for one patient.

    Every metric first receives a value from its default range. Metrics tied to
    a condition found in `conditions` ('Diabetes', 'Cholesterol', 'Anemia',
    'Obesity', 'COPD') are then re-drawn from that condition's range. All values
    are rounded to one decimal place.
    """
    def draw(low, high):
        # One uniform draw between the two bounds, rounded to 1 decimal place.
        return round(random.uniform(low, high), 1)

    def raised_lipid_panel():
        # Shared by 'Cholesterol' and 'Obesity'; drawn in LDL, HDL, triglycerides order.
        return {
            "LDL Cholesterol (mmol/L)": draw(3.4, 6.5),
            "HDL Cholesterol (mmol/L)": draw(0.5, 1.0),
            "Triglycerides (mmol/L)": draw(1.8, 4.5),
        }

    # Default values; the key order here is the order of the resulting dict.
    metrics = {
        "Blood Glucose Level (mmol/L)": draw(4.0, 5.4),
        "HDL Cholesterol (mmol/L)": draw(1.0, 1.6),
        "LDL Cholesterol (mmol/L)": draw(1.8, 3.0),
        "Triglycerides (mmol/L)": draw(0.6, 1.7),
        "Hemoglobin A1C (%)": draw(4.0, 5.6),
        "White Blood Cell Count (10^9/L)": draw(4.5, 11.0),
        "Red Blood Cell Count (10^12/L)": draw(4.5, 5.5),
        "Platelet Count (10^9/L)": draw(150, 450),
    }

    if 'Diabetes' in conditions:
        metrics.update({
            "Blood Glucose Level (mmol/L)": draw(7.0, 15.0),
            "Hemoglobin A1C (%)": draw(6.5, 14.0),
        })

    if 'Cholesterol' in conditions:
        metrics.update(raised_lipid_panel())

    if 'Anemia' in conditions:
        metrics["Red Blood Cell Count (10^12/L)"] = draw(3.5, 4.4)

    # A patient with both 'Cholesterol' and 'Obesity' gets the lipid panel drawn twice;
    # the second draw wins.
    if 'Obesity' in conditions:
        metrics.update(raised_lipid_panel())

    if 'COPD' in conditions:
        metrics["White Blood Cell Count (10^9/L)"] = draw(11.1, 20.0)

    return metrics

def generate_patient_id():
    """Return a random identifier shaped like "AB-1234-5678-9012".

    The ID is two uppercase letters, two groups of four digits, and a final
    number between 1000 and 9999, joined by hyphens. Every part is random;
    nothing here checks an ID against those already issued, so uniqueness is
    very likely but not guaranteed.
    """
    letters = ''.join(random.choices(string.ascii_uppercase, k=2))
    digit_groups = [''.join(random.choices(string.digits, k=4)) for _ in range(2)]
    suffix = random.randint(1000, 9999)
    return '-'.join([letters, *digit_groups, str(suffix)])


def generate_multiple_items(items, max_count=3, single_prob=0.7):
    """Pick one item, or a comma-separated handful of items, from `items`.

    This is the only definition of the function; an earlier variant with the
    same name used to precede it and was silently overridden.

    Args:
        items: Pool of strings to choose from. It may contain the placeholder
            'None' and repeated entries.
        max_count: Largest number of items a multi-pick may contain.
        single_prob: Probability of returning exactly one item.

    Returns:
        'None' when `items` is empty. Otherwise either a single entry of
        `items`, or 2..max_count distinct entries joined with ', '.
    """
    if not items:
        return 'None'

    # Multi-picks draw from the distinct, real entries only, so a result can never
    # read "Latex, Latex" or "Fish, None".
    multi_pool = [item for item in dict.fromkeys(items) if item != 'None']
    upper = min(max_count, len(multi_pool))

    # Fall back to a single pick when a multi-pick is impossible (pool or max_count
    # below 2) instead of letting randint/sample raise ValueError.
    if upper < 2 or random.random() < single_prob:
        return random.choice(items)

    count = random.randint(2, upper)
    return ', '.join(random.sample(multi_pool, count))


def generate_dataset(is_healthy=True, n_records=None):
    """Generate a DataFrame of synthetic patient records.

    Args:
        is_healthy: When True, most patients get 'None' for medical history,
            allergies and prescriptions and lean towards non-smoking / low
            alcohol use. When False, every patient gets at least one entry
            drawn from each of the condition, allergen and medication pools.
        n_records: Number of rows to generate. Defaults to the module-level
            `num_records` when left as None.

    Returns:
        pandas.DataFrame with one row per patient: demographic and insurance
        columns, lifestyle columns, height / weight / BMI, blood pressure,
        stroke and CHD flags, and the lab metrics produced by
        generate_health_metrics().
    """
    n = num_records if n_records is None else n_records

    data = {
        "Patient ID": [generate_patient_id() for _ in range(n)],
        "Last Checkup": [fake.date_between(start_date='-3y', end_date='today') for _ in range(n)],
        "Date of Birth": [fake.date_of_birth(minimum_age=18, maximum_age=80) for _ in range(n)],
        "Gender": random.choices(genders, weights=gender_weights, k=n),
        "Ethnicity": [random.choice(ethnicities) for _ in range(n)],
        "Blood Type": [random.choices(blood_types, weights=blood_type_weights)[0] for _ in range(n)],
        "Occupation": [random.choice(occupations) for _ in range(n)],
        "Insurance Provider": [fake.company() if random.random() > 0.2 else 'None' for _ in range(n)],
        "Insurance Plan": [random.choice(insurance_plans) for _ in range(n)],
        "Monthly Premium": [round(random.uniform(100, 800), 2) for _ in range(n)],
        "Medical History": [generate_multiple_items(medical_conditions, 2) if not is_healthy or random.random() < 0.2 else 'None' for _ in range(n)],
        "Allergies": [generate_multiple_items(allergens, 3) if not is_healthy or random.random() < 0.3 else 'None' for _ in range(n)],
        "Prescriptions": ['None' if is_healthy and random.random() < 0.6 else generate_multiple_items(medications, 3, single_prob=0.7) for _ in range(n)],
        "Smoking Status": ["Non-Smoker" if is_healthy and random.random() < 0.8 else random.choice(["Non-Smoker", "Former Smoker", "Current Smoker"]) for _ in range(n)],
        "Alcohol Consumption": [random.choice(["None", "Occasional"]) if is_healthy and random.random() < 0.8 else random.choice(["None", "Occasional", "Regular", "Heavy"]) for _ in range(n)],
    }

    # Parse each "A, B" history string once. Testing membership on the parsed list
    # (rather than on the raw string) matches whole condition names only.
    histories = [history.split(', ') for history in data["Medical History"]]

    heights = []
    weights = []
    bmis = []
    for gender, conditions in zip(data["Gender"], histories):
        obese = 'Obesity' in conditions
        # "Female" and "Other" share the second set of height / weight ranges.
        if gender == "Male":
            height = random.randint(165, 190)
            weight = random.randint(100, 150) if obese else random.randint(60, 90)
        else:
            height = random.randint(150, 175)
            weight = random.randint(85, 130) if obese else random.randint(50, 75)

        heights.append(height)
        weights.append(weight)
        bmis.append(calculate_bmi(weight, height))

    data["Height (cm)"] = heights
    data["Weight (kg)"] = weights
    data["BMI"] = bmis
    data["Blood Pressure (mmHg)"] = [generate_blood_pressure(conditions) for conditions in histories]

    # Randomize the "Stroke" assignment, independently of the medical history
    probabilities = [0.15, 0.85]  # 15% Yes, 85% No
    data["Stroke"] = np.random.choice(["Yes", "No"], size=n, p=probabilities)

    data["Coronary Heart Disease (CHD)"] = ["Yes" if 'Heart Disease' in conditions else "No" for conditions in histories]

    # One list per metric, appended to in place. (Rebuilding the list with
    # `old + [value]` for every record made this step quadratic in n.)
    for conditions in histories:
        for key, value in generate_health_metrics(conditions).items():
            data.setdefault(key, []).append(value)

    return pd.DataFrame(data)

# Generate healthy dataset
healthy_df = generate_dataset(is_healthy=True)
healthy_df.to_csv('healthy_medical_records.csv', index=False)

# Generate dataset with mixed conditions
mixed_df = generate_dataset(is_healthy=False)

# The healthy rows of the mixed file come from their own, independent draw.
# Copying rows out of healthy_df would put the same patients (same Patient ID) in
# both CSVs, and the cell that concatenates the two files would then contain
# every one of those patients twice.
healthy_fill_df = generate_dataset(is_healthy=True)

# Make 80% of the records healthy. The False count is the remainder, so the mask
# always has exactly num_records entries, whatever num_records is.
n_healthy = int(0.8 * num_records)
healthy_mask = [True] * n_healthy + [False] * (num_records - n_healthy)
random.shuffle(healthy_mask)

for column in mixed_df.columns:
    mixed_df.loc[healthy_mask, column] = healthy_fill_df.loc[healthy_mask, column]

mixed_df.to_csv('mixed_medical_records.csv', index=False)

print("Data Exported")

Data Exported


In [10]:
# Read the two exported files back from disk
healthy_df = pd.read_csv('healthy_medical_records.csv')
mixed_df = pd.read_csv('mixed_medical_records.csv')

# Stack them into one frame with a fresh 0..n-1 index and put the 'None'
# placeholder into every missing cell.
# NOTE(review): presumably this restores 'None' strings that read_csv parsed as
# missing values - verify against the pandas version in use.
output_df = pd.concat((healthy_df, mixed_df), ignore_index=True).fillna('None')

# Write the combined dataset out as a single CSV
output_df.to_csv('sample_medical_records.csv', index=False)

print("Medical dataset exported successfully.")

Medical dataset exported successfully.


In [70]:
# Load the records file used from here on and display it.
# NOTE(review): 'medical_records.csv' is not written by any cell visible above
# (those export 'sample_medical_records.csv'), so a fresh Restart & Run All
# depends on this file already existing - confirm where it comes from.
# NOTE(review): the displayed frame starts with an 'Unnamed: 0' column, which
# suggests the CSV was saved together with its index - TODO confirm, and decide
# whether that column should be dropped or read as the index.
df = pd.read_csv('medical_records.csv')



df

Unnamed: 0,Patient ID,Last Checkup,Date of Birth,Gender,Ethnicity,Blood Type,Occupation,Insurance Provider,Insurance Plan,Monthly Premium,...,Stroke,Coronary Heart Disease (CHD),Blood Glucose Level (mmol/L),HDL Cholesterol (mmol/L),LDL Cholesterol (mmol/L),Triglycerides (mmol/L),Hemoglobin A1C (%),White Blood Cell Count (10^9/L),Red Blood Cell Count (10^12/L),Platelet Count (10^9/L)
0,AP-3485-6851-5017,2024-06-01,1958-01-17,Male,Other,O+,Engineer,Castro-Rodriguez,Comprehensive Care Plan,686.94,...,No,No,4.4,1.3,2.1,1.6,5.3,5.0,5.0,445.4
1,TT-0620-3204-1161,2023-11-17,2006-04-03,Male,Black,AB+,Unemployed,Mayer Ltd,Employer Group Plan,374.25,...,No,No,4.4,1.3,2.9,0.9,4.7,5.3,5.2,429.1
2,TO-3740-8174-7361,2022-07-25,1972-09-18,Male,White,O+,Scientist,Valenzuela-Fisher,High Deductible Health Plan,286.69,...,No,No,4.8,1.0,2.8,0.8,4.8,8.8,5.3,177.4
3,LT-0548-1351-6003,2022-03-19,2002-11-22,Female,Other,O+,Financial Analyst,Williamson and Sons,High Deductible Health Plan,119.14,...,No,No,5.3,1.5,2.5,0.8,4.4,8.0,4.8,333.0
4,VB-7486-5990-1923,2022-03-21,1987-04-06,Female,Hispanic,AB+,Delivery Driver,Young Inc,Short-term Health Insurance,552.07,...,No,No,4.0,1.1,2.9,1.3,4.1,5.5,5.3,354.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,JG-5901-8866-2978,2024-07-03,1986-01-20,Female,Other,O+,Writer,Meadows Ltd,High Deductible Health Plan,273.35,...,No,No,5.3,1.3,1.9,0.8,4.3,10.1,4.9,309.5
199996,LV-9177-4184-5174,2022-05-09,1976-04-06,Male,White,O+,Electrician,,,0.00,...,No,No,5.4,1.4,1.9,1.6,4.7,9.8,5.3,258.2
199997,TX-0270-9075-8701,2024-05-16,1987-06-21,Female,Hispanic,A+,Veterinarian,Rodriguez LLC,Catastrophic Health Plan,381.75,...,No,No,4.5,1.2,2.8,1.5,4.8,6.2,4.5,362.2
199998,OE-0986-2757-1871,2024-01-07,1957-09-12,Male,Black,A+,Security Guard,Young PLC,High Deductible Health Plan,281.97,...,No,No,4.1,1.4,2.4,1.4,5.5,9.3,4.6,351.3
