In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Build a reproducible synthetic patient dataset (demographics, condition,
# readmission flag, cost, discharge timestamp, region) and write it to
# 'synthetic_healthcare_data.csv' in the current working directory.

# Set random seed for reproducibility
np.random.seed(42)

# Generate 10,000 synthetic patient records
num_patients = 10000

# Define possible values
conditions = ['Diabetes', 'Heart Disease', 'COPD', 'Asthma', 'Hypertension']
regions = ['North', 'South', 'East', 'West']
genders = ['Male', 'Female']

# Discharge timestamps are kept inside this window, including after jitter.
date_start = pd.Timestamp('2020-01-01')
date_end = pd.Timestamp('2023-12-31')

# Generate synthetic data.
# NOTE: the dict entries are evaluated top to bottom, so the order of the
# np.random calls below determines the values drawn for a given seed.
data = {
    'Patient_ID': np.arange(1, num_patients + 1),
    # Normal(50, 15) truncated to whole years, then clipped to [18, 90].
    'Age': np.clip(np.random.normal(loc=50, scale=15, size=num_patients).astype(int), 18, 90),
    'Gender': np.random.choice(genders, num_patients, p=[0.49, 0.51]),
    'Condition': np.random.choice(conditions, num_patients, p=[0.3, 0.25, 0.2, 0.15, 0.1]),
    'Readmission_Status': np.random.choice([0, 1], num_patients, p=[0.7, 0.3]),
    # Base cost only; it is scaled per condition and rounded further down.
    'Total_Cost': np.random.gamma(shape=2, scale=5000, size=num_patients),
    # Evenly spaced timestamps across the window (they carry a time-of-day part).
    'Discharge_Date': pd.date_range(start=date_start, end=date_end, periods=num_patients),
    'Region': np.random.choice(regions, num_patients)
}

# Add variation to Total_Cost based on condition (e.g., Heart Disease costs more)
condition_cost_multiplier = {
    'Diabetes': 1.2,
    'Heart Disease': 1.8,
    'COPD': 1.5,
    'Asthma': 1.0,
    'Hypertension': 1.1
}

# Scale first and round last, so the stored cost really has two decimals
# (rounding before the multiplication leaves values such as 1234.56 * 1.2).
cost_multipliers = pd.Series(data['Condition']).map(condition_cost_multiplier).to_numpy()
data['Total_Cost'] = np.round(data['Total_Cost'] * cost_multipliers, 2)

# Add noise to dates to avoid perfect alignment.
# randint's upper bound is exclusive, so 31 yields a symmetric -30..+30 days.
jitter_days = np.random.randint(-30, 31, num_patients)
jittered_dates = data['Discharge_Date'] + pd.to_timedelta(jitter_days, unit='D')

# An offset that would push a date outside the window is applied in the
# opposite direction instead. The window is far wider than 60 days, so the
# flipped offset always lands inside it, and nothing piles up on the edges
# the way clipping would.
outside_window = (jittered_dates < date_start) | (jittered_dates > date_end)
jitter_days = np.where(outside_window, -jitter_days, jitter_days)
data['Discharge_Date'] = data['Discharge_Date'] + pd.to_timedelta(jitter_days, unit='D')

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv('synthetic_healthcare_data.csv', index=False)