# Healthcare Data Exploration

This notebook provides an initial exploration of the Synthea COVID-19 dataset and lays the foundation for subsequent data analysis.

## Objectives
1. Load and examine the Synthea dataset structure
2. Perform exploratory data analysis (EDA)
3. Identify data quality issues
4. Understand patient demographics and healthcare patterns
5. Prepare data for further analysis and modeling


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Suppress warnings globally to keep cell output compact.
# NOTE: this hides every warning category, including any emitted by the
# libraries imported above.
warnings.filterwarnings(action='ignore')

# Plot appearance: matplotlib style sheet plus the default seaborn palette
plt.style.use('seaborn-v0_8')
sns.set_palette(palette="husl")

# DataFrame display: show every column, and up to 100 rows
pd.set_option('display.max_columns', None, 'display.max_rows', 100)


In [None]:
# Define data paths (relative to the directory the notebook is run from)
DATA_PATH = Path('../data/synthea')
CCDA_PATH = Path('../data/ccda')
PROCESSED_PATH = Path('../data/processed')

print("Data directories:")
print(f"Synthea CSV data: {DATA_PATH}")
print(f"CCDA XML data: {CCDA_PATH}")
print(f"Processed data: {PROCESSED_PATH}")

# Flag missing input directories explicitly: globbing a non-existent
# directory just yields nothing, which would otherwise look like "0 files".
for input_dir in (DATA_PATH, CCDA_PATH):
    if not input_dir.is_dir():
        print(f"WARNING: directory not found: {input_dir.resolve()}")

# List available CSV files.
# Path.glob yields entries in arbitrary (filesystem-dependent) order, so sort
# to make the listing reproducible across runs and machines.
csv_files = sorted(DATA_PATH.glob('*.csv'))
print(f"\nAvailable CSV files ({len(csv_files)}):")
for file in csv_files:
    print(f"- {file.name}")

# List available XML files (only a small sample is printed).
# Sorted for the same reason as above, so the sample is always the same files.
xml_files = sorted(CCDA_PATH.glob('*.xml'))
print(f"\nAvailable XML files: {len(xml_files)} CCDA documents")
if xml_files:
    print("Sample files:")
    for file in xml_files[:3]:  # Show first 3
        print(f"- {file.name}")
    if len(xml_files) > 3:
        print(f"... and {len(xml_files) - 3} more")


In [None]:
# Load the main datasets (each table stays available as a top-level variable)
patients = pd.read_csv(DATA_PATH / 'patients.csv')
conditions = pd.read_csv(DATA_PATH / 'conditions.csv')
encounters = pd.read_csv(DATA_PATH / 'encounters.csv')
observations = pd.read_csv(DATA_PATH / 'observations.csv')
medications = pd.read_csv(DATA_PATH / 'medications.csv')
procedures = pd.read_csv(DATA_PATH / 'procedures.csv')
organizations = pd.read_csv(DATA_PATH / 'organizations.csv')

# Display label -> DataFrame, in load order. Both the shape report and the
# memory total are driven from this one mapping so they can never disagree
# about which tables are included.
loaded_frames = {
    "Patients": patients,
    "Conditions": conditions,
    "Encounters": encounters,
    "Observations": observations,
    "Medications": medications,
    "Procedures": procedures,
    "Organizations": organizations,
}

print("Dataset shapes:")
for label, frame in loaded_frames.items():
    print(f"{label}: {frame.shape}")

# Quick memory usage check over ALL loaded tables, converted from bytes to MB.
# deep=True asks pandas to measure the real size of object (e.g. string)
# columns rather than just the size of the references to them.
total_memory = sum(
    frame.memory_usage(deep=True).sum() for frame in loaded_frames.values()
) / 1024**2
print(f"\nTotal memory usage: {total_memory:.2f} MB")
