# Fraser Health Synthetic Data Generation - Onboarding Notebook

This notebook serves as an interactive controller for generating synthetic health data for Fraser Health (British Columbia) using the Synthea simulator.

**Key Features:**
- Uses existing geography/provider files from synthea-international (ca folder)
- No external Map APIs or file generation from scratch
- Programmatically reads and utilizes existing schemas
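- Targets Fraser Health cities: Surrey, Burnaby, New Westminster, Coquitlam (reference data is filtered for all four; the simulation in Section 3.2 runs for the first city in the list)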

**Prerequisites:**
- Java 11 or higher
- Git (used to clone the Synthea repositories in Section 1.3)
- Python 3.7+
- pandas, matplotlib, and numpy (installed in Section 1.1)

## Section 1: Environment Setup & Repo Inspection

### 1.1 Install Required Python Dependencies

In [None]:
import sys
import subprocess

# Install required packages
!pip install pandas matplotlib numpy ipywidgets

### 1.2 Check Java Installation

In [None]:
import subprocess
import re

def check_java_version():
    """Report whether a Java runtime of version 11 or newer is on PATH.

    Runs ``java -version``, echoes its output, and parses the major version
    from the ``version "..."`` banner.  Both the modern numbering
    (``"17.0.2"`` -> 17) and the legacy ``1.x`` numbering
    (``"1.8.0_292"`` -> 8) are understood.

    Returns:
        bool: True when the detected major version is at least 11; False when
        it is older, cannot be parsed, or ``java`` cannot be executed.
    """
    try:
        result = subprocess.run(['java', '-version'],
                                capture_output=True,
                                text=True,
                                check=True)
    except FileNotFoundError:
        print("✗ Java is not installed or not in PATH")
        print("Please install Java 11 or higher from: https://adoptium.net/")
        return False
    except (subprocess.SubprocessError, OSError) as e:
        # Covers a non-zero exit status (check=True) and other launch failures.
        print(f"✗ Error checking Java version: {e}")
        return False

    version_output = result.stderr  # Java outputs version to stderr
    print("Java version output:")
    print(version_output)

    # Capture the first two numeric components so "1.8" can be mapped to 8.
    version_match = re.search(r'version "(\d+)(?:\.(\d+))?', version_output)
    if not version_match:
        print("\n⚠ Could not determine Java version")
        return False

    major_version = int(version_match.group(1))
    if major_version == 1 and version_match.group(2) is not None:
        # Java 8 and earlier report themselves as 1.<major>.
        major_version = int(version_match.group(2))

    if major_version >= 11:
        print(f"\n✓ Java {major_version} is installed (meets requirement: Java 11+)")
        return True

    print(f"\n✗ Java {major_version} is installed but version 11+ is required")
    return False

# Run the check once; the simulation cell reads `java_ok` before launching Synthea.
java_ok = check_java_version()

### 1.3 Clone Synthea Repositories

In [None]:
import os
from pathlib import Path

# Define repository paths
# Every path is anchored at the current working directory of the kernel.
WORK_DIR = Path.cwd()
SYNTHEA_DIR = WORK_DIR / "synthea"
SYNTHEA_INTERNATIONAL_DIR = WORK_DIR / "synthea-international"
CONFIG_DIR = WORK_DIR / "config"

# Create config directory
# exist_ok=True makes re-running this cell harmless.
CONFIG_DIR.mkdir(exist_ok=True)

def clone_repo_if_missing(repo_url, target_dir, repo_name):
    """Clone a git repository unless the target directory already exists.

    Args:
        repo_url: URL passed to ``git clone``.
        target_dir: ``pathlib.Path`` of the checkout location.  If it exists
            (whatever its contents) no clone is attempted.
        repo_name: Human-readable name used in progress messages.

    Returns:
        bool: True when the directory already exists or the clone succeeded;
        False when ``git`` is unavailable or the clone command failed.
    """
    if target_dir.exists():
        print(f"✓ {repo_name} already exists at {target_dir}")
        return True

    print(f"Cloning {repo_name} from {repo_url}...")
    try:
        subprocess.run(
            ['git', 'clone', repo_url, str(target_dir)],
            capture_output=True,
            text=True,
            check=True
        )
    except FileNotFoundError:
        # Raised when the `git` executable itself cannot be found.
        print(f"✗ Error cloning {repo_name}: git is not installed or not in PATH")
        return False
    except subprocess.CalledProcessError as e:
        print(f"✗ Error cloning {repo_name}: {e.stderr}")
        return False

    print(f"✓ Successfully cloned {repo_name}")
    return True

# Clone repositories (each call is a no-op when the checkout already exists)
print("Checking for required repositories...\n")
synthea_ok = clone_repo_if_missing(
    repo_url="https://github.com/synthetichealth/synthea.git",
    target_dir=SYNTHEA_DIR,
    repo_name="Synthea",
)

international_ok = clone_repo_if_missing(
    repo_url="https://github.com/synthetichealth/synthea-international.git",
    target_dir=SYNTHEA_INTERNATIONAL_DIR,
    repo_name="Synthea-International",
)

# Report a single overall status line for both repositories.
print(
    "\n✓ All repositories are ready!"
    if synthea_ok and international_ok
    else "\n✗ Some repositories failed to clone"
)

### 1.4 Build Synthea JAR (if needed)

In [None]:
import os

def check_or_build_synthea_jar():
    """Return the Synthea uber-JAR path, building it with Gradle when absent.

    Looks for ``build/libs/synthea-with-dependencies.jar`` under
    ``SYNTHEA_DIR``.  When it is missing, marks ``gradlew`` executable and
    runs ``./gradlew uberJar`` there with a 10-minute timeout.

    Returns:
        The ``Path`` of the JAR when it exists or was built successfully,
        otherwise None (missing wrapper, failed build, or any exception).
    """
    jar_location = SYNTHEA_DIR.joinpath("build", "libs", "synthea-with-dependencies.jar")
    if jar_location.exists():
        print(f"✓ Synthea JAR already exists at {jar_location}")
        return jar_location

    print("Building Synthea JAR (this may take several minutes)...")
    try:
        # The Gradle wrapper script must be present in the checkout.
        wrapper = SYNTHEA_DIR / "gradlew"
        if not wrapper.exists():
            print("✗ gradlew not found in Synthea directory")
            return None

        # Ensure the wrapper can be executed.
        os.chmod(str(wrapper), 0o755)

        build = subprocess.run(
            ['./gradlew', 'uberJar'],
            cwd=str(SYNTHEA_DIR),
            capture_output=True,
            text=True,
            timeout=600,  # 10 minute timeout
        )

        build_succeeded = build.returncode == 0 and jar_location.exists()
        if not build_succeeded:
            print(f"✗ Build failed with return code {build.returncode}")
            print("Error output:", build.stderr[-1000:])  # Last 1000 chars
            return None

        print(f"✓ Successfully built Synthea JAR at {jar_location}")
        return jar_location
    except Exception as err:
        print(f"✗ Error building Synthea: {err}")
        return None

# Note: Building can take 5-10 minutes on first run
# `synthea_jar` is a Path on success and None otherwise; the simulation cell reads it.
print("Checking Synthea JAR...")
synthea_jar = check_or_build_synthea_jar()

if synthea_jar:
    print(f"\nSynthea JAR location: {synthea_jar}")
else:
    # Point the user at the manual build command when the automatic build did not produce a JAR.
    print("\n⚠ JAR build may be needed. You can build manually with:")
    print("  cd synthea && ./gradlew uberJar")

### 1.5 Asset Discovery: Find Canada Data in Synthea-International

In [None]:
import pandas as pd
from pathlib import Path

def discover_canada_assets():
    """Locate the Canada CSV assets inside the synthea-international checkout.

    Checks a fixed set of geography and provider files under
    ``ca/src/main/resources`` and prints a found / not-found line for each.

    Returns:
        dict: asset key (e.g. ``'zipcodes'``) -> ``Path`` for every file that
        exists; an empty dict when the Canada resources directory is missing.
    """
    print("Searching for Canada assets in synthea-international...\n")

    # Canada data is in ca/src/main/resources/ directory
    resources_root = SYNTHEA_INTERNATIONAL_DIR.joinpath("ca", "src", "main", "resources")
    if not resources_root.exists():
        print(f"✗ Canada directory not found at {resources_root}")
        return {}

    print(f"✓ Found Canada directory: {resources_root.relative_to(SYNTHEA_INTERNATIONAL_DIR)}\n")

    # Expected file locations, grouped by the sub-directory they live in.
    geography = resources_root / "geography"
    providers = resources_root / "providers"
    expected = {
        'zipcodes': geography / "zipcodes_ca.csv",
        'demographics': geography / "demographics_ca.csv",
        'timezones': geography / "timezones_ca.csv",
        'hospitals': providers / "hospitals_ca.csv",
        'primary_care': providers / "primary_care_facilities_ca.csv",
        'urgent_care': providers / "urgent_care_facilities_ca.csv",
    }

    print("Found asset files:")
    found = {}
    for key, candidate in expected.items():
        if not candidate.exists():
            print(f"  ✗ {key.upper()}: Not found")
            continue
        print(f"  ✓ {key.upper()}: {candidate.relative_to(SYNTHEA_INTERNATIONAL_DIR)}")
        found[key] = candidate

    return found

# Mapping of asset key -> Path for the Canada files that exist; read by the loading cell below.
canada_assets = discover_canada_assets()

### 1.6 Load and Display Existing Data Schemas

The Canada data uses the following schema:
- **zipcodes_ca.csv**: Columns include USPS, ST, NAME (city), ZCTA5, LAT, LON
- **demographics_ca.csv**: Population demographics by city and province
- **hospitals_ca.csv**: Hospital locations with columns: id, name, address, city, state, zip, LAT, LON

In [None]:
def load_and_display_asset(file_path, asset_name):
    """Read a CSV file and print a short schema report for it.

    Args:
        file_path: Location of the CSV file to read.
        asset_name: Label used in the printed banner and in error messages.

    Returns:
        The loaded ``pandas.DataFrame``, or None when reading or reporting
        raised an exception (the error is printed, not re-raised).
    """
    rule = '=' * 70
    print(f"\n{rule}")
    print(f"Loading: {asset_name}")
    print(f"File: {file_path}")
    print(f"{rule}\n")

    frame = None
    try:
        loaded = pd.read_csv(file_path)
        row_count = len(loaded)
        column_names = loaded.columns

        print(f"Number of rows: {row_count}")
        print(f"Number of columns: {len(column_names)}\n")

        print("Column Names and Data Types:")
        for name in column_names:
            print(f"  - {name}: {loaded[name].dtype}")

        print("\nFirst 5 rows:")
        preview = loaded.head(5)
        print(preview.to_string())

        frame = loaded
    except Exception as e:
        print(f"✗ Error loading {asset_name}: {e}")
        frame = None
    return frame

# Load each asset type; a frame stays None when its file was not discovered.
zipcodes_df = (
    load_and_display_asset(canada_assets['zipcodes'], "Zipcodes/Geography Data")
    if 'zipcodes' in canada_assets
    else None
)

demographics_df = (
    load_and_display_asset(canada_assets['demographics'], "Demographics Data")
    if 'demographics' in canada_assets
    else None
)

hospitals_df = (
    load_and_display_asset(canada_assets['hospitals'], "Hospitals Data")
    if 'hospitals' in canada_assets
    else None
)

## Section 2: Configuration & Filtering

### 2.1 Configuration Parameters

In [None]:
# ========== CONFIGURATION ==========
# Modify these parameters as needed

# TARGET_REGION is compared against the demographics STNAME column;
# TARGET_STATE_CODE is compared against the zipcodes ST and hospitals state columns.
TARGET_REGION = "British Columbia"
TARGET_STATE_CODE = "BC"  # British Columbia's abbreviation
# City names are matched exactly (case-sensitive) by the filtering cells.
TARGET_CITIES = ['Surrey', 'Burnaby', 'New Westminster', 'Coquitlam']

# Simulation parameters
# Passed to Synthea as the -p (population) and -s (seed) options.
POPULATION_SIZE = 100
RANDOM_SEED = 12345

print("Configuration:")
print(f"  Target Region: {TARGET_REGION} ({TARGET_STATE_CODE})")
print(f"  Target Cities: {', '.join(TARGET_CITIES)}")
print(f"  Population Size: {POPULATION_SIZE}")
print(f"  Random Seed: {RANDOM_SEED}")

### 2.2 Filter and Prepare Zipcodes/Geography Data

In [None]:
def prepare_zipcodes_data(df, state_code, cities):
    """Filter zipcodes data down to the target state and cities.

    Args:
        df: Zipcodes ``DataFrame`` with at least ``ST`` (state abbreviation)
            and ``NAME`` (city name) columns, or None.
        state_code: Abbreviation to match exactly against ``ST``.
        cities: City names to match exactly against ``NAME``.

    Returns:
        A copy of the matching rows, or None when ``df`` is None, a required
        column is missing, the state has no rows, or no target city matches.
        In the last case, near-miss city names are printed as hints.
    """
    if df is None:
        print("⚠ No zipcodes data loaded")
        return None

    # Fail with a readable message rather than a KeyError on unexpected schemas.
    missing = [col for col in ('ST', 'NAME') if col not in df.columns]
    if missing:
        print(f"⚠ Zipcodes data is missing expected column(s): {missing}")
        print(f"Available columns: {list(df.columns)}")
        return None

    print(f"Original zipcodes data: {len(df)} rows\n")

    # Filter by state (ST column contains state abbreviation like 'BC')
    filtered_df = df[df['ST'] == state_code].copy()
    print(f"After filtering by state '{state_code}': {len(filtered_df)} rows")

    if len(filtered_df) == 0:
        print(f"⚠ No rows found for state '{state_code}'")
        print(f"Available states: {df['ST'].unique()[:10]}")
        return None

    # Check which cities exist (NAME column contains city name).
    # Missing names are dropped so they cannot break the substring search below.
    existing_cities = filtered_df['NAME'].dropna().unique()
    print(f"\nTotal cities in {state_code}: {len(existing_cities)}")
    print(f"Sample cities: {list(existing_cities[:20])}\n")

    # Filter by target cities
    city_filtered = filtered_df[filtered_df['NAME'].isin(cities)].copy()
    print(f"After filtering by target cities: {len(city_filtered)} rows")

    if len(city_filtered) == 0:
        print(f"\n⚠ None of the target cities found in existing data")
        print(f"Target cities: {cities}")
        print(f"\nSearching for similar city names...")
        for city in cities:
            needle = str(city).lower()
            # str() guards against non-string values in the NAME column.
            matches = [c for c in existing_cities if needle in str(c).lower()]
            if matches:
                print(f"  '{city}' - Found similar: {matches[:5]}")
        return None

    print(f"\n✓ Found {len(city_filtered)} zipcode entries for target cities:")
    for city in cities:
        city_count = len(city_filtered[city_filtered['NAME'] == city])
        if city_count > 0:
            print(f"  - {city}: {city_count} entries")

    print(f"\nFiltered zipcodes data (first 10 rows):")
    print(city_filtered.head(10).to_string())

    return city_filtered

# Filtered zipcode rows (or None); read later by save_prepared_data and the final summary.
prepared_zipcodes = prepare_zipcodes_data(zipcodes_df, TARGET_STATE_CODE, TARGET_CITIES)

### 2.3 Filter and Prepare Demographics Data

In [None]:
def prepare_demographics_data(df, region, cities):
    """Filter demographics data down to the target province and cities.

    Args:
        df: Demographics ``DataFrame`` with an ``STNAME`` (province name)
            column and optionally a ``NAME`` (city) column, or None.
        region: Province name to match exactly against ``STNAME``.
        cities: City names to match exactly against ``NAME``.

    Returns:
        A copy of the matching rows.  When the province has rows but none of
        the target cities match (or there is no ``NAME`` column), all rows
        for the province are returned.  None when ``df`` is None, ``STNAME``
        is missing, or the province has no rows.
    """
    if df is None:
        print("⚠ No demographics data loaded")
        return None

    # Fail with a readable message rather than a KeyError on unexpected schemas.
    if 'STNAME' not in df.columns:
        print("⚠ Demographics data is missing expected column 'STNAME'")
        print(f"Available columns: {list(df.columns)}")
        return None

    print(f"Original demographics data: {len(df)} rows\n")

    # Filter by province (STNAME column)
    filtered_df = df[df['STNAME'] == region].copy()
    print(f"After filtering by province '{region}': {len(filtered_df)} rows")

    if len(filtered_df) == 0:
        print(f"⚠ No rows found for province '{region}'")
        print(f"Available provinces: {df['STNAME'].unique()[:10]}")
        return None

    # Filter by cities if NAME column exists
    if 'NAME' in filtered_df.columns:
        city_filtered = filtered_df[filtered_df['NAME'].isin(cities)].copy()
        if len(city_filtered) > 0:
            filtered_df = city_filtered
            print(f"After filtering by target cities: {len(filtered_df)} rows")
        else:
            # Make the fallback visible instead of silently widening the result.
            print(f"⚠ None of the target cities found; keeping all {len(filtered_df)} rows for '{region}'")

    print(f"\nFiltered demographics data (first 10 rows):")
    print(filtered_df.head(10).to_string())

    return filtered_df

# Filtered demographics rows (or None); read later by save_prepared_data and the final summary.
prepared_demographics = prepare_demographics_data(demographics_df, TARGET_REGION, TARGET_CITIES)

### 2.4 Filter and Prepare Hospitals Data

In [None]:
def prepare_hospitals_data(df, state_code, cities):
    """Filter hospitals data down to the target state and cities.

    Args:
        df: Hospitals ``DataFrame`` with at least ``state`` and ``city``
            columns, or None.
        state_code: Abbreviation to match exactly against ``state``.
        cities: City names to match exactly against ``city``.

    Returns:
        A copy of the matching rows, or None when ``df`` is None, a required
        column is missing, the state has no rows, or no target city has a
        hospital.
    """
    if df is None:
        print("⚠ No hospitals data loaded")
        return None

    # Fail with a readable message rather than a KeyError on unexpected schemas.
    missing = [col for col in ('state', 'city') if col not in df.columns]
    if missing:
        print(f"⚠ Hospitals data is missing expected column(s): {missing}")
        print(f"Available columns: {list(df.columns)}")
        return None

    print(f"Original hospitals data: {len(df)} rows\n")

    # Filter by state
    filtered_df = df[df['state'] == state_code].copy()
    print(f"After filtering by state '{state_code}': {len(filtered_df)} rows")

    if len(filtered_df) == 0:
        print(f"⚠ No rows found for state '{state_code}'")
        return None

    # Filter by cities
    city_filtered = filtered_df[filtered_df['city'].isin(cities)].copy()
    print(f"After filtering by target cities: {len(city_filtered)} rows")

    if len(city_filtered) == 0:
        print(f"⚠ No hospitals found for target cities")
        return None

    print(f"\n✓ Found {len(city_filtered)} hospitals for target cities:")
    for city in cities:
        city_count = len(city_filtered[city_filtered['city'] == city])
        if city_count > 0:
            print(f"  - {city}: {city_count} hospitals")

    # Only display the summary columns that are actually present.
    display_cols = [col for col in ('name', 'city', 'state', 'zip')
                    if col in city_filtered.columns]
    print(f"\nFiltered hospitals data:")
    print(city_filtered[display_cols].to_string())

    return city_filtered

# Filtered hospital rows (or None); read later by save_prepared_data and the final summary.
prepared_hospitals = prepare_hospitals_data(hospitals_df, TARGET_STATE_CODE, TARGET_CITIES)

### 2.5 Save Prepared Data to Config Directory

In [None]:
def save_prepared_data():
    """Write the filtered DataFrames to CSV files in ``CONFIG_DIR``.

    Reads the module-level ``prepared_zipcodes``, ``prepared_demographics``
    and ``prepared_hospitals`` frames.  A frame that is None or has no rows
    is skipped.

    Returns:
        list: ``Path`` objects of the files written, in the order
        zipcodes, demographics, hospitals.
    """
    print("Saving prepared data to config directory...\n")

    # Make the function usable even if the directory was removed after setup.
    CONFIG_DIR.mkdir(parents=True, exist_ok=True)

    # (output file name, frame) pairs; order determines the order of the result.
    outputs = (
        ("zipcodes_ca.csv", prepared_zipcodes),
        ("demographics_ca.csv", prepared_demographics),
        ("hospitals_ca.csv", prepared_hospitals),
    )

    saved_files = []
    for filename, frame in outputs:
        if frame is None or len(frame) == 0:
            continue
        out_path = CONFIG_DIR / filename
        frame.to_csv(out_path, index=False)
        print(f"✓ Saved {filename} ({len(frame)} rows)")
        saved_files.append(out_path)

    print(f"\n✓ Saved {len(saved_files)} file(s) to {CONFIG_DIR}")
    return saved_files

# Paths of the CSV files just written; the deployment cell iterates over this list.
saved_config_files = save_prepared_data()

## Section 3: Simulation Execution

### 3.1 Deploy Configuration Files to Synthea

In [None]:
import shutil

def deploy_config_to_synthea():
    """Copy the prepared CSV files into the Synthea resources tree.

    Files whose name contains ``hospital`` or ``provider`` go to
    ``src/main/resources/providers``; everything else goes to
    ``src/main/resources/geography``.  An existing target file is backed up
    to ``<name>.backup`` once; an existing backup is never overwritten, so
    re-running the cell cannot replace the pristine copy with a previously
    deployed file.

    Returns:
        bool: False when the Synthea resources directory does not exist,
        True otherwise (including when there was nothing to deploy).
    """
    # NOTE(review): the simulation runs a prebuilt JAR; files copied here after
    # the JAR was built may not be picked up until it is rebuilt - confirm.
    resources_dir = SYNTHEA_DIR / "src" / "main" / "resources"
    geography_dir = resources_dir / "geography"
    providers_dir = resources_dir / "providers"

    if not resources_dir.exists():
        print(f"✗ Synthea resources directory not found: {resources_dir}")
        return False

    # Create subdirectories if needed
    geography_dir.mkdir(exist_ok=True)
    providers_dir.mkdir(exist_ok=True)

    print(f"Deploying configuration files to: {resources_dir}\n")

    deployed = []
    for config_file in saved_config_files:
        # Determine target directory based on file type
        if 'hospital' in config_file.name or 'provider' in config_file.name:
            target_dir = providers_dir
        else:
            target_dir = geography_dir

        target_file = target_dir / config_file.name

        # Backup existing file if present, but keep the first backup intact.
        if target_file.exists():
            backup_file = target_file.with_name(target_file.name + '.backup')
            if backup_file.exists():
                print(f"⚠ Keeping existing backup {backup_file.name} (not overwritten)")
            else:
                shutil.copy2(target_file, backup_file)
                print(f"⚠ Backed up existing {config_file.name} to {backup_file.name}")

        # Copy new file
        shutil.copy2(config_file, target_file)
        print(f"✓ Deployed {config_file.name} to {target_dir.name}/")
        deployed.append(target_file)

    print(f"\n✓ Deployed {len(deployed)} file(s) to Synthea resources")
    return True

# The simulation cell runs only when this flag (and `java_ok`) is truthy.
deployment_ok = deploy_config_to_synthea()

### 3.2 Run Synthea Simulation

In [None]:
def run_synthea_simulation(population, seed, state, city=None, export_csv=True, timeout=600):
    """Execute the Synthea JAR with the given population, seed and location.

    Args:
        population: Number of patients, passed as ``-p``.
        seed: Random seed, passed as ``-s``.
        state: State/province positional argument for Synthea.
        city: Optional city positional argument.
        export_csv: When True (default), pass ``--exporter.csv.export=true``.
            The validation cells read ``output/csv/*.csv``, and Synthea ships
            with the CSV exporter disabled in ``synthea.properties``.
        timeout: Seconds to wait for the process before giving up.

    Returns:
        bool: True when the process exits with status 0; False when the JAR
        is missing, the process fails, times out, or cannot be started.
    """
    if synthea_jar is None or not synthea_jar.exists():
        print("✗ Synthea JAR not found. Please build it first.")
        return False

    # Construct command
    cmd = [
        'java',
        '-jar', str(synthea_jar),
        '-p', str(population),
        '-s', str(seed),
    ]
    if export_csv:
        cmd.append('--exporter.csv.export=true')

    # Add location parameters
    cmd.append(state)
    if city:
        cmd.append(city)

    print("Running Synthea simulation...")
    print(f"Command: {' '.join(cmd)}\n")
    print("This may take several minutes depending on population size...\n")

    try:
        result = subprocess.run(
            cmd,
            cwd=str(SYNTHEA_DIR),  # Synthea writes ./output relative to its working directory
            capture_output=True,
            text=True,
            timeout=timeout
        )
    except subprocess.TimeoutExpired:
        print(f"✗ Simulation timed out after {timeout / 60:g} minutes")
        return False
    except Exception as e:
        print(f"✗ Error running simulation: {e}")
        return False

    # Show last part of output
    if result.stdout:
        output_lines = result.stdout.split('\n')
        print("Simulation output (last 20 lines):")
        print('\n'.join(output_lines[-20:]))

    if result.returncode == 0:
        print("\n✓ Simulation completed successfully!")
        return True

    print(f"\n✗ Simulation failed with return code {result.returncode}")
    if result.stderr:
        print("Error output:")
        print(result.stderr[-1000:])
    return False

# Run simulation for first target city
# Only the first entry of TARGET_CITIES is simulated; the others are used for filtering only.
if deployment_ok and java_ok:
    primary_city = TARGET_CITIES[0] if TARGET_CITIES else None
    # The Synthea command line takes the state/province by name (e.g.
    # "Massachusetts"), so pass TARGET_REGION rather than the "BC" abbreviation.
    simulation_ok = run_synthea_simulation(
        POPULATION_SIZE,
        RANDOM_SEED,
        TARGET_REGION,
        primary_city
    )
else:
    print("⚠ Skipping simulation - prerequisites not met")
    simulation_ok = False

## Section 4: Validation

### 4.1 Load Generated Patient Data

In [None]:
def load_generated_patients():
    """Read ``output/csv/patients.csv`` from the Synthea directory.

    Prints the row count, the column list and the first five rows.

    Returns:
        The patients ``DataFrame``, or None when the file does not exist or
        loading/reporting raised an exception.
    """
    csv_file = SYNTHEA_DIR.joinpath("output", "csv", "patients.csv")

    if not csv_file.exists():
        print(f"✗ Patient data not found at: {csv_file}")
        print("Please run the simulation first.")
        return None

    print(f"Loading generated patient data from: {csv_file}\n")

    loaded = None
    try:
        patients = pd.read_csv(csv_file)
        column_list = patients.columns.tolist()
        print(f"✓ Loaded {len(patients)} patients")
        print(f"\nColumns: {column_list}\n")

        print("First 5 patients:")
        preview = patients.head()
        print(preview.to_string())

        loaded = patients
    except Exception as e:
        print(f"✗ Error loading patient data: {e}")
        loaded = None
    return loaded

# Only read the output when the simulation reported success; otherwise leave
# `patients_df` as None so the later validation cells skip their work.
if simulation_ok:
    patients_df = load_generated_patients()
else:
    print("⚠ Skipping validation - simulation not completed")
    patients_df = None

### 4.2 Verify Location Data

In [None]:
def verify_location_data(df):
    """Print how the generated patients are distributed over cities and states.

    Looks for a ``CITY``/``city`` and a ``STATE``/``state`` column.  For the
    city column it reports the count per ``TARGET_CITIES`` entry; for the
    state column it reports whether ``TARGET_REGION`` appears (exact match or
    case-insensitive substring).  Does nothing when ``df`` is None.
    """
    if df is None:
        return

    print("Verifying location data...\n")

    def first_present(*candidates):
        # First candidate that is a column of df, else None.
        return next((name for name in candidates if name in df.columns), None)

    # Check for location columns
    city_col = first_present('CITY', 'city')
    state_col = first_present('STATE', 'state')

    if city_col:
        city_counts = df[city_col].value_counts()
        print(f"Cities in generated data:")
        print(city_counts)
        print()

        # Check if target cities are present
        for wanted in TARGET_CITIES:
            if wanted not in city_counts.index:
                print(f"✗ {wanted}: Not found in generated data")
            else:
                print(f"✓ {wanted}: {city_counts[wanted]} patients")

    if state_col:
        print(f"\nStates/Provinces in generated data:")
        state_counts = df[state_col].value_counts()
        print(state_counts)

        region_lower = TARGET_REGION.lower()
        region_seen = TARGET_REGION in state_counts.index or any(
            region_lower in str(label).lower() for label in state_counts.index
        )
        if not region_seen:
            print(f"\n⚠ {TARGET_REGION} may not be in generated data")
        else:
            print(f"\n✓ {TARGET_REGION} found in generated data")

# Skip verification entirely when no patient data was loaded.
if patients_df is not None:
    verify_location_data(patients_df)

### 4.3 Load and Analyze Encounters Data

In [None]:
def load_generated_encounters():
    """Read ``output/csv/encounters.csv`` from the Synthea directory.

    Returns:
        The encounters ``DataFrame``, or None when the file does not exist or
        reading it raised an exception.
    """
    csv_file = SYNTHEA_DIR.joinpath("output", "csv", "encounters.csv")

    if not csv_file.exists():
        print(f"⚠ Encounters data not found at: {csv_file}")
        return None

    loaded = None
    try:
        encounters = pd.read_csv(csv_file)
        print(f"✓ Loaded {len(encounters)} encounters")
        loaded = encounters
    except Exception as e:
        print(f"✗ Error loading encounters data: {e}")
        loaded = None
    return loaded

# Mirror the patients cell: leave `encounters_df` as None unless the simulation succeeded.
if simulation_ok:
    encounters_df = load_generated_encounters()
else:
    encounters_df = None

### 4.4 Visualize Patient Age vs. Encounter Type

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime

def calculate_age(birthdate_str):
    """Return the age in whole years, as of today, for a birthdate.

    The age is computed by calendar comparison (has this year's birthday
    passed yet?) rather than by dividing a day count by 365, which drifts
    with leap days and can report a birthday one or more days early.

    Args:
        birthdate_str: A date string (or other value accepted by
            ``pandas.to_datetime``).

    Returns:
        int: Completed years between the birthdate and today, or None when
        the value is missing or cannot be parsed as a single date.
    """
    try:
        birthdate = pd.to_datetime(birthdate_str)
    except (ValueError, TypeError, OverflowError):
        return None

    # Rejects None, NaT and non-scalar results in one check.
    if not isinstance(birthdate, pd.Timestamp):
        return None

    today = pd.Timestamp.now()
    had_birthday_this_year = (today.month, today.day) >= (birthdate.month, birthdate.day)
    return today.year - birthdate.year - (0 if had_birthday_this_year else 1)

def plot_age_vs_encounter_type(patients_df, encounters_df):
    """Draw a 2x2 validation dashboard for the generated data.

    Panels: patient age histogram, encounter-class bar chart, box plot of
    patient age for the five most frequent encounter classes, and a text
    panel of summary statistics.  Panels whose input columns are missing are
    left empty instead of raising.

    Ages come from ``calculate_age`` (age as of today, not at encounter
    time).  The input frames are not modified.

    Args:
        patients_df: Patients ``DataFrame`` (``BIRTHDATE``/``birthdate`` and
            ``Id``/``ID``/``id`` columns are used when present), or None.
        encounters_df: Encounters ``DataFrame`` (``PATIENT``/``patient`` and
            ``ENCOUNTERCLASS``/``encounterclass`` are used when present),
            or None.
    """
    if patients_df is None or encounters_df is None:
        print("⚠ Missing data for visualization")
        return

    print("Creating visualization...\n")

    # Work on a copy so the caller's frame does not silently gain an AGE column.
    patients = patients_df.copy()

    # Calculate ages
    birthdate_col = 'BIRTHDATE' if 'BIRTHDATE' in patients.columns else 'birthdate'
    if birthdate_col in patients.columns:
        patients['AGE'] = patients[birthdate_col].apply(calculate_age)
    has_age = 'AGE' in patients.columns
    if has_age:
        # Unparseable birthdates yield None; coerce so min/max/mean work.
        patients['AGE'] = pd.to_numeric(patients['AGE'], errors='coerce')

    # Merge patients with encounters (only when every join column is present)
    patient_id_col = next((c for c in ('Id', 'ID', 'id') if c in patients.columns), None)
    encounter_patient_col = next(
        (c for c in ('PATIENT', 'patient') if c in encounters_df.columns), None
    )
    if has_age and patient_id_col and encounter_patient_col:
        merged_df = encounters_df.merge(
            patients[['AGE', patient_id_col]],
            left_on=encounter_patient_col,
            right_on=patient_id_col,
            how='left'
        )
    else:
        merged_df = encounters_df

    encounter_type_col = 'ENCOUNTERCLASS' if 'ENCOUNTERCLASS' in encounters_df.columns else 'encounterclass'
    has_type = encounter_type_col in encounters_df.columns

    # Create figure with multiple subplots
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Fraser Health Synthetic Data - Validation Dashboard', fontsize=16, fontweight='bold')

    # Plot 1: Age distribution
    if has_age:
        axes[0, 0].hist(patients['AGE'].dropna(), bins=20, color='skyblue', edgecolor='black')
        axes[0, 0].set_xlabel('Age (years)')
        axes[0, 0].set_ylabel('Number of Patients')
        axes[0, 0].set_title('Patient Age Distribution')
        axes[0, 0].grid(True, alpha=0.3)

    # Plot 2: Encounter type distribution
    if has_type:
        encounter_counts = encounters_df[encounter_type_col].value_counts()
        axes[0, 1].bar(range(len(encounter_counts)), encounter_counts.values, color='lightcoral')
        axes[0, 1].set_xticks(range(len(encounter_counts)))
        axes[0, 1].set_xticklabels(encounter_counts.index, rotation=45, ha='right')
        axes[0, 1].set_ylabel('Number of Encounters')
        axes[0, 1].set_title('Encounter Type Distribution')
        axes[0, 1].grid(True, alpha=0.3, axis='y')

    # Plot 3: Age vs Encounter Type (box plot)
    if 'AGE' in merged_df.columns and has_type:
        # value_counts() sorts by frequency, so this really is the top five.
        encounter_types = merged_df[encounter_type_col].value_counts().index[:5]
        if len(encounter_types) > 0:
            age_data = [
                merged_df.loc[merged_df[encounter_type_col] == et, 'AGE'].dropna()
                for et in encounter_types
            ]
            # Tick labels are set explicitly below, so boxplot needs no labels argument.
            axes[1, 0].boxplot(age_data)
            axes[1, 0].set_xticklabels(encounter_types, rotation=45, ha='right')
            axes[1, 0].set_ylabel('Age (years)')
            axes[1, 0].set_title('Age Distribution by Encounter Type')
            axes[1, 0].grid(True, alpha=0.3, axis='y')

    # Plot 4: Summary statistics
    if has_age and patients['AGE'].notna().any():
        age_range = f"{patients['AGE'].min():.0f} - {patients['AGE'].max():.0f} years"
        mean_age = f"{patients['AGE'].mean():.1f} years"
    else:
        age_range = mean_age = "n/a"
    unique_types = encounters_df[encounter_type_col].nunique() if has_type else "n/a"

    axes[1, 1].axis('off')
    summary_text = f"""
    SUMMARY STATISTICS
    ==================
    
    Total Patients: {len(patients)}
    Total Encounters: {len(encounters_df)}
    
    Age Range: {age_range}
    Mean Age: {mean_age}
    
    Unique Encounter Types: {unique_types}
    
    Configuration:
    - Region: {TARGET_REGION}
    - Cities: {', '.join(TARGET_CITIES)}
    - Population: {POPULATION_SIZE}
    - Seed: {RANDOM_SEED}
    """
    axes[1, 1].text(0.1, 0.5, summary_text,
                    fontsize=10,
                    verticalalignment='center',
                    fontfamily='monospace')

    plt.tight_layout()
    plt.show()

    print("✓ Visualization complete!")

# Both frames are None when the simulation was skipped or their CSV files were not found.
if patients_df is not None and encounters_df is not None:
    plot_age_vs_encounter_type(patients_df, encounters_df)
else:
    print("⚠ Cannot create visualization - missing data")

### 4.5 Final Summary

In [None]:
# Recap of every stage, built from the status flags and frames set by earlier cells.
# NOTE(review): the section headers print "✓" unconditionally; only the
# indented detail lines reflect the actual outcome of each step.
print("="*70)
print("FRASER HEALTH SYNTHETIC DATA GENERATION - SUMMARY")
print("="*70)
print()
print("✓ Environment Setup Complete")
print(f"  - Java: {'✓' if java_ok else '✗'}")
print(f"  - Synthea Repository: {'✓' if synthea_ok else '✗'}")
print(f"  - Synthea-International: {'✓' if international_ok else '✗'}")
print(f"  - Synthea JAR: {'✓' if synthea_jar else '✗'}")
print()
print("✓ Configuration")
print(f"  - Target Region: {TARGET_REGION} ({TARGET_STATE_CODE})")
print(f"  - Target Cities: {', '.join(TARGET_CITIES)}")
print(f"  - Population Size: {POPULATION_SIZE}")
print(f"  - Random Seed: {RANDOM_SEED}")
print()
print("✓ Data Preparation")
# A prepared frame is None when its source was missing or filtering found nothing.
print(f"  - Zipcodes Data: {'✓' if prepared_zipcodes is not None else '✗'}")
print(f"  - Demographics Data: {'✓' if prepared_demographics is not None else '✗'}")
print(f"  - Hospitals Data: {'✓' if prepared_hospitals is not None else '✗'}")
print()
print("✓ Simulation")
print(f"  - Execution: {'✓ Success' if simulation_ok else '✗ Not Run or Failed'}")
print()
print("✓ Validation")
# Counts fall back to 0 when the corresponding CSV was never loaded.
print(f"  - Patients Generated: {len(patients_df) if patients_df is not None else 0}")
print(f"  - Encounters Generated: {len(encounters_df) if encounters_df is not None else 0}")
print()
print("Output Location:")
print(f"  - {SYNTHEA_DIR / 'output' / 'csv'}")
print()
print("="*70)