# Test ASE (Adult Sepsis Event) Implementation

This notebook tests the `compute_ase()` function using CLIFpy's demo data or your specified data directory.

In [None]:
# Setup: Import required packages
import sys
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Put the directory one level above the working directory at the front of
# sys.path so that `import clifpy` resolves to the local checkout.
project_root = Path.cwd().parent
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

print(f"Project root: {project_root}")
print(f"Using clifpy from: {project_root / 'clifpy'}")

In [None]:
# Import ASE functions
# compute_ase is the function exercised throughout this notebook;
# _get_demo_data_path is an underscore-prefixed helper used in the
# configuration cell below to locate the demo data directory.
from clifpy.utils.ase import compute_ase
from clifpy.data.loader import _get_demo_data_path

# Reached only if both imports above succeeded.
print("ASE module imported successfully!")

## 1. Configure Data Source

You can either use the demo data or specify your own data directory.

In [None]:
# Data source configuration.
#   Option 1 (default): use CLIFpy's demo data.
#   Option 2: set use_demo_data = False and point data_directory at your
#             own CLIF data directory.
use_demo_data = True

# File format of the CLIF tables: 'parquet', or 'csv' if your data is in CSV format
filetype = 'parquet'

if use_demo_data:
    data_directory = _get_demo_data_path()
    timezone = 'US/Eastern'      # Adjust based on your data's timezone
    print(f"Using demo data from: {data_directory}")
else:
    # Update this path to your CLIF data directory
    data_directory = "path/to/2.1.0"
    timezone = 'US/Central'      # Adjust based on your data's timezone
    print(f"Using custom data from: {data_directory}")



In [None]:
# List available data files
import os

if not os.path.exists(data_directory):
    print(f"Error: Data directory not found: {data_directory}")
    print("Please update the data_directory path above.")
else:
    # Keep only the parquet/csv entries, in alphabetical order
    data_files = sorted(
        name for name in os.listdir(data_directory)
        if name.endswith(('.parquet', '.csv'))
    )
    print(f"Available data files ({len(data_files)}):")

    preview_limit = 10  # Show first 10 files
    for name in data_files[:preview_limit]:
        size_mb = os.path.getsize(os.path.join(data_directory, name)) / (1024 * 1024)
        print(f"  - {name} ({size_mb:.2f} MB)")

    if len(data_files) > preview_limit:
        print(f"  ... and {len(data_files) - preview_limit} more files")

## 2. Get Hospitalization IDs for Cohort

Let's load the hospitalization data to get some IDs for testing.

In [None]:
# Load hospitalization data to get IDs.
# Both branches define `hospitalization_ids` (full cohort) and
# `test_hosp_ids` (subset used for testing) so later cells can rely on either.
hosp_file = os.path.join(data_directory, f"clif_hospitalization.{filetype}")

if os.path.exists(hosp_file):
    if filetype == 'parquet':
        hosp_df = pd.read_parquet(hosp_file)
    else:
        hosp_df = pd.read_csv(hosp_file)

    print(f"Hospitalization data shape: {hosp_df.shape}")
    print(f"Columns: {list(hosp_df.columns)}")

    # Get unique hospitalization IDs
    hospitalization_ids = hosp_df['hospitalization_id'].unique().tolist()
    print(f"\nTotal hospitalizations available: {len(hospitalization_ids)}")

    # Select a subset for testing (adjust as needed)
    test_hosp_ids = hospitalization_ids[:10]  # Test with first 10 hospitalizations
    print(f"Selected {len(test_hosp_ids)} hospitalizations for testing")
    print(f"Sample IDs: {test_hosp_ids[:5]}")
else:
    print(f"Hospitalization file not found: {hosp_file}")
    # Manual specification if file not found
    hospitalization_ids = ['hosp1', 'hosp2']  # Update with actual IDs
    test_hosp_ids = list(hospitalization_ids)

## 3. Run ASE Computation

Now let's compute ASE for the selected hospitalizations. This will identify Adult Sepsis Events based on CDC criteria.

In [None]:
%%time
# Run ASE computation
print("Starting ASE computation...")
print(f"Processing {len(hospitalization_ids)} hospitalizations")
print("="*60)

ase_results = compute_ase(
    hospitalization_ids=hospitalization_ids,
    data_directory=data_directory,
    filetype=filetype,
    timezone=timezone,
    apply_rit=True,           # Apply 14-day Repeat Infection Timeframe
    include_lactate=False,    # Include lactate criterion
    verbose=True              # Show detailed progress
)

print("\n" + "="*60)
print(f"‚úÖ ASE computation complete!")
print(f"Result shape: {ase_results.shape}")

## 4. Examine Results

Let's explore the ASE results in detail.

In [None]:
# Numbered listing of every column in the ASE result
print("Result columns:")
for position, column_name in enumerate(ase_results.columns, start=1):
    print(f"  {position:2}. {column_name}")

In [None]:
# Count unique hospitalization IDs that have at least one row with sepsis == 1
sepsis_rows = ase_results['sepsis'] == 1
unique_hosp_sepsis = ase_results.loc[sepsis_rows, 'hospitalization_id'].nunique()

# Count total unique hospitalization IDs
total_hosp = ase_results['hospitalization_id'].nunique()

# Display results
print(f"Unique hospitalizations with sepsis: {unique_hosp_sepsis:,}")
print(f"Total unique hospitalizations: {total_hosp:,}")

# The percentage is undefined for an empty result; avoid ZeroDivisionError
if total_hosp > 0:
    sepsis_percentage = (unique_hosp_sepsis / total_hosp) * 100
    print(f"Percentage with sepsis: {sepsis_percentage:.2f}%")
else:
    sepsis_percentage = float('nan')
    print("Percentage with sepsis: n/a (no hospitalizations in results)")

In [None]:
# Overall statistics
total_hosps = ase_results['hospitalization_id'].nunique()
# Series.count() already skips missing values, so this is the number of
# rows with a non-null bc_id
total_blood_cultures = ase_results['bc_id'].count()
total_sepsis_events = ase_results['sepsis'].sum()
total_sepsis_wo_lactate = ase_results['sepsis_wo_lactate'].sum()

print("=== ASE Summary Statistics ===")
print(f"Total hospitalizations processed: {total_hosps}")
print(f"Total blood cultures evaluated: {total_blood_cultures}")
print(f"Total ASE events (with lactate): {total_sepsis_events}")
print(f"Total ASE events (without lactate): {total_sepsis_wo_lactate}")

# Rates are only reported when at least one blood culture was evaluated
has_blood_cultures = total_blood_cultures > 0
if has_blood_cultures:
    print(f"\nASE rate per blood culture: {total_sepsis_events/total_blood_cultures:.1%}")
    print(f"ASE rate per hospitalization: {total_sepsis_events/total_hosps:.1%}")

In [None]:
# Rows of the result flagged as sepsis
ase_events = ase_results[ase_results['sepsis'] == 1]

if len(ase_events) == 0:
    print("No ASE events detected in this cohort.")
else:
    print(f"\n=== ASE Events ({len(ase_events)} total) ===")
    display_cols = [
        'hospitalization_id',
        'bc_id',
        'episode_id',
        'type',
        'ase_onset_w_lactate_dttm',
        'ase_first_criteria_w_lactate',
    ]
    # Preview at most the first 10 events
    print(ase_events[display_cols].head(10))

In [None]:
# Frequency of each recorded reason a row was not classified as sepsis
no_sepsis_reasons = ase_results['no_sepsis_reason'].value_counts()
total_rows = len(ase_results)

print("\n=== Reasons for No Sepsis ===")
for reason, count in no_sepsis_reasons.items():
    if pd.isna(reason):
        continue
    # Share is relative to all result rows, not only the non-sepsis ones
    print(f"  {reason}: {count} ({count/total_rows:.1%})")

In [None]:
# Look at presumed infection (Component A)
infection_flag = ase_results['presumed_infection']
presumed_infection_count = infection_flag.sum()

print("\n=== Component A: Presumed Infection ===")
print(f"Blood cultures with presumed infection: {presumed_infection_count}")
# total_blood_cultures comes from the summary-statistics cell
if total_blood_cultures > 0:
    print(f"Presumed infection rate: {presumed_infection_count/total_blood_cultures:.1%}")

# Show some presumed infections
presumed = ase_results[infection_flag == 1]
if len(presumed) > 0:
    print("\nSample presumed infections:")
    display_cols = [
        'hospitalization_id', 'bc_id', 'blood_culture_dttm',
        'total_qad', 'qad_start_date', 'qad_end_date',
    ]
    # Only show the columns that actually exist in the result
    available_cols = [name for name in display_cols if name in presumed.columns]
    print(presumed[available_cols].head())

In [None]:
# Count non-null timestamps for each organ dysfunction criterion
print("\n=== Component B: Organ Dysfunction Criteria ===")

organ_cols = [
    'vasopressor_dttm',
    'imv_dttm',
    'aki_dttm',
    'hyperbilirubinemia_dttm',
    'thrombocytopenia_dttm',
    'lactate_dttm',
]

for col in organ_cols:
    # Skip criteria whose column is absent from the result
    if col not in ase_results.columns:
        continue
    count = ase_results[col].notna().sum()
    organ_name = col.replace('_dttm', '').replace('_', ' ').title()
    print(f"  {organ_name}: {count} events")

## 5. Test Different Configurations

Let's test ASE with different configuration options.

In [None]:
# Test with lactate criterion included
print("Testing with lactate criterion included...")

ase_with_lactate = compute_ase(
    hospitalization_ids=test_hosp_ids,  # Use the selected subset for a quick test
    data_directory=data_directory,
    filetype=filetype,
    timezone=timezone,
    apply_rit=True,
    include_lactate=True,  # Include lactate
    verbose=False
)

# Event totals from the two sepsis flag columns of the same run
sepsis_with_lactate = ase_with_lactate['sepsis'].sum()
sepsis_without_lactate = ase_with_lactate['sepsis_wo_lactate'].sum()

print("\nResults with lactate criterion:")
print(f"  ASE events (with lactate): {sepsis_with_lactate}")
print(f"  ASE events (without lactate): {sepsis_without_lactate}")
print(f"  Additional events from lactate: {sepsis_with_lactate - sepsis_without_lactate}")

In [None]:
# Test without RIT (Repeat Infection Timeframe)
print("Testing without RIT filtering...")

ase_no_rit = compute_ase(
    hospitalization_ids=test_hosp_ids,  # Use the selected subset for a quick test
    data_directory=data_directory,
    filetype=filetype,
    timezone=timezone,
    apply_rit=False,  # No RIT filtering
    include_lactate=False,
    verbose=False
)

sepsis_no_rit = ase_no_rit['sepsis'].sum()

print("\nResults without RIT:")
print(f"  ASE events (no RIT): {sepsis_no_rit}")

## 6. Validation

Let's validate that the ASE results are consistent and correct.

In [None]:
# Validation checks: four consistency checks on `ase_results`, each printed
# with a pass/fail marker.
print("=== Validation Checks ===")
print()

# Check 1: All hospitalizations are present
result_hosps = set(ase_results['hospitalization_id'].unique())
input_hosps = set(test_hosp_ids)
missing = input_hosps - result_hosps
status = "✅" if len(missing) == 0 else "❌"
print(f"{status} All hospitalizations processed: {len(missing) == 0}")
if missing:
    print(f"   Missing: {missing}")

# Check 2: Sepsis requires presumed infection
invalid_sepsis = ase_results[(ase_results['sepsis'] == 1) & (ase_results['presumed_infection'] != 1)]
status = "✅" if len(invalid_sepsis) == 0 else "❌"
print(f"{status} All sepsis events have presumed infection: {len(invalid_sepsis) == 0}")

# Check 3: Episode IDs are sequential within hospitalization
episodes = ase_results[ase_results['episode_id'].notna()]
if len(episodes) > 0:
    valid_episodes = True
    # One pass per hospitalization via groupby instead of re-filtering the
    # whole frame for every hospitalization ID
    for hosp_id, hosp_episodes in episodes.groupby('hospitalization_id')['episode_id']:
        expected = list(range(1, len(hosp_episodes) + 1))
        if sorted(hosp_episodes) != expected:
            valid_episodes = False
            break
    status = "✅" if valid_episodes else "❌"
    print(f"{status} Episode IDs are sequential: {valid_episodes}")
else:
    print("ℹ️ No episodes to validate")

# Check 4: Type is either community, hospital, or missing.
# Missing values are dropped before comparing: NaN does not compare equal to
# itself, so relying on NaN membership in a set can wrongly report it as an
# invalid type.
valid_types = {'community', 'hospital'}
observed_types = set(ase_results['type'].dropna().unique())
invalid_types = observed_types - valid_types
status = "✅" if len(invalid_types) == 0 else "❌"
print(f"{status} All onset types are valid: {len(invalid_types) == 0}")
if invalid_types:
    print(f"   Invalid types: {invalid_types}")

print("\n" + "="*60)
print("🎉 ASE testing complete!")

## 7. Export Results (Optional)

Save the results for further analysis.

In [None]:
# Uncomment to save results
# output_file = 'ase_results.csv'
# ase_results.to_csv(output_file, index=False)
# print(f"Results saved to {output_file}")