# Epidemiology of CRRT

Author: Kaveri Chhikara

This script identifies the cohort using CLIF 2.1 tables
**Requirements**

* Required table filenames should be `clif_patient`, `clif_hospitalization`, `clif_adt`, `clif_vitals`, `clif_labs`, `clif_medication_admin_continuous`, `clif_respiratory_support`, `clif_crrt_therapy`, `clif_hospital_diagnosis`
* Within each table, the following variables and categories are required.

| Table Name | Required Variables | Required Categories |
| --- | --- | --- |
| **clif_patient** | `patient_id`, `race_category`, `ethnicity_category`, `sex_category`, `death_dttm` | - |
| **clif_hospitalization** | `patient_id`, `hospitalization_id`, `admission_dttm`, `discharge_dttm`, `age_at_admission`, `admission_type_category`, `discharge_category` | - |
| **clif_adt** |  `hospitalization_id`, `hospital_id`,`in_dttm`, `out_dttm`, `location_category`, `location_type` | - |
| **clif_vitals** | `hospitalization_id`, `recorded_dttm`, `vital_category`, `vital_value` | heart_rate, respiratory_rate, sbp, dbp, map, spo2, weight_kg, height_cm |
| **clif_labs** | `hospitalization_id`, `lab_result_dttm`, `lab_category`, `lab_value` | sodium, potassium, chloride, bicarbonate, bun, creatinine, glucose_serum, calcium_total, lactate, magnesium, ph_arterial, ph_venous, po2_arterial |
| **clif_medication_admin_continuous** | `hospitalization_id`, `admin_dttm`, `med_name`, `med_category`, `med_dose`, `med_dose_unit` | norepinephrine, epinephrine, phenylephrine, vasopressin, dopamine, angiotensin, dobutamine, milrinone, isoproterenol |
| **clif_respiratory_support** | `hospitalization_id`, `recorded_dttm`, `device_category`, `mode_category`, `tracheostomy`, `fio2_set`, `lpm_set`, `resp_rate_set`, `peep_set`, `resp_rate_obs`, `tidal_volume_set`, `pressure_control_set`, `pressure_support_set`, `peak_inspiratory_pressure_set`, `tidal_volume_obs` | - |
| **clif_crrt_therapy** | `hospitalization_id`, `recorded_dttm`, `crrt_mode_name`, `crrt_mode_category`, `device_id`, `blood_flow_rate`, `dialysate_flow_rate`, `pre_filter_replacement_fluid_rate`, `post_filter_replacement_fluid_rate`, `ultrafiltration_out` | - |
| **clif_hospital_diagnosis** | `hospitalization_id`, `diagnosis_code`, `present_on_admission` | - |



# Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
from pathlib import Path
import json
import pyarrow
import warnings
import clifpy
from typing import Union
from tqdm import tqdm

import sys
import clifpy
import os

# Report which interpreter and which clifpy installation this notebook is using.
print("=== Environment Verification ===")
for label, value in (
    ("Python executable", sys.executable),
    ("Python version", sys.version),
    ("clifpy version", clifpy.__version__),
    ("clifpy location", clifpy.__file__),
):
    print(f"{label}: {value}")

# Warn if a local development checkout of CLIFpy is still on the import path.
print("\n=== Python Path Check ===")
local_clifpy_path = "/Users/kavenchhikara/Desktop/CLIF/CLIFpy"
shadowing_entries = [entry for entry in sys.path if local_clifpy_path in entry]
if shadowing_entries:
    print("⚠️  WARNING: Local CLIFpy still in path!")
    for entry in shadowing_entries:
        print(f"   Found: {entry}")
else:
    print("✅ Clean environment - no local CLIFpy in path")

# Relative paths used later in the notebook resolve against this directory.
print("\n=== Working Directory ===")
print("Current directory:", os.getcwd())

In [None]:
# Load the site configuration. The keys read below are 'tables_path',
# 'file_type' and 'timezone'.
# NOTE: the path is relative, so it resolves against the current working directory.
config_path = "../config/config.json"
# Explicit encoding so the file is decoded the same way on every platform
# (the default text encoding is platform dependent).
with open(config_path, 'r', encoding='utf-8') as f:
    config = json.load(f)

## import outlier json
# with open('../config/outlier_config.json', 'r', encoding='utf-8') as f:
#     outlier_cfg = json.load(f)

# The heading previously carried a mis-encoded (unprintable) character; plain
# text is used so the output renders on every terminal.
print("\nConfiguration:")
print(f"   Data directory: {config['tables_path']}")
print(f"   File type: {config['file_type']}")
print(f"   Timezone: {config['timezone']}")

In [None]:
import os

# Make sure every output location written to later in the notebook exists.
for output_dir in ("../output/final/graphs", "../output/intermediate"):
    os.makedirs(output_dir, exist_ok=True)

# Required columns and categories

In [None]:
print(f"\n{'=' * 80}")
print("Defining Required Data Elements")
print('=' * 80)

# The patient, hospitalization and ADT tables are used in full, so no column
# lists are declared for them.

# --- Vitals ------------------------------------------------------------------
vitals_required_columns = ['hospitalization_id', 'recorded_dttm', 'vital_category', 'vital_value']
vitals_of_interest = [
    'heart_rate', 'respiratory_rate',
    'sbp', 'dbp', 'map',
    'spo2',
    'weight_kg', 'height_cm',
]

# --- Labs --------------------------------------------------------------------
labs_required_columns = [
    'hospitalization_id', 'lab_result_dttm', 'lab_category', 'lab_value', 'lab_value_numeric',
]
labs_of_interest = [
    'po2_arterial', 'pco2_arterial', 'ph_arterial', 'ph_venous', 'bicarbonate', 'so2_arterial',
    'sodium', 'potassium', 'chloride', 'calcium_total', 'magnesium', 'creatinine',
    'bun', 'glucose_serum', 'lactate', 'hemoglobin',
]

# --- Continuously administered medications -----------------------------------
meds_required_columns = [
    'hospitalization_id', 'admin_dttm', 'med_name', 'med_category', 'med_dose', 'med_dose_unit',
]
meds_of_interest = [
    'norepinephrine', 'epinephrine', 'phenylephrine', 'vasopressin',
    'dopamine', 'angiotensin', 'dobutamine', 'milrinone', 'isoproterenol',
    'propofol', 'midazolam', 'lorazepam', 'dexmedetomidine',
    'vecuronium', 'rocuronium', 'cisatracurium', 'pancuronium',
]

# --- Respiratory support -----------------------------------------------------
rst_required_columns = [
    'hospitalization_id', 'recorded_dttm',
    'device_name', 'device_category',
    'mode_name', 'mode_category',
    'tracheostomy',
    'fio2_set', 'lpm_set', 'resp_rate_set', 'peep_set', 'resp_rate_obs',
    'tidal_volume_set', 'pressure_control_set', 'pressure_support_set',
    'peak_inspiratory_pressure_set', 'peak_inspiratory_pressure_obs',
    'plateau_pressure_obs', 'minute_vent_obs',
]

# --- CRRT therapy ------------------------------------------------------------
crrt_required_columns = [
    'hospitalization_id', 'recorded_dttm', 'crrt_mode_category',
    'blood_flow_rate',
    'pre_filter_replacement_fluid_rate', 'post_filter_replacement_fluid_rate',
    'dialysate_flow_rate', 'ultrafiltration_out',
]

# Cohort Identification


**Inclusion**
1. Adults
2. Admitted between January 1, 2018 and December 31, 2024
3. Receiving CRRT - must have DFR or UF documented at any point in the hospitalization
4. Data completeness - must have weight and CRRT settings documented

**Exclusion**
1. Prior to admission ICD codes for ESRD

In [None]:
# Running tally of cohort sizes recorded after each inclusion/exclusion step.
# Keys are prefixed with the step number; the dict is written to CSV and passed
# to the CONSORT diagram helper near the end of the notebook.
strobe_counts = {}

## Step0: Load Core Tables

In [None]:
print(f"\n{'=' * 80}")
print("Loading CLIF Tables")
print('=' * 80)

from clifpy.clif_orchestrator import ClifOrchestrator

# A single orchestrator instance is used for every table load below; its
# settings come straight from the site configuration.
orchestrator_settings = {
    'data_directory': config['tables_path'],
    'filetype': config['file_type'],
    'timezone': config['timezone'],
}
clif = ClifOrchestrator(**orchestrator_settings)


In [None]:
# ----------------------------------------------------------------------------
# STEP 0: load the three core tables (patient, hospitalization, ADT)
# ----------------------------------------------------------------------------
print(f"\n{'=' * 80}")
print("Step 0: Load Core Tables (Patient, Hospitalization, ADT)")
print('=' * 80)
core_tables = ['patient', 'hospitalization', 'adt']

print(f"\nLoading {len(core_tables)} core tables...")
for table_name in core_tables:
    print(f"   Loading {table_name}...", end=" ")
    try:
        clif.load_table(table_name)
        # Each loaded table is exposed as an attribute named after the table.
        table = getattr(clif, table_name)
        row_count = len(table.df)
        print(f"✓ ({row_count:,} rows)")
    except Exception as e:
        # Report which table failed, then abort: every later step needs all three.
        print(f"✗ Error: {e}")
        raise

print("\nCore tables loaded successfully!")

In [None]:
hosp_df = clif.hospitalization.df
adt_df = clif.adt.df

# Attach admission-level fields (including age) to every ADT location row.
# The inner join keeps only hospitalizations present in both tables.
hosp_columns = [
    "patient_id", "hospitalization_id", "admission_dttm", "discharge_dttm",
    "age_at_admission", "discharge_category",
]
adt_columns = [
    "hospitalization_id", "hospital_id", "in_dttm", "out_dttm",
    "location_category", "location_type",
]
all_encounters = hosp_df[hosp_columns].merge(
    adt_df[adt_columns],
    how='inner',
    on='hospitalization_id',
)

In [None]:
# A location segment should be unique per (hospitalization_id, in_dttm, out_dttm);
# count the rows that repeat an earlier segment and report the result.
dup_counts = all_encounters.duplicated(subset=['hospitalization_id', 'in_dttm', 'out_dttm']).sum()
if not dup_counts > 0:
    print("No duplicate (hospitalization_id, in_dttm, out_dttm) entries found in all_encounters.")
else:
    print(f"Warning: {dup_counts} duplicate (hospitalization_id, in_dttm, out_dttm) entries found in all_encounters.")

## Step1: Date & Age filter

In [None]:
# ----------------------------------------------------------------------------
# STEP 1: keep adults (age >= 18) admitted in calendar years 2018-2024
# ----------------------------------------------------------------------------
print(f"\n{'=' * 80}")
print("Step 1: Identifying Adult Patients (Age >= 18) and Admissions 2018-2024")
print('=' * 80)

print("Applying initial cohort filters...")

# Work on a copy restricted to the columns needed downstream.
encounter_columns = [
    'patient_id', 'hospitalization_id', 'admission_dttm', 'discharge_dttm',
    'age_at_admission', 'discharge_category', 'hospital_id',
    'in_dttm', 'out_dttm', 'location_category', 'location_type',
]
adult_encounters = all_encounters[encounter_columns].copy()

# Age filter: a missing age never qualifies.
age = adult_encounters['age_at_admission']
adult_encounters = adult_encounters[age.notna() & (age >= 18)]

# Admission-year filter, inclusive on both ends.
admission_year = adult_encounters['admission_dttm'].dt.year
adult_encounters = adult_encounters[(admission_year >= 2018) & (admission_year <= 2024)]

n_total_hosp = len(all_encounters['hospitalization_id'].unique())
n_adult_hosp = len(adult_encounters['hospitalization_id'].unique())

print("\nFiltering Results:")
print(f"   Total hospitalizations: {n_total_hosp:,}")
print(f"   Adult hospitalizations (age >= 18, 2018-2024): {n_adult_hosp:,}")
print(f"   Excluded (age < 18 or outside 2018-2024): {n_total_hosp - n_adult_hosp:,}")


strobe_counts["0_total_hospitalizations"] = n_total_hosp
strobe_counts["1_adult_hospitalizations"] = n_adult_hosp
# IDs of the qualifying hospitalizations, used to filter every later table load.
adult_hosp_ids = set(adult_encounters['hospitalization_id'].unique())
print(f"\n   Unique adult hospitalization IDs: {len(adult_hosp_ids):,}")

## Step1B: Stitch hospitalizations

In [None]:
from clifpy.utils.stitching_encounters import stitch_encounters

# Restrict both inputs to the adult cohort and stitch them; the filtered frames
# are passed straight into the call so no extra references outlive it.
hosp_stitched, adt_stitched, encounter_mapping = stitch_encounters(
    hospitalization=clif.hospitalization.df.loc[
        lambda frame: frame['hospitalization_id'].isin(adult_hosp_ids)
    ],
    adt=clif.adt.df.loc[
        lambda frame: frame['hospitalization_id'].isin(adult_hosp_ids)
    ],
    time_interval=6,
)

# The orchestrator now holds the stitched tables plus the
# hospitalization_id -> encounter_block mapping for later merges.
clif.hospitalization.df = hosp_stitched
clif.adt.df = adt_stitched
clif.encounter_mapping = encounter_mapping

# Reclaim the memory held by the temporary filtered frames.
gc.collect()

In [None]:
# Record how many hospitalizations were collapsed by encounter stitching.
n_before_stitching = len(adult_hosp_ids)                          # adult hospitalizations going in
n_after_stitching = len(hosp_stitched['encounter_block'].unique())  # encounter blocks coming out

strobe_counts['1b_before_stitching'] = n_before_stitching
strobe_counts['1b_after_stitching'] = n_after_stitching
# The difference is the number of hospitalizations absorbed into another block.
strobe_counts['1b_stitched_hosp_ids'] = n_before_stitching - n_after_stitching

print("\nEncounter Stitching Results:")
print(f"   Number of unique hospitalizations before stitching: {n_before_stitching:,}")
print(f"   Number of unique encounter blocks after stitching: {n_after_stitching:,}")
print(f"   Number of linked hospitalization ids: {strobe_counts['1b_stitched_hosp_ids']:,}")

# Optional detail: how many hospitalizations ended up in each block.
print("\nEncounter Mapping Details:")
print(f"   Total encounter mappings created: {len(encounter_mapping):,}")
if len(encounter_mapping) > 0:
    mapping_counts = encounter_mapping.groupby('encounter_block').size()
    n_multi_hosp_blocks = (mapping_counts > 1).sum()
    print(f"   Encounter blocks with multiple hospitalizations: {n_multi_hosp_blocks:,}")
    print(f"   Maximum hospitalizations combined into one block: {mapping_counts.max()}")

In [None]:
# Seed the cohort from the stitching output (hospitalization_id paired with its
# encounter_block). Each later step filters this frame; the copy keeps
# encounter_mapping itself untouched.
cohort_df = encounter_mapping.copy()

## Step2: Identify CRRT Encounters

In [None]:
# Load CRRT therapy rows for the adult cohort only.
print("\nLoading crrt_therapy table...")
try:
    clif.load_table(
        'crrt_therapy',
        filters={'hospitalization_id': list(adult_hosp_ids)}
    )
except Exception as e:
    # The CRRT table is mandatory for this cohort. Swallowing the error here only
    # postpones the failure to the next cell, where it shows up as a less
    # informative attribute error. Report and re-raise, as the core-table loader does.
    print(f"   CRRT therapy not available or error: {e}")
    raise

print(f"   CRRT therapy loaded: {len(clif.crrt_therapy.df):,} rows")
print(f"   Unique CRRT therapy hospitalizations: {clif.crrt_therapy.df['hospitalization_id'].nunique()}")

In [None]:
# Tag every CRRT row with its encounter block. The left join keeps CRRT rows
# even if their hospitalization has no mapping entry.
block_lookup = clif.encounter_mapping[['hospitalization_id', 'encounter_block']]
clif.crrt_therapy.df = clif.crrt_therapy.df.merge(
    block_lookup,
    how='left',
    on='hospitalization_id',
)

crrt_hosp_ids = set(clif.crrt_therapy.df['hospitalization_id'].unique())
n_crrt_hosp = clif.crrt_therapy.df['hospitalization_id'].nunique()
n_crrt_blocks = clif.crrt_therapy.df['encounter_block'].nunique()
n_rows_with_block = clif.crrt_therapy.df['encounter_block'].notna().sum()

print("Updated CRRT therapy DataFrame:")
print(f"   Total CRRT records: {len(clif.crrt_therapy.df):,}")
print(f"   Records with encounter blocks: {n_rows_with_block:,}")
print(f"   Unique encounter blocks in CRRT data: {n_crrt_blocks}")
print(f"   Unique hospitalizations  in CRRT data: {n_crrt_hosp}")

strobe_counts["2_crrt_hospitalizations"] = n_crrt_hosp
strobe_counts["2_crrt_blocks"] = n_crrt_blocks

# Narrow the cohort to hospitalizations that have at least one CRRT row.
has_crrt = cohort_df['hospitalization_id'].isin(crrt_hosp_ids)
cohort_df = cohort_df[has_crrt].copy()

## Step3: Exclude ESRD encounters

Prior to admission ICD codes for ESRD

In [None]:
# Load diagnosis rows for the hospitalizations that received CRRT.
print("\nLoading Hospital dx table...")
try:
    clif.load_table(
        'hospital_diagnosis',
        filters={'hospitalization_id': list(crrt_hosp_ids)}
    )
except Exception as e:
    # The ESRD exclusion cannot run without diagnoses, and the next cell reads
    # clif.hospital_diagnosis.df unconditionally, so stop here with the real
    # cause instead of continuing with a missing table.
    print(f"   Hospital dx not available or error: {e}")
    raise

print(f"   Hospital dx loaded: {len(clif.hospital_diagnosis.df):,} rows")
print(f"   Unique Hospital dx hospitalizations: {clif.hospital_diagnosis.df['hospitalization_id'].nunique()}")

# Kept outside the try block so a failure in the merge is not misreported as a
# table-loading problem.
print("Merge encounter blocks with diagnosis")
clif.hospital_diagnosis.df = clif.hospital_diagnosis.df.merge(
    clif.encounter_mapping[['hospitalization_id', 'encounter_block']],
    on='hospitalization_id',
    how='left'
)

n_dx_hosp = clif.hospital_diagnosis.df['hospitalization_id'].nunique()
n_dx_blocks = clif.hospital_diagnosis.df['encounter_block'].nunique()
cohort_hosp_ids = set(clif.hospital_diagnosis.df['hospitalization_id'].unique())
cohort_blocks = set(clif.hospital_diagnosis.df['encounter_block'].unique())
print(f"   Total Hospital dx records: {len(clif.hospital_diagnosis.df):,}")
print(f"   Records with encounter blocks: {clif.hospital_diagnosis.df['encounter_block'].notna().sum():,}")
print(f"   Unique encounter blocks in Hospital dx data: {n_dx_blocks}")
print(f"   Unique hospitalizations  in Hospital dx data: {n_dx_hosp}")

In [None]:
hospital_diagnosis_df = clif.hospital_diagnosis.df.copy()

print("Hospital dx column names :", hospital_diagnosis_df.columns)
# Normalise diagnosis codes to the dot-free, lower-case form used by the code
# lists below. regex=False makes the '.' a literal dot on every pandas version
# (the default for `regex` changed between pandas 1.x and 2.x); strip() removes
# padding that would otherwise prevent an exact match.
hospital_diagnosis_df['diagnosis_code'] = (
    hospital_diagnosis_df['diagnosis_code']
    .str.replace('.', '', regex=False)
    .str.strip()
    .str.lower()
)

# Work with a single column name regardless of which one the site provides.
if 'present_on_admission' in hospital_diagnosis_df.columns:
    hospital_diagnosis_df = hospital_diagnosis_df.rename(columns={'present_on_admission': 'poa_present'})

# Standardise the present-on-admission flag to int8.
if 'poa_present' in hospital_diagnosis_df.columns:
    # Only 1 (present on admission) or 0 (not present on admission) survive.
    # Any other value (including Exempt, Unknown, Unspecified, NA) is set to 0.
    # '1.0' / '0.0' are accepted because a numeric flag column that contains
    # missing values is stored as float and stringifies with a trailing '.0';
    # without these entries every such row would silently become 0.
    poa_lookup = {
        'yes': 1, 'y': 1, 'true': 1, '1': 1, '1.0': 1,
        'no': 0, 'n': 0, 'false': 0, '0': 0, '0.0': 0,
    }
    hospital_diagnosis_df['poa_present'] = (
        hospital_diagnosis_df['poa_present']
        .astype(str)
        .str.strip()
        .str.lower()
        .map(poa_lookup)
        .fillna(0)
        .astype('int8')
    )

In [None]:
# ESRD diagnosis codes, written dot-free and lower-case to match the cleaned
# diagnosis_code column. Each code is listed once.
esrd_codes = [
    # ICD-10
    'z992',    # Dependence on renal dialysis
    'z9115',   # Patient's noncompliance with renal dialysis
    'z91158',  # Patient's noncompliance with renal dialysis (6-character subcode)
    'i120',    # Hypertensive chronic kidney disease with stage 5 CKD or ESRD
    'i1311',   # Hypertensive heart and CKD without heart failure, with stage 5 CKD or ESRD
    'i132',    # Hypertensive heart and CKD with heart failure and with stage 5 CKD or ESRD
    'n186',    # End stage renal disease
    # ICD-9
    '5856',    # End stage renal disease
    '40391',   # Hypertensive CKD, unspecified, with CKD stage V or ESRD
    '40311',   # Hypertensive CKD, benign, with CKD stage V or ESRD
    'v4511',   # Renal dialysis status
    'v4512',   # Noncompliance with renal dialysis
]

is_esrd_code = hospital_diagnosis_df['diagnosis_code'].isin(esrd_codes)

# Debug output: how many diagnosis rows carry an ESRD code at all
print("\nNumber of rows matching ESRD codes:", is_esrd_code.sum())


# Distribution of the present-on-admission flag among the ESRD rows
esrd_poa_counts = hospital_diagnosis_df[is_esrd_code]['poa_present'].value_counts(dropna=False)
print("Present_on_admission values for ESRD codes:")
print(esrd_poa_counts)

# An ESRD code counts only when it is flagged present on admission.
# poa_present was standardised to 0/1 when the diagnosis table was cleaned
# (missing/unknown values became 0), so the flag is never NA at this point.
# The former "or NA" test could never match and has been removed; the selected
# rows are unchanged.
esrd_mask = is_esrd_code & (hospital_diagnosis_df['poa_present'] == 1)
hosp_ids_with_esrd = hospital_diagnosis_df[esrd_mask]['hospitalization_id'].unique()
blocks_with_esrd = hospital_diagnosis_df[esrd_mask]['encounter_block'].unique()

print(f"Hospitalizations with ESRD present on admission: {len(hosp_ids_with_esrd)}")


strobe_counts['3_hospitalizations_with_esrd'] = len(hosp_ids_with_esrd)
strobe_counts['3_encounter_blocks_with_esrd'] = len(blocks_with_esrd)


# Remove hospitalizations with ESRD.
# NOTE(review): the exclusion is per hospitalization_id, so a stitched block can
# keep its other hospitalizations; confirm whether exclusion should be per block.
cohort_df = cohort_df[~cohort_df['hospitalization_id'].isin(hosp_ids_with_esrd)].copy()
cohort_hosp_ids = set(cohort_df['hospitalization_id'].unique())
cohort_blocks = set(cohort_df['encounter_block'].unique())
strobe_counts['3_encounter_blocks_without_esrd'] = len(cohort_blocks)  # blocks remaining after exclusion
strobe_counts['3_hospitalizations_without_esrd'] = len(cohort_hosp_ids)  # hospitalizations remaining after exclusion

strobe_counts

## Step4: Data availability, and CRRT Settings

In [None]:
# Load vitals for the current cohort. All vital categories are kept because the
# first/last recorded times and the weight rows are both derived from this table.
# (The progress message previously said "labs" although vitals are loaded.)
print("\nLoading vitals table...")
clif.load_table(
    'vitals',
    columns=vitals_required_columns,
    filters={
        'hospitalization_id': list(cohort_hosp_ids)
    }
)
print(f"   Vitals loaded: {len(clif.vitals.df):,} rows")
print(f"   Unique vitals categories: {clif.vitals.df['vital_category'].nunique()}")
print(f"   Unique vitals hospitalizations: {clif.vitals.df['hospitalization_id'].nunique()}")

# Tag each vitals row with its encounter block for block-level aggregation.
clif.vitals.df = clif.vitals.df.merge(
    clif.encounter_mapping[['hospitalization_id', 'encounter_block']],
    on='hospitalization_id',
    how='left'
)

In [None]:
# First and last recorded vital per encounter block; used later as the
# observation window for length of stay and mortality.
vitals_range = (
    clif.vitals.df
    .groupby('encounter_block')
    .agg(
        first_vital_dttm=('recorded_dttm', 'min'),
        last_vital_dttm=('recorded_dttm', 'max'),
    )
    .reset_index()
)

In [None]:
# Pull out the weight measurements before the vitals table is released.
weight_df = clif.vitals.df.loc[
    lambda frame: frame['vital_category'] == 'weight_kg'
].copy()

# Cohort hospitalizations with no weight row at all.
hosp_with_weight = set(weight_df['hospitalization_id'].unique())
hosp_without_weight = cohort_hosp_ids.difference(hosp_with_weight)
print(f"Number of hospitalizations without recorded weight: {len(hosp_without_weight)}")

# Drop the reference to the full vitals table to free memory.
clif.vitals.df = None

In [None]:
# Keep only hospitalizations that have at least one recorded weight.
lacks_weight = cohort_df['hospitalization_id'].isin(hosp_without_weight)
cohort_df = cohort_df[~lacks_weight].copy()

cohort_hosp_ids = set(cohort_df['hospitalization_id'].unique())
cohort_blocks = set(cohort_df['encounter_block'].unique())

# Both counters describe what REMAINS after the filter, i.e. units with weight.
strobe_counts['4_encounter_blocks_with_weight'] = len(cohort_blocks)
strobe_counts['4_hospitalizations_with_weight'] = len(cohort_hosp_ids)

In [None]:
# Short alias for the CRRT therapy table held by the orchestrator (the frame
# that had encounter_block merged onto it in Step 2).
crrt_df = clif.crrt_therapy.df

In [None]:
# Identify encounter blocks with no CRRT setting documented at all. A block
# qualifies as "documented" if ANY row has a non-null value in ANY of:
# pre_filter_replacement_fluid_rate, post_filter_replacement_fluid_rate,
# dialysate_flow_rate, ultrafiltration_out.

# Restrict the CRRT rows to hospitalizations still in the cohort.
crrt_df = crrt_df[crrt_df['hospitalization_id'].isin(cohort_df['hospitalization_id'])]

crrt_settings_cols = [
    'pre_filter_replacement_fluid_rate',
    'post_filter_replacement_fluid_rate',
    'dialysate_flow_rate',
    'ultrafiltration_out'
]

# Vectorised: flag rows with at least one setting, then reduce per block. This
# avoids a Python-level callback per group and always yields a boolean Series
# indexed by encounter_block, even when crrt_df is empty.
row_has_setting = crrt_df[crrt_settings_cols].notna().any(axis=1)
crrt_settings_present = row_has_setting.groupby(crrt_df['encounter_block']).any()

crrt_blocks_with_settings = set(crrt_settings_present[crrt_settings_present].index)
# dropna() so that rows lacking an encounter_block are not counted as a block.
all_crrt_blocks = set(crrt_df['encounter_block'].dropna().unique())
crrt_blocks_without_settings = all_crrt_blocks - crrt_blocks_with_settings
num_encounters_without_crrt_settings = len(crrt_blocks_without_settings)
print(f"Number of encounter blocks without any recorded CRRT settings: {num_encounters_without_crrt_settings}")

# Keep only encounter blocks with at least one CRRT setting recorded.
cohort_df = cohort_df[cohort_df['encounter_block'].isin(crrt_blocks_with_settings)].copy()
cohort_hosp_ids = set(cohort_df['hospitalization_id'].unique())
cohort_blocks = set(cohort_df['encounter_block'].unique())
strobe_counts['5_encounter_blocks_with_crrt_settings'] = len(cohort_blocks)
strobe_counts['5_hospitalizations_with_crrt_settings'] = len(cohort_hosp_ids)

## Cohort Sanity Checks

## AKI

Majority of the cohort should have an ICD code for AKI

In [None]:
# AKI sanity check: what share of the final cohort carries an AKI diagnosis code?

# AKI codes, dot-free and lower-case to match the cleaned diagnosis_code column
aki_codes = [
    # ICD-10
    'n170', 'n171', 'n172', 'n178', 'n179',  # Acute kidney failure codes
    'r34',   # Anuria and oliguria
    'n990',  # Post-procedural kidney failure
    't795',  # Traumatic anuria
    # ICD-9
    '5845',  # Acute kidney failure with lesion of tubular necrosis
    '5849',  # Acute kidney failure, unspecified
    '5848',  # Acute kidney failure with other specified pathological lesion in kidney
]

# Diagnosis rows belonging to encounter blocks still in the cohort
non_esrd_encounters = hospital_diagnosis_df[hospital_diagnosis_df['encounter_block'].isin(cohort_df['encounter_block'])]

# Rows whose code is an AKI code
aki_mask = non_esrd_encounters['diagnosis_code'].isin(aki_codes)

# Encounter blocks with at least one AKI diagnosis
blocks_with_aki = non_esrd_encounters[aki_mask]['encounter_block'].unique()
total_non_esrd_blocks = cohort_df['encounter_block'].nunique()
strobe_counts['6_encounter_blocks_with_AKI_no_esrd'] = len(blocks_with_aki)

# Percentage of cohort blocks with AKI. An empty cohort yields NaN rather than
# aborting the notebook with a ZeroDivisionError.
if total_non_esrd_blocks > 0:
    aki_percentage = (len(blocks_with_aki) / total_non_esrd_blocks) * 100
else:
    aki_percentage = float('nan')

print(f"\nPercentage of non-ESRD encounter blocks with AKI codes: {aki_percentage:.1f}%")
print(f"({len(blocks_with_aki)} out of {total_non_esrd_blocks} blocks)")
strobe_counts['6_Percentage_non_ESRD_encounter_blocks_with_AKI_codes'] = aki_percentage
# Frequency of each AKI code among the distinct (hospitalization, code, POA) rows
aki_diagnoses = non_esrd_encounters[aki_mask][['hospitalization_id', 'diagnosis_code','poa_present']].drop_duplicates()
print("\nSample of AKI-related diagnoses found: ")
aki_diagnoses['diagnosis_code'].value_counts()

## ICU

Cohort should ideally be an ICU hospitalization

In [None]:
# ADT segments for the final cohort, ordered chronologically within each block
# and with the identifying columns moved to the front.
in_final_cohort = adt_stitched['hospitalization_id'].isin(cohort_df['hospitalization_id'])
adt_final_stitched = adt_stitched[in_final_cohort].copy()
adt_final_stitched = adt_final_stitched.sort_values(by=['encounter_block', 'in_dttm'])
desired_order = ['hospitalization_id', 'encounter_block', 'hospital_id', 'in_dttm', 'out_dttm']
remaining_cols = [name for name in adt_final_stitched.columns if name not in desired_order]
adt_final_stitched = adt_final_stitched[[*desired_order, *remaining_cols]]

print("\n=== Validating ICU Administration ===")

adt_final_stitched['is_icu'] = adt_final_stitched['location_category'] == 'icu'

# For each encounter block: was there at least one ICU segment?
hosp_icu_status = adt_final_stitched.groupby('encounter_block')['is_icu'].any()
non_icu_hosps = hosp_icu_status[~hosp_icu_status].index.tolist()
n_non_icu = len(non_icu_hosps)
strobe_counts["6_number_hosp_without_ICU_stay"] = n_non_icu
print(f"\nNumber of CRRT hospitalizations without any ICU stay: {n_non_icu}")
if n_non_icu == 0:
    print("All CRRT hospitalizations had at least one ICU stay")
else:
    print("WARNING: Found CRRT hospitalizations without ICU stays")
    print("Number of hospitalization IDs without ICU stays:", n_non_icu, "check crrt_non_icu_df df")

# CRRT rows of the blocks that never reached an ICU, kept for manual review.
crrt_non_icu_df = crrt_df[crrt_df['encounter_block'].isin(non_icu_hosps)]
crrt_non_icu_df = crrt_non_icu_df.sort_values(by=['hospitalization_id', 'encounter_block', 'recorded_dttm'])
desired_order = ['hospitalization_id', 'encounter_block', 'recorded_dttm', 'crrt_mode_category']
remaining_cols = [name for name in crrt_non_icu_df.columns if name not in desired_order]
crrt_non_icu_df = crrt_non_icu_df[[*desired_order, *remaining_cols]]

# Export the ADT history of those blocks for inspection.
adt_df_non_icu_hosps = adt_stitched[adt_stitched['encounter_block'].isin(non_icu_hosps)]
adt_df_non_icu_hosps.to_csv('../output/intermediate/adt_df_non_icu_hosps.csv', index=False)


# Strobe

In [None]:
import pandas as pd

# Show the step-by-step cohort counts in the notebook output.
display(strobe_counts)

# Persist the counts as a two-column table in ../output/final.
strobe_rows = [(counter, value) for counter, value in strobe_counts.items()]
strobe_counts_df = pd.DataFrame(strobe_rows, columns=['counter', 'value'])
strobe_counts_df.to_csv('../output/final/strobe_counts.csv', index=False)

In [None]:
import importlib
import utils
# Reload the local utils module so edits made to it are picked up without
# restarting the kernel. The from-import must come AFTER the reload so that the
# name is bound to the refreshed function.
importlib.reload(utils)
from utils import create_consort_diagram

# Generate CONSORT diagram - percentages calculated automatically
create_consort_diagram(strobe_counts=strobe_counts)

# Outcomes

In [None]:
# ============================================================================
# OUTCOMES CALCULATION
# ============================================================================
# Every outcome below is computed once per encounter_block and only then joined
# back onto cohort_df, which holds one row per hospitalization_id. Keeping the
# block-level tables unique on encounter_block is what prevents the final
# merges from multiplying rows.

# 1. ICU LENGTH OF STAY
print("\n1. Processing ICU segments...")
icu_segs = adt_final_stitched.copy()
# Keep ICU segments with a valid, strictly positive duration
icu_segs = icu_segs[
    (icu_segs['location_category'] == 'icu') &
    (icu_segs['in_dttm'].notna()) &
    (icu_segs['out_dttm'].notna()) &
    (icu_segs['out_dttm'] > icu_segs['in_dttm'])
]

print(f"   ICU segments identified: {len(icu_segs):,}")

# ICU LOS = sum of all ICU segment durations within the encounter block
icu_los = icu_segs[icu_segs['encounter_block'].isin(cohort_df['encounter_block'])].copy()
icu_los['seg_days'] = (icu_los['out_dttm'] - icu_los['in_dttm']).dt.total_seconds() / (24 * 3600)

icu_los_summary = icu_los.groupby('encounter_block').agg({
    'seg_days': 'sum'
}).reset_index()
icu_los_summary.rename(columns={'seg_days': 'icu_los_days'}, inplace=True)

print(f"   Median ICU LOS: {icu_los_summary['icu_los_days'].median():.2f} days")

# ============================================================================
# 2. HOSPITAL LENGTH OF STAY (difference between first and last vital)
# ============================================================================
print("\n3. Calculating Hospital Length of Stay...")
# drop_duplicates(): cohort_df repeats an encounter_block once per stitched
# hospitalization. Without it hosp_los has k rows for a k-hospitalization block
# and the final merge on encounter_block yields k*k rows for that block.
hosp_los = cohort_df[['encounter_block']].drop_duplicates().merge(
    vitals_range,
    on='encounter_block',
    how='left'
)

# Hospital LOS = last_vital_dttm - first_vital_dttm
hosp_los['hosp_los_days'] = (
    hosp_los['last_vital_dttm'] - hosp_los['first_vital_dttm']
).dt.total_seconds() / (24 * 3600)

# Ensure non-negative values (missing values stay missing)
hosp_los['hosp_los_days'] = hosp_los['hosp_los_days'].clip(lower=0)

print(f"   Median Hospital LOS: {hosp_los['hosp_los_days'].median():.2f} days")

# ============================================================================
# 4. DEATH STATUS AND FINAL OUTCOME DATETIME
# ============================================================================
print("\n4. Determining death status and final outcome datetime...")

# Demographics and death time come from the patient table; discharge category
# and admission fields come from the hospitalization table.
patient_df = clif.patient.df[['patient_id', 'death_dttm', 'race_category', 'sex_category', 'ethnicity_category']]

# admission_dttm is carried under a private name purely to order the
# hospitalizations of a block chronologically before aggregating.
hosp_fields = hosp_df[
    ['hospitalization_id', 'patient_id', 'admission_dttm', 'discharge_category',
     'age_at_admission', 'admission_type_category']
].rename(columns={'admission_dttm': '_admission_sort_dttm'})

death_info = cohort_df.merge(
    hosp_fields,
    on='hospitalization_id',
    how='left'
).merge(
    patient_df,
    on='patient_id',
    how='left'
).merge(
    vitals_range,
    on='encounter_block',
    how='left'
)

# Order rows chronologically within each block so that 'first' / 'last' below
# refer to the earliest / latest hospitalization. Sorting on encounter_block
# alone leaves the within-block order arbitrary, so e.g. the discharge_category
# picked by 'last' was not guaranteed to be the final discharge.
death_info = death_info.sort_values(['encounter_block', '_admission_sort_dttm'])

# Helper/identifier columns that must not survive the collapse to block level
death_info = death_info.drop(columns=['hospitalization_id', '_admission_sort_dttm'])

# Collapse to one row per encounter_block
agg_dict = {
    'admission_type_category': 'last',
    'discharge_category': 'last',
    'race_category': 'last',
    'sex_category': 'last',
    'ethnicity_category': 'last',
    'death_dttm': 'last',
    'first_vital_dttm': 'min',
    'last_vital_dttm': 'max'
}

# Any remaining column keeps its first value within the block
for col in death_info.columns:
    if col not in agg_dict and col not in ['encounter_block']:
        agg_dict[col] = 'first'

death_info = death_info.groupby('encounter_block', as_index=False).agg(agg_dict)

# Standardize discharge category
death_info['discharge_category'] = death_info['discharge_category'].str.lower()

# Step 1: Determine if patient died (based on discharge_category)
death_info['died'] = death_info['discharge_category'].isin(['expired', 'hospice']).astype(int)

# Step 2: Determine final_outcome_dttm
# If died: use death_dttm if available, otherwise use last_vital_dttm
# If not died: NaT
death_info['final_outcome_dttm'] = (
    death_info['death_dttm']
    .fillna(death_info['last_vital_dttm'])  # Fallback to last_vital
    .where(death_info['died'] == 1, pd.NaT)  # Only keep for died==1, else NaT
)

print(f"   Patients identified as died (expired/hospice): {death_info['died'].sum():,}")

num_with_death_dttm = ((death_info['died'] == 1) & (death_info['death_dttm'].notna())).sum()
num_using_last_vital = ((death_info['died'] == 1) & (death_info['death_dttm'].isna())).sum()

print(f"   - With death_dttm: {num_with_death_dttm:,}")
print(f"   - Using last_vital_dttm: {num_using_last_vital:,}")

# ============================================================================
# 5. MORTALITY CALCULATIONS
# ============================================================================
print("\n5. Calculating mortality outcomes...")

# In-hospital death: died AND final_outcome_dttm is between first and last vital
death_info['in_hosp_death'] = (
    (death_info['died'] == 1) &
    (death_info['final_outcome_dttm'].notna()) &
    (death_info['final_outcome_dttm'] >= death_info['first_vital_dttm']) &
    (death_info['final_outcome_dttm'] <= death_info['last_vital_dttm'])
).astype(int)

# 30-day mortality: died AND final_outcome_dttm within 30 days of first vital
death_info['death_30d'] = (
    (death_info['died'] == 1) &
    (death_info['final_outcome_dttm'].notna()) &
    (death_info['final_outcome_dttm'] <= (death_info['first_vital_dttm'] + pd.Timedelta(days=30)))
).astype(int)


print(f"   In-hospital deaths: {death_info['in_hosp_death'].sum():,} ({death_info['in_hosp_death'].mean()*100:.1f}%)")
print(f"   30-day deaths: {death_info['death_30d'].sum():,} ({death_info['death_30d'].mean()*100:.1f}%)")

# ============================================================================
# 6. COMBINE ALL OUTCOMES
# ============================================================================
print("\n6. Combining all outcomes...")
# Each right-hand table is unique on encounter_block, so the result has exactly
# one row per row of cohort_df.
outcomes_df = cohort_df[['hospitalization_id', 'encounter_block']].merge(
    icu_los_summary, on='encounter_block', how='left'
).merge(
    hosp_los[['encounter_block', 'hosp_los_days']], on='encounter_block', how='left'
).merge(
    death_info, on='encounter_block', how='left'
)

print(f"\nFinal outcomes dataset:")
print(f"   Total records: {len(outcomes_df):,}")
print(f"   Records with ICU LOS: {outcomes_df['icu_los_days'].notna().sum():,}")
print(f"   Records with Hospital LOS: {outcomes_df['hosp_los_days'].notna().sum():,}")
print(f"   In-hospital mortality rate: {outcomes_df['in_hosp_death'].mean()*100:.1f}%")
print(f"   30-day mortality rate: {outcomes_df['death_30d'].mean()*100:.1f}%")

# Display summary statistics
print("\n" + "="*60)
print("OUTCOMES SUMMARY STATISTICS")
print("="*60)
print(f"ICU LOS (days):")
print(f"  Median [IQR]: {outcomes_df['icu_los_days'].median():.1f} [{outcomes_df['icu_los_days'].quantile(0.25):.1f}-{outcomes_df['icu_los_days'].quantile(0.75):.1f}]")
print(f"\nHospital LOS (days):")
print(f"  Median [IQR]: {outcomes_df['hosp_los_days'].median():.1f} [{outcomes_df['hosp_los_days'].quantile(0.25):.1f}-{outcomes_df['hosp_los_days'].quantile(0.75):.1f}]")
print(f"\nMortality:")
print(f"  In-hospital: {outcomes_df['in_hosp_death'].sum():,}/{len(outcomes_df):,} ({outcomes_df['in_hosp_death'].mean()*100:.1f}%)")
print(f"  30-day: {outcomes_df['death_30d'].sum():,}/{len(outcomes_df):,} ({outcomes_df['death_30d'].mean()*100:.1f}%)")
print("="*60)

# Convert specified columns to lowercase (if they exist)
category_cols = [
    'admission_type_category', 'discharge_category',
    'race_category', 'sex_category', 'ethnicity_category'
]
for col in category_cols:
    if col in outcomes_df.columns:
        outcomes_df[col] = outcomes_df[col].str.lower()

# Arrange columns: patient_id, hospitalization_id, encounter_block, then everything else
front_cols = [col for col in ['patient_id', 'hospitalization_id', 'encounter_block'] if col in outcomes_df.columns]
other_cols = [col for col in outcomes_df.columns if col not in front_cols]
outcomes_df = outcomes_df[front_cols + other_cols]

# Save Intermediate data

In [None]:
# Persist the cohort definition and the block-level outcomes.
for frame, destination in (
    (cohort_df, "../output/intermediate/cohort_df.parquet"),
    (outcomes_df, "../output/intermediate/outcomes_df.parquet"),
):
    frame.to_parquet(destination, index=False)

# Weight rows are saved only for hospitalizations that made it into the final cohort.
in_final_cohort = weight_df["hospitalization_id"].isin(cohort_df["hospitalization_id"])
weight_df_filtered = weight_df[in_final_cohort]
weight_df_filtered.to_parquet("../output/intermediate/weight_df.parquet", index=False)