## CRRT Cohort Check

Required Checks for hospitalizations since 2021:

1. Definition I  : Hospitalizations that are on a ventilator for the first 24 hours of their first ICU stay.
2. Definition II : Hospitalizations that are on vasoactive medications during the first 24 hours of their first ICU stay.
3. Definition III: Hospitalizations that have stage I AKI defined as 

    3a. 0.3 mg/dl absolute increase in serum creatinine over a 24 hour period since admission   

    3b. 50% increase in serum creatinine over 7 days 

## 00 Load libraries and core CLIF tables


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')
import json

import pyCLIF
import pyCLIF_mimic
import waterfall
# Read the outlier configuration JSON that lives next to the notebook's
# parent directory and parse it into a plain Python object.
outlier_config_path = Path('../config/outlier_config.json')
outlier_cfg = json.loads(outlier_config_path.read_text(encoding='utf-8'))

In [None]:
# Load the three core CLIF tables in one pass.
patient, hospitalization, adt = (
    pyCLIF.load_data(table_name)
    for table_name in ('clif_patient', 'clif_hospitalization', 'clif_adt')
)

# Identifier columns are stored as strings in every table.
for frame, id_column in (
    (hospitalization, 'hospitalization_id'),
    (patient, 'patient_id'),
    (adt, 'hospitalization_id'),
):
    frame[id_column] = frame[id_column].astype(str)

# De-duplicate each table on its expected key:
#   patient         -> patient_id
#   hospitalization -> hospitalization_id
#   adt             -> hospitalization_id + hospital_id + in_dttm
patient = pyCLIF.remove_duplicates(patient, ['patient_id'], 'patient')
hospitalization = pyCLIF.remove_duplicates(
    hospitalization, ['hospitalization_id'], 'hospitalization'
)
adt = pyCLIF.remove_duplicates(
    adt, ['hospitalization_id', 'hospital_id', 'in_dttm'], 'adt'
)

In [None]:
# Bring the datetime columns of the core tables into the site time zone
# configured in pyCLIF.helper.
patient, hospitalization, adt = (
    pyCLIF.convert_datetime_columns_to_site_tz(table, pyCLIF.helper['timezone'])
    for table in (patient, hospitalization, adt)
)

#### Hospitalizations

In [None]:
# Adult hospitalizations (age 18-119 inclusive) admitted in calendar
# years 2021 through 2024 inclusive.
admitted_in_study_years = hospitalization['admission_dttm'].dt.year.between(2021, 2024)
adult_at_admission = hospitalization['age_at_admission'].between(18, 119)
cohort = hospitalization[admitted_in_study_years & adult_at_admission]

In [None]:
# Running tally of cohort sizes at each filtering step (STROBE-style).
strobe_counts = {
    "A_adult_hospitalizations_since_2021": cohort['hospitalization_id'].nunique(dropna=False),
}

#### ADT

In [None]:
# Restrict the ADT table to cohort hospitalizations. An explicit copy is taken
# so the column assignment below writes to an independent frame instead of a
# slice of `adt` (avoids pandas chained-assignment ambiguity).
adt_cohort = adt[adt['hospitalization_id'].isin(cohort['hospitalization_id'])].copy()

# Normalise the location category to lowercase before matching on 'icu'.
adt_cohort['location_category'] = adt_cohort['location_category'].str.lower()

# Keep every ADT row of hospitalizations that had at least one ICU location.
icu_hospitalization_ids = adt_cohort.loc[
    adt_cohort['location_category'] == 'icu', 'hospitalization_id'
].unique()
adt_filtered = adt_cohort[adt_cohort['hospitalization_id'].isin(icu_hospitalization_ids)]

strobe_counts["B_adult_hospitalizations_since_2021_with_icu"] = len(adt_filtered['hospitalization_id'].drop_duplicates())

In [None]:
# Display the STROBE counts collected so far (notebook cell output).
strobe_counts

In [None]:
# Narrow the cohort to hospitalizations that appear in the ICU-filtered ADT rows.
ids_with_icu_stay = adt_filtered['hospitalization_id']
cohort = cohort[cohort['hospitalization_id'].isin(ids_with_icu_stay)]
print("Final list of cohort ids", len(cohort['hospitalization_id'].drop_duplicates()))

# Hourly Scaffold

In [None]:
# Build one row per (hospitalization, calendar date, hour of day) spanning the
# first to the last recorded vital sign of each cohort hospitalization.

# 1) Load vitals for the cohort; they define the start and end of the sequence.
vitals_cohort = pyCLIF.load_data('clif_vitals',
    filters={'hospitalization_id': cohort['hospitalization_id'].unique().tolist()}
)
# Keep the id as str, consistent with the patient / hospitalization / ADT tables.
vitals_cohort['hospitalization_id'] = vitals_cohort['hospitalization_id'].astype(str)
vitals_cohort = pyCLIF.convert_datetime_columns_to_site_tz(vitals_cohort, pyCLIF.helper['timezone'])
vitals_cohort = vitals_cohort.sort_values(['hospitalization_id', 'recorded_dttm'])

# 2) First and last vitals timestamp per hospitalization. Groups whose
#    timestamps are all missing are dropped: date_range cannot start at NaT.
vital_bounds = (
    vitals_cohort
    .groupby('hospitalization_id')['recorded_dttm']
    .agg(first_vital_dttm='min', last_vital_dttm='max')
    .dropna()
)


def _truncate_to_hour(stamps):
    """Return *stamps* with minutes, seconds and sub-seconds removed.

    Uses absolute-time subtraction rather than ``.dt.floor`` so that
    timezone-aware values inside a DST transition cannot raise
    ambiguous/non-existent time errors.
    """
    sub_hour_part = (
        pd.to_timedelta(stamps.dt.minute, unit='m')
        + pd.to_timedelta(stamps.dt.second, unit='s')
        + pd.to_timedelta(stamps.dt.microsecond, unit='us')
        + pd.to_timedelta(stamps.dt.nanosecond, unit='ns')
    )
    return stamps - sub_hour_part


# 3) Align both ends to the top of the hour. Without this the range is
#    anchored at the first vital's minute (e.g. 10:37, 11:37, ...) and stops
#    before the hour bucket of the last vital whenever that vital's minute is
#    earlier than the first vital's minute (e.g. last vital at 12:05).
scaffold_start = _truncate_to_hour(vital_bounds['first_vital_dttm'])
scaffold_end = _truncate_to_hour(vital_bounds['last_vital_dttm'])

# 4) Create the hourly scaffold for each hospitalization.
#    'h' is the non-deprecated spelling of the hourly frequency alias.
hourly_scaffold = pd.DataFrame([
    (hosp_id, time)
    for hosp_id, start, end in zip(vital_bounds.index, scaffold_start, scaffold_end)
    for time in pd.date_range(start=start, end=end, freq='h', tz=pyCLIF.helper['timezone'])
], columns=['hospitalization_id', 'recorded_dttm'])

# 5) Downstream joins key on date + hour, so keep those and drop the timestamp.
hourly_scaffold['recorded_date'] = hourly_scaffold['recorded_dttm'].dt.date
hourly_scaffold['recorded_hour'] = hourly_scaffold['recorded_dttm'].dt.hour
hourly_scaffold = hourly_scaffold.drop(columns=['recorded_dttm'])

# Definition I

Hospitalizations that are on a ventilator for the first 24 hours of their first ICU stay.

Notes: 

- Use the ADT table to identify each hospitalization's first ICU stay (`location_category.lower() == "icu"`). Relevant ADT fields: hospitalization_id, location_category, in_dttm, out_dttm.

- Use the Respiratory Support table to determine ventilator use during the first ICU stay. Use `device_category.lower() == "imv"` to identify records on a ventilator. Other fields used from this table: hospitalization_id, recorded_dttm, device_category, mode_category.
- Identify hospitalizations that were on a ventilator for the first 24 hours of their first ICU stay.


#### First ICU Stay

In [None]:
# Get the first ICU stay for each hospitalization.

# ICU rows only (location_category was lowercased when adt_filtered was built).
icu_stays = adt_filtered[adt_filtered['location_category'] == 'icu'].copy()

# Order chronologically so the earliest ICU entry comes first per hospitalization.
icu_stays = icu_stays.sort_values(['hospitalization_id', 'in_dttm', 'out_dttm'])

# Keep the earliest ICU row as a whole row. GroupBy.first() is deliberately
# avoided: it returns the first NON-NULL value of each column independently,
# so a missing out_dttm on the first stay would silently be replaced by the
# out_dttm of a later stay.
first_icu_stays = (
    icu_stays
    .drop_duplicates(subset='hospitalization_id', keep='first')
    .reset_index(drop=True)
)

#### Respiratory Support

In [None]:
# Load respiratory support for the cohort and normalise its columns.
rst_required_columns = [
    'hospitalization_id',
    'recorded_dttm',
    'device_name',
    'device_category',
    'mode_name',
    'mode_category',
    'tracheostomy',
    'fio2_set',
    'lpm_set',
    'resp_rate_set',
    'peep_set',
    'resp_rate_obs',
    'tidal_volume_set',
    'pressure_control_set',
    'pressure_support_set',
    'peak_inspiratory_pressure_set',
]

# 1) Load respiratory support
resp_support_raw = pyCLIF.load_data(
    'clif_respiratory_support',
    columns=rst_required_columns,
    filters={'hospitalization_id': cohort['hospitalization_id'].unique().tolist()}
)

# 2) Work on a copy so the raw load stays available for inspection.
resp_support = resp_support_raw.copy()

# Keep the id as str like every other table, so that isin()/merge against
# cohort and first_icu_stays compare identical dtypes.
resp_support['hospitalization_id'] = resp_support['hospitalization_id'].astype(str)

for category_column in ('device_category', 'mode_category'):
    resp_support[category_column] = resp_support[category_column].str.lower()

# Unparseable numeric settings become NaN rather than raising.
for numeric_column in ('lpm_set', 'resp_rate_set', 'peep_set', 'resp_rate_obs'):
    resp_support[numeric_column] = pd.to_numeric(resp_support[numeric_column], errors='coerce')

resp_support = resp_support.sort_values(['hospitalization_id', 'recorded_dttm'])
# del resp_support_raw

print("\n=== Apply outlier thresholds ===\n")
resp_support['fio2_set'] = pd.to_numeric(resp_support['fio2_set'], errors='coerce')

# 3) FiO2 unit check: a mean above 1 is taken to mean that (some) values are
#    recorded as percentages, so those values are rescaled to fractions.
fio2_mean = resp_support['fio2_set'].mean(skipna=True)
if pd.notna(fio2_mean) and fio2_mean > 1.0:
    # Only divide values greater than 1 to avoid re-dividing already correct values
    percent_rows = resp_support['fio2_set'] > 1
    resp_support.loc[percent_rows, 'fio2_set'] = resp_support.loc[percent_rows, 'fio2_set'] / 100
    print("Updated fio2_set to be between 0.21 and 1")
else:
    print("FIO2_SET mean=", fio2_mean, "is within the required range")

In [None]:
## Identify encounters on IMV
# Rows whose device category mentions "imv" (case-insensitive; missing -> False).
imv_mask = resp_support['device_category'].str.contains("imv", case=False, na=False)

# One row per hospitalization that has at least one IMV record.
resp_stitched_imv_ids = resp_support.loc[imv_mask, ['hospitalization_id']].drop_duplicates()

strobe_counts["C_adult_hospitalizations_since_2021_with_icu_imv"] = len(
    resp_stitched_imv_ids['hospitalization_id'].drop_duplicates()
)

# Keep ALL respiratory-support rows (IMV or not) of those hospitalizations.
belongs_to_imv_encounter = resp_support["hospitalization_id"].isin(
    resp_stitched_imv_ids["hospitalization_id"]
)
resp_support_filtered = resp_support[belongs_to_imv_encounter].reset_index(drop=True)

# Cohort rows restricted to the hospitalizations retained above.
all_ids = cohort[
    cohort['hospitalization_id'].isin(resp_support_filtered['hospitalization_id'].unique())
]

In [None]:
# Display the STROBE counts, now including the IMV step.
strobe_counts

In [None]:
# Report the number of distinct hospitalizations left in all_ids.
print("Final list of ids adult_hospitalizations_since_2021_with_icu_imv", len(all_ids['hospitalization_id'].drop_duplicates()))

In [None]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="pandas")

# Run the project's respiratory-support "waterfall" processing per
# hospitalization, then express its datetime columns in the site time zone.
# NOTE(review): the exact transformations are defined in waterfall.py.
processed_resp_support = pyCLIF.convert_datetime_columns_to_site_tz(
    waterfall.process_resp_support_waterfall(
        resp_support_filtered,
        id_col="hospitalization_id",
        verbose=True,
    ),
    pyCLIF.helper['timezone'],
)

In [None]:
# Attach the first-ICU-stay boundaries to every processed respiratory record.
first_stay_bounds = first_icu_stays[['hospitalization_id', 'in_dttm', 'out_dttm']]
vent_records = processed_resp_support.merge(
    first_stay_bounds, how='inner', on='hospitalization_id'
)

record_time = vent_records['recorded_dttm']

# on_vent: 1 when the record's device category is IMV, else 0.
vent_records['on_vent'] = vent_records['device_category'].str.lower().eq('imv').astype(int)

# in_icu: 1 when the record falls inside [in_dttm, out_dttm] of the first ICU stay.
vent_records['in_icu'] = record_time.between(
    vent_records['in_dttm'], vent_records['out_dttm']
).astype(int)

# window_end: 24 hours after the first ICU admission.
vent_records['window_end'] = vent_records['in_dttm'] + pd.Timedelta(hours=24)

# in_icu_24h: 1 when the record falls inside [in_dttm, window_end].
vent_records['in_icu_24h'] = record_time.between(
    vent_records['in_dttm'], vent_records['window_end']
).astype(int)

While aggregating flags at the hourly level, I used the last value recorded during each hour, on the assumption that the last value best represents the patient's status going into the next hour.

In [None]:
# Collapse record-level ventilator / ICU flags to one row per hospitalization-hour.

# Explicit copy: new columns are added below, and assigning onto a column
# subset of vent_records would otherwise be a chained assignment on a slice.
cohort_hourly = vent_records[['hospitalization_id', 'recorded_dttm',
                             'on_vent', 'in_icu', 'in_icu_24h']].copy()

# Date and hour-of-day keys used to join onto the hourly scaffold.
cohort_hourly['recorded_date'] = cohort_hourly['recorded_dttm'].dt.date
cohort_hourly['recorded_hour'] = cohort_hourly['recorded_dttm'].dt.hour

# Sort chronologically, then keep the last value observed within each hour.
cohort_hourly_agg = (
    cohort_hourly
    .sort_values(['hospitalization_id', 'recorded_dttm'])
    .groupby(['hospitalization_id', 'recorded_date', 'recorded_hour'])
    .agg({
        'on_vent': 'last',        # Last vent status in hour
        'in_icu': 'last',         # Last ICU status in hour
        'in_icu_24h': 'last'      # Last 24h status in hour
    })
    .reset_index()
)

In [None]:
# Left-join the hourly flags onto the scaffold so every scaffold hour is kept,
# then collapse any repeated (hospitalization, date, hour) key to a single row.
scaffold_keys = ['hospitalization_id', 'recorded_date', 'recorded_hour']

final_df = hourly_scaffold.merge(cohort_hourly_agg, how='left', on=scaffold_keys)
final_df = (
    final_df
    .sort_values(scaffold_keys)
    .groupby(scaffold_keys, as_index=False)
    .last()
)

In [None]:
# Fill scaffold hours that have no respiratory record.
#
# Only `on_vent` is patient STATE and is carried forward. `in_icu` and
# `in_icu_24h` are TIME-WINDOW flags: forward-filling them (as a blanket
# ffill does) keeps "first 24h" switched on for every later hour until the
# next respiratory record, so events after the window (e.g. vasoactives
# started on day 3) would be counted as first-24h events. They are therefore
# recomputed from the scaffold hour and the first ICU stay boundaries.
scaffold_keys = ['hospitalization_id', 'recorded_date', 'recorded_hour']
final_df = final_df.sort_values(scaffold_keys).reset_index(drop=True)

# Carry the last known ventilator status forward within each hospitalization.
final_df['on_vent'] = final_df.groupby('hospitalization_id')['on_vent'].ffill()

# Wall-clock start/end of each scaffold hour (site-local, timezone-naive).
scaffold_hour_start = (
    pd.to_datetime(final_df['recorded_date'])
    + pd.to_timedelta(final_df['recorded_hour'], unit='h')
)
scaffold_hour_end = scaffold_hour_start + pd.Timedelta(hours=1)

# First-ICU-stay boundaries as site-local wall-clock times, looked up per row.
# recorded_date / recorded_hour were derived from local wall-clock time too,
# so both sides are compared on the same basis (this can be off by one hour
# for stays spanning a DST transition).
icu_bounds = (
    first_icu_stays[['hospitalization_id', 'in_dttm', 'out_dttm']]
    .drop_duplicates(subset='hospitalization_id')
    .set_index('hospitalization_id')
)
icu_in = final_df['hospitalization_id'].map(icu_bounds['in_dttm'].dt.tz_localize(None))
icu_out = final_df['hospitalization_id'].map(icu_bounds['out_dttm'].dt.tz_localize(None))

# An hour bucket counts when it overlaps the window; comparisons against a
# missing boundary (NaT) evaluate to False, i.e. flag 0.
hour_reaches_icu_admission = scaffold_hour_end > icu_in
final_df['in_icu'] = (
    hour_reaches_icu_admission & (scaffold_hour_start <= icu_out)
).astype(int)
final_df['in_icu_24h'] = (
    hour_reaches_icu_admission
    & (scaffold_hour_start <= icu_in + pd.Timedelta(hours=24))
).astype(int)

In [None]:
# Definition I: ventilated throughout the first 24h of the first ICU stay.
# Taking the minimum of on_vent over the flagged hours yields 1 only when
# every such hour with a known status has on_vent == 1.
first_day_hours = final_df[final_df['in_icu_24h'] == 1]
def_1_status = first_day_hours.groupby('hospitalization_id').agg({'on_vent': 'min'})
def_1_status['def_1'] = def_1_status['on_vent'].eq(1).astype(int)

# Broadcast the per-hospitalization flag back onto the hourly rows.
final_df = final_df.merge(def_1_status[['def_1']], how='left', on='hospitalization_id')

def_1_total = def_1_status['def_1'].sum()
print(f"Hospitalizations meeting def_1: {def_1_total}")
strobe_counts["Hospitalizations meeting def_1 (On vent for first 24 hrs of first ICU stay)"] = def_1_total

# Definition II
Hospitalizations that are on vasoactive medications during the first 24 hours of their first ICU stay.


#### Medication Admin Continuous

- Filter down to the required meds and the cohort
- Identify if any of these meds were administered continuously during that hour, and create a flag for each med at the hourly level. 

In [None]:
# Columns and medication categories needed for the vasoactive definition.
meds_required_columns = [
    'hospitalization_id', 'admin_dttm',
    'med_name', 'med_category',
    'med_dose', 'med_dose_unit',
]
meds_of_interest = [
    'norepinephrine', 'epinephrine', 'phenylephrine', 'vasopressin',
    'dopamine', 'angiotensin', 'dobutamine',
]

# Load continuous medication administrations for the hospitalizations in
# all_ids, restricted to the categories listed above.
meds_filters = dict(
    hospitalization_id=all_ids['hospitalization_id'].unique().tolist(),
    med_category=meds_of_interest,
)
meds = pyCLIF.load_data(
    'clif_medication_admin_continuous',
    columns=meds_required_columns,
    filters=meds_filters,
)

# Normalise types: string ids, lowercase units, site-tz datetimes, numeric doses.
meds['hospitalization_id'] = meds['hospitalization_id'].astype(str)
meds['med_dose_unit'] = meds['med_dose_unit'].str.lower()
meds = pyCLIF.convert_datetime_columns_to_site_tz(meds, pyCLIF.helper['timezone'])
meds['med_dose'] = pd.to_numeric(meds['med_dose'], errors='coerce')

# Date / hour-of-day keys matching the hourly scaffold.
administered_at = meds['admin_dttm']
meds['recorded_date'] = administered_at.dt.date
meds['recorded_hour'] = administered_at.dt.hour

In [None]:
# Record how many distinct hospitalizations have at least one loaded
# vasoactive medication row, then display the running STROBE counts.
strobe_counts["D_adult_hospitalizations_since_2021_icu_meds"] = len(meds['hospitalization_id'].drop_duplicates())
strobe_counts

In [None]:
# Number of loaded medication rows per med_category (notebook cell output).
meds.value_counts('med_category')

In [None]:
# Keep only rows for the vasoactive categories in meds_of_interest.
meds_filtered = meds[meds['med_category'].isin(meds_of_interest)].copy()

# A row counts as "administered" only when it has a non-missing dose above zero.
dose_is_positive = meds_filtered['med_dose'].notna() & (meds_filtered['med_dose'] > 0.0)

# One 0/1 indicator column per medication category.
flag_columns = [med + '_flag' for med in meds_of_interest]
for med, flag_column in zip(meds_of_interest, flag_columns):
    meds_filtered[flag_column] = (
        (meds_filtered['med_category'] == med) & dose_is_positive
    ).astype(int)

# Hourly roll-up: taking the max means a single positive-dose record within
# the hour is enough to set that medication's flag to 1 for the hour.
meds_flags = (
    meds_filtered
    .groupby(['hospitalization_id', 'recorded_date', 'recorded_hour'], as_index=False)
    [flag_columns]
    .max()
)

# Any vasoactive medication in the hour.
meds_flags['vasoactive_meds_flag'] = meds_flags[flag_columns].max(axis=1)

In [None]:
# Attach the hourly "any vasoactive" flag to the scaffold rows; hours without
# a medication record are left as NaN by the left join.
hourly_vasoactive = meds_flags[
    ['hospitalization_id', 'recorded_date', 'recorded_hour', 'vasoactive_meds_flag']
]
final_df = final_df.merge(
    hourly_vasoactive,
    how='left',
    on=['hospitalization_id', 'recorded_date', 'recorded_hour'],
)

In [None]:
# Carry the most recent vasoactive flag forward through hours that have no
# medication record, separately for each hospitalization.
final_df = final_df.sort_values(['hospitalization_id', 'recorded_date', 'recorded_hour'])
final_df['vasoactive_meds_flag'] = (
    final_df.groupby('hospitalization_id')['vasoactive_meds_flag'].ffill()
)

# Definition II: any flagged hour among the first-24h ICU hours.
first_day_hours = final_df[final_df['in_icu_24h'] == 1]
def_2_status = first_day_hours.groupby('hospitalization_id').agg(
    {'vasoactive_meds_flag': 'max'}
)
def_2_status['def_2'] = def_2_status['vasoactive_meds_flag'].eq(1).astype(int)

# Broadcast the per-hospitalization flag back onto the hourly rows.
final_df = final_df.merge(def_2_status[['def_2']], how='left', on='hospitalization_id')

def_2_total = def_2_status['def_2'].sum()
print(f"Hospitalizations meeting def_2 (vasoactive meds in first 24h): {def_2_total}")
strobe_counts["Hospitalizations meeting def_2 (vasoactive meds in first 24h)"] = def_2_total

# Definition III

Hospitalizations that have stage I AKI defined as 

    3a. 0.3 mg/dl absolute increase in serum creatinine over a 24 hour period since admission   

    3b. 50% increase in serum creatinine over 7 days

#### Labs- Creatinine

In [None]:
# Columns and lab categories needed for the AKI definition.
labs_required_columns = [
    'hospitalization_id', 'lab_result_dttm',
    'lab_name', 'lab_category',
    'lab_value', 'lab_value_numeric',
]
labs_of_interest = ['creatinine']

# Load creatinine results for the cohort hospitalizations.
labs_filters = dict(
    hospitalization_id=cohort['hospitalization_id'].unique().tolist(),
    lab_category=labs_of_interest,
)
labs = pyCLIF.load_data('clif_labs', columns=labs_required_columns, filters=labs_filters)
print("unique encounters in labs", pyCLIF.count_unique_encounters(labs))

# Normalise types: string ids, chronological order, site-tz datetimes,
# numeric results (unparseable values become NaN).
labs['hospitalization_id'] = labs['hospitalization_id'].astype(str)
labs = labs.sort_values(by=['hospitalization_id', 'lab_result_dttm'])
labs = pyCLIF.convert_datetime_columns_to_site_tz(labs, pyCLIF.helper['timezone'])
labs['lab_value_numeric'] = pd.to_numeric(labs['lab_value_numeric'], errors='coerce')

# Hour-of-day / date keys matching the hourly scaffold.
resulted_at = labs['lab_result_dttm']
labs['recorded_hour'] = resulted_at.dt.hour
labs['recorded_date'] = resulted_at.dt.date

In [None]:
# Record how many distinct hospitalizations have at least one loaded
# creatinine row, then display the running STROBE counts.
strobe_counts["E_adult_hospitalizations_since_2021_icu_creatinine"] = len(labs['hospitalization_id'].drop_duplicates())
strobe_counts

In [None]:
# Hourly creatinine table: exactly one value per hospitalization-hour.
# Several results can land in the same hour; left-joining them as-is would
# multiply scaffold rows and leave their order within the hour undefined.
# The last non-missing result of each hour is kept, matching the
# "last value in the hour" convention used for the other hourly flags.
creatinine = (
    labs
    .dropna(subset=['lab_value_numeric'])
    .sort_values(['hospitalization_id', 'lab_result_dttm'])
    .drop_duplicates(
        subset=['hospitalization_id', 'recorded_date', 'recorded_hour'],
        keep='last',
    )
    [['hospitalization_id', 'recorded_date', 'recorded_hour', 'lab_value_numeric']]
    .sort_values(by=['hospitalization_id', 'recorded_date', 'recorded_hour'])
)

In [None]:
# Attach creatinine values to the scaffold rows; hours without a result are
# left as NaN by the left join.
creatinine_join_keys = ['hospitalization_id', 'recorded_date', 'recorded_hour']
final_df = final_df.merge(creatinine, how='left', on=creatinine_join_keys)

In [None]:
# NOTE(review): this whole cell is commented-out code. It anchors the 24h/7d
# windows on the first scaffold date rather than on ICU admission and appears
# to be superseded by the ICU-anchored implementation in a later cell --
# confirm, then delete.
# # First, get baseline creatinine (first value after ICU admission) for each hospitalization
# baseline_creat = (
#     final_df[final_df['lab_value_numeric'].notna()]
#     .sort_values(['hospitalization_id', 'recorded_date', 'recorded_hour'])
#     .groupby('hospitalization_id')
#     .first()
#     [['lab_value_numeric']]
#     .rename(columns={'lab_value_numeric': 'baseline_creatinine'})
# )

# # Merge baseline back to main df
# final_df = final_df.merge(baseline_creat, on='hospitalization_id', how='left')

# # Forward fill creatinine for up to 24 hours only
# final_df = (
#     final_df
#     .sort_values(['hospitalization_id', 'recorded_date', 'recorded_hour'])
#     .assign(
#         # Create a flag for values within 24h of baseline measurement
#         within_24h_baseline=lambda x: x.groupby('hospitalization_id')['recorded_date'].transform(
#             lambda g: (g <= g.iloc[0] + pd.Timedelta(days=1))
#         ),
#         # Forward fill creatinine only within 24h window
#         lab_value_numeric_filled=lambda x: x.groupby('hospitalization_id').apply(
#             lambda group: group['lab_value_numeric'].where(
#                 group['within_24h_baseline'], np.nan
#             ).ffill()
#         ).reset_index(0, drop=True)
#     )
# )

# # Calculate def_3a: 0.3 mg/dl increase over 24h period
# creat_24h = (
#     final_df[final_df['within_24h_baseline']]
#     .groupby('hospitalization_id')
#     .agg({
#         'lab_value_numeric_filled': 'last',  # Creatinine at 24h
#         'baseline_creatinine': 'first'
#     })
# )
# creat_24h['def_3a'] = (
#     (creat_24h['lab_value_numeric_filled'] - creat_24h['baseline_creatinine']) >= 0.3
# ).astype(int)

# # Calculate def_3b: 50% increase over 7 days
# # Create 7-day window flag
# final_df['within_7d_baseline'] = final_df.groupby('hospitalization_id')['recorded_date'].transform(
#     lambda g: (g <= g.iloc[0] + pd.Timedelta(days=7))
# )

# creat_7d = (
#     final_df[final_df['within_7d_baseline'] & final_df['lab_value_numeric'].notna()]
#     .groupby('hospitalization_id')
#     .agg({
#         'lab_value_numeric': 'last',  # Creatinine at 7 days (no forward fill beyond 24h)
#         'baseline_creatinine': 'first'
#     })
# )
# creat_7d['def_3b'] = (
#     (creat_7d['lab_value_numeric'] / creat_7d['baseline_creatinine']) >= 1.5
# ).astype(int)

# # Combine def_3a and def_3b
# aki_flags = creat_24h[['def_3a']].merge(creat_7d[['def_3b']], on='hospitalization_id', how='outer').fillna(0)
# aki_flags['def_3'] = ((aki_flags['def_3a'] == 1) | (aki_flags['def_3b'] == 1)).astype(int)

# # Merge back to final_df
# final_df = final_df.merge(aki_flags, on='hospitalization_id', how='left')

# print(f"Hospitalizations with def_3a (0.3 mg/dl increase in 24h): {aki_flags['def_3a'].sum()}")
# print(f"Hospitalizations with def_3b (50% increase in 7d): {aki_flags['def_3b'].sum()}")
# print(f"Hospitalizations with def_3 (either 3a or 3b): {aki_flags['def_3'].sum()}")

# strobe_counts["Hospitalizations with def_3a (0.3 mg/dl increase in 24h)"] = aki_flags['def_3a'].sum()
# strobe_counts["Hospitalizations with def_3b (50% increase in 7d)"] = aki_flags['def_3b'].sum()
# strobe_counts["Hospitalizations with def_3 (either 3a or 3b)"] =aki_flags['def_3'].sum()

In [None]:
# Display the STROBE counts collected so far (notebook cell output).
strobe_counts

In [None]:
import numpy as np
import pandas as pd

# Definition III (stage I AKI), anchored on the first ICU admission:
#   def_3a - creatinine rises by >= 0.3 mg/dl within 24 hours of ICU admission
#   def_3b - creatinine reaches >= 1.5x baseline within 7 days of ICU admission
# Baseline is the first creatinine at or after the ICU admission hour.
# Windows are measured in scaffold HOURS since ICU admission. Comparing
# calendar dates instead would stretch the "24h" window to as much as 48h
# and would admit results from before ICU admission on the admission day.

hour_keys = ['hospitalization_id', 'recorded_date', 'recorded_hour']

# ICU admission reference (one row per hospitalization).
icu_admission_times = first_icu_stays[['hospitalization_id', 'in_dttm']].copy()
icu_admission_times['icu_admission_date'] = icu_admission_times['in_dttm'].dt.date
# Site-local wall-clock admission time truncated to the hour, so it is
# comparable with the scaffold's recorded_date / recorded_hour keys.
icu_admission_times['icu_admission_hour'] = (
    icu_admission_times['in_dttm'].dt.tz_localize(None).dt.floor('h')
)

# Merge ICU admission reference to final_df
final_df = final_df.merge(icu_admission_times[['hospitalization_id', 'icu_admission_date']],
                         on='hospitalization_id', how='left')

# Hours elapsed between the ICU admission hour and each scaffold hour
# (NaN when the hospitalization has no ICU admission time).
scaffold_hour = (
    pd.to_datetime(final_df['recorded_date'])
    + pd.to_timedelta(final_df['recorded_hour'], unit='h')
)
icu_admission_hour = final_df['hospitalization_id'].map(
    icu_admission_times
    .drop_duplicates(subset='hospitalization_id')
    .set_index('hospitalization_id')['icu_admission_hour']
)
hours_since_icu = (scaffold_hour - icu_admission_hour) / pd.Timedelta(hours=1)

# Time windows from ICU admission; rows with unknown elapsed time are False.
final_df['within_24h_icu'] = hours_since_icu.between(0, 24)
final_df['within_7d_icu'] = hours_since_icu.between(0, 7 * 24)

# Get baseline creatinine (first value at or after the ICU admission hour)
baseline_creat = (
    final_df[final_df['lab_value_numeric'].notna() & (hours_since_icu >= 0)]
    .sort_values(hour_keys)
    .groupby('hospitalization_id')
    .first()
    [['lab_value_numeric', 'recorded_date']]
    .rename(columns={'lab_value_numeric': 'baseline_creatinine',
                    'recorded_date': 'baseline_creatinine_date'})
)

# Merge baseline back to main df
final_df = final_df.merge(baseline_creat, on='hospitalization_id', how='left')

# Forward fill creatinine using only values measured inside the 24h window.
# A column-wise grouped ffill replaces groupby.apply(...).reset_index(0, ...),
# whose result shape depends on the number of groups.
final_df = final_df.sort_values(hour_keys)
creatinine_in_24h = final_df['lab_value_numeric'].where(final_df['within_24h_icu'])
final_df['lab_value_numeric_filled'] = (
    creatinine_in_24h.groupby(final_df['hospitalization_id']).ffill()
)

# Calculate def_3a: 0.3 mg/dl increase within 24h of ICU admission
# NOTE(review): the LAST value in the window is compared with baseline, so a
# rise that has resolved by the end of the window is not counted -- confirm
# this is intended rather than the window maximum.
creat_24h = (
    final_df[final_df['within_24h_icu']]
    .groupby('hospitalization_id')
    .agg({
        'lab_value_numeric_filled': 'last',  # Last creatinine in 24h window from ICU admission
        'baseline_creatinine': 'first'
    })
)

# Only calculate def_3a if we have both baseline and 24h values
creat_24h = creat_24h.dropna()
creat_24h['def_3a'] = (
    (creat_24h['lab_value_numeric_filled'] - creat_24h['baseline_creatinine']) >= 0.3
).astype(int)

# Calculate def_3b: 50% increase within 7 days of ICU admission
creat_7d = (
    final_df[
        (final_df['within_7d_icu']) &
        (final_df['lab_value_numeric'].notna())  # Only actual measurements, no forward fill
    ]
    .groupby('hospitalization_id')
    .agg({
        'lab_value_numeric': 'last',  # Last actual creatinine in 7d window from ICU admission
        'baseline_creatinine': 'first'
    })
)

# Only calculate def_3b if we have both baseline and 7d values
creat_7d = creat_7d.dropna()
creat_7d['def_3b'] = (
    (creat_7d['lab_value_numeric'] / creat_7d['baseline_creatinine']) >= 1.5
).astype(int)

# Combine def_3a and def_3b. Both frames are indexed by hospitalization_id;
# a hospitalization missing from one side gets 0 for that flag. Flags are
# cast back to int so the counts below are integers, not floats.
aki_flags = (
    creat_24h[['def_3a']]
    .join(creat_7d[['def_3b']], how='outer')
    .fillna(0)
    .astype(int)
)
aki_flags['def_3'] = ((aki_flags['def_3a'] == 1) | (aki_flags['def_3b'] == 1)).astype(int)

# Merge back to final_df. Only the three AKI flag columns are zero-filled:
# a blanket fillna(0) on final_df would also overwrite missing creatinine
# values, ventilator status and the def_1 / def_2 flags with 0.
aki_columns = ['def_3a', 'def_3b', 'def_3']
final_df = final_df.merge(aki_flags[aki_columns], on='hospitalization_id', how='left')
final_df[aki_columns] = final_df[aki_columns].fillna(0).astype(int)

print(f"Hospitalizations with def_3a (0.3 mg/dl increase in 24h from ICU admission): {aki_flags['def_3a'].sum()}")
print(f"Hospitalizations with def_3b (50% increase in 7d from ICU admission): {aki_flags['def_3b'].sum()}")
print(f"Hospitalizations with def_3 (either 3a or 3b): {aki_flags['def_3'].sum()}")

# Update strobe counts
strobe_counts["Hospitalizations with def_3a (0.3 mg/dl increase in 24h from ICU admission)"] = aki_flags['def_3a'].sum()
strobe_counts["Hospitalizations with def_3b (50% increase in 7d from ICU admission)"] = aki_flags['def_3b'].sum()
strobe_counts["Hospitalizations with def_3 (either 3a or 3b from ICU admission)"] = aki_flags['def_3'].sum()

# Optional: Show some validation statistics
print("\nValidation:")
print(f"Hospitalizations with baseline creatinine: {baseline_creat.shape[0]}")
print(f"Hospitalizations with 24h creatinine data: {creat_24h.shape[0]}")
print(f"Hospitalizations with 7d creatinine data: {creat_7d.shape[0]}")

# Summary

In [None]:
import matplotlib.pyplot as plt
from upsetplot import UpSet, from_indicators
import pandas as pd
import os
import json
import numpy as np

# Summarise the overlap of the three definitions per hospitalization and write
# the plot, the combination table and the counts to ../output/final.

# Create output directory if it doesn't exist
os.makedirs('../output/final', exist_ok=True)

definition_columns = ['def_1', 'def_2', 'def_3']

# One row per hospitalization; a missing flag means the definition was not met.
summary_df = final_df[['hospitalization_id'] + definition_columns].drop_duplicates()
summary_df = summary_df.fillna(0)

# Convert to boolean for upset plot
for definition in definition_columns:
    summary_df[definition] = summary_df[definition].astype(bool)

# ---- UpSet plot -----------------------------------------------------------
fig = plt.figure(figsize=(16, 12))  # Larger figure size
upset_data = from_indicators(definition_columns,
                           data=summary_df.set_index('hospitalization_id'))

upset = UpSet(upset_data,
              subset_size='count',
              show_counts=True,
              sort_by='cardinality',
              element_size=50,  # Larger dots
              with_lines=True)  # Add connecting lines for clarity
upset.plot(fig=fig)

# Adjust spacing to prevent overlapping
plt.subplots_adjust(left=0.2, bottom=0.2, right=0.95, top=0.85, hspace=0.3, wspace=0.3)

# Add title with more space
plt.suptitle('Overlap of Clinical Definitions\n(def_1: 24h ventilation, def_2: 24h vasoactives, def_3: AKI)',
             fontsize=16, y=0.95)

# Adjust font sizes for better readability
for ax in fig.get_axes():
    for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
                 ax.get_xticklabels() + ax.get_yticklabels()):
        item.set_fontsize(12)

# Save the plot
plt.savefig('../output/final/definition_overlap_upset_plot.png', dpi=300, bbox_inches='tight')
plt.savefig('../output/final/definition_overlap_upset_plot.pdf', bbox_inches='tight')
plt.show()

# ---- Combination table ----------------------------------------------------
# Each entry: (label, description, required (def_1, def_2, def_3) pattern).
# The eight patterns are mutually exclusive and cover every hospitalization.
combination_specs = [
    ('def_1 only', '24h ventilation only', (True, False, False)),
    ('def_2 only', '24h vasoactives only', (False, True, False)),
    ('def_3 only', 'AKI only', (False, False, True)),
    ('def_1 & def_2', '24h ventilation + 24h vasoactives', (True, True, False)),
    ('def_1 & def_3', '24h ventilation + AKI', (True, False, True)),
    ('def_2 & def_3', '24h vasoactives + AKI', (False, True, True)),
    ('def_1 & def_2 & def_3', 'All three conditions', (True, True, True)),
    ('None', 'No conditions met', (False, False, False)),
]

combinations = []
for label, description, (want_1, want_2, want_3) in combination_specs:
    matches_pattern = (
        (summary_df['def_1'] == want_1)
        & (summary_df['def_2'] == want_2)
        & (summary_df['def_3'] == want_3)
    )
    combinations.append({
        'Combination': label,
        'Description': description,
        'Count': int(matches_pattern.sum()),
    })

# Create summary table
combo_df = pd.DataFrame(combinations)
combo_df['Percentage'] = (combo_df['Count'] / len(summary_df) * 100).round(1)

print("Summary Table of Definition Combinations:")
print("=" * 60)
print(combo_df.to_string(index=False))

# ---- Per-definition totals --------------------------------------------------
individual_totals = {
    'def_1_24h_ventilation_count': int(summary_df['def_1'].sum()),
    'def_1_24h_ventilation_percentage': float(summary_df['def_1'].mean()*100),
    'def_2_24h_vasoactives_count': int(summary_df['def_2'].sum()),
    'def_2_24h_vasoactives_percentage': float(summary_df['def_2'].mean()*100),
    'def_3_aki_count': int(summary_df['def_3'].sum()),
    'def_3_aki_percentage': float(summary_df['def_3'].mean()*100),
    'total_hospitalizations': int(len(summary_df))
}

print("\n\nIndividual Definition Totals:")
print("=" * 40)
print(f"def_1 (24h ventilation): {individual_totals['def_1_24h_ventilation_count']} ({individual_totals['def_1_24h_ventilation_percentage']:.1f}%)")
print(f"def_2 (24h vasoactives): {individual_totals['def_2_24h_vasoactives_count']} ({individual_totals['def_2_24h_vasoactives_percentage']:.1f}%)")
print(f"def_3 (AKI): {individual_totals['def_3_aki_count']} ({individual_totals['def_3_aki_percentage']:.1f}%)")
print(f"Total hospitalizations: {individual_totals['total_hospitalizations']}")

# ---- Persist results --------------------------------------------------------
# Save the summary tables
combo_df.to_csv('../output/final/definition_combinations_summary.csv', index=False)

# Save individual totals as JSON for easy reading (explicit UTF-8, matching
# how the config JSON is read at the top of the notebook).
with open('../output/final/individual_definition_totals.json', 'w', encoding='utf-8') as f:
    json.dump(individual_totals, f, indent=2)

# Save the final strobe counts. numpy scalars are not JSON serialisable; every
# numpy scalar exposes .item(), which returns the native Python value, so a
# single check covers integer and floating types alike.
strobe_counts_serializable = {
    key: value.item() if hasattr(value, 'item') else value
    for key, value in strobe_counts.items()
}
with open('../output/final/strobe_counts.json', 'w', encoding='utf-8') as f:
    json.dump(strobe_counts_serializable, f, indent=2)

print("\n\nFiles saved to output/final/:")
print("- definition_overlap_upset_plot.png")
print("- definition_overlap_upset_plot.pdf")
print("- definition_combinations_summary.csv")
print("- individual_definition_totals.json")
print("- strobe_counts.json")