# ICU Mortality Model - Feature Engineering

This notebook loads the ICU cohort and creates hourly wide dataset for the first 24 hours of ICU stay.

## Objective
- Load ICU cohort from 01_cohort.ipynb
- Use pyCLIF to extract features from CLIF tables
- Create hourly wide dataset for the first 24 hours
- Filter to encounters with complete 24-hour data
- Save features for modeling

## Feature Sources
- **Vitals**: All vital_category values
- **Labs**: All lab_category values
- **Patient Assessments**: GCS_total, RASS
- **Respiratory Support**: Mode, FiO2, PEEP, ventilator settings (with one-hot encoding)
- **Medications**: All vasoactives and sedatives

## Setup and Configuration

In [1]:
import sys
import os
sys.path.append(os.path.join('..', 'src'))

import pandas as pd
import numpy as np
from pyclif import CLIF
from pyclif.utils.wide_dataset import convert_wide_to_hourly
import json
import warnings
warnings.filterwarnings('ignore')

print("=== ICU Mortality Model - Feature Engineering ===")
print("Setting up environment...")

=== ICU Mortality Model - Feature Engineering ===
Setting up environment...


In [2]:
def load_config():
    """Load configuration from config.json"""
    config_path = os.path.join("config_demo.json")
    
    if os.path.exists(config_path):
        with open(config_path, 'r') as file:
            config = json.load(file)
        print("✅ Loaded configuration from config.json")
    else:
        raise FileNotFoundError("Configuration file not found. Please create config.json based on the config_template.")
    
    return config

# Load configuration
config = load_config()
print(f"Site: {config['site']}")
print(f"Data path: {config['clif2_path']}")
print(f"File type: {config['filetype']}")

✅ Loaded configuration from config.json
Site: MIMIC
Data path: /Users/sudo_sage/Documents/work/mimic_demo
File type: parquet


In [3]:
# Initialize pyCLIF
clif = CLIF(
    data_dir=config['clif2_path'],
    filetype=config['filetype'],
    timezone="US/Eastern"
)

print("✅ pyCLIF initialized successfully")

CLIF Object Initialized.
✅ pyCLIF initialized successfully


## Load ICU Cohort

In [4]:
# Load ICU cohort from 01_cohort.ipynb
cohort_path = os.path.join('output', 'intermitted', 'icu_cohort.csv')

if os.path.exists(cohort_path):
    cohort_df = pd.read_csv(cohort_path)
    
    # Convert datetime columns
    datetime_cols = ['start_dttm', 'hour_24_start_dttm', 'hour_24_end_dttm']
    for col in datetime_cols:
        cohort_df[col] = pd.to_datetime(cohort_df[col])
    
    print(f"✅ Loaded ICU cohort: {len(cohort_df)} hospitalizations")
    print(f"Mortality rate: {cohort_df['disposition'].mean():.3f}")
    print(f"Time range: {cohort_df['start_dttm'].min()} to {cohort_df['start_dttm'].max()}")
    
else:
    raise FileNotFoundError(f"Cohort file not found at {cohort_path}. Please run 01_cohort.ipynb first.")

# Display sample
print("\nSample cohort records:")
print(cohort_df.head())

✅ Loaded ICU cohort: 89 hospitalizations
Mortality rate: 0.090
Time range: 2110-04-11 20:52:22+00:00 to 2201-12-12 01:11:52+00:00

Sample cohort records:
   hospitalization_id                start_dttm        hour_24_start_dttm  \
0            24597018 2157-11-21 00:18:02+00:00 2157-11-21 00:18:02+00:00   
1            25563031 2110-04-11 20:52:22+00:00 2110-04-11 20:52:22+00:00   
2            20321825 2156-05-01 02:53:00+00:00 2156-05-01 02:53:00+00:00   
3            23473524 2156-05-11 19:49:34+00:00 2156-05-11 19:49:34+00:00   
4            28662225 2156-04-12 21:24:18+00:00 2156-04-12 21:24:18+00:00   

           hour_24_end_dttm  disposition  
0 2157-11-22 00:18:02+00:00            0  
1 2110-04-12 20:52:22+00:00            0  
2 2156-05-02 02:53:00+00:00            0  
3 2156-05-12 19:49:34+00:00            0  
4 2156-04-13 21:24:18+00:00            0  


## Feature Extraction Configuration

In [5]:
# Define feature extraction configuration
print("Configuring feature extraction...")

# Get hospitalization IDs from cohort
cohort_ids = cohort_df['hospitalization_id'].astype(str).unique().tolist()
print(f"Extracting features for {len(cohort_ids)} hospitalizations")

# Define category filters for each table
category_filters = {
    'vitals': [  # Common vital signs
        'heart_rate', 'map', 'respiratory_rate', 'spo2', 'temp_c',
        'weight_kg', 'height_cm'
    ],
    'labs': [  # Common lab values
        "albumin",    "alkaline_phosphatase",    "alt",    "ast",    "basophils_percent",    "basophils_absolute",    "bicarbonate",    "bilirubin_total",    "bilirubin_conjugated",    "bilirubin_unconjugated",
    "bun",
    "calcium_total",    "calcium_ionized",    "chloride",    "creatinine",    "crp",    "eosinophils_percent",
    "eosinophils_absolute",    "esr",    "ferritin",    "glucose_fingerstick",    "glucose_serum",    "hemoglobin",    "phosphate",    "inr",    "lactate",    "ldh",
    "lymphocytes_percent",    "lymphocytes_absolute",    "magnesium",    "monocytes_percent",    "monocytes_absolute",    "neutrophils_percent",    "neutrophils_absolute",
    "pco2_arterial",    "po2_arterial",    "pco2_venous",    "ph_arterial",    "ph_venous",    "platelet_count",    "potassium",    "procalcitonin",
    "pt",    "ptt",    "so2_arterial",    "so2_mixed_venous",    "so2_central_venous",    "sodium",
    "total_protein",    "troponin_i",    "troponin_t",    "wbc"
    ],
    'patient_assessments': [  # Neurological assessments
        'gcs_total', 'rass'
    ],
    'medication_admin_continuous': [  # Vasoactives and sedatives
        "norepinephrine",
    "epinephrine",
    "phenylephrine",
    "angiotensin",
    "vasopressin",
    "dopamine",
    "dobutamine",
    "milrinone",
    "isoproterenol",
    "propofol",
    "dexmedetomidine",
    "ketamine",
    "midazolam",
    "fentanyl",
    "hydromorphone",
    "morphine",
    "remifentanil",
    "pentobarbital",
    "lorazepam"
    ],
    'respiratory_support': [  # All respiratory support categories
        'mode_category', 'device_category', 'fio2'
    ]
}

print("Feature extraction configuration:")
for table, categories in category_filters.items():
    print(f"  {table}: {len(categories)} categories")
    print(f"    {categories[:5]}..." if len(categories) > 5 else f"    {categories}")

Configuring feature extraction...
Extracting features for 89 hospitalizations
Feature extraction configuration:
  vitals: 7 categories
    ['heart_rate', 'map', 'respiratory_rate', 'spo2', 'temp_c']...
  labs: 52 categories
    ['albumin', 'alkaline_phosphatase', 'alt', 'ast', 'basophils_percent']...
  patient_assessments: 2 categories
    ['gcs_total', 'rass']
  medication_admin_continuous: 19 categories
    ['norepinephrine', 'epinephrine', 'phenylephrine', 'angiotensin', 'vasopressin']...
  respiratory_support: 3 categories
    ['mode_category', 'device_category', 'fio2']


## Create Wide Dataset Using pyCLIF

In [6]:
# Create wide dataset for cohort hospitalizations
print("Creating wide dataset using pyCLIF...")


wide_df = clif.create_wide_dataset(
    hospitalization_ids=cohort_ids,
    optional_tables=['vitals', 'labs', 'patient_assessments', 'medication_admin_continuous', 'respiratory_support'],
    category_filters=category_filters,
    save_to_data_location=False  # Keep in memory for processing
)


Creating wide dataset using pyCLIF...
Auto-loading required base table: patient
Loading clif_patient.parquet
Data loaded successfully from clif_patient.parquet
Validation completed with 2 error(s). See `errors` attribute.
Auto-loading required base table: hospitalization
Loading clif_hospitalization.parquet
Data loaded successfully from clif_hospitalization.parquet
Validation completed successfully.
Auto-loading required base table: adt
Loading clif_adt.parquet
Data loaded successfully from clif_adt.parquet
Validation completed with 4 error(s). See `errors` attribute.
Auto-loading optional table: vitals
Loading clif_vitals.parquet
Data loaded successfully from clif_vitals.parquet
Validation completed with 5 error(s).
  - 5 range validation error(s)
See `errors` and `range_validation_errors` attributes for details.
Auto-loading optional table: labs
Loading clif_labs.parquet
Data loaded successfully from clif_labs.parquet
Validation completed with 24 error(s).
  - 8 schema validation err

In [7]:
wide_df.to_csv("wide_df.csv", index=False)

## Filter to 24-Hour Window

In [8]:
# Filter wide dataset to 24-hour windows
print("Filtering to 24-hour windows...")
cohort_df['hospitalization_id'] = cohort_df['hospitalization_id'].astype(str)
# Merge with cohort to get time windows
wide_df_filtered = pd.merge(
    wide_df,
    cohort_df[['hospitalization_id', 'hour_24_start_dttm', 'hour_24_end_dttm', 'disposition']],
    on='hospitalization_id',
    how='inner'
)

print(f"After merge with cohort: {len(wide_df_filtered)} records")

# Filter events within 24-hour window
wide_df_filtered = wide_df_filtered[
    (wide_df_filtered['event_time'] >= wide_df_filtered['hour_24_start_dttm']) &
    (wide_df_filtered['event_time'] <= wide_df_filtered['hour_24_end_dttm'])
].reset_index(drop=True)

print(f"✅ Filtered to 24-hour windows: {len(wide_df_filtered)} records")
print(f"Hospitalizations with data: {wide_df_filtered['hospitalization_id'].nunique()}")

# Show time window validation
print("\nTime window validation:")
print(f"All events within window: {((wide_df_filtered['event_time'] >= wide_df_filtered['hour_24_start_dttm']) & (wide_df_filtered['event_time'] <= wide_df_filtered['hour_24_end_dttm'])).all()}")
print(f"Average records per hospitalization: {len(wide_df_filtered) / wide_df_filtered['hospitalization_id'].nunique():.1f}")

Filtering to 24-hour windows...
After merge with cohort: 27380 records
✅ Filtered to 24-hour windows: 6769 records
Hospitalizations with data: 89

Time window validation:
All events within window: True
Average records per hospitalization: 76.1


In [9]:
wide_df_filtered.columns.to_list()

['patient_id',
 'hospitalization_id',
 'hospitalization_joined_id',
 'admission_dttm',
 'discharge_dttm',
 'age_at_admission',
 'admission_type_name',
 'admission_type_category',
 'discharge_name',
 'discharge_category',
 'zipcode_nine_digit',
 'zipcode_five_digit',
 'census_block_code',
 'census_block_group_code',
 'census_tract',
 'state_code',
 'county_code',
 'race_name',
 'race_category',
 'ethnicity_name',
 'ethnicity_category',
 'sex_name',
 'sex_category',
 'birth_date',
 'death_dttm',
 'language_name',
 'language_category',
 'event_time',
 'hospital_id',
 'in_dttm',
 'out_dttm',
 'location_name',
 'location_category',
 'location_type',
 'heart_rate',
 'height_cm',
 'map',
 'respiratory_rate',
 'spo2',
 'temp_c',
 'weight_kg',
 'albumin',
 'alkaline_phosphatase',
 'alt',
 'ast',
 'basophils_absolute',
 'basophils_percent',
 'bicarbonate',
 'bilirubin_conjugated',
 'bilirubin_total',
 'bilirubin_unconjugated',
 'bun',
 'calcium_ionized',
 'calcium_total',
 'chloride',
 'creatini

In [10]:
aggregation_config = {
    'max': ['eosinophils_absolute',
 'glucose_fingerstick',
 'lymphocytes_absolute',
 'monocytes_absolute',
 'neutrophils_absolute',
 'procalcitonin',
 'troponin_i',
 'wbc',
 'gcs_total',
 'rass',
 'angiotensin',
 'isoproterenol',
 'ketamine',
 'remifentanil',
 'pentobarbital',
 'lorazepam',
 'fio2'],
    'min': ['eosinophils_absolute',
 'glucose_fingerstick',
 'lymphocytes_absolute',
 'monocytes_absolute',
 'neutrophils_absolute',
 'procalcitonin',
 'troponin_i',
 'wbc',
 'gcs_total',
 'rass',
 'angiotensin',
 'isoproterenol',
 'ketamine',
 'remifentanil',
 'pentobarbital',
 'lorazepam',
 'fio2'],
    'mean': ['eosinophils_absolute',
 'glucose_fingerstick',
 'lymphocytes_absolute',
 'monocytes_absolute',
 'neutrophils_absolute',
 'procalcitonin',
 'troponin_i',
 'wbc',
 'gcs_total',
 'rass',
 'angiotensin',
 'isoproterenol',
 'ketamine',
 'remifentanil',
 'pentobarbital',
 'lorazepam',
 'fio2'],
    'median': ['eosinophils_absolute',
 'glucose_fingerstick',
 'lymphocytes_absolute',
 'monocytes_absolute',
 'neutrophils_absolute',
 'procalcitonin',
 'troponin_i',
 'wbc',
 'gcs_total',
 'rass',
 'angiotensin',
 'isoproterenol',
 'ketamine',
 'remifentanil',
 'pentobarbital',
 'lorazepam',
 'fio2',],
    'boolean': ['eosinophils_absolute',
 'glucose_fingerstick',
 'lymphocytes_absolute',
 'monocytes_absolute',
 'neutrophils_absolute',
 'procalcitonin',
 'troponin_i',
 'wbc',
 'gcs_total',
 'rass',
 'angiotensin',
 'isoproterenol',
 'ketamine',
 'remifentanil',
 'pentobarbital',
 'lorazepam',
 'fio2',],
    'one_hot_encode': [ 'mode_category','device_category']
}

hourly_df = convert_wide_to_hourly(wide_df, aggregation_config)



Starting hourly aggregation of wide dataset...
Calculating nth_hour starting from 0 based on first event...
Processing 27380 records into hourly buckets...
The following columns are not mentioned in aggregation_config, defaulting to 'first' with '_c' postfix:
  - hospitalization_joined_id
  - admission_dttm
  - discharge_dttm
  - age_at_admission
  - admission_type_name
  - admission_type_category
  - discharge_name
  - discharge_category
  - zipcode_nine_digit
  - zipcode_five_digit
  - census_block_code
  - census_block_group_code
  - census_tract
  - state_code
  - county_code
  - race_name
  - race_category
  - ethnicity_name
  - ethnicity_category
  - sex_name
  - sex_category
  - birth_date
  - death_dttm
  - language_name
  - language_category
  - hospital_id
  - in_dttm
  - out_dttm
  - location_name
  - location_category
  - location_type
  - heart_rate
  - height_cm
  - map
  - respiratory_rate
  - spo2
  - temp_c
  - weight_kg
  - albumin
  - alkaline_phosphatase
  - alt
  -

Aggregating data by hour: 100%|██████████| 10982/10982 [00:43<00:00, 253.23group/s]


Hourly aggregation complete: 10982 hourly records from 27380 original records
Columns in hourly dataset: 222


In [13]:
hourly_df

Unnamed: 0,hospitalization_id,event_time_hour,nth_hour,hour_bucket,patient_id,day_number,eosinophils_absolute_max,glucose_fingerstick_max,lymphocytes_absolute_max,monocytes_absolute_max,...,device_category_Face_Mask,device_category_Nasal_Cannula,device_category_High_Flow_NC,device_category_Other,device_category_NIPPV,mode_category_Pressure_Regulated_Volume_Control,mode_category_Pressure_Control,mode_category_Other,mode_category_Volume_Support,device_category_CPAP
0,20044587,2113-08-25 08:00:00+00:00,0,8,10023771,1,,,,,...,0,0,0,0,0,0,0,0,0,0
1,20044587,2113-08-25 12:00:00+00:00,4,12,10023771,1,,,,,...,0,0,0,0,0,0,0,0,0,0
2,20044587,2113-08-25 14:00:00+00:00,6,14,10023771,1,,,,,...,0,0,0,0,0,0,0,0,0,0
3,20044587,2113-08-25 15:00:00+00:00,7,15,10023771,1,,,,,...,0,0,0,0,0,0,0,0,0,0
4,20044587,2113-08-25 16:00:00+00:00,8,16,10023771,1,,,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10977,29974575,2131-03-11 13:00:00+00:00,283,13,10020944,13,,,,,...,0,0,0,0,0,0,0,0,0,0
10978,29974575,2131-03-12 11:00:00+00:00,305,11,10020944,14,,,,,...,0,0,0,0,0,0,0,0,0,0
10979,29974575,2131-03-12 12:00:00+00:00,306,12,10020944,14,,,,,...,0,0,0,0,0,0,0,0,0,0
10980,29974575,2131-03-13 11:00:00+00:00,329,11,10020944,15,,,,,...,0,0,0,0,0,0,0,0,0,0


In [14]:
hourly_df.columns.to_list()

['hospitalization_id',
 'event_time_hour',
 'nth_hour',
 'hour_bucket',
 'patient_id',
 'day_number',
 'eosinophils_absolute_max',
 'glucose_fingerstick_max',
 'lymphocytes_absolute_max',
 'monocytes_absolute_max',
 'neutrophils_absolute_max',
 'procalcitonin_max',
 'troponin_i_max',
 'wbc_max',
 'gcs_total_max',
 'rass_max',
 'angiotensin_max',
 'isoproterenol_max',
 'ketamine_max',
 'remifentanil_max',
 'pentobarbital_max',
 'lorazepam_max',
 'fio2_max',
 'eosinophils_absolute_min',
 'glucose_fingerstick_min',
 'lymphocytes_absolute_min',
 'monocytes_absolute_min',
 'neutrophils_absolute_min',
 'procalcitonin_min',
 'troponin_i_min',
 'wbc_min',
 'gcs_total_min',
 'rass_min',
 'angiotensin_min',
 'isoproterenol_min',
 'ketamine_min',
 'remifentanil_min',
 'pentobarbital_min',
 'lorazepam_min',
 'fio2_min',
 'eosinophils_absolute_mean',
 'glucose_fingerstick_mean',
 'lymphocytes_absolute_mean',
 'monocytes_absolute_mean',
 'neutrophils_absolute_mean',
 'procalcitonin_mean',
 'troponin