## Standard test for the wide dataset function

In [None]:
import pandas as pd
from clifpy import ClifOrchestrator

# Orchestrator configuration for the bundled demo data.
# Change `data_directory` so it points at your own CLIF tables.
orchestrator_settings = {
    'data_directory': '../clifpy/data/clif_demo/',  # Adjust this path
    'filetype': 'parquet',                          # or 'csv' depending on your data format
    'timezone': 'UTC',
    'output_directory': None,  # Will create 'output' directory in current working directory
}
co = ClifOrchestrator(**orchestrator_settings)

# Cohort definition: one row per hospitalization with the time window to keep.
cohort_ids = ['23559586', '20626031']
cohort_starts = pd.to_datetime(['2137-01-01 14:29:00-06:00', '2132-12-14 08:00:00-06:00'])
cohort_ends = pd.to_datetime(['2137-08-25 14:00:00-06:00', '2132-12-20 01:00:00-06:00'])
cohort_df = pd.DataFrame({
    'hospitalization_id': cohort_ids,
    'start_time': cohort_starts,
    'end_time': cohort_ends,
})

# Load every table that feeds the wide dataset (same order as the filters below).
for table_name in ('vitals', 'labs', 'patient_assessments', 'medication_admin_continuous'):
    co.load_table(table_name)

# Categories to pivot into columns, keyed by source table.
vasoactive_meds = [
    "norepinephrine", "epinephrine", "phenylephrine", "vasopressin",
    "dopamine", "angiotensin", "dobutamine", "milrinone",
]
wide_category_filters = {
    'labs': ['creatinine', 'platelet_count', 'po2_arterial', 'bilirubin_total'],
    'vitals': ['map', 'spo2', 'weight_kg'],
    'patient_assessments': ['gcs_total', 'sbt_delivery_pass_fail', 'braden_activity'],
    "medication_admin_continuous": vasoactive_meds,
}

# Build the wide dataset; the tables were loaded explicitly above, so
# `tables_to_load` stays disabled:
#   tables_to_load=['vitals', 'labs', 'patient_assessments', 'medication_admin_continuous'],
wide_df = co.create_wide_dataset(
    category_filters=wide_category_filters,
    sample=True,  # Use 20 random hospitalizations
    cohort_df=cohort_df,
)

# Summarize the result that the orchestrator keeps on `co.wide_df`.
print(f"Wide dataset created with shape: {co.wide_df.shape}")
print(f"Columns: {list(co.wide_df.columns)}")

## expected shape -> Wide dataset created with shape: (924, 29)

In [None]:
# Show the dtype of every column in the wide dataset stored on the orchestrator.
co.wide_df.dtypes

## Wide dataset test with encounter stitching

In [None]:
import pandas as pd
from clifpy import ClifOrchestrator
import sys
from pathlib import Path
import pandas as pd
import numpy as np

def find_project_root(start=None):
    """Locate the repository root by walking upward from ``start``.

    A directory is treated as the root when it contains a ``pyproject.toml``
    file or a ``clifpy`` sub-directory.

    Parameters
    ----------
    start : str or pathlib.Path, optional
        Directory to begin the search from. Defaults to the current working
        directory. Relative paths are made absolute before searching.

    Returns
    -------
    pathlib.Path
        The closest matching directory (``start`` itself is checked first),
        or the absolute starting directory when nothing matches.
    """
    # A relative path's `.parents` ends at ".", so without resolve() the walk
    # could never climb above the working directory.
    origin = Path(start or Path.cwd()).resolve()
    for candidate in (origin, *origin.parents):
        has_pyproject = (candidate / "pyproject.toml").exists()
        has_package_dir = (candidate / "clifpy").is_dir()
        if has_pyproject or has_package_dir:
            return candidate
    return origin

# Locate the checkout and make sure it is importable from this notebook.
project_root = find_project_root()
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

# Paths and I/O settings shared by the cells below.
DATA_DIR = project_root.joinpath("clifpy", "data", "clif_demo").resolve()
OUTPUT_DIR = project_root.joinpath("examples", "output").resolve()
FILETYPE = "parquet"
TIMEZONE = "US/Eastern"

print(f"Data directory: {DATA_DIR}")
print(f"Output directory: {OUTPUT_DIR}")


# Orchestrator with encounter stitching switched on.
stitching_settings = dict(
    data_directory=str(DATA_DIR),
    filetype=FILETYPE,
    timezone=TIMEZONE,
    output_directory=str(OUTPUT_DIR),
    stitch_encounter=True,    # Enable encounter stitching
    stitch_time_interval=6,   # 6-hour window (default)
)
clif = ClifOrchestrator(**stitching_settings)

# Load the two tables requested for this step.
clif.initialize(['hospitalization', 'adt'])

# Fetch the encounter mapping and report on it when one is available.
encounter_mapping = clif.get_encounter_mapping()

if encounter_mapping is not None:
    print(f"Total hospitalizations: {len(encounter_mapping)}")
    print(f"Total encounter blocks: {encounter_mapping['encounter_block'].nunique()}")
    print(f"\nEncounter mapping shape: {encounter_mapping.shape}")

In [None]:
# Cohort definition: one row per hospitalization with the time window to keep.
cohort_ids = ['23559586', '20626031']
cohort_starts = pd.to_datetime(['2137-01-01 14:29:00-06:00', '2132-12-14 08:00:00-06:00'])
cohort_ends = pd.to_datetime(['2137-08-25 14:00:00-06:00', '2132-12-20 01:00:00-06:00'])
cohort_df = pd.DataFrame({
    'hospitalization_id': cohort_ids,
    'start_time': cohort_starts,
    'end_time': cohort_ends,
})

# Load every table that feeds the wide dataset.
for table_name in ('vitals', 'labs', 'patient_assessments', 'medication_admin_continuous'):
    clif.load_table(table_name)

# Categories to pivot into columns, keyed by source table.
vasoactive_meds = [
    "norepinephrine", "epinephrine", "phenylephrine", "vasopressin",
    "dopamine", "angiotensin", "dobutamine", "milrinone",
]
wide_category_filters = {
    'labs': ['creatinine', 'platelet_count', 'po2_arterial', 'bilirubin_total'],
    'vitals': ['map', 'spo2', 'weight_kg'],
    'patient_assessments': ['gcs_total'],
    "medication_admin_continuous": vasoactive_meds,
}

# Build the wide dataset; the tables were loaded explicitly above, so
# `tables_to_load` stays disabled:
#   tables_to_load=['vitals', 'labs', 'patient_assessments', 'medication_admin_continuous'],
clif.create_wide_dataset(
    category_filters=wide_category_filters,
    sample=True,  # Use 20 random hospitalizations
    cohort_df=cohort_df,
)

# Summarize the result that the orchestrator keeps on `clif.wide_df`.
print(f"Wide dataset created with shape: {clif.wide_df.shape}")
print(f"Columns: {list(clif.wide_df.columns)}")

## Test with medication dose-unit conversion

In [1]:
import pandas as pd
from clifpy import ClifOrchestrator
from clifpy.utils import apply_outlier_handling
import sys
from pathlib import Path
import pandas as pd
import numpy as np

def find_project_root(start=None):
    """Return the nearest ancestor directory that looks like the project root.

    The search begins at ``start`` (the current working directory when
    omitted) and moves upward. A directory matches when it holds a
    ``pyproject.toml`` file or a ``clifpy`` sub-directory.

    Parameters
    ----------
    start : str or pathlib.Path, optional
        Where to begin; relative paths are converted to absolute ones first.

    Returns
    -------
    pathlib.Path
        The first matching directory, or the absolute starting directory if
        no directory on the way up matches.
    """
    # Resolve first: `Path("some/relative").parents` stops at ".", which would
    # end the upward walk at the working directory.
    begin = Path(start or Path.cwd()).resolve()
    search_path = [begin, *begin.parents]
    for folder in search_path:
        if (folder / "pyproject.toml").exists() or (folder / "clifpy").is_dir():
            return folder
    return begin

# Put the repository root on the import path (only once).
project_root = find_project_root()
project_root_str = str(project_root)
if project_root_str not in sys.path:
    sys.path.insert(0, project_root_str)

# Locations and I/O settings used throughout this section.
DATA_DIR = project_root.joinpath("clifpy", "data", "clif_demo").resolve()
OUTPUT_DIR = project_root.joinpath("examples", "output").resolve()
FILETYPE = "parquet"
TIMEZONE = "US/Eastern"

print(f"Data directory: {DATA_DIR}")
print(f"Output directory: {OUTPUT_DIR}")


# Create the orchestrator with encounter stitching turned on.
orchestrator_kwargs = {
    "data_directory": str(DATA_DIR),
    "filetype": FILETYPE,
    "timezone": TIMEZONE,
    "output_directory": str(OUTPUT_DIR),
    "stitch_encounter": True,    # Enable encounter stitching
    "stitch_time_interval": 6,   # 6-hour window (default)
}
clif = ClifOrchestrator(**orchestrator_kwargs)

# Load the two tables requested for this step.
clif.initialize(['hospitalization', 'adt'])

# Retrieve the encounter mapping and print a short summary if it exists.
encounter_mapping = clif.get_encounter_mapping()

if encounter_mapping is not None:
    print(f"Total hospitalizations: {len(encounter_mapping)}")
    print(f"Total encounter blocks: {encounter_mapping['encounter_block'].nunique()}")
    print(f"\nEncounter mapping shape: {encounter_mapping.shape}")

Data directory: /Users/sudo_sage/Documents/WORK/clifpy/clifpy/data/clif_demo
Output directory: /Users/sudo_sage/Documents/WORK/clifpy/examples/output
Using directly provided parameters
ClifOrchestrator initialized.
Using directly provided parameters
Loading clif_hospitalization.parquet
Data loaded successfully from clif_hospitalization.parquet
admission_dttm: null count before conversion= 0
admission_dttm: Converted from UTC to your timezone (US/Eastern).
admission_dttm: null count after conversion= 0
discharge_dttm: null count before conversion= 0
discharge_dttm: Converted from UTC to your timezone (US/Eastern).
discharge_dttm: null count after conversion= 0
Using directly provided parameters
Loading clif_adt.parquet
Data loaded successfully from clif_adt.parquet
in_dttm: null count before conversion= 0
in_dttm: Converted from UTC to your timezone (US/Eastern).
in_dttm: null count after conversion= 0
out_dttm: null count before conversion= 275
out_dttm: Converted from UTC to your time

In [2]:
# Cohort restriction is switched off for this run; the definition is kept
# here, commented out, for reference.
# cohort_df = pd.DataFrame({
#     'hospitalization_id': ['23559586', '20626031'],
#     'start_time': pd.to_datetime(['2137-01-01 14:29:00-06:00', '2132-12-14 08:00:00-06:00']),
#     'end_time': pd.to_datetime(['2137-08-25 14:00:00-06:00', '2132-12-20 01:00:00-06:00'])
# })

# Vitals are loaded first so outlier handling runs on them before the
# remaining tables are loaded.
clif.load_table('vitals')
apply_outlier_handling(clif.vitals)
for table_name in ('labs', 'patient_assessments', 'medication_admin_continuous'):
    clif.load_table(table_name)

# Target dose unit per continuous medication category.
preferred_units_cont = {
    "propofol": "mcg/min",
    "fentanyl": "mcg/hr",
    "insulin": "u/hr",
    "midazolam": "mg/hr",
    "heparin": "u/min",
}
clif.convert_dose_units_for_continuous_meds(preferred_units_cont)

# Categories to pivot into columns, keyed by source table.
vasoactive_meds = [
    "norepinephrine", "epinephrine", "phenylephrine", "vasopressin",
    "dopamine", "angiotensin", "dobutamine", "milrinone",
]
wide_category_filters = {
    'labs': ['creatinine', 'platelet_count', 'po2_arterial', 'bilirubin_total'],
    'vitals': ['map', 'spo2', 'weight_kg'],
    'patient_assessments': ['gcs_total'],
    "medication_admin_continuous": vasoactive_meds,
}

# Build the wide dataset. Options left disabled for this run:
#   tables_to_load=['vitals', 'labs', 'patient_assessments', 'medication_admin_continuous'],
#   sample=True,  # Use 20 random hospitalizations
#   cohort_df=cohort_df
clif.create_wide_dataset(category_filters=wide_category_filters)

# Summarize the result that the orchestrator keeps on `clif.wide_df`.
print(f"Wide dataset created with shape: {clif.wide_df.shape}")
print(f"Columns: {list(clif.wide_df.columns)}")

Using directly provided parameters
Loading clif_vitals.parquet
Data loaded successfully from clif_vitals.parquet
recorded_dttm: null count before conversion= 0
recorded_dttm: Converted from UTC to your timezone (US/Eastern).
recorded_dttm: null count after conversion= 0
Using CLIF standard outlier ranges

Building outlier expressions...


Building expressions: 100%|██████████| 1/1 [00:00<00:00, 1128.71column/s]


Applying outlier filtering...


Processing: 100%|██████████| 1/1 [00:00<00:00, 80.09operation/s]


Vitals Table - Category Statistics:
  dbp                 :  14351 values →      0 nullified (  0.0%)
  heart_rate          :  13913 values →      0 nullified (  0.0%)
  height_cm           :     71 values →      2 nullified (  2.8%)
  map                 :  14368 values →     20 nullified (  0.1%)
  respiratory_rate    :  13913 values →      0 nullified (  0.0%)
  sbp                 :  14356 values →      0 nullified (  0.0%)
  spo2                :  13540 values →      3 nullified (  0.0%)
  temp_c              :   3767 values →      5 nullified (  0.1%)
  weight_kg           :    806 values →      1 nullified (  0.1%)
Using directly provided parameters
Loading clif_labs.parquet
Data loaded successfully from clif_labs.parquet
lab_order_dttm: null count before conversion= 43419
lab_order_dttm: Converted from UTC to your timezone (US/Eastern).
lab_order_dttm: null count after conversion= 43419
lab_collect_dttm: null count before conversion= 0
lab_collect_dttm: Converted from UTC to y




Using directly provided parameters
Loading clif_medication_admin_continuous.parquet
Data loaded successfully from clif_medication_admin_continuous.parquet
admin_dttm: null count before conversion= 0
admin_dttm: Converted from UTC to your timezone (US/Eastern).
admin_dttm: null count after conversion= 0
No weight_kg column found, adding the most recent from vitals
=== WIDE DATASET CREATION STARTED ===

Phase 1: Initialization
  1.1: Validating parameters
  1.2: Configuring encounter stitching (enabled)

Phase 2: Encounter Processing
  2.1: === SPECIAL: ENCOUNTER STITCHING ===
       - No encounter_blocks provided - processing all encounter blocks

Phase 3: Table Loading
  3.1: Auto-loading base tables
       - Loading patient table...
Using directly provided parameters
Loading clif_patient.parquet
Data loaded successfully from clif_patient.parquet
death_dttm: null count before conversion= 85
death_dttm: Converted from UTC to your timezone (US/Eastern).
death_dttm: null count after conve

In [3]:
# List the distinct `med_category` values in the loaded continuous-medication table.
clif.medication_admin_continuous.df.med_category.unique()

array(['dextrose', 'propofol', 'insulin', 'magnesium', 'heparin',
       'esmolol', 'diltiazem', 'phenylephrine', 'norepinephrine',
       'vasopressin', 'fentanyl', 'amiodarone', 'labetalol',
       'dexmedetomidine', 'sodium bicarbonate', 'pantoprazole', 'tpn',
       'nicardipine', 'dobutamine', 'dopamine', 'midazolam', 'furosemide',
       'morphine', 'octreotide', 'aminocaproic', 'epinephrine',
       'bumetanide', 'milrinone', 'rocuronium', 'hydromorphone'],
      dtype=object)

In [4]:
# Show the wide dataset's column names as a plain Python list.
clif.wide_df.columns.to_list()

['hospitalization_id',
 'patient_id',
 'age_at_admission',
 'event_time',
 'dobutamine_mcg_min',
 'dopamine_mcg_min',
 'epinephrine_mcg_min',
 'milrinone_mcg_min',
 'norepinephrine_mcg_min',
 'phenylephrine_mcg_min',
 'vasopressin_u_min',
 'map',
 'spo2',
 'weight_kg',
 'bilirubin_total',
 'creatinine',
 'platelet_count',
 'po2_arterial',
 'hospital_id',
 'in_dttm',
 'out_dttm',
 'location_category',
 'location_type',
 'encounter_block',
 'day_number',
 'hosp_id_day_key',
 'angiotensin',
 'gcs_total']

## Test Hourly Aggregation with Encounter Blocks

This test demonstrates the new `id_name` parameter of `convert_wide_to_hourly()`, which selects the ID column used for aggregation:
- Default: `hospitalization_id` - each hospitalization aggregated separately
- With encounter stitching: `encounter_block` - linked hospitalizations aggregated together

In [None]:
# Test hourly aggregation with encounter blocks
import pandas as pd

# Aggregation method -> wide_df columns it applies to.
# Medication columns use the unit-suffixed names found in the wide dataset.
aggregation_config = {
    'mean': ['map', 'spo2', 'weight_kg'],  # Vitals
    'max': ['creatinine', 'bilirubin_total'],  # Labs - worst values
    'min': ['platelet_count'],  # Labs - lowest count
    'first': ['gcs_total'],  # Assessments - first in hour
    'boolean': ['norepinephrine_mcg_min', 'epinephrine_mcg_min', 'phenylephrine_mcg_min', 
                'vasopressin_u_min', 'dopamine_mcg_min', 'dobutamine_mcg_min', 
                'milrinone_mcg_min']  # Meds - any use in hour
}

# Make sure we have a wide dataset with encounter_block column
if 'encounter_block' not in clif.wide_df.columns:
    print("WARNING: encounter_block not in wide_df columns. Ensure encounter stitching was performed.")
    print("Available columns:", clif.wide_df.columns.tolist())
else:
    print(f"Wide dataset ready: {clif.wide_df.shape}")
    print(f"Unique encounter blocks in wide_df: {clif.wide_df['encounter_block'].nunique()}")


# Aggregate per encounter block and report a few sanity figures.
try:
    hourly_df_enc = clif.convert_wide_to_hourly(
        aggregation_config=aggregation_config,
        id_name='encounter_block'
    )
    
    print(f"✓ Shape: {hourly_df_enc.shape}")
    print(f"✓ Unique encounter_blocks: {hourly_df_enc['encounter_block'].nunique()}")
    print(f"✓ Sample columns: {list(hourly_df_enc.columns)[:10]}...")
    print(f"✓ Hour range: nth_hour {hourly_df_enc['nth_hour'].min()} to {hourly_df_enc['nth_hour'].max()}")
except Exception as e:
    print(f"✗ Error: {e}")
    # Re-raise so the run stops here: otherwise later cells would fail with a
    # NameError on `hourly_df_enc` or silently reuse a stale frame from an
    # earlier execution.
    raise

In [6]:
# Show the columns of the hourly frame produced by the aggregation cell above.
hourly_df_enc.columns

Index(['encounter_block', 'event_time_hour', 'nth_hour', 'hour_bucket',
       'patient_id', 'day_number', 'creatinine_max', 'bilirubin_total_max',
       'platelet_count_min', 'map_mean', 'spo2_mean', 'weight_kg_mean',
       'gcs_total_first', 'hospitalization_id_c', 'age_at_admission_c',
       'dobutamine_mcg_min_c', 'dopamine_mcg_min_c', 'epinephrine_mcg_min_c',
       'milrinone_mcg_min_c', 'norepinephrine_mcg_min_c',
       'phenylephrine_mcg_min_c', 'vasopressin_u_min_c', 'po2_arterial_c',
       'hospital_id_c', 'in_dttm_c', 'out_dttm_c', 'location_category_c',
       'location_type_c', 'hosp_id_day_key_c', 'angiotensin_c'],
      dtype='object')

In [None]:
# Hourly aggregation configuration: aggregation method -> wide_df columns.
# The entries were previously left as bare `'key': value` lines, which is a
# SyntaxError at cell level; they are wrapped in a dict so the cell runs.
aggregation_config = {
    'mean': ['map', 'spo2', 'weight_kg'],  # Vitals
    'max': ['creatinine', 'bilirubin_total'],  # Labs - worst values
    'min': ['platelet_count'],  # Labs - lowest count
    'first': ['gcs_total'],  # Assessments - first in hour
    'boolean': ['norepinephrine_mcg_min', 'epinephrine_mcg_min', 'phenylephrine_mcg_min',
                'vasopressin_u_min', 'dopamine_mcg_min', 'dobutamine_mcg_min',
                'milrinone_mcg_min'],  # Meds - any use in hour (unit-aware column names)
}