## Standard test for the wide dataset function

In [None]:
import pandas as pd
from clifpy import ClifOrchestrator

# Build the orchestrator for the demo data.
# Point data_directory at wherever your CLIF tables actually live.
co = ClifOrchestrator(
    data_directory='../clifpy/data/clif_demo/',  # Adjust this path
    filetype='parquet',  # or 'csv' depending on your data format
    timezone='UTC',
    output_directory=None,  # Will create 'output' directory in current working directory
)

# Cohort definition: one row per hospitalization with its time window of interest.
cohort_df = pd.DataFrame(dict(
    hospitalization_id=['23559586', '20626031'],
    start_time=pd.to_datetime(['2137-01-01 14:29:00-06:00', '2132-12-14 08:00:00-06:00']),
    end_time=pd.to_datetime(['2137-08-25 14:00:00-06:00', '2132-12-20 01:00:00-06:00']),
))

# Load the tables that feed the wide dataset.
for table_name in ('vitals', 'labs', 'patient_assessments', 'medication_admin_continuous'):
    co.load_table(table_name)

# Categories to keep from each source table.
wide_category_filters = {
    'labs': ['creatinine', 'platelet_count', 'po2_arterial', 'bilirubin_total'],
    'vitals': ['map', 'spo2', 'weight_kg'],
    'patient_assessments': ['gcs_total', 'sbt_delivery_pass_fail', 'braden_activity'],
    "medication_admin_continuous": [
        "norepinephrine", "epinephrine", "phenylephrine", "vasopressin",
        "dopamine", "angiotensin", "dobutamine", "milrinone",
    ],
}

# Create the wide dataset. `tables_to_load` is left unset; the tables were
# loaded explicitly above.
# NOTE(review): `sample=True` and `cohort_df` are both passed — confirm how the
# two interact, since the expected shape noted below is a fixed value.
# NOTE(review): `wide_df` is not read again in the visible cells; the result is
# taken from `co.wide_df` instead.
wide_df = co.create_wide_dataset(
    category_filters=wide_category_filters,
    sample=True,  # Use 20 random hospitalizations
    cohort_df=cohort_df,
)

# Summarize the result: shape first, then the column names.
print(f"Wide dataset created with shape: {co.wide_df.shape}")
print(f"Columns: {list(co.wide_df.columns)}")

## expected shape -> Wide dataset created with shape: (924, 29)

In [None]:
# Inspect the dtype of every column in the wide dataset built above.
co.wide_df.dtypes

## Wide dataset test with encounter stitching

In [None]:
import pandas as pd
from clifpy import ClifOrchestrator
import sys
from pathlib import Path
import pandas as pd
import numpy as np

def find_project_root(start=None):
    """Locate the project root by walking upward from a starting directory.

    Parameters
    ----------
    start : str or Path, optional
        Directory to begin the search from. When omitted (or otherwise falsy),
        the current working directory is used.

    Returns
    -------
    Path
        The first directory, checking the start itself and then each ancestor
        from nearest to farthest, that contains a ``pyproject.toml`` entry or
        a ``clifpy`` subdirectory. If no directory qualifies, the starting
        path is returned.
    """
    origin = Path(start or Path.cwd())

    def looks_like_root(candidate):
        # Either marker is sufficient; pyproject.toml is checked first.
        return (candidate / "pyproject.toml").exists() or (candidate / "clifpy").is_dir()

    # First match wins; fall back to the starting path when nothing matches.
    return next(filter(looks_like_root, (origin, *origin.parents)), origin)

# Resolve the repository root and make sure it is on the import path.
# NOTE(review): `clifpy` is already imported at the top of this cell, so this
# sys.path change cannot influence which clifpy copy that import picked up.
project_root = find_project_root()
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Notebook configuration: demo data location, output folder, file format, timezone.
DATA_DIR = project_root.joinpath("clifpy", "data", "clif_demo").resolve()
OUTPUT_DIR = project_root.joinpath("examples", "output").resolve()
FILETYPE = "parquet"
TIMEZONE = "US/Eastern"

print(
    f"Data directory: {DATA_DIR}",
    f"Output directory: {OUTPUT_DIR}",
    sep="\n",
)


# Orchestrator configured to stitch encounters together.
clif = ClifOrchestrator(
    data_directory=str(DATA_DIR),
    filetype=FILETYPE,
    timezone=TIMEZONE,
    output_directory=str(OUTPUT_DIR),
    stitch_encounter=True,  # Enable encounter stitching
    stitch_time_interval=6,  # 6-hour window (default)
)

# Initialize with the hospitalization and adt tables.
clif.initialize(['hospitalization', 'adt'])

# Retrieve the hospitalization -> encounter_block mapping and summarize it.
encounter_mapping = clif.get_encounter_mapping()

if encounter_mapping is not None:
    print(f"Total hospitalizations: {len(encounter_mapping)}")
    print(f"Total encounter blocks: {encounter_mapping['encounter_block'].nunique()}")
    print(f"\nEncounter mapping shape: {encounter_mapping.shape}")

In [None]:
# Cohort definition: one row per hospitalization with its time window of interest.
cohort_df = pd.DataFrame(dict(
    hospitalization_id=['23559586', '20626031'],
    start_time=pd.to_datetime(['2137-01-01 14:29:00-06:00', '2132-12-14 08:00:00-06:00']),
    end_time=pd.to_datetime(['2137-08-25 14:00:00-06:00', '2132-12-20 01:00:00-06:00']),
))

# Load the tables that feed the wide dataset.
for table_name in ('vitals', 'labs', 'patient_assessments', 'medication_admin_continuous'):
    clif.load_table(table_name)

# Categories to keep from each source table.
wide_category_filters = {
    'labs': ['creatinine', 'platelet_count', 'po2_arterial', 'bilirubin_total'],
    'vitals': ['map', 'spo2', 'weight_kg'],
    'patient_assessments': ['gcs_total'],
    "medication_admin_continuous": [
        "norepinephrine", "epinephrine", "phenylephrine", "vasopressin",
        "dopamine", "angiotensin", "dobutamine", "milrinone",
    ],
}

# Create the wide dataset; the result is read back from `clif.wide_df`.
# `tables_to_load` is left unset; the tables were loaded explicitly above.
# NOTE(review): `sample=True` and `cohort_df` are both passed — confirm how the
# two interact.
clif.create_wide_dataset(
    category_filters=wide_category_filters,
    sample=True,  # Use 20 random hospitalizations
    cohort_df=cohort_df,
)

# Summarize the result: shape first, then the column names.
print(f"Wide dataset created with shape: {clif.wide_df.shape}")
print(f"Columns: {list(clif.wide_df.columns)}")

## Wide dataset test with medication dose-unit conversion

In [None]:
import pandas as pd
from clifpy import ClifOrchestrator
from clifpy.utils import apply_outlier_handling
import sys
from pathlib import Path
import pandas as pd
import numpy as np

# NOTE(review): this duplicates the find_project_root helper defined in the
# earlier encounter-stitching cell; the later definition shadows the earlier one.
def find_project_root(start=None):
    """Locate the project root by walking upward from a starting directory.

    Parameters
    ----------
    start : str or Path, optional
        Directory to begin the search from. When omitted (or otherwise falsy),
        the current working directory is used.

    Returns
    -------
    Path
        The first directory, checking the start itself and then each ancestor
        from nearest to farthest, that contains a ``pyproject.toml`` entry or
        a ``clifpy`` subdirectory. If no directory qualifies, the starting
        path is returned.
    """
    origin = Path(start or Path.cwd())

    def looks_like_root(candidate):
        # Either marker is sufficient; pyproject.toml is checked first.
        return (candidate / "pyproject.toml").exists() or (candidate / "clifpy").is_dir()

    # First match wins; fall back to the starting path when nothing matches.
    return next(filter(looks_like_root, (origin, *origin.parents)), origin)

# Resolve the repository root and make sure it is on the import path.
# NOTE(review): `clifpy` is already imported at the top of this cell, so this
# sys.path change cannot influence which clifpy copy that import picked up.
project_root = find_project_root()
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Notebook configuration: demo data location, output folder, file format, timezone.
DATA_DIR = project_root.joinpath("clifpy", "data", "clif_demo").resolve()
OUTPUT_DIR = project_root.joinpath("examples", "output").resolve()
FILETYPE = "parquet"
TIMEZONE = "US/Eastern"

print(
    f"Data directory: {DATA_DIR}",
    f"Output directory: {OUTPUT_DIR}",
    sep="\n",
)


# Orchestrator configured to stitch encounters together.
clif = ClifOrchestrator(
    data_directory=str(DATA_DIR),
    filetype=FILETYPE,
    timezone=TIMEZONE,
    output_directory=str(OUTPUT_DIR),
    stitch_encounter=True,  # Enable encounter stitching
    stitch_time_interval=6,  # 6-hour window (default)
)

# Initialize with the hospitalization and adt tables.
clif.initialize(['hospitalization', 'adt'])

# Retrieve the hospitalization -> encounter_block mapping and summarize it.
encounter_mapping = clif.get_encounter_mapping()

if encounter_mapping is not None:
    print(f"Total hospitalizations: {len(encounter_mapping)}")
    print(f"Total encounter blocks: {encounter_mapping['encounter_block'].nunique()}")
    print(f"\nEncounter mapping shape: {encounter_mapping.shape}")

In [None]:
# No cohort_df is built in this cell: the cohort restriction used in the earlier
# cells (and the `sample` flag) is not passed to create_wide_dataset below.

# Load vitals and run outlier handling on them before loading the other tables.
clif.load_table('vitals')
apply_outlier_handling(clif.vitals)
for table_name in ('labs', 'patient_assessments', 'medication_admin_continuous'):
    clif.load_table(table_name)

# Target dose units per continuous medication.
# NOTE(review): none of these medications appear in the
# medication_admin_continuous category filter below — confirm this is intended.
preferred_units_cont = dict(
    propofol="mcg/min",
    fentanyl="mcg/hr",
    insulin="u/hr",
    midazolam="mg/hr",
    heparin="u/min",
)

clif.convert_dose_units_for_continuous_meds(preferred_units_cont)

# Categories to keep from each source table.
# NOTE(review): respiratory_support is filtered here but is not loaded with
# load_table in this cell — confirm it is picked up by create_wide_dataset.
wide_category_filters = {
    'labs': ['creatinine', 'platelet_count', 'po2_arterial', 'bilirubin_total'],
    'vitals': ['map', 'spo2', 'weight_kg'],
    'patient_assessments': ['gcs_total'],
    "medication_admin_continuous": [
        "norepinephrine", "epinephrine", "phenylephrine", "vasopressin",
        "dopamine", "angiotensin", "dobutamine", "milrinone",
    ],
    "respiratory_support": ['device_category'],
}

# Create the wide dataset; the result is read back from `clif.wide_df`.
# `tables_to_load`, `sample` and `cohort_df` are all left unset.
clif.create_wide_dataset(category_filters=wide_category_filters)

# Summarize the result: shape first, then the column names.
print(f"Wide dataset created with shape: {clif.wide_df.shape}")
print(f"Columns: {list(clif.wide_df.columns)}")

In [None]:
# List the distinct med_category values in the loaded continuous-medication table.
clif.medication_admin_continuous.df.med_category.unique()

In [None]:
# Inspect the dtype of every column in the wide dataset.
clif.wide_df.dtypes

In [None]:
# Show the wide dataset's column names as a plain Python list.
clif.wide_df.columns.to_list()

## Test Hourly Aggregation with New Parameters

This section demonstrates the `convert_wide_to_hourly()` function with new flexible parameters:

**Key Parameters:**
- `id_name`: Group by different ID columns
  - `'hospitalization_id'` (default): Each hospitalization aggregated separately
  - `'encounter_block'`: Linked hospitalizations aggregated together (requires encounter stitching)
- `hourly_window`: Configurable time window (1-72 hours)
  - `1` (default): Standard 1-hour windows
  - `6`, `12`, `24`, etc.: Larger aggregation windows
- `fill_gaps`: Control output density
  - `False` (default): Sparse output - only windows with data
  - `True`: Dense output - fill all windows from 0 to max with NaN for gaps

**New Output Schema:**
- `window_number`: Sequential window index (0-indexed, starts at 0 for each group)
- `window_start_dttm`: Window start timestamp (inclusive)
- `window_end_dttm`: Window end timestamp (exclusive)
- Windows are **event-based**, starting from each group's first event (not calendar boundaries)

In [None]:
# Test hourly aggregation with various parameter combinations
import pandas as pd

# Aggregation rules: each key is an aggregation method, each value the list of
# wide-dataset columns that method is applied to.
aggregation_config = {
    'mean': ['map', 'spo2', 'weight_kg'],  # Vitals
    'max': ['creatinine', 'bilirubin_total'],  # Labs - worst values
    'min': ['platelet_count'],  # Labs - lowest count
    'first': ['gcs_total'],  # Assessments - first in window
    'boolean': ['norepinephrine_mcg_min', 'epinephrine_mcg_min', 'phenylephrine_mcg_min', 
                'vasopressin_u_min', 'dopamine_mcg_min', 'dobutamine_mcg_min', 
                'milrinone_mcg_min']  # Meds - any use in window
}

# Each run below is announced with its own banner so the printed output shows
# which parameter combination produced which result.

print("="*80)
print("TEST 1: Default (1-hour windows, sparse, by hospitalization_id)")
print("="*80)
hourly_df_1h = clif.convert_wide_to_hourly(
    aggregation_config=aggregation_config,
    id_name='hospitalization_id',  # Default
    hourly_window=1,               # Default: 1-hour windows
    fill_gaps=False                # Default: sparse output (only windows with data)
)

print("="*80)
print("TEST 2: 1-hour windows, dense (fill_gaps=True), by hospitalization_id")
print("="*80)
hourly_df_1h_fill = clif.convert_wide_to_hourly(
    aggregation_config=aggregation_config,
    id_name='hospitalization_id',  # Default
    hourly_window=1,               # Default: 1-hour windows
    fill_gaps=True                 # Non-default: dense output, gap windows filled with NaN
)

print("="*80)
print("TEST 3: 1-hour windows, dense (fill_gaps=True), by encounter_block")
print("="*80)
hourly_df_1h_fill_enc_blck = clif.convert_wide_to_hourly(
    aggregation_config=aggregation_config,
    id_name='encounter_block',     # Non-default: aggregate linked hospitalizations together
                                   # (requires encounter stitching)
    hourly_window=1,               # Default: 1-hour windows
    fill_gaps=True                 # Non-default: dense output, gap windows filled with NaN
)

In [None]:
# Inspect column dtypes of the gap-filled hourly output grouped by encounter_block.
hourly_df_1h_fill_enc_blck.dtypes

In [None]:
# For hospitalization 20044587, pull the encounter_block_c column from the sparse hourly output.
# NOTE(review): encounter_block_c is presumably a column emitted by convert_wide_to_hourly
# for encounter_block — confirm the column name against the output schema.
hourly_df_1h[hourly_df_1h['hospitalization_id']=='20044587']['encounter_block_c']

In [None]:
# Rows of the wide dataset (before hourly aggregation) for hospitalization 20044587.
clif.wide_df[clif.wide_df['hospitalization_id']=='20044587']

In [None]:
# Rows of the gap-filled (fill_gaps=True) hourly output for hospitalization 20044587.
hourly_df_1h_fill[hourly_df_1h_fill['hospitalization_id']=='20044587']