# Hourly Wide Dataset Creation with pyCLIF

This notebook demonstrates how to create hourly aggregated wide datasets using pyCLIF. The hourly aggregation converts wide datasets into regular hourly buckets with user-defined aggregation methods.

## Key Features
- **Regular time intervals**: Convert irregular events to hourly buckets
- **Multiple aggregation methods**: max, min, mean, median, first, last, boolean, one_hot_encode
- **Suffixed column names**: All aggregated columns get clear suffixes (e.g., `map_max`, `heart_rate_mean`)
- **Time tracking**: 
  - `nth_hour` starts at 0 for the first event and increments with each hour change
  - `event_time_hour` provides clean hour-truncated timestamps (e.g., 2023-01-01 13:00:00)

In [None]:
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
from pyclif import CLIF
import warnings
# Install a global "ignore" filter so no Python warnings are shown from here on.
# NOTE(review): this also hides deprecation warnings from pandas/pyclif —
# presumably done to keep the demo output short; consider narrowing the filter.
warnings.filterwarnings('ignore') 

# Banner marking the start of the example's output.
print("=== pyCLIF Hourly Wide Dataset Example ===")

In [None]:
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
from pyclif import CLIF
from pyclif.utils.wide_dataset import convert_wide_to_hourly

# Initialize CLIF with your data directory.
# The location is read from the CLIF_DATA_DIR environment variable when it is
# set, so the notebook can run on another machine without editing this cell;
# otherwise it falls back to the original demo path below.
data_dir = os.environ.get(
    "CLIF_DATA_DIR",
    "/Users/vaishvik/Downloads/CLIF_MIMIC",  # Update this path (or set CLIF_DATA_DIR)
)
# Parquet files are loaded and timestamps are handled in US/Eastern, as configured here.
clif = CLIF(data_dir=data_dir, filetype='parquet', timezone="US/Eastern")

print("✅ CLIF initialized successfully!")

## Step 1: Create Wide Dataset

In [None]:
# Build the wide dataset from a few vitals, labs and continuous medications.
demo_tables = ['vitals', 'labs', 'medication_admin_continuous']
demo_categories = {
    'vitals': ['map', 'heart_rate', 'spo2', 'temp_c'],
    'labs': ['hemoglobin', 'sodium', 'creatinine'],
    'medication_admin_continuous': ['norepinephrine', 'propofol'],
}
wide_df = clif.create_wide_dataset(
    optional_tables=demo_tables,
    category_filters=demo_categories,
    sample=True,  # Use small sample for demo
)

# Report the size, number of hospitalizations and covered time span.
print(f"Wide dataset created: {len(wide_df)} rows, {len(wide_df.columns)} columns")
print(f"Hospitalizations: {wide_df['hospitalization_id'].nunique()}")
event_times = wide_df['event_time']
print(f"Time range: {event_times.min()} to {event_times.max()}")

# Preview the first rows, restricted to the requested columns that are
# actually present in the result.
print("\nSample data:")
cols_to_show = [
    'hospitalization_id', 'event_time', 'day_number',
    'map', 'heart_rate', 'norepinephrine',
]
available_cols = [name for name in cols_to_show if name in wide_df.columns]
print(wide_df[available_cols].head())

In [None]:
# Bare last expression: the notebook renders the wide DataFrame as this cell's output.
wide_df

In [None]:
# Aggregation plan for the hourly buckets: each key is an aggregation method,
# each value the list of wide-dataset columns that method is applied to.
# Output columns are named "<column>_<method>"; a column may appear under
# several methods (spo2 yields both spo2_mean and spo2_min).
aggregation_config = {
    # → map_max, temp_c_max
    'max': ['map', 'temp_c'],
    # → heart_rate_mean, spo2_mean
    'mean': ['heart_rate', 'spo2'],
    # → spo2_min (in addition to spo2_mean)
    'min': ['spo2'],
    # → hemoglobin_first, sodium_first
    'first': ['hemoglobin', 'sodium'],
    # → norepinephrine_boolean, propofol_boolean
    'boolean': ['norepinephrine', 'propofol'],
}

# Echo the plan: one header per method followed by one line per column.
print("Aggregation configuration:")
for method, columns in aggregation_config.items():
    mapping_lines = [f"  {col} → {col}_{method}" for col in columns]
    print(f"\n{method.upper()}:", *mapping_lines, sep="\n")

In [None]:
# Aggregate the wide dataset into hourly buckets according to aggregation_config.
hourly_df = convert_wide_to_hourly(wide_df, aggregation_config)

# Report the resulting size and how much the row count shrank.
n_wide_rows = len(wide_df)
n_hourly_rows = len(hourly_df)
print(f"Hourly dataset created: {n_hourly_rows} rows, {len(hourly_df.columns)} columns")
reduction_pct = (1 - n_hourly_rows / n_wide_rows) * 100
print(f"Reduction: {n_wide_rows} → {n_hourly_rows} rows ({reduction_pct:.1f}% reduction)")
max_hour = hourly_df['nth_hour'].max()
print(f"Max nth_hour: {max_hour} (≈ {max_hour/24:.1f} days)")

# Preview the first ten hourly rows, keeping only the requested columns that exist.
print("\nSample hourly data:")
sample_cols = [
    'hospitalization_id', 'nth_hour', 'day_number', 'hour_bucket',
    'map_max', 'heart_rate_mean', 'spo2_mean', 'spo2_min', 'norepinephrine_boolean',
]
available_cols = [name for name in sample_cols if name in hourly_df.columns]
print(hourly_df[available_cols].head(10))

In [None]:
# Bare last expression: the notebook renders the hourly DataFrame as this cell's output.
hourly_df

In [None]:
# Run the hourly conversion on the wide dataset with the same aggregation_config.
hourly_df = convert_wide_to_hourly(wide_df, aggregation_config)

# Report the resulting size and how much the row count shrank.
n_wide_rows = len(wide_df)
n_hourly_rows = len(hourly_df)
print(f"Hourly dataset created: {n_hourly_rows} rows, {len(hourly_df.columns)} columns")
reduction_pct = (1 - n_hourly_rows / n_wide_rows) * 100
print(f"Reduction: {n_wide_rows} → {n_hourly_rows} rows ({reduction_pct:.1f}% reduction)")
max_hour = hourly_df['nth_hour'].max()
print(f"Max nth_hour: {max_hour} (≈ {max_hour:.1f} hours from first event)")

# Preview the first ten hourly rows, this time including event_time_hour,
# keeping only the requested columns that exist.
print("\nSample hourly data:")
sample_cols = [
    'hospitalization_id', 'nth_hour', 'event_time_hour', 'hour_bucket',
    'map_max', 'heart_rate_mean', 'spo2_mean', 'spo2_min', 'norepinephrine_boolean',
]
available_cols = [name for name in sample_cols if name in hourly_df.columns]
print(hourly_df[available_cols].head(10))

# Show one patient's first 24 hours.
# The hospitalization in the first row of hourly_df is used as the example.
first_hosp = hourly_df['hospitalization_id'].iloc[0]
# nth_hour is zero-based (0 = hour of the first event), so the first 24 hourly
# buckets are nth_hour 0..23. The comparison must therefore be strict:
# `<= 24` would also pull in a 25th bucket.
patient_data = hourly_df[
    (hourly_df['hospitalization_id'] == first_hosp) &
    (hourly_df['nth_hour'] < 24)
]

print(f"First 24 hours for patient {first_hosp}:")
# Only print the trend columns that the hourly dataset actually contains.
trend_cols = ['nth_hour', 'event_time_hour', 'map_max', 'heart_rate_mean', 'spo2_mean', 'norepinephrine_boolean']
available_trend_cols = [col for col in trend_cols if col in patient_data.columns]
print(patient_data[available_trend_cols])

# Overall counts: distinct hospitalizations, hourly rows, and rows per hospitalization.
print("\nSummary Statistics:")
n_patients = hourly_df['hospitalization_id'].nunique()
n_records = len(hourly_df)
print(f"• Total patients: {n_patients}")
print(f"• Total hourly records: {n_records}")
print(f"• Average records per patient: {n_records / n_patients:.1f}")

# Range of the nth_hour counter across the whole hourly dataset.
print("\n⏰ nth_hour progression:")
hour_index = hourly_df['nth_hour']
print(f"• Min nth_hour: {hour_index.min()} (first event)")
print(f"• Max nth_hour: {hour_index.max()} (hours elapsed)")
print("• nth_hour starts at 0 for first event, increments with each hour change")

# Share of hourly rows flagged for norepinephrine, when that column was produced.
if 'norepinephrine_boolean' in hourly_df.columns:
    norepi_hours = hourly_df['norepinephrine_boolean'].sum()
    norepi_share = norepi_hours / n_records * 100
    print(f"• Hours with norepinephrine: {norepi_hours} ({norepi_share:.1f}%)")

# Closing messages.
print("\n✅ Hourly wide dataset ready for analysis!")
print("📊 All aggregated columns have clear suffixes indicating the aggregation method")
print("📅 event_time_hour provides clean hour-level timestamps")

## Step 3: Convert to Hourly Dataset

```python
# Convert wide dataset to hourly aggregation
hourly_df = convert_wide_to_hourly(wide_df, aggregation_config)

print(f"Hourly dataset created: {len(hourly_df)} rows, {len(hourly_df.columns)} columns")
print(f"Reduction: {len(wide_df)} → {len(hourly_df)} rows ({(1-len(hourly_df)/len(wide_df))*100:.1f}% reduction)")
print(f"Max nth_hour: {hourly_df['nth_hour'].max()} (≈ {hourly_df['nth_hour'].max()/24:.1f} days)")

# Show sample hourly data
print("\nSample hourly data:")
sample_cols = ['hospitalization_id', 'nth_hour', 'day_number', 'hour_bucket', 
               'map_max', 'heart_rate_mean', 'spo2_mean', 'spo2_min', 'norepinephrine_boolean']
available_cols = [col for col in sample_cols if col in hourly_df.columns]
print(hourly_df[available_cols].head(10))
```

In [None]:
## Step 4: Analyze Results

# Show one patient's first 24 hours.
# The hospitalization in the first row of hourly_df is used as the example.
first_hosp = hourly_df['hospitalization_id'].iloc[0]
# nth_hour is zero-based (0 = hour of the first event), so the first 24 hourly
# buckets are nth_hour 0..23. The comparison must therefore be strict:
# `<= 24` would also pull in a 25th bucket.
patient_data = hourly_df[
    (hourly_df['hospitalization_id'] == first_hosp) &
    (hourly_df['nth_hour'] < 24)
]

print(f"First 24 hours for patient {first_hosp}:")
# Only print the trend columns that the hourly dataset actually contains.
trend_cols = ['nth_hour', 'map_max', 'heart_rate_mean', 'spo2_mean', 'norepinephrine_boolean']
available_trend_cols = [col for col in trend_cols if col in patient_data.columns]
print(patient_data[available_trend_cols])

# Overall counts: distinct hospitalizations, hourly rows, and rows per hospitalization.
print("\nSummary Statistics:")
n_patients = hourly_df['hospitalization_id'].nunique()
n_records = len(hourly_df)
print(f"• Total patients: {n_patients}")
print(f"• Total hourly records: {n_records}")
print(f"• Average records per patient: {n_records / n_patients:.1f}")

# Share of hourly rows flagged for norepinephrine, when that column was produced.
if 'norepinephrine_boolean' in hourly_df.columns:
    norepi_hours = hourly_df['norepinephrine_boolean'].sum()
    norepi_share = norepi_hours / n_records * 100
    print(f"• Hours with norepinephrine: {norepi_hours} ({norepi_share:.1f}%)")

# Closing messages.
print("\n✅ Hourly wide dataset ready for analysis!")
print("📊 All aggregated columns have clear suffixes indicating the aggregation method")

### Use Case 3: Create Features for Machine Learning