# ECDF Generator - Combined (Polars Only)

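This notebook computes empirical CDFs (ECDFs) and quantile bins for **Labs, Vitals, and Respiratory Support** values recorded during ICU stays, using **Polars** for all data processing (lab unit standardization is delegated to `clifpy`).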

## Imports and Setup

In [None]:
import json
import yaml
import os
from pathlib import Path
from typing import Dict, List, Any, Tuple, Optional
import polars as pl
import numpy as np

# The project root sits two directory levels above the current working directory.
PROJECT_ROOT = Path.cwd().parent.parent
print(f"Project root: {PROJECT_ROOT}")

## Configuration Loading

In [None]:
def load_configs(
    clif_config_path: Optional[str] = None,
    outlier_config_path: Optional[str] = None,
    lab_vital_config_path: Optional[str] = None
) -> Tuple[Dict, Dict, Dict]:
    """
    Load all required configuration files.

    Args:
        clif_config_path: Path to the CLIF JSON config. Defaults to
            ``PROJECT_ROOT/config/config.json``.
        outlier_config_path: Path to the outlier YAML config. Defaults to
            ``PROJECT_ROOT/modules/ecdf/config/outlier_config.yaml``.
        lab_vital_config_path: Path to the lab/vital YAML config. Defaults to
            ``PROJECT_ROOT/modules/ecdf/config/lab_vital_config.yaml``.

    Returns:
        ``(clif_config, outlier_config, lab_vital_config)``. An empty YAML
        file is returned as an empty dict rather than ``None``.

    Raises:
        FileNotFoundError: If any of the three files does not exist.
    """
    if clif_config_path is None:
        clif_config_path = PROJECT_ROOT / 'config' / 'config.json'
    if outlier_config_path is None:
        outlier_config_path = PROJECT_ROOT / 'modules' / 'ecdf' / 'config' / 'outlier_config.yaml'
    if lab_vital_config_path is None:
        lab_vital_config_path = PROJECT_ROOT / 'modules' / 'ecdf' / 'config' / 'lab_vital_config.yaml'

    # Read as UTF-8 explicitly: the platform default encoding is
    # locale-dependent and can garble non-ASCII unit strings.
    with open(clif_config_path, 'r', encoding='utf-8') as f:
        clif_config = json.load(f)

    # safe_load returns None for an empty document; normalise to {} so the
    # declared Dict return type holds and callers can use .get() safely.
    with open(outlier_config_path, 'r', encoding='utf-8') as f:
        outlier_config = yaml.safe_load(f) or {}

    with open(lab_vital_config_path, 'r', encoding='utf-8') as f:
        lab_vital_config = yaml.safe_load(f) or {}

    return clif_config, outlier_config, lab_vital_config

In [None]:
# Load the three configuration files from their default locations.
clif_config, outlier_config, lab_vital_config = load_configs()

# Echo the key settings so a wrong path or missing section is visible
# before any processing starts.
print(
    "CLIF Config:",
    f"  tables_path: {clif_config.get('tables_path')}",
    f"  file_type: {clif_config.get('file_type')}",
    "",
    sep="\n",
)
print(f"Outlier config tables: {list(outlier_config.get('tables', {}))}")
print(f"Lab categories in config: {list(lab_vital_config.get('labs', {}))}")
print(f"Vital categories in config: {list(lab_vital_config.get('vitals', {}))}")

## Helper Functions

In [None]:
def sanitize_unit_for_filename(unit) -> str:
    """
    Sanitize a unit for use as a filename component.

    Handles both string and list inputs: a list is joined with underscores
    after dropping ``None`` entries. ``/`` becomes ``_``, ``%`` becomes
    ``pct``, the degree sign becomes ``deg``, every other non-alphanumeric
    character becomes ``_``, runs of underscores are collapsed, and the
    result is lowercased.

    Args:
        unit: Unit string, list/tuple of unit strings, or ``None``.

    Returns:
        The sanitized unit, or ``"unknown"`` when the input is ``None``,
        empty, or contains nothing usable in a filename.
    """
    if unit is None:
        return "unknown"

    if isinstance(unit, (list, tuple)):
        unit = '_'.join(str(u) for u in unit if u is not None)

    if not unit:
        return "unknown"

    # Tolerate non-string units (e.g. numbers parsed from YAML).
    text = str(unit)

    text = text.replace('/', '_').replace('%', 'pct')
    # Map the degree sign to "deg". The two-character sequence is the degree
    # sign as it appears after a UTF-8 -> Latin-1 mis-decode; handle it first
    # so it does not leave a stray character behind.
    text = text.replace('\u00c2\u00b0', 'deg').replace('\u00b0', 'deg')

    text = ''.join(c if c.isalnum() or c == '_' else '_' for c in text)

    # Splitting on "_" and dropping empty parts collapses repeated
    # underscores and strips leading/trailing ones in a single pass.
    sanitized = '_'.join(part for part in text.split('_') if part).lower()

    return sanitized or "unknown"

## Lab Standardization (using clifpy)

In [None]:
# clifpy supplies the Labs table wrapper used for reference-unit standardization.
import clifpy
from clifpy.tables import Labs
# Show which clifpy installation this kernel imported.
print(f"clifpy location: {clifpy.__file__}")

In [None]:
def standardize_labs_polars(
    tables_path: str,
    file_type: str,
    output_path: str
) -> None:
    """
    Standardize lab reference units with clifpy and write the result to parquet.

    Args:
        tables_path: Directory containing the CLIF tables.
        file_type: File extension of the labs table (``clif_labs.<file_type>``).
            NOTE(review): the file is always opened with ``pl.scan_parquet``,
            so anything other than parquet will presumably fail - confirm.
        output_path: Destination parquet file. Its parent directory is also
            passed to clifpy as ``output_directory``.
    """
    source_path = os.path.join(tables_path, f'clif_labs.{file_type}')
    print(f"Loading labs from {source_path}...")

    labs_table = Labs(data=pl.scan_parquet(source_path))

    # inplace=True: the standardized frame is read back from labs_table.df below.
    labs_table.standardize_reference_units(
        save=True,
        output_directory=str(Path(output_path).parent),
        lowercase=True,
        inplace=True
    )

    # The result may come back lazy, eager, or as a non-Polars frame
    # (converted via pl.from_pandas); bring it to an eager Polars DataFrame.
    standardized = labs_table.df
    if isinstance(standardized, pl.LazyFrame):
        standardized = standardized.collect()
    elif not isinstance(standardized, pl.DataFrame):
        standardized = pl.from_pandas(standardized)
    standardized.write_parquet(output_path)

    print(f"Saved standardized labs to {output_path}")

In [None]:
# Standardize labs. This only needs to run once; comment the call out on
# later runs to reuse the parquet file it produced.
STANDARDIZED_LABS_PATH = '../../output/intermediate/clif_labs_standardized.parquet'
os.makedirs(os.path.dirname(STANDARDIZED_LABS_PATH), exist_ok=True)

standardize_labs_polars(
    tables_path=clif_config['tables_path'],
    file_type=clif_config['file_type'],
    output_path=STANDARDIZED_LABS_PATH,
)

## ICU Time Window Extraction

In [None]:
def extract_icu_time_windows(
    tables_path: str,
    file_type: str
) -> pl.DataFrame:
    """
    Build the table of ICU stay windows from the ADT table.

    Args:
        tables_path: Directory containing the CLIF tables.
        file_type: File extension of the ADT table (``clif_adt.<file_type>``).
            NOTE(review): the file is always opened with ``pl.scan_parquet``.

    Returns:
        DataFrame with ``hospitalization_id``, ``in_dttm`` and ``out_dttm``
        for every row whose ``location_category`` is "icu" (case-insensitive).
        Time zone information is removed from both timestamps.
    """
    adt_path = os.path.join(tables_path, f'clif_adt.{file_type}')
    print(f"Loading ICU time windows from {adt_path}...")

    is_icu = pl.col('location_category').str.to_lowercase() == 'icu'
    # Drop the time zone so these bounds compare directly against the
    # tz-stripped measurement timestamps used downstream.
    naive_bounds = [
        pl.col(bound).dt.replace_time_zone(None)
        for bound in ('in_dttm', 'out_dttm')
    ]

    icu_windows_df = (
        pl.scan_parquet(adt_path)
        .filter(is_icu)
        .select(['hospitalization_id', 'in_dttm', 'out_dttm'])
        .with_columns(naive_bounds)
        .collect()
    )

    print(f"Found {len(icu_windows_df):,} ICU time windows")
    return icu_windows_df

In [None]:
# ICU stay windows are computed once and shared by every processing step below.
icu_windows = extract_icu_time_windows(
    tables_path=clif_config['tables_path'],
    file_type=clif_config['file_type'],
)

print("\nSample ICU time windows:")
# Last expression of the cell: rendered as the cell output in a notebook.
icu_windows.head(10)

## ECDF Computation (Polars)

In [None]:
def compute_ecdf_polars(values: pl.Series) -> pl.DataFrame:
    """
    Compute the empirical CDF of ``values`` using pure Polars operations.

    Null entries are ignored: they carry no position on the value axis and
    would otherwise inflate the sample size and appear as a null-valued point.

    Args:
        values: Series of observations.

    Returns:
        DataFrame sorted by ``value`` with one row per distinct value and
        ``probability`` = fraction of observations less than or equal to it.
        An empty input yields an empty frame with the same two columns.
    """
    values = values.drop_nulls()
    n = len(values)

    if n == 0:
        return pl.DataFrame({'value': [], 'probability': []})

    ecdf_df = (
        pl.DataFrame({'value': values})
        .sort('value')
        .with_row_index('rank')
        .with_columns(
            ((pl.col('rank') + 1) / n).alias('probability')
        )
        # Tied values share the highest rank among them, i.e. P(X <= value).
        .group_by('value')
        .agg(pl.col('probability').max())
        # group_by does not preserve order, so sort again.
        .sort('value')
    )

    return ecdf_df

## Binning Functions (Polars)

In [None]:
def create_flat_bins_polars(
    data: pl.Series,
    num_bins: int = 10
) -> List[Dict[str, Any]]:
    """
    Build flat quantile bins over the whole series (no below/normal/above
    segmentation). Used for respiratory support columns.

    Args:
        data: Observations; nulls are dropped before binning.
        num_bins: Requested number of bins. Reduced to ``len(data) // 2``
            (at least 1) when there are fewer than two observations per bin.

    Returns:
        List of dicts with ``bin_num``, ``bin_min``, ``bin_max``, ``count``,
        ``percentage`` (of all non-null observations) and ``interval``.
        The first bin is closed on both sides; later bins are left-open.
        Empty list when there is no non-null data.
    """
    observed = data.drop_nulls()
    n_observed = len(observed)
    if n_observed == 0:
        return []

    if n_observed < num_bins * 2:
        num_bins = max(1, n_observed // 2)

    edges = [observed.quantile(step / num_bins) for step in range(num_bins + 1)]

    result = []
    for position, (lower, upper) in enumerate(zip(edges, edges[1:])):
        is_first = position == 0

        # A zero-width bin after the first can never hold anything,
        # because later bins exclude their lower edge.
        if lower == upper and not is_first:
            continue

        if is_first:
            in_bin = (observed >= lower) & (observed <= upper)
            label = f"[{lower:.2f}, {upper:.2f}]"
        else:
            in_bin = (observed > lower) & (observed <= upper)
            label = f"({lower:.2f}, {upper:.2f}]"

        n_in_bin = observed.filter(in_bin).len()
        if n_in_bin > 0:
            result.append({
                'bin_num': len(result) + 1,
                'bin_min': float(lower),
                'bin_max': float(upper),
                'count': int(n_in_bin),
                'percentage': float(n_in_bin / n_observed * 100),
                'interval': label
            })

    return result

In [None]:
def _quantile_bins_polars(
    values: pl.Series,
    num_bins: int,
    segment_name: str,
    total_count: int,
    lower_bound: Optional[float] = None
) -> List[Dict[str, Any]]:
    """
    Cut non-empty ``values`` into at most ``num_bins`` quantile bins.

    The first bin includes its lower edge; later bins are left-open.
    Bins are numbered from 1 and ``percentage`` is relative to ``total_count``.
    When ``lower_bound`` is given it replaces the lower edge of the first bin;
    callers use this when every value is strictly greater than ``lower_bound``,
    so the bin stays contiguous with a left-open parent bin.
    """
    edges = [values.quantile(i / num_bins) for i in range(num_bins + 1)]
    if lower_bound is not None:
        edges[0] = lower_bound

    bins = []
    for i, (lower, upper) in enumerate(zip(edges, edges[1:])):
        # A zero-width left-open bin is always empty.
        if lower == upper and i > 0:
            continue

        if i == 0:
            in_bin = (values >= lower) & (values <= upper)
        else:
            in_bin = (values > lower) & (values <= upper)

        count = values.filter(in_bin).len()
        if count > 0:
            bins.append({
                'segment': segment_name,
                'bin_num': len(bins) + 1,
                'bin_min': float(lower),
                'bin_max': float(upper),
                'count': int(count),
                'percentage': float(count / total_count * 100)
            })

    return bins


def create_bins_for_segment_polars(
    data: pl.Series,
    segment_min: float,
    segment_max: float,
    num_bins: int,
    segment_name: str,
    extra_bins_last: int = 0,
    split_first: bool = False
) -> List[Dict[str, Any]]:
    """
    Create quantile-based bins for one segment of the data.

    Args:
        data: All observations; only those within
            ``[segment_min, segment_max]`` (inclusive) are binned.
        segment_min: Inclusive lower bound of the segment.
        segment_max: Inclusive upper bound of the segment.
        num_bins: Number of quantile bins requested for the segment.
        segment_name: Label stored in each bin's ``segment`` field.
        extra_bins_last: If > 0, the most extreme bin is further split into
            this many quantile sub-bins, provided it holds at least two
            observations per sub-bin.
        split_first: Split the first bin when True, the last bin otherwise.

    Returns:
        List of dicts with ``segment``, ``bin_num`` (1-based within the
        segment), ``bin_min``, ``bin_max``, ``count`` and ``percentage``
        (of the segment's observations). A single bin spanning the observed
        min/max is returned when ``num_bins == 1`` or there are fewer than
        two observations per requested bin. Empty list when the segment
        holds no data.
    """
    segment_data = data.filter((data >= segment_min) & (data <= segment_max))
    total_segment_count = len(segment_data)

    if total_segment_count == 0:
        return []

    if num_bins == 1 or total_segment_count < num_bins * 2:
        return [{
            'segment': segment_name,
            'bin_num': 1,
            'bin_min': float(segment_data.min()),
            'bin_max': float(segment_data.max()),
            'count': total_segment_count,
            'percentage': 100.0
        }]

    bins = _quantile_bins_polars(
        segment_data, num_bins, segment_name, total_segment_count
    )

    # Handle extra bins for extreme values
    if extra_bins_last <= 0 or not bins:
        return bins

    if split_first:
        extreme_bin, before, after = bins[0], [], bins[1:]
    else:
        extreme_bin, before, after = bins[-1], bins[:-1], []

    # Only the first bin of the segment includes its lower edge. Any other
    # bin is left-open, so selecting its data with ">=" would pull in values
    # that are already counted in the preceding bin.
    closed_left = extreme_bin is bins[0]
    if closed_left:
        above_lower = segment_data >= extreme_bin['bin_min']
    else:
        above_lower = segment_data > extreme_bin['bin_min']
    extreme_data = segment_data.filter(
        above_lower & (segment_data <= extreme_bin['bin_max'])
    )

    # Too few observations to split: keep the extreme bin as it is.
    if len(extreme_data) < extra_bins_last * 2:
        return bins

    sub_bins = _quantile_bins_polars(
        extreme_data,
        extra_bins_last,
        segment_name,
        total_segment_count,
        lower_bound=None if closed_left else extreme_bin['bin_min']
    )

    # Replace the extreme bin with its sub-bins and renumber sequentially.
    bins = before + sub_bins + after
    for number, bin_info in enumerate(bins, start=1):
        bin_info['bin_num'] = number

    return bins

In [None]:
def create_all_bins_polars(
    data: pl.Series,
    normal_lower: float,
    normal_upper: float,
    outlier_min: float,
    outlier_max: float,
    bins_below: int,
    bins_normal: int,
    bins_above: int,
    extra_bins_below: int = 0,
    extra_bins_above: int = 0
) -> List[Dict[str, Any]]:
    """
    Create bins for all segments (below/normal/above) using Polars.
    Used for labs and vitals.

    The normal segment covers ``[normal_lower, normal_upper]`` inclusive.
    The below segment only receives values strictly less than
    ``normal_lower`` and the above segment only values strictly greater than
    ``normal_upper``, so an observation sitting exactly on a normal-range
    boundary is counted once (in the normal segment) rather than twice.

    Args:
        data: Outlier-filtered observations.
        normal_lower: Lower bound of the normal range.
        normal_upper: Upper bound of the normal range.
        outlier_min: Smallest value retained after outlier filtering.
        outlier_max: Largest value retained after outlier filtering.
        bins_below: Number of bins below the normal range (0 disables).
        bins_normal: Number of bins within the normal range (0 disables).
        bins_above: Number of bins above the normal range (0 disables).
        extra_bins_below: Sub-bins for the lowest bin of the below segment.
        extra_bins_above: Sub-bins for the highest bin of the above segment.

    Returns:
        Bins of the below, normal and above segments, in that order.
    """
    all_bins = []

    if bins_below > 0 and outlier_min < normal_lower:
        below_bins = create_bins_for_segment_polars(
            data.filter(data < normal_lower),
            outlier_min, normal_lower, bins_below, 'below',
            extra_bins_last=extra_bins_below,
            # The extreme values of this segment are in its first bin.
            split_first=True
        )
        all_bins.extend(below_bins)

    if bins_normal > 0:
        normal_bins = create_bins_for_segment_polars(
            data, normal_lower, normal_upper, bins_normal, 'normal'
        )
        all_bins.extend(normal_bins)

    if bins_above > 0 and normal_upper < outlier_max:
        above_bins = create_bins_for_segment_polars(
            data.filter(data > normal_upper),
            normal_upper, outlier_max, bins_above, 'above',
            extra_bins_last=extra_bins_above,
            # The extreme values of this segment are in its last bin.
            split_first=False
        )
        all_bins.extend(above_bins)

    return all_bins

## Process Labs/Vitals (Polars)

In [None]:
def process_category_polars(
    table_type: str,
    category: str,
    unit: Optional[str],
    icu_windows: pl.DataFrame,
    tables_path: str,
    file_type: str,
    outlier_range: Dict[str, float],
    cat_config: Dict[str, Any],
    output_dir: Optional[str] = None,
    extreme_bins_count: int = 5,
    save_output: bool = False
) -> Tuple[Dict[str, Any], pl.DataFrame, pl.DataFrame]:
    """
    Process a single lab/vital category using pure Polars.

    Restricts the category's values to measurements taken inside an ICU
    window, removes values outside ``outlier_range``, then computes the ECDF
    and the below/normal/above quantile bins.

    Args:
        table_type: ``'labs'`` selects the standardized labs parquet; any
            other value selects ``clif_vitals.<file_type>`` in ``tables_path``.
        category: Value matched against the table's category column.
        unit: Labs only. A reference unit, or a list of accepted units.
            ``'(no units)'`` matches rows whose ``reference_unit`` is null.
            Ignored for vitals.
        icu_windows: Frame with ``hospitalization_id``, ``in_dttm``,
            ``out_dttm`` (time-zone-naive).
        tables_path: Directory containing the CLIF tables.
        file_type: File extension of the vitals table.
        outlier_range: Dict with inclusive ``'min'`` and ``'max'`` bounds.
        cat_config: Category config; ``bins`` (``below_normal``, ``normal``,
            ``above_normal``) and ``normal_range`` (``lower``, ``upper``) are
            read. Missing or null entries fall back to 0 bins and to the
            outlier bounds respectively.
        output_dir: Root directory for saved parquet files.
        extreme_bins_count: Sub-bins used to split the most extreme bin of the
            below/above segments when that segment has more than one bin.
        save_output: Write ECDF and bins under ``output_dir`` when True.

    Returns:
        ``(stats, ecdf_df, bins_df)``. ``stats`` always carries ``category``,
        ``unit``, ``original_count``, ``clean_count``, ``ecdf_distinct_pairs``
        and ``num_bins``. When no data remains, the frames are empty and
        nothing is written to disk.

    Raises:
        FileNotFoundError: If the source data file does not exist.
    """
    if table_type == 'labs':
        # Labs are read from the unit-standardized copy, not the raw table.
        file_path = '../../output/intermediate/clif_labs_standardized.parquet'
        category_col = 'lab_category'
        value_col = 'lab_value_numeric'
        datetime_col = 'lab_result_dttm'
    else:
        file_path = os.path.join(tables_path, f'clif_vitals.{file_type}')
        category_col = 'vital_category'
        value_col = 'vital_value'
        datetime_col = 'recorded_dttm'

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Data file not found: {file_path}")

    display_name = f"{category} ({unit})" if table_type == 'labs' and unit else category
    print(f"Processing {display_name}...")

    stats_unit = unit if table_type == 'labs' else None

    def _empty_result(original_count: int):
        # Same stats keys as the normal return so callers can tabulate
        # successful and empty categories together.
        empty_stats = {
            'category': category,
            'unit': stats_unit,
            'original_count': original_count,
            'clean_count': 0,
            'ecdf_distinct_pairs': 0,
            'num_bins': 0
        }
        return (
            empty_stats,
            pl.DataFrame({'value': [], 'probability': []}),
            pl.DataFrame()
        )

    data_lazy = pl.scan_parquet(file_path)

    category_filter = pl.col(category_col) == category
    if table_type == 'labs' and unit:
        if isinstance(unit, list):
            unit_filter = pl.col('reference_unit').is_in(unit)
            if '(no units)' in unit:
                unit_filter = unit_filter | pl.col('reference_unit').is_null()
        elif unit == '(no units)':
            unit_filter = pl.col('reference_unit').is_null()
        else:
            unit_filter = pl.col('reference_unit') == unit
        category_filter = category_filter & unit_filter

    data_category = data_lazy.filter(category_filter).select(
        ['hospitalization_id', datetime_col, value_col]
    )

    # ICU windows are tz-naive, so compare against a tz-naive timestamp.
    measured_at = pl.col(datetime_col).dt.replace_time_zone(None)
    values_df = (
        data_category
        .join(icu_windows.lazy(), on='hospitalization_id', how='inner')
        .filter(
            (measured_at >= pl.col('in_dttm')) &
            (measured_at <= pl.col('out_dttm'))
        )
        .select([value_col])
        .collect()
    )

    original_count = len(values_df)
    print(f"  Original count: {original_count:,}")

    if original_count == 0:
        print(f"  WARNING: No data found for {display_name} during ICU stays")
        return _empty_result(0)

    # Null values fail both comparisons and are dropped here as well.
    values_clean = values_df.filter(
        (pl.col(value_col) >= outlier_range['min']) &
        (pl.col(value_col) <= outlier_range['max'])
    )

    clean_count = len(values_clean)
    print(f"  After outlier removal: {clean_count:,} (removed {original_count - clean_count:,})")

    if clean_count == 0:
        print(f"  WARNING: No data remaining after outlier removal")
        return _empty_result(original_count)

    values_series = values_clean[value_col]

    # Compute ECDF
    ecdf_df = compute_ecdf_polars(values_series)
    print(f"  ECDF: {len(ecdf_df):,} distinct pairs (compression: {clean_count / len(ecdf_df):.1f}x)")

    # Compute Bins
    # "or {}" / "or 0": a key present with a null value in the YAML config
    # comes through as None, which must behave like a missing key.
    bins_config = cat_config.get('bins') or {}
    bins_below = bins_config.get('below_normal', 0) or 0
    bins_normal = bins_config.get('normal', 0) or 0
    bins_above = bins_config.get('above_normal', 0) or 0

    extra_bins_below = extreme_bins_count if bins_below > 1 else 0
    extra_bins_above = extreme_bins_count if bins_above > 1 else 0

    # Explicit None checks (not "or") because 0 is a valid bound.
    normal_range = cat_config.get('normal_range') or {}
    normal_lower = normal_range.get('lower')
    if normal_lower is None:
        normal_lower = outlier_range['min']
    normal_upper = normal_range.get('upper')
    if normal_upper is None:
        normal_upper = outlier_range['max']

    bins = create_all_bins_polars(
        data=values_series,
        normal_lower=normal_lower,
        normal_upper=normal_upper,
        outlier_min=outlier_range['min'],
        outlier_max=outlier_range['max'],
        bins_below=bins_below,
        bins_normal=bins_normal,
        bins_above=bins_above,
        extra_bins_below=extra_bins_below,
        extra_bins_above=extra_bins_above
    )

    # bin_num restarts at 1 in every segment, so the first bin of each
    # segment is labelled as closed on the left.
    for bin_info in bins:
        opening = '[' if bin_info['bin_num'] == 1 else '('
        bin_info['interval'] = (
            f"{opening}{bin_info['bin_min']:.2f}, {bin_info['bin_max']:.2f}]"
        )

    bins_df = pl.DataFrame(bins) if bins else pl.DataFrame()
    print(f"  Bins: {len(bins_df)}")

    if save_output and output_dir:
        if table_type == 'labs' and unit:
            # Only the first unit of a unit list goes into the filename.
            unit_for_filename = unit[0] if isinstance(unit, list) else unit
            unit_safe = sanitize_unit_for_filename(unit_for_filename)
            filename = f'{category}_{unit_safe}.parquet'
        else:
            filename = f'{category}.parquet'

        ecdf_dir = os.path.join(output_dir, 'ecdf', table_type)
        os.makedirs(ecdf_dir, exist_ok=True)
        ecdf_df.write_parquet(os.path.join(ecdf_dir, filename))

        bins_dir = os.path.join(output_dir, 'bins', table_type)
        os.makedirs(bins_dir, exist_ok=True)
        bins_df.write_parquet(os.path.join(bins_dir, filename))

        print(f"  Saved to {output_dir}")

    stats = {
        'category': category,
        'unit': stats_unit,
        'original_count': original_count,
        'clean_count': clean_count,
        'ecdf_distinct_pairs': len(ecdf_df),
        'num_bins': len(bins)
    }

    return stats, ecdf_df, bins_df

## Process Respiratory Support (Polars)

In [None]:
def process_respiratory_column_polars(
    column_name: str,
    icu_windows: pl.DataFrame,
    tables_path: str,
    file_type: str,
    outlier_range: Dict[str, float],
    output_dir: Optional[str] = None,
    num_bins: int = 10,
    save_output: bool = False
) -> Tuple[Dict[str, Any], pl.DataFrame, pl.DataFrame]:
    """
    Process a single respiratory support column using pure Polars.

    Restricts the column's non-null values to rows recorded inside an ICU
    window, removes values outside ``outlier_range``, then computes the ECDF
    and flat quantile bins.

    Args:
        column_name: Column of ``clif_respiratory_support`` to process.
        icu_windows: Frame with ``hospitalization_id``, ``in_dttm``,
            ``out_dttm`` (time-zone-naive).
        tables_path: Directory containing the CLIF tables.
        file_type: File extension of the respiratory support table.
        outlier_range: Dict with inclusive ``'min'`` and ``'max'`` bounds.
        output_dir: Root directory for saved parquet files.
        num_bins: Number of flat quantile bins requested.
        save_output: Write ECDF and bins under ``output_dir`` when True.

    Returns:
        ``(stats, ecdf_df, bins_df)``. ``stats`` always carries ``column``,
        ``original_count``, ``clean_count``, ``ecdf_distinct_pairs`` and
        ``num_bins``. When no data remains, the frames are empty and nothing
        is written to disk.

    Raises:
        FileNotFoundError: If the respiratory support file does not exist.
    """
    file_path = os.path.join(tables_path, f'clif_respiratory_support.{file_type}')

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Respiratory support file not found: {file_path}")

    print(f"Processing {column_name}...")

    def _empty_result(original_count: int):
        # Same stats keys as the normal return so callers can tabulate
        # successful and empty columns together.
        empty_stats = {
            'column': column_name,
            'original_count': original_count,
            'clean_count': 0,
            'ecdf_distinct_pairs': 0,
            'num_bins': 0
        }
        return (
            empty_stats,
            pl.DataFrame({'value': [], 'probability': []}),
            pl.DataFrame()
        )

    # ICU windows are tz-naive, so compare against a tz-naive timestamp.
    recorded_at = pl.col('recorded_dttm').dt.replace_time_zone(None)
    values_df = (
        pl.scan_parquet(file_path)
        .select(['hospitalization_id', 'recorded_dttm', column_name])
        .join(icu_windows.lazy(), on='hospitalization_id', how='inner')
        .filter(
            (recorded_at >= pl.col('in_dttm')) &
            (recorded_at <= pl.col('out_dttm'))
        )
        .select([column_name])
        .drop_nulls()
        .collect()
    )

    original_count = len(values_df)
    print(f"  Original count: {original_count:,}")

    if original_count == 0:
        print(f"  WARNING: No data found for {column_name}")
        return _empty_result(0)

    values_clean = values_df.filter(
        (pl.col(column_name) >= outlier_range['min']) &
        (pl.col(column_name) <= outlier_range['max'])
    )

    clean_count = len(values_clean)
    print(f"  After outlier removal: {clean_count:,} (removed {original_count - clean_count:,})")

    if clean_count == 0:
        print(f"  WARNING: No data remaining after outlier removal")
        return _empty_result(original_count)

    values_series = values_clean[column_name]

    # Compute ECDF
    ecdf_df = compute_ecdf_polars(values_series)
    print(f"  ECDF: {len(ecdf_df):,} distinct pairs (compression: {clean_count / len(ecdf_df):.1f}x)")

    # Compute Flat Bins
    bins = create_flat_bins_polars(values_series, num_bins=num_bins)
    bins_df = pl.DataFrame(bins) if bins else pl.DataFrame()
    print(f"  Bins: {len(bins_df)}")

    if save_output and output_dir:
        filename = f'{column_name}.parquet'

        ecdf_dir = os.path.join(output_dir, 'ecdf', 'respiratory_support')
        os.makedirs(ecdf_dir, exist_ok=True)
        ecdf_df.write_parquet(os.path.join(ecdf_dir, filename))

        bins_dir = os.path.join(output_dir, 'bins', 'respiratory_support')
        os.makedirs(bins_dir, exist_ok=True)
        bins_df.write_parquet(os.path.join(bins_dir, filename))

        print(f"  Saved to {output_dir}")

    stats = {
        'column': column_name,
        'original_count': original_count,
        'clean_count': clean_count,
        'ecdf_distinct_pairs': len(ecdf_df),
        'num_bins': len(bins)
    }

    return stats, ecdf_df, bins_df

## Configuration References

In [None]:
# Per-category binning configuration.
labs_config = lab_vital_config.get('labs', {})
vitals_config = lab_vital_config.get('vitals', {})

# Outlier bounds. Labs and vitals entries are required (KeyError if
# missing); respiratory support is optional and defaults to no columns.
labs_outlier = outlier_config['tables']['labs']['lab_value_numeric']
vitals_outlier = outlier_config['tables']['vitals']['vital_value']
resp_outlier = outlier_config['tables'].get('respiratory_support', {})

# Respiratory support columns to process, in processing order.
resp_columns = [
    # "_set" columns
    'fio2_set',
    'lpm_set',
    'tidal_volume_set',
    'resp_rate_set',
    'pressure_control_set',
    'pressure_support_set',
    'flow_rate_set',
    'peak_inspiratory_pressure_set',
    'inspiratory_time_set',
    'peep_set',
    # "_obs" columns
    'tidal_volume_obs',
    'resp_rate_obs',
    'plateau_pressure_obs',
    'peak_inspiratory_pressure_obs',
    'peep_obs',
    'minute_vent_obs',
    'mean_airway_pressure_obs',
]

print(
    f"Labs: {len(labs_config)} categories",
    f"Vitals: {len(vitals_config)} categories",
    f"Respiratory: {len(resp_columns)} columns",
    sep="\n",
)


# Run Full Pipeline

In [None]:
# All final ECDF and bin parquet files are written below this directory.
OUTPUT_DIR = str(PROJECT_ROOT.joinpath('output', 'final'))
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Output directory: {OUTPUT_DIR}")

## Process All Labs

In [None]:
print("=" * 80, "Processing Labs", "=" * 80, sep="\n")

labs_stats = []

for category, cat_config in labs_config.items():
    if not isinstance(cat_config, dict):
        # Only a missing (None) config is reported; any other non-dict
        # entry is skipped without a message.
        if cat_config is None:
            print(f"WARNING: Category '{category}' has None config, skipping")
        continue

    if category not in labs_outlier:
        print(f"WARNING: Category '{category}' not in outlier config, skipping")
        continue

    config_unit = cat_config.get('reference_unit')
    if config_unit is None:
        print(f"WARNING: Category '{category}' has no reference_unit in config, skipping")
        continue

    # Lowercase the configured unit(s); the labs were standardized with
    # lowercase=True.
    unit = config_unit
    if isinstance(config_unit, list):
        unit = [u.lower() if isinstance(u, str) else u for u in config_unit]
    elif isinstance(config_unit, str):
        unit = config_unit.lower()

    # A failure in one category is reported and must not stop the rest.
    try:
        stats, ecdf_df, bins_df = process_category_polars(
            table_type='labs',
            category=category,
            unit=unit,
            icu_windows=icu_windows,
            tables_path=clif_config['tables_path'],
            file_type=clif_config['file_type'],
            outlier_range=labs_outlier[category],
            cat_config=cat_config,
            output_dir=OUTPUT_DIR,
            extreme_bins_count=5,
            save_output=True
        )
    except Exception as e:
        print(f"ERROR processing {category}: {e}")
    else:
        labs_stats.append(stats)

print(f"\nProcessed {len(labs_stats)} lab categories")

## Process All Vitals

In [None]:
print("=" * 80, "Processing Vitals", "=" * 80, sep="\n")

vitals_stats = []

for category, cat_config in vitals_config.items():
    if not isinstance(cat_config, dict):
        # Only a missing (None) config is reported; any other non-dict
        # entry is skipped without a message.
        if cat_config is None:
            print(f"WARNING: Category '{category}' has None config, skipping")
        continue

    if category not in vitals_outlier:
        print(f"WARNING: Category '{category}' not in outlier config, skipping")
        continue

    # Height and weight get 5 extreme sub-bins; every other vital gets 10.
    extreme_bins = 5 if category in {'height_cm', 'weight_kg'} else 10

    # A failure in one category is reported and must not stop the rest.
    try:
        stats, ecdf_df, bins_df = process_category_polars(
            table_type='vitals',
            category=category,
            unit=None,
            icu_windows=icu_windows,
            tables_path=clif_config['tables_path'],
            file_type=clif_config['file_type'],
            outlier_range=vitals_outlier[category],
            cat_config=cat_config,
            output_dir=OUTPUT_DIR,
            extreme_bins_count=extreme_bins,
            save_output=True
        )
    except Exception as e:
        print(f"ERROR processing {category}: {e}")
    else:
        vitals_stats.append(stats)

print(f"\nProcessed {len(vitals_stats)} vital categories")

## Process All Respiratory Support

In [None]:
print("=" * 80, "Processing Respiratory Support", "=" * 80, sep="\n")

resp_stats = []

for column_name in resp_columns:
    # Columns without outlier bounds cannot be cleaned, so they are skipped.
    if column_name not in resp_outlier:
        print(f"WARNING: Column '{column_name}' not in outlier config, skipping")
        continue

    # A failure in one column is reported and must not stop the rest.
    try:
        stats, ecdf_df, bins_df = process_respiratory_column_polars(
            column_name=column_name,
            icu_windows=icu_windows,
            tables_path=clif_config['tables_path'],
            file_type=clif_config['file_type'],
            outlier_range=resp_outlier[column_name],
            output_dir=OUTPUT_DIR,
            num_bins=10,
            save_output=True
        )
    except Exception as e:
        print(f"ERROR processing {column_name}: {e}")
    else:
        resp_stats.append(stats)

print(f"\nProcessed {len(resp_stats)} respiratory columns")

## Summary

In [None]:
# Final recap: how many categories/columns produced results, and where
# the output files were written.
print("=" * 80, "Processing Summary", "=" * 80, sep="\n")
print(
    f"\nLabs: {len(labs_stats)} categories processed",
    f"Vitals: {len(vitals_stats)} categories processed",
    f"Respiratory: {len(resp_stats)} columns processed",
    f"\nOutput saved to: {OUTPUT_DIR}",
    sep="\n",
)