# Table One Overall Aggregator



In [36]:
import polars as pl
from pathlib import Path
import os

In [None]:
BOX_FOLDER_PATH = 'path/to/box'

# Build a Path from the configured folder, expanding a leading '~' if present
root_dir = Path(BOX_FOLDER_PATH).expanduser()

# Stop immediately when the configured folder is missing
if not root_dir.exists():
    raise FileNotFoundError(f"Path does not exist: {root_dir}")

print(f"Searching in: {root_dir.absolute()}")
print()

# Recursively collect every table_one_overall.csv below the root folder
csv_files = [*root_dir.glob('**/table_one_overall.csv')]

print(f"Found {len(csv_files)} CSV files:")
for csv_path in csv_files:
    print(f"  - {csv_path}")

Searching in: /Users/dema/Library/CloudStorage/Box-Box/CLIF/projects/CLIF-TableOne-2025

Found 9 CSV files:
  - /Users/dema/Library/CloudStorage/Box-Box/CLIF/projects/CLIF-TableOne-2025/upenn/tableone/table_one_overall.csv
  - /Users/dema/Library/CloudStorage/Box-Box/CLIF/projects/CLIF-TableOne-2025/nu/final/tableone/table_one_overall.csv
  - /Users/dema/Library/CloudStorage/Box-Box/CLIF/projects/CLIF-TableOne-2025/rush/final/tableone/table_one_overall.csv
  - /Users/dema/Library/CloudStorage/Box-Box/CLIF/projects/CLIF-TableOne-2025/ohsu/final/tableone/table_one_overall.csv
  - /Users/dema/Library/CloudStorage/Box-Box/CLIF/projects/CLIF-TableOne-2025/ucmc/final/tableone/table_one_overall.csv
  - /Users/dema/Library/CloudStorage/Box-Box/CLIF/projects/CLIF-TableOne-2025/umn/final/tableone/table_one_overall.csv
  - /Users/dema/Library/CloudStorage/Box-Box/CLIF/projects/CLIF-TableOne-2025/emory/final/tableone/table_one_overall.csv
  - /Users/dema/Library/CloudStorage/Box-Box/CLIF/projects/

In [38]:
# Load every discovered CSV into a dict keyed by site name, where the site name
# is the path component directly after 'CLIF-TableOne-2025'.
dfs = {}

for csv_file in csv_files:
    parts = csv_file.parts
    try:
        clif_idx = parts.index('CLIF-TableOne-2025')
        folder_name = parts[clif_idx + 1]
    except ValueError as err:
        raise ValueError(f"'CLIF-TableOne-2025' not found in path: {csv_file}") from err
    except IndexError as err:
        raise ValueError(f"No folder after 'CLIF-TableOne-2025' in path: {csv_file}") from err

    # Two matching files under the same site folder would otherwise overwrite
    # each other without any trace; keep the last one but say so.
    if folder_name in dfs:
        print(f"WARNING: multiple CSVs found for {folder_name}; using {csv_file}")

    # infer_schema_length=0 reads every column as a string. The cells mix
    # formats ("152,603", "65 [53, 76]", "164 (0.1%)"), and dtype inference on a
    # column that merely looks numeric at first would either fail to parse or
    # yield an integer column that the string-based aggregation ignores.
    df = pl.read_csv(csv_file, infer_schema_length=0)

    dfs[folder_name] = df
    print(f"Loaded {folder_name}: {df.shape}")

Loaded upenn: (113, 2)
Loaded nu: (107, 2)
Loaded rush: (110, 2)
Loaded ohsu: (109, 2)
Loaded ucmc: (102, 2)
Loaded umn: (96, 2)
Loaded emory: (108, 2)
Loaded umich: (87, 2)
Loaded mimic_iv: (106, 2)


In [39]:
# For each site, keep 'Variable' (lower-cased so names match across sites) plus
# that site's "Overall" column, renamed to the site name.
overall_dfs = []

for folder_name, df in dfs.items():
    # First column whose name contains "Overall"
    overall_col = next((c for c in df.columns if "Overall" in c), None)
    if overall_col is None:
        raise ValueError(
            f"No column containing 'Overall' for {folder_name}; columns: {df.columns}"
        )
    df_overall = df.select([
        pl.col('Variable').str.to_lowercase(),
        pl.col(overall_col).alias(f"{folder_name}"),
    ])
    overall_dfs.append(df_overall)
    print(f"Extracted from {folder_name}: {df_overall.shape}")

if not overall_dfs:
    raise ValueError("No site tables were loaded; nothing to combine")

# Join every site onto the union of all variables. Left-joining onto the first
# site alone would silently drop any variable that site does not report, and
# which site comes first depends on file-discovery order.
# dict.fromkeys de-duplicates while keeping first-seen order, so the first
# site's row order is preserved and variables only seen later are appended.
all_variables = list(dict.fromkeys(
    name
    for df_overall in overall_dfs
    for name in df_overall['Variable'].to_list()
))
unaggregated = pl.DataFrame([pl.Series('Variable', all_variables, dtype=pl.Utf8)])
for df_overall in overall_dfs:
    # Only the 'Variable' column will match; all other columns are uniquely named
    unaggregated = unaggregated.join(df_overall, on="Variable", how="left")

unaggregated.shape

Extracted from upenn: (113, 2)
Extracted from nu: (107, 2)
Extracted from rush: (110, 2)
Extracted from ohsu: (109, 2)
Extracted from ucmc: (102, 2)
Extracted from umn: (96, 2)
Extracted from emory: (108, 2)
Extracted from umich: (87, 2)
Extracted from mimic_iv: (106, 2)


(113, 10)

# Aggregate consortium results 

In [40]:
import re

# A number (digits, commas, dots, minus) optionally followed by one
# parenthesised suffix, with nothing else in the string.
_NUMERIC_WITH_OPTIONAL_PARENS = re.compile(r"^\s*([-\d,\.]+)\s*(?:\([^\)]*\))?\s*$")


def get_numeric(value):
    """
    Return the leading number of a table cell as a float, or None.

    The cell must be a string consisting of a number, optionally followed by a
    single parenthesised part; thousands separators are removed before
    conversion.

    Examples: '164 (0.1%)' -> 164.0, '68,359 (57.6%)' -> 68359.0

    Returns None for non-string input, for strings that do not match the
    pattern (e.g. '65 [53, 76]'), and for matches that are not a valid number
    (e.g. '-' or '1.2.3').
    """
    if not isinstance(value, str):
        return None
    match = _NUMERIC_WITH_OPTIONAL_PARENS.match(value)
    if match is None:
        return None
    num_str = match.group(1).replace(",", "")
    try:
        # Counts may carry decimals, so convert to float rather than int
        return float(num_str)
    except ValueError:
        # The character class also admits junk such as '-' or '1.2.3'
        return None

def value_has_iqr(value):
    """Return True when `value` is a string containing a bracketed pair with a comma, e.g. '[53, 76]'."""
    # Non-strings (None, numbers) can never carry the bracket pattern
    return isinstance(value, str) and re.search(r"\[.*?,.*?\]", value) is not None

# Rows that are plain totals and are reported without a percentage.
_NO_PERCENTAGE_VARS = ("n: encounter blocks", "n: unique patients", "n: hospitals")

# Rows whose percentage is taken over unique patients; together with any
# variable containing one of _PATIENT_DENOMINATOR_TERMS. Every other row is
# taken over encounter blocks.
_PATIENT_DENOMINATOR_VARS = (
    "hospital mortality, n (%)",
    "discharged to hospice, n (%)",
    "expired, n (%)",
)
_PATIENT_DENOMINATOR_TERMS = ("race:", "ethnicity:", "sex:")


def _sum_site_counts(values):
    """Sum the leading counts of one row's site cells; None if the row is not summable."""
    # Rows carrying 'median [q1, q3]' style values cannot be added up
    if any(value_has_iqr(val) for val in values):
        return None
    numbers = [num for num in map(get_numeric, values) if num is not None]
    # int() drops any fractional part of the summed value
    return int(sum(numbers)) if numbers else None


def _format_consortium_cell(variable, total, patients_total, encounters_total):
    """Render one consortium cell as 'N' or 'N (P%)'; None stays None."""
    if total is None:
        return None
    if variable in _NO_PERCENTAGE_VARS:
        return f"{total:,}"

    name = (variable or "").lower()  # a null Variable must not crash formatting
    use_patients_denom = (
        any(term in name for term in _PATIENT_DENOMINATOR_TERMS)
        or variable in _PATIENT_DENOMINATOR_VARS
    )
    denominator = patients_total if use_patients_denom else encounters_total

    # Without a usable denominator only the raw count is shown
    if denominator and denominator > 0:
        percentage = (total / denominator) * 100
        return f"{total:,} ({percentage:.1f}%)"
    return f"{total:,}"


def aggregate_overalls(df):
    """
    Add a 'consortium' column holding the per-row sum of all site columns.

    Every column other than 'Variable' is treated as a site column. For each
    row the leading count of each site cell (e.g. '164 (0.1%)' -> 164) is
    summed; cells that do not parse are ignored. Rows where any site cell has
    a bracketed '[a, b]' value, or where nothing parses, get None.

    Sums are rendered with thousands separators. Except for the rows in
    _NO_PERCENTAGE_VARS, a percentage is appended, computed against the summed
    'n: unique patients' row for patient-level variables and against the
    summed 'n: encounter blocks' row otherwise.

    Totals are tracked per row rather than per variable name, so two rows that
    share a name each keep their own sum.

    Args:
        df: polars DataFrame with a 'Variable' column plus one column per site.

    Returns:
        A new DataFrame equal to `df` with the string column 'consortium' added
        (or replaced if it already exists).
    """
    site_cols = [c for c in df.columns if c != "Variable"]

    # First pass: one total per row, kept positionally
    variables = []
    row_totals = []
    for row in df.iter_rows(named=True):
        variables.append(row["Variable"])
        row_totals.append(_sum_site_counts([row[c] for c in site_cols]))

    # Denominators are looked up by name (the last row wins if a name repeats)
    totals_by_variable = dict(zip(variables, row_totals))
    denominator_patients = totals_by_variable.get("n: unique patients")
    denominator_encounters = totals_by_variable.get("n: encounter blocks")

    # Second pass: format each row's total, with a percentage where applicable
    values_cons = [
        _format_consortium_cell(
            variable, total, denominator_patients, denominator_encounters
        )
        for variable, total in zip(variables, row_totals)
    ]

    # Explicit string dtype so an all-None column does not end up as a Null-typed column
    return df.with_columns(pl.Series("consortium", values_cons, dtype=pl.Utf8))


# Add the 'consortium' column (summed counts, with percentages) to the joined site table
consortium_overalls = aggregate_overalls(unaggregated)

In [41]:
import numpy as np

def extract_median_q1_q3(value):
    """
    Parse a cell that starts with 'median [q1, q3]'.

    Returns (median, q1, q3) as floats. Returns None when `value` is not a
    string, does not start with that pattern, or holds a part that is not a
    valid number.
    """
    if isinstance(value, str):
        found = re.match(r'^\s*([-\d\.]+)\s*\[\s*([-\d\.]+)\s*,\s*([-\d\.]+)\s*\]', value)
        if found is not None:
            try:
                median, q1, q3 = (float(part) for part in found.groups())
            except Exception:
                return None
            return median, q1, q3
    return None

def _format_stat(x):
    """Return `x` as an int when it is a whole number, otherwise rounded to 2 decimals."""
    x = float(x)  # also turns numpy scalars into plain Python floats
    return int(x) if x.is_integer() else round(x, 2)


def aggregate_median_q1_q3(df):
    """
    Fill the 'consortium' column for rows whose site cells are 'median [q1, q3]'.

    For each row, every site cell (all columns except 'Variable' and
    'consortium') that parses as 'median [q1, q3]' contributes. The consortium
    cell becomes 'M [L,H]' where M is the median of the site medians, L the
    smallest site q1 and H the largest site q3. The bracket is therefore an
    envelope of the site quartiles, not a pooled interquartile range.

    Rows with no parsable site cell keep their existing 'consortium' value
    (None if the column does not exist yet).

    Args:
        df: polars DataFrame with a 'Variable' column, one column per site and
            optionally an existing 'consortium' column.

    Returns:
        A new DataFrame with the string column 'consortium' added or replaced.
    """
    site_cols = [c for c in df.columns if c not in ("Variable", "consortium")]
    cons_values = []

    for row in df.iter_rows(named=True):
        parsed = [
            stats
            for stats in (extract_median_q1_q3(row[c]) for c in site_cols)
            if stats is not None
        ]
        if not parsed:
            # Not a median row: leave whatever was already computed
            cons_values.append(row.get('consortium', None))
            continue

        medians, q1s, q3s = zip(*parsed)
        median_median = np.median(medians)
        q1_min = min(q1s)
        q3_max = max(q3s)
        cons_values.append(
            f'{_format_stat(median_median)} [{_format_stat(q1_min)},{_format_stat(q3_max)}]'
        )

    # Explicit string dtype so an all-None column does not end up as a Null-typed column
    return df.with_columns(pl.Series("consortium", cons_values, dtype=pl.Utf8))

# Fill the consortium column for 'median [q1, q3]' rows, which aggregate_overalls
# leaves empty; rows without such values keep their summed consortium value.
consortium_overalls_final = aggregate_median_q1_q3(consortium_overalls)


In [42]:
# Inspect the column names: 'Variable', one column per site, then 'consortium'
consortium_overalls_final.columns

['Variable',
 'upenn',
 'nu',
 'rush',
 'ohsu',
 'ucmc',
 'umn',
 'emory',
 'umich',
 'mimic_iv',
 'consortium']

In [43]:
# Add a 'years' row capturing the min-to-max year for each site, inferred from
# the year-named columns of every 'table_one_by_year.csv' file.


def _format_year_range(years):
    """Return 'min-max' for several distinct years, the single year as text, or '' if empty."""
    if not years:
        return ""
    first, last = min(years), max(years)
    return str(first) if first == last else f"{first}-{last}"


# Deliberately NOT named csv_files: that name holds the table_one_overall.csv
# paths, and rebinding it here would make a re-run of the loading cell read the
# by-year files instead.
by_year_files = list(root_dir.glob('**/table_one_by_year.csv'))

site_year_values = {}  # site -> list of int years found in the header
site_years = {}        # site -> display string such as '2017-2025'

for csv_file in by_year_files:
    # Site name = path component directly after 'CLIF-TableOne-2025'
    parts = csv_file.parts
    try:
        clif_idx = parts.index('CLIF-TableOne-2025')
        folder_name = parts[clif_idx + 1]
    except ValueError as err:
        raise ValueError(f"'CLIF-TableOne-2025' not found in path: {csv_file}") from err
    except IndexError as err:
        raise ValueError(f"No folder after 'CLIF-TableOne-2025' in path: {csv_file}") from err

    # Only the header is needed; read everything as strings so mixed-format
    # cells cannot trip dtype inference.
    df_tmp = pl.read_csv(csv_file, infer_schema_length=0)
    year_columns = [col for col in df_tmp.columns if col not in ("Variable", "Overall")]
    years = [int(col) for col in year_columns if str(col).isdigit()]

    site_year_values[folder_name] = years
    site_years[folder_name] = _format_year_range(years)

# Consortium range = min/max over all sites except 'mimic_iv'.
# NOTE(review): mimic_iv reports years like 2105-2214, presumably shifted by
# de-identification, which would distort the range. Its counts are still part
# of the summed consortium values. Confirm this is intended.
consortium_year_values = [
    year
    for site, years in site_year_values.items()
    if site != "mimic_iv"
    for year in years
]
cons_years_string = _format_year_range(consortium_year_values)

# Build the row with exactly the columns of consortium_overalls_final; sites
# without a by-year file get an empty string.
years_row = {}
for col in consortium_overalls_final.columns:
    if col == "Variable":
        years_row[col] = "years"
    elif col == "consortium":
        years_row[col] = cons_years_string
    else:
        years_row[col] = site_years.get(col, "")

years_row_df = pl.DataFrame([years_row]).select(consortium_overalls_final.columns)

# Insert the 'years' row as the first row
final = pl.concat([years_row_df, consortium_overalls_final], how="vertical")
final

Variable,upenn,nu,rush,ohsu,ucmc,umn,emory,umich,mimic_iv,consortium
str,str,str,str,str,str,str,str,str,str,str
"""years""","""2017-2025""","""2018-2024""","""2018-2025""","""2016-2025""","""2018-2025""","""2011-2025""","""2022-2025""","""2017-2024""","""2105-2214""","""2011-2025"""
"""n: encounter blocks""","""152,603""","""106,234""","""67,385""","""63,185""","""43,779""","""178,613""","""56,717""","""49,908""","""89,889""","""808,313"""
"""n: unique patients""","""118,690""","""85,393""","""46,781""","""53,775""","""35,684""","""129,480""","""46,065""","""42,193""","""68,109""","""626,170"""
"""n: hospitals""","""8""","""10""","""1""","""2""","""1""","""13""","""10""","""1""","""1""","""47"""
"""age at admission, median [q1, …","""65 [53, 76]""","""66 [53, 76]""","""63 [51, 73]""","""64 [50, 74]""","""62 [47, 72]""","""66 [53, 78]""","""65 [52, 75]""","""63 [50, 72]""","""67 [55, 78]""","""65 [47,78]"""
…,…,…,…,…,…,…,…,…,…,…
""" dexmedetomidine""","""14,942 (9.8%)""","""22,198 (20.9%)""","""7,587 (11.3%)""","""13,812 (21.9%)""","""8,875 (20.3%)""","""27,844 (15.6%)""","""11,675 (20.6%)""",,"""7,996 (8.9%)""","""114,929 (14.2%)"""
""" fentanyl""","""27,122 (17.8%)""","""17,358 (16.3%)""","""8,310 (12.3%)""","""4,355 (6.9%)""","""11,627 (26.6%)""","""20,114 (11.3%)""","""8,199 (14.5%)""",,"""14,355 (16.0%)""","""111,440 (13.8%)"""
"""neuromuscular blocking agents,…",,,,,,,,,,
""" cisatracurium""","""3,550 (2.3%)""","""1,269 (1.2%)""","""636 (0.9%)""","""684 (1.1%)""","""591 (1.3%)""","""1,694 (0.9%)""","""382 (0.7%)""",,"""1,716 (1.9%)""","""10,522 (1.3%)"""


In [44]:
# Move 'consortium' to the second position, directly after the first column.
# Nothing is done when 'consortium' is missing or is the only column, so the
# frame can never lose columns here.
cols = final.columns
if 'consortium' in cols and len(cols) > 1:
    other_cols = [col for col in cols if col != 'consortium']
    new_cols = other_cols[:1] + ['consortium'] + other_cols[1:]
    final = final.select(new_cols)
final

Variable,consortium,upenn,nu,rush,ohsu,ucmc,umn,emory,umich,mimic_iv
str,str,str,str,str,str,str,str,str,str,str
"""years""","""2011-2025""","""2017-2025""","""2018-2024""","""2018-2025""","""2016-2025""","""2018-2025""","""2011-2025""","""2022-2025""","""2017-2024""","""2105-2214"""
"""n: encounter blocks""","""808,313""","""152,603""","""106,234""","""67,385""","""63,185""","""43,779""","""178,613""","""56,717""","""49,908""","""89,889"""
"""n: unique patients""","""626,170""","""118,690""","""85,393""","""46,781""","""53,775""","""35,684""","""129,480""","""46,065""","""42,193""","""68,109"""
"""n: hospitals""","""47""","""8""","""10""","""1""","""2""","""1""","""13""","""10""","""1""","""1"""
"""age at admission, median [q1, …","""65 [47,78]""","""65 [53, 76]""","""66 [53, 76]""","""63 [51, 73]""","""64 [50, 74]""","""62 [47, 72]""","""66 [53, 78]""","""65 [52, 75]""","""63 [50, 72]""","""67 [55, 78]"""
…,…,…,…,…,…,…,…,…,…,…
""" dexmedetomidine""","""114,929 (14.2%)""","""14,942 (9.8%)""","""22,198 (20.9%)""","""7,587 (11.3%)""","""13,812 (21.9%)""","""8,875 (20.3%)""","""27,844 (15.6%)""","""11,675 (20.6%)""",,"""7,996 (8.9%)"""
""" fentanyl""","""111,440 (13.8%)""","""27,122 (17.8%)""","""17,358 (16.3%)""","""8,310 (12.3%)""","""4,355 (6.9%)""","""11,627 (26.6%)""","""20,114 (11.3%)""","""8,199 (14.5%)""",,"""14,355 (16.0%)"""
"""neuromuscular blocking agents,…",,,,,,,,,,
""" cisatracurium""","""10,522 (1.3%)""","""3,550 (2.3%)""","""1,269 (1.2%)""","""636 (0.9%)""","""684 (1.1%)""","""591 (1.3%)""","""1,694 (0.9%)""","""382 (0.7%)""",,"""1,716 (1.9%)"""


In [None]:
# Write the final table to 'overall.csv' (a relative path, so it lands in the current working directory)
final.write_csv("overall.csv")

<!-- unaggregated  -->