# Table One Overall Aggregator



In [None]:
import polars as pl
from pathlib import Path
import os

In [None]:
BOX_FOLDER_PATH = 'path/to/box'

# Expand a leading "~" so a home-relative Box location works as well as an
# absolute one.
root_dir = Path(BOX_FOLDER_PATH).expanduser()

# Fail early with a clear message rather than silently finding zero files.
if not root_dir.exists():
    raise FileNotFoundError(f"Path does not exist: {root_dir}")

print(f"Searching in: {root_dir.absolute()}")
print()

# Recursively find every table_one_overall.csv below the root folder.
# The order in which glob() yields matches is not guaranteed, so sort the
# result: the first file later becomes the left side of the joins and the
# file order determines the site column order, and both should be the same
# on every run.
csv_files = sorted(root_dir.glob('**/table_one_overall.csv'))

print(f"Found {len(csv_files)} CSV files:")
for file in csv_files:
    print(f"  - {file}")

In [None]:
# Maps site folder name -> DataFrame read from that site's CSV
dfs = {}

for csv_file in csv_files:
    # Identify the site by the first folder below root_dir (e.g., 'upenn',
    # 'nu', 'rush'). Working from the path relative to the search root,
    # instead of a fixed position in the full path, keeps this independent
    # of where the Box folder happens to live on a given machine.
    relative_parts = csv_file.relative_to(root_dir).parts
    if len(relative_parts) > 1:
        folder_name = relative_parts[0]
    else:
        # The CSV sits directly in root_dir, so there is no sub-folder to use.
        folder_name = root_dir.name

    # Two files resolving to the same name would silently replace each other
    # in the dictionary; stop instead so no site's data is dropped unnoticed.
    if folder_name in dfs:
        raise ValueError(
            f"More than one table_one_overall.csv maps to '{folder_name}' "
            f"(latest: {csv_file}); cannot tell the sites apart."
        )

    # Read the CSV
    df = pl.read_csv(csv_file)

    # Store in dictionary
    dfs[folder_name] = df
    print(f"Loaded {folder_name}: {df.shape}")

In [None]:
# One two-column frame per site: 'Variable' plus that site's overall values,
# with the value column renamed to the site's folder name.
overall_dfs = []

for folder_name, df in dfs.items():
    # The first column whose name contains "Overall" holds the site's values.
    overall_col = next((c for c in df.columns if "Overall" in c), None)
    if overall_col is None:
        # Without this check the selection below would fail with an error
        # that does not say which site is at fault.
        raise ValueError(
            f"No column containing 'Overall' found for {folder_name}; "
            f"columns are: {df.columns}"
        )

    df_overall = (
        df.select(['Variable', overall_col])
        .rename({overall_col: f"{folder_name}"})
        # Lower-case the labels so the join below matches variables whose
        # capitalisation differs between sites.
        .with_columns(pl.col('Variable').str.to_lowercase().alias('Variable'))
    )
    overall_dfs.append(df_overall)
    print(f"Extracted from {folder_name}: {df_overall.shape}")

if not overall_dfs:
    raise ValueError("No site tables were loaded; nothing to combine.")

# NOTE: these are left joins, so only variables present in the first site's
# table are kept; a variable missing from a later site shows up as null there.
unaggregated = overall_dfs[0]
for df_overall in overall_dfs[1:]:
    # Only the 'Variable' column will match; all other columns are uniquely named
    unaggregated = unaggregated.join(df_overall, on="Variable", how="left")

unaggregated.shape

# Aggregate consortium results 

In [None]:
import re

def get_numeric(value):
    """
    Extract the leading number from a table cell, ignoring a trailing '(...)'.

    The whole string must consist of a number (digits, optional thousands
    commas, decimal point, minus sign), optionally followed by a single
    parenthesised part; surrounding whitespace is allowed.

    Examples:
        '164 (0.1%)'     -> 164.0
        '68,359 (57.6%)' -> 68359.0
        '5 [1, 9]'       -> None

    Returns:
        The number as a float, or None when the value is not a string, does
        not match the pattern, or the matched text is not a valid number
        (e.g. '1.2.3' or a lone '-').
    """
    if not isinstance(value, str):
        return None

    match = re.match(r"^\s*([-\d,\.]+)\s*(?:\([^\)]*\))?\s*$", value)
    if match is None:
        return None

    # Drop thousands separators before converting.
    num_str = match.group(1).replace(",", "")
    try:
        # Some values may be decimal floats, so always convert to float.
        return float(num_str)
    except ValueError:
        # The character class also admits non-numbers such as '1.2.3'.
        return None

def value_has_iqr(value):
    """
    Tell whether a cell contains a bracketed pair such as '[12, 34]'.

    Returns True only for strings in which a '[' is followed, later on the
    same line, by a ',' and then a ']'. Non-string values give False.
    """
    bracket_pair = r"\[.*?,.*?\]"
    return isinstance(value, str) and re.search(bracket_pair, value) is not None

def aggregate_overalls(df):
    """
    Add a 'consortium' column holding the across-site total for each row.

    Every column other than 'Variable' is treated as a site column. For each
    row the leading numbers of the site cells (see get_numeric) are summed.
    The total is then formatted as 'N' or 'N (P%)', where the percentage is
    taken against the summed "n: unique patients" row for race/ethnicity/sex
    and mortality variables, and against the summed "n: encounter blocks" row
    for everything else.

    Rows get a null consortium value when any site cell contains a bracketed
    '[a, b]' pair (see value_has_iqr) or when no site cell yields a number.

    Args:
        df: polars DataFrame with a 'Variable' column and one column per site.

    Returns:
        A new DataFrame equal to df plus the 'consortium' string column.
    """
    # Get all columns that are site columns (not 'Variable')
    site_cols = [c for c in df.columns if c != "Variable"]

    # First pass: one total per row. Totals are kept by row position rather
    # than by label, so rows that share a 'Variable' label cannot overwrite
    # each other's result.
    variables = []
    row_sums = []
    for row in df.iter_rows(named=True):
        variables.append(row['Variable'])
        values = [row[c] for c in site_cols]

        # If any value has a bracketed [a, b] part, skip summation
        if any(value_has_iqr(val) for val in values):
            row_sums.append(None)
            continue

        # For cells like "164 (0.1%)" or '68,359 (57.6%)'; unparseable cells
        # are left out of the sum.
        numbers = [
            num for num in (get_numeric(val) for val in values)
            if num is not None
        ]
        # int() truncates any fractional part of the sum.
        row_sums.append(int(sum(numbers)) if numbers else None)

    # Denominators are looked up by label (last occurrence wins if repeated).
    sums_by_variable = dict(zip(variables, row_sums))
    denominator_patients = sums_by_variable.get("n: unique patients")
    denominator_encounters = sums_by_variable.get("n: encounter blocks")

    # Variables that should NOT have percentages
    no_percentage_vars = (
        "n: encounter blocks",
        "n: unique patients",
        "n: hospitals",
    )

    # Variables that use patients as denominator
    mortality_vars = (
        "hospital mortality, n (%)",
        "discharged to hospice, n (%)",
        "expired, n (%)",
    )
    patient_level_terms = ("race:", "ethnicity:", "sex:")

    # Second pass: format with percentages
    values_cons = []
    for variable, value in zip(variables, row_sums):
        if value is None:
            values_cons.append(None)
            continue

        # Skip percentage for denominator variables themselves
        if variable in no_percentage_vars:
            values_cons.append(f"{value:,}")
            continue

        # A null label cannot match any patient-level term; treat it as
        # empty text instead of failing on .lower().
        label = variable.lower() if isinstance(variable, str) else ""

        # Determine denominator: patients or encounters?
        use_patients_denom = (
            any(term in label for term in patient_level_terms)
            or variable in mortality_vars
        )
        denominator = (
            denominator_patients if use_patients_denom
            else denominator_encounters
        )

        # Calculate percentage if a usable (non-null, positive) denominator exists
        if denominator and denominator > 0:
            percentage = (value / denominator) * 100
            formatted = f"{value:,} ({percentage:.1f}%)"
        else:
            formatted = f"{value:,}"

        values_cons.append(formatted)

    # Create a new polars Series/column for consortium
    return df.with_columns(pl.Series("consortium", values_cons))


# Add the 'consortium' column of summed counts (with percentages where
# applicable) to the joined per-site table.
consortium_overalls = aggregate_overalls(unaggregated)

In [None]:
import numpy as np

def extract_median_q1_q3(value):
    """
    Parse a cell formatted as 'median [q1, q3]'.

    The pattern is anchored at the start of the string only, so text after
    the closing bracket is ignored.

    Returns:
        A tuple of floats (median, q1, q3), or None when the value is not a
        string, does not match the format, or one of the matched parts is
        not a valid number (e.g. '1.2.3').
    """
    if not isinstance(value, str):
        return None

    match = re.match(r'^\s*([-\d\.]+)\s*\[\s*([-\d\.]+)\s*,\s*([-\d\.]+)\s*\]', value)
    if match is None:
        return None

    try:
        median, q1, q3 = (float(part) for part in match.groups())
    except ValueError:
        # The character class also admits non-numbers such as '1.2.3' or '-'.
        return None
    return median, q1, q3

def aggregate_median_q1_q3(df):
    """
    Fill the 'consortium' column for rows whose site cells are 'median [q1, q3]'.

    Every column except 'Variable' and 'consortium' is treated as a site
    column. For a row where at least one site cell parses with
    extract_median_q1_q3, the consortium value becomes
    '<median of site medians> [<smallest q1>,<largest q3>]'. All other rows
    keep whatever 'consortium' value they already had (None if the column is
    absent).

    Returns a new DataFrame with the 'consortium' column set accordingly.
    """
    def _tidy(number):
        # Show whole numbers without a decimal part; otherwise keep 2 decimals.
        return int(number) if float(number).is_integer() else round(number, 2)

    site_columns = [
        name for name in df.columns if name not in ("Variable", "consortium")
    ]
    consortium_column = []

    for record in df.iter_rows(named=True):
        parsed = [extract_median_q1_q3(record[name]) for name in site_columns]
        triples = [triple for triple in parsed if triple is not None]

        if not triples:
            # Not a median/IQR row: carry the existing value through.
            consortium_column.append(record.get('consortium', None))
            continue

        # Regroup the (median, q1, q3) triples into three parallel sequences.
        site_medians, site_q1s, site_q3s = zip(*triples)
        pooled_median = np.median(site_medians)
        lowest_q1 = min(site_q1s)
        highest_q3 = max(site_q3s)
        consortium_column.append(
            f'{_tidy(pooled_median)} [{_tidy(lowest_q1)},{_tidy(highest_q3)}]'
        )

    return df.with_columns(pl.Series("consortium", consortium_column))

# Fill the 'consortium' column for the 'median [q1, q3]' rows, which
# aggregate_overalls leaves empty; all other rows keep their existing value.
consortium_overalls_final = aggregate_median_q1_q3(consortium_overalls)


In [None]:
# Save the per-site columns plus the consortium column; the relative file
# name means the CSV is written to the current working directory.
consortium_overalls_final.write_csv("consortium_overall.csv")

<!-- unaggregated  -->