In [None]:
import polars as pl
from pathlib import Path
import os

In [None]:
# Root of the shared Box project folder.
# Set the CLIF_TABLEONE_BOX_PATH environment variable to point at your own copy
# instead of editing this cell; without it the original location is used.
# NOTE: the loading cell below finds each site by looking for a path component
# named 'CLIF-TableOne-2025', so an override should point at a folder with that name.
BOX_FOLDER_PATH = os.environ.get(
    'CLIF_TABLEONE_BOX_PATH',
    '/Users/dema/Library/CloudStorage/Box-Box/CLIF/projects/CLIF-TableOne-2025',
)

# Convert to Path object (expanduser lets an override start with "~")
root_dir = Path(BOX_FOLDER_PATH).expanduser()

# Verify the path exists so a missing/unsynced folder fails here with a clear message
if not root_dir.exists():
    raise FileNotFoundError(f"Path does not exist: {root_dir}")

print(f"Searching in: {root_dir.absolute()}")
print()

# Find all table_one_by_year.csv files at any depth.
# glob() makes no ordering promise; sorting keeps the site order (and therefore the
# column order of the merged table built later) identical from run to run.
csv_files = sorted(root_dir.glob('**/table_one_by_year.csv'))

print(f"Found {len(csv_files)} CSV files:")
for file in csv_files:
    print(f"  - {file}")

In [None]:
# Site folder name -> DataFrame read from that site's table_one_by_year.csv
dfs = {}

# SITES TO EXCLUDE IF NEEDED
# Folder names are lowercased before the lookup, so list entries in lowercase.
excluded_folders = {"mimic_iv"}

# Read each CSV and store with folder name
for csv_file in csv_files:
    # The site folder is the path component right after 'CLIF-TableOne-2025'
    parts = csv_file.parts
    try:
        clif_idx = parts.index('CLIF-TableOne-2025')
    except ValueError:
        # "from None" hides the uninformative tuple.index() traceback
        raise ValueError(f"'CLIF-TableOne-2025' not found in path: {csv_file}") from None

    # Everything below the project folder: (<site folder>, ..., 'table_one_by_year.csv').
    # With fewer than two components the CSV sits directly in the project folder and
    # the "site" would be the file name itself, so reject it explicitly.
    below_root = parts[clif_idx + 1:]
    if len(below_root) < 2:
        raise ValueError(f"No folder after 'CLIF-TableOne-2025' in path: {csv_file}")
    folder_name = below_root[0]

    # Exclude certain folder names
    if folder_name.lower() in excluded_folders:
        print(f"Skipping excluded folder: {folder_name}")
        continue

    # A site with several matching CSVs (e.g. in nested sub-folders) would otherwise
    # be overwritten without any trace; the later file still wins, but say so.
    if folder_name in dfs:
        print(
            f"WARNING: more than one table_one_by_year.csv under {folder_name}; "
            f"{csv_file} replaces the previously loaded file"
        )

    # Read the CSV
    df = pl.read_csv(csv_file)

    # Store in dictionary
    dfs[folder_name] = df
    print(f"Loaded {folder_name}: {df.shape}")

In [None]:
# --- Prepare every site's table for the join on 'Variable' ---
# 1. Prefix each column other than 'Variable' with the site folder name, so columns
#    coming from different sites stay distinguishable after the merge.
# 2. Lowercase the 'Variable' values so the join key is compared case-insensitively.

all_dfs = []

for site, site_df in dfs.items():
    # Map old name -> "<site>_<old name>" for everything except the join key
    prefix_map = {name: f"{site}_{name}" for name in site_df.columns if name != "Variable"}
    prefixed = site_df.rename(prefix_map)
    all_dfs.append(prefixed)
    print(f"Renamed {site}: {prefixed.shape} | {list(prefixed.columns)}")

# Standardize Variable column to lowercase for merge; frames without the column pass through
all_dfs = [
    frame.with_columns(pl.col("Variable").str.to_lowercase().alias("Variable"))
    if "Variable" in frame.columns
    else frame
    for frame in all_dfs
]

def outer_merge_on_variable(dfs_list):
    """Full-outer-join a list of tables on their shared 'Variable' column.

    Parameters
    ----------
    dfs_list : list of polars.DataFrame
        Tables to merge, each expected to contain a 'Variable' column. They are
        joined left to right, so the first table's columns come first.

    Returns
    -------
    polars.DataFrame
        A table with a single 'Variable' column holding the keys of all inputs,
        plus the remaining columns of every input. In rows whose 'Variable' is
        null or blank, every other column is set to null.

    Raises
    ------
    ValueError
        If ``dfs_list`` is empty.

    Notes
    -----
    The join uses ``how="full", coalesce=True``. Without coalescing, a key that
    exists only in the right-hand table ends up in a separate 'Variable_right'
    column while 'Variable' is null; dropping that column (as this function used
    to do) discards the key, and the blank-row masking below then wipes the
    row's data. Coalescing keeps one key column with no gaps.
    Requires a polars version whose ``join`` accepts ``how="full"`` and
    ``coalesce=`` (polars 1.x) -- TODO confirm against the pinned version.
    """
    if not dfs_list:
        raise ValueError("outer_merge_on_variable() needs at least one DataFrame")

    def _drop_stray_key_columns(frame):
        # Remove case variants of the key ("variable", "VARIABLE", ...) and leftover
        # "Variable_*" columns so that 'Variable' is the only key-like column.
        stray = [
            c for c in frame.columns
            if c != "Variable" and (c.lower() == "variable" or c.startswith("Variable_"))
        ]
        return frame.drop(stray) if stray else frame

    result = _drop_stray_key_columns(dfs_list[0])
    for df in dfs_list[1:]:
        result = result.join(
            _drop_stray_key_columns(df),
            on="Variable",
            how="full",
            coalesce=True,  # merge left/right keys into one 'Variable' column
        )

    # Rows without a real variable name (null or whitespace-only) should not show data.
    blank_key = pl.col("Variable").is_null() | (pl.col("Variable").str.strip_chars() == "")
    non_variable_cols = [col for col in result.columns if col != "Variable"]
    result = result.with_columns(
        [
            pl.when(blank_key).then(None).otherwise(pl.col(c)).alias(c)
            for c in non_variable_cols
        ]
    )
    return result

# Merge all prepared site tables into one wide table keyed on 'Variable'
unaggregated = outer_merge_on_variable(all_dfs)

# Last expression of the cell: shows the shape of the merged table as the cell output
unaggregated.shape

In [None]:
# Write the merged per-site table. The path is relative to the notebook's working
# directory; create the output folder first so a fresh checkout does not fail.
output_path = Path("../output/consortium_t1_by_year_site.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
unaggregated.write_csv(output_path)