# Status Tracker Consolidator



In [None]:
import polars as pl
from pathlib import Path
import os
import json

In [None]:
# Root of the shared Box project folder that holds one sub-folder per site.
BOX_FOLDER_PATH = '/Users/dema/Library/CloudStorage/Box-Box/CLIF/projects/CLIF-TableOne-2025'

# Work with a Path object; expanduser() lets the constant use '~' if desired.
root_dir = Path(BOX_FOLDER_PATH).expanduser()

# Fail early with a clear message instead of silently finding zero files.
if not root_dir.exists():
    raise FileNotFoundError(f"Path does not exist: {root_dir}")

print(f"Searching in: {root_dir.absolute()}")
print()

# Collect every JSON file that sits directly inside any results/ folder.
# Path.glob() yields entries in arbitrary (filesystem-dependent) order, so the
# list is sorted to make the downstream processing order reproducible.
json_files = sorted(root_dir.glob('**/results/*.json'))

print(f"Found {len(json_files)} JSON files in results/ folders:")
for file in json_files:
    print(f"  - {file}")

In [None]:
# Build one status table per folder.
#
# Result: unique_statuses maps folder_name -> polars DataFrame with the columns
# 'table_name' and 'status'. Folders for which no status could be loaded are
# left out.
unique_statuses = {}

# Folder names to skip. Entries must be lowercase because the folder name is
# lowercased before the membership test. An empty set skips nothing.
excluded_folders = set()

from collections import defaultdict

# Index of the discovered files: folder_name -> table_name -> {file_type: path}
folder_table_type_files = defaultdict(dict)

for json_file in json_files:
    # The folder name is the path component right after the project folder.
    parts = json_file.parts
    try:
        clif_idx = parts.index('CLIF-TableOne-2025')
        folder_name = parts[clif_idx + 1]
    except ValueError as err:
        raise ValueError(f"'CLIF-TableOne-2025' not found in path: {json_file}") from err
    except IndexError as err:
        raise ValueError(f"No folder after 'CLIF-TableOne-2025' in path: {json_file}") from err

    if folder_name.lower() in excluded_folders:
        continue

    # Classify the file by its name; anything else is not a status file.
    name = json_file.name
    if "validation_response" in name:
        table_name = name.split('_validation_response')[0]
        file_type = 'validation_response'
    elif name.endswith('_summary_validation.json'):
        table_name = name.split('_summary_validation.json')[0]
        file_type = 'summary_validation'
    else:
        continue

    # One path per (table, type); if several files match, the last one wins.
    folder_table_type_files[folder_name].setdefault(table_name, {})[file_type] = json_file

# For every table prefer the validation_response file (key 'adjusted_status');
# fall back to the summary_validation file (key 'status').
for folder_name, table_files in folder_table_type_files.items():
    status_records = []
    for table_name, file_types in table_files.items():
        if 'validation_response' in file_types:
            json_file = file_types['validation_response']
            status_key = 'adjusted_status'
        elif 'summary_validation' in file_types:
            json_file = file_types['summary_validation']
            status_key = 'status'
        else:
            continue  # Unreachable: every entry holds at least one of the two types

        # Best-effort load: an unreadable or malformed file is reported and
        # skipped. JSON is read as UTF-8 instead of the locale default;
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            print(f"Failed to read JSON file {json_file}: {e}")
            continue

        # A top-level list/string/number has no keys to look up.
        if not isinstance(data, dict):
            print(f"Failed to read JSON file {json_file}: top-level JSON value is not an object")
            continue

        status = data.get(status_key)
        if status is None:
            print(f"No '{status_key}' key in {json_file}")
            continue

        status_records.append({
            "table_name": table_name,
            "status": status
        })
        print(f"Loaded status for table '{table_name}' from {folder_name}.")

    if status_records:
        unique_statuses[folder_name] = pl.DataFrame(status_records)


In [None]:
# Combine the per-folder status tables into one wide table: one row per
# table_name and one status column per folder (named after the folder).
# joined_status is None when no folder produced any status.

# Lowercase table_name so the same table matches across folders regardless of
# how each folder capitalised its file names.
for folder_name in unique_statuses:
    if 'table_name' in unique_statuses[folder_name].columns:
        unique_statuses[folder_name] = unique_statuses[folder_name].with_columns(
            pl.col('table_name').str.to_lowercase().alias('table_name')
        )

# Rename each frame's 'status' column to its folder name so the columns stay
# distinguishable after the join. rename() returns a new frame, so the frames
# stored in unique_statuses are left untouched.
status_dfs = [
    df.rename({'status': folder_name})
    for folder_name, df in unique_statuses.items()
]

if status_dfs:
    joined_status = status_dfs[0]
    for i, next_df in enumerate(status_dfs[1:], start=1):
        # A per-join suffix keeps the right-hand key column name predictable.
        suffix = f"_dupsite{i}"
        # NOTE(review): recent polars releases deprecate how="outer" in favour
        # of how="full" — confirm against the installed version before changing.
        joined_status = joined_status.join(
            next_df,
            on="table_name",
            how="outer",
            suffix=suffix
        )
        # When the join keeps both key columns, rows that exist only in the
        # right-hand frame have a null left key. Merge the two key columns
        # before dropping the right one; dropping it directly would leave
        # those rows without a table name.
        right_table_name = f"table_name{suffix}"
        if right_table_name in joined_status.columns:
            joined_status = joined_status.with_columns(
                pl.coalesce(
                    pl.col("table_name"), pl.col(right_table_name)
                ).alias("table_name")
            ).drop(right_table_name)
else:
    joined_status = None

# Cell output: (rows, columns) of the combined table.
joined_status.shape if joined_status is not None else (0, 0)

In [None]:
# Build display_df: the combined status table sorted by table_name, followed by
# four summary rows ('#complete', '#partial', '#incomplete', '#missing') that
# hold, per folder column, the number of tables with that status.
# display_df is None when there is no combined table to summarise.
if joined_status is not None:
    joined_status = joined_status.sort("table_name")

    # Every column except the key holds one folder's statuses.
    value_columns = [col for col in joined_status.columns if col != "table_name"]

    # Cast to String up front: the comparisons below are against string values
    # and the summary rows appended at the end are strings as well.
    joined_status = joined_status.with_columns(
        [pl.col(col).cast(pl.String) for col in value_columns]
    )

    # One summary row per status value; counts are stored as strings so the
    # rows can be stacked under the string-typed status columns.
    summary_rows = [
        {
            "table_name": f"#{value}",
            **{
                col: str((joined_status[col] == value).sum())
                for col in value_columns
            },
        }
        for value in ("complete", "partial", "incomplete")
    ]

    # "Missing" counts null entries only, i.e. tables for which the folder
    # supplied no status at all.
    summary_rows.append(
        {
            "table_name": "#missing",
            **{
                col: str(joined_status[col].is_null().sum())
                for col in value_columns
            },
        }
    )

    # Match the column order of joined_status so the frames can be stacked.
    summary_df = pl.DataFrame(summary_rows).select(joined_status.columns)

    display_df = pl.concat([joined_status, summary_df])
else:
    # Keep the name bound so later cells fail on a clear None instead of a
    # NameError.
    print("No status tables were loaded; nothing to summarise.")
    display_df = None

In [None]:
# Show the combined table (per-table status rows followed by the '#...' summary
# rows) as this cell's output.
display_df

In [None]:
# Export the combined status table as CSV. The path is relative to the current
# working directory; the output folder is created first so the write does not
# fail when it does not exist yet.
output_path = Path("../output/overall_status_by_site.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
display_df.write_csv(output_path)

<!-- unaggregated  -->