# SEC-CCM Unified Runner (Colab-first)

This notebook unifies the old CCM merge and SEC filing workflows.

Execution order (locked):
1. Build/reuse CCM daily panel
2. Parse/merge SEC filings
3. Run SEC-CCM pre-merge (doc grain)
4. Run gated item extraction using matched `doc_id` allowlist
5. Run diagnostics (run report + unmatched + no-item + boundary)


In [None]:
from __future__ import annotations

import os
import sys
from pathlib import Path

# Treat the session as Colab when the google.colab package is already loaded.
IN_COLAB = "google.colab" in sys.modules
if IN_COLAB:
    # Only imported on the Colab branch; mounts Drive under /content/drive.
    from google.colab import drive

    drive.mount("/content/drive", force_remount=False)

# Project root is the resolved working directory; when a src/ folder exists
# there it is placed first on sys.path (once) so local packages can be imported.
ROOT = Path.cwd().resolve()
SRC = ROOT.joinpath("src")
if SRC.exists():
    if str(SRC) not in sys.path:
        sys.path.insert(0, str(SRC))

print(dict(IN_COLAB=IN_COLAB, ROOT=str(ROOT), SRC_EXISTS=SRC.exists()))


In [None]:
import polars as pl
from pathlib import Path

from thesis_pkg.pipeline import (
    SecCcmJoinSpecV1,
    build_or_reuse_ccm_daily_stage,
    run_sec_ccm_premerge_pipeline,
)
from thesis_pkg.filing_text import (
    process_zip_year_raw_text,
    process_zip_year,
    merge_yearly_batches,
    summarize_year_parquets,
    build_light_metadata_dataset,
    process_year_dir_extract_items_gated,
    compute_no_item_diagnostics,
)
from thesis_pkg.core.sec.suspicious_boundary_diagnostics import (
    DiagnosticsConfig,
    parse_focus_items,
    run_boundary_diagnostics,
)


## Config

Edit these toggles and paths first.


In [None]:
# --- Stage toggles ----------------------------------------------------------
# Each RUN_* flag gates one section of the notebook below.
RUN_CCM_MODE = "REUSE"  # REBUILD | REUSE
RUN_SEC_PARSE = False
RUN_SEC_YEARLY_MERGE = True
RUN_SEC_CCM_PREMERGE = True
RUN_GATED_ITEM_EXTRACTION = True
RUN_UNMATCHED_DIAGNOSTIC_TRACK = False
RUN_NO_ITEM_DIAGNOSTICS = True
RUN_BOUNDARY_DIAGNOSTICS = True
RUN_VALIDATION_CHECKS = True

SEC_PARSE_MODE = "parsed"  # raw | parsed
YEARS = list(range(1993, 2025))  # 1993 through 2024 inclusive
ITEM_EXTRACTION_REGIME = "legacy"

# --- Data root --------------------------------------------------------------
# The built-in defaults are environment-specific (a Drive folder on Colab, a
# fixed Windows user path locally). Setting the SEC_CCM_WORK_ROOT environment
# variable overrides both, so the notebook can run on another machine without
# editing this cell. When the variable is unset or blank the defaults apply.
if IN_COLAB:
    _DEFAULT_WORK_ROOT = Path("/content/drive/MyDrive/Data_LM")
else:
    _DEFAULT_WORK_ROOT = Path("C:/Users/erik9/Documents/SEC_Data")
_work_root_override = os.environ.get("SEC_CCM_WORK_ROOT", "").strip()
WORK_ROOT = Path(_work_root_override).expanduser() if _work_root_override else _DEFAULT_WORK_ROOT

# --- SEC inputs / intermediate outputs (under WORK_ROOT) ---------------------
SEC_ZIP_DIR = WORK_ROOT / "Data" / "Sample_Filings"
SEC_BATCH_ROOT = WORK_ROOT / "Data" / "Sample_Filings" / "parquet_batches"
SEC_YEAR_MERGED_DIR = SEC_BATCH_ROOT / "_year_merged"
SEC_LIGHT_METADATA_PATH = WORK_ROOT / "Data" / "Sample_Filings" / "filings_metadata_LIGHT.parquet"

# --- CCM inputs (under WORK_ROOT) --------------------------------------------
CCM_BASE_DIR = WORK_ROOT / "Data" / "CRSP_Compustat_data" / "parquet_data"
CCM_DERIVED_DIR = WORK_ROOT / "Data" / "CRSP_Compustat_data" / "derived_data"
CCM_REUSE_DAILY_PATH = WORK_ROOT / "Data" / "Sample_CCM" / "final_flagged_data_compdesc_added.sample_3pct_seed42_compdesc.parquet"

# --- Run outputs (under the project ROOT, not WORK_ROOT) ---------------------
RUN_ROOT = ROOT / "results" / "sec_ccm_unified_runner"
SEC_CCM_OUTPUT_DIR = RUN_ROOT / "sec_ccm_premerge"
SEC_ITEMS_ANALYSIS_DIR = RUN_ROOT / "items_analysis"
SEC_ITEMS_DIAGNOSTIC_DIR = RUN_ROOT / "items_diagnostic"
SEC_NO_ITEM_DIR = RUN_ROOT / "no_item_diagnostics"
BOUNDARY_OUT_DIR = RUN_ROOT / "boundary_diagnostics"
BOUNDARY_INPUT_DIR = BOUNDARY_OUT_DIR / "matched_filings_input"

# --- Scratch directories -----------------------------------------------------
# On Colab these live directly under /content; locally they sit under ROOT/.tmp.
if IN_COLAB:
    LOCAL_TMP = Path("/content/_tmp_zip")
    LOCAL_WORK = Path("/content/_batch_work")
    LOCAL_ITEM_WORK = Path("/content/_item_work")
    LOCAL_MERGE_WORK = Path("/content/_merge_work")
else:
    LOCAL_TMP = ROOT / ".tmp" / "zip"
    LOCAL_WORK = ROOT / ".tmp" / "batch_work"
    LOCAL_ITEM_WORK = ROOT / ".tmp" / "item_work"
    LOCAL_MERGE_WORK = ROOT / ".tmp" / "merge_work"

# Create every output/scratch directory up front so later cells can write
# without checking. Input-only locations (SEC_ZIP_DIR, CCM_*) are not created.
for p in [
    SEC_BATCH_ROOT,
    SEC_YEAR_MERGED_DIR,
    RUN_ROOT,
    SEC_CCM_OUTPUT_DIR,
    SEC_ITEMS_ANALYSIS_DIR,
    SEC_ITEMS_DIAGNOSTIC_DIR,
    SEC_NO_ITEM_DIR,
    BOUNDARY_OUT_DIR,
    BOUNDARY_INPUT_DIR,
    LOCAL_TMP,
    LOCAL_WORK,
    LOCAL_ITEM_WORK,
    LOCAL_MERGE_WORK,
]:
    p.mkdir(parents=True, exist_ok=True)

# Form types passed to the CCM stage, and the daily columns handed to the
# SEC-CCM join spec (RET is the only feature required to be non-null).
FORMS_10K_10Q = ["10-K", "10-K/A", "10-KA", "10-Q", "10-Q/A", "10-QA", "10-KT", "10-KT/A", "10-QT", "10-QT/A", "10-K405"]
DAILY_FEATURE_COLUMNS = ("RET", "RETX", "PRC", "BIDLO", "ASKHI", "VOL")
REQUIRED_DAILY_NON_NULL_FEATURES = ("RET",)

print({"RUN_CCM_MODE": RUN_CCM_MODE, "WORK_ROOT": str(WORK_ROOT), "RUN_ROOT": str(RUN_ROOT)})


## 1) CCM stage (build or reuse)


In [None]:
# Build the CCM daily panel from scratch or reuse an existing one, depending
# on RUN_CCM_MODE; the helper returns a mapping of artifact name -> path.
ccm_stage_paths = build_or_reuse_ccm_daily_stage(
    run_mode=RUN_CCM_MODE,
    ccm_base_dir=CCM_BASE_DIR,
    ccm_derived_dir=CCM_DERIVED_DIR,
    ccm_reuse_daily_path=Path(CCM_REUSE_DAILY_PATH),
    forms_10k_10q=FORMS_10K_10Q,
    start_date="1990-01-01",
    canonical_name="canonical_link_table.parquet",
    daily_name="final_flagged_data_compdesc_added.parquet",
    verbose=1,
)

# Pull out the two artifacts the rest of the notebook consumes.
ccm_daily_path, canonical_link_path = (
    ccm_stage_paths["ccm_daily_path"],
    ccm_stage_paths["canonical_link_path"],
)

# Lazy handle on the daily panel; only the row count is materialised here.
ccm_daily_lf = pl.scan_parquet(ccm_daily_path)
print(
    {
        "ccm_daily_path": str(ccm_daily_path),
        "canonical_link_path": str(canonical_link_path),
        "rows": ccm_daily_lf.select(pl.len()).collect().item(),
    }
)


## 2) Build link universe + trading calendar


In [None]:
def _first_existing(schema: pl.Schema, candidates: tuple[str, ...], label: str) -> str:
    """Return the first entry of ``candidates`` that is present in ``schema``.

    Args:
        schema: Container supporting ``in`` lookups by column name.
        candidates: Column names to try, in priority order.
        label: Prefix used in the error message to identify the source.

    Raises:
        ValueError: If none of the candidates is present in ``schema``.
    """
    present = [name for name in candidates if name in schema]
    if not present:
        raise ValueError(f"{label} missing candidates: {list(candidates)}")
    # Candidate order is preserved, so the head is the highest-priority match.
    return present[0]


# Resolve which physical column names the CCM daily panel uses for the
# security identifier and the calendar date.
schema = ccm_daily_lf.collect_schema()
resolved_permno_col = _first_existing(schema, ("KYPERMNO", "LPERMNO", "PERMNO"), "ccm_daily")
resolved_date_col = _first_existing(schema, ("CALDT", "caldt"), "ccm_daily")

link_universe_lf = pl.scan_parquet(canonical_link_path)

# Trading calendar: the distinct, non-null dates of the daily panel, sorted
# ascending and exposed under the canonical column name "CALDT". Values that
# cannot be cast to Date become null (strict=False) and are dropped.
caldt_expr = pl.col(resolved_date_col).cast(pl.Date, strict=False).alias("CALDT")
trading_calendar_lf = (
    ccm_daily_lf.select(caldt_expr)
    .drop_nulls(subset=["CALDT"])
    .unique()
    .sort("CALDT")
)

print(
    {
        "permno_col": resolved_permno_col,
        "date_col": resolved_date_col,
        "canonical_link_path": str(canonical_link_path),
    }
)
print(
    {
        "link_rows": link_universe_lf.select(pl.len()).collect().item(),
        "trading_days": trading_calendar_lf.select(pl.len()).collect().item(),
    }
)


## 3) SEC parse and yearly merge


In [None]:
if RUN_SEC_PARSE:
    # Fail fast on a mistyped mode. Without this check any value other than
    # "raw" (e.g. "Raw" or "parse") would silently run the parsed pipeline.
    if SEC_PARSE_MODE not in ("raw", "parsed"):
        raise ValueError(f"SEC_PARSE_MODE must be 'raw' or 'parsed', got {SEC_PARSE_MODE!r}")

    # Keyword arguments shared by both parser entry points.
    common = dict(tmp_dir=LOCAL_TMP, local_work_dir=LOCAL_WORK, compression="zstd", copy_retries=5, copy_sleep=2.0, validate_on_copy=True)
    for year in YEARS:
        zip_path = SEC_ZIP_DIR / f"{year}.zip"
        if not zip_path.exists():
            # No archive for this year; nothing to parse.
            continue
        out_year = SEC_BATCH_ROOT / str(year)
        out_year.mkdir(parents=True, exist_ok=True)
        # A year is treated as already parsed as soon as any batch file exists.
        existing = list(out_year.glob(f"{year}_batch_*.parquet"))
        if existing:
            continue
        if SEC_PARSE_MODE == "raw":
            process_zip_year_raw_text(zip_path=zip_path, out_dir=out_year, batch_max_rows=1000, batch_max_text_bytes=250 * 1024 * 1024, encoding="utf-8", **common)
        else:
            process_zip_year(zip_path=zip_path, out_dir=out_year, batch_max_rows=2000, batch_max_text_bytes=250 * 1024 * 1024, header_search_limit=8000, encoding="utf-8", **common)
else:
    print("RUN_SEC_PARSE=False; using existing SEC batches.")

if RUN_SEC_YEARLY_MERGE:
    # Merge per-year batch files into one parquet per year under SEC_YEAR_MERGED_DIR.
    merge_yearly_batches(batch_dir=SEC_BATCH_ROOT, out_dir=SEC_YEAR_MERGED_DIR, checkpoint_path=SEC_YEAR_MERGED_DIR / "done_years.json", local_work_dir=LOCAL_MERGE_WORK, batch_size=128_000, compression="zstd", compression_level=1, validate_inputs="full", years=[str(y) for y in YEARS])

# The summary and light-metadata steps run regardless of the toggles above.
# Only yearly files whose summary record has status "OK" feed the metadata set.
sec_summaries = summarize_year_parquets(SEC_YEAR_MERGED_DIR)
ok_files = [Path(r["path"]) for r in sec_summaries if r.get("status") == "OK"]
if not ok_files:
    raise ValueError("No OK SEC yearly parquet files found.")
build_light_metadata_dataset(parquet_dir=ok_files, out_path=SEC_LIGHT_METADATA_PATH, drop_columns=("full_text",), sort_columns=("file_date_filename", "cik"), compression="zstd")
print({"ok_year_files": len(ok_files), "light_path": str(SEC_LIGHT_METADATA_PATH)})


## 4) Prepare SEC pre-merge input

Required columns: `doc_id`, `cik_10`, `filing_date`


In [None]:
# Yearly merged files are named "<YYYY>.parquet"; anything else in the folder
# (e.g. the checkpoint JSON or non-year parquet files) is ignored.
year_files = sorted([p for p in SEC_YEAR_MERGED_DIR.glob("*.parquet") if p.stem.isdigit() and len(p.stem) == 4])
if not year_files:
    raise ValueError(f"No yearly SEC files found in {SEC_YEAR_MERGED_DIR}")

sec_raw_lf = pl.scan_parquet(year_files)
sec_schema = sec_raw_lf.collect_schema()
for c in ("doc_id", "cik_10"):
    if c not in sec_schema:
        raise ValueError(f"Missing required SEC column: {c}")

# filing_date: prefer the explicit column and fall back to the filename-derived
# date. Values that cannot be cast to Date become null (strict=False) and are
# caught by the null check below.
if "filing_date" in sec_schema and "file_date_filename" in sec_schema:
    filing_date_expr = pl.coalesce([pl.col("filing_date").cast(pl.Date, strict=False), pl.col("file_date_filename").cast(pl.Date, strict=False)]).alias("filing_date")
elif "filing_date" in sec_schema:
    filing_date_expr = pl.col("filing_date").cast(pl.Date, strict=False).alias("filing_date")
elif "file_date_filename" in sec_schema:
    filing_date_expr = pl.col("file_date_filename").cast(pl.Date, strict=False).alias("filing_date")
else:
    raise ValueError("Missing both filing_date and file_date_filename.")

# Optional columns are carried through only when the source files have them.
optional_cols = [c for c in ("document_type_filename", "form_type", "period_end", "acceptance_datetime", "accession_number", "accession_nodash") if c in sec_schema]
sec_premerge_input_lf = sec_raw_lf.with_columns(pl.col("doc_id").cast(pl.Utf8, strict=False), pl.col("cik_10").cast(pl.Utf8, strict=False), filing_date_expr).select("doc_id", "cik_10", "filing_date", *optional_cols)

# Compute the null-date count, row count and distinct doc_id count in a single
# query so the yearly files are scanned once instead of three times.
premerge_stats = sec_premerge_input_lf.select(
    pl.col("filing_date").is_null().sum().alias("null_dates"),
    pl.len().alias("rows"),
    pl.col("doc_id").n_unique().alias("doc_ids"),
).collect().row(0, named=True)

null_dates = premerge_stats["null_dates"]
if null_dates > 0:
    raise ValueError(f"Null filing_date rows after fallback: {null_dates}")

print({"rows": premerge_stats["rows"], "doc_ids": premerge_stats["doc_ids"], "optional_cols": optional_cols})


## 5) SEC-CCM pre-merge (must run before extraction)


In [None]:
# Stays None when the pre-merge stage is disabled; later cells test for that.
sec_ccm_paths: dict[str, Path] | None = None

if RUN_SEC_CCM_PREMERGE:
    # Join configuration: the daily join is enabled and reads from the merged
    # daily panel, using the column names resolved in section 2.
    join_spec = SecCcmJoinSpecV1(
        alignment_policy="NEXT_TRADING_DAY_STRICT",
        daily_join_enabled=True,
        daily_join_source="MERGED_DAILY_PANEL",
        daily_permno_col=resolved_permno_col,
        daily_date_col=resolved_date_col,
        daily_feature_columns=tuple(DAILY_FEATURE_COLUMNS),
        required_daily_non_null_features=tuple(REQUIRED_DAILY_NON_NULL_FEATURES),
    )

    # The pipeline returns a mapping of artifact name -> output path.
    sec_ccm_paths = run_sec_ccm_premerge_pipeline(
        sec_filings_lf=sec_premerge_input_lf,
        link_universe_lf=link_universe_lf,
        trading_calendar_lf=trading_calendar_lf,
        output_dir=SEC_CCM_OUTPUT_DIR,
        daily_lf=ccm_daily_lf,
        join_spec=join_spec,
        emit_run_report=True,
    )

    # List every produced artifact in alphabetical order of its name.
    for artifact_name, artifact_path in sorted(sec_ccm_paths.items()):
        print(f"{artifact_name}: {artifact_path}")


In [None]:
if sec_ccm_paths is not None:
    ms = pl.read_parquet(sec_ccm_paths["sec_ccm_match_status"])

    # Document counts per match reason, most frequent first.
    reason_counts = ms.group_by("match_reason_code").agg(pl.len().alias("n_docs")).sort("n_docs", descending=True)
    print(reason_counts)

    total = ms.height
    # Both flag columns are cast to Int64 and summed to count the set rows.
    matched, acceptance = (
        int(ms.select(pl.col(flag_col).cast(pl.Int64).sum()).item())
        for flag_col in ("match_flag", "has_acceptance_datetime")
    )

    # Rates fall back to 0.0 for an empty table to avoid dividing by zero.
    matched_rate = (matched / total) if total else 0.0
    acceptance_coverage = (acceptance / total) if total else 0.0
    print(
        {
            "total_docs": total,
            "matched_docs": matched,
            "matched_rate": matched_rate,
            "acceptance_coverage": acceptance_coverage,
        }
    )

    # Report/DAG artifacts are optional, hence .get() (prints None if absent).
    for label, path_key in (
        ("run_report", "sec_ccm_run_report"),
        ("run_dag_mermaid", "sec_ccm_run_dag_mermaid"),
        ("run_dag_dot", "sec_ccm_run_dag_dot"),
    ):
        print(f"{label}:", sec_ccm_paths.get(path_key))


## 6) Gated item extraction (matched-first)


In [None]:
analysis_item_paths: list[Path] = []
diagnostic_item_paths: list[Path] = []


def _extract_gated_items(out_dir: Path, allowlist_key: str) -> list[Path]:
    """Run gated item extraction for one allowlist and return the callee's result.

    Both tracks (analysis and diagnostic) use identical settings; only the
    output directory and the allowlist artifact looked up in ``sec_ccm_paths``
    differ.
    """
    # NOTE(review): `regime=True` is passed next to `extraction_regime`;
    # confirm against the callee that both are intended.
    return process_year_dir_extract_items_gated(
        year_dir=SEC_YEAR_MERGED_DIR,
        out_dir=out_dir,
        doc_id_allowlist=sec_ccm_paths[allowlist_key],
        years=[str(y) for y in YEARS],
        parquet_batch_rows=16,
        out_batch_max_rows=50_000,
        out_batch_max_text_bytes=250 * 1024 * 1024,
        tmp_dir=LOCAL_TMP,
        compression="zstd",
        local_work_dir=LOCAL_ITEM_WORK,
        non_item_diagnostic=False,
        include_full_text=False,
        regime=True,
        extraction_regime=ITEM_EXTRACTION_REGIME,
    )


if RUN_GATED_ITEM_EXTRACTION:
    # Extraction is gated on the pre-merge allowlists, so that stage must have run.
    if sec_ccm_paths is None:
        raise RuntimeError("Run SEC-CCM pre-merge first.")

    # Matched-first: the analysis allowlist is always processed.
    analysis_item_paths = _extract_gated_items(SEC_ITEMS_ANALYSIS_DIR, "sec_ccm_analysis_doc_ids")
    print({"analysis_year_files": len(analysis_item_paths)})

    # The diagnostic allowlist is processed only when its toggle is on.
    if RUN_UNMATCHED_DIAGNOSTIC_TRACK:
        diagnostic_item_paths = _extract_gated_items(SEC_ITEMS_DIAGNOSTIC_DIR, "sec_ccm_diagnostic_doc_ids")
        print({"diagnostic_year_files": len(diagnostic_item_paths)})


## 7) No-item diagnostics + boundary diagnostics


In [None]:
# One (year, no-item parquet, stats csv) triple per processed year file.
analysis_no_item: list[tuple[str, Path, Path]] = []

if RUN_NO_ITEM_DIAGNOSTICS and RUN_GATED_ITEM_EXTRACTION:
    no_item_out_dir = SEC_NO_ITEM_DIR.joinpath("analysis")
    no_item_out_dir.mkdir(parents=True, exist_ok=True)

    for items_parquet in analysis_item_paths:
        # The item file's stem is used as the year label to locate the
        # matching merged filings file; years without one are skipped.
        year_label = items_parquet.stem
        filings_parquet = SEC_YEAR_MERGED_DIR / f"{year_label}.parquet"
        if filings_parquet.exists():
            no_item_parquet = no_item_out_dir / f"{year_label}_no_item_filings.parquet"
            stats_csv = no_item_out_dir / f"{year_label}_no_item_stats.csv"
            compute_no_item_diagnostics(filings_parquet, items_parquet, no_item_parquet, stats_csv, include_full_text=False)
            analysis_no_item.append((year_label, no_item_parquet, stats_csv))

print({"analysis_no_item_years": len(analysis_no_item)})

boundary_results = None

if RUN_BOUNDARY_DIAGNOSTICS:
    if sec_ccm_paths is None:
        raise RuntimeError("Run SEC-CCM pre-merge first.")

    # Distinct, non-null doc_ids of the analysis allowlist.
    allow_lf = (
        pl.scan_parquet(sec_ccm_paths["sec_ccm_analysis_doc_ids"])
        .select(pl.col("doc_id").cast(pl.Utf8))
        .drop_nulls(subset=["doc_id"])
        .unique(subset=["doc_id"])
    )

    # Stage one filtered copy per available year: the semi-join keeps only
    # filings whose doc_id is on the allowlist.
    # NOTE(review): files staged by an earlier run for years no longer in YEARS
    # are not removed from BOUNDARY_INPUT_DIR; confirm whether the diagnostics
    # should see them.
    available_year_files = [
        year_file
        for year_file in (SEC_YEAR_MERGED_DIR / f"{y}.parquet" for y in YEARS)
        if year_file.exists()
    ]
    staged = 0
    for year_file in available_year_files:
        staged_file = BOUNDARY_INPUT_DIR / year_file.name
        pl.scan_parquet(year_file).join(allow_lf, on="doc_id", how="semi").sink_parquet(staged_file, compression="zstd")
        staged += 1

    # NOTE(review): the regimes below are fixed to "v2" while gated extraction
    # uses ITEM_EXTRACTION_REGIME; confirm the difference is intended.
    diag_config = DiagnosticsConfig(
        parquet_dir=BOUNDARY_INPUT_DIR,
        out_path=BOUNDARY_OUT_DIR / "suspicious_boundaries_matched.csv",
        report_path=BOUNDARY_OUT_DIR / "suspicious_boundaries_matched_report.txt",
        samples_dir=BOUNDARY_OUT_DIR / "samples",
        batch_size=8,
        max_files=0,
        max_examples=50,
        emit_manifest=True,
        manifest_items_path=BOUNDARY_OUT_DIR / "manifest_items.csv",
        manifest_filings_path=BOUNDARY_OUT_DIR / "manifest_filings.csv",
        sample_pass=100,
        sample_seed=42,
        sample_filings_path=BOUNDARY_OUT_DIR / "sample_filings.csv",
        sample_items_path=BOUNDARY_OUT_DIR / "sample_items.csv",
        emit_html=True,
        html_out=BOUNDARY_OUT_DIR / "html",
        html_scope="sample",
        extraction_regime="v2",
        diagnostics_regime="v2",
        target_set="cohen2020_common",
        focus_items=parse_focus_items(None),
        report_item_scope="target",
    )

    print({"boundary_staged_year_files": staged})
    boundary_results = run_boundary_diagnostics(diag_config)
    print(boundary_results)


## Failure modes and diagnostics checklist

- Missing SEC pre-merge columns (`doc_id`, `cik_10`, date fields) -> fail fast with explicit error.
- Missing/invalid `filing_date` after fallback -> fail fast.
- Weak or empty CCM link-universe coverage -> inspect link-universe counts before pre-merge.
- Non-unique `doc_id` values -> caught by the match-status row-count and uniqueness checks in section 8.
- CCM daily schema mismatch (permno/date/features) -> explicit source column resolution and validation.
- Low match rate -> inspect `sec_ccm_unmatched_diagnostics.parquet` and `sec_ccm_run_report.md`.
- Empty gated extraction unexpectedly -> compare allowlist counts vs extracted doc_id counts.
- Boundary diagnostics runtime too high -> reduce the `YEARS` scope or adjust the `max_files` / `sample_pass` settings in `DiagnosticsConfig`.


## Notebook test scenarios

1. Smoke run in `REUSE` mode on sample data: verify SEC-CCM artifacts and run report/DAG files exist.
2. Functional run in `REBUILD` mode: verify CCM daily parquet is produced and consumed by pre-merge.
3. Gating correctness: verify extracted analysis doc_ids are a subset of analysis allowlist.
4. Pre-merge invariants: one row per `doc_id` in `sec_ccm_match_status.parquet`.
5. Diagnostics presence: boundary CSV/report/HTML outputs exist when enabled.
6. Schema checks: in `final_flagged_data`, `kypermno` is Int32 and `data_status` is UInt64 with no nulls.


## 8) Validation + artifact index


In [None]:
# One row per validation that passed; a failing check raises AssertionError
# before its row would be recorded.
validation_rows: list[dict[str, object]] = []


def _record_check(check: str, details: str) -> None:
    """Append a passed-check row to ``validation_rows``."""
    validation_rows.append({"check": check, "ok": True, "details": details})


if RUN_VALIDATION_CHECKS and sec_ccm_paths is not None:
    # --- Pre-merge invariants: match-status has one row per input doc_id. ---
    pre = sec_premerge_input_lf.select(pl.len().alias("rows"), pl.col("doc_id").n_unique().alias("uniq")).collect().row(0, named=True)
    ms_lf = pl.scan_parquet(sec_ccm_paths["sec_ccm_match_status"])
    # Distinct names (ms_counts / final_schema) keep this cell from overwriting
    # the `ms` DataFrame and CCM `schema` bound by earlier cells.
    ms_counts = ms_lf.select(pl.len().alias("rows"), pl.col("doc_id").n_unique().alias("uniq")).collect().row(0, named=True)
    if pre["rows"] != ms_counts["rows"]:
        raise AssertionError(f"premerge rows {pre['rows']} != match_status rows {ms_counts['rows']}")
    _record_check("premerge_vs_match_status_rows", f"rows={pre['rows']}")
    if ms_counts["rows"] != ms_counts["uniq"]:
        raise AssertionError("sec_ccm_match_status is not unique on doc_id")
    _record_check("match_status_doc_id_unique", f"unique={ms_counts['uniq']}")

    # --- Schema checks on the final flagged artifact (scanned once). ---------
    final_lf = pl.scan_parquet(sec_ccm_paths["final_flagged_data"])
    final_schema = final_lf.collect_schema()
    if final_schema.get("kypermno") != pl.Int32:
        raise AssertionError(f"kypermno dtype not Int32: {final_schema.get('kypermno')}")
    _record_check("final_kypermno_dtype_int32", f"dtype={final_schema.get('kypermno')}")
    if final_schema.get("data_status") != pl.UInt64:
        raise AssertionError(f"data_status dtype not UInt64: {final_schema.get('data_status')}")
    _record_check("final_data_status_dtype_uint64", f"dtype={final_schema.get('data_status')}")
    null_status = final_lf.select(pl.col("data_status").is_null().sum()).collect().item()
    if null_status != 0:
        raise AssertionError(f"data_status null count: {null_status}")
    _record_check("final_data_status_non_null", "nulls=0")

if RUN_VALIDATION_CHECKS and RUN_GATED_ITEM_EXTRACTION and analysis_item_paths and sec_ccm_paths is not None:
    # --- Gating correctness: extracted doc_ids must all be on the allowlist. -
    allow_lf = pl.scan_parquet(sec_ccm_paths["sec_ccm_analysis_doc_ids"]).select(pl.col("doc_id").cast(pl.Utf8)).drop_nulls(subset=["doc_id"]).unique(subset=["doc_id"])
    extracted_lf = pl.scan_parquet([str(p) for p in analysis_item_paths]).select(pl.col("doc_id").cast(pl.Utf8)).drop_nulls(subset=["doc_id"]).unique(subset=["doc_id"])
    # The anti-join leaves only extracted doc_ids that are absent from the allowlist.
    outside = extracted_lf.join(allow_lf, on="doc_id", how="anti").select(pl.len()).collect().item()
    if outside != 0:
        raise AssertionError(f"Extracted doc_ids outside analysis allowlist: {outside}")
    _record_check("analysis_items_subset_allowlist", "outside=0")

print(pl.DataFrame(validation_rows) if validation_rows else "No validations executed")


def _row_count(path: Path) -> int | None:
    """Return the number of rows in a parquet file, or ``None`` if not applicable.

    ``None`` is returned when ``path`` does not exist or when its suffix is not
    ``.parquet`` (compared case-insensitively); otherwise the file is scanned
    lazily and only the row count is collected.
    """
    has_parquet_suffix = path.suffix.lower() == ".parquet"
    if path.exists() and has_parquet_suffix:
        n_rows = pl.scan_parquet(path).select(pl.len()).collect().item()
        return int(n_rows)
    return None

# Accumulates one record per indexed artifact; filled through _add().
artifact_rows: list[dict[str, object]] = []


def _add(stage: str, key: str, path: Path) -> None:
    """Append an artifact-index record for ``path`` to ``artifact_rows``.

    The record stores the stage, the artifact key, the path as a string,
    whether the path exists, and the value returned by ``_row_count(path)``.
    """
    record = {
        "stage": stage,
        "artifact": key,
        "path": str(path),
        "exists": path.exists(),
        "rows": _row_count(path),
    }
    artifact_rows.append(record)

# --- CCM stage ---------------------------------------------------------------
# NOTE(review): ccm_daily_path is passed to _add() without a Path() wrapper,
# unlike the sec_ccm entries below; confirm the CCM stage returns a Path.
if ccm_daily_path is not None:
    _add("ccm", "ccm_daily_path", ccm_daily_path)

# --- SEC-CCM pre-merge artifacts, in alphabetical order of their key ---------
if sec_ccm_paths is not None:
    for artifact_key, artifact_path in sorted(sec_ccm_paths.items()):
        _add("sec_ccm", artifact_key, Path(artifact_path))

# --- Extracted item files (analysis track first, then diagnostic track) ------
for stage_name, item_paths in (
    ("items_analysis", analysis_item_paths),
    ("items_diagnostic", diagnostic_item_paths),
):
    for item_path in item_paths:
        _add(stage_name, item_path.stem, item_path)

# --- No-item diagnostics: one parquet and one CSV per year -------------------
for year_label, no_item_parquet, no_item_csv in analysis_no_item:
    _add("no_item_analysis", f"{year_label}_no_item_filings", no_item_parquet)
    _add("no_item_analysis", f"{year_label}_no_item_stats", no_item_csv)

# --- Boundary diagnostics: fixed output names under BOUNDARY_OUT_DIR ---------
boundary_artifacts = (
    ("boundary_csv", "suspicious_boundaries_matched.csv"),
    ("boundary_report", "suspicious_boundaries_matched_report.txt"),
    ("boundary_manifest_items", "manifest_items.csv"),
    ("boundary_manifest_filings", "manifest_filings.csv"),
    ("boundary_html", "html"),
)
for artifact_key, relative_name in boundary_artifacts:
    _add("boundary", artifact_key, BOUNDARY_OUT_DIR / relative_name)

if artifact_rows:
    print(pl.DataFrame(artifact_rows).sort(["stage", "artifact"]))
else:
    print("No artifacts indexed")
