# SEC pipeline runner

Run the core steps in `thesis_pkg.pipelines.sec_pipeline` on a dataset of yearly SEC filing ZIPs.

Update the paths in the setup cell (the one marked `# ---- Paths (edit these) ----`) before running.


In [None]:
from __future__ import annotations

import sys
from pathlib import Path


In [None]:
# Make local `src` importable when running from a repo checkout.
# `<cwd>/src` is preferred. If it does not exist, fall back to the nearest
# parent directory that contains `src/thesis_pkg`, so the notebook also works
# when the kernel is started from a subdirectory of the repository.
ROOT = Path.cwd().resolve()
SRC = ROOT / "src"
if not SRC.exists():
    for _candidate in ROOT.parents:
        if (_candidate / "src" / "thesis_pkg").is_dir():
            ROOT = _candidate
            SRC = ROOT / "src"
            break

if SRC.exists():
    _src_entry = str(SRC)
    # Re-running this cell must not pile up duplicate entries: drop any
    # earlier occurrence, then put `src` at the front so it takes priority.
    sys.path[:] = [entry for entry in sys.path if entry != _src_entry]
    sys.path.insert(0, _src_entry)

from thesis_pkg.filing_text import (
    build_light_metadata_dataset,
    merge_yearly_batches,
    process_zip_year,
    process_zip_year_raw_text,
    process_year_dir_extract_items,
    summarize_year_parquets,
)


In [None]:
# ---- Paths (edit these) ----
# NOTE: this is a raw string, so backslashes must be written singly; doubling
# them would put literal double separators into the path.
zip_dir = Path(r"C:\path\to\sec_zip_dir")  # directory of yearly .zip archives
batch_dir = zip_dir / "parquet_batches"  # Step 1 writes parquet batches here
merge_dir = zip_dir / "year_merged"  # Step 2 writes merged yearly files here
tmp_dir = batch_dir / "_tmp"  # scratch directory handed to the Step 1 processors
light_path = zip_dir / "filings_metadata_LIGHT.parquet"  # Step 4 output file
checkpoint = merge_dir / "done_years.json"  # passed to Step 2 as `checkpoint_path`

# ---- Tuning ----
batch_max_rows = 1000  # forwarded to Step 1 as `batch_max_rows`
batch_max_text_bytes = 250 * 1024 * 1024  # 250 MiB, forwarded to Step 1
compression = "zstd"  # forwarded to Steps 1, 2 and 4
compression_level = 1  # forwarded to Step 2 only
sleep_between_years = 0.0  # forwarded to Step 2
# False: Step 1 skips archives that already have batch files.
# True: Step 1 deletes the existing batch files and regenerates them.
overwrite_batches = False
use_parsed_headers = False  # switch to True to run process_zip_year

# Create the output directories up front (no-op when they already exist).
batch_dir.mkdir(parents=True, exist_ok=True)
merge_dir.mkdir(parents=True, exist_ok=True)
tmp_dir.mkdir(parents=True, exist_ok=True)


In [None]:
# Step 1: convert each yearly zip archive into parquet batch files
# Both processors are called with the same keyword arguments, so pick the
# one to use once, up front.
process_archive = process_zip_year if use_parsed_headers else process_zip_year_raw_text

for zip_path in sorted(zip_dir.glob("*.zip")):
    stale_batches = list(batch_dir.glob(f"{zip_path.stem}_batch_*.parquet"))
    if stale_batches:
        if not overwrite_batches:
            # Batches from an earlier run are kept as-is.
            print(f"[skip] batches exist for {zip_path.name}")
            continue
        # Overwrite requested: clear the old batches before regenerating.
        for batch_file in stale_batches:
            batch_file.unlink()

    print(f"[batch] {zip_path.name}")
    process_archive(
        zip_path=zip_path,
        out_dir=batch_dir,
        batch_max_rows=batch_max_rows,
        batch_max_text_bytes=batch_max_text_bytes,
        tmp_dir=tmp_dir,
        compression=compression,
    )


In [None]:
# Step 2: combine the batch files into per-year parquet files
merge_options = {
    "batch_dir": batch_dir,
    "out_dir": merge_dir,
    "checkpoint_path": checkpoint,
    "batch_size": 32_000,
    "compression": compression,
    "compression_level": compression_level,
    "sleep_between_years": sleep_between_years,
}
merged_paths = merge_yearly_batches(**merge_options)
merged_count = len(merged_paths)
print(f"[merge] merged {merged_count} yearly files into {merge_dir}")


In [None]:
# Step 3: report year, row count and status for each summary entry
summary = summarize_year_parquets(merge_dir)
print("[summary] year rows status")
for year_info in summary:
    year, rows, status = year_info["year"], year_info["rows"], year_info["status"]
    print(f"  {year}: {rows} rows ({status})")


In [None]:
# Step 4: write a metadata-only parquet (the `full_text` column is dropped)
light_options = {
    "parquet_dir": merge_dir,
    "out_path": light_path,
    "drop_columns": ("full_text",),
    "sort_columns": ("file_date_filename", "cik"),
    "compression": compression,
}
build_light_metadata_dataset(**light_options)
print(f"[light] wrote {light_path}")


## Optional: extract items from merged yearly parquets

Uncomment the cell below and run it if you want item-level extraction after merging (i.e. after Step 2).


In [None]:
# items_dir = merge_dir / "items"
# items_dir.mkdir(parents=True, exist_ok=True)
#
# process_year_dir_extract_items(
#     year_dir=merge_dir,
#     out_dir=items_dir,
#     years=None,
#     parquet_batch_rows=16,
#     out_batch_max_rows=50_000,
#     out_batch_max_text_bytes=250 * 1024 * 1024,
#     tmp_dir=tmp_dir,
#     compression=compression,
#     local_work_dir=tmp_dir / "_item_work",
#     non_item_diagnostic=False,
#     include_full_text=False,
#     regime=True,
# )
