# Full Chunking Pipeline (Collections A–D)
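This notebook runs all chunkers in order (A → D) to regenerate `{A-D}_chunks.jsonl` under `data/processed/<collection>/`, plus `{A-D}_docs.jsonl` for the collections whose chunker writes one (the Collection B chunker does not emit `B_docs.jsonl`, and Collection A reuses an existing `A_docs.jsonl` when present).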

In [2]:
from pathlib import Path
from collections import Counter
import json
import sys

# Ensure project scripts are importable
sys.path.insert(0, str(Path('.').resolve()))


def summarize_chunks(chunks: list[dict], label: str) -> None:
    """Print quick stats for a chunk list.

    Prints the chunk count, the average / minimum / maximum length of the
    ``"text"`` field (``len()`` of the value), and the number of distinct
    ``"doc_id"`` values.

    Args:
        chunks: Chunk records (dicts). Records without a ``"text"`` key are
            included in the chunk count but excluded from the length stats.
            Records without a ``"doc_id"`` key are grouped under ``""``.
        label: Prefix printed on the first output line.

    Returns:
        None. All results are written to stdout.
    """
    if not chunks:
        print(f"{label}: no chunks")
        return
    lengths = [len(c["text"]) for c in chunks if "text" in c]
    print(f"{label}: {len(chunks)} chunks")
    if lengths:
        print(f"  avg len: {sum(lengths)/len(lengths):.0f} chars  min: {min(lengths)}  max: {max(lengths)}")
    else:
        # No record carried a "text" field: skip the stats rather than divide
        # by zero or call min()/max() on an empty list.
        print("  avg len: n/a (no chunk has a 'text' field)")
    by_doc = Counter(c.get("doc_id", "") for c in chunks)
    print(f"  source docs: {len(by_doc)}")


def read_jsonl(path: Path) -> list[dict]:
    """Load a JSON Lines file into a list of parsed records.

    Args:
        path: Location of the ``.jsonl`` file, read as UTF-8.

    Returns:
        One parsed JSON value per non-blank line, in file order. An empty
        list is returned when ``path`` does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not path.exists():
        return []
    records = []
    # Iterate the file handle rather than calling str.splitlines(): splitlines()
    # also breaks on characters such as U+2028 / U+0085, which may appear
    # unescaped inside a JSON string and would cut a valid record in half.
    with path.open(encoding="utf-8") as handle:
        for line in handle:
            if line.strip():
                records.append(json.loads(line))
    return records


# Collection A

In [3]:
# Collection A
from scripts.chunkers.chunker_collection_a import (
    chunk_collection as chunk_collection_a,
    build_docs_from_wiki as build_docs_a,
    PROCESSED_DIR as A_PROCESSED,
    RAW_WIKI_DIR as A_RAW_WIKI,
    DOCS_PATH as A_DOCS_PATH,
    CHUNKS_PATH as A_CHUNKS_PATH,
    )

# Stage banner: a 60-character rule, the title, and a second rule.
print("=" * 60, "Chunking Collection A", "=" * 60, sep="\n")

# Reuse the docs file when it is already on disk; otherwise build it via
# build_docs_a() and fail if that call returns nothing.
if A_DOCS_PATH.exists():
    print(f"Found existing docs at {A_DOCS_PATH}")
else:
    print(f"Docs not found at {A_DOCS_PATH}; generating from {A_RAW_WIKI}...")
    A_PROCESSED.mkdir(parents=True, exist_ok=True)
    generated_docs = build_docs_a()
    if not generated_docs:
        raise FileNotFoundError(
            f"Failed to build docs from {A_RAW_WIKI}. Ensure raw wiki markdown exists."
        )

chunks_a = chunk_collection_a("A")

# Report quick stats and where the artifacts are expected to live.
summarize_chunks(chunks_a, "A")
print(
    f"Docs path   : {A_DOCS_PATH}",
    f"Chunks path : {A_CHUNKS_PATH}",
    sep="\n",
)

Chunking Collection A
Found existing docs at data/processed/A/A_docs.jsonl
  ✓  A_wiki_Andrew Carnegie - Wikipedia              66 chunks
  ✓  A_wiki_Andrew Mellon - Wikipedia                41 chunks
  ✓  A_wiki_Association of American Universities - Wikipedia   19 chunks
  ✓  A_wiki_Association of Independent Technological Universities - Wikipedia    4 chunks
  ✓  A_wiki_Astrobotic Technology - Wikipedia        11 chunks
  ✓  A_wiki_Carnegie Library of Pittsburgh - Wikipedia    6 chunks
  ✓  A_wiki_Carnegie Mellon College of Engineering - Wikipedia    5 chunks
  ✓  A_wiki_Carnegie Mellon College of Fine Arts - Wikipedia    4 chunks
  ✓  A_wiki_Carnegie Mellon School of Art - Wikipedia    6 chunks
  ✓  A_wiki_Carnegie Mellon School of Computer Science - Wikipedia   14 chunks
  ✓  A_wiki_Carnegie Mellon School of Design - Wikipedia   11 chunks
  ✓  A_wiki_Carnegie Mellon School of Drama - Wikipedia   10 chunks
  ✓  A_wiki_Carnegie Mellon School of Music - Wikipedia    2 chunks
  ✓  A_w

# Collection B

In [4]:
# Collection B
from scripts.chunkers.chunk_collection_b import chunk_collection_b, PROCESSED_DIR as B_PROCESSED

# Stage banner: a 60-character rule, the title, and a second rule.
print("=" * 60, "Chunking Collection B", "=" * 60, sep="\n")

chunks_b = chunk_collection_b()

# The chunks file name is derived from the chunker's PROCESSED_DIR for reporting.
b_chunks_path = B_PROCESSED / "B_chunks.jsonl"
summarize_chunks(chunks_b, "B")
print(
    f"Chunks path : {b_chunks_path}",
    "(Docs are sourced from processed PDFs; B_docs.jsonl is not emitted by this chunker.)",
    sep="\n",
)

Chunking Collection B
Chunking Collection B
  Tax regulations : 38 chunks (heading-aware)
  Operating budget: 1412 chunks (table-aware)

  → Wrote 1450 total chunks to data/processed/B/B_chunks.jsonl  (1489.9 KB)
     tax regulations : 38
     operating budget: 1412
B: 1450 chunks
  avg len: 574 chars  min: 136  max: 5844
  source docs: 2
Chunks path : data/processed/B/B_chunks.jsonl
(Docs are sourced from processed PDFs; B_docs.jsonl is not emitted by this chunker.)


# Collection C

In [5]:
# Collection C — Pittsburgh events (Firecrawl sources)
from scripts.chunkers.chunk_collection_c_pgh_events import chunk_collection_c, PROCESSED_DIR as C_PROCESSED

# Stage banner: a 60-character rule, the title, and a second rule.
print("=" * 60, "Chunking Collection C — Pittsburgh events", "=" * 60, sep="\n")

chunks_c_pgh = chunk_collection_c()

# Summarize only the chunks returned by this chunker call.
c_chunks_path = C_PROCESSED / "C_chunks.jsonl"
summarize_chunks(chunks_c_pgh, "C (pgh events)")
print(f"Chunks path : {c_chunks_path}")

Chunking Collection C — Pittsburgh events
Chunking Collection C — Pittsburgh Events

[pittsburgh.events]
  ⚠  Short card (1 fields): ['Pittsburgh, PA']
  ✓  pgh_events_march.md                              30 events  →  30 chunks
  ⚠  Short card (1 fields): ['Pittsburgh, PA']
  ✓  pgh_events_april.md                              30 events  →  30 chunks
  ⚠  Short card (1 fields): ['Pittsburgh, PA']
  ✓  pgh_events_may.md                                30 events  →  30 chunks
  ⚠  Short card (1 fields): ['Pittsburgh, PA']
  ✓  pgh_events_june.md                               30 events  →  30 chunks
  ⚠  Short card (1 fields): ['Pittsburgh, PA']
  ✓  pgh_events_july.md                               30 events  →  30 chunks
  ⚠  Short card (1 fields): ['Pittsburgh, PA']
  ✓  pgh_events_august.md                             30 events  →  30 chunks
  ⚠  Short card (1 fields): ['Pittsburgh, PA']
  ✓  pgh_events_september.md                          30 events  →  30 chunks
  ⚠  Short card (1 f

In [6]:
# Collection C — Recurring events (CSV)
from scripts.chunkers.chunk_recurring_events_c import chunk_recurring_events, OUT_CHUNKS as C_REC_CHUNKS

# Stage banner: a 60-character rule, the title, and a second rule.
print("=" * 60, "Chunking Collection C — Recurring events", "=" * 60, sep="\n")

# NOTE(review): append=True is passed here; re-running this cell on its own
# presumably adds the same records again — confirm against the chunker.
chunks_c_rec = chunk_recurring_events(
    csv_path=Path("data/raw/C/recurring_events.csv"),
    source_url="https://www.pittsburghmagazine.com/best-of-the-burgh-listings/",
    append=True,
)

summarize_chunks(chunks_c_rec, "C (recurring)")
print(f"Chunks path : {C_REC_CHUNKS}")

Chunking Collection C — Recurring events
Chunking — Pittsburgh Recurring Events
  ✓  Loaded 370 records from recurring_events.csv

  → Appended 370 chunks to data/processed/C/C_chunks.jsonl  (640.0 KB)
     Free events : 79
     Categories  :
       Other Stuff                      60
       Volunteers                       57
       DJs                              35
       Comedy                           20
       Open Stage                       20
       Literary                         17
       Exhibits                         17
       Jazz                             11
       Outside                          11
       Dance                            10
       Other Music                      10
       Community                        10
       Visual Art                        9
       Trivia                            9
       Kidstuff                          7
       Acoustic                          7
       Fundraisers                       7
       Games              

In [7]:
# Collection C — CMU events
from scripts.chunkers.chunk_collection_c_cmu_events import run as chunk_cmu_events, DEFAULT_CAMPUS_MONTHS, PROCESSED_DIR as C_PROCESSED

# Stage banner: a 60-character rule, the title, and a second rule.
print("=" * 60, "Chunking Collection C — CMU events", "=" * 60, sep="\n")

# NOTE(review): append=True is passed here; re-running this cell on its own
# presumably adds the same records again — confirm against the chunker.
docs_c_cmu, chunks_c_cmu = chunk_cmu_events(
    campus_months=DEFAULT_CAMPUS_MONTHS,
    append=True,
)

summarize_chunks(chunks_c_cmu, "C (CMU events)")
print(
    f"Docs path   : {C_PROCESSED / 'C_docs.jsonl'}",
    f"Chunks path : {C_PROCESSED / 'C_chunks.jsonl'}",
    sep="\n",
)

Chunking Collection C — CMU events
Collection C — CMU Events Chunking

[events.cmu.edu — campus calendar]
  ok  cmu_campus_events_20260301.md                    62 events  ->  62 chunks
  ok  cmu_campus_events_20260401.md                    62 events  ->  62 chunks
  ok  cmu_campus_events_20260501.md                    27 events  ->  27 chunks
  ok  cmu_campus_events_20260601.md                    21 events  ->  21 chunks
  ok  cmu_campus_events_20260701.md                    11 events  ->  11 chunks
  ok  cmu_campus_events_20260801.md                     9 events  ->  9 chunks
  ok  cmu_campus_events_20260901.md                     3 events  ->  3 chunks
  ok  cmu_campus_events_20261001.md                     4 events  ->  4 chunks
  ok  cmu_campus_events_20261101.md                     2 events  ->  2 chunks
  ok  cmu_campus_events_20261201.md                     0 events  ->  0 chunks

[cmu.edu/engage/events]
  ok  cmu_engage_events.md                              5 sections  ->  5 

  2. Escaped brackets [text \[Virtual\]](url)


In [8]:
# Collection C — Combined summary
# Re-read the chunks file from disk so the stats cover what is stored there,
# not just the return value of a single chunker call.
c_chunks_path = C_PROCESSED / "C_chunks.jsonl"
all_c_chunks = read_jsonl(c_chunks_path)

print("=" * 60, "Collection C — Combined", "=" * 60, sep="\n")
summarize_chunks(all_c_chunks, "C (all sources)")
print(f"Chunks path : {c_chunks_path}")

Collection C — Combined
C (all sources): 937 chunks
  avg len: 324 chars  min: 71  max: 5963
  source docs: 31
Chunks path : data/processed/C/C_chunks.jsonl


# Collection D

In [9]:
# Collection D
from scripts.chunkers.chunk_collection_d import chunk_all, OUT_DOCS as D_DOCS, OUT_CHUNKS as D_CHUNKS

# Stage banner: a 60-character rule, the title, and a second rule.
print("=" * 60, "Chunking Collection D", "=" * 60, sep="\n")

# append=False is passed explicitly, unlike the Collection C follow-up cells.
docs_d, chunks_d = chunk_all(append=False)

summarize_chunks(chunks_d, "D")
print(
    f"Docs path   : {D_DOCS}",
    f"Chunks path : {D_CHUNKS}",
    sep="\n",
)

Chunking Collection D
Collection D — Chunking
Files: 270
  ok  bananasplitfest/bananasplitfest.com.md                    9 chunks
  ok  bananasplitfest/bananasplitfest.com__activities.md        6 chunks
  skip  bananasplitfest/bananasplitfest.com__activities__crafts-games-activities.md  (no content)
  skip  bananasplitfest/bananasplitfest.com__activities__entertainment.md  (no content)
  ok  bananasplitfest/bananasplitfest.com__activities__food.md   1 chunks
  skip  bananasplitfest/bananasplitfest.com__activities__over-21-area.md  (no content)
  skip  bananasplitfest/bananasplitfest.com__activities__participating-vendors.md  (no content)
  ok  bananasplitfest/bananasplitfest.com__events.md            6 chunks
  ok  bananasplitfest/bananasplitfest.com__events__5k-banana-run.md   1 chunks
  ok  bananasplitfest/bananasplitfest.com__events__banana-challenge.md   1 chunks
  ok  bananasplitfest/bananasplitfest.com__events__blood-drive.md   1 chunks
  ok  bananasplitfest/bananasplitfest.com__

---
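Run the cells in order to rebuild all chunk outputs. Rerun specific sections as needed if upstream data changes. For Collection C, rerun all of its cells starting from the Pittsburgh events cell: the recurring-events and CMU events cells are called with `append=True`, so rerunning either of them alone would likely add duplicate chunks to `C_chunks.jsonl`.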