In [None]:
from scripts.chunker.chunker import chunk_collection
from collections import Counter

print("=" * 60)
print("Chunking Collection A")
print("=" * 60)

chunks = chunk_collection("A")

# ── Distribution stats ───────────────────────────────────────────
lengths = [len(c["text"]) for c in chunks]
print(f"\nTotal chunks : {len(chunks)}")
print(f"Avg length   : {sum(lengths) / len(lengths):.0f} chars")
print(f"Min length   : {min(lengths)} chars")
print(f"Max length   : {max(lengths)} chars")

# Chunks per source doc
per_doc = Counter(c["doc_id"] for c in chunks)
print(f"\n{'doc_id':<45} {'chunks':>6}")
print("-" * 53)
for doc_id, count in sorted(per_doc.items()):
    print(f"{doc_id:<45} {count:>6}")

In [None]:
# Change index to inspect any chunk
sample = chunks[0]

for key, val in sample.items():
    if key == "text":
        continue
    print(f"{key:<14}: {val}")

print(f"\n{'text':─<60}")
print(sample["text"])

In [None]:
# Change doc_id to explore any document's chunks
target_doc = "A_pittsburgh_britannica"

doc_chunks = [c for c in chunks if c["doc_id"] == target_doc]
print(f"{len(doc_chunks)} chunks for '{target_doc}':\n")

for c in doc_chunks:
    section_label = c["section"]
    sub_label     = f" › {c['subsection']}" if c["subsection"] else ""
    chars         = len(c["text"])
    print(f"  [{c['chunk_index']:>3}] {section_label}{sub_label}  ({chars} chars)")

In [None]:
from scripts.chunk_collection_b import chunk_collection_b

chunks_b = chunk_collection_b()

In [None]:
from collections import Counter
from pathlib import Path
import json

# Counts per doc
per_doc = Counter(c["doc_id"] for c in chunks_b)
print(f"{'doc_id':<40} {'chunks':>7}")
print("-" * 49)
for doc_id, count in sorted(per_doc.items()):
    print(f"{doc_id:<40} {count:>7}")

# Length stats
lengths = [len(c["text"]) for c in chunks_b]
print(f"\nTotal  : {len(chunks_b)}")
print(f"Avg    : {sum(lengths)/len(lengths):.0f} chars")
print(f"Min    : {min(lengths)}")
print(f"Max    : {max(lengths)}")

In [None]:
tax_chunks = [c for c in chunks_b if c["doc_id"] == "B_payroll_tax_regulations"]
sample = tax_chunks[0]
print(f"chunk_id : {sample['chunk_id']}")
print(f"section  : {sample['section']}")
print(f"subsection: {sample['subsection']}")
print()
print(sample["text"])

In [None]:
budget_chunks = [c for c in chunks_b if c["doc_id"] == "B_2025_operating_budget"]

print(f"Total budget chunks: {len(budget_chunks)}\n")

# Show a few examples
for c in budget_chunks[10:14]:
    print(f"── {c['chunk_id']} ──")
    print(c["text"])
    print()

---
## Collection C: Chunking — Pittsburgh Events

In [None]:
from scripts.chunk_collection_c_pgh_events import chunk_collection_c

# Writes data/processed/C_chunks.jsonl
chunks_c = chunk_collection_c()

Chunking Collection C — Pittsburgh Events

[pittsburgh.events]
  ⚠  Short card (1 fields): ['Pittsburgh, PA']
  ✓  pgh_events_march.md                              30 events  →  30 chunks
  ⚠  Short card (1 fields): ['Pittsburgh, PA']
  ✓  pgh_events_april.md                              30 events  →  30 chunks
  ⚠  Short card (1 fields): ['Pittsburgh, PA']
  ✓  pgh_events_may.md                                30 events  →  30 chunks
  ⚠  Short card (1 fields): ['Pittsburgh, PA']
  ✓  pgh_events_june.md                               30 events  →  30 chunks
  ⚠  Short card (1 fields): ['Pittsburgh, PA']
  ✓  pgh_events_july.md                               30 events  →  30 chunks
  ⚠  Short card (1 fields): ['Pittsburgh, PA']
  ✓  pgh_events_august.md                             30 events  →  30 chunks
  ⚠  Short card (1 fields): ['Pittsburgh, PA']
  ✓  pgh_events_september.md                          30 events  →  30 chunks
  ⚠  Short card (1 fields): ['Pittsburgh, PA']
  ✓  pgh_events

In [None]:
from scripts.process_recurring_events_csv_c import process_csv

# Update csv_path to wherever your CSV file is located
result = process_csv(
    csv_path="data/raw/recurring_events.csv",
    source_url="https://www.pittsburghmagazine.com/best-of-the-burgh-listings/",
)
print(result)

  ↓  Reading: recurring_events.csv
  ✓  Loaded 370 records
  ✓  Markdown → data/processed/recurring_events_pittsburgh.md  (145.9 KB)
  ✓  Appended doc record to data/processed/C_docs.jsonl
{'records': 370, 'md_path': 'data/processed/recurring_events_pittsburgh.md', 'jsonl_path': 'data/processed/C_docs.jsonl'}


In [None]:
from scripts.chunk_recurring_events_c import chunk_recurring_events

chunks_recurring = chunk_recurring_events(
    csv_path="data/raw/recurring_events.csv",
    source_url="https://www.pittsburghmagazine.com/best-of-the-burgh-listings/",
    append=True,   # merges with existing C_chunks.jsonl
)

Chunking — Pittsburgh Recurring Events
  ✓  Loaded 370 records from recurring_events.csv

  → Appended 370 chunks to data/processed/C_chunks.jsonl  (593.9 KB)
     Free events : 79
     Categories  :
       Other Stuff                      60
       Volunteers                       57
       DJs                              35
       Comedy                           20
       Open Stage                       20
       Literary                         17
       Exhibits                         17
       Jazz                             11
       Outside                          11
       Dance                            10
       Other Music                      10
       Community                        10
       Visual Art                        9
       Trivia                            9
       Kidstuff                          7
       Acoustic                          7
       Fundraisers                       7
       Games                             6
       Submissions        

In [15]:
# Distribution by category
from collections import Counter

cats = Counter(c["category"] for c in chunks_recurring)
areas = Counter(c["area"] for c in chunks_recurring)

print(f"Total chunks: {len(chunks_recurring)}")
print(f"Free events : {sum(1 for c in chunks_recurring if c['is_free'])}\n")

print("By category:")
for cat, n in cats.most_common():
    print(f"  {cat or '(none)':<35} {n}")

print("\nBy area (top 10):")
for area, n in areas.most_common(10):
    print(f"  {area or '(none)':<35} {n}")

Total chunks: 370
Free events : 79

By category:
  Other Stuff                         60
  Volunteers                          57
  DJs                                 35
  Comedy                              20
  Open Stage                          20
  Literary                            17
  Exhibits                            17
  Jazz                                11
  Outside                             11
  Dance                               10
  Other Music                         10
  Community                           10
  Visual Art                          9
  Trivia                              9
  Kidstuff                            7
  Acoustic                            7
  Fundraisers                         7
  Games                               6
  Submissions                         5
  Theater                             4
  Food & Drink Event                  4
  Exercise                            4
  Opera                               3
  Politics         

In [16]:
# Free events in Oakland
results = [
    c for c in chunks_recurring
    if c["is_free"] and c["area"].lower() == "oakland"
]
print(f"Free events in Oakland: {len(results)}\n")
for c in results[:5]:
    print(c["text"])
    print()

Free events in Oakland: 5

Event: BofA's Museums On Us® Offers Free Admission!
Schedule: First Saturday, Sunday of every month
Venue: Carnegie Museums of Pittsburgh
Address: 4400 Forbes Ave, Pittsburgh, Oakland
Price: Free
Category: Visual Art
Description: WHAT: Bank of America, Merrill and Bank of America Private Bank cardholders can receive FREE general admission to Carnegie Museums of Art and Natural History and The Andy Warhol Museum cultural attractions Saturday and Sunday.
WHEN: Saturday, October 4th & Sunday, October 5th (+ the first full weekend of each month)
WHERE: Learn something new this month with a visit to a participating cultural institution, including:
-Carnegie Museums of Pittsburgh, 4400 Forbes Ave, Pittsburgh, PA 15213
-The Andy Warhol Museum, 117 Sandusky St, Pittsburgh, PA 15212

Event: Accessible Tech Training @ LBPH
Schedule: Third Wednesday of every month, 1-2:30 p.m.
Venue: Library for the Blind & Physically Handicapped
Address: 4724 Baum Blvd, Pittsburgh, Oak

In [2]:
from collections import Counter

per_month = Counter(c["section"] for c in chunks_c)
print(f"{'Month':<25} {'Chunks':>7}")
print("-" * 34)
for month, count in sorted(per_month.items()):
    print(f"{month:<25} {count:>7}")

print(f"\nTotal: {len(chunks_c)} event chunks")

Month                      Chunks
----------------------------------
Apr 2026                       35
Aug 2026                       30
Dec 2026                        1
Feb 2026                        7
Jan 2026                       19
Jul 2026                       30
Jun 2026                       34
Mar 2026                       64
May 2026                       34
Nov 2026                        4
Oct 2026                       15
Sep 2026                       31

Total: 304 event chunks


In [9]:
for c in chunks_c[290:305]:
    print(f"── {c['chunk_id']} ──")
    print(c["text"])
    print(f"event_date: {c['event_date']}")
    print()

── C_downtown_pgh_events_05__0004 ──
Event Name: PSO in Libraries
Date: Jan 13, 2026
Description: Look for the Pittsburgh Symphony Orchestra in your community! These FREE chamber-style concerts feature PSO musicians performing curated programs at a local library near you. Click on "Get Tickets" to find performance dates in East Liberty, Downtown, Squirrel Hill, Beechview, and more. Arts + Culture, Sports + Recreation,
Categories: Family, Outdoor, Tours
event_date: 2026-01-13

── C_downtown_pgh_events_05__0005 ──
Event Name: Beauty of the Burgh Bike Tour
Date: Jan 1, 2026 at 10:00 am
Description: Whether you are visiting Pittsburgh or you are a local, you will learn interesting facts about the history and culture of Pittsburgh while riding past beautiful sights and architecture. Arts + Culture,
Categories: Tours
event_date: 2026-01-01 10:00

── C_downtown_pgh_events_05__0006 ──
Event Name: Visit the Old Allegheny Jail and Museum
Date: Jan 5, 2026 at 11:30 am
Description: Explore the Old

In [10]:
# Filter events in May 2026 without using embeddings
may_events = [
    c for c in chunks_c
    if c.get("event_date", "").startswith("2026-05")
]
print(f"Events in May 2026: {len(may_events)}\n")
for c in may_events[:]:
    print(f"  {c['event_date']}  {c['text'].splitlines()[0]}")

Events in May 2026: 34

  2026-05-01 18:40  Event Name: Pittsburgh Pirates vs. Cincinnati Reds VIP Experience Available
  2026-05-01 19:00  Event Name: Pittsburgh Symphony Orchestra: Star Wars The Empire Strikes Back In Concert
  2026-05-01 19:00  Event Name: DeRay Davis
  2026-05-01 19:30  Event Name: Howie Day
  2026-05-01 19:30  Event Name: Demon Hunter
  2026-05-01 20:00  Event Name: Lily Rose
  2026-05-01 20:00  Event Name: Journey Thru the Ages
  2026-05-01 20:00  Event Name: Heather McMahan
  2026-05-01 20:00  Event Name: Beginnings - A Tribute To Chicago
  2026-05-01 21:30  Event Name: DeRay Davis
  2026-05-02 12:00  Event Name: Story Pirates
  2026-05-02 16:05  Event Name: Pittsburgh Pirates vs. Cincinnati Reds VIP Experience Available
  2026-05-02 18:30  Event Name: DeRay Davis
  2026-05-02 19:00  Event Name: Felipe Esparza
  2026-05-02 19:00  Event Name: Pittsburgh Riverhounds SC vs. Phoenix Rising FC
  2026-05-02 19:00  Event Name: Pittsburgh Symphony Orchestra: Star Wars T

In [11]:
venue_keyword = "Heinz"
venue_events = [
    c for c in chunks_c
    if venue_keyword.lower() in c.get("venue", "").lower()
]
print(f"Events at venues containing '{venue_keyword}': {len(venue_events)}\n")
for c in venue_events[:5]:
    print(c["text"])
    print()

Events at venues containing 'Heinz': 8

Event Name: Pittsburgh Symphony Orchestra: American Soundscapes
Date: Sun, Mar 01, 2026 at 2:30 PM
Venue: Heinz Hall \| Capacity: 2676
Location: 15222, 600 Penn Ave, Pittsburgh, PA, US

Event Name: Pittsburgh Symphony Orchestra: Star Wars The Empire Strikes Back In Concert
Date: Fri, May 01, 2026 at 7:00 PM
Venue: Heinz Hall \| Capacity: 2676
Location: 15222, 600 Penn Ave, Pittsburgh, PA, US

Event Name: Pittsburgh Symphony Orchestra: Star Wars The Empire Strikes Back In Concert
Date: Sat, May 02, 2026 at 7:00 PM
Venue: Heinz Hall \| Capacity: 2676
Location: 15222, 600 Penn Ave, Pittsburgh, PA, US

Event Name: Pittsburgh Symphony Orchestra: Star Wars The Empire Strikes Back In Concert
Date: Sun, May 03, 2026 at 2:30 PM
Venue: Heinz Hall \| Capacity: 2676
Location: 15222, 600 Penn Ave, Pittsburgh, PA, US

Event Name: Pittsburgh Symphony Orchestra: Byron Stripling - Disco Divas
Date: Fri, Jun 05, 2026 at 7:30 PM
Venue: Heinz Hall \| Capacity: 2676


In [None]:
import sys
from pathlib import Path

REPO_ROOT  = Path(".")
SCRIPT_DIR = REPO_ROOT

sys.path.insert(0, str(SCRIPT_DIR))

from scripts.chunk_collection_c_cmu_events import (
    run,
    PROCESSED_DIR,
    DEFAULT_CAMPUS_MONTHS,
)

campus_months = DEFAULT_CAMPUS_MONTHS

docs, chunks = run(campus_months=campus_months, append=True)

import json
from collections import Counter


Collection C — CMU Events Chunking

[events.cmu.edu — campus calendar]
  ok  cmu_campus_events_20260301.md                     0 events  ->  0 chunks
  ok  cmu_campus_events_20260401.md                    62 events  ->  62 chunks
  ok  cmu_campus_events_20260501.md                    27 events  ->  27 chunks
  ok  cmu_campus_events_20260601.md                    21 events  ->  21 chunks
  ok  cmu_campus_events_20260701.md                    11 events  ->  11 chunks
  ok  cmu_campus_events_20260801.md                     9 events  ->  9 chunks
  ok  cmu_campus_events_20260901.md                     3 events  ->  3 chunks
  ok  cmu_campus_events_20261001.md                     4 events  ->  4 chunks
  ok  cmu_campus_events_20261101.md                     2 events  ->  2 chunks
  ok  cmu_campus_events_20261201.md                     0 events  ->  0 chunks

[cmu.edu/engage/events]
  ok  cmu_engage_events.md                              5 sections  ->  5 chunks

  -> append done
     C_docs

In [2]:
# ============================================================
# Notebook Cell — Chunk Collection D
# ============================================================
# 前置条件：
#   - clean_collection_d.py 已跑完，data/processed/D/ 下有清洗文件
#   - chunk_collection_d.py 位于 scripts/ 目录下
# ============================================================

import sys, json
from pathlib import Path
from collections import Counter

sys.path.insert(0, "scripts/")
from scripts.chunkers.chunk_collection_d import chunk_all, PROCESSED_D_DIR, OUT_DOCS, OUT_CHUNKS

# ── 参数 ─────────────────────────────────────────────────────
# append=True  → 追加到已有 D_docs.jsonl / D_chunks.jsonl
# append=False → 覆盖（首次运行或重建时使用）
docs, chunks = chunk_all(append=False)

# ── 基础统计 ─────────────────────────────────────────────────
print(f"\n{'='*50}")
print(f"写入文档数  : {len(docs)}")
print(f"写入 chunk 数: {len(chunks)}")

# 按分类统计
cat_counts = Counter(c["category"] for c in chunks)
for cat, cnt in sorted(cat_counts.items()):
    print(f"  {cat:<20} {cnt:>4} chunks")

# ── 抽查 chunk 文本 ──────────────────────────────────────────
print(f"\n── 随机抽查 3 条 chunk ──")
import random
for c in random.sample(chunks, min(3, len(chunks))):
    print(f"\nchunk_id     : {c['chunk_id']}")
    print(f"site / cat   : {c['site_key']} / {c['category']}")
    print(f"section_path : {c['section_path']}")
    print(f"words        : {c['word_count']}  overlap={c['has_overlap']}")
    print("text ↓")
    print(c["text"][:400])
    print("...")

# ── JSONL 文件校验 ───────────────────────────────────────────
print(f"\n── JSONL 校验 ──")
for fpath in [OUT_DOCS, OUT_CHUNKS]:
    records = [json.loads(l) for l in fpath.read_text().splitlines() if l.strip()]
    print(f"  {fpath.name:<25} {len(records):>5} 条  ✓")

Collection D — Chunking
Files: 269
  ok  bananasplitfest/bananasplitfest.com.md                    7 chunks
  ok  bananasplitfest/bananasplitfest.com__activities.md        6 chunks
  skip  bananasplitfest/bananasplitfest.com__activities__crafts-games-activities.md  (no content)
  skip  bananasplitfest/bananasplitfest.com__activities__entertainment.md  (no content)
  ok  bananasplitfest/bananasplitfest.com__activities__food.md   1 chunks
  skip  bananasplitfest/bananasplitfest.com__activities__over-21-area.md  (no content)
  skip  bananasplitfest/bananasplitfest.com__activities__participating-vendors.md  (no content)
  ok  bananasplitfest/bananasplitfest.com__events.md            6 chunks
  ok  bananasplitfest/bananasplitfest.com__events__5k-banana-run.md   1 chunks
  ok  bananasplitfest/bananasplitfest.com__events__banana-challenge.md   1 chunks
  ok  bananasplitfest/bananasplitfest.com__events__blood-drive.md   1 chunks
  ok  bananasplitfest/bananasplitfest.com__events__cornhole-tourn

In [2]:
import sys, json
from pathlib import Path
from collections import Counter

sys.path.insert(0, "scripts/")  # adjust if script is elsewhere
from scripts.chunk_collection_a_wiki import run, OUT_DOCS, OUT_CHUNKS

# ── Run ──────────────────────────────────────────────────────────────────────
# append=False  -> overwrite (default; first run or full rebuild)
# append=True   -> append to existing JSONL
# dry_run=True  -> print stats only, no file writes
docs, chunks = run(append=False)

# ── Quick stats ──────────────────────────────────────────────────────────────
print(f"\n{'='*50}")
print(f"Docs    : {len(docs)}")
print(f"Chunks  : {len(chunks)}")
avg_wc = sum(c['word_count'] for c in chunks) / len(chunks) if chunks else 0
print(f"Avg chunk size : {avg_wc:.0f} words")

# Chunk size distribution
buckets = Counter(
    "<100"  if c['word_count'] <  100 else
    "100-200" if c['word_count'] < 200 else
    "200-300" if c['word_count'] < 300 else
    "300-450" if c['word_count'] < 450 else "450+"
    for c in chunks
)
print("\nChunk size distribution:")
for label in ["<100", "100-200", "200-300", "300-450", "450+"]:
    print(f"  {label:<10} {buckets[label]:>5} chunks")

# ── Sample chunk ─────────────────────────────────────────────────────────────
print("\n── Sample chunk ──")
import random
sample = random.choice(chunks)
print(f"chunk_id   : {sample['chunk_id']}")
print(f"md_title   : {sample['md_title']}")
print(f"section    : {sample['section']}")
print(f"subsection : {sample['subsection']}")
print(f"words      : {sample['word_count']}")
print("\ntext:")
print(sample['text'][:500])

# ── JSONL validation ─────────────────────────────────────────────────────────
print("\n── JSONL validation ──")
for fpath in [OUT_DOCS, OUT_CHUNKS]:
    records = [json.loads(l) for l in fpath.read_text().splitlines() if l.strip()]
    print(f"  {fpath.name:<25} {len(records):>6} records  ✓")

Collection A — Wikipedia Chunking
Files: 51
  ok    Andrew Carnegie - Wikipedia.md                        56 chunks  ~11069 words
  ok    Andrew Mellon - Wikipedia.md                          35 chunks  ~ 6903 words
  ok    Association of American Universities - Wikipedia.md    20 chunks  ~ 3163 words
  ok    Association of Independent Technological Universities - Wikipedia.md     4 chunks  ~  283 words
  ok    Astrobotic Technology - Wikipedia.md                  10 chunks  ~ 1719 words
  ok    Carnegie Library of Pittsburgh - Wikipedia.md          6 chunks  ~  688 words
  ok    Carnegie Mellon College of Engineering - Wikipedia.md     4 chunks  ~  535 words
  ok    Carnegie Mellon College of Fine Arts - Wikipedia.md     2 chunks  ~  525 words
  ok    Carnegie Mellon School of Art - Wikipedia.md           6 chunks  ~  812 words
  ok    Carnegie Mellon School of Computer Science - Wikipedia.md    15 chunks  ~ 2034 words
  ok    Carnegie Mellon School of Design - Wikipedia.md       10 c