In [1]:
"""
FOMC & Fed Speech Downloader
============================
Downloads FOMC statements/minutes, press conferences, and Fed Chair speeches.
Core logic lives in src/downloader.py; this notebook orchestrates the pipeline.
"""

import sys
from pathlib import Path

# Add src to path for imports
sys.path.insert(0, str(Path("..").resolve()))

from src.downloader import create_session, collect_fomc_documents, collect_speeches

# Configuration: every value a reader might tune lives here.
DATA_DIR = Path("../data/fed_downloads")
# Passed as `start_year` to collect_speeches; presumably the earliest year of
# speeches to fetch -- confirm against src/downloader.py.
SPEECH_START_YEAR = 1994

# The same session object is handed to both collectors below.
session = create_session()

# Download FOMC documents into DATA_DIR/fomc
print("Downloading FOMC documents...")
fomc_records = collect_fomc_documents(session, DATA_DIR / "fomc")

# Download speeches into DATA_DIR/speeches
print("\nDownloading speeches...")
speech_records = collect_speeches(
    session, DATA_DIR / "speeches", start_year=SPEECH_START_YEAR
)

# Both collectors return sized collections; report how many records each produced.
print(f"\nSummary: FOMC docs: {len(fomc_records)}; speeches: {len(speech_records)}")

Downloading FOMC documents...


Parsing calendar pages: 100%|██████████| 126/126 [00:15<00:00,  8.02it/s]
Downloading FOMC docs: 100%|██████████| 164/164 [00:00<00:00, 321.78it/s]



Downloading speeches...
Attempting yearly archives...


Downloading speeches: 100%|██████████| 249/249 [00:00<00:00, 7917.26it/s]          


Summary: FOMC docs: 164; speeches: 249





In [2]:
"""
Text Extraction & Cleaning
==========================
Parses raw HTML into cleaned text for FOMC minutes and speeches.
Core logic lives in src/text_cleaner.py; this notebook orchestrates the pipeline.
"""

from pathlib import Path
from src.text_cleaner import process_tree, recommend_guard_from_profile

# Paths (relative to the notebook's working directory)
DOWNLOAD_DIR = Path("../data/fed_downloads")
TEXT_DIR = Path("../data/fed_text")
INPUT_FOMC = DOWNLOAD_DIR / "fomc"
OUTPUT_FOMC = TEXT_DIR / "fomc"
INPUT_SPEECHES = DOWNLOAD_DIR / "speeches"
OUTPUT_SPEECHES = TEXT_DIR / "speeches"
REMOVED_LINES_FILE = TEXT_DIR / "removed_lines.txt"

# Reporting options
TOP_TOKEN_COUNT = 10
BANNER = "=" * 60


def report_stats(total_files, total_chars, total_tokens, counter, output_dir, top_n=TOP_TOKEN_COUNT):
    """Print the summary block shared by the FOMC and speech runs.

    Args:
        total_files: Number of files processed.
        total_chars: Character total across those files (averaged per file).
        total_tokens: Token total across those files (averaged per file).
        counter: Object exposing ``most_common(n)`` that yields
            ``(token, count)`` pairs, e.g. a ``collections.Counter``.
        output_dir: Location echoed in the final "Output saved to" line.
        top_n: How many of the most common tokens to list.
    """
    print(f"Processed files: {total_files}")
    # Averages are skipped for an empty corpus to avoid dividing by zero.
    if total_files:
        print(f"Average length (chars): {total_chars / total_files:.1f}")
        print(f"Average tokens: {total_tokens / total_files:.1f}")
        print("Top tokens:")
        for tok, cnt in counter.most_common(top_n):
            print(f"  {tok}: {cnt}")
    print(f"Output saved to: {output_dir}")


# Process FOMC documents
print(BANNER)
print("Processing FOMC documents (line-level whitelist-first + dynamic guard)...")
print(BANNER)

if INPUT_FOMC.exists():
    # Profile pass: no guard, so we obtain corpus ratios and collect removed lines.
    removed_buffer: list[str] = []
    total_files, total_chars, total_tokens, counter, ratios = process_tree(
        INPUT_FOMC, OUTPUT_FOMC, is_fomc=True, removal_guard=None, removed_collector=removed_buffer
    )

    # Save removed lines for inspection
    if removed_buffer:
        REMOVED_LINES_FILE.parent.mkdir(parents=True, exist_ok=True)
        REMOVED_LINES_FILE.write_text("\n".join(removed_buffer), encoding="utf-8")
        print(f"Removed lines saved to: {REMOVED_LINES_FILE}")

    # Second pass: re-run with the guard recommended from the profile; its
    # totals replace the profile-pass totals reported below.
    if ratios:
        recommended_guard = recommend_guard_from_profile(ratios)
        total_files, total_chars, total_tokens, counter, _ = process_tree(
            INPUT_FOMC, OUTPUT_FOMC, is_fomc=True, removal_guard=recommended_guard
        )

    # Report stats (blank line first, separating them from the output above)
    print()
    report_stats(total_files, total_chars, total_tokens, counter, OUTPUT_FOMC)
else:
    print(f"Input directory not found: {INPUT_FOMC}")

# Process speeches
print("\n" + BANNER)
print("Processing speeches...")
print(BANNER)

if INPUT_SPEECHES.exists():
    # Speeches get a single pass with no removal guard.
    total_files, total_chars, total_tokens, counter, _ = process_tree(
        INPUT_SPEECHES, OUTPUT_SPEECHES, is_fomc=False, removal_guard=None
    )
    report_stats(total_files, total_chars, total_tokens, counter, OUTPUT_SPEECHES)
else:
    print(f"Input directory not found: {INPUT_SPEECHES}")

Processing FOMC documents (line-level whitelist-first + dynamic guard)...
Removed lines saved to: ..\data\fed_text\removed_lines.txt
Corpus removal profile (FOMC):
  Mean: 2.0% | Median: 0.0% | 90th: 5.9%
  Recommended guard: 30% (clamped 30–60%)

Processed files: 109
Average length (chars): 24166.3
Average tokens: 3691.6
Top tokens:
  the: 27964
  of: 14591
  and: 11445
  to: 11129
  in: 11097
  that: 6626
  a: 4871
  for: 3913
  on: 3791
  participants: 3495
Output saved to: ..\data\fed_text\fomc

Processing speeches...
Processed files: 242
Average length (chars): 17933.5
Average tokens: 2835.8
Top tokens:
  the: 44322
  of: 23569
  and: 21959
  to: 18750
  in: 15903
  a: 10338
  that: 9002
  for: 7457
  is: 5880
  as: 5120
Output saved to: ..\data\fed_text\speeches
