# Init and load

In [1]:
# Install the third-party packages this notebook needs; the %pip magic installs
# into the environment of the running kernel.
# NOTE(review): versions are unpinned, so a later re-run may resolve different
# releases — consider pinning them for reproducibility.
%pip install -q requests python-dotenv pymupdf4llm lxml beautifulsoup4 markdownify

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

from dotenv import load_dotenv

# Let python-dotenv populate the environment before the key is looked up.
load_dotenv()

# Either variable name is accepted; FIRECRAWL_API_KEY takes precedence when it
# holds a non-empty value.
fc_api_key = os.getenv("FIRECRAWL_API_KEY") or os.getenv("FC_API_KEY")
if fc_api_key:
    print("API key loaded")
else:
    raise RuntimeError("Set FIRECRAWL_API_KEY or FC_API_KEY in env/.env")

API key loaded


In [3]:
import sys
from pathlib import Path

# Absolute path of the current working directory at the time this cell runs.
ROOT = Path(".").resolve()

# Prepend both script folders in a single step. The nested folder ends up
# first, exactly as two successive insert(0, ...) calls would leave it.
sys.path[:0] = [str(ROOT / "scripts/web_scrapers"), str(ROOT / "scripts")]


# Collection A 

The data sources for this collection also include HTML files from the baseline dataset.

In [None]:
from scripts.web_scrapers.scrape_wikipedia import scrape_all as run_wiki, TARGETS as WIKI_TARGETS
from scripts.web_scrapers.scrape_britannica import scrape_all as run_brit, TARGETS as BRIT_TARGETS
from scripts.web_scrapers.scrape_visitpittsburgh import scrape_all as run_visit, TARGETS as VISIT_TARGETS
from scripts.web_scrapers.scrape_cmu import scrape_all as run_cmu, TARGETS as CMU_TARGETS

# Reject a missing *or empty* key, matching the falsy check used when the key
# is first loaded, so no scraper is started without credentials.
if not fc_api_key:
    raise RuntimeError("API key missing; set FIRECRAWL_API_KEY or FC_API_KEY")

# Run the four Collection A scrapers one after another, each over its own
# predefined TARGETS list and with the same API key.
print("Running Collection A web scrapers...")
wiki_results = run_wiki(WIKI_TARGETS, api_key=fc_api_key)
brit_results = run_brit(BRIT_TARGETS, api_key=fc_api_key)
visit_results = run_visit(VISIT_TARGETS, api_key=fc_api_key)
cmu_results = run_cmu(CMU_TARGETS, api_key=fc_api_key)
print("Collection A done")

Running Collection A web scrapers...

[1/2] Scraping: https://en.wikipedia.org/wiki/Pittsburgh
  -> Requesting: https://en.wikipedia.org/wiki/Pittsburgh
  + Raw:   data/raw/A/pittsburgh.md  (501.1 KB)
  + Clean: data/processed/A/pittsburgh.md  (177.0 KB)

[2/2] Scraping: https://en.wikipedia.org/wiki/History_of_Pittsburgh
  -> Requesting: https://en.wikipedia.org/wiki/History_of_Pittsburgh
  + Raw:   data/raw/A/history_of_pittsburgh.md  (162.8 KB)
  + Clean: data/processed/A/history_of_pittsburgh.md  (86.2 KB)

[1/1] Scraping: https://www.britannica.com/place/Pittsburgh
  -> Requesting: https://www.britannica.com/place/Pittsburgh
  + Raw:   data/raw/A/pittsburgh_britannica.md  (28.9 KB)
  + Clean: data/processed/A/pittsburgh_britannica.md  (14.4 KB)

[1/4] Scraping: https://www.visitpittsburgh.com/about-pittsburgh/
  -> Requesting: https://www.visitpittsburgh.com/about-pittsburgh/
  + Raw:   data/raw/A/visitpgh_about.md  (5.0 KB)
  + Clean: data/processed/A/visitpgh_about.md  (2.9 KB)


#### Here we clean the baseline dataset provided by the instructor

In [None]:
from bs4 import BeautifulSoup
from markdownify import markdownify as md
import re
from datetime import datetime, timezone

# Folder holding the baseline HTML pages, and the sub-folder (created here if
# it does not exist yet) that receives their cleaned Markdown versions.
INPUT_DIR = Path("baseline_data")
OUTPUT_DIR = INPUT_DIR / "wikipedia_md_cleaned"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# CSS selectors whose matching elements extract_main() removes from the page
# content before it is converted to Markdown.
REMOVE_SELECTORS = [
    "header", "footer", "nav",
    "#mw-head", "#mw-panel", "#siteSub", "#contentSub",
    ".mw-editsection", ".mw-editsection-bracket",
    "#toc", ".toc", ".vector-toc",
    ".mw-jump-link", ".mw-portlet", ".vector-column-start",
    ".vector-page-toolbar", ".vector-page-titlebar",
    ".noprint", ".mw-footer", ".printfooter",
    "sup.reference", ".reference", ".reflist", "ol.references",
    ".hatnote", ".dablink", ".shortdescription", ".ambox",
    "table.infobox", "table.vertical-navbox", "table.navbox",
    ".navbox", ".vertical-navbox", ".sidebar",
    "figure", "img", ".thumb", ".gallery", ".mw-file-element",
    "math", "code", "pre",
]
# Section titles at which _drop_tail() stops: the first level-2..6 heading
# carrying one of these titles, and everything after it, is discarded.
DROP_TAIL = {"See also", "References", "External links", "Further reading", "Notes"}

def _squeeze_blank(s: str) -> str:
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()

def _drop_tail(md_text: str) -> str:
    out = []
    for line in md_text.splitlines():
        m = re.match(r"^(#{2,6})\s+(.*)\s*$", line.strip())
        if m and m.group(2).strip() in DROP_TAIL:
            break
        out.append(line)
    return "\n".join(out)

def _post_clean(md_text: str) -> str:
    """Tidy Markdown generated from a page's main content.

    Applied in order:
      1. strip the quoted title from inline links,
      2. remove numeric citation markers such as ``[12]``,
      3. remove links whose text is empty,
      4. remove "Wikimedia Commons has media related to ..." lines,
      5. cut the document at the first trailing-section heading (``_drop_tail``),
      6. normalise newlines and blank lines (``_squeeze_blank``).

    Args:
        md_text: Raw Markdown text.

    Returns:
        The cleaned Markdown text.
    """
    # [text](url "title") -> [text](url)
    md_text = re.sub(r'(\[[^\]]+\]\([^\s\)]+)\s+"[^"]*"\)', r"\1)", md_text)
    # Drop citation markers like [3]. The negative lookahead leaves alone link
    # labels made only of digits, e.g. [1980](url); without it the label was
    # deleted and a bare "(url)" was left behind in the text.
    md_text = re.sub(r"\[\d+\](?!\()", "", md_text)
    # Links with no visible text (e.g. what is left of a linked citation
    # marker after the previous step) carry no content.
    md_text = re.sub(r"\[\s*\]\([^)]+\)", "", md_text)
    md_text = re.sub(r"^Wikimedia Commons has media related to .*?\.\s*$", "", md_text, flags=re.MULTILINE)
    md_text = _drop_tail(md_text)
    return _squeeze_blank(md_text)

def extract_main(html_text: str) -> tuple[str, str, str]:
    """Isolate the main content of an HTML page and strip unwanted elements.

    Args:
        html_text: Full HTML source of the page.

    Returns:
        A ``(title, source_url, main_html)`` tuple: the text of ``<title>``
        (``"Wikipedia Page"`` when there is none), the ``href`` of the
        ``<link rel="canonical">`` element (empty string when absent), and the
        cleaned content element serialised back to an HTML string.
    """
    soup = BeautifulSoup(html_text, "lxml")

    title = "Wikipedia Page"
    if soup.title:
        title = soup.title.get_text(strip=True)

    source_url = ""
    canonical = soup.find("link", rel="canonical")
    if canonical and canonical.get("href"):
        source_url = canonical["href"].strip()

    # Try the content containers from most to least specific, then fall back
    # to <body>, and finally to the whole document.
    main = None
    for container in ("#mw-content-text", "main#content", "#content"):
        main = soup.select_one(container)
        if main:
            break
    if not main:
        main = soup.body or soup

    # The configured selectors are removed first, followed by a few extra ones.
    extra_selectors = ("span.mw-editsection", ".mw-cite-backlink", "table.metadata, table.ambox")
    for selector in (*REMOVE_SELECTORS, *extra_selectors):
        for node in main.select(selector):
            node.decompose()

    # Turn site-relative and protocol-relative links into absolute URLs.
    for anchor in main.select("a[href]"):
        target = anchor.get("href", "")
        if target.startswith("/wiki/"):
            anchor["href"] = "https://en.wikipedia.org" + target
            continue
        if target.startswith("//"):
            anchor["href"] = "https:" + target

    return title, source_url, str(main)

def html_to_markdown(html_text: str) -> tuple[str, str, str]:
    """Convert an HTML page into cleaned Markdown.

    Args:
        html_text: Full HTML source of the page.

    Returns:
        A ``(title, source_url, markdown)`` tuple. Title and URL come from
        ``extract_main``; the Markdown uses ATX headings and ``-`` bullets and
        has been passed through ``_post_clean``.
    """
    page_title, page_url, content_html = extract_main(html_text)
    raw_markdown = md(content_html, heading_style="ATX", bullets="-")
    return page_title, page_url, _post_clean(raw_markdown)

# Gather every .htm and .html file directly inside INPUT_DIR, in sorted order.
html_files = sorted([*INPUT_DIR.glob("*.htm"), *INPUT_DIR.glob("*.html")])
print(f"Found {len(html_files)} HTML files in {INPUT_DIR}")

# A single UTC timestamp is shared by every file written in this run.
now_iso = datetime.now(timezone.utc).isoformat()
converted = 0
for fp in html_files:
    # Bytes that are not valid UTF-8 are skipped instead of raising.
    html_text = fp.read_text(encoding="utf-8", errors="ignore")
    title, source_url, md_text = html_to_markdown(html_text)

    # Front-matter header placed above the Markdown body; the trailing empty
    # item makes the joined header end with a newline.
    front_matter = "\n".join(
        (
            "---",
            f"source_url: {source_url}",
            f"scraped_at: {now_iso}",
            f"title: {title}",
            "description: ",
            "---",
            "",
        )
    )
    out_path = OUTPUT_DIR / f"{fp.stem}.md"
    out_path.write_text(f"{front_matter}{md_text}\n", encoding="utf-8")
    converted += 1

print(f"Converted {converted} files -> {OUTPUT_DIR}")

# Collection B

In [5]:
from scripts.web_scrapers.scrape_pdf_collection_b import (
    TARGETS as PDF_TARGETS,
    process_all as run_pdf,
)

# Run the Collection B PDF pipeline over its predefined TARGETS list and keep
# whatever it returns for later inspection.
print("Running Collection B PDFs...")
pdf_results = run_pdf(PDF_TARGETS)
print("Collection B done")

Consider using the pymupdf_layout package for a greatly improved page layout analysis.
Running Collection B PDFs...

[1/2] City of Pittsburgh Payroll Tax Regulations
  URL: https://www.pittsburghpa.gov/files/assets/city/v/1/finance/documents/tax-forms/9626_payroll_tax_regulations.pdf
  ↓  Downloading: https://www.pittsburghpa.gov/files/assets/city/v/1/finance/documents/tax-forms/9626_payroll_tax_regulations.pdf
  ✓  Saved: payroll_tax_regulations.pdf  (578 KB)
  ⚙  Converting to Markdown: payroll_tax_regulations.pdf
Processing data/raw/B/payroll_tax_regulations.pdf...


100%|██████████| 18/18 [00:00<00:00, 18.32it/s]


  ✓  Converted in 1.1s  (44,516 chars)
  ✓  Cleaned: 44,516 → 44,448 chars  (0% reduction)
  ✓  Processed saved: data/processed/B/payroll_tax_regulations.md  (44 KB)

[2/2] City of Pittsburgh 2025 Operating Budget
  URL: https://www.pittsburghpa.gov/files/assets/city/v/4/omb/documents/operating-budgets/2025-operating-budget.pdf
  ↓  Downloading: https://www.pittsburghpa.gov/files/assets/city/v/4/omb/documents/operating-budgets/2025-operating-budget.pdf
  ✓  Saved: 2025_operating_budget.pdf  (24341 KB)
  ⚙  Converting to Markdown: 2025_operating_budget.pdf
Processing data/raw/B/2025_operating_budget.pdf...


100%|██████████| 354/354 [00:27<00:00, 12.92it/s]

  ✓  Converted in 28.3s  (611,991 chars)
  ✓  Cleaned: 611,991 → 607,809 chars  (1% reduction)
  ✓  Processed saved: data/processed/B/2025_operating_budget.md  (595 KB)

Done: 2/2 succeeded
  ✓  payroll_tax_regulations.md  (44,448 chars)
  ✓  2025_operating_budget.md  (607,809 chars)
Collection B done





# Collection C

The data sources for this collection also include a CSV exported with the Instant Data Scraper Chrome extension.

In [None]:
from scripts.web_scrapers.scrape_events_collection_c import scrape_all as run_events, TARGETS as C_TARGETS
from scripts.web_scrapers.process_recurring_events_csv_c import process_csv as process_recurring

print("Running Collection C events...")
# Reject a missing *or empty* key, matching the falsy check used when the key
# is first loaded.
if not fc_api_key:
    raise RuntimeError("API key missing; set FIRECRAWL_API_KEY or FC_API_KEY")
events_results = run_events(C_TARGETS, api_key=fc_api_key, delay=2.0, skip_existing=False)

# The recurring-events CSV is optional: it is processed only when the file is
# present, otherwise recurring_results stays None.
# NOTE(review): a later cell also calls process_csv on this same file, with a
# different source_url, and its recorded output reports appending a record to
# C_docs.jsonl — running both cells may produce duplicate records; confirm
# which call (and which source_url) is the intended one.
csv_path = Path("data/raw/C/recurring_events.csv")
recurring_results = None
if csv_path.exists():
    recurring_results = process_recurring(csv_path, source_url="https://www.pittsburghmagazine.com/")
else:
    print("recurring_events.csv missing")
print("Collection C done")

Running Collection C events...

[1/31] Pittsburgh Events — March
  → https://pittsburgh.events/march/
  ✓  raw: 37.4 KB  →  processed: 13.1 KB  [pgh_events_march]

[2/31] Pittsburgh Events — April
  → https://pittsburgh.events/april/
  ✓  raw: 37.4 KB  →  processed: 13.1 KB  [pgh_events_april]

[3/31] Pittsburgh Events — May
  → https://pittsburgh.events/may/
  ✓  raw: 37.1 KB  →  processed: 13.1 KB  [pgh_events_may]

[4/31] Pittsburgh Events — June
  → https://pittsburgh.events/june/
  ✓  raw: 38.0 KB  →  processed: 13.3 KB  [pgh_events_june]

[5/31] Pittsburgh Events — July
  → https://pittsburgh.events/july/
  ✓  raw: 32.9 KB  →  processed: 11.6 KB  [pgh_events_july]

[6/31] Pittsburgh Events — August
  → https://pittsburgh.events/august/
  ✓  raw: 32.7 KB  →  processed: 11.2 KB  [pgh_events_august]

[7/31] Pittsburgh Events — September
  → https://pittsburgh.events/september/
  ✓  raw: 33.2 KB  →  processed: 11.4 KB  [pgh_events_september]

[8/31] Pittsburgh Events — October
  → ht

## Process the CSV

In [9]:
from scripts.web_scrapers.process_recurring_events_csv_c import process_csv

# Adjust csv_path below if the CSV file lives somewhere else.
result = process_csv(
    source_url="https://www.pittsburghmagazine.com/best-of-the-burgh-listings/",
    csv_path="data/raw/C/recurring_events.csv",
)
# Show the returned result so the run leaves a record in the cell output.
print(result)

  ↓  Reading: recurring_events.csv
  ✓  Loaded 370 records
  ✓  Markdown → data/processed/C/recurring_events_pittsburgh.md  (145.9 KB)
  ✓  Appended doc record to data/processed/C/C_docs.jsonl
{'records': 370, 'md_path': 'data/processed/C/recurring_events_pittsburgh.md', 'jsonl_path': 'data/processed/C/C_docs.jsonl'}


# Collection D

The data sources include one CSV exported with a Chrome extension.

In [10]:
import runpy

import scripts.web_scrapers.scrape_collection_d as crawl_d

print("Running Collection D crawl...")

# Hand the API key to the crawler by overwriting its module-level settings.
# NOTE(review): presumably crawl_all reads API_KEY / HEADERS when it runs —
# confirm in scrape_collection_d.
crawl_d.API_KEY = fc_api_key
crawl_d.HEADERS = {
    "Authorization": f"Bearer {fc_api_key}",
    "Content-Type": "application/json",
}
crawl_results = crawl_d.crawl_all(crawl_d.CRAWL_TASKS, dry_run=False)

# The restaurants script is executed as a __main__ script, but only when the
# restaurants CSV exists.
restaurants_csv = ROOT / "data/raw/D/visitpittsburgh_restaurants.csv"
if not restaurants_csv.exists():
    print("restaurants CSV missing")
else:
    runpy.run_path(str(ROOT / "scripts/web_scrapers/scrape_restaurants_d.py"), run_name="__main__")
print("Collection D done")

Running Collection D crawl...
Collection D Crawl  —  13 sites

[pghtacofest]  https://www.pghtacofest.com/about
  [started] job_id=019c92d7-391b-75fc-8097-a9ea8396862d
  [poll] status=scraping  1/1 pages
  [poll] status=completed  1/1 pages
  [done] 1 pages fetched, 1 saved  →  data/raw/D/pghtacofest/

[visitpittsburgh]  https://www.visitpittsburgh.com/events-festivals/
  [rate limit] waiting 62s before next request...
  [started] job_id=019c92d8-55f5-7129-95e2-5c72b83a00a1
  [poll] status=scraping  27/40 pages
  [poll] status=scraping  40/40 pages
  [poll] status=scraping  40/40 pages
  [poll] status=scraping  40/40 pages
  [poll] status=completed  40/40 pages
  [done] 40 pages fetched, 40 saved  →  data/raw/D/visitpittsburgh/

[picklesburgh]  https://www.picklesburgh.com/
  [rate limit] waiting 62s before next request...
  [started] job_id=019c92d9-b2de-73f7-8465-be72bce22df9
  [poll] status=scraping  7/7 pages
  [poll] status=completed  7/7 pages
  [done] 7 pages fetched, 7 saved  →

#### Use the cleaner on all Collection D files, because they come from a rough crawler

In [11]:
from scripts.web_scrapers.cleaner_d import clean_all

# Run the Collection D cleaner for real (dry_run=False) with button-line
# stripping enabled, and keep the returned value.
print("Cleaning Collection D markdown...")
clean_stats = clean_all(dry_run=False, strip_button_lines=True)
print("Clean done")

Cleaning Collection D markdown...
Collection D — Markdown Cleaning
Files: 270
  ok   bananasplitfest/bananasplitfest.com.md                         17863 →    3551 chars  (-80.1%)
  ok   bananasplitfest/bananasplitfest.com__activities.md              2597 →    2182 chars  (-16.0%)
  ok   bananasplitfest/bananasplitfest.com__activities__crafts-games-activities.md     187 →      95 chars  (-49.2%)
  ok   bananasplitfest/bananasplitfest.com__activities__entertainment.md     335 →     253 chars  (-24.5%)
  ok   bananasplitfest/bananasplitfest.com__activities__food.md        1255 →     453 chars  (-63.9%)
  ok   bananasplitfest/bananasplitfest.com__activities__over-21-area.md     189 →     108 chars  (-42.9%)
  ok   bananasplitfest/bananasplitfest.com__activities__participating-vendors.md     175 →      85 chars  (-51.4%)
  ok   bananasplitfest/bananasplitfest.com__events.md                  2771 →    2366 chars  (-14.6%)
  ok   bananasplitfest/bananasplitfest.com__events__5k-banana-run.md 