In [None]:
%pip install -q requests python-dotenv pymupdf4llm lxml beautifulsoup4 markdownify

In [None]:
import os
from dotenv import load_dotenv

# Load variables from a .env file, if one is found, into the process environment.
load_dotenv()

# FIRECRAWL_API_KEY wins; FC_API_KEY is only consulted when the first name is
# unset or empty.
fc_api_key = os.getenv("FIRECRAWL_API_KEY")
if not fc_api_key:
    fc_api_key = os.getenv("FC_API_KEY")

# Fail fast: later cells hand this key to the scraper modules.
if not fc_api_key:
    raise RuntimeError("Set FIRECRAWL_API_KEY or FC_API_KEY in env/.env")
print("API key loaded")

API key loaded


In [None]:
import sys
from pathlib import Path

# Absolute form of the working directory at the time this cell runs.
ROOT = Path(".").resolve()

# Prepend both helper-script folders to the import search path in one step.
# scripts/web_scrapers ends up first, immediately followed by scripts.
sys.path[:0] = [
    str(ROOT / "scripts/web_scrapers"),
    str(ROOT / "scripts"),
]


In [None]:
from scripts.web_scrapers.scrape_wikipedia import scrape_all as run_wiki, TARGETS as WIKI_TARGETS
from scripts.web_scrapers.scrape_britannica import scrape_all as run_brit, TARGETS as BRIT_TARGETS
from scripts.web_scrapers.scrape_visitpittsburgh import scrape_all as run_visit, TARGETS as VISIT_TARGETS
from scripts.web_scrapers.scrape_cmu import scrape_all as run_cmu, TARGETS as CMU_TARGETS

# Run the four Collection A scrapers one after another (Wikipedia, Britannica,
# VisitPittsburgh, CMU), giving each its own module's TARGETS and the API key
# held in fc_api_key.
# NOTE(review): what each scrape_all returns or writes to disk is defined in the
# scraper modules, not here — check them before relying on the *_results values.
print("Running Collection A web scrapers...")
wiki_results = run_wiki(WIKI_TARGETS, api_key=fc_api_key)
brit_results = run_brit(BRIT_TARGETS, api_key=fc_api_key)
visit_results = run_visit(VISIT_TARGETS, api_key=fc_api_key)
cmu_results = run_cmu(CMU_TARGETS, api_key=fc_api_key)
print("Collection A done")

In [None]:
from scripts.web_scrapers.scrape_pdf_collection_b import process_all as run_pdf, TARGETS as PDF_TARGETS

# Process the Collection B targets with scrape_pdf_collection_b.process_all.
# Unlike the other collection cells, no API key is passed to this call.
# NOTE(review): the shape of pdf_results and any files written are defined in
# that module, not visible here — confirm before use.
print("Running Collection B PDFs...")
pdf_results = run_pdf(PDF_TARGETS)
print("Collection B done")

In [None]:
from scripts.web_scrapers.scrape_events_collection_c import scrape_all as run_events, TARGETS as C_TARGETS
from scripts.web_scrapers.process_recurring_events_csv_c import process_csv as process_recurring

print("Running Collection C events...")
# Scrape every Collection C target. skip_existing=False and delay=2.0 are
# forwarded as-is (delay is presumably in seconds — confirm in the scraper).
events_results = run_events(C_TARGETS, api_key=fc_api_key, delay=2.0, skip_existing=False)

# The recurring-events CSV is optional: when it is absent the result stays None
# and a notice is printed instead.
recurring_results = None
csv_path = Path("data/raw/C/recurring_events.csv")
if not csv_path.exists():
    print("recurring_events.csv missing")
else:
    recurring_results = process_recurring(csv_path, source_url="https://www.pittsburghmagazine.com/")
print("Collection C done")

In [None]:
import runpy
import scripts.web_scrapers.crawl_collection_d as crawl_d

print("Running Collection D crawl...")
# Overwrite the crawl module's API_KEY / HEADERS globals with the key loaded in
# this notebook before starting the crawl.
# NOTE(review): presumably crawl_all reads these globals when sending requests — confirm.
auth_headers = {"Authorization": f"Bearer {fc_api_key}", "Content-Type": "application/json"}
crawl_d.API_KEY = fc_api_key
crawl_d.HEADERS = auth_headers
crawl_results = crawl_d.crawl_all(crawl_d.CRAWL_TASKS, dry_run=False)

# The restaurant scraper is executed as a script (as if run with __main__),
# but only when its input CSV is present.
restaurants_csv = ROOT / "data/raw/D/visitpittsburgh_restaurants.csv"
if not restaurants_csv.exists():
    print("restaurants CSV missing")
else:
    runpy.run_path(str(ROOT / "scripts/web_scrapers/scrape_restaurants_d.py"), run_name="__main__")
print("Collection D done")

In [None]:
from scripts.web_scrapers.cleaner_d import clean_all

# Invoke the Collection D cleaner with strip_button_lines=True and dry_run=False.
# NOTE(review): which files clean_all touches and what clean_stats contains are
# defined in cleaner_d, not visible here — confirm before relying on them.
print("Cleaning Collection D markdown...")
clean_stats = clean_all(strip_button_lines=True, dry_run=False)
print("Clean done")

In [None]:
from bs4 import BeautifulSoup
from markdownify import markdownify as md
import re
from datetime import datetime, timezone

# Folder that is scanned for saved .htm/.html pages; the cleaned Markdown is
# written to a sub-folder of it, which is created here if it does not exist.
INPUT_DIR = Path("baseline_data")
OUTPUT_DIR = INPUT_DIR / "wikipedia_md_cleaned"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# CSS selectors whose matches are removed from the article container before the
# HTML is converted to Markdown (see extract_main). The per-group labels below
# are a reading of the MediaWiki class/id names, not something verified here.
REMOVE_SELECTORS = [
    # generic page chrome
    "header", "footer", "nav",
    "#mw-head", "#mw-panel", "#siteSub", "#contentSub",
    # section "edit" links
    ".mw-editsection", ".mw-editsection-bracket",
    # tables of contents
    "#toc", ".toc", ".vector-toc",
    # skin navigation / toolbars
    ".mw-jump-link", ".mw-portlet", ".vector-column-start",
    ".vector-page-toolbar", ".vector-page-titlebar",
    ".noprint", ".mw-footer", ".printfooter",
    # citations and reference lists
    "sup.reference", ".reference", ".reflist", "ol.references",
    # hatnotes and maintenance banners
    ".hatnote", ".dablink", ".shortdescription", ".ambox",
    # infoboxes, navboxes, sidebars
    "table.infobox", "table.vertical-navbox", "table.navbox",
    ".navbox", ".vertical-navbox", ".sidebar",
    # images and media
    "figure", "img", ".thumb", ".gallery", ".mw-file-element",
    # math and code blocks
    "math", "code", "pre",
]
# A level 2-6 Markdown heading whose text is exactly one of these titles ends
# the document: the heading and everything after it are dropped (see _drop_tail).
DROP_TAIL = {"See also", "References", "External links", "Further reading", "Notes"}

def _squeeze_blank(s: str) -> str:
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()

def _drop_tail(md_text: str) -> str:
    out = []
    for line in md_text.splitlines():
        m = re.match(r"^(#{2,6})\s+(.*)\s*$", line.strip())
        if m and m.group(2).strip() in DROP_TAIL:
            break
        out.append(line)
    return "\n".join(out)

def _post_clean(md_text: str) -> str:
    """Tidy converted Markdown: link titles, citation markers, tail sections.

    Steps, in order:
      1. Drop the quoted title from inline links, so ``[t](url "title")``
         becomes ``[t](url)``.
      2. Remove bracketed numeric citation markers such as ``[12]``, unless
         the bracket is the text part of a link (directly followed by ``(``).
      3. Remove links whose text is empty, e.g. what remains of
         ``[[12]](#cite)`` once step 2 has removed the inner marker.
      4. Remove "Wikimedia Commons has media related to ..." lines.
      5. Cut trailing sections with ``_drop_tail`` and normalise blank lines
         with ``_squeeze_blank``.

    Args:
        md_text: Markdown text to clean.

    Returns:
        The cleaned Markdown string.
    """
    md_text = re.sub(r'(\[[^\]]+\]\([^\s\)]+)\s+"[^"]*"\)', r"\1)", md_text)
    # The negative lookahead keeps links whose visible text is purely numeric,
    # e.g. [1758](https://...). Without it the text was deleted and a stray
    # "(https://...)" was left in the output.
    md_text = re.sub(r"\[\d+\](?!\()", "", md_text)
    md_text = re.sub(r"\[\s*\]\([^)]+\)", "", md_text)
    md_text = re.sub(r"^Wikimedia Commons has media related to .*?\.\s*$", "", md_text, flags=re.MULTILINE)
    md_text = _drop_tail(md_text)
    return _squeeze_blank(md_text)

def extract_main(html_text: str) -> tuple[str, str, str]:
    """Parse a saved page and isolate its cleaned article HTML.

    Args:
        html_text: Full HTML source of the page.

    Returns:
        A ``(title, source_url, main_html)`` tuple: the ``<title>`` text (or
        "Wikipedia Page" when there is none), the canonical link's href (or ""
        when there is none), and the serialised content container after the
        unwanted elements are removed and links are made absolute.
    """
    soup = BeautifulSoup(html_text, "lxml")

    if soup.title:
        title = soup.title.get_text(strip=True)
    else:
        title = "Wikipedia Page"

    canonical = soup.find("link", rel="canonical")
    source_url = ""
    if canonical and canonical.get("href"):
        source_url = canonical["href"].strip()

    # Try the known content containers in order of preference, then fall back
    # to <body>, and finally to the whole document.
    main = None
    for container_selector in ("#mw-content-text", "main#content", "#content"):
        main = soup.select_one(container_selector)
        if main:
            break
    if not main:
        main = soup.body or soup

    # Remove everything matched by the shared selector list, then by three
    # extra selectors that are applied afterwards.
    extra_selectors = ("span.mw-editsection", ".mw-cite-backlink", "table.metadata, table.ambox")
    for selector in (*REMOVE_SELECTORS, *extra_selectors):
        for node in main.select(selector):
            node.decompose()

    # Make site-relative (/wiki/...) and protocol-relative (//...) links absolute.
    absolute_prefixes = (("/wiki/", "https://en.wikipedia.org"), ("//", "https:"))
    for anchor in main.select("a[href]"):
        target = anchor.get("href", "")
        for prefix, base in absolute_prefixes:
            if target.startswith(prefix):
                anchor["href"] = base + target
                break

    return title, source_url, str(main)

def html_to_markdown(html_text: str) -> tuple[str, str, str]:
    """Convert one raw HTML page into ``(title, source_url, markdown)``.

    The article container is extracted with ``extract_main``, converted with
    markdownify (ATX headings, "-" bullets), then tidied by ``_post_clean``.
    """
    page_title, page_url, article_html = extract_main(html_text)
    raw_markdown = md(article_html, heading_style="ATX", bullets="-")
    return page_title, page_url, _post_clean(raw_markdown)

# Collect every .htm / .html file directly inside INPUT_DIR, in sorted path order.
html_files = sorted([*INPUT_DIR.glob("*.htm"), *INPUT_DIR.glob("*.html")])
print(f"Found {len(html_files)} HTML files in {INPUT_DIR}")

# A single UTC timestamp is shared by every file written in this run.
now_iso = datetime.now(timezone.utc).isoformat()
converted = 0
for fp in html_files:
    # errors="ignore" drops undecodable bytes instead of raising.
    title, source_url, md_text = html_to_markdown(
        fp.read_text(encoding="utf-8", errors="ignore")
    )
    # Front-matter header; the empty last entry makes the joined header end
    # with a newline so the body starts on its own line.
    header_lines = [
        "---",
        f"source_url: {source_url}",
        f"scraped_at: {now_iso}",
        f"title: {title}",
        "description: ",
        "---",
        "",
    ]
    # The output is named after the input file's stem, with a .md suffix.
    destination = OUTPUT_DIR / f"{fp.stem}.md"
    destination.write_text("\n".join(header_lines) + md_text + "\n", encoding="utf-8")
    converted += 1

print(f"Converted {converted} files -> {OUTPUT_DIR}")