In [3]:
!pip install -q lxml

In [1]:
!pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting beautifulsoup4 (from bs4)
  Downloading beautifulsoup4-4.14.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>=1.6.1 (from beautifulsoup4->bs4)
  Downloading soupsieve-2.8.3-py3-none-any.whl.metadata (4.6 kB)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Downloading beautifulsoup4-4.14.3-py3-none-any.whl (107 kB)
Downloading soupsieve-2.8.3-py3-none-any.whl (37 kB)
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.14.3 bs4-0.0.2 soupsieve-2.8.3


In [2]:
# Notebook cell: Convert all Wikipedia .htm/.html files in a folder to cleaned Markdown files
# (Improved: drop tail sections, remove link titles, extra wiki junk removal)

import json
import re
from datetime import datetime, timezone
from pathlib import Path

# ---- 0) Config: set your input folder here ----
# Folder holding the saved Wikipedia pages; the cleaned Markdown is written
# to a subfolder of it, which is created up front if it does not exist yet.
INPUT_DIR = Path("baseline_data")
OUTPUT_DIR = INPUT_DIR.joinpath("wikipedia_md_cleaned")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# ---- 1) Dependencies (install if missing) ----
try:
    from bs4 import BeautifulSoup
except ImportError:
    !pip -q install beautifulsoup4 lxml
    from bs4 import BeautifulSoup

try:
    from markdownify import markdownify as md
except ImportError:
    !pip -q install markdownify
    from markdownify import markdownify as md


# ---- 2) Wikipedia-specific cleaning helpers ----
# CSS selectors of elements that are deleted from the page before it is
# converted to Markdown, grouped by the kind of content they match.
_CHROME_SELECTORS = (
    "header", "footer", "nav",
    "#mw-head", "#mw-panel", "#siteSub", "#contentSub",
    ".mw-editsection", ".mw-editsection-bracket",
    "#toc", ".toc", ".vector-toc",
    ".mw-jump-link", ".mw-portlet", ".vector-column-start",
    ".vector-page-toolbar", ".vector-page-titlebar",
    ".noprint", ".mw-footer", ".printfooter",
)
_CITATION_SELECTORS = (
    "sup.reference", ".reference", ".reflist", "ol.references",
    ".hatnote", ".dablink", ".shortdescription", ".ambox",
)
_BOX_SELECTORS = (
    "table.infobox", "table.vertical-navbox", "table.navbox",
    ".navbox", ".vertical-navbox", ".sidebar",
)
_MEDIA_SELECTORS = ("figure", "img", ".thumb", ".gallery", ".mw-file-element")
_MATH_CODE_SELECTORS = ("math", "code", "pre")

WIKI_REMOVE_SELECTORS = [
    *_CHROME_SELECTORS,        # navigation / chrome
    *_CITATION_SELECTORS,      # references / citations / hatnotes
    *_BOX_SELECTORS,           # infobox + navboxes + sidebars
    *_MEDIA_SELECTORS,         # media / images
    *_MATH_CODE_SELECTORS,     # math/code
]

# Section headings (compared by exact text) at which the Markdown output is cut off.
DROP_TAIL_SECTIONS = set(
    ["See also", "Notes", "References", "Further reading", "External links"]
)

def _squeeze_blank_lines(s: str) -> str:
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()

def _drop_tail_sections_markdown(md_text: str) -> str:
    """Truncate the Markdown at the first heading listed in DROP_TAIL_SECTIONS.

    Only ATX headings of level 2-6 are considered. The matching heading line
    and everything after it are discarded; the kept lines are re-joined with
    "\\n". When no heading matches, all lines are kept.
    """
    lines = md_text.splitlines()
    heading_re = re.compile(r"^(#{2,6})\s+(.*)\s*$")

    cutoff = len(lines)
    for idx, raw_line in enumerate(lines):
        found = heading_re.match(raw_line.strip())
        if found is not None and found.group(2).strip() in DROP_TAIL_SECTIONS:
            cutoff = idx
            break

    return "\n".join(lines[:cutoff])

def _post_clean_markdown(md_text: str) -> str:
    """Remove link/citation leftovers from converted Markdown, then trim it.

    Steps, in order: strip link titles, strip bare numeric citation markers,
    strip links whose text is empty, strip the "Wikimedia Commons has media
    related to ..." line, cut the text at the first tail-section heading, and
    collapse surplus blank lines.

    Args:
        md_text: Markdown text to clean.

    Returns:
        The cleaned Markdown with leading/trailing whitespace removed.
    """
    # Link destination: non-space characters up to the closing ")". A "(...)"
    # group inside the URL is consumed as a unit so that destinations such as
    # .../wiki/Mercury_(planet) are not cut short at their own ")".
    url = r"(?:\([^\s()]*\)|[^\s)])+"
    # Quoted link title; \" is an escaped quote inside the title.
    title = r'"(?:\\"|[^"])*"'

    # Remove link titles: [text](url "title") -> [text](url)
    md_text = re.sub(r"(\[[^\]]+\]\(" + url + r")\s+" + title + r"\)", r"\1)", md_text)

    # Remove leftover citation markers like [1], [23]. The lookahead keeps
    # links whose visible text is a number, e.g. [1901](https://...), intact.
    md_text = re.sub(r"\[\d+\](?!\()", "", md_text)

    # Remove empty link artifacts like [](), including destinations that
    # contain a parenthesised part.
    md_text = re.sub(r"\[\s*\]\((?:\([^()]*\)|[^)])+\)", "", md_text)

    # Remove common Wikimedia Commons template sentence if it appears
    md_text = re.sub(r"^Wikimedia Commons has media related to .*?\.\s*$", "", md_text, flags=re.MULTILINE)

    # Drop tail sections
    md_text = _drop_tail_sections_markdown(md_text)

    return _squeeze_blank_lines(md_text)

def extract_wikipedia_main_html(html: str) -> tuple[str, str, str]:
    """
    Parse a saved Wikipedia page and return its cleaned article body.

    Args:
        html: Full HTML source of the page.

    Returns: (title, source_url, main_html)
        title: text of the <title> element, or "Wikipedia Page" when the
            element is missing or blank.
        source_url: href of <link rel="canonical">, or "" when absent.
        main_html: serialized HTML of the content container after the
            elements matched by WIKI_REMOVE_SELECTORS (and a few extra
            leftovers) have been removed and wiki links made absolute.
    """
    soup = BeautifulSoup(html, "lxml")

    # Title (the fallback also covers a present-but-blank <title>)
    title = (soup.title.get_text(strip=True) if soup.title else "") or "Wikipedia Page"

    # Canonical URL if available
    canonical = soup.find("link", rel="canonical")
    source_url = canonical["href"].strip() if canonical and canonical.get("href") else ""

    # Main content container (Wikipedia); fall back to the whole document
    main = soup.select_one("#mw-content-text") or soup.select_one("main#content") or soup.select_one("#content")
    if not main:
        main = soup.body or soup

    # Remove unwanted elements
    for sel in WIKI_REMOVE_SELECTORS:
        for tag in main.select(sel):
            tag.decompose()

    # Remove residual edit-section spans and citation back-links if present
    for tag in main.select("span.mw-editsection, .mw-cite-backlink"):
        tag.decompose()

    # Convert relative wiki links to absolute
    for a in main.select("a[href]"):
        href = a.get("href", "")
        if href.startswith("/wiki/"):
            a["href"] = "https://en.wikipedia.org" + href
        elif href.startswith("//"):
            # protocol-relative URL
            a["href"] = "https:" + href

    # Some pages keep "Coordinates" in a small span; remove if still present
    for text_node in main.find_all(string=re.compile(r"Coordinates", re.IGNORECASE)):
        # find_all() returned a snapshot, so this node may sit inside a span
        # that an earlier iteration already decomposed. A destroyed node no
        # longer exposes a usable .parent, hence the defensive getattr().
        parent = getattr(text_node, "parent", None)
        if parent is not None and parent.name in {"span", "small"}:
            parent.decompose()

    # Remove tables that sometimes survive (e.g., metadata, authority control)
    for tag in main.select("table.metadata, table.ambox"):
        tag.decompose()

    main_html = str(main)
    return title, source_url, main_html

def wikipedia_html_to_clean_markdown(html: str) -> tuple[str, str, str]:
    """
    Turn the HTML of a Wikipedia page into cleaned Markdown.

    Returns: (title, source_url, markdown_text)
    """
    page_title, page_url, body_html = extract_wikipedia_main_html(html)

    # HTML -> Markdown with ATX ("#") headings and "-" list bullets,
    # followed by the Markdown-level cleanup pass.
    raw_markdown = md(body_html, heading_style="ATX", bullets="-")
    return page_title, page_url, _post_clean_markdown(raw_markdown)


# ---- 3) Batch convert ----
# Every *.htm / *.html file directly inside INPUT_DIR is converted and written
# to OUTPUT_DIR as "<original stem>.md".
html_files = sorted(list(INPUT_DIR.glob("*.htm")) + list(INPUT_DIR.glob("*.html")))
print(f"Found {len(html_files)} HTML files in: {INPUT_DIR}")

# One timestamp shared by all files written in this run
now_iso = datetime.now(timezone.utc).isoformat()

converted = 0
for fp in html_files:
    # Undecodable bytes are dropped rather than aborting the batch
    html = fp.read_text(encoding="utf-8", errors="ignore")
    title, source_url, md_text = wikipedia_html_to_clean_markdown(html)

    # YAML front matter. The title is emitted as a double-quoted scalar
    # (json.dumps output is valid YAML) because page titles may contain ": "
    # or quotes, which would make an unquoted value invalid YAML.
    front_matter = [
        "---",
        f"source_url: {source_url}",
        f"scraped_at: {now_iso}",
        f"title: {json.dumps(title, ensure_ascii=False)}",
        "description: ",
        "---",
        "",
    ]
    out_text = "\n".join(front_matter) + md_text + "\n"

    out_name = fp.stem + ".md"
    out_path = OUTPUT_DIR / out_name
    out_path.write_text(out_text, encoding="utf-8")

    converted += 1

print(f"Converted {converted} files -> {OUTPUT_DIR}")
print("Example output:", (OUTPUT_DIR / (html_files[0].stem + ".md")) if html_files else "N/A")

Found 51 HTML files in: baseline_data
Converted 51 files -> baseline_data/wikipedia_md_cleaned
Example output: baseline_data/wikipedia_md_cleaned/Andrew Carnegie - Wikipedia.md
