In [1]:
from bs4 import BeautifulSoup


def parse_html_to_text(html_path: str) -> str:
    """
    Read an HTML file and return clean text with headings preserved.

    The document is parsed with BeautifulSoup's ``html.parser``; ``script``,
    ``style``, ``nav``, ``footer`` and ``header`` subtrees are discarded, and
    the text of every ``h1``/``h2``/``h3``/``p``/``li`` element is emitted in
    document order, one entry per line.

    Each element contributes only the text it directly owns, i.e. text whose
    nearest heading/paragraph/list-item ancestor is that element. This keeps
    nested structures such as ``<li><p>...</p></li>`` or nested lists from
    emitting the same text twice. Whitespace inside an entry is collapsed to
    single spaces, so words separated by inline markup
    (``Hello <b>world</b>``) stay separated instead of being glued together.

    Args:
        html_path: Path to a UTF-8 encoded HTML file.

    Returns:
        The extracted text joined with newlines. Headings are upper-cased and
        surrounded by blank lines so they stand out for later chunking.
        Returns an empty string when no matching element contains text.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    heading_tags = ("h1", "h2", "h3")
    content_tags = [*heading_tags, "p", "li"]

    # 1. Read HTML file
    with open(html_path, "r", encoding="utf-8") as f:
        html = f.read()

    # 2. Parse HTML
    soup = BeautifulSoup(html, "html.parser")

    # 3. Remove junk we never want
    for tag in soup(["script", "style", "nav", "footer", "header"]):
        tag.decompose()

    lines = []

    # 4. Extract headings and paragraphs in order
    for element in soup.find_all(content_tags):
        # Keep only the strings this element owns directly; strings that sit
        # inside a nested content tag are emitted when that tag is visited,
        # so including them here would duplicate them.
        own_strings = [
            string
            for string in element.strings
            if string.find_parent(content_tags) is element
        ]

        # Join the raw strings first and normalise whitespace afterwards:
        # stripping each string individually (get_text(strip=True)) removes
        # the spaces around inline tags and merges adjacent words.
        text = " ".join("".join(own_strings).split())
        if not text:
            continue

        # Mark headings clearly (important for chunking later)
        if element.name in heading_tags:
            lines.append(f"\n{text.upper()}\n")
        else:
            lines.append(text)

    # 5. Join everything into one clean string
    return "\n".join(lines)

In [6]:
# Prints the kernel's current working directory.
# NOTE(review): looks like a leftover interactive check — consider removing it
# before sharing, since its saved output exposes an absolute local path.
pwd

'/Users/abdulrasheed/Desktop/DeepLearning/Tx_Snap_RAG/notebook/ingest'