In [2]:
from pathlib import Path

# Saved pages are looked up in the current user's Downloads folder.
downloads_dir = Path.home().joinpath("Downloads")

# Materialise the glob results so they can be indexed and displayed below.
html_files = [candidate for candidate in downloads_dir.glob("*.html")]
html_files

[PosixPath('/home/agus/Downloads/typing — Support for type hints — Python 3.11.5 documentation.html'),
 PosixPath('/home/agus/Downloads/aws-autotrain-product-categories _ Inference Endpoints - Hugging Face.html')]

In [3]:
# Convert the first matched page; raises IndexError when no .html file was found.
# NOTE(review): the order of glob() results is presumably filesystem-dependent,
# so "first" may differ between machines — confirm, or select the file by name.
html_path = html_files[0]

In [6]:
import re
from pathlib import Path
from typing import Literal, Protocol, TypeVar, Union

from bs4 import BeautifulSoup, Tag


def process_text(text: str) -> str:
    """Collapse every run of blank (or whitespace-only) lines into one blank line.

    Parameters:
    - text (str): The text to normalise.

    Returns:
    - str: The text with each newline / optional whitespace / newline span
      replaced by exactly two newlines.
    """
    blank_run = re.compile(r"\n\s*\n")
    return blank_run.sub("\n\n", text)


def admonition_style(admonition_type: Literal["seealso", "note"]) -> str:
    """Build the inline CSS used for an admonition box.

    Parameters:
    - admonition_type: "seealso" or "note"; it selects the background colour.

    Returns:
    - str: "property: value" declarations joined with "; ".

    Raises:
    - KeyError: if ``admonition_type`` is not one of the known kinds.
    """
    background = {
        "seealso": "#a2e8dd",
        "note": "#e6d3a3",
    }[admonition_type]

    # Only the background depends on the admonition kind; the box geometry is shared.
    declarations = (
        ("background-color", background),
        ("border-radius", "10px"),
        ("padding", "20px"),
        ("margin-top", "10px"),
        ("margin-bottom", "10px"),
    )

    return "; ".join(f"{prop}: {value}" for prop, value in declarations)


# Protocols for better type hinting
class ConvertibleElement(Protocol):
    """Structural type for the parsed-HTML nodes handled by the ``process_*`` helpers.

    It declares only the members those helpers use, so any object exposing them
    (in practice a BeautifulSoup ``Tag``) can be passed without subclassing.
    ``name`` and ``get_text`` are declared because ``process_titles`` and
    ``process_code`` rely on them.
    """

    # Text content of the node, read by the helpers when building markdown.
    text: str
    # Tag name such as "h2"; process_titles derives the heading level from it.
    name: str

    def find_all(
        self, name_or_tags, recursive: bool = True
    ) -> list[Union[Tag, "ConvertibleElement"]]:
        """Return the nodes matching a tag name or a list of tag names."""
        ...

    def replace_with(self, *args) -> None:
        """Replace this node in its tree with the given content."""
        ...

    def get(self, key: str) -> str:
        """Return the value of attribute ``key``.

        NOTE(review): BeautifulSoup presumably returns None for a missing
        attribute, which this annotation does not express — confirm.
        """
        ...

    def get_text(self) -> str:
        """Return the text content of the node as a string."""
        ...

    def select(self, selector: str) -> list[Union[Tag, "ConvertibleElement"]]:
        """Return the nodes matching a CSS selector."""
        ...


# Type variable, bounded by ConvertibleElement, used to annotate the elements
# that the conversion helpers receive.
T = TypeVar("T", bound=ConvertibleElement)


def open_html(html_path: Union[str, Path]) -> T:
    """Load an HTML file and return the element that holds its first ``<h1>``.

    Parameters:
    - html_path: Location of the UTF-8 encoded HTML file.

    Returns:
    - The parent of the first ``<h1>`` found inside ``<body>``.

    Raises:
    - IndexError: if the body contains no ``<h1>``.
    """
    with Path(html_path).open(mode="r", encoding="utf-8") as handle:
        document = BeautifulSoup(handle, "html.parser")

    # The container of the page title is treated as the main content.
    top_headings = document.body.select("h1")
    return top_headings[0].parent


def process_code(element: T) -> None:
    """Replace Python code-block containers with fenced markdown code blocks.

    Every node matching ``div.highlight-python3`` under ``element`` is replaced,
    in place, by its text wrapped in a "python" fence surrounded by blank lines.
    When the extracted code does not already end with a newline, one is
    appended so that the closing fence sits on its own line; code that already
    ends with a newline (and empty code) is emitted unchanged.

    Parameters:
    - element: Node whose matching descendants are rewritten (mutated in place).
    """
    for block in element.select("div.highlight-python3"):
        code_source = block.get_text()
        # Without this, "x = 1" would be emitted as "x = 1```", i.e. with the
        # closing fence glued to the last line of code.
        if code_source and not code_source.endswith("\n"):
            code_source += "\n"
        block.replace_with(f"\n\n```python\n{code_source}```\n\n")


def process_highlight(element: T) -> None:
    """Convert highlighted Python code blocks to fenced markdown code blocks.

    This used to be a line-for-line copy of ``process_code`` (same selector,
    same replacement). It now delegates to it so the conversion rule lives in
    one place, and is kept only so existing callers keep working.

    NOTE(review): if this was meant to cover a different highlight class than
    ``process_code`` does, that selector still needs to be added — confirm intent.

    Parameters:
    - element: Node whose matching descendants are rewritten (mutated in place).
    """
    process_code(element)


def process_links(element: T) -> None:
    """Convert HTML anchor tags to markdown links.

    Each ``<a>`` found under ``element`` is replaced in place by
    ``[text](href)``. An anchor for which ``get("href")`` yields ``None`` is
    replaced by its text alone, instead of a link whose target is the literal
    string "None".

    Parameters:
    - element: Node whose anchors are rewritten (mutated in place).
    """
    for anchor in element.find_all("a"):
        target = anchor.get("href")
        if target is None:
            # No destination to link to: keep just the visible text.
            anchor.replace_with(anchor.text)
        else:
            anchor.replace_with(f"[{anchor.text}]({target})")


def process_inline_code(element: T) -> None:
    """Wrap inline code in backticks.

    Every node matching ``code.literal`` under ``element`` is replaced in place
    by its text enclosed in single backticks.

    Parameters:
    - element: Node whose inline code is rewritten (mutated in place).
    """
    inline_nodes = element.select("code.literal")
    for node in inline_nodes:
        snippet = node.text
        node.replace_with(f"`{snippet}`")


def process_lists(element: T) -> None:
    """Turn the items of unordered lists into markdown bullets.

    For every ``<ul>`` under ``element``, each ``<li>`` returned by
    ``find_all("li", recursive=False)`` is replaced in place by a "- " bullet
    line holding the item's stripped text.

    Parameters:
    - element: Node whose lists are rewritten (mutated in place).
    """
    for unordered in element.find_all("ul"):
        items = unordered.find_all("li", recursive=False)
        for item in items:
            label = item.text.strip()
            item.replace_with(f"- {label}\n")


def process_strong(element: T) -> None:
    """Render ``<strong>`` tags as markdown bold.

    Each ``<strong>`` under ``element`` is replaced in place by its text
    wrapped in double asterisks.

    Parameters:
    - element: Node whose strong tags are rewritten (mutated in place).
    """
    bold_tags = element.find_all("strong")
    for tag in bold_tags:
        content = tag.text
        tag.replace_with(f"**{content}**")


def process_em(element: T) -> None:
    """Render ``<em>`` tags as markdown emphasis.

    Each ``<em>`` under ``element`` is replaced in place by its text wrapped in
    single asterisks.

    Parameters:
    - element: Node whose emphasis tags are rewritten (mutated in place).
    """
    emphasised = element.find_all("em")
    for tag in emphasised:
        content = tag.text
        tag.replace_with(f"*{content}*")


def process_titles(element: T) -> None:
    """Render ``<h1>``–``<h6>`` tags as markdown headers.

    Each heading under ``element`` is replaced in place by a line of "#"
    characters (as many as the digit in the tag name), a space and the heading
    text, with a blank line before and after.

    Parameters:
    - element: Node whose headings are rewritten (mutated in place).
    """
    heading_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]
    for heading in element.find_all(heading_tags):
        # The second character of "h3" is the nesting depth.
        depth = int(heading.name[1])
        hashes = "#" * depth
        heading.replace_with(f"\n\n{hashes} {heading.text}\n\n")


# Inline-CSS declarations holding the same values that admonition_style()
# builds for the "note" kind.
# NOTE(review): nothing in the visible code reads this module-level dict; it
# looks like a leftover — confirm before removing.
style_dict = {
    "background-color": "#e6d3a3",
    "border-radius": "10px",
    "padding": "20px",
    "margin-top": "10px",
    "margin-bottom": "10px",
}


def process_admonitions(element: T) -> None:
    """Convert HTML admonitions to styled ``<div>`` blocks.

    Every node matching ``div.admonition`` under ``element`` is replaced in
    place by a ``<div style='...'>`` wrapping its stripped text:

    - text starting with "Note" gets the "note" style and a bold "Note" title;
    - text starting with "See also" gets the "seealso" style and a bold
      "See also" title;
    - any other admonition keeps its text and gets an empty style.

    Only the leading title is removed from the text. Later occurrences of
    "Note" / "See also" inside the body (for example "Note that ...") are
    preserved.

    Parameters:
    - element: Node whose admonitions are rewritten (mutated in place).
    """
    # (leading title, admonition_style kind), checked in this order.
    known_titles = (("Note", "note"), ("See also", "seealso"))

    for admonition in element.select("div.admonition"):
        text = admonition.text.strip()
        style = ""
        for title, kind in known_titles:
            if text.startswith(title):
                style = admonition_style(kind)
                # Slice the prefix off rather than str.replace(), which would
                # also delete the title wherever it reappears in the body.
                body = text[len(title):].strip()
                text = f"**{title}**\n\n{body}"
                break
        admonition.replace_with(f"<div style='{style}'>\n\n{text}</div>\n\n")


def html_to_markdown(element: T) -> str:
    """Convert a BeautifulSoup element to Markdown.

    The conversion helpers are applied to ``element`` one after another, each
    rewriting it in place, and the remaining text is then passed through
    ``process_text``.

    Parameters:
    - element: The node to convert; it is modified by the helpers.

    Returns:
    - str: The markdown text.
    """
    # Order matters: it is the order in which the tree is rewritten.
    pipeline = (
        process_code,
        process_highlight,
        process_links,
        process_inline_code,
        process_lists,
        process_strong,
        process_em,
        process_titles,
        process_admonitions,
    )
    for convert in pipeline:
        convert(element)

    return process_text(element.text)


# Parse the selected page and convert its main section to markdown text.
markdown = html_to_markdown(open_html(html_path))


def save_markdown(markdown: str, output_path: Path) -> None:
    """Write markdown text to a UTF-8 encoded file.

    Parameters:
    - markdown (str): The text to write.
    - output_path (Path): Destination file; it is created or overwritten.
    """
    with Path(output_path).open(mode="w", encoding="utf-8") as sink:
        sink.write(markdown)


# Write the result into the Downloads folder, reusing the page's file name
# (without its extension) plus ".md".
save_markdown(markdown, downloads_dir / f"{html_path.stem}.md")

In [8]:
import nbformat as nbf


def markdown_to_notebook(markdown_content: str) -> nbf.notebooknode.NotebookNode:
    """
    Convert markdown content to a Jupyter notebook.

    The text is split on triple backticks. Sections at even positions are the
    prose between fences and become markdown cells; sections at odd positions
    are fenced blocks and become code cells. The remainder of the opening
    fence line (the language tag, e.g. "python") is dropped, so a fence
    without a tag keeps its first line of code. Prose sections that contain
    only whitespace are skipped instead of producing empty markdown cells.

    Parameters:
    - markdown_content (str): The content of the markdown file.

    Returns:
    - nbf.notebooknode.NotebookNode: A Jupyter notebook node.
    """
    # Initialize a new notebook
    notebook = nbf.v4.new_notebook()

    # Even indices are markdown, odd indices are code
    for index, section in enumerate(markdown_content.split("```")):
        if index % 2 == 0:
            prose = section.strip()
            if prose:
                notebook.cells.append(nbf.v4.new_markdown_cell(prose))
        else:
            # Everything up to the first newline is the fence's language tag.
            # Split it off before stripping, otherwise the first code line of
            # an untagged fence would be mistaken for the tag and discarded.
            _language, _, code = section.partition("\n")
            notebook.cells.append(nbf.v4.new_code_cell(code.strip()))

    return notebook


# Convert the markdown content to a Jupyter notebook: fenced blocks become
# code cells and the text between them becomes markdown cells.
notebook = markdown_to_notebook(markdown)


def save_notebook(notebook: nbf.notebooknode.NotebookNode, output_path: Path) -> None:
    """
    Save the notebook to a .ipynb file.

    The file is opened as UTF-8 explicitly, as save_markdown does, so that the
    result does not depend on the platform's default encoding when the
    notebook contains non-ASCII text.

    Parameters:
    - notebook (nbf.notebooknode.NotebookNode): The notebook to save.
    - output_path (Path): The path to save the notebook to.
    """
    with open(output_path, "w", encoding="utf-8") as f:
        nbf.write(notebook, f)


# Write the notebook into the Downloads folder, reusing the page's file name
# (without its extension) plus ".ipynb".
save_notebook(notebook, downloads_dir / f"{html_path.stem}.ipynb")