diff --git a/hypernode/downloader/main.py b/hypernode/downloader/main.py
index 08edc082..28f10309 100644
--- a/hypernode/downloader/main.py
+++ b/hypernode/downloader/main.py
@@ -4,11 +4,12 @@
 from pathlib import Path
 from posixpath import basename
 from textwrap import dedent
-from typing import List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
 from urllib.parse import urlparse
 
 import mdformat
 import requests
+import yaml
 from bs4 import BeautifulSoup
 from markdownify import markdownify as md
 from slugify import slugify
@@ -150,6 +151,57 @@ def fetch_document(url: str) -> BeautifulSoup:
     return BeautifulSoup(response.content, "html.parser")
 
 
+def get_document_metadata(document: BeautifulSoup) -> Dict[str, str]:
+    """Collect allow-listed ``<meta>`` tag values from a parsed HTML document.
+
+    ``<meta>`` elements with a missing or empty ``content`` attribute are
+    skipped. An element whose ``name`` is allow-listed is stored under that
+    name; otherwise an element whose ``property`` is allow-listed is stored
+    under the key ``property=<property>``.
+
+    Returns a dict mapping each metadata key to the element's ``content``.
+    """
+    result = {}
+
+    # Keep these as tuples: against a bare string, ``in`` is a substring test
+    # and would also accept names such as "script", "desc" or "".
+    ALLOWED_META_NAMES = ("description",)
+    ALLOWED_META_PROPS = ()
+
+    for meta_element in document.find_all("meta"):
+        if not meta_element.has_attr("content") or not meta_element["content"]:
+            continue
+
+        if meta_element.has_attr("name") and meta_element["name"] in ALLOWED_META_NAMES:
+            result[meta_element["name"]] = meta_element["content"]
+        elif (
+            meta_element.has_attr("property")
+            and meta_element["property"] in ALLOWED_META_PROPS
+        ):
+            result["property={}".format(meta_element["property"])] = meta_element[
+                "content"
+            ]
+
+    return result
+
+
+def get_metadata_frontmatter(document: BeautifulSoup) -> str:
+    """Build the YAML front matter block for a converted document.
+
+    Metadata returned by ``get_document_metadata`` is nested under
+    ``myst.html_meta``; when there is none, the block contains only an
+    empty line between the ``---`` delimiters.
+    """
+    frontmatter_yaml = "\n"
+
+    metadata = get_document_metadata(document)
+    if metadata:
+        frontmatter = {"myst": {"html_meta": metadata}}
+        frontmatter_yaml = yaml.dump(frontmatter, default_flow_style=False)
+
+    return "---\n{}---".format(frontmatter_yaml)
+
+
 def convert_document(
     document: BeautifulSoup, url: str, output_dir: Optional[Path] = None
 ) -> Tuple[Path, str]:
@@ -177,7 +229,8 @@ def convert_document(
     document_source_comment = f""
 
     document_contents = (
-        f"{document_source_comment}\n"
+        get_metadata_frontmatter(document) + "\n\n"
+        f"{document_source_comment}\n\n"
         f"# {article_heading}\n"
         f"{article_body_markdown}"
     )
diff --git a/requirements/base.txt b/requirements/base.txt
index cb71a201..91014eca 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -1,3 +1,4 @@
+# docs/ requirements
 sphinx==5.3.0
 sphinx_rtd_theme==1.1.1
 myst-parser==0.18.1
@@ -7,6 +8,8 @@ mdformat==0.7.16
 mdformat-myst==0.1.5
 mdformat-frontmatter==0.4.1
 
+# hypernode/ requirements
 beautifulsoup4==4.11.1
 markdownify==0.11.2
 python-slugify==6.1.2
+pyyaml==6.0