Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 38 additions & 2 deletions hypernode/downloader/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@
from pathlib import Path
from posixpath import basename
from textwrap import dedent
from typing import List, Optional, Tuple
from typing import Dict, List, Optional, Tuple
from urllib.parse import urlparse

import mdformat
import requests
import yaml
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from slugify import slugify
Expand Down Expand Up @@ -150,6 +151,40 @@ def fetch_document(url: str) -> BeautifulSoup:
return BeautifulSoup(response.content, "html.parser")


def get_document_metadata(document: BeautifulSoup) -> Dict[str, str]:
    """Collect allow-listed ``<meta>`` tags from *document*.

    Every ``<meta>`` element that carries a non-empty ``content`` attribute
    is considered. An element whose ``name`` is allow-listed is stored under
    that name; otherwise an element whose ``property`` is allow-listed is
    stored under the key ``"property=<property>"``.

    Args:
        document: Parsed HTML document to search for ``<meta>`` elements.

    Returns:
        Mapping of metadata key to the element's ``content`` value. The
        mapping is empty when nothing matched; when several elements map to
        the same key, the last one wins.
    """
    # These must be tuples, not bare strings: ``x in "description"`` is a
    # substring test and would also accept names such as "script", "desc"
    # or even the empty string.
    ALLOWED_META_NAMES = ("description",)
    ALLOWED_META_PROPS = ()

    result: Dict[str, str] = {}

    for meta_element in document.find_all("meta"):
        # A meta tag without usable content has nothing worth recording.
        if not meta_element.has_attr("content") or not meta_element["content"]:
            continue

        if meta_element.has_attr("name") and meta_element["name"] in ALLOWED_META_NAMES:
            result[meta_element["name"]] = meta_element["content"]
        elif (
            meta_element.has_attr("property")
            and meta_element["property"] in ALLOWED_META_PROPS
        ):
            # Prefix the key so property-based entries stay distinguishable
            # from name-based ones.
            result["property={}".format(meta_element["property"])] = meta_element[
                "content"
            ]

    return result


def get_metadata_frontmatter(document: BeautifulSoup) -> str:
    """Render the metadata of *document* as a front matter block.

    The metadata returned by ``get_document_metadata`` is nested under
    ``myst`` -> ``html_meta`` and dumped as block-style YAML between two
    ``---`` fences. When there is no metadata, the fences enclose a single
    empty line instead.

    Args:
        document: Parsed HTML document whose metadata should be rendered.

    Returns:
        The front matter text, starting and ending with ``---``.
    """
    html_meta = get_document_metadata(document)

    if not html_meta:
        # Nothing to record: keep the fences but leave the block empty.
        fenced_body = "\n"
    else:
        fenced_body = yaml.dump(
            {"myst": {"html_meta": html_meta}},
            default_flow_style=False,
        )

    return "---\n{}---".format(fenced_body)


def convert_document(
document: BeautifulSoup, url: str, output_dir: Optional[Path] = None
) -> Tuple[Path, str]:
Expand Down Expand Up @@ -177,7 +212,8 @@ def convert_document(

document_source_comment = f"<!-- source: {url} -->"
document_contents = (
f"{document_source_comment}\n"
get_metadata_frontmatter(document) + "\n\n"
f"{document_source_comment}\n\n"
f"# {article_heading}\n"
f"{article_body_markdown}"
)
Expand Down
3 changes: 3 additions & 0 deletions requirements/base.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# docs/ requirements
sphinx==5.3.0
sphinx_rtd_theme==1.1.1
myst-parser==0.18.1
Expand All @@ -7,6 +8,8 @@ mdformat==0.7.16
mdformat-myst==0.1.5
mdformat-frontmatter==0.4.1

# hypernode/ requirements
beautifulsoup4==4.11.1
markdownify==0.11.2
python-slugify==6.1.2
pyyaml==6.0