Add support for FrontMatter block tokens (initial YAML block) (#25)

The classic use case for sphinx is:

```markdown
---
orphan: true
---

This is an orphan document, not specified in any toctrees.
```
executablebooks · Feb 13, 2020 · ae6c3c3 · ae6c3c3
1 parent 4f5df00
commit ae6c3c3
Show file tree

Hide file tree

Showing 8 changed files with 121 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -23,6 +23,7 @@ For more information, also see the [CommonMark Spec](https://spec.commonmark.org
 
 ### Block Tokens
 
+- **FrontMatter**: A YAML block at the start of the document enclosed by `---`
 - **HTMLBlock**: Any valid HTML (rendered in HTML output only)
 - **LineComment**: `% this is a comment`
 - **BlockCode**: indented text (4 spaces)

diff --git a/docs/syntax.md b/docs/syntax.md
@@ -25,6 +25,7 @@ For more information, also see the [CommonMark Spec](https://spec.commonmark.org
 
 ### Block Tokens
 
+- **FrontMatter**: A YAML block at the start of the document enclosed by `---`
 - **HTMLBlock**: Any valid HTML (rendered in HTML output only)
 - **LineComment**: `% this is a comment`
 - **BlockCode**: indented text (4 spaces)
@@ -259,6 +260,20 @@ most beautiful mathematical formulas.
 Here is some extra markdown syntax which provides functionality in rST that doesn't
 exist in CommonMark.
 
+### Front Matter
+
+This is a YAML block at the start of the document, enclosed by `---` lines, as used for example in [jekyll](https://jekyllrb.com/docs/front-matter/).
+Sphinx intercepts this data and stores it within the global environment (as discussed [here](https://www.sphinx-doc.org/en/master/usage/restructuredtext/field-lists.html)). A classic use-case is to mark 'orphan' documents,
+i.e. documents that are not included in any toctree.
+
+```markdown
+---
+orphan: true
+---
+
+This is an orphan document, not specified in any toctrees.
+```
+
 ### Comments
 
 You may add comments by putting the `%` character at the beginning of a line. This will

diff --git a/myst_parser/block_tokens.py b/myst_parser/block_tokens.py
@@ -1,9 +1,12 @@
 import re
 
+import yaml
+
 from mistletoe import block_token, span_token
 import mistletoe.block_tokenizer as tokenizer
 
 from mistletoe.block_token import (  # noqa: F401
+    tokenize,
     HTMLBlock,
     ThematicBreak,
     List,
@@ -25,13 +28,70 @@
     "List",
     "Table",
     "Footnote",
+    "FrontMatter",
     "Paragraph",
 ]
 
 # TODO add FieldList block token, see:
 # https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#field-lists
 
 
+class FrontMatter(block_token.BlockToken):
+    """Front matter YAML block.
+
+    Not included in the parsing process, but called by Document.__init__.
+    ``range`` is the consumed (start, end) line span; ``data`` the parsed YAML.
+    """
+
+    def __init__(self, lines):
+        assert lines and lines[0].startswith("---")
+        # TODO raise/report error if closing block not found
+        content_end = end_line = len(lines)  # unclosed: every line is YAML
+        for i, line in enumerate(lines[1:], 1):
+            if line.startswith("---"):
+                content_end, end_line = i, i + 1
+                break
+        self.range = (0, end_line)
+        # strip line endings first so the join does not double every newline
+        body = "\n".join(line.rstrip("\r\n") for line in lines[1:content_end])
+        self.data = yaml.safe_load(body) or {}
+        self.children = []
+
+    @classmethod
+    def start(cls, line):
+        return False  # never matched while tokenizing blocks
+
+    @classmethod
+    def read(cls, lines):
+        raise NotImplementedError()  # built directly from the document lines
+
+
+class Document(block_token.BlockToken):
+    """Document token: the tree root; a leading ``---`` block -> FrontMatter."""
+
+    def __init__(self, lines):
+        """Tokenize ``lines`` (one string or a list of lines) into children."""
+        self.footnotes = {}
+        block_token._root_node = self
+        span_token._root_node = self
+        try:
+            if isinstance(lines, str):
+                lines = lines.splitlines(keepends=True)
+            lines = [ln if ln.endswith("\n") else ln + "\n" for ln in lines]
+            start_line = 0
+            self.children = []
+            # front matter is consumed here, never by the block tokenizer
+            if lines and lines[0].startswith("---"):
+                front_matter = FrontMatter(lines)
+                self.children.append(front_matter)
+                start_line = front_matter.range[1]
+                lines = lines[start_line:]
+            self.children.extend(tokenize(lines, start_line))
+        finally:  # reset the module-level root nodes even if parsing raised
+            span_token._root_node = None
+            block_token._root_node = None
+
+
 class LineComment(block_token.BlockToken):
     """Line comment start with % """
 
@@ -119,6 +179,7 @@ def read(cls, lines):
         while (
             next_line is not None
             and next_line.strip() != ""
+            # TODO transition checks should only be made on 'active' tokens
             and not LineComment.start(next_line)
             and not Heading.start(next_line)
             and not CodeFence.start(next_line)

diff --git a/myst_parser/docutils_renderer.py b/myst_parser/docutils_renderer.py
@@ -96,6 +96,33 @@ def render_document(self, token):
         self.render_children(token)
         return self.document
 
+    def render_front_matter(self, token):
+        """Pass document front matter data on as a ``docinfo`` node.
+
+        For RST, all field lists are captured by
+        ``docutils.parsers.rst.states.Body.field_marker``,
+        then, if one occurs at the start of the document, it is transformed by
+        ``docutils.transforms.frontmatter.DocInfo``, and finally
+        this is intercepted by sphinx and added to the env in
+        ``sphinx.environment.collectors.metadata.MetadataCollector.process_doc``.
+
+        So technically the values should be parsed to AST, but this is redundant,
+        since ``process_doc`` just converts them back to text.
+        """
+        # TODO this data could be used to support default option values for directives
+        docinfo = nodes.docinfo()
+        for name, raw in token.data.items():
+            # only scalars are kept; bool is an int subclass, so flags pass too
+            if isinstance(raw, (str, int, float)):
+                text = str(raw)
+                field = nodes.field()
+                # NOTE(review): ``source`` is set to the value text -- confirm
+                field.source = text
+                field += nodes.field_name(name, "", nodes.Text(name, name))
+                field += nodes.field_body(text, nodes.Text(text, text))
+                docinfo += field
+        self.current_node.append(docinfo)
+
     def render_paragraph(self, token):
         if len(token.children) == 1 and isinstance(
             token.children[0], myst_span_tokens.Target

diff --git a/myst_parser/sphinx_parser.py b/myst_parser/sphinx_parser.py
@@ -1,7 +1,7 @@
 from docutils import frontend, parsers
-from mistletoe import Document
 
 from myst_parser.docutils_renderer import DocutilsRenderer
+from myst_parser.block_tokens import Document
 
 
 class MystParser(parsers.Parser):

diff --git a/tests/sphinx/sourcedirs/basic/orphan.md b/tests/sphinx/sourcedirs/basic/orphan.md
@@ -0,0 +1,5 @@
+---
+orphan: true
+---
+
+This is an orphan document, not specified in any toctrees.
diff --git a/tests/test_docutils_renderer.py b/tests/test_docutils_renderer.py
@@ -5,10 +5,10 @@
 
 import pytest
 
-from mistletoe import Document
 from mistletoe.block_token import tokenize
 from mistletoe.span_token import tokenize_inner
 
+from myst_parser.block_tokens import Document
 from myst_parser.docutils_renderer import DocutilsRenderer
 
 
@@ -279,6 +279,10 @@ def test_footnote(renderer):
 def test_full_run(sphinx_renderer, file_regression):
     string = dedent(
         """\
+        ---
+        a: 1
+        ---
+
         (target)=
         # header 1
         ## sub header 1

diff --git a/tests/test_docutils_renderer/test_full_run.xml b/tests/test_docutils_renderer/test_full_run.xml
@@ -1,4 +1,10 @@
 <document source="">
+    <docinfo>
+        <field>
+            <field_name>
+                a
+            <field_body>
+                1
     <target ids="target" names="target">
     <section ids="header-1" names="header\ 1">
         <title>