Improve parsing of composite inline blocks (#66)

* Incorporate math blocks into core token search * improve `find_core_tokens` * Update span_tokens.py * Improve consistency between examples * Retry `math:numref` * Update wealth_dynamics_md.md
executablebooks · Feb 20, 2020 · 7cc2c92 · 7cc2c92
1 parent 1a1cd05
commit 7cc2c92
Show file tree

Hide file tree

Showing 10 changed files with 222 additions and 32 deletions.
diff --git a/docs/examples/wealth_dynamics_md.md b/docs/examples/wealth_dynamics_md.md
@@ -4,6 +4,10 @@
 You can {download}`Download the source file for this page <./wealth_dynamics_md.md>`
 ```
 
+```{contents}
+:depth: 2
+```
+
 In addition to what's in Anaconda, this lecture will need the following
 libraries:
 
@@ -16,7 +20,6 @@ class: hide-output
 
 ## Overview
 
-
 This notebook gives an introduction to wealth distribution dynamics,
 with a focus on
 
@@ -82,7 +85,7 @@ ax.legend()
 plt.show()
 ```
 
-This curve can be understood as follows: if point $x,y$ lies on the
+This curve can be understood as follows: if point $(x,y)$ lies on the
 curve, it means that, collectively, the bottom $(100x)\%$ of the
 population holds $(100y)\%$ of the wealth.
 

diff --git a/docs/using/syntax.md b/docs/using/syntax.md
@@ -318,10 +318,7 @@ label: euler
 ---
 ```
 
-```{todo}
-Figure out why equation referencing didn't work
-```
-Euler's identity, equation {DOESN'T WORKeq}`euler`, was elected one of the
+Euler's identity, equation {math:numref}`euler`, was elected one of the
 most beautiful mathematical formulas.
 
 ## Extra markdown syntax
@@ -363,7 +360,7 @@ header-rows: 1
 
 Math can be called in-line with single `$` characters around your math.
 For example, `$x_{hey}=it+is^{math}$` renders as $x_{hey}=it+is^{math}$.
-This is equivalent to writing
+This is equivalent to writing:
 
 ```
 {math}`x_{hey}=it+is^{math}`

diff --git a/myst_parser/span_tokens.py b/myst_parser/span_tokens.py
@@ -1,14 +1,17 @@
 import re
+from threading import local
 
-from mistletoe import span_token
-from mistletoe.span_token import (
+from mistletoe import span_token, core_tokens
+from mistletoe.span_token import (  # noqa F401
     HTMLSpan,
+    Emphasis,
     EscapeSequence,
     AutoLink,
-    CoreTokens,
-    InlineCode,
+    Image,
     LineBreak,
+    Link,
     RawText,
+    Strong,
 )
 
 """
@@ -17,12 +20,12 @@
 """
 __all__ = (
     "Role",
-    "Math",
     "HTMLSpan",
     "EscapeSequence",
     "AutoLink",
     "Target",
     "CoreTokens",
+    "Math",
     "InlineCode",
     "LineBreak",
     "RawText",
@@ -31,6 +34,27 @@
 # since there is no matching element in docutils
 
 
+# Thread-local store of span-block regex matches, keyed by token class name
+# ("InlineCode", "Math"). It is filled by `find_core_tokens` and drained by
+# the `find` classmethods of `InlineCode` and `Math`.
+# NOTE(review): `value` is assigned only once, at import time. Attributes of a
+# `threading.local` are per-thread, so a thread other than the importing one
+# would presumably not see this dict -- confirm parsing never runs elsewhere.
+_core_matches = local()
+_core_matches.value = {}
+
+
+class CoreTokens(span_token.SpanToken):
+    # Entry point for the combined core-token search; it dispatches each match
+    # returned by `find_core_tokens` to the token class the match names.
+    precedence = 3
+
+    # Calling CoreTokens(match) does not build a CoreTokens object: the class
+    # named by `match.type` is looked up in this module's globals and called
+    # with `match`, and that result is returned. A `match.type` that is not a
+    # module-level name raises KeyError.
+    # NOTE(review): the `# noqa F401` imports (Emphasis, Image, Link, Strong)
+    # are presumably kept so this lookup can find them -- confirm.
+    def __new__(self, match):
+        return globals()[match.type](match)
+
+    # Delegate the search to `find_core_tokens`, passing mistletoe's
+    # `span_token._root_node` as the root argument.
+    @classmethod
+    def find(cls, string):
+        return find_core_tokens(string, span_token._root_node)
+
+
+class InlineCode(span_token.InlineCode):
+    # No search is performed here and `string` is not inspected: the matches
+    # stored under "InlineCode" in `_core_matches.value` (appended there by
+    # `find_core_tokens`) are removed and returned, or an empty list if none
+    # are stored.
+    @classmethod
+    def find(cls, string):
+        return _core_matches.value.pop("InlineCode", [])
+
+
 class Role(span_token.SpanToken):
     """
     Inline role tokens. ("{name}`some code`")
@@ -41,7 +65,6 @@ class Role(span_token.SpanToken):
         re.DOTALL,
     )
     parse_inner = False
-    # precedence = 6  # higher precedence than InlineCode?
 
     def __init__(self, match):
         self.name = match.group(1)
@@ -52,9 +75,16 @@ def __init__(self, match):
 
 
 class Math(span_token.SpanToken):
+    # Inline/display math delimited by `$...$` or `$$...$$`.
+
+    # An opening run of one or two `$` that is not directly preceded by a
+    # backslash (pairs of backslashes are consumed first), then at least one
+    # non-`$` character matched lazily, closed by the same `$` run.
     pattern = re.compile(r"(?<!\\)(?:\\\\)*(\${1,2})([^\$]+?)\1")
+
     parse_inner = False
+    # Group 0 is the whole match, so the `$` delimiters are part of the
+    # content (the AST fixtures show e.g. `content: $a$`).
     parse_group = 0
+    precedence = 2
+
+    # No search is performed here and `string` is not inspected: the matches
+    # stored under "Math" in `_core_matches.value` (appended there by
+    # `find_core_tokens`) are removed and returned, or an empty list if none
+    # are stored.
+    @classmethod
+    def find(cls, string):
+        return _core_matches.value.pop("Math", [])
 
 
 class Target(span_token.SpanToken):
@@ -67,3 +97,84 @@ def __init__(self, match):
         content = match.group(self.parse_group)
         self.children = (RawText(content),)
         self.target = content
+
+
+def find_core_tokens(string, root):
+    # Walk `string` once with a cursor, collecting emphasis (`*` / `_`) and
+    # link/image delimiters, while jumping over InlineCode and Math spans so
+    # their contents are never treated as delimiters.
+    #
+    # string: the text to scan.
+    # root: passed through unchanged to `core_tokens.find_link_image`.
+    #
+    # Returns the `matches` list, which is filled by
+    # `core_tokens.find_link_image` and `core_tokens.process_emphasis`.
+    # Side effect: every InlineCode / Math regex match that is jumped over is
+    # appended to `_core_matches.value[<class name>]`.
+
+    # TODO add speed comparison to original mistletoe implementation
+    matches = []
+    # escaped denotes that the last cursor position had `\`
+    escaped = False
+    # delimiter runs are sequences of `*` or `_`
+    in_delimiter_run = None
+    delimiters = []
+    # in_image denotes that the last unescaped character was `!`
+    in_image = False
+    # start is the index at which the current delimiter run began
+    start = 0
+    i = 0
+
+    # Search for the next InlineCode and Math match at or after `_cursor`;
+    # each entry is (class name, match-or-None).
+    def _advance_block_regexes(_cursor):
+        # TODO Role, etc should probably be added here as well, but add more tests
+        # to test_ast first (particularly with mixed span blocks / *'s)
+        # TODO lazy pattern search?
+        return [
+            ("InlineCode", InlineCode.pattern.search(string, _cursor)),
+            ("Math", Math.pattern.search(string, _cursor)),
+        ]
+
+    next_span_blocks = _advance_block_regexes(i)
+    while i < len(string):
+
+        # look for a span block (one that does not nest any other spans)
+        span_block_found = False
+        for span_name, span_match in next_span_blocks:
+            # only act when the cursor sits exactly on the start of a match
+            if span_match is not None and i == span_match.start():
+                # restart delimiter runs:
+                if in_delimiter_run:
+                    delimiters.append(core_tokens.Delimiter(start, i, string))
+                in_delimiter_run = None
+
+                # stash the match for the token class's `find`, jump the
+                # cursor past the span and search again from the new position
+                _core_matches.value.setdefault(span_name, []).append(span_match)
+                i = span_match.end()
+                next_span_blocks = _advance_block_regexes(i)
+                span_block_found = True
+                break
+        if span_block_found:
+            continue
+
+        c = string[i]
+        # if the cursor position is escaped, record and advance
+        if c == "\\" and not escaped:
+            escaped = True
+            i += 1
+            continue
+        # if the cursor reaches the end of a delimiter run,
+        # record the delimiter and reset
+        if in_delimiter_run is not None and (c != in_delimiter_run or escaped):
+            # when the current character is escaped, the run ended before the
+            # backslash, hence `i - 1`
+            delimiters.append(
+                core_tokens.Delimiter(start, i if not escaped else i - 1, string)
+            )
+            in_delimiter_run = None
+        # if the cursor reaches a new delimiter, start a delimiter run
+        if in_delimiter_run is None and (c in {"*", "_"}) and not escaped:
+            in_delimiter_run = c
+            start = i
+        if not escaped:
+            if c == "[":
+                if not in_image:
+                    delimiters.append(core_tokens.Delimiter(i, i + 1, string))
+                else:
+                    # directly preceded by `!`: the delimiter covers `![`
+                    delimiters.append(core_tokens.Delimiter(i - 1, i + 1, string))
+                    in_image = False
+            elif c == "!":
+                in_image = True
+            elif c == "]":
+                # the helper's return value becomes the new cursor, so the
+                # span-block searches are refreshed from that position
+                i = core_tokens.find_link_image(string, i, delimiters, matches, root)
+                next_span_blocks = _advance_block_regexes(i)
+            elif in_image:
+                in_image = False
+        else:
+            # the escape only applies to this one character
+            escaped = False
+        i += 1
+    # flush a delimiter run that reaches the end of the string
+    if in_delimiter_run:
+        delimiters.append(core_tokens.Delimiter(start, i, string))
+    core_tokens.process_emphasis(string, None, delimiters, matches)
+    return matches
diff --git a/tests/test_syntax/test_ast.py b/tests/test_syntax/test_ast.py
@@ -22,7 +22,17 @@ def ast_renderer():
         ("no_closing", ["$a"]),
         ("internal_emphasis", ["$*a*$"]),
         ("external_emphasis", ["*$a$*"]),
-        ("issue_51", ["`$x_{hey}=it+is^{math}$` renders as $x_{hey}=it+is^{math}$."]),
+        ("multiline", ["$$a", "c", "b$$"]),
+        (
+            "issue_51",
+            [
+                "Math can be called in-line with single `$` characters around math.",
+                "For example, `$x_{hey}=it+is^{math}$` renders $x_{hey}=it+is^{math}$.",
+            ],
+        ),
+        ("in_link_content", ["[$a$](link)"]),
+        ("in_link_target", ["[a]($b$)"]),
+        ("in_image", ["![$a$]($b$)"]),
     ],
 )
 def test_math(name, ast_renderer, data_regression, strings):

diff --git a/tests/test_syntax/test_ast/test_math_in_image_strings12_.yml b/tests/test_syntax/test_ast/test_math_in_image_strings12_.yml
@@ -0,0 +1,14 @@
+children:
+- children:
+  - children:
+    - content: $a$
+      type: Math
+    src: $b$
+    title: ''
+    type: Image
+  range:
+  - 1
+  - 1
+  type: Paragraph
+footnotes: {}
+type: Document
diff --git a/tests/test_syntax/test_ast/test_math_in_link_content_strings10_.yml b/tests/test_syntax/test_ast/test_math_in_link_content_strings10_.yml
@@ -0,0 +1,14 @@
+children:
+- children:
+  - children:
+    - content: $a$
+      type: Math
+    target: link
+    title: ''
+    type: Link
+  range:
+  - 1
+  - 1
+  type: Paragraph
+footnotes: {}
+type: Document
diff --git a/tests/test_syntax/test_ast/test_math_in_link_target_strings11_.yml b/tests/test_syntax/test_ast/test_math_in_link_target_strings11_.yml
@@ -0,0 +1,14 @@
+children:
+- children:
+  - children:
+    - content: a
+      type: RawText
+    target: $b$
+    title: ''
+    type: Link
+  range:
+  - 1
+  - 1
+  type: Paragraph
+footnotes: {}
+type: Document
diff --git a/tests/test_syntax/test_ast/test_math_issue_51_strings8_.yml b/tests/test_syntax/test_ast/test_math_issue_51_strings8_.yml
diff --git a/tests/test_syntax/test_ast/test_math_issue_51_strings9_.yml b/tests/test_syntax/test_ast/test_math_issue_51_strings9_.yml
@@ -0,0 +1,31 @@
+children:
+- children:
+  - content: 'Math can be called in-line with single '
+    type: RawText
+  - children:
+    - content: $
+      type: RawText
+    type: InlineCode
+  - content: ' characters around math.'
+    type: RawText
+  - content: ''
+    soft: true
+    type: LineBreak
+  - content: 'For example, '
+    type: RawText
+  - children:
+    - content: $x_{hey}=it+is^{math}$
+      type: RawText
+    type: InlineCode
+  - content: ' renders '
+    type: RawText
+  - content: $x_{hey}=it+is^{math}$
+    type: Math
+  - content: .
+    type: RawText
+  range:
+  - 1
+  - 2
+  type: Paragraph
+footnotes: {}
+type: Document
diff --git a/tests/test_syntax/test_ast/test_math_multiline_strings8_.yml b/tests/test_syntax/test_ast/test_math_multiline_strings8_.yml
@@ -0,0 +1,14 @@
+children:
+- children:
+  - content: '$$a
+
+      c
+
+      b$$'
+    type: Math
+  range:
+  - 1
+  - 3
+  type: Paragraph
+footnotes: {}
+type: Document