In [None]:
#| default_exp md_hier

# Markdown Parser

> Parse markdown content into a dictionary with hierarchical keys

In [None]:
#| export
import re
from fastcore.utils import *

In [None]:
#| exports
def markdown_to_dict(
    markdown_content: str # The markdown content to parse
    ) -> dict: # The dictionary with hierarchical keys
    """Convert a markdown document with headings into a hierarchical dictionary with keys based on the heading structure

    Every heading found outside a ``` fenced code block produces one entry:

    - the key is the dot-joined path of the heading and its ancestors, where each
      title has everything except ASCII letters, digits and spaces removed;
    - the value is the heading line plus all text up to (not including) the next
      heading of the same or a shallower level, so subsections are included.

    A document without headings yields an empty result. If two headings resolve to
    the same key, the later one replaces the earlier one.
    """
    def clean_heading(text): return re.sub(r'[^A-Za-z0-9 ]+', '', text).strip()

    lines = markdown_content.splitlines()
    headings = []
    in_code_block = False

    # Collect headings with their level and line index
    for idx, line in enumerate(lines):
        # A fence line toggles code-block state and is never a heading itself
        if line.strip().startswith('```'): in_code_block = not in_code_block
        elif not in_code_block:
            # 1-6 leading hashes at column 0; the space after them is optional
            match = re.match(r'^(#{1,6})\s*(.*)', line)
            if match:
                headings.append({'level': len(match.group(1)), 'text': match.group(2).strip(), 'line': idx})

    # A section runs from its heading line to the next heading that is not deeper
    for i, h in enumerate(headings):
        end = next((nxt['line'] for nxt in headings[i + 1:] if nxt['level'] <= h['level']), len(lines))
        h['content'] = '\n'.join(lines[h['line']:end]).strip()

    # Build the keys from a stack of (level, cleaned title) ancestors. Comparing
    # actual levels (rather than slicing by an offset from the first heading's
    # level) keeps the path correct when levels are skipped or when a later
    # heading is shallower than the first one.
    result, stack = {}, []
    for h in headings:
        while stack and stack[-1][0] >= h['level']: stack.pop()
        stack.append((h['level'], clean_heading(h['text'])))
        result['.'.join(name for _, name in stack)] = h['content']
    return dict2obj(result)

For example, given the following markdown content:

In [None]:
# Sample document: two top-level sections with nested subsections and one fenced code block
md_content = """
# User

This is the User section.

## Tokens

Details about tokens.

```python
print('Hello, world!')
```

### Value

The value of tokens.

Some more details.

## Settings

User settings information.

# Admin

Admin section.

## Users

Admin users management.
"""

We can parse it into a dictionary with hierarchical keys:

In [None]:
# Parse the sample document, then show the parsed mapping as the cell output
result = markdown_to_dict(md_content)
result

```json
{ 'Admin': '# Admin\n\nAdmin section.\n\n## Users\n\nAdmin users management.',
  'Admin.Users': '## Users\n\nAdmin users management.',
  'User': '# User\n'
          '\n'
          'This is the User section.\n'
          '\n'
          '## Tokens\n'
          '\n'
          'Details about tokens.\n'
          '\n'
          '```python\n'
          "print('Hello, world!')\n"
          '```\n'
          '\n'
          '### Value\n'
          '\n'
          'The value of tokens.\n'
          '\n'
          'Some more details.\n'
          '\n'
          '## Settings\n'
          '\n'
          'User settings information.',
  'User.Settings': '## Settings\n\nUser settings information.',
  'User.Tokens': '## Tokens\n'
                 '\n'
                 'Details about tokens.\n'
                 '\n'
                 '```python\n'
                 "print('Hello, world!')\n"
                 '```\n'
                 '\n'
                 '### Value\n'
                 '\n'
                 'The value of tokens.\n'
                 '\n'
                 'Some more details.',
  'User.Tokens.Value': '### Value\n'
                       '\n'
                       'The value of tokens.\n'
                       '\n'
                       'Some more details.'}
```

In [None]:
def test_empty_content():
    "A heading with no body maps to just its own heading line"
    parsed = markdown_to_dict("# Empty Heading")
    assert parsed['Empty Heading'] == '# Empty Heading'

def test_special_characters():
    "Punctuation is removed from the key but left untouched in the stored content"
    source = "# Heading *With* Special _Characters_!\nContent under heading."
    key = 'Heading With Special Characters'
    parsed = markdown_to_dict(source)
    assert key in parsed
    assert parsed[key] == '# Heading *With* Special _Characters_!\nContent under heading.'

def test_duplicate_headings():
    "Identical titles at different depths stay distinct because keys are full paths"
    parsed = markdown_to_dict("# Duplicate\n## Duplicate\n### Duplicate\nContent under duplicate headings.")
    for key in ('Duplicate', 'Duplicate.Duplicate', 'Duplicate.Duplicate.Duplicate'):
        assert key in parsed
    deepest = parsed['Duplicate.Duplicate.Duplicate']
    assert deepest == '### Duplicate\nContent under duplicate headings.'

def test_no_content():
    "Headings without body text still get entries; the parent includes the child heading line"
    parsed = markdown_to_dict("# No Content Heading\n## Subheading")
    expected = {
        'No Content Heading': '# No Content Heading\n## Subheading',
        'No Content Heading.Subheading': '## Subheading',
    }
    for key, content in expected.items():
        assert parsed[key] == content

def test_different_levels():
    "A document may open with a deep heading and later switch to a shallower one"
    source = "### Level 3 Heading\nContent at level 3.\n# Level 1 Heading\nContent at level 1."
    parsed = markdown_to_dict(source)
    # Both headings end up as top-level keys
    assert 'Level 3 Heading' in parsed
    assert 'Level 1 Heading' in parsed
    # The deeper section stops where the shallower heading begins
    assert parsed['Level 3 Heading'] == '### Level 3 Heading\nContent at level 3.'
    assert parsed['Level 1 Heading'] == '# Level 1 Heading\nContent at level 1.'

def test_parent_includes_subheadings():
    "Each section's content contains the full text of all its descendants"
    source = "# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content."
    parsed = markdown_to_dict(source)
    expected = {
        'Parent': '# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content.',
        'Parent.Child': '## Child\nChild content.\n### Grandchild\nGrandchild content.',
        'Parent.Child.Grandchild': '### Grandchild\nGrandchild content.',
    }
    for key, content in expected.items():
        assert parsed[key] == content

def test_multiple_level2_siblings():
    "Consecutive headings of the same level each get their own key"
    # The headings have no space after the hashes, and the last one ends with a
    # stray apostrophe that key cleaning removes.
    parsed = markdown_to_dict("##Sib 1\n##Sib 2\n##Sib 3\n##Sib 4\n##Sib 5'")
    for key in ('Sib 1', 'Sib 2', 'Sib 3', 'Sib 4', 'Sib 5'):
        assert key in parsed
    
def test_code_chunks_escaped():
    "A `#` comment inside a fenced code block is section content, not a heading"
    source = "# Parent\nParent content.\n## Child\nChild content.\n```python\n# Code comment\nprint('Hello, world!')\n```"
    parsed = markdown_to_dict(source)
    # No key is created for the comment ...
    assert 'Code comment' not in parsed
    # ... but the comment text is preserved in the enclosing section
    child_section = parsed['Parent.Child']
    assert "# Code comment" in child_section

# Run every markdown_to_dict test defined in this cell; any failing assert stops
# the cell before the confirmation message is printed.
test_empty_content()
test_special_characters()
test_duplicate_headings()
test_no_content()
test_different_levels()
test_parent_includes_subheadings()
test_multiple_level2_siblings()
test_code_chunks_escaped()
print('Tests passed!')

Tests passed!


In [None]:
#| exports
def create_heading_dict(
    text: str # The markdown text to convert
    ) -> dict: # The nested dictionary structure
    """Convert markdown headings into a nested dictionary structure

    Each heading found outside a ``` fenced code block becomes a key whose value is
    a dict of the headings nested beneath it (empty for a leaf). A heading is
    nested under the closest preceding heading that has a strictly smaller level,
    so skipped levels and documents that start below level 1 are handled.
    Titles are the heading line with surrounding `#` characters and whitespace
    stripped. Text without headings yields an empty result.
    """
    headings = []
    in_code_block = False

    # Find headings while skipping code blocks
    for line in text.splitlines():
        line = line.strip()
        if line.startswith('```'): in_code_block = not in_code_block
        elif not in_code_block and line.startswith('#'):
            # Level = length of the leading run of '#'; this does not depend on
            # a space being present or on '#' appearing later in the line.
            level = len(line) - len(line.lstrip('#'))
            title = line.strip('#').strip()
            headings.append((level, title))

    # Build the hierarchy with a stack of (level, dict) pairs. The root sits at
    # level 0 and is never popped because every heading has level >= 1.
    result = {}
    stack = [(0, result)]
    for level, title in headings:
        # Drop entries that cannot be ancestors: same level or deeper
        while stack[-1][0] >= level: stack.pop()
        new_dict = {}
        stack[-1][1][title] = new_dict
        stack.append((level, new_dict))

    return dict2obj(result)

For instance, given the same markdown content as before:

In [None]:
# Only the heading structure is returned; section text is not included
create_heading_dict(md_content)

```json
{'Admin': {'Users': {}}, 'User': {'Settings': {}, 'Tokens': {'Value': {}}}}
```

In [None]:
def test_nested_headings():
    "Deeper headings appear as keys inside their parent's dictionary"
    source = "# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content."
    tree = create_heading_dict(source)
    parent = tree['Parent']
    assert 'Child' in parent
    assert 'Grandchild' in parent['Child']

def test_code_chunks_escaped():
    "A `#` comment inside a fenced code block must not become a heading key"
    source = "# Parent\nParent content.\n## Child\nChild content.\n```python\n# Code comment\nprint('Hello, world!')\n```"
    tree = create_heading_dict(source)
    # As a level-1 line, the comment would show up at the top level if it were parsed
    assert 'Code comment' not in tree
    
# Run each create_heading_dict test once. Previously test_nested_headings was
# called twice and test_code_chunks_escaped was never executed.
test_nested_headings()
test_code_chunks_escaped()
print('Tests passed!')

Tests passed!


## Export -

In [None]:
#|hide
#|eval: false
# Run nbdev's export step for this project
from nbdev.doclinks import nbdev_export
nbdev_export()