In [3]:
import re
import json

def parse_markdown_to_json(lines, index=0):
    def parse_list(lines, index):
        list_items = []
        while index < len(lines):
            line = lines[index].strip()
            if re.match(r'- \[\d+\.\]', line):
                item_label = re.search(r'\[(\d+\.)\]', line).group(1)
                item_content = line.split('] ')[1]
                children, index = parse_list(lines, index + 1)
                list_items.append({
                    "label": item_label,
                    "type": "list_item",
                    "content": [item_content],
                    "children": children
                })
            elif re.match(r'- \[[a-z]+\.\]', line):
                item_label = re.search(r'\[([a-z]+\.)\]', line).group(1)
                item_content = line.split('] ')[1]
                list_items.append({
                    "label": item_label,
                    "type": "list_item",
                    "content": [item_content],
                    "children": []
                })
                index += 1
            else:
                break
        return list_items, index

    json_output = {
        "label": "",
        "type": "document",
        "children": []
    }

    while index < len(lines):
        line = lines[index].strip()
        if line.startswith('# '):
            heading_content = line[2:]
            json_output["children"].append({
                "label": "1",
                "type": "heading",
                "content": [heading_content],
                "children": []
            })
            index += 1
        elif re.match(r'- \[\d+\.\]', line):
            list_items, index = parse_list(lines, index)
            json_output["children"][-1]["children"].append({
                "label": "",
                "type": "list",
                "children": list_items
            })
        else:
            json_output["children"][-1]["children"].append({
                "label": "",
                "type": "content",
                "content": [line]
            })
            index += 1

    return json_output

markdown_input = """
# Art. 1 Sachüberschrift, Abs. 1 und 1bis
Gegenstand und Geltungsbereich

- [1.] Dieses Gesetz regelt:
  - [a.] die Veranstaltung, die Aufbereitung, die Übertragung und den Empfang von Radio- und Fernsehprogrammen;
  - [b.] die Fördermassnahmen zugunsten der elektronischen Medien.
  - [abis.] blabla.
- [2.] Es gilt für:
  - [a.] die Veranstalter von Radio- und Fernsehprogrammen;

Einfügen der Art. 76a-76c vor dem Gliederungstitel des 4. Kapitels
"""

lines = markdown_input.strip().split('\n')
json_output = parse_markdown_to_json(lines)
print(json.dumps(json_output, indent=2))

{
  "label": "",
  "type": "document",
  "children": [
    {
      "label": "1",
      "type": "heading",
      "content": [
        "Art. 1 Sach\u00fcberschrift, Abs. 1 und 1bis"
      ],
      "children": [
        {
          "label": "",
          "type": "content",
          "content": [
            "Gegenstand und Geltungsbereich"
          ]
        },
        {
          "label": "",
          "type": "content",
          "content": [
            ""
          ]
        },
        {
          "label": "",
          "type": "list",
          "children": [
            {
              "label": "1.",
              "type": "list_item",
              "content": [
                "Dieses Gesetz regelt:"
              ],
              "children": [
                {
                  "label": "a.",
                  "type": "list_item",
                  "content": [
                    "die Veranstaltung, die Aufbereitung, die \u00dcbertragung und den Empfang von Radio- und Fernsehprog

In [9]:
def parse_markdown_to_json_schema_old(markdown):
    """
    Parses a markdown string into a defined JSON schema structure.
    Args:
        markdown (str): The markdown string to be parsed.
    Returns:
        dict: A JSON schema representation of the markdown content.
            The structure includes:
            - "label": A string label for the node.
            - "type": The type of the node (e.g., "document", "heading", "list", "list_item", "content").
            - "content": A list containing the content of the node.
            - "children": A list of child nodes, each following the same structure.
    """
    lines = markdown.split('\n')
    document = {
        "label": "",
        "type": "document",
        "children": []
    }

    depth = 0
    list_stack = []

    for line in lines:
        # Match list items
        list_item_match = re.match(r'( *)[-*] (\[.*?\] *)?(.*)', line)
        if not list_item_match:
            if list_stack:
                if document["children"]:
                    document["children"][-1]["children"].append(list_stack[0])
                else:
                    document["children"].append(list_stack[0])
                list_stack = []
                depth = 0
        else:
            indent = len(list_item_match.group(1))
            label = list_item_match.group(2).strip('[] ') if list_item_match.group(2) else ""
            content = list_item_match.group(3)

            node = {
                "label": label,
                "type": "list_item",
                "content": [content],
                "children": []
            }

            list_node = {
                "label": "",
                "type": "list",
                "children": []
            }

            if not list_stack:
                list_stack.append(list_node)
            elif indent > depth:
                list_stack[-1]["children"][-1]["children"].append(list_node)
                list_stack.append(list_node)
            elif indent < depth:
                list_stack.pop()

            list_stack[-1]["children"].append(node)
            depth = indent
            continue

        line = line.strip()

        # Match headings
        heading_match = re.match(r'^(#+)\s+(.*)', line)
        if heading_match:
            node = {
                "label": str(len(heading_match.group(1))),
                "type": "heading",
                "content": [heading_match.group(2)],
                "children": []
            }
            document["children"].append(node)
            continue

        # Match content
        if line:
            node = {
                "label": "",
                "type": "content",
                "content": [line],
            }
            if "children" in document["children"][-1] if document["children"] else False:
                document["children"][-1]["children"].append(node)
            else:
                document["children"].append(node)

    return document


markdown_input = """
# Art. 1 Sachüberschrift, Abs. 1 und 1bis
Gegenstand und Geltungsbereich

- [1.] Dieses Gesetz regelt:
  - [a.] die Veranstaltung, die Aufbereitung, die Übertragung und den Empfang von Radio- und Fernsehprogrammen;
  - [b.] die Fördermassnahmen zugunsten der elektronischen Medien.
  - [abis.] blabla.
- [2.] Es gilt für:
  - [a.] die Veranstalter von Radio- und Fernsehprogrammen;

Einfügen der Art. 76a-76c vor dem Gliederungstitel des 4. Kapitels
"""

lines = markdown_input.strip().split('\n')
json_output = parse_markdown_to_json_schema_old(markdown_input)
print(json.dumps(json_output, indent=2))

{
  "label": "",
  "type": "document",
  "children": [
    {
      "label": "1",
      "type": "heading",
      "content": [
        "Art. 1 Sach\u00fcberschrift, Abs. 1 und 1bis"
      ],
      "children": [
        {
          "label": "",
          "type": "content",
          "content": [
            "Gegenstand und Geltungsbereich"
          ]
        },
        {
          "label": "",
          "type": "list",
          "children": [
            {
              "label": "1.",
              "type": "list_item",
              "content": [
                "Dieses Gesetz regelt:"
              ],
              "children": [
                {
                  "label": "",
                  "type": "list",
                  "children": [
                    {
                      "label": "a.",
                      "type": "list_item",
                      "content": [
                        "die Veranstaltung, die Aufbereitung, die \u00dcbertragung und den Empfang von Radio- und