---
description: DOM Model with Pydantic and Pandoc Integration
output-file: core.dom.html
title: core.dom

This notebook demonstrates a Document Object Model (DOM) using Pydantic for static typing and validation, and integrates Pandoc (via pypandoc) for Markdown processing.
---

In [20]:
# | default_exp dom

In [21]:
# | hide
from nbdev.showdoc import *
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import Markdown, display

# Notebook-only setting: with "all", IPython displays the value of every
# top-level expression statement in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = "all"

In [22]:
# | export
import base64
import json
import os
import pathlib
import re
from collections.abc import Callable
from pathlib import Path
from pprint import pprint
from typing import List, Optional

import markdown
import pypandoc
from pydantic import BaseModel, Field, field_validator

In [23]:
#| export
from ollama import chat, ChatResponse, Client, AsyncClient

## Base Element Class

In [24]:
# | export
# Base model for document elements; Figure and Table in this file derive from it.
class Element(BaseModel):
    # Optional free-text summary attached to the element (None when absent).
    summary: Optional[str] = None

## Figure Class with Base64 Validation

In [25]:
# | export
class Figure(Element):
    """An image element whose payload is carried as a base64 string."""

    rawdata: str = Field(..., description="Base64-encoded image data")

    @field_validator("rawdata")
    @classmethod
    def validate_base64(cls, v):
        """Reject ``rawdata`` that is not well-formed base64.

        ASCII whitespace (e.g. line-wrapped payloads) is tolerated, but any
        other character outside the base64 alphabet is an error. The value is
        returned unchanged on success.

        Raises:
            ValueError: if the payload cannot be decoded strictly.
        """
        # Without validate=True, b64decode silently discards characters that
        # are not in the alphabet, so junk such as "!!!!" would be accepted.
        compact = "".join(v.split())
        try:
            base64.b64decode(compact, validate=True)
        except (ValueError, TypeError) as err:
            raise ValueError("rawdata must be valid base64") from err
        return v

## Table Structure: Cell, Column, Row, Table

In [26]:
# | export
# Table building blocks. As declared below, the nesting is:
# Table -> rows (Row) -> cols (Column) -> cells (Cell).
class Cell(BaseModel):
    # Cell payload as a string.
    c: str


class Column(BaseModel):
    # Cells contained in this column entry.
    cells: List[Cell]


class Row(BaseModel):
    # Column entries that make up this row.
    cols: List[Column]


class Table(Element):
    # Rows of the table; the optional `summary` field is inherited from Element.
    rows: List[Row]

In [27]:
# | export
def get_text_summary_response(content: str, model:str="gemma3-27b", role:str="user", lang: str="zh") -> ChatResponse:
    """
    Ask the chat model for a concise summary of ``content`` (blocking call).

    ``lang`` picks the prompt language. For ``"en"`` the text is embedded as
    given; for ``"zh"`` it is stripped and every whitespace run is collapsed
    to a single space first. Any other value raises ``ValueError``. The prompt
    is sent as one message with the given ``role`` and the resulting
    ``ChatResponse`` is returned.
    """
    if lang == "en":
        prompt = (
            "Please provide a summary of the following string. "
            + f"The summary should be concise and informative: {content}. "
        )
    elif lang == "zh":
        flattened = re.sub(r"\s+", " ", content.strip())
        prompt = "请提供以下字符串的摘要。" + f"摘要应简明扼要且信息丰富: {flattened}. "
    else:
        raise ValueError(f"Unsupported language: {lang}")

    request = {"role": role, "content": prompt}
    return chat(model=model, messages=[request])

In [28]:
# | export
# Accepts a run of non-whitespace characters followed by a known image
# extension. Used with re.match, so it is anchored at the start only.
image_link_pattern = r'[^\s]+\.(?:jpg|jpeg|png|gif|bmp|webp)'

def get_image_summary_response(image_link: str | Path, model:str="gemma3:27b", role:str="user", lang: str='zh') -> ChatResponse:
    """
    Ask the chat model to summarize the image at ``image_link`` (blocking call).

    A ``Path`` is converted to ``str``. The link must match
    ``image_link_pattern`` and ``lang`` must be ``'en'`` or ``'zh'``;
    otherwise ``ValueError`` is raised (the link is checked first). The image
    is attached to a single message with the given ``role`` and the
    ``ChatResponse`` is returned.
    """
    link = str(image_link) if isinstance(image_link, Path) else image_link
    if re.match(image_link_pattern, link) is None:
        # The link does not look like an image file name.
        raise ValueError(f"Invalid image link: {link}")

    if lang == 'en':
        prompt = "Please provide a summary of the following image. The summary should be concise and informative about the robot."
    elif lang == 'zh':
        prompt = "请提供以下图像的摘要。关于机器人机械尺寸,运动范围,自由度的说明应简明扼要。"
    else:
        raise ValueError(f"Unsupported language: {lang}")

    request = {
        "role": role,
        "content": prompt,
        'images': [f"{link}"],
    }
    return chat(model=model, messages=[request])


In [29]:
# | export
async def get_text_summary_response_async(content: str, model:str="gemma3-27b", role:str="user", lang: str="zh") -> ChatResponse:
    """
    Awaitable variant: ask the chat model for a concise summary of ``content``.

    ``lang`` picks the prompt language. For ``"en"`` the text is embedded as
    given; for ``"zh"`` it is stripped and every whitespace run is collapsed
    to a single space first. Any other value raises ``ValueError``. The
    request goes through a fresh ``AsyncClient`` and its ``ChatResponse`` is
    returned.
    """
    if lang == "en":
        prompt = (
            "Please provide a summary of the following string. "
            + f"The summary should be concise and informative: {content}. "
        )
    elif lang == "zh":
        flattened = re.sub(r"\s+", " ", content.strip())
        prompt = "请提供以下字符串的摘要。" + f"摘要应简明扼要且信息丰富: {flattened}. "
    else:
        raise ValueError(f"Unsupported language: {lang}")

    request = {"role": role, "content": prompt}
    return await AsyncClient().chat(model=model, messages=[request])

In [30]:
# | export
# Accepts a run of non-whitespace characters followed by a known image
# extension. Used with re.match, so it is anchored at the start only.
image_link_pattern = r'[^\s]+\.(?:jpg|jpeg|png|gif|bmp|webp)'

async def get_image_summary_response_async(image_link: str | Path, model:str="gemma3:27b", role:str="user", lang: str='zh') -> ChatResponse:
    """
    Awaitable variant: ask the chat model to summarize the image at ``image_link``.

    A ``Path`` is converted to ``str``. The link must match
    ``image_link_pattern`` and ``lang`` must be ``'en'`` or ``'zh'``;
    otherwise ``ValueError`` is raised (the link is checked first). The
    request goes through a fresh ``AsyncClient`` and its ``ChatResponse`` is
    returned.
    """
    link = str(image_link) if isinstance(image_link, Path) else image_link
    if re.match(image_link_pattern, link) is None:
        # The link does not look like an image file name.
        raise ValueError(f"Invalid image link: {link}")

    if lang == 'en':
        prompt = "Please provide a summary of the following image. The summary should be concise and informative about the robot."
    elif lang == 'zh':
        prompt = "请提供以下图像的摘要。关于机器人机械尺寸,运动范围,自由度的说明应简明扼要。"
    else:
        raise ValueError(f"Unsupported language: {lang}")

    request = {
        "role": role,
        "content": prompt,
        'images': [f"{link}"],
    }
    return await AsyncClient().chat(model=model, messages=[request])


In [31]:
# Scratch check: confirm that image_link_pattern matches a relative image path.
# os.getcwd()
image_link = "../res/siasun_md_sammple_hrsl/SN024002/img/__2.png"
# image_link
res = re.match(image_link_pattern, image_link)
# Displays the matched text; raises AttributeError if the pattern did not match.
res.group(0)
# get_image_summary_response("../res/")

'../res/siasun_md_sammple_hrsl/SN024002/img/__2.png'

In [39]:
# Scratch cell: build an absolute path to one sample image (relative to the
# parent of the current working directory) and render it inline.
# imagepath = Path(os.getcwd()).parent / 'res/siasun_md_sample_hrsl/SN024002/img/img_13.png'
imagepath = Path(os.getcwd()).parent / 'res/siasun_md_sample/SN024002/img/img_13.png'
# imagepath = Path(os.getcwd()).parent / 'siasun_md_sample_hrsl/SN024002/img/img_11.png'
# imagepath = Path(os.getcwd()).parent / 'res/siasun_md_sample_hrsl/SX322002/img/img_13.png'
image_link = str(imagepath)
image_link
display(Markdown(f"![image]({image_link})"))

'/d/devel/rag/ribosome/res/siasun_md_sample/SN024002/img/img_13.png'

![image](/d/devel/rag/ribosome/res/siasun_md_sample/SN024002/img/img_13.png)

In [None]:
# Scratch check of image_link_pattern against a short relative path.
# res = re.match(image_link_pattern, 'http://baidu.com/?home/img/small.png')
res = re.match(image_link_pattern, 'img/small.png')
# res = re.match(image_link_pattern, image_link)
# in case of match, print the matched string
if res:
    print(f"Matched image link: {res.group(0)}")
else:
    # re.match returned None, so this prints "None".
    print(res)

Matched image link: img/small.png


In [40]:
# Scratch cell: summarize the sample image with the async helper and convert
# the model's Markdown answer to HTML. `if True:` is a manual on/off switch.
if True: 
    response = await get_image_summary_response_async(image_link, model="gemma3:27b", role="user", lang='zh')
    md_text = markdown.markdown(response.message.content)
    Markdown(md_text)
    # print(md_text)
    # print(response.content)
    # response_txt = await get_text_summary_response_async(md_text,model="gemma3:27b", role="user", lang='zh')
    # md_text = markdown.markdown(response_txt.message.content)
    # Markdown(md_text)


<p>以下是对图像的摘要，着重于机器人机械尺寸、运动范围和自由度：</p>
<p><strong>摘要:</strong></p>
<p>该图像展示了一个二维示意图，描绘了一个简化的笛卡尔机器人结构。</p>
<ul>
<li><strong>机械尺寸:</strong> 从图中的尺寸标注可以看出，机器人的基座呈正方形，边长为 300mm。 垂直运动范围为 200mm。</li>
<li><strong>运动范围:</strong> 机器人可以沿 X、Y 和 Z 三个轴进行直线运动，构成一个矩形工作空间。</li>
<li><strong>自由度:</strong> 该机器人具有三个自由度（DOF），每个轴对应一个自由度，允许其在三维空间中定位和定向工具。</li>
</ul>
<p>请注意，该示意图非常简化，可能缺乏有关关节类型、电机或传感器的详细信息。</p>

## DOM Class with pypandoc Integration

In [None]:
# | export
from platform import node
from typing import Iterator
from queue import LifoQueue
import asyncio

class DOM(BaseModel):
    """Markdown document loaded from disk together with its pandoc JSON AST.

    The constructor reads the Markdown file and converts it to pandoc JSON
    (``raw_json``). ``walk``, ``reorg`` and ``textualize`` each start from
    ``raw_json`` and store their result, serialized as JSON, in ``ast_json``.
    """

    # The content of the Markdown document. This can be a string containing Markdown syntax.
    raw_markdown: Optional[str] = Field(None, description="Raw Markdown content")
    # Pandoc JSON produced from raw_markdown.
    raw_json: Optional[str] = Field(None, description="Raw JSON content")
    # The json representation of the (transformed) Markdown AST.
    ast_json: Optional[str] = Field(None, description="JSON representation of the Markdown AST")
    # Directory of the Markdown file; image targets are resolved against it.
    root_path: Optional[Path] = Field(None, description="root path of the markdown document, required to get access to the images")
    table_count: int = Field(0, description="Number of tables in the Markdown document")
    header_count: int = Field(0, description="Number of headers in the Markdown document")
    section_level: list = Field(default_factory=list, description="List of section levels in the Markdown document")

    TextBlock_Types: set[str] = {
        "Plain",
        "Para",
        "Figure",
        "LineBlock", "CodeBlock", "RawBlock", "OrderedList", "BulletList", "DefinitionList",
        "Header", "BlockQuote",
        "Table", "TableRow", "TableCell"}

    NonTextBlock_Types: set[str] = {"HorizontalRule", "Div", "Null"}

    # All block types: text-bearing and non-text blocks together.
    Block_Types: set[str] = TextBlock_Types | NonTextBlock_Types

    Inline_Types: set[str] = {
        "Str", "Emph", "Strong", "Strikeout", "Superscript", "Subscript",
        "Decimal", "Period",
        "Link",
        "Image",
        "Code", "Math", "RawInline", "SoftBreak",
        "HardBreak", "Span"
    }

    Element_Types: set[str] = Block_Types | Inline_Types

    def __init__(self, md_file_path: Path, **data):
        """Load the Markdown file at ``md_file_path`` and convert it with pandoc.

        ``md_file_path`` may be a ``Path`` or a path string. Extra keyword
        arguments are forwarded to the pydantic constructor. An empty file
        leaves ``raw_markdown`` and ``raw_json`` as ``None``.
        """
        super().__init__(**data)
        md_file_path = Path(md_file_path)
        content = md_file_path.read_text(encoding="utf-8")
        self.root_path = md_file_path.parent
        if content:
            self.raw_markdown = content
            self.raw_json = pypandoc.convert_text(content, "json", "md")
        else:
            self.raw_markdown = None
            self.raw_json = None

    def to_markdown(self) -> str | None:
        """Return the Markdown source, or ``None`` if the file was empty."""
        return self.raw_markdown

    def to_html(self) -> str | None:
        """Return the document converted to HTML by pandoc, or ``None`` if empty."""
        if not self.raw_markdown:
            return None
        return pypandoc.convert_text(self.raw_markdown, "html", "md")

    def to_latex(self) -> str | None:
        """Return the document converted to LaTeX by pandoc, or ``None`` if empty."""
        if not self.raw_markdown:
            return None
        return pypandoc.convert_text(self.raw_markdown, "latex", "md")

    def to_json(self) -> str | None:
        """Return the pandoc JSON computed at construction time (``raw_json``)."""
        return self.raw_json

    def walk(self, action: Optional[Callable] = None) -> None:
        """Apply ``action`` to every dict/list node of the AST, parents first.

        ``action`` receives a node and returns the node to keep; it defaults
        to the identity function. The result is stored in ``ast_json``.

        Raises:
            ValueError: if ``raw_json`` is empty.
        """
        if not self.raw_json:
            raise ValueError("raw_json content is empty. Cannot walk the AST.")
        if action is None:
            action = self.__class__.identity

        def walk_node(node):
            node = action(node)
            if isinstance(node, dict):
                # Snapshot the items: values are replaced while looping.
                for key, value in list(node.items()):
                    if isinstance(value, list):
                        node[key] = [
                            walk_node(child) if isinstance(child, (dict, list)) else child
                            for child in value
                        ]
                    elif isinstance(value, dict):
                        node[key] = walk_node(value)
            elif isinstance(node, list):
                node = [
                    walk_node(child) if isinstance(child, (dict, list)) else child
                    for child in node
                ]
            return node

        ast = walk_node(json.loads(self.raw_json))
        self.ast_json = json.dumps(ast, ensure_ascii=False)

    def reorg(self, action: Optional[Callable] = None) -> None:
        """Turn headers into nested ``Section`` nodes and store the result in ``ast_json``.

        Every ``Header`` found in a list that is the direct value of a dict key
        is rewritten in place to ``{"t": "Section", "c": {"Level", "Header",
        "Content"}}``. The nodes that follow it are moved into ``Content`` until
        a section of the same or a shallower level appears. A subsection must be
        exactly one level deeper than its parent. ``action`` is applied to every
        dict/list node first and defaults to the identity function. Each header
        level is also appended to ``section_level``.

        Raises:
            ValueError: if ``raw_json`` is empty, or a section skips a level
                below its parent (e.g. level 3 directly under level 1).
        """
        if not self.raw_json:
            raise ValueError("raw_json content is empty. Cannot walk the AST.")
        if action is None:
            action = self.__class__.identity

        def header_to_section(header: dict) -> None:
            """Rewrite a pandoc Header node (c = [level, attr, inlines]) as a Section."""
            level, attributes, inlines = header["c"][0], header["c"][1], header["c"][2]
            header["t"] = "Section"
            header["c"] = {
                "Level": level,
                "Header": {
                    "Level": level,
                    "Attributes": attributes,
                    "Header": inlines,
                },
                "Content": [],  # filled by nest_sections
            }
            self.section_level.append(level)

        def nest_sections(items: list) -> list:
            """Move the nodes following each Section into that Section's Content."""
            nested = []
            open_sections = []  # chain of currently open sections, outermost first
            for item in items:
                if isinstance(item, dict) and item.get("t") == "Section":
                    level = item["c"]["Level"]
                    # A section of the same or a shallower level closes the open ones.
                    while open_sections and open_sections[-1]["c"]["Level"] >= level:
                        open_sections.pop()
                    if open_sections:
                        # Deeper levels must be contiguous with their parent.
                        if level != open_sections[-1]["c"]["Level"] + 1:
                            raise ValueError("Unexpected Section level encountered")
                        open_sections[-1]["c"]["Content"].append(item)
                    else:
                        nested.append(item)
                    open_sections.append(item)
                elif open_sections:
                    open_sections[-1]["c"]["Content"].append(item)
                else:
                    nested.append(item)
            return nested

        def reorg_node(node):
            """Apply ``action`` recursively and section-ize list values of dict nodes."""
            node = action(node)
            if isinstance(node, dict):
                # Snapshot the items: values are replaced while looping.
                for key, value in list(node.items()):
                    if isinstance(value, list):
                        children = []
                        for child in value:
                            if isinstance(child, (dict, list)):
                                child = reorg_node(child)
                            if isinstance(child, dict) and child.get("t") == "Header":
                                header_to_section(child)
                            children.append(child)
                        node[key] = nest_sections(children)
                    elif isinstance(value, dict):
                        node[key] = reorg_node(value)
            elif isinstance(node, list):
                node = [
                    reorg_node(child) if isinstance(child, (dict, list)) else child
                    for child in node
                ]
            return node

        ast = reorg_node(json.loads(self.raw_json))
        self.ast_json = json.dumps(ast, ensure_ascii=False)

    async def textualize(self, action: Optional[Callable] = None) -> None:
        """Attach an LLM-generated summary (key ``"s"``) to every dict node of the blocks.

        Only the ``blocks`` list of the pandoc AST is processed; the result is
        stored in ``ast_json`` as ``{"blocks": [...]}``. Image nodes are
        summarized from their alt text, the image itself and their title; all
        other nodes from the summaries of their values. Strings shorter than
        200 characters are used verbatim instead of being sent to the model.
        ``table_count`` and ``header_count`` are incremented and progress is
        printed along the way.

        ``action`` is accepted for signature parity with ``walk``/``reorg``
        but is currently not applied.

        Raises:
            ValueError: if ``raw_json`` is empty, a dict node has no ``"t"``
                key, or an Image node is malformed / has an invalid link.
        """

        async def get_leaf_summary_async(node: str) -> str:
            """Return short strings as-is; otherwise the model's summary (or the input if empty)."""
            if len(node) < 200:
                return node
            response = await get_text_summary_response_async(node, model="gemma3:27b", role="user")
            return response.message.content or node

        async def get_list_summary_async(root: list) -> str:
            """Summarize a list by joining the summaries of its elements.

            Strings and numbers are summarized directly, dicts contribute their
            ``"s"`` value, nested lists are summarized recursively and ``None``
            contributes an empty string.
            """
            list_summary = []
            for n in root:
                if isinstance(n, str):
                    summary = await get_leaf_summary_async(n)
                elif isinstance(n, (int, float)):
                    summary = await get_leaf_summary_async(str(n))
                elif isinstance(n, dict):
                    # Dict nodes were summarized already; a missing/None summary counts as empty.
                    summary = n.get('s') or ''
                elif isinstance(n, list):
                    summary = await get_list_summary_async(n)
                elif n is None:
                    summary = ""
                else:
                    raise ValueError(f"Unsupported element type: {type(n)} in {n}")
                list_summary.append(summary)
            return await get_leaf_summary_async(" ".join(list_summary))

        def inline_text(inlines) -> str:
            """Plain text of a list of pandoc inlines (Str contents, spaces for Space/SoftBreak)."""
            parts = []
            for inline in inlines:
                if not isinstance(inline, dict):
                    continue
                content = inline.get("c")
                if isinstance(content, str):
                    parts.append(content)
                elif inline.get("t") in ("Space", "SoftBreak"):
                    parts.append(" ")
            return "".join(parts).strip()

        async def summarize_image_async(node: dict) -> None:
            """Set ``node['s']`` for an Image node (c = [attr, alt inlines, [target, title]])."""
            try:
                alt_inlines = node['c'][1]
                target, title = node['c'][2][0], node['c'][2][1]
            except (IndexError, KeyError, TypeError) as err:
                raise ValueError(f"Invalid image node structure: {node}") from err

            # An empty alt text is legal Markdown (`![](x.png)`) and yields "".
            caption = inline_text(alt_inlines)
            image_link = str(Path(self.root_path) / target)
            # The helper validates the link and raises ValueError("Invalid image link: ...").
            response = await get_image_summary_response_async(
                image_link, model="gemma3:27b", role="user", lang='zh'
            )
            parts = [caption, response.message.content or "", title]
            response_txt = await get_text_summary_response_async(
                " ".join(part for part in parts if part), model="gemma3:27b", role="user"
            )
            node["s"] = response_txt.message.content or ""
            print(f"Summarize image: {image_link}")

        async def summary_node_async(node: dict | list) -> dict | list:
            """Recursively add ``node['s']`` to every dict node and return the node."""
            if isinstance(node, dict):
                try:
                    t = node["t"]
                except KeyError as err:
                    raise ValueError(f"Node does not have a 't' key: {node}") from err

                if t == "Image":
                    await summarize_image_async(node)
                    return node

                dict_summary = []
                # Snapshot the items: values are replaced while looping.
                for key, value in list(node.items()):
                    if isinstance(value, list):
                        if not value:
                            continue
                        node[key] = [
                            await summary_node_async(child)
                            if isinstance(child, (dict, list))
                            else child
                            for child in value
                        ]
                        dict_summary.append(await get_list_summary_async(node[key]))
                    elif isinstance(value, dict):
                        child = await summary_node_async(value)
                        dict_summary.append(child.get("s") or "")
                    elif value is None or value == "":
                        continue
                    else:
                        dict_summary.append(await get_leaf_summary_async(str(value)))

                if dict_summary:
                    node["s"] = await get_leaf_summary_async(" ".join(dict_summary))
                else:
                    node["s"] = ""

                if t == "Table":
                    print(f"Summarize table: {self.table_count}")
                    self.table_count += 1
                elif t == "Header":
                    print(f"Summarize header: {self.header_count} header depth {node['c'][0]}")
                    self.header_count += 1

            elif isinstance(node, list):
                node = [
                    await summary_node_async(child) if isinstance(child, (dict, list)) else child
                    for child in node
                ]

            return node

        if not self.raw_json:
            raise ValueError("raw_json content is empty. Cannot walk the AST and summarize.")

        blocks = json.loads(self.raw_json).get("blocks", [])
        blocks = await summary_node_async(blocks)
        self.ast_json = json.dumps({"blocks": blocks}, ensure_ascii=False)

    @classmethod
    def identity(cls, obj):
        """Identity function; the default ``action`` for ``walk`` and ``reorg``."""
        return obj

In [None]:
# Scratch cell: load one sample manual into a DOM. The path is resolved
# relative to the parent of the current working directory.
# md_file = Path(os.getcwd()).parent / 'res/siasun_md_sample_hrsl/SX322002/SX322002.md'
md_file = Path(os.getcwd()).parent / 'res/siasun_md_sample_hrsl/SN024002/SN024002.md'
md_file.parent
os.getcwd()

# md_file
# The constructor reads the file and runs the pandoc conversion.
dom = DOM(md_file)

Path('/d/devel/rag/ribosome/res/siasun_md_sample_hrsl/SN024002')

'/d/devel/rag/ribosome/nbs'

In [None]:
import asyncio

# Minimal coroutine used to confirm that top-level `await` works in this notebook.
async def func():
    print("Hello, World!")
    await asyncio.sleep(1)
    return "Hello, World again!"

await func()

Hello, World!


'Hello, World again!'

In [None]:
# Summarize every node of the loaded document; progress is printed per image/table/header.
await dom.textualize()

Summarize image: /d/devel/rag/ribosome/res/siasun_md_sample_hrsl/SN024002/img/20231103-085114_0.png
Summarize header: 0 header depth 3
Summarize table: 0
Summarize image: /d/devel/rag/ribosome/res/siasun_md_sample_hrsl/SN024002/img/img_1.png
Summarize header: 1 header depth 3
Summarize image: /d/devel/rag/ribosome/res/siasun_md_sample_hrsl/SN024002/img/img_2.png
Summarize image: /d/devel/rag/ribosome/res/siasun_md_sample_hrsl/SN024002/img/__3.png
Summarize image: /d/devel/rag/ribosome/res/siasun_md_sample_hrsl/SN024002/img/20241112-164221_4.png
Summarize image: /d/devel/rag/ribosome/res/siasun_md_sample_hrsl/SN024002/img/img_5.png
Summarize image: /d/devel/rag/ribosome/res/siasun_md_sample_hrsl/SN024002/img/img_6.png
Summarize image: /d/devel/rag/ribosome/res/siasun_md_sample_hrsl/SN024002/img/__7.png
Summarize image: /d/devel/rag/ribosome/res/siasun_md_sample_hrsl/SN024002/img/20241112-165517_8.png
Summarize image: /d/devel/rag/ribosome/res/siasun_md_sample_hrsl/SN024002/img/img_9.png

In [None]:
md_file.name
# Output path for the summarized AST: "<stem>_semantics.json" next to the Markdown file.
js_semantics_file = md_file.parent / (str(md_file.stem) + "_semantics.json")
js_semantics_file

In [None]:
# Inspect the JSON produced by the last walk/reorg/textualize call.
dom.ast_json

In [None]:
# Persist the summarized AST as UTF-8 JSON.
js_semantics_file.write_text(dom.ast_json, encoding="utf-8")


In [None]:

# Persist the raw pandoc JSON next to the Markdown source ("<stem>.json").
# `js` is taken from the DOM here so the cell does not depend on a variable
# that is only assigned in a later cell.
js = dom.to_json()
js_file = md_file.with_suffix('.json')
js_file.write_text(js, encoding="utf-8")

In [None]:
# Set up a test case for the DOM class with the identity function:
# remember the working directory and list its contents.
import os
import shutil
from pathlib import Path
# change directory to the script's directory
# cwd = Path(os.getcwd()).parent / "res/siasun_md_sample_hrsl"
# os.chdir(cwd)
cwd = os.getcwd()
os.listdir(cwd)

# os.chdir(Path(cwd).parent / "res/SR02401")


In [None]:
# Round-trip smoke test for DOM with the default identity action.
# DOM is constructed from a file path, so write the sample Markdown first.
md_file = Path(cwd) / "test.md"
md_file.write_text("# Test\n\nThis is a test.\n\nThis is **bold**.", encoding="utf-8")
dom = DOM(md_file)

# Convert the same file to reStructuredText (explicit path, independent of cwd).
rst = pypandoc.convert_file(str(md_file), "rst", format="md")
rst_file = Path(cwd) / "test.rst"
rst_file.write_text(rst, encoding="utf-8")

js = dom.to_json()
js_file = Path(cwd) / "test.json"
js_file.write_text(js, encoding="utf-8")
dom.walk()
# dom.ast_json
(Path(cwd) / "test_ast.json").write_text(dom.ast_json, encoding="utf-8")
(Path(cwd) / "test_raw.json").write_text(dom.raw_json, encoding="utf-8")

In [None]:
# Display the working directory captured earlier.
cwd

In [None]:
# Load a full manual and dump its raw pandoc JSON next to it.
# md_file = Path(cwd) / "SN024002/SN024002《新松SN7B-7-0.90规格参数》A-1.md"
md_file = Path(cwd) / "SX322002/SX322002《新松机器人码垛应用操作手册》-V1.0.md"
# DOM has no `from_file` constructor; the class is built directly from the path.
dom = DOM(md_file)
js = dom.to_json()
js_file = Path(cwd) / "SX322002/SX322002《新松机器人码垛应用操作手册》-V1.0.json"
js_file.write_text(js, encoding="utf-8")

In [None]:
# Identity walk: ast_json becomes a re-serialization of raw_json; write both for comparison.
dom.walk()
(Path(cwd) / "SX322002/SX322002《新松机器人码垛应用操作手册》-V1.0_ast.json").write_text(dom.ast_json, encoding="utf-8")
(Path(cwd) / "SX322002/SX322002《新松机器人码垛应用操作手册》-V1.0_raw.json").write_text(dom.raw_json, encoding="utf-8")

## Section Class: Recursive Document Structure

In [None]:
# | export
class Section(BaseModel):
    """Recursive document section: summary, paragraphs, figures, tables and subsections."""

    summary: Optional[str] = None
    paragraphs: List[str] = Field(default_factory=list)
    figures: List[Figure] = Field(default_factory=list)
    tables: List[Table] = Field(default_factory=list)
    subsections: List["Section"] = Field(default_factory=list)

    def __init__(
        self,
        summary: Optional[str] = None,
        paragraphs: Optional[List[str]] = None,
        figures: Optional[List[Figure]] = None,
        tables: Optional[List[Table]] = None,
        subsections: Optional[List[dict]] = None,
    ):
        """Build a section; ``None`` collections become empty lists.

        Entries of ``subsections`` given as dicts are turned into ``Section``
        instances recursively; other entries are passed through unchanged.
        """
        if subsections is None:
            subs = []
        else:
            subs = [Section(**s) if isinstance(s, dict) else s for s in subsections]
        super().__init__(
            summary=summary,
            paragraphs=paragraphs or [],
            figures=figures or [],
            tables=tables or [],
            subsections=subs,
        )

    @classmethod
    def init(cls, md: "DOM"):
        """Placeholder for building a Section from a document; returns an empty Section.

        The annotation is a string on purpose: the previous ``Markdown`` name is
        only imported in a non-exported notebook cell, so evaluating it in the
        exported module would raise ``NameError`` when the class is created.
        """
        return cls()

    @classmethod
    def update_forward_refs(cls, **localns):
        """Intentional no-op; forward references are resolved by ``model_rebuild()`` below."""
        ...
        #BaseModel.model_rebuild()


# Support for recursive Section references
Section.model_rebuild()