---
description: DOM Model with Pydantic and Pandoc Integration
output-file: core.dom.html
title: core.dom
---

This notebook demonstrates a Document Object Model (DOM) that uses Pydantic for static typing and validation, and integrates Pandoc (via pypandoc) for Markdown processing.

In [1]:
# | default_exp dom

In [2]:
# | hide
from nbdev.showdoc import *
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import Markdown, display

InteractiveShell.ast_node_interactivity = "all"

In [3]:
# | export
import base64
import json
import os
import pathlib
import re
from pathlib import Path
from pprint import pprint
from typing import Any, Callable, ClassVar, List, Optional

import markdown
import pypandoc
from pydantic import BaseModel, Field, field_validator

In [4]:
#| export
from ollama import chat, ChatResponse, Client

## Base Element Class

In [5]:
# | export
class Element(BaseModel):
    """Base model for document elements; carries an optional free-text summary."""
    # Optional summary of the element; None until one is provided.
    summary: Optional[str] = None

## Figure Class with Base64 Validation

In [6]:
# | export
class Figure(Element):
    """Figure element whose image payload is stored as a base64 string."""

    rawdata: str = Field(..., description="Base64-encoded image data")

    @field_validator("rawdata")
    @classmethod
    def validate_base64(cls, v):
        """Reject values that are not well-formed base64.

        Args:
            v: Candidate value for ``rawdata``.

        Returns:
            ``v`` unchanged when it decodes cleanly.

        Raises:
            ValueError: If ``v`` contains characters outside the base64
                alphabet or has incorrect padding.
        """
        try:
            # Without validate=True the decoder silently drops characters that
            # are not in the base64 alphabet, so garbage such as "!!!!" would
            # be accepted. Whitespace is stripped first so line-wrapped
            # payloads keep validating as they did before.
            base64.b64decode("".join(v.split()), validate=True)
        except ValueError as exc:  # binascii.Error is a ValueError subclass
            raise ValueError("rawdata must be valid base64") from exc
        return v

## Table Structure: Cell, Column, Row, Table

In [7]:
# | export
class Cell(BaseModel):
    """A single table cell."""
    # Presumably the cell's text content — TODO confirm the intended meaning of `c`.
    c: str


class Column(BaseModel):
    """A column entry holding a list of cells."""
    # Cells that belong to this column entry.
    cells: List[Cell]


class Row(BaseModel):
    """A table row, modelled as a list of column entries."""
    # NOTE(review): each Column itself holds a list of Cell objects, so a row is
    # two levels deep (row -> cols -> cells) — confirm this nesting is intended.
    cols: List[Column]


class Table(Element):
    """Table element: a list of rows plus the optional summary inherited from Element."""
    # Rows of the table, in the order given.
    rows: List[Row]

In [8]:
# | export
def get_text_summary_response(content: str, model:str="gemma3:27b", role:str="user") -> ChatResponse:
    """Ask the chat model for a concise summary of ``content``.

    Args:
        content: Text to summarize; it is embedded verbatim in the prompt.
        model: Model name passed to ``chat``. The default now uses the same
            ``"gemma3:27b"`` tag as every other call in this notebook (it was
            previously spelled ``"gemma3-27b"``).
        role: Role attached to the single message that is sent.

    Returns:
        The ``ChatResponse`` returned by ``chat``.
    """
    prompt = (
        "Please provide a summary of the following string. "
        f"The summary should be concise and informative: {content}. "
    )
    return chat(
        model=model,
        messages=[{"role": role, "content": prompt}],
    )

In [9]:
# | export
# Non-whitespace characters followed by a dot and a known image extension.
# It is used with re.match, so it is anchored at the start of the string only:
# anything after the extension is not checked, and matching is case-sensitive.
image_link_pattern = r'[^\s]+\.(?:jpg|jpeg|png|gif|bmp|webp)'

def get_image_summary_response(image_link: str, model:str="gemma3:27b", role:str="user", lang: str='en') -> ChatResponse:
    """Ask the chat model to summarize the image referenced by ``image_link``.

    Args:
        image_link: Image reference; must match ``image_link_pattern``.
        model: Model name passed to ``chat``.
        role: Role attached to the single message that is sent.
        lang: Prompt language, ``'en'`` or ``'zh'``.

    Returns:
        The ``ChatResponse`` returned by ``chat``.

    Raises:
        ValueError: If ``image_link`` does not match ``image_link_pattern``,
            or if ``lang`` is not one of the supported languages.
    """
    # The link is validated first, so a bad link is reported before a bad language.
    if re.match(image_link_pattern, image_link) is None:
        raise ValueError(f"Invalid image link: {image_link}")

    if lang == 'en':
        prompt = "Please provide a summary of the following image. The summary should be concise and informative about the robot."
    elif lang == 'zh':
        prompt = "请提供以下图像的摘要。关于机器人机械尺寸,运动范围,自由度的说明应简明扼要。"
    else:
        raise ValueError(f"Unsupported language: {lang}")

    message = {
        "role": role,
        "content": prompt,
        'images': [str(image_link)],
    }
    return chat(model=model, messages=[message])


In [10]:
# Sanity-check image_link_pattern against a relative image path.
# NOTE(review): "sammple" in the path below looks like a typo for "sample"; it is
# harmless here because only the regex runs and the file is never opened.
image_link = "../res/siasun_md_sammple_hrsl/SN024002/img/__2.png"
res = re.match(image_link_pattern, image_link)
# re.match returns None on failure, in which case .group(0) raises AttributeError.
res.group(0)
# The model itself is not called in this cell.

'../res/siasun_md_sammple_hrsl/SN024002/img/__2.png'

In [11]:
# Build the path of a sample image relative to the parent of the current working
# directory, so the result depends on where the kernel is running.
# `image_link` is reassigned here and is the value sent to the model in the next call cell.
imagepath = Path(os.getcwd()).parent / 'res/siasun_md_sample_hrsl/SX322002/img/img_11.png'
image_link = str(imagepath)
image_link
# Render the image inline through a Markdown image reference.
display(Markdown(f"![image]({image_link})"))

'/d/devel/rag/ribosome/res/siasun_md_sample_hrsl/SX322002/img/img_11.png'

![image](/d/devel/rag/ribosome/res/siasun_md_sample_hrsl/SX322002/img/img_11.png)

In [12]:
# Check the pattern against a bare relative path. The match object is truthy on
# success; on failure `res` is None and is printed as-is.
res = re.match(image_link_pattern, 'img/small.png')
print(f"Matched image link: {res.group(0)}" if res else res)

Matched image link: img/small.png


In [13]:
# Send the sample image selected above to the chat model with the Chinese prompt.
# NOTE(review): presumably needs a reachable Ollama server with the model pulled — confirm.
response = get_image_summary_response(image_link, model="gemma3:27b", role="user", lang='zh')
# The reply text is read from response.message.content in the following cell.


In [14]:
# Convert the model's Markdown reply to HTML, then hand the HTML to IPython's
# Markdown display object for rendering.
md_text = markdown.markdown(response.message.content)
Markdown(md_text)
# Use print(md_text) instead to inspect the generated HTML source.

<p>好的，根据提供的图像，以下是对机器人的简要摘要：</p>
<p><strong>整体描述：</strong></p>
<p>图像展示了一种工业机器人，很明显是一种六轴机器人。它被放置在一个带有坐标网格的地板上，周围有一些用于参考和校准的工件。</p>
<p><strong>机器人机械尺寸、运动范围、自由度：</strong></p>
<ul>
<li><strong>机械尺寸:</strong> 机器人臂由多个连杆组成，长度不一。从图像大致估计，最大伸展长度大约为2-3米。</li>
<li><strong>自由度:</strong> 机器人具有六个自由度，这意味着它可以在三维空间中灵活地移动和旋转其末端执行器。这包括三个旋转自由度（在基座、肩部和肘部）和三个平移自由度（通过移动底座）。</li>
<li><strong>运动范围：</strong> 机器人能够覆盖一个相当大的工作空间，可以到达工件上方、下方和周围的位置。具体运动范围取决于连杆长度和关节角度限制。</li>
</ul>
<p><strong>图像细节：</strong></p>
<ul>
<li>图像1和3展示机器人手臂伸向工件的位置，以及末端执行器（可能是夹具）的示意图。</li>
<li>图像2展示机器人处于一个更弯曲的状态，可以更好地理解其运动轨迹。</li>
<li>图像中还有“工作”、“定位”等标签，表明了机器人可以用于各种自动化任务，例如装配、焊接、喷涂等。</li>
</ul>
<p>总而言之，这是一个典型的工业机器人，具有灵活性和精度，可以用于各种自动化生产过程。</p>

## DOM Class with pypandoc Integration

In [None]:
# | export
from platform import node


class DOM(BaseModel):
    """Markdown document backed by pandoc's JSON AST.

    The Markdown source is converted to pandoc's JSON AST with pypandoc and
    kept in ``raw_json``. ``walk``, ``textualize`` and ``summary`` each build a
    transformed copy of that AST and store it, serialized, in ``ast_json``;
    ``raw_json`` itself is never modified by them.
    """

    # The content of the Markdown document (a string containing Markdown syntax).
    raw_markdown: Optional[str] = Field(None, description="Raw Markdown content")
    # Pandoc JSON AST of ``raw_markdown`` exactly as returned by pypandoc.
    raw_json: Optional[str] = Field(None, description="Raw JSON content")
    # JSON AST produced by the most recent walk()/textualize()/summary() call.
    ast_json: Optional[str] = Field(None, description="JSON representation of the Markdown AST")

    # Node-type name sets. They are annotated as ClassVar so that pydantic
    # treats them as class constants rather than per-instance model fields.
    TextBlock_Types: ClassVar[set[str]] = {
        "Plain",
        "Para",
        "Figure",
        "LineBlock", "CodeBlock", "RawBlock", "OrderedList", "BulletList", "DefinitionList",
        "Header", "BlockQuote",
        "Table", "TableRow", "TableCell"}

    NonTextBlock_Types: ClassVar[set[str]] = {"HorizontalRule", "Div", "Null"}

    # All block types: text-bearing and non-text blocks together.
    Block_Types: ClassVar[set[str]] = TextBlock_Types | NonTextBlock_Types

    Inline_Types: ClassVar[set[str]] = {
        "Str", "Emph", "Strong", "Strikeout", "Superscript", "Subscript",
        "Decimal", "Period",
        "Link",
        "Image",
        "Code", "Math", "RawInline", "SoftBreak",
        "HardBreak", "Span"
    }

    Element_Types: ClassVar[set[str]] = Block_Types | Inline_Types

    def __init__(self, content: Optional[str] = None, **data):
        """Create a document from Markdown text.

        Args:
            content: Markdown source. When given (and non-empty) it becomes
                ``raw_markdown`` and is converted to ``raw_json``.
            **data: Regular model fields. A ``raw_markdown`` passed here is
                used when ``content`` is not given; ``raw_json`` is derived
                from it unless one was supplied as well.
        """
        super().__init__(**data)
        markdown_text = content or self.raw_markdown
        if markdown_text:
            self.raw_markdown = markdown_text
            # Explicit content always wins over a raw_json passed in data.
            if content or not self.raw_json:
                self.raw_json = pypandoc.convert_text(markdown_text, "json", "md")
        else:
            self.raw_markdown = None
            self.raw_json = None

    def to_markdown(self) -> str | None:
        """Return the raw Markdown source, or None for an empty document."""
        return self.raw_markdown

    def to_html(self) -> str | None:
        """Convert the Markdown source to HTML with pypandoc (None if empty)."""
        if not self.raw_markdown:
            return None
        return pypandoc.convert_text(self.raw_markdown, "html", "md")

    def to_latex(self) -> str | None:
        """Convert the Markdown source to LaTeX with pypandoc (None if empty)."""
        if not self.raw_markdown:
            return None
        return pypandoc.convert_text(self.raw_markdown, "latex", "md")

    def to_json(self) -> str | None:
        """Return the stored pandoc JSON AST of the document (``raw_json``)."""
        return self.raw_json

    def _load_ast(self) -> Any:
        """Parse ``raw_json`` into Python objects.

        Raises:
            ValueError: If the document has no Markdown content.
        """
        if not self.raw_markdown:
            raise ValueError("Markdown content is empty. Cannot walk the AST.")
        if not self.raw_json:
            # raw_markdown may have been assigned after construction.
            self.raw_json = pypandoc.convert_text(self.raw_markdown, "json", "md")
        return json.loads(self.raw_json)

    def walk(self, action: Optional[Callable[[Any], Any]] = None) -> None:
        """Apply ``action`` to the nodes of the AST and store the result in ``ast_json``.

        ``action`` receives a dict or list node and returns its replacement.
        It is called on the root and on every dict or list nested in a list,
        and on every dict held directly as a dict value. A list held directly
        as a dict value is not passed to ``action`` itself; only its members are.

        Args:
            action: Node transformer; defaults to the identity function.

        Raises:
            ValueError: If the document has no Markdown content.
        """
        ast = self._load_ast()
        if action is None:
            action = self.__class__.identity

        def walk_children(items: list) -> list:
            return [
                walk_node(child) if isinstance(child, (dict, list)) else child
                for child in items
            ]

        def walk_node(node):
            node = action(node)
            if isinstance(node, dict):
                # Only existing keys are reassigned, which is safe while iterating.
                for key, value in node.items():
                    if isinstance(value, list):
                        node[key] = walk_children(value)
                    elif isinstance(value, dict):
                        node[key] = walk_node(value)
            elif isinstance(node, list):
                node = walk_children(node)
            return node

        self.ast_json = json.dumps(walk_node(ast), ensure_ascii=False)

    def _annotate(
        self,
        action: Optional[Callable[[Any], Any]],
        *,
        summarize_text: bool,
        model: str,
        lang: str,
        max_chars: int,
    ) -> None:
        """Shared implementation of ``textualize`` and ``summary``.

        Every typed node (a dict with a ``"t"`` key) that carries text receives
        an ``"s"`` key holding the summary of its content. A parent's summary
        is built from the summaries of its children, joined with spaces.
        Untyped containers (the document root, lists, attribute lists) are
        traversed but not annotated, and bare strings found outside text
        nodes (identifiers, classes, link targets) are ignored.
        """
        ast = self._load_ast()
        if action is None:
            action = self.__class__.identity

        # Node types whose "c" is a list ending with the literal text.
        literal_types = {"Code", "CodeBlock", "Math", "RawInline", "RawBlock"}

        def condense(text: str) -> str:
            # Short text is kept verbatim; only long text is sent to the model.
            if not summarize_text or len(text) < max_chars:
                return text
            reply = get_text_summary_response(text, model=model, role="user")
            # Fall back to the original text if the model returns nothing.
            return reply.message.content or text

        def describe_image(node: dict) -> str:
            # Pandoc Image content is [attr, caption inlines, [url, title]];
            # indexing from the end also accepts the older form without attr.
            content = node.get("c")
            if not isinstance(content, list) or len(content) < 2:
                raise ValueError(f"Invalid image node structure: {node}")
            target = content[-1]
            if not isinstance(target, list) or not target:
                raise ValueError(f"Invalid image node structure: {node}")
            image_link = target[0]
            if not isinstance(image_link, str) or not re.match(image_link_pattern, image_link):
                raise ValueError(f"Invalid image link: {image_link}")
            content[-2], caption = visit(content[-2])
            reply = get_image_summary_response(image_link, model=model, role="user", lang=lang)
            description = reply.message.content or ""
            return " ".join(part for part in (caption, description) if part)

        def visit(node):
            """Annotate ``node``; return ``(node, summary_text)``."""
            if not isinstance(node, (dict, list)):
                return node, ""
            node = action(node)
            if isinstance(node, list):
                parts = []
                for index, child in enumerate(node):
                    node[index], text = visit(child)
                    if text:
                        parts.append(text)
                return node, " ".join(parts)
            if not isinstance(node, dict):
                # The action replaced the node with a scalar.
                return node, ""

            kind = node.get("t")
            if kind is None:
                # Untyped mapping such as the document root or the metadata map.
                parts = []
                for key in list(node):
                    node[key], text = visit(node[key])
                    if text:
                        parts.append(text)
                return node, " ".join(parts)

            content = node.get("c")
            if kind == "Image":
                text = describe_image(node)
            elif isinstance(content, str):
                text = content  # e.g. a Str node
            elif kind in literal_types:
                has_text = isinstance(content, list) and content and isinstance(content[-1], str)
                text = content[-1] if has_text else ""
            elif "c" in node:
                node["c"], text = visit(content)
            else:
                text = ""  # content-free node such as Space

            text = condense(text)
            if text:
                node["s"] = text
            return node, text

        ast, _ = visit(ast)
        self.ast_json = json.dumps(ast, ensure_ascii=False)

    def textualize(
        self,
        action: Optional[Callable[[Any], Any]] = None,
        model: str = "gemma3:27b",
        lang: str = "zh",
        max_chars: int = 200,
    ) -> None:
        """Annotate the AST with summaries of both text and images.

        Each typed node that carries text gets an ``"s"`` key. Images are
        described by the vision model; text of at least ``max_chars``
        characters is summarized by the text model, shorter text is kept as
        is. The annotated AST is stored in ``ast_json``.

        Args:
            action: Applied to every dict and list node before it is
                processed; defaults to the identity function.
            model: Model name passed to the summary helpers.
            lang: Prompt language passed to ``get_image_summary_response``.
            max_chars: Length from which text is sent to the model.

        Raises:
            ValueError: If the document is empty, or an Image node is
                malformed or its link does not match ``image_link_pattern``.
        """
        self._annotate(action, summarize_text=True, model=model, lang=lang, max_chars=max_chars)

    def summary(
        self,
        action: Optional[Callable[[Any], Any]] = None,
        model: str = "gemma3:27b",
        lang: str = "zh",
    ) -> None:
        """Annotate the AST, describing images only.

        Works like ``textualize`` except that text is never sent to the
        model: a node's ``"s"`` is its text verbatim, plus the model's
        description for any image it contains. The annotated AST is stored
        in ``ast_json``.

        Args:
            action: Applied to every dict and list node before it is
                processed; defaults to the identity function.
            model: Model name passed to ``get_image_summary_response``.
            lang: Prompt language passed to ``get_image_summary_response``.

        Raises:
            ValueError: If the document is empty, or an Image node is
                malformed or its link does not match ``image_link_pattern``.
        """
        self._annotate(action, summarize_text=False, model=model, lang=lang, max_chars=0)

    @classmethod
    def from_file(cls, filepath: pathlib.Path):
        """Build a document from a UTF-8 encoded Markdown file.

        Args:
            filepath: Path of the Markdown file (``str`` is accepted too).

        Returns:
            A new instance of this class holding the file's content.
        """
        content = pathlib.Path(filepath).read_text(encoding="utf-8")
        return cls(content=content)

    @classmethod
    def identity(cls, obj):
        """Identity function for use in walk."""
        return obj

In [40]:
# Set up a working directory for the DOM test cells below.
# NOTE(review): these imports repeat the ones in the import cell, and shutil is
# not used in any cell visible here.
import os
import shutil
from pathlib import Path
# Switch into the sample-data directory, resolved relative to the PARENT of the
# current working directory. os.chdir changes process-wide state, so the result
# depends on where the kernel is when this cell runs.
# NOTE(review): an earlier cell builds its image path from parent / 'res/siasun_md_sample_hrsl',
# so on a fresh kernel this path may be missing the 'res/' segment — confirm.
cwd = Path(os.getcwd()).parent / "siasun_md_sample_hrsl"
os.chdir(cwd)
# From here on `cwd` is a str, not a Path.
cwd = os.getcwd()
os.listdir(cwd)

# Later cells build their file paths from `cwd`.


['test.html',
 'SR02400401',
 'SR024011',
 'SN024002',
 'test.md',
 'SX322002',
 'test.rst',
 'test_raw.json',
 'test.json',
 'test_ast.json']

In [41]:
# Build a small document and write each representation next to the sample data.
dom = DOM(content="# Test\n\nThis is a test.\n\nThis is **bold**.")
# save the markdown content to a file
md_file = Path(cwd) / "test.md"
md_file.write_text(dom.raw_markdown, encoding="utf-8")
# 'test.md' is a relative path, so this relies on the working directory being `cwd`.
rst = pypandoc.convert_file('test.md', "rst", format="md")
rst_file = Path(cwd) / "test.rst"
rst_file.write_text(rst, encoding="utf-8")

# Pandoc JSON AST of the document.
js = dom.to_json()
js_file = Path(cwd) / "test.json"
js_file.write_text(js, encoding="utf-8")
# walk() with no action uses the identity function and fills dom.ast_json.
dom.walk()
# Write the walked AST and the untouched pandoc JSON side by side for comparison.
(Path(cwd) / "test_ast.json").write_text(dom.ast_json, encoding="utf-8")
(Path(cwd) / "test_raw.json").write_text(dom.raw_json, encoding="utf-8")

42

46

420

487

420

In [33]:
cwd

'/d/devel/rag/ribosome/res/siasun_md_sample_hrsl'

In [42]:
# Load one of the sample manuals from disk and save its pandoc JSON AST next to it.
# `cwd` comes from the directory-setup cell above.
md_file = Path(cwd) / "SX322002/SX322002《新松机器人码垛应用操作手册》-V1.0.md"
dom = DOM.from_file(md_file)
js = dom.to_json()
js_file = Path(cwd) / "SX322002/SX322002《新松机器人码垛应用操作手册》-V1.0.json"
js_file.write_text(js, encoding="utf-8")

37755

In [None]:
# Walk the loaded manual with the default (identity) action, then write the
# walked AST and the original pandoc JSON for comparison.
dom.walk()
(Path(cwd) / "SX322002/SX322002《新松机器人码垛应用操作手册》-V1.0_ast.json").write_text(dom.ast_json, encoding="utf-8")
(Path(cwd) / "SX322002/SX322002《新松机器人码垛应用操作手册》-V1.0_raw.json").write_text(dom.raw_json, encoding="utf-8")

43753

37755

In [None]:

def test_markdown_walk_identity():
    """walk() with the identity action must leave the document unchanged."""
    source = "# Test\n\nThis is a test."
    # Use a fresh document instead of a `dom` left behind by another cell.
    doc = DOM(content=source)
    doc.walk(DOM.identity)
    print(doc.raw_markdown)  # Should print the original content
    assert doc.raw_markdown == source
    # The walked AST must be structurally equal to the AST pandoc produced.
    assert json.loads(doc.ast_json) == json.loads(doc.raw_json)

test_markdown_walk_identity()

## Section Class: Recursive Document Structure

In [52]:
# | export
class Section(BaseModel):
    """Recursive document section: paragraphs, figures, tables and nested sections."""

    summary: Optional[str] = None
    paragraphs: List[str] = Field(default_factory=list)
    figures: List[Figure] = Field(default_factory=list)
    tables: List[Table] = Field(default_factory=list)
    subsections: List["Section"] = Field(default_factory=list)

    def __init__(
        self,
        summary: Optional[str] = None,
        paragraphs: Optional[List[str]] = None,
        figures: Optional[List[Figure]] = None,
        tables: Optional[List[Table]] = None,
        subsections: Optional[list] = None,
    ):
        """Create a section; every argument is optional.

        Args:
            summary: Optional summary of the section.
            paragraphs: Paragraph texts; ``None`` means no paragraphs.
            figures: Figures in the section; ``None`` means none.
            tables: Tables in the section; ``None`` means none.
            subsections: Nested sections, given either as ``Section``
                instances or as dicts of ``Section`` keyword arguments.
        """
        # Recursively build subsections that were provided as dicts.
        subs = [Section(**s) if isinstance(s, dict) else s for s in subsections or []]
        super().__init__(
            summary=summary,
            paragraphs=paragraphs or [],
            figures=figures or [],
            tables=tables or [],
            subsections=subs,
        )

    @classmethod
    def init(cls, md: "DOM"):
        """Placeholder for building a section from a document; returns an empty section.

        The parameter is annotated with ``DOM`` as a string. The previous
        ``Markdown`` annotation named IPython's display class, which is only
        imported in a non-exported cell and so is undefined in the exported module.
        """
        return cls()

    @classmethod
    def update_forward_refs(cls, **localns):
        """No-op kept so existing calls keep working.

        The forward reference to ``Section`` is resolved by the
        ``Section.model_rebuild()`` call that follows the class definition.
        """
        ...


# Resolve the recursive "Section" forward reference used by `subsections`.
Section.model_rebuild()