---
description: DOM Model with Pydantic and Pandoc Integration
output-file: core.dom.html
title: core.dom

This notebook demonstrates a Document Object Model (DOM) using Pydantic for static typing and validation, and integrates Pandoc (via pypandoc) for Markdown processing.
---

In [1]:
# | default_exp dom

In [2]:
# | hide
from nbdev.showdoc import *
from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the
# last one. This is why some cells below show several outputs.
InteractiveShell.ast_node_interactivity = "all"

In [36]:
# | export
import base64
import json
import os
import pathlib
import re
from pathlib import Path
from pprint import pprint
from typing import Any, Callable, List, Optional

import pypandoc
from pydantic import BaseModel, Field, field_validator

In [4]:
#| export
from ollama import chat, ChatResponse, Client

## Base Element Class

In [5]:
# | export
class Element(BaseModel):
    """Base model for document elements that can carry a summary."""

    # Optional free-text summary; stays None until one is supplied.
    summary: str | None = None

## Figure Class with Base64 Validation

In [6]:
# | export
class Figure(Element):
    """An element whose payload is an image stored as a base64 string."""

    # Base64 text of the image. It is validated on construction and stored
    # exactly as supplied (it is never decoded into bytes here).
    rawdata: str = Field(..., description="Base64-encoded image data")

    @field_validator("rawdata")
    @classmethod
    def validate_base64(cls, v):
        """Reject ``rawdata`` that is not well-formed base64.

        Args:
            v: The candidate string for ``rawdata``.

        Returns:
            ``v`` unchanged when it decodes cleanly.

        Raises:
            ValueError: If ``v`` contains characters outside the base64
                alphabet, is not ASCII, or has incorrect padding.
        """
        # Line-wrapped payloads are legitimate, so whitespace is ignored for
        # the check only; the stored value keeps its original formatting.
        compact = "".join(v.split())
        try:
            # validate=True matters: without it b64decode silently discards
            # every non-alphabet character, so most garbage strings would
            # decode "successfully" and pass validation.
            base64.b64decode(compact, validate=True)
        except ValueError as exc:  # binascii.Error is a ValueError subclass
            raise ValueError("rawdata must be valid base64") from exc
        return v

## Table Structure: Cell, Column, Row, Table

In [7]:
# | export
class Cell(BaseModel):
    """The innermost unit of the table structure: one string value."""

    # Required string value (presumably the cell's text; confirm with callers).
    c: str


class Column(BaseModel):
    """A group of ``Cell`` objects; ``Row`` holds a list of these."""

    # Required list of cells belonging to this column entry.
    cells: list[Cell]


class Row(BaseModel):
    """One table row, expressed as a list of ``Column`` objects."""

    # Required list of column entries making up the row.
    cols: list[Column]


class Table(Element):
    """A table element: a list of ``Row`` objects plus the inherited summary."""

    # Required list of rows, nested as Table -> Row -> Column -> Cell.
    rows: list[Row]

In [8]:
# | export
def get_text_summary_response(content: str, model:str="gemma3:27b", role:str="user") -> ChatResponse:
    """Ask the chat model for a concise summary of ``content``.

    Args:
        content: Text to summarise; it is interpolated into the prompt as-is.
        model: Model tag passed to ``chat``. The default uses the same
            ``gemma3:27b`` tag as ``get_image_summary_response``; the previous
            ``gemma3-27b`` spelling did not match it.
        role: Role attached to the single message that is sent.

    Returns:
        Whatever ``chat`` returns for the request.
    """
    prompt = (
        "Please provide a summary of the following string. "
        f"The summary should be concise and informative: {content}. "
    )
    return chat(model=model, messages=[{"role": role, "content": prompt}])

In [23]:
# | export
# Non-whitespace characters followed by a recognised image file extension.
image_link_pattern = r'[^\s]+\.(?:jpg|jpeg|png|gif|bmp|webp)'

def get_image_summary_response(image_link: str, model:str="gemma3:27b", role:str="user") -> ChatResponse:
    """Ask the chat model for a concise summary of an image.

    Args:
        image_link: Path of the image. It must consist of non-whitespace
            characters and end in one of the extensions listed in
            ``image_link_pattern`` (checked case-insensitively).
        model: Model tag passed to ``chat``.
        role: Role attached to the single message that is sent.

    Returns:
        Whatever ``chat`` returns for the request.

    Raises:
        ValueError: If ``image_link`` does not look like an image file name.
    """
    # fullmatch, not match: match only anchors the start, so names such as
    # "report.png.exe" or "picture.pngx" used to slip through. IGNORECASE
    # additionally accepts upper-case extensions such as ".PNG".
    if not re.fullmatch(image_link_pattern, image_link, flags=re.IGNORECASE):
        raise ValueError(f"Invalid image link: {image_link}")
    return chat(
        model=model,
        messages=[
            {
                "role": role,
                "content": "Please provide a summary of the following image. "
                           "The summary should be concise and informative.",
                "images": [image_link],
            }
        ],
    )


In [24]:
# Sanity-check `image_link_pattern` against a sample relative image path.
# os.getcwd()
# NOTE(review): "siasun_md_sammple" looks like a typo for "siasun_md_sample"
# (the spelling used by the other cells). It is harmless here because the
# string is only regex-matched, never opened.
image_link = "../res/siasun_md_sammple/SN024002/img/__3.png"
# image_link
res = re.match(image_link_pattern, image_link)
# Displays the matched text; raises AttributeError if the pattern did not match.
res.group(0)
# get_image_summary_response("../res/")

'../res/siasun_md_sammple/SN024002/img/__3.png'

In [33]:
# Absolute path of a sample image, resolved from the parent of the current
# working directory. Rebinds `image_link` for the cells that follow.
# Alternative sample: 'res/siasun_md_sample/SN024002/img/__3.png'
imagepath = Path.cwd().parent.joinpath('res/siasun_md_sample/SN024002/img/img_1.png')
image_link = os.fspath(imagepath)
image_link

'/d/devel/rag/ribosome/res/siasun_md_sample/SN024002/img/img_1.png'

In [26]:
# Check the current `image_link` against the image pattern and report the result.
# Negative example to try instead: re.match(image_link_pattern, 'img/small.avi')
res = re.match(image_link_pattern, image_link)
# A successful match prints the matched text; a failed match prints None.
print(res if res is None else f"Matched image link: {res.group(0)}")

Matched image link: /d/devel/rag/ribosome/res/siasun_md_sample/SN024002/img/__3.png


In [None]:
# Ask the model to summarise the sample image selected above and show the reply text.
# NOTE(review): presumably needs a reachable Ollama server with this model available; confirm.
response = get_image_summary_response(image_link, model="gemma3:27b", role="user")
# print(response.content)
pprint(response.message.content)

## Markdown Class with pypandoc Integration

In [None]:
# | export
class Markdown(BaseModel):
    """A Markdown document plus the pandoc JSON derived from it.

    Build one with ``Markdown("# Title")`` / ``Markdown(content=...)`` or with
    :meth:`from_file`. All format conversions are delegated to pypandoc.
    """

    # The content of the Markdown document. This can be a string containing Markdown syntax.
    raw_markdown: Optional[str] = Field(None, description="Raw Markdown content")
    # Pandoc's JSON output for ``raw_markdown``, computed at construction time.
    raw_json: Optional[str] = Field(None, description="Raw JSON content")
    # JSON of the AST produced by the most recent walk()/summary() call.
    ast_json: Optional[str] = Field(None, description="JSON representation of the Markdown AST")

    def __init__(self, content: Optional[str] = None, **data):
        """Initialise the document from Markdown text and/or field values.

        Args:
            content: Markdown source. When non-empty it becomes
                ``raw_markdown`` and ``raw_json`` is regenerated from it.
            **data: Regular field values (``raw_markdown``, ``raw_json``,
                ``ast_json``), e.g. from ``model_dump()``.
        """
        super().__init__(**data)
        if content:
            self.raw_markdown = content
            self.raw_json = pypandoc.convert_text(content, "json", "md")
        elif self.raw_markdown and self.raw_json is None:
            # Field values passed through **data used to be overwritten with
            # None here, which broke Markdown(**md.model_dump()). Keep them,
            # and derive the JSON when only the Markdown text was supplied.
            self.raw_json = pypandoc.convert_text(self.raw_markdown, "json", "md")

    def _convert(self, target_format: str) -> str | None:
        """Convert ``raw_markdown`` to ``target_format``; None when there is no content."""
        if not self.raw_markdown:
            return None
        return pypandoc.convert_text(self.raw_markdown, target_format, "md")

    def _transform_ast(self, action: Optional[Callable[[Any], Any]]) -> None:
        """Apply ``action`` over the parsed ``raw_json`` and store the result in ``ast_json``.

        Shared implementation behind :meth:`walk` and :meth:`summary`.
        """
        if not self.raw_markdown:
            raise ValueError("Markdown content is empty. Cannot walk the AST.")
        if action is None:
            action = self.__class__.identity

        def visit_children(items):
            # Scalars are copied through untouched; only containers are visited.
            return [
                visit(child) if isinstance(child, (dict, list)) else child
                for child in items
            ]

        def visit(node):
            # The action runs first, so a replacement node's children are the
            # ones that get traversed.
            node = action(node)
            if isinstance(node, dict):
                # Only existing keys are reassigned, so the dict's size never
                # changes while it is being iterated.
                for key, value in node.items():
                    if isinstance(value, list):
                        node[key] = visit_children(value)
                    elif isinstance(value, dict):
                        node[key] = visit(value)
            elif isinstance(node, list):
                node = visit_children(node)
            return node

        transformed = visit(json.loads(self.raw_json))
        self.ast_json = json.dumps(transformed, ensure_ascii=False)

    def to_markdown(self) -> str | None:
        """Return the raw Markdown text (None when the document is empty)."""
        return self.raw_markdown

    def to_html(self) -> str | None:
        """Convert the Markdown to HTML with pypandoc; None when there is no content."""
        return self._convert("html")

    def to_latex(self) -> str | None:
        """Convert the Markdown to LaTeX with pypandoc; None when there is no content."""
        return self._convert("latex")

    def to_json(self) -> str | None:
        """Return the stored pandoc JSON for the document.

        No conversion happens here; the value was produced by pypandoc when
        the object was constructed.
        """
        return self.raw_json

    def walk(self, action: Optional[Callable[[Any], Any]] = None) -> None:
        """Walk the Markdown AST, apply ``action`` to its nodes and store the result.

        ``action`` receives a node and must return the node to keep in its
        place. It is called on the root, on every dict, and on every list that
        is an element of another list. A list stored directly under a dict key
        is traversed but not itself passed to ``action``, and scalar values
        are never passed. The transformed tree is serialised into ``ast_json``;
        ``raw_json`` is left untouched.

        Args:
            action: Node transformer. Defaults to :meth:`identity`.

        Raises:
            ValueError: If the document has no Markdown content.
        """
        self._transform_ast(action)

    def summary(self, action: Optional[Callable[[Any], Any]] = None) -> None:
        """Apply ``action`` over the AST exactly as :meth:`walk` does.

        This currently performs the same traversal as :meth:`walk` and also
        writes its result to ``ast_json``; it is kept as a separate entry
        point for existing callers.

        Args:
            action: Node transformer. Defaults to :meth:`identity`.

        Raises:
            ValueError: If the document has no Markdown content.
        """
        self._transform_ast(action)

    @classmethod
    def from_file(cls, filepath: pathlib.Path):
        """Create a document from a UTF-8 encoded Markdown file.

        Args:
            filepath: Location of the file; a ``str`` path is accepted too.

        Returns:
            A new instance of the class the method was called on.
        """
        content = pathlib.Path(filepath).read_text(encoding="utf-8")
        return cls(content=content)

    @classmethod
    def identity(cls, obj):
        """Identity function for use in walk."""
        return obj

In [12]:
# create test case for Markdown class with identity function
import os
import shutil
from pathlib import Path
# change directory to the script's directory
cwd = Path(os.getcwd()).parent / "res/siasun_md_sample"
os.chdir(cwd)
cwd = os.getcwd()
os.listdir(cwd)

# os.chdir(Path(cwd).parent / "res/SR02401")


['test.html',
 'SR024011',
 'SN024002',
 'test.md',
 'test.rst',
 'test_raw.json',
 'test.json',
 'test_ast.json']

In [13]:
# Round-trip a small document: write the Markdown, its reStructuredText
# conversion, and the pandoc JSON (raw, and after an identity walk) into `cwd`.
md = Markdown(content="# Test\n\nThis is a test.\n\nThis is **bold**.")
# save the markdown content to a file
md_file = Path(cwd) / "test.md"
md_file.write_text(md.raw_markdown, encoding="utf-8")
# Convert the file that was just written by its full path rather than the bare
# name 'test.md', so the result does not depend on the process working directory.
rst = pypandoc.convert_file(str(md_file), "rst", format="md")
rst_file = Path(cwd) / "test.rst"
rst_file.write_text(rst, encoding="utf-8")

js = md.to_json()
js_file = Path(cwd) / "test.json"
js_file.write_text(js, encoding="utf-8")
# walk() with no action applies the identity action and fills md.ast_json.
md.walk()
# md.ast_json
(Path(cwd) / "test_ast.json").write_text(md.ast_json, encoding="utf-8")
(Path(cwd) / "test_raw.json").write_text(md.raw_json, encoding="utf-8")

42

46

420

487

420

In [14]:
# Show the directory currently stored in `cwd`.
cwd

'/d/devel/rag/ribosome/res/siasun_md_sample'

In [24]:
# Load a full sample document from the sample-data directory.
# This rebinds `md` and `md_file` from the earlier round-trip cell.
# md_file = Path(cwd) / "SN024002/SN024002《新松SN7B-7-0.90规格参数》A-1.md"
md_file = Path(cwd) / "SX322002/SX322002《新松机器人码垛应用操作手册》-V1.0.md"
md = Markdown.from_file(md_file)

In [25]:
# Run the default (identity) walk, then save the walked AST and the raw pandoc
# JSON under the same SX322002 folder as the source document.
md.walk()
(Path(cwd) / "SX322002/SX322002《新松机器人码垛应用操作手册》-V1.0_ast.json").write_text(md.ast_json, encoding="utf-8")
(Path(cwd) / "SX322002/SX322002《新松机器人码垛应用操作手册》-V1.0_raw.json").write_text(md.raw_json, encoding="utf-8")

43753

37755

In [None]:

def test_markdown_walk_identity():
    """Walking with the identity action must leave the document unchanged."""
    # Build a private document so the test does not depend on whichever `md`
    # an earlier cell happened to leave behind.
    source = "# Test\n\nThis is a test."
    doc = Markdown(content=source)

    # The identity action lives on the class; there is no module-level `identity`.
    doc.walk(Markdown.identity)

    # The text is exposed as `raw_markdown` (the model has no `content` field).
    assert doc.raw_markdown == source
    # An identity walk must reproduce pandoc's AST exactly.
    assert json.loads(doc.ast_json) == json.loads(doc.raw_json)

test_markdown_walk_identity()

## Section Class: Recursive Document Structure

In [9]:
# | export
class Section(BaseModel):
    """A recursive document section.

    A section holds its own paragraphs, figures and tables plus any number of
    nested subsections, each of which is itself a ``Section``.
    """

    # Optional summary of the section.
    summary: Optional[str] = None
    # Paragraph texts of the section.
    paragraphs: List[str] = Field(default_factory=list)
    # Figures attached to the section.
    figures: List[Figure] = Field(default_factory=list)
    # Tables attached to the section.
    tables: List[Table] = Field(default_factory=list)
    # Nested sections; the forward reference is resolved by model_rebuild() below.
    subsections: List["Section"] = Field(default_factory=list)

    def __init__(
        self,
        summary: Optional[str] = None,
        paragraphs: Optional[List[str]] = None,
        figures: Optional[List[Figure]] = None,
        tables: Optional[List[Table]] = None,
        subsections: Optional[List["Section | dict"]] = None,
    ):
        """Create a section; every argument is optional.

        Args:
            summary: Optional summary text.
            paragraphs: Paragraph strings; None means no paragraphs.
            figures: Figures; None means no figures.
            tables: Tables; None means no tables.
            subsections: Nested sections, given either as ``Section``
                instances or as dicts of ``Section`` constructor arguments.
        """
        # Recursively initialize subsections if provided as dicts
        subs = [
            Section(**s) if isinstance(s, dict) else s
            for s in (subsections or [])
        ]
        super().__init__(
            summary=summary,
            paragraphs=paragraphs or [],
            figures=figures or [],
            tables=tables or [],
            subsections=subs,
        )

    @classmethod
    def init(cls, md: Markdown):
        """Placeholder for building a section from a Markdown document.

        Currently ignores ``md`` and returns an empty section.
        """
        # Placeholder for initialization from Markdown
        return cls()

    @classmethod
    def update_forward_refs(cls, **localns):
        """Legacy-named entry point that rebuilds the model.

        This used to be an empty override, so calling it silently did nothing.
        It now delegates to ``model_rebuild``; any names given in ``localns``
        are offered as extra names for resolving forward references.
        """
        cls.model_rebuild(force=True, _types_namespace=localns or None)


# Support for recursive Section references
Section.model_rebuild()