--
description: DOM Model with Pydantic and Pandoc Integration
output-file: core.dom.html
title: core.dom

This notebook demonstrates a Document Object Model (DOM) using Pydantic for static typing and validation, and integrates Pandoc (via pypandoc) for Markdown processing.
---

In [1]:
# | default_exp dom

In [2]:
# | hide
from nbdev.showdoc import *
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

In [3]:
# | export
from typing import List, Optional
from pydantic import BaseModel, Field, field_validator
import base64
import pypandoc
import pathlib

## Base Element Class

In [4]:
# | export
class Element(BaseModel):
    summary: Optional[str] = None

## Figure Class with Base64 Validation

In [5]:
# | export
class Figure(Element):
    rawdata: str = Field(..., description="Base64-encoded image data")

    @field_validator("rawdata")
    def validate_base64(cls, v):
        try:
            base64.b64decode(v)
        except Exception:
            raise ValueError("rawdata must be valid base64")
        return v

## Table Structure: Cell, Column, Row, Table

In [6]:
# | export
class Cell(BaseModel):
    c: str


class Column(BaseModel):
    cells: List[Cell]


class Row(BaseModel):
    cols: List[Column]


class Table(Element):
    rows: List[Row]

## Markdown Class with pypandoc Integration

In [15]:
# | export
class Markdown(BaseModel):
    content: str = ""

    def to_markdown(self) -> str:
        return self.content

    def to_html(self) -> str:
        return pypandoc.convert_text(self.content, "html", "md")

    def to_latex(self) -> str:
        return pypandoc.convert_text(self.content, "latex", "md")

    def to_json(self) -> str:
        ast_json = pypandoc.convert_text(self.content, "json","md")
        return ast_json
    
    def walk(self, action):
        import json

        ast_json = pypandoc.convert_text(self.content, "json", "md")
        ast = json.loads(ast_json)

        def walk_node(node):
            node = action(node)
            if isinstance(node, dict):
                for key, value in node.items():
                    if isinstance(value, list):
                        node[key] = [
                            walk_node(child)
                            if isinstance(child, (dict, list))
                            else child
                            for child in value
                        ]
                    elif isinstance(value, dict):
                        node[key] = walk_node(value)
            elif isinstance(node, list):
                node = [
                    walk_node(child) if isinstance(child, (dict, list)) else child
                    for child in node
                ]
            return node

        ast = walk_node(ast)
        new_json = json.dumps(ast)
        self.content = pypandoc.convert_text(new_json, "json", "md")

    @staticmethod
    def from_file(filepath: pathlib.Path):
        content = filepath.read_text(encoding="utf-8")
        return Markdown(content=content)

In [16]:
#| export
def identity(obj):
    """Identity function for use in walk."""
    return obj

In [9]:
# create test case for Markdown class with identity function
import os
import shutil
from pathlib import Path
# change directory to the script's directory
cwd = Path(os.getcwd()).parent / "res/SR024011"
os.chdir(cwd)
cwd = os.getcwd()
os.listdir(cwd)

# os.chdir(Path(cwd).parent / "res/SR02401")


['test.md', 'test.rst', 'SR024011.md', 'img']

In [17]:
md = Markdown(content="# Test\n\nThis is a test.")
# save the markdown content to a file
md_file = Path(cwd) / "test.md"
md_file.write_text(md.content, encoding="utf-8")
rst = pypandoc.convert_file('test.md', "rst", format="md")
rst_file = Path(cwd) / "test.rst"
rst_file.write_text(rst, encoding="utf-8")

23

27

In [18]:
js = md.to_json()
js_file = Path(cwd) / "test.json"
js_file.write_text(js, encoding="utf-8")

265

In [14]:
md.walk(identity)

RuntimeError: Pandoc died with exitcode "64" during conversion: JSON parse error: Error in $: Failed reading: not a valid json value at '#Test'


In [None]:

def test_markdown_walk_identity():
    md.walk(identity)
    print(md.content)  # Should print the original content
    # assert md.content == "# Test\n\nThis is a test."

test_markdown_walk_identity()

## Section Class: Recursive Document Structure

In [9]:
# | export
class Section(BaseModel):
    summary: Optional[str] = None
    paragraphs: List[str] = Field(default_factory=list)
    figures: List[Figure] = Field(default_factory=list)
    tables: List[Table] = Field(default_factory=list)
    subsections: List["Section"] = Field(default_factory=list)

    def __init__(
        self,
        summary: Optional[str] = None,
        paragraphs: Optional[List[str]] = None,
        figures: Optional[List[Figure]] = None,
        tables: Optional[List[Table]] = None,
        subsections: Optional[List[dict]] = None,
    ):
        # Recursively initialize subsections if provided as dicts
        if subsections is not None:
            subs = [Section(**s) if isinstance(s, dict) else s for s in subsections]
        else:
            subs = []
        super().__init__(
            summary=summary,
            paragraphs=paragraphs or [],
            figures=figures or [],
            tables=tables or [],
            subsections=subs,
        )

    @classmethod
    def init(cls, md: Markdown):
        # Placeholder for initialization from Markdown
        return cls()

    @classmethod
    def update_forward_refs(cls, **localns):
        ...
        #BaseModel.model_rebuild()


# Support for recursive Section references
Section.model_rebuild()