# core.dom

>DOM Model with Pydantic and Pandoc Integration
output-file: core.dom.html
title: core.dom

This notebook demonstrates a Document Object Model (DOM) using Pydantic for static typing and validation, and integrates Pandoc (via pypandoc) for Markdown processing.
---

In [26]:
# | default_exp dom

In [27]:
# | hide
from nbdev.showdoc import *
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import Markdown, display

# Notebook-only display setting: show the value of every top-level expression
# in a cell rather than only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [28]:
#| export
from fastcore.test import *

In [29]:
# | export
from typing import List, Optional, ClassVar
from pydantic import BaseModel, Field, field_validator
from functools import cached_property
import base64
import pypandoc
import pathlib
import asyncio
import tqdm
import json
import jsoncfg
import re
import os
from pathlib import Path
from pprint import pprint
import markdown
import asyncio 
import tqdm
import hashlib
from functools import cached_property
import chromadb
from chromadb.config import Settings
from chromadb import PersistentClient
from chromadb.api import AsyncClientAPI, ClientAPI
from chromadb.api.models.Collection import Collection
from enum import Enum
import warnings

In [30]:
#| export
from ollama import chat, ChatResponse, Client, AsyncClient

## Base Element Class

In [31]:
# | export
class Element(BaseModel):
    """
    Common base model for document elements.

    The only field declared here is an optional free-text ``summary``;
    subclasses add their own payload fields.
    """
    summary: str | None = None

## Figure Class with Base64 Validation

In [32]:
# | export
class Figure(Element):
    """
    Represents a figure element whose image payload is carried as base64 text.
    """
    rawdata: str = Field(..., description="Base64-encoded image data")

    @field_validator("rawdata")
    @classmethod
    def validate_base64(cls, v):
        """
        Check that ``rawdata`` is well-formed standard base64.

        Whitespace (for example line-wrapped payloads) is ignored for the
        check; any other character outside the base64 alphabet, or wrong
        padding, is rejected.  The value itself is returned unchanged.

        Raises:
            ValueError: if the payload is not valid base64.
        """
        # Without validate=True, b64decode silently drops characters outside
        # the alphabet, so garbage such as "!!!!" would be accepted.
        compact = "".join(v.split())
        try:
            base64.b64decode(compact, validate=True)
        except (ValueError, TypeError) as err:  # binascii.Error is a ValueError
            raise ValueError("rawdata must be valid base64") from err
        return v

## Table Structure: Cell, Column, Row, Table

In [33]:
# | export
class Cell(BaseModel):
    """
    A single table cell; ``c`` holds the cell content as a string.
    """
    c: str


class Column(BaseModel):
    """
    A table column, modelled as an ordered list of ``Cell`` objects.
    """
    cells: list[Cell]


class Row(BaseModel):
    """
    A table row, modelled as an ordered list of ``Column`` objects.
    """
    cols: list[Column]


class Table(Element):
    """
    A table element: an ``Element`` carrying an ordered list of ``Row`` objects.
    """
    rows: list[Row]

In [34]:
# | export
def get_text_summary_response(content: str, model:str="gemma3-27b", role:str="user", lang: str="zh") -> ChatResponse:
    """
    Returns a ChatResponse from the chat model with a summary prompt for the given content.
    """
    match lang:
        case "en":
            prompt = (
                f"Please provide a summary of the following string. "
                f"The summary should be concise and informative: {content}. "
            )
        case "zh":
            content = re.sub(r"\s+", " ", content.strip())
            prompt = (
                f"请提供以下字符串的摘要。"
                f"摘要应简明扼要且信息丰富: {content}. "
            )
        case _:
            raise ValueError(f"Unsupported language: {lang}")
    return chat(
        model=model,
        messages=[
            {
                "role": role,
                "content": prompt,
            }
        ],
    )

In [35]:
# | export
# A run of non-whitespace characters followed by a known image extension.
# It is applied with re.match, so only the start of the string is anchored.
image_link_pattern = r'[^\s]+\.(?:jpg|jpeg|png|gif|bmp|webp)'

def get_image_summary_response(image_link: str | Path, model:str="gemma3:27b", role:str="user", lang: str='zh') -> ChatResponse:
    """
    Ask the chat model to summarise an image and return its response.

    ``image_link`` may be a string or a ``Path``; it is rejected with
    ``ValueError`` when it does not match ``image_link_pattern``.  ``lang``
    selects the English (``'en'``) or Chinese (``'zh'``) prompt; any other
    value raises ``ValueError``.
    """
    link = str(image_link) if isinstance(image_link, Path) else image_link
    if re.match(image_link_pattern, link) is None:
        # The link does not look like a path/URL ending in an image extension.
        raise ValueError(f"Invalid image link: {link}")

    if lang == 'en':
        prompt = "Please provide a summary of the following image. The summary should be concise and informative about the robot."
    elif lang == 'zh':
        prompt = "请提供以下图像的摘要。关于机器人机械尺寸,运动范围,自由度的说明应简明扼要。"
    else:
        raise ValueError(f"Unsupported language: {lang}")

    message = {
        "role": role,
        "content": prompt,
        'images': [f"{link}"],
    }
    return chat(model=model, messages=[message])



In [36]:
# | export
# Re-declared in this cell with the same value as the earlier declaration.
image_link_pattern = r'[^\s]+\.(?:jpg|jpeg|png|gif|bmp|webp)'

async def get_image_summary_response_async(client: AsyncClient, image_link: str | Path, model:str="gemma3:27b", role:str="user", lang: str='zh') -> ChatResponse:
    """
    Ask the chat model, through ``client``, to summarise an image.

    ``image_link`` may be a string or a ``Path``.  Unlike the synchronous
    variant, the link is not checked against ``image_link_pattern`` here.
    ``lang`` selects the English (``'en'``) or Chinese (``'zh'``) prompt; any
    other value raises ``ValueError``.
    """
    if isinstance(image_link, Path):
        image_link = str(image_link)

    if lang == 'en':
        prompt = "Please provide a summary of the following image. The summary should be concise and informative about the robot."
    elif lang == 'zh':
        prompt = "请提供以下图像的摘要。关于机器人机械尺寸,运动范围,自由度的说明应简明扼要。"
    else:
        raise ValueError(f"Unsupported language: {lang}")

    message = {
        "role": role,
        "content": prompt,
        'images': [f"{image_link}"],
    }
    return await client.chat(model=model, messages=[message])



In [37]:
# | export
async def get_text_summary_response_async(client: AsyncClient, content: str, model:str="gemma3-27b", role:str="user", lang: str="zh") -> ChatResponse:
    """
    Ask the chat model, through ``client``, for a summary of ``content``.

    Runs of whitespace in ``content`` are collapsed to single spaces before
    the text is placed in the prompt.  ``lang`` selects the English (``"en"``)
    or Chinese (``"zh"``) prompt; any other value raises ``ValueError``.
    """
    # NOTE(review): the default model is spelled "gemma3-27b" here but
    # "gemma3:27b" in the image helpers - confirm which tag exists on the server.
    flattened = re.sub(r"\s+", " ", content.strip())

    if lang == "en":
        prompt = (
            "Please provide a summary of the following string. "
            f"The summary should be concise and informative: {flattened}. "
        )
    elif lang == "zh":
        prompt = (
            "请提供以下字符串的摘要。"
            f"摘要应简明扼要且信息丰富: {flattened}. "
        )
    else:
        raise ValueError(f"Unsupported language: {lang}")

    return await client.chat(
        model=model,
        messages=[{"role": role, "content": prompt}],
    )

In [38]:
#| hide

# os.getcwd()
# Scratch check: the pattern is applied to the text of a relative path only;
# nothing is read from disk.
# NOTE(review): the directory is spelled "sammple" here but "sample" in the
# next cell - confirm which one is intended.
image_link = "../res/siasun_md_sammple_hrsl/SN024002/img/__2.png"
# image_link
res = re.match(image_link_pattern, image_link)
# Show the matched text.
res.group(0)
# get_image_summary_response("../res/")

'../res/siasun_md_sammple_hrsl/SN024002/img/__2.png'

In [39]:
# Build the path of a sample image relative to the parent of the current
# working directory, then render it inline as a Markdown image.
imagepath = Path(os.getcwd()).parent / 'res/siasun_md_sample_hrsl/SN024002/img/img_13.png'
# imagepath = Path(os.getcwd()).parent / 'res/siasun_md_sample/SN024002/img/img_13.png'
# imagepath = Path(os.getcwd()).parent / 'siasun_md_sample_hrsl/SN024002/img/img_11.png'
# imagepath = Path(os.getcwd()).parent / 'res/siasun_md_sample_hrsl/SX322002/img/img_13.png'
image_link = str(imagepath)
image_link
display(Markdown(f"![image]({image_link})"))

'/d/devel/rag/ribosome/res/siasun_md_sample_hrsl/SN024002/img/img_13.png'

![image](/d/devel/rag/ribosome/res/siasun_md_sample_hrsl/SN024002/img/img_13.png)

In [40]:
# res = re.match(image_link_pattern, 'http://baidu.com/?home/img/small.png')
# Scratch check of the pattern against a short relative path.
res = re.match(image_link_pattern, 'img/small.png')
# res = re.match(image_link_pattern, image_link)
# On a match print the matched text; otherwise print the None result.
if res:
    print(f"Matched image link: {res.group(0)}")
else:
    print(res)

Matched image link: img/small.png


In [41]:
#| export
# Module-level async chat client, queues and lock.
chat_client: AsyncClient = AsyncClient()  
msg_queue: asyncio.Queue = asyncio.Queue(maxsize=8)  # Holds at most 8 pending messages
resp_queue: asyncio.Queue = asyncio.Queue(maxsize=1)  # Holds at most 1 response
lock = asyncio.Lock()  # Serialises coroutines on one event loop; asyncio.Lock is not thread-safe

In [42]:
# Disabled manual check (the `if False:` guard keeps it from running).
# It uses top-level `await`, which works in a notebook cell but not in a plain module.
if False: 
    # response = await get_image_summary_response_async(image_link, model="gemma3:27b", role="user", lang='zh')
    response = await get_image_summary_response_async(chat_client, image_link, model="gemma3:27b", role="user", lang='zh')
    assert isinstance(response.message.content, str), "Response content should be a string"
    # Convert the Markdown reply to HTML and display it.
    md_text = markdown.markdown(response.message.content)
    Markdown(md_text)
    # print(md_text)
    # print(response.content)
    # response_txt = await get_text_summary_response_async(md_text,model="gemma3:27b", role="user", lang='zh')
    # md_text = markdown.markdown(response_txt.message.content)
    # Markdown(md_text)


In [43]:
# Disabled manual check; the body is the same as in the previous cell.
# It uses top-level `await`, which works in a notebook cell but not in a plain module.
if False: 
    # response = await get_image_summary_response_async(image_link, model="gemma3:27b", role="user", lang='zh')
    response = await get_image_summary_response_async(chat_client, image_link, model="gemma3:27b", role="user", lang='zh')
    assert isinstance(response.message.content, str), "Response content should be a string"
    # Convert the Markdown reply to HTML and display it.
    md_text = markdown.markdown(response.message.content)
    Markdown(md_text)
    # print(md_text)
    # print(response.content)
    # response_txt = await get_text_summary_response_async(md_text,model="gemma3:27b", role="user", lang='zh')
    # md_text = markdown.markdown(response_txt.message.content)
    # Markdown(md_text)


In [44]:
#| export
# Module-level async chat client, queues and lock.
# NOTE(review): an earlier cell declares the same four names; when both run,
# these assignments replace the earlier objects.
chat_client: AsyncClient = AsyncClient()  
msg_queue: asyncio.Queue = asyncio.Queue(maxsize=8)  # Holds at most 8 pending messages
resp_queue: asyncio.Queue = asyncio.Queue(maxsize=1)  # Holds at most 1 response
lock = asyncio.Lock()  # Serialises coroutines on one event loop; asyncio.Lock is not thread-safe

## DOM Class with pypandoc Integration

In [45]:
#| export
# Stages an analysis run can be in; members are numbered from 1 in the order listed.
ResponseStatus = Enum("ResponseStatus", ["PENDING", "PROCESSING", "REORGNIZED", "SEMANTICIZED", "CANCELLED", "COMPLETED", "ERROR"])
class AnalysisStatus:
    """
    Mutable record of where an analysis run stands.

    This is a plain class, so the defaults below are ordinary values.
    (Pydantic ``Field(...)`` objects only act as defaults inside a
    ``BaseModel``; used here they made ``AnalysisStatus().status`` a field
    descriptor object instead of ``ResponseStatus.PENDING``.)
    """
    # Current status of the analysis process.
    status: ResponseStatus = ResponseStatus.PENDING
    # Exception message if any error occurs during analysis.
    exception: str = ""

In [None]:
# | export
from platform import node
from typing import Iterator, Callable, Optional, ClassVar
from queue import LifoQueue
import asyncio
import re
import copy
import jsoncfg
from jsoncfg.config_classes import ConfigJSONArray, ConfigJSONObject, ConfigJSONScalar, ConfigNode

class DOM(BaseModel):
    """
    A Markdown document together with the JSON views derived from it.

    The instance keeps the raw Markdown, the JSON produced from it by pandoc,
    and the reorganised / semantic / embedded JSON variants (both as strings
    and as the paths of the files they are cached in), plus the chat and
    database clients used while processing.
    """
    # AsyncClient / ClientAPI / AnalysisStatus are not pydantic types.
    model_config = {"arbitrary_types_allowed": True}
    
    # The content of the Markdown document. This can be a string containing Markdown syntax.
    raw_markdown: Optional[str] = Field(None, description="Raw Markdown content")
    # raw json
    raw_json: Optional[str] = Field(None, description="Raw JSON content")
    # The json representation of the Markdown AST.
    ast_json: Optional[str] = Field(None, description="JSON representation of the Markdown AST")
    ast_json_file: Optional[Path] = Field(None, description="Path to the JSON file of the Markdown AST")
    semantics_json: Optional[str] = Field(None, description="Semantics JSON representation of the Markdown AST")
    semantics_json_file: Optional[Path] = Field(None, description="Path to the JSON file of the Markdown AST semantics")
    embed_json: Optional[str] = Field(None, description="Semantics JSON representation with embeddings and meta information of the Markdown AST")
    embed_json_file: Optional[Path] = Field(None, description="Path to the JSON file of the Markdown AST embeddings")
    file_path: Optional[Path] = Field(None, description="Path to the Markdown file")
    root_path: Path = Field(default_factory=Path, description="root path of the markdown document, required to get access to the images")
    table_count: int = Field(0, description="Number of tables in the Markdown document")
    section_count: int = Field(0, description="Number of headers in the Markdown document")
    section_level: list = Field(default_factory=list, description="List of section levels in the Markdown document")
    title:str = Field('', description="Title of the Markdown document, if available")
    llm_client: AsyncClient = Field(default_factory=AsyncClient, description="AsyncClient instance for chat model interactions")
    db_client: ClientAPI = Field(default_factory=PersistentClient, description="PersistentClient instance for database interactions")
    analysis_status: AnalysisStatus = Field(default_factory=AnalysisStatus, description="Analysis status of the Markdown document")


    # Use ClassVar to indicate these are class variables, not instance fields
    TextBlock_Types: ClassVar[set[str]] = {
        "Plain",
        "Para",
        "Figure",
        "LineBlock","CodeBlock","RawBlock","OrderedList","BulletList","DefinitionList",
        "Header","BlockQuote",
        "Table","TableRow", "TableCell"}

    NonTextBlock_Types: ClassVar[set[str]] = {"HorizontalRule", "Div", "Null"}

    # Node types whose summaries get embedded.
    Embed_Types: ClassVar[set[str]] = {"Section", "Image", "Table"}

    # All block-level types: text and non-text blocks together.
    # (Previously this united TextBlock_Types with itself, so the non-text
    # block types were missing here and from Element_Types.)
    Block_Types: ClassVar[set[str]] = TextBlock_Types | NonTextBlock_Types

    Inline_Types: ClassVar[set[str]] = {
        "Str", "Emph", "Strong", "Strikeout", "Superscript", "Subscript",
        "Decimal", "Period",
        "Link", 
        "Image", "Code", "Math", "RawInline", "SoftBreak", "HardBreak", "Span"   
    }
    
    Element_Types: ClassVar[set[str]] = Block_Types | Inline_Types
    leaf_min_len: ClassVar[int] = 100  # Minimum length of text to consider for summarization

    
    def __init__(self, md_file_path: Path, llm_client: Optional[AsyncClient] = None, db_client: Optional[ClientAPI] = None, **data):
        """Create the model for the Markdown file at ``md_file_path``.

        Field values not supplied through ``data`` are derived from the path
        (``file_path``, ``root_path`` = its parent, ``title`` = its stem) or
        filled with fresh clients when none are passed.  The file itself is
        not read here.
        """
        source = Path(md_file_path)
        fallbacks = {
            'file_path': source,
            'root_path': source.parent,
            'title': source.stem,
            'llm_client': llm_client or AsyncClient(),
            'db_client': db_client or PersistentClient(),
            'ast_json_file': None,
            'embed_json_file': None,
            'semantics_json_file': None,
        }
        # Explicit keyword data wins over the derived defaults.
        for field_name, fallback in fallbacks.items():
            data.setdefault(field_name, fallback)

        super().__init__(**data)
        

    def setup(self):
        """Read the Markdown file and populate the JSON views.

        The file content is converted to pandoc JSON.  The reorganised AST is
        read from ``<stem>_ast.json`` next to the file when that exists,
        otherwise it is built by ``reorg_slides`` (when slide markers are
        found) or ``reorg``.  ``<stem>_semantics.json`` and
        ``<stem>_embed.json`` are loaded when present.  For an empty file the
        file paths and the raw/AST views are reset to ``None``.
        """
        content = self.file_path.read_text(encoding="utf-8")  # type: ignore
        if not content:
            self.ast_json_file = None
            self.semantics_json_file = None
            self.embed_json_file = None
            self.raw_markdown = None
            self.raw_json = None
            self.ast_json = None
            return

        self.raw_markdown = content
        self.raw_json = pypandoc.convert_text(self.raw_markdown, "json", "md")

        self.ast_json_file = self.file_path.parent / (str(self.file_path.stem) + "_ast.json")  # type: ignore
        if self.ast_json_file.exists():
            self.ast_json = self.ast_json_file.read_text(encoding="utf-8")  # type: ignore
        else:
            # Marker lines of the form "<!-- Slide number: N -->" indicate a slide deck.
            slide_splitter = r"(^<!--\s*Slide number:\s*\d+\s*-->$)"
            has_slides = re.search(slide_splitter, self.raw_markdown, flags=(re.MULTILINE|re.IGNORECASE))  # type: ignore
            if has_slides:
                self.ast_json = self.reorg_slides(slide_splitter=slide_splitter)
            else:
                self.ast_json = self.reorg()

        self.semantics_json_file = self.file_path.parent / (str(self.file_path.stem) + "_semantics.json")  # type: ignore
        if self.semantics_json_file.exists():
            self.semantics_json = self.semantics_json_file.read_text(encoding="utf-8")  # type: ignore

        self.embed_json_file = self.file_path.parent / (str(self.file_path.stem) + "_embed.json")  # type: ignore
        if self.embed_json_file.exists():
            self.embed_json = self.embed_json_file.read_text(encoding="utf-8")  # type: ignore

    def reorg_slides(self, slide_splitter: str = r"(^<!--\s*Slide number:\s*\d+\s*-->$)") -> str:
        """Rebuild the document as one Section per slide.

        ``raw_markdown`` is cut at every match of ``slide_splitter``.  The text
        following a marker is converted with pandoc and placed in the Content
        of a level-1 Section whose header text is the marker itself.  Text
        that precedes the first marker is filed under a synthetic
        ``<!-- Slide number: 0 -->`` section.  Chunks that contain only
        whitespace produce no section.

        Args:
            slide_splitter: Regular expression matching a slide marker line.
                It is applied with MULTILINE and IGNORECASE, matching how
                ``setup`` detects markers.

        Returns:
            The JSON text of the pandoc document whose ``blocks`` are the
            slide sections.

        Raises:
            AssertionError: if ``raw_json`` or ``raw_markdown`` is empty.
        """
        # Both inputs are used below, so both must be present.
        assert self.raw_json and self.raw_markdown, "raw_json/raw_markdown content is empty. Cannot reorganize slides."

        def new_slide_section(label: str) -> dict:
            """Return a fresh, empty level-1 Section whose header text is *label*."""
            return {
                't': 'Section',
                'c': [
                    {
                        't': 'Header',
                        'c': [
                            1,  # Header level
                            [label, [], []],  # The header format list [id, format1, format2]
                            [{'t': 'Str', 'c': label}],  # The header content list
                        ]
                    },
                    {
                        't': 'Content',
                        'c': []
                    }
                ]
            }

        markdown_text = self.raw_markdown
        marker_re = re.compile(slide_splitter, flags=re.MULTILINE | re.IGNORECASE)

        # Pair every chunk of text with the marker that precedes it.  Using
        # match positions (rather than re.split and a second hard-coded
        # pattern) works for any splitter, with or without capturing groups.
        label = '<!-- Slide number: 0 -->'
        cursor = 0
        chunks = []
        for marker in marker_re.finditer(markdown_text):
            chunks.append((label, markdown_text[cursor:marker.start()]))
            label = marker.group(0).strip()
            cursor = marker.end()
        chunks.append((label, markdown_text[cursor:]))

        presentation = json.loads(self.raw_json)  # type: ignore
        presentation['blocks'] = []  # type: ignore
        for label, text in chunks:
            text = text.strip()
            if not text:
                # Nothing before the first marker, or an empty slide.
                continue
            ast = json.loads(pypandoc.convert_text(text, "json", "md"))
            blocks = ast.get('blocks') or []
            # bIgnoreLevel=True: every block of the chunk, including headers,
            # goes straight into this slide's Content.
            slide, _ = self.reorg_section(new_slide_section(label), iter(blocks), bIgnoreLevel=True)
            presentation['blocks'].append(slide)

        return json.dumps(presentation, ensure_ascii=False)

    def to_markdown(self) -> str | None:
        """Return the stored raw Markdown text, or ``None`` when none is set."""
        return self.raw_markdown

    def to_html(self) -> str | None:
        """Convert the raw Markdown to HTML with pypandoc; ``None`` when there is no Markdown."""
        source = self.raw_markdown
        return pypandoc.convert_text(source, "html", "md") if source else None

    def to_latex(self) -> str | None:
        """Convert the raw Markdown to LaTeX with pypandoc; ``None`` when there is no Markdown."""
        source = self.raw_markdown
        return pypandoc.convert_text(source, "latex", "md") if source else None

    def to_json(self) -> str | None:
        """Return the stored ``raw_json`` text, or ``None`` when it is not set.

        No conversion happens here; the stored value is returned as is.
        """
        return self.raw_json

    async def embed(self, action: Optional[Callable] = None, db_path: Optional[Path] = Path("../db")) -> str:
        """Embed the summaries found in ``semantics_json`` and cache the result.

        The semantics tree is walked depth-first.  ``action`` is applied to
        every dict/list node; each dict whose ``'t'`` is in ``Embed_Types``
        additionally receives

        * ``'i'`` - SHA-256 of the ids of its embedded ancestors plus its own
          canonical JSON,
        * ``'e'`` - the embedding vector of its ``'s'`` summary text,
        * ``'m'`` - ``{'obj_path': [...ancestor ids...], 'embed_model': ...}``.

        The root dict gets the same three keys, computed from the file path
        and its ``'summary'`` text.  The result is stored in ``embed_json``
        and written to ``<stem>_embed.json`` next to the Markdown file.

        Args:
            action: Callable applied to each node; defaults to the class's
                ``identity`` function.
            db_path: Directory for the persistent Chroma client.  ``None``
                keeps the ``db_client`` already set on the instance.

        Returns:
            The embedded tree as JSON text.

        Raises:
            ValueError: if ``semantics_json`` is empty.
        """
        if not self.semantics_json:
            raise ValueError("semantics_json content is empty. Cannot embed the content.")
        if action is None:
            action = self.__class__.identity

        if self.embed_json:
            # If embed_json is already set, we don't need to walk the AST again
            print("embed_json is already set. Skipping walk.")
            return self.embed_json

        ast = json.loads(self.semantics_json)  # type: ignore
        assert isinstance(ast, dict), f"AST should be a dictionary, got {type(ast)}"

        embed_model = "bge-m3"

        def content_id(prefix: str, payload: dict) -> str:
            """Return the SHA-256 hex digest of *prefix* + canonical JSON of *payload*."""
            canonical = json.dumps(payload,
                                   sort_keys=True,
                                   ensure_ascii=False,
                                   separators=(',', ';'))
            return hashlib.sha256((prefix + canonical).encode("utf-8")).hexdigest()

        async def embed_text(text: str) -> tuple[list, Optional[str]]:
            """Return (vector, model name) for *text* from the embedding model."""
            response = await self.llm_client.embed(model=embed_model, input=text)
            # embed() answers with one vector per input; a single string was sent.
            vectors = response.embeddings
            vector = list(vectors[0]) if vectors else []
            return vector, response.model

        # Document-level id and embedding, computed before the tree is modified.
        # NOTE(review): the root reads 'summary' while nodes read 's' - confirm
        # that this difference is intended.
        root_id = content_id(str(self.file_path), ast)
        root_vector, _ = await embed_text(ast.get('summary', ''))
        # Stack of the ids of the embedded nodes enclosing the current node.
        object_path = [root_id]

        if db_path is not None:
            self.db_client = PersistentClient(path=str(db_path), settings=Settings(allow_reset=True))  # type: ignore
        # Create or get the collection in the database 'mitochondria'
        # NOTE(review): nothing is written to either collection yet.
        collection = self.db_client.get_or_create_collection(name="mitochondria")
        # Ephemeral database for testing purposes, to be commented out
        ephemeral_db_client = chromadb.EphemeralClient()  # type: ignore
        if ephemeral_db_client:
            collection = ephemeral_db_client.get_or_create_collection(name="mitochondria")

        async def embed_node(node: dict) -> dict:
            """Attach id, embedding and meta information to *node*."""
            assert node.get('t') in self.Embed_Types, f"Node type {node.get('t')} is not in Embed_Types: {self.Embed_Types}"
            # The id depends on the enclosing path and on the node's content.
            node['i'] = content_id(str(object_path), node)
            vector, model_name = await embed_text(node.get('s', ''))
            node['e'] = vector
            node['m'] = {
                # A copy: the shared stack keeps changing during the walk.
                'obj_path': list(object_path),
                'embed_model': model_name,
            }
            return node

        async def walk_children(children: list) -> list:
            """Walk every dict/list member of *children*; other values are kept as is."""
            return [
                await walk_node(child) if isinstance(child, (dict, list)) else child
                for child in children
            ]

        async def walk_node(node: dict | list) -> dict | list:
            node = action(node)
            if isinstance(node, list):
                return await walk_children(node)
            if not isinstance(node, dict):
                return node

            # Snapshot the children first so the keys added by embed_node
            # ('i', 'e', 'm') are not walked as if they were document content.
            children = [(key, value) for key, value in node.items()
                        if isinstance(value, (dict, list))]
            embedded = node.get('t') in self.Embed_Types
            if embedded:
                await embed_node(node)
                object_path.append(node['i'])
            try:
                for key, value in children:
                    if isinstance(value, list):
                        node[key] = await walk_children(value)
                    else:
                        # walk_node embeds the child itself when needed.
                        node[key] = await walk_node(value)
            finally:
                if embedded:
                    # Leave this node's scope so siblings do not inherit its id.
                    object_path.pop()
            return node

        ast = await walk_node(ast)
        # Attach the document-level metadata after the walk so it is not
        # passed through `action`.
        ast['i'] = root_id
        ast['e'] = root_vector
        ast['m'] = {
            'obj_path': [root_id],
            'embed_model': embed_model,
        }

        self.embed_json = json.dumps(ast, ensure_ascii=False)
        embed_json_file = self.file_path.parent / (str(self.file_path.stem) + "_embed.json")  # type: ignore
        with open(embed_json_file, "w", encoding="utf-8") as f:
            f.write(self.embed_json)
        return self.embed_json

    def walk_nodes_with_line_number(self, action: Optional[Callable] = None) -> None:
        """Print every key and scalar of the AST JSON file with its line number.

        The file at ``ast_json_file`` is loaded with ``jsoncfg`` and walked
        recursively; ``action`` (default: the class's ``identity`` function)
        is applied to each node before it is inspected.

        Raises:
            ValueError: if ``raw_json`` is empty, or a node is of an
                unexpected type.
        """
        if not self.raw_json:
            raise ValueError("raw_json content is empty. Cannot walk the AST.")
        visit = self.__class__.identity if action is None else action

        root = jsoncfg.load_config(str(self.ast_json_file))

        def report(node) -> None:
            node = visit(node)
            if isinstance(node, ConfigJSONObject):
                # Object: report each key, then descend into its value.
                for key, value in node:
                    print(f"key   \"{key}\" at line {jsoncfg.node_location(value).line}")
                    report(value)
                return
            if isinstance(node, ConfigJSONArray):
                for element in node:
                    report(element)
                return
            if not isinstance(node, ConfigJSONScalar):
                raise ValueError(f"Unknown node type: {type(node)}")
            # Scalar: calling the node yields its value.
            value = node()
            if isinstance(value, str):
                value = value.strip()
            print(f"value \"{value}\" at line {jsoncfg.node_location(node).line}")

        report(root)
    def walk(self, action: Optional[Callable] = None) -> None:
        """Apply ``action`` to every dict/list node of ``raw_json``.

        The transformed tree is stored as JSON text in ``ast_json``.
        ``action`` defaults to the class's ``identity`` function.

        Raises:
            ValueError: if ``raw_json`` is empty.
        """
        if not self.raw_json:
            raise ValueError("raw_json content is empty. Cannot walk the AST.")
        visit = self.__class__.identity if action is None else action

        def descend(children: list) -> list:
            # Only containers are visited; scalars are kept untouched.
            return [
                visit_node(child) if isinstance(child, (dict, list)) else child
                for child in children
            ]

        def visit_node(node):
            node = visit(node)
            if isinstance(node, dict):
                for key, value in node.items():
                    if isinstance(value, list):
                        node[key] = descend(value)
                    elif isinstance(value, dict):
                        node[key] = visit_node(value)
            elif isinstance(node, list):
                node = descend(node)
            return node

        tree = visit_node(json.loads(self.raw_json))
        self.ast_json = json.dumps(tree, ensure_ascii=False).encode("utf-8").decode("utf-8")
        
    def reorg_section(self, section: dict, it: Iterator, bIgnoreLevel:bool=False) -> tuple[dict, Iterator]:
        """
        Move the items that follow a Section node into that Section's Content.

        Items are taken from ``it`` until it is exhausted or, unless
        ``bIgnoreLevel`` is set, a Section of the same or a higher level
        (smaller or equal number) shows up.  That Section is pushed back onto
        the returned iterator.  Deeper Sections are filled recursively and
        then appended like any other item.

        Returns the (mutated) ``section`` and the iterator to continue from.
        """
        assert section.get('t') == "Section", "Expected a Section node"
        level = section['c'][0]['c'][0]
        exhausted = object()

        # `it` may be rebound by the recursive call below, so each item is
        # pulled with next() instead of a `for` loop bound to one iterator.
        while (item := next(it, exhausted)) is not exhausted:
            if not bIgnoreLevel and isinstance(item, dict) and item.get('t') == "Section":
                next_level = item['c'][0]['c'][0]
                if next_level >= level + 1:
                    # Deeper section: let it collect its own content first.
                    item, it = self.reorg_section(item, it, bIgnoreLevel=False)
                elif next_level == level or next_level < level:
                    # Same or higher level: this section ends here; hand the
                    # item back to the caller.
                    return section, iter([item] + list(it))
                else:
                    raise ValueError(f"Unexpected Section level encountered: node: {item}")

            assert section['c'][1].get('t') == "Content", "Expected a Content node"
            section['c'][1]['c'].append(item)

        return section, it
        

    def reorg(self, action: Optional[Callable] = None) -> str:
        """
        Rebuild the raw AST so that every Header node becomes a Section node.

        Parses `self.raw_json`, walks it recursively and, for each dict child whose
        `t` is "Header", rewrites that child in place into
        `{"t": "Section", "c": [<Header node>, {"t": "Content", "c": []}]}`.
        Each sibling list is then re-read, and every Section found is handed to
        `self.reorg_section` together with the iterator over the remaining siblings.
        Every header level encountered is appended to `self.section_level`.

        Args:
            action: Callable applied to each dict/list node before it is processed.
                Defaults to `identity`.

        Returns:
            The reorganized AST serialized as a JSON string (non-ASCII characters are
            not escaped).

        Raises:
            ValueError: If `self.raw_json` is empty.
        """
        if not self.raw_json:
            raise ValueError("raw_json content is empty. Cannot walk the AST.")
        if action is None:
            action = self.__class__.identity

        ast = json.loads(self.raw_json)

        
        def reorg_node(node):
            """Apply `action` to `node`, then recurse into it and group its Headers into Sections."""
            node = action(node)     

            if isinstance(node, dict):
                # Only existing keys are reassigned below, so iterating items() stays valid.
                for key, value in node.items():
                    if isinstance(value, list):
                        # Pass 1: recurse into each child and turn Header children into Section shells.
                        new_list = []
                        for child in value:
                            if isinstance(child, (dict, list)):
                                new_list.append(reorg_node(child))
                            else:
                                new_list.append(child)
                            # NOTE(review): the rewrite below mutates `child`, while `new_list`
                            # holds the value returned by reorg_node(child). They are the same
                            # object only when `action` returns its argument (as `identity`
                            # does) — confirm for custom actions.
                            if isinstance(child, dict):
                                # If the child is a dict, check if it has a 't' key
                                if 't' in child:
                                    t = child['t']
                                    if t == "Header":
                                        # If the node is a Header, create a new section 
                                        # and include the header in its content 
                                        # along with the following nodes until the next Header
                                        child['t'] = "Section"
                                        child['c'] = [
                                            {
                                                't': 'Header',
                                                'c': [
                                                    child['c'][0],  # Header level
                                                    child['c'][1],  # presumably the header attributes (pandoc Attr) — confirm
                                                    child['c'][2],  # presumably the header inline content — confirm
                                                ]
                                            },
                                            {
                                                't': 'Content',
                                                'c': []
                                            }
                                        ]
                                        self.section_level.append(child['c'][0]['c'][0])  # Append the header level to section_level
                        # Pass 2: re-read the siblings; each Section is completed by reorg_section,
                        # which returns the section and the iterator to continue from.
                        it = iter(new_list)
                        new_value = []
                        while True:
                            try:
                                item = next(it)
                                if isinstance(item, dict) and item.get('t') == "Section":  # entry point of the section recursion
                                    item, it = self.reorg_section(item,it, bIgnoreLevel=False)
                                new_value.append(item)
                            except StopIteration:
                                break
                        node[key] = new_value
                    elif isinstance(value, dict):
                        node[key] = reorg_node(value)
            elif isinstance(node, list):
                # Bare lists are only recursed into; no Header grouping happens at this level.
                node = [
                    reorg_node(child) if isinstance(child, (dict, list)) else child
                    for child in node
                ]
            return node

        ast = reorg_node(ast)
        return json.dumps(ast, ensure_ascii=False).encode("utf-8").decode("utf-8")
        
    async def textualize(self, action: Optional[Callable] = None) -> None:
        """
        Summarize the document AST bottom-up with the LLM and store the result in `self.ast_json`.

        Every node gets an `s` entry holding its summary; the document gets `title`,
        `file_path` and `summary` entries. Only the jsoncfg-based pipeline
        (`summary_nodewline_main`) is executed. The plain-`json` pipeline
        (`summary_node_main` / `summary_node_async`) is defined but not called.

        Args:
            action: Optional callable; defaults to `identity`.
                NOTE(review): it is currently never applied to any node.

        Raises:
            ValueError: If `self.ast_json` is empty, a node lacks a `t` key, an Image
                node is malformed or its link does not match `image_link_pattern`,
                or an unsupported value type is encountered.
        """
        async def get_leaf_summary_async(node:str, min_len:int) -> str:
            """Return `node` unchanged when it is shorter than `min_len`, otherwise its LLM summary."""
            if len(node) < min_len:
                return node
            # Long enough to be worth summarizing
            response = await get_text_summary_response_async(self.llm_client, node, model="gemma3:27b", role="user", lang='zh')
            if response.message.content:
                return response.message.content
            else:
                # If the response is empty, return the original node
                return node

        async def get_list_summary_async(root: list) -> str:
            """
            Summarize a list by summarizing each element, joining the results with
            spaces and passing the joined text through `get_leaf_summary_async`.

            Strings and numbers are summarized as text, dicts contribute their
            existing `s` entry (or ""), nested lists recurse, and None contributes "".

            Raises:
                ValueError: For any other element type.
            """
            list_summary = []
            for n in root:
                if isinstance(n, str):
                    # If the element is a string, get its summary
                    summary = await get_leaf_summary_async(n, min_len=self.leaf_min_len)
                elif isinstance(n, int) or isinstance(n, float):
                    # If the element is a number, get its summary
                    summary = await get_leaf_summary_async(str(n), min_len=self.leaf_min_len)
                elif isinstance(n, dict):
                    # If the dict has a 's' key, use it as the summary
                    summary = n.get('s', '')
                elif isinstance(n, list):
                    # If the element is a list, get its summary
                    summary = await get_list_summary_async(n)
                elif n is None:
                    # If the element is None, skip it
                    summary = ""
                else:
                    raise ValueError(f"Unsupported element type: {type(n)} in {n}")
                list_summary.append(summary)
                
            # Concatenate all elements and summarize
            # concatenated = " ".join(list_summary)
            # response = get_text_summary_response(concatenated, model="gemma3:27b", role="user", lang='zh')
            # if response.message.content:
            #     return response.message.content
            # else:
            #     # If the response is empty, raise an exception
            #     raise ValueError("Summary response is empty. Please check the input data.")
            return await get_leaf_summary_async(" ".join(list_summary), min_len=self.leaf_min_len)

        async def summary_nodewline_main(action: Optional[Callable] = None) -> None:
            """
            Entry point of the jsoncfg-based pipeline: load the AST from
            `self.ast_json_file`, summarize its blocks, add `title`, `file_path`
            and `summary`, and store the serialized result in `self.ast_json`.

            NOTE(review): this assumes jsoncfg config nodes support item assignment
            and `json.dumps` — confirm against the jsoncfg API.
            """
            if not self.ast_json:
                raise ValueError("ast_json content is empty. Cannot walk the AST and summarize.")
            if action is None:
                action = self.__class__.identity

            ast = jsoncfg.load_config(str(self.ast_json_file))
            blocks = ast["blocks"]

            ast['blocks'] = await summary_nodewline_async(blocks)
            if ast.get('title') is None:
                # If the title is not set, use the file name as the title
                if self.title is None:
                    self.title = str(self.root_path)
            ast['title'] = self.title  # Add the title to the AST
            ast['file_path'] = str(self.file_path)  # Add the file path to the AST

            assert isinstance(blocks, list), f"Expected a list of blocks, got {type(blocks)}"
            # Block summaries are read from the original `blocks`, whose dicts were annotated in place
            dict_summary = [b['s'] for b in blocks if isinstance(b, dict) and 's' in b]
            dict_summary = [ast['title']] + dict_summary  # Add the title to the summary list
            # the summary of the document from the summaries in the list of blocks
            if not ast['blocks'] or dict_summary == []:
                # If the blocks are empty, set the summary to an empty string
                ast['summary'] = ""
                # Serialize the AST with the empty summary
                self.ast_json = json.dumps(ast, ensure_ascii=False).encode("utf-8").decode("utf-8")
            else:
                # If the blocks are not empty, set the summary to the concatenated summaries
                doc_summary = await get_leaf_summary_async(" ".join(dict_summary), min_len=self.leaf_min_len)
                ast['summary'] = doc_summary
                # Convert the summarized AST back to JSON
                self.ast_json = json.dumps(ast, ensure_ascii=False).encode("utf-8").decode("utf-8")
            
        async def summary_nodewline_async(node: ConfigNode) -> ConfigNode:
            """
            Recursively summarize a jsoncfg node.

            Object nodes get `node["s"]` set to their summary and are returned;
            array nodes are replaced by a list of processed children; string scalars
            are replaced by their (possibly summarized) text.

            NOTE(review): relies on jsoncfg nodes supporting `.get`, item assignment
            and `(key, value)` iteration — confirm against the jsoncfg API.
            """
            if isinstance(node, ConfigJSONObject):
                try:
                    t = node["t"]
                except KeyError:
                    raise ValueError(f"Node does not have a 't' key: {node}")
                if t == "Image":  # Image summary, the Image node is as defined in the pandoc AST
                    summary = []
                    if (not node['c'][1] == []) and (node['c'][1][0] is not None) and (node['c'][1][0].get('c') is not None):
                        summary.append(node['c'][1][0]['c'])  # The content of the second element is the image caption
                    try:
                        # summary.append(node['c'][0])  # The first element are defined to be attributes of the image rendering, i.e. content-irrelevant.
                        # If the node is an image, get its link
                        image_link = self.root_path / node['c'][2][0]  # Assuming the image link is in the third element of the list
                        image_link = str(image_link)
                        if image_link and re.search(image_link_pattern, image_link):  # re.match leads to empty match if there's spaces in the path!
                            # If the node is an image link, get its summary
                            response = await get_image_summary_response_async(
                                client=self.llm_client,
                                image_link=image_link,
                                model="gemma3:27b",
                                role="user",
                                lang='zh'
                                )
                            summary.append(response.message.content)
                        else:
                            # The resolved path does not look like an image link
                            raise ValueError(f"Invalid image link: {image_link}")

                    except (IndexError, KeyError):
                        # Handle cases where the image link is not in the expected format
                        raise ValueError(f"Invalid image node structure: {node}")
                    # c[2][1]: the image title (second element of the target pair)
                    summary.append(node['c'][2][1])

                    # Condense caption, image description and title into one summary
                    response_txt = await get_text_summary_response_async(
                        client=self.llm_client,
                        content=" ".join(summary),
                        model="gemma3:27b",
                        role="user",
                        lang='zh'
                    )
                    node["s"] = response_txt.message.content

                    print(f"Summarize image: {image_link}")
                elif t == "Cite" or t == "AlignDefault" or t == "ColWidth" :  # These node types are ignored
                    node["s"] = ""  # Set the summary to an empty string
                else: # TextBlock summary
                    c = node.get("c", None)
                    if not c:
                        node["s"] = ""
                    else:
                        dict_summary = []
                        for key, value in node: # get summary of the values (content)
                            if key == "t":
                                continue
                            if isinstance(value, ConfigJSONArray):  # get the summary of the string list
                                if value == []:
                                    # If the list is empty, skip it
                                    continue
                                # If the value is a list, summarize each element
                                # NOTE(review): children are tested against dict/list here, not
                                # against the jsoncfg node classes — confirm this is intended.
                                node[key] = [
                                    await summary_nodewline_async(child)
                                    if isinstance(child, (dict,list))
                                    else child
                                    for child in value
                                ]
                                dict_summary.append(await get_list_summary_async(value))
                            elif isinstance(value, ConfigJSONObject):
                                child = await summary_nodewline_async(value)  # insert the value['s']
                                assert isinstance(child, ConfigJSONObject) and 's' in child, f"Expected dict with 's' key, got {child}"
                                dict_summary.append(child['s'])

                            elif value is None or value == "":
                                # If the value is None, skip it
                                continue
                            elif isinstance(value, ConfigJSONScalar):
                                v = value()
                                if isinstance(v, str):
                                    v = v.strip()
                                else:
                                    raise ValueError(f"Unsupported scalar type: {type(v)} in {v}")
                                dict_summary.append(await get_leaf_summary_async(v,min_len=self.leaf_min_len))
                        # get the summary of the node
                        if dict_summary:  # type: ignore
                            # If there are summaries, concatenate them
                            node["s"] = await get_leaf_summary_async(" ".join(dict_summary),min_len=self.leaf_min_len)
                            # node["s"] = get_text_summary_response(
                            #     " ".join(dict_summary), model="gemma3:27b", role="user", lang='zh'
                            # ).message.content
                        else:
                            # If no summaries, set to empty string
                            node["s"] = ""

                    # Progress reporting for tables and sections
                    if t == "Table":
                        print(f"{self.file_path} Summarize table: {self.table_count}")
                        self.table_count += 1
                    elif t == "Section":
                        print(f"{self.file_path} Summarize section: {self.section_count} section depth {node['c'][0]['c'][0]}")
                        self.section_count += 1

            elif isinstance(node, ConfigJSONArray):
                node = [
                    await summary_nodewline_async(child) if isinstance(child, (ConfigJSONObject, ConfigJSONArray)) 
                    else child
                    for child in node
                    ]
            elif isinstance(node, ConfigJSONScalar):
                value = node()
                if isinstance(value, str):
                    value = value.strip()
                    node = await get_leaf_summary_async(value, min_len=self.leaf_min_len)
                else:
                    raise ValueError(f"Unsupported scalar type: {type(value)} in {value}")
            else:
                raise ValueError(f"Unsupported node type: {type(node)} in {node}")

            return node

        async def summary_node_main(action: Optional[Callable] = None) -> None:
            """
            Entry point of the plain-`json` pipeline: parse `self.ast_json`, summarize
            its blocks, add `title`, `file_path` and `summary`, and store the
            serialized result back in `self.ast_json`.

            Currently not called by `textualize`.
            """
            if not self.ast_json:
                raise ValueError("ast_json content is empty. Cannot walk the AST and summarize.")
            if action is None:
                action = self.__class__.identity

            ast = json.loads(self.ast_json)
            blocks = ast.get("blocks", [])

            ast['blocks'] = await summary_node_async(blocks)
            if ast.get('title') is None:
                # If the title is not set, use the file name as the title
                if self.title is None:
                    self.title = str(self.root_path)
            ast['title'] = self.title  # Add the title to the AST
            ast['file_path'] = str(self.file_path)  # Add the file path to the AST

            assert isinstance(blocks, list), f"Expected a list of blocks, got {type(blocks)}"
            # Block summaries are read from the original `blocks`, whose dicts were annotated in place
            dict_summary = [b['s'] for b in blocks if isinstance(b, dict) and 's' in b]
            dict_summary = [ast['title']] + dict_summary  # Add the title to the summary list
            # the summary of the document from the summaries in the list of blocks
            if not ast['blocks'] or dict_summary == []:
                # If the blocks are empty, set the summary to an empty string
                ast['summary'] = ""
                # Serialize the AST with the empty summary
                self.ast_json = json.dumps(ast, ensure_ascii=False).encode("utf-8").decode("utf-8")
            else:
                # If the blocks are not empty, set the summary to the concatenated summaries
                doc_summary = await get_leaf_summary_async(" ".join(dict_summary), min_len=self.leaf_min_len)
                ast['summary'] = doc_summary
                # Convert the summarized AST back to JSON
                self.ast_json = json.dumps(ast, ensure_ascii=False).encode("utf-8").decode("utf-8")
            
        async def summary_node_async(node: dict | list) -> dict | list:
            """
            Recursively summarize a node of the `json.loads`-decoded AST.

            Dict nodes get `node["s"]` set to their summary and are returned; lists
            are replaced by a list of processed children; a bare string is replaced
            by its (possibly summarized) text.
            """
            # This pipeline works on plain dicts from json.loads, so dict nodes must be
            # matched as `dict`; testing for ConfigJSONObject sent them to the str branch.
            if isinstance(node, dict):
                try:
                    t = node["t"]
                except KeyError:
                    raise ValueError(f"Node does not have a 't' key: {node}")
                if t == "Image":  # Image summary, the Image node is as defined in the pandoc AST
                    summary = []
                    if (not node['c'][1] == []) and (node['c'][1][0] is not None) and (node['c'][1][0].get('c') is not None):
                        summary.append(node['c'][1][0]['c'])  # The content of the second element is the image caption
                    try:
                        # summary.append(node['c'][0])  # The first element are defined to be attributes of the image rendering, i.e. content-irrelevant.
                        # If the node is an image, get its link
                        image_link = self.root_path / node['c'][2][0]  # Assuming the image link is in the third element of the list
                        image_link = str(image_link)
                        if image_link and re.search(image_link_pattern, image_link):  # re.match leads to empty match if there's spaces in the path!
                            # If the node is an image link, get its summary
                            response = await get_image_summary_response_async(
                                client=self.llm_client,
                                image_link=image_link,
                                model="gemma3:27b",
                                role="user",
                                lang='zh'
                                )
                            summary.append(response.message.content)
                        else:
                            # The resolved path does not look like an image link
                            raise ValueError(f"Invalid image link: {image_link}")

                    except (IndexError, KeyError):
                        # Handle cases where the image link is not in the expected format
                        raise ValueError(f"Invalid image node structure: {node}")
                    # c[2][1]: the image title (second element of the target pair)
                    summary.append(node['c'][2][1])

                    # Condense caption, image description and title into one summary
                    response_txt = await get_text_summary_response_async(
                        client=self.llm_client,
                        content=" ".join(summary),
                        model="gemma3:27b",
                        role="user",
                        lang='zh'
                    )
                    node["s"] = response_txt.message.content

                    print(f"Summarize image: {image_link}")
                elif t == "Cite" or t == "AlignDefault" or t == "ColWidth" :  # These node types are ignored
                    node["s"] = ""  # Set the summary to an empty string
                else: # TextBlock summary
                    c = node.get("c", None)
                    if not c:
                        node["s"] = ""
                    else:
                        dict_summary = []
                        for key, value in node.items(): # get summary of the values (content)
                            if key == "t":
                                continue
                            if isinstance(value, list):  # get the summary of the string list
                                if value == []:
                                    # If the list is empty, skip it
                                    continue
                                # If the value is a list, summarize each element
                                node[key] = [
                                    await summary_node_async(child)
                                    if isinstance(child, (dict,list))
                                    else child
                                    for child in value
                                ]
                                dict_summary.append(await get_list_summary_async(value))
                            elif isinstance(value, dict):
                                child = await summary_node_async(value)  # insert the value['s']
                                assert isinstance(child, dict) and 's' in child, f"Expected dict with 's' key, got {child}"
                                dict_summary.append(child['s'])

                            elif value is None or value == "":
                                # If the value is None, skip it
                                continue
                            else:
                                if not isinstance(value, str):
                                    # If the value is not a string, convert it to a string
                                    value = str(value)
                                dict_summary.append(await get_leaf_summary_async(value,min_len=self.leaf_min_len))
                        # get the summary of the node
                        if dict_summary:  # type: ignore
                            # If there are summaries, concatenate them
                            node["s"] = await get_leaf_summary_async(" ".join(dict_summary),min_len=self.leaf_min_len)
                            # node["s"] = get_text_summary_response(
                            #     " ".join(dict_summary), model="gemma3:27b", role="user", lang='zh'
                            # ).message.content
                        else:
                            # If no summaries, set to empty string
                            node["s"] = ""

                    # Progress reporting for tables and sections
                    if t == "Table":
                        print(f"{self.file_path} Summarize table: {self.table_count}")
                        self.table_count += 1
                    elif t == "Section":
                        print(f"{self.file_path} Summarize section: {self.section_count} section depth {node['c'][0]['c'][0]}")
                        self.section_count += 1

            elif isinstance(node, list):
                node = [
                    await summary_node_async(child) if isinstance(child, (dict,list)) else child
                    for child in node
                ]
            else:
                assert isinstance(node, str), f"Expected a string node, got {type(node)}, on json line"
                # If the node is a string, get its summary
                node_summary = await get_leaf_summary_async(node, min_len=self.leaf_min_len)
                node = node_summary

            return node

        # Run the summary_nodewline_main function asynchronously
        # asyncio.run(summary_nodewline_main(action))
        await summary_nodewline_main(action)
        return 
        
    @classmethod
    def identity(cls, obj):
        """
        No-op node action: hand `obj` back untouched.

        Used as the default `action` when the caller supplies none.
        """
        return obj

In [63]:
if True:
    # Smoke test: build the DOM for one sample manual and print its AST nodes with line numbers.
    md_file = "../res/.md.mid.hrsl.test.not.processed/SX322003《新松机器人弧焊应用操作手册》(B-2)/SX322003《新松机器人弧焊应用操作手册》(B-2).md"
    print(f"Using file: {md_file}")
    md_file = Path(md_file)
    print(f"File exists: {md_file.exists()}")

    if md_file.exists():
        dom = DOM(md_file, client=chat_client)
        dom.setup()
        # A missing/empty JSON payload is reported as length 0
        print(f"Raw JSON length: {len(dom.raw_json or '')}")
        print(f"AST JSON length: {len(dom.ast_json or '')}")
        dom.walk_nodes_with_line_number()
        # jsoncfg.load_config(str(dom.ast_json_file))

Using file: ../res/.md.mid.hrsl.test.not.processed/SX322003《新松机器人弧焊应用操作手册》(B-2)/SX322003《新松机器人弧焊应用操作手册》(B-2).md
File exists: True
Raw JSON length: 1368241
AST JSON length: 32442951
key   "pandoc-api-version" at line 2
value "1" at line 3
value "23" at line 4
value "1" at line 5
key   "meta" at line 7
key   "blocks" at line 8
key   "t" at line 10
value "Para" at line 10
key   "c" at line 11
key   "t" at line 13
value "Strong" at line 13
key   "c" at line 14
key   "t" at line 16
value "Str" at line 16
key   "c" at line 17
value "机器人让世界更美好" at line 17
key   "t" at line 24
value "Para" at line 24
key   "c" at line 25
key   "t" at line 27
value "Strong" at line 27
key   "c" at line 28
key   "t" at line 30
value "Str" at line 30
key   "c" at line 31
value "ROBOTS" at line 31
key   "t" at line 34
value "Space" at line 34
key   "t" at line 37
value "Str" at line 37
key   "c" at line 38
value "MAKE" at line 38
key   "t" at line 41
value "Space" at line 41
key   "t" at line 44
value "Str" at lin

In [None]:
       
        # NOTE(review): every line of this cell carries the 8-space indent of the
        # `if md_file.exists():` body in the previous cell; it looks like it was split
        # off from there — confirm it is meant to run as a standalone cell.
        # Test textualize functionality
        print("Starting textualization...")
        await dom.textualize()  # the summarized AST is read back from dom.ast_json below
        
        # Save the summarized AST next to the Markdown file as <stem>_semantics.json
        js_semantics_file = md_file.parent / (str(md_file.stem) + "_semantics.json")
        js_semantics_file.write_text(dom.ast_json, encoding="utf-8")
        print(f"Semantics saved to: {js_semantics_file}")


In [22]:
if False:
    # md_file = Path(os.getcwd()).parent / r'res/md.json.semantics/06 产品手册/软件类/手册/V4.0版本/SX322001《新松机器人通用操作手册》（B-3)/SX322001《新松机器人通用操作手册》（B-3).md'
 
    # md_file = Path(os.getcwd()).parent / r"res/md.mid.hrsl.test/06 产品手册/机械类/SR122001Installation and Maintenance Manual of Siasun SR12 Series Industrial Robot-A.0_20250304110301/SR122001Installation and Maintenance Manual of Siasun SR12 Series Industrial Robot-A.0_20250304110301.md" 
    # md_file = Path("/v/data/新型机器人智能问答系统数据源-md/.md.mid.hrsl.test.not.processed/DSS_MD_CoC_25_HEL_014 Issue 1/DSS_MD_CoC_25_HEL_014 Issue 1.md") 
    # md_file = Path('/d/devel/rag/ribosome/res/md.hrsl/01 设计标准/SX047001新松机器人产品识别设计标准A-1/SX047001新松机器人产品识别设计标准A-1.md') 
    
    # md_file = Path(os.getcwd()).parent / 'res/siasun_md_sample_hrsl/SR02400401/SR02400401《 SIASUN SR210A-210-2.65 Specifications-CE》A-0.md'
    # md_file = Path(os.getcwd()).parent / 'res/siasun_md_sample_hrsl/SX322002/SX322002.md'
    # md_file = Path(os.getcwd()).parent / 'res/siasun_md_sample_hrsl/SR024011/SR024011.md'
    # md_file = Path(os.getcwd()).parent / 'res/siasun_md_sample_hrsl/SN024002/SN024002.md'
    # md_file = Path(os.getcwd()).parent / 'res/siasun_md_sample/SN024002/SN024002.md'
    print(f"Using file: {md_file}")
    print(f"File exists: {md_file.exists()}")
    
    if md_file.exists():
        dom = DOM(md_file, client=chat_client)
        dom.setup()
        print(f"Raw JSON length: {len(dom.raw_json) if dom.raw_json else 0}")
        print(f"AST JSON length: {len(dom.ast_json) if dom.ast_json else 0}")
        
        js_sections_file = md_file.parent / (str(md_file.stem) + "_ast.json")
        print(f"Saving to: {js_sections_file}")
        # Write the JSON representation of the AST to the file
        js_sections_file.write_text(dom.ast_json, encoding="utf-8")
        
        # Test textualize functionality
        print("Starting textualization...")
        await dom.textualize()
        
        js_semantics_file = md_file.parent / (str(md_file.stem) + "_semantics.json")
        js_semantics_file.write_text(dom.ast_json, encoding="utf-8")
        print(f"Semantics saved to: {js_semantics_file}")
    else:
        print("File does not exist!")

In [23]:
def document_reorg(root_folder: Path | str) -> None:
    """
    iterates through a root folder recursively and analyzes the semantics of each Markdown document.
    generate a json file containing the semantical summary of each document 
    output in the same folder as the original markdown file.capitalize
    """
    root = Path(root_folder) if isinstance(root_folder, str) else root_folder
    for file in root.rglob("*.md"):
        dom = DOM(file)
        dom.setup()  # Load the Markdown content and convert it to JSON AST
        ast_json_file = file.parent / (str(file.stem) + "_ast.json")
        if not ast_json_file.exists() and dom.ast_json:
            ast_json_file.write_text(dom.ast_json, encoding="utf-8")
            print(f"Processing file: {file}")
        elif ast_json_file.exists():
            # print(f"File already processed: {file}")
            pass
        else:
            print(f"AST JSON missing: {file}")

# document_reorg(Path("/v/data/新型机器人智能问答系统数据源-md/.md.mid.hrsl.test.not.processed"))

# document_reorg(Path("../res/md.mid.hrsl.test"))
# document_reorg(Path("/v/data/documents-semantics/.md"))
# document_reorg(Path("../res/SN024002"))
# document_reorg(Path("../res/test_batch_async"))
# document_reorg(Path("/v/data/documents-semantics/.md.hrsl"))
# document_reorg(Path("../res/siasun_md_sample_hrsl"))

In [24]:
ResponseStatus.CANCELLED

<ResponseStatus.CANCELLED: 5>

In [25]:
#| export
async def analyze_one_document_async(md_file: Path, semaphore: asyncio.Semaphore) -> DOM:
    """
    Asynchronously analyzes the semantics of a Markdown document.

    Builds and sets up a `DOM` for `md_file`. Unless `dom.semantics_json` is already
    populated, runs `dom.textualize()` and writes `dom.ast_json` to
    `<stem>_semantics.json` next to the Markdown file. The outcome is recorded in
    `dom.analysis_status` (`status` plus a human-readable `exception` message).

    Args:
        md_file: Path of the Markdown document to analyze.
        semaphore: Bounds how many documents are processed at the same time.

    Returns:
        The `DOM` of the document, with `analysis_status.status` set to COMPLETED,
        CANCELLED or ERROR.

    Raises:
        asyncio.CancelledError, Exception: Re-raised unchanged when the failure or
            cancellation happens before the `DOM` was constructed, because there is
            then no object that could carry the status.
    """
    # Stays None until the constructor has returned, so the handlers below can tell
    # whether there is a DOM to report on (previously this was an UnboundLocalError).
    dom: Optional[DOM] = None
    try:
        async with semaphore:  # Limit the number of documents processed concurrently
            print(f"Analyzing document: {md_file}")
            with warnings.catch_warnings(record=True) as w:
                warnings.simplefilter("always")
                dom = DOM(md_file, client=chat_client)
                dom.setup()  # Load the Markdown content and convert it to JSON AST
                if w:
                    print(f"Warnings encountered while setting up DOM for {md_file}: {w}")
            if dom.semantics_json:
                # Nothing to do: report success without calling the LLM again
                dom.analysis_status.status = ResponseStatus.COMPLETED
                dom.analysis_status.exception = f"Document already analyzed: {md_file.stem}"
                return dom

            # NOTE(review): catch_warnings changes process-wide warning state and is held
            # across an await here; with several tasks in flight the contexts may be
            # exited out of order — confirm this is acceptable.
            with warnings.catch_warnings(record=True) as w:
                warnings.simplefilter("always")
                await dom.textualize()  # Summarize the document
                if w:
                    print(f"Warnings encountered while textualizing DOM for {md_file}: {w}")
            if dom.file_path:
                ast_json_file = md_file.parent / (str(md_file.stem) + "_semantics.json")
                ast_json_file.write_text(dom.ast_json, encoding="utf-8")  # type: ignore
                dom.analysis_status.status = ResponseStatus.COMPLETED  # type: ignore
                dom.analysis_status.exception = f"Finished analyzing document: {md_file.stem}"
    except asyncio.CancelledError:
        if dom is None:
            # Cancelled before a DOM existed: propagate the cancellation
            raise
        # Cancellation is recorded on the DOM and not propagated to the caller
        dom.analysis_status.status = ResponseStatus.CANCELLED  # type: ignore
        dom.analysis_status.exception = f"Analysis cancelled for document: {md_file}"  # type: ignore
    except Exception as e:
        if dom is None:
            # Construction failed: surface the real error instead of masking it
            raise
        dom.analysis_status.status = ResponseStatus.ERROR  # type: ignore
        dom.analysis_status.exception = f"Error occurred while analyzing document: {md_file}, Error: {e}"  # type: ignore

    return dom  # type: ignore

In [26]:
def check_one_document(md_file: Path) -> DOM | None:
    """
    Build the DOM for a Markdown file and decide whether it still needs analysis.

    This is a synchronous pre-check. Warnings raised while the DOM is
    constructed and set up are captured and printed instead of being emitted
    through the regular warnings machinery.

    Returns the freshly set-up DOM when its `semantics_json` is falsy, or
    None when the document is reported as already analyzed.
    """
    print(f"Checking document: {md_file}")
    with warnings.catch_warnings(record=True) as caught:
        # Record every warning, including ones that would normally be shown only once.
        warnings.simplefilter("always")
        candidate = DOM(md_file, client=chat_client)
        candidate.setup()  # Load the Markdown content and convert it to JSON AST
        if caught:
            print(f"Warnings encountered while setting up DOM for {md_file}: {caught}")

    # Guard clause: anything without semantics yet is handed back for analysis.
    if not candidate.semantics_json:
        return candidate

    print(f"Document already analyzed: {md_file.stem}")
    return None


In [None]:
import tqdm

async def document_semantics_analysis(root_folder: Path, max_concurrency: int = 4) -> None:
    """
    Analyze the semantics of every Markdown document below `root_folder`.

    Each `*.md` file found recursively is first screened with
    `check_one_document`; documents that are already analyzed are skipped.
    The remaining files are processed concurrently by
    `analyze_one_document_async`, which writes a `<stem>_semantics.json` file
    into the same folder as the original Markdown file. One status line is
    printed for every document as it finishes.

    Args:
        root_folder: Directory searched recursively for `*.md` files.
        max_concurrency: Upper bound on the number of documents analyzed at
            the same time. Defaults to 4, the previously hard-coded limit.

    Raises:
        ValueError: If `max_concurrency` is smaller than 1 (a limit of 0
            would make every task wait on the semaphore forever).
    """
    if max_concurrency < 1:
        raise ValueError("max_concurrency must be at least 1")

    # One semaphore shared by all tasks limits the number of concurrent analyses.
    semaphore = asyncio.Semaphore(max_concurrency)

    # Screening is synchronous and completes before any analysis task is created.
    # NOTE(review): only the path of the screened DOM is passed on, so
    # analyze_one_document_async builds and sets up its own DOM again.
    pending_files = [
        dom.file_path
        for dom in map(check_one_document, root_folder.rglob("*.md"))
        if dom is not None
    ]
    to_do = [analyze_one_document_async(path, semaphore) for path in pending_files]  # type: ignore

    # as_completed yields awaitables in completion order; tqdm shows the progress.
    progress = tqdm.tqdm(
        asyncio.as_completed(to_do),
        total=len(to_do),
        desc="Processing files",
        unit="file",
    )
    for finished in progress:
        try:
            dom = await finished  # Await the completion of the task
        except Exception as e:
            # The failing file is not known here; only the error can be reported.
            print(f"Error processing file: {e}")
            continue
        print(f"Processed file: {dom.file_path} with title: {dom.title}, "
              f"status: {dom.analysis_status.status}, exception: {dom.analysis_status.exception}")

# Run the document semantics analysis on the two folders below, one after the
# other. Top-level `await` is used, so this only works in an environment that
# supports it (such as a notebook cell).
# await document_semantics_analysis(Path("../res/md.json.semantics/02 产品推介资料"))
# await document_semantics_analysis(Path("../res/md.json.semantics"))
await document_semantics_analysis(Path("/v/data/新型机器人智能问答系统数据源-md/.md.mid.hrsl.test.not.processed"))
await document_semantics_analysis(Path("/v/data/新型机器人智能问答系统数据源-md/.md.mid.hrsl.test.not.processible"))




# await document_semantics_analysis(Path("../res/SX023001《SIASUN Industrial Robotics》-A.1"))
# await document_semantics_analysis(Path("/v/data/documents-semantics/.md.mid.hrsl.test"))

# await document_semantics_analysis(Path("../res/test_batch_async"))
# await document_semantics_analysis(Path("../res/siasun_md_sample_hrsl"))
# await document_semantics_analysis(Path("../res/md.hrsl"))
# await document_semantics_analysis(Path("/v/data/documents-semantics/.md.hrsl"))
# await document_semantics_analysis(Path("/v/data/documents-semantics/.md.hrsl"))
# await document_semantics_analysis(Path("../res/SN024002"))
# await document_semantics_analysis(Path("/v/data/documents-semantics/.md.hrsl/05 技术规格"))
# await document_semantics_analysis(Path("/v/data/documents-semantics/.md.hrsl/06 产品手册"))
# await document_semantics_analysis(Path("/v/data/documents-semantics/.md.hrsl/06 产品样册"))
# await document_semantics_analysis(Path("/v/data/documents-semantics/.md.hrsl/08 检测报告与认证证书"))

Checking document: /v/data/新型机器人智能问答系统数据源-md/.md.mid.hrsl.test.not.processed/SX322001_Siasun Robot General Operation Manual_-V1.3/SX322001_Siasun Robot General Operation Manual_-V1.3.md
Checking document: /v/data/新型机器人智能问答系统数据源-md/.md.mid.hrsl.test.not.processed/SX322003《新松机器人弧焊应用操作手册》(B-2)/SX322003《新松机器人弧焊应用操作手册》(B-2).md
Checking document: /v/data/新型机器人智能问答系统数据源-md/.md.mid.hrsl.test.not.processed/SX322001《新松机器人通用操作手册》（B-2)/SX322001《新松机器人通用操作手册》（B-2).md


Processing files:   0%|          | 0/3 [00:00<?, ?file/s]

Analyzing document: /v/data/新型机器人智能问答系统数据源-md/.md.mid.hrsl.test.not.processed/SX322001《新松机器人通用操作手册》（B-2)/SX322001《新松机器人通用操作手册》（B-2).md
/v/data/新型机器人智能问答系统数据源-md/.md.mid.hrsl.test.not.processed/SX322001《新松机器人通用操作手册》（B-2)/SX322001《新松机器人通用操作手册》（B-2).md Summarize table: 0
/v/data/新型机器人智能问答系统数据源-md/.md.mid.hrsl.test.not.processed/SX322001《新松机器人通用操作手册》（B-2)/SX322001《新松机器人通用操作手册》（B-2).md Summarize table: 1
Analyzing document: /v/data/新型机器人智能问答系统数据源-md/.md.mid.hrsl.test.not.processed/SX322001_Siasun Robot General Operation Manual_-V1.3/SX322001_Siasun Robot General Operation Manual_-V1.3.md
Analyzing document: /v/data/新型机器人智能问答系统数据源-md/.md.mid.hrsl.test.not.processed/SX322003《新松机器人弧焊应用操作手册》(B-2)/SX322003《新松机器人弧焊应用操作手册》(B-2).md
/v/data/新型机器人智能问答系统数据源-md/.md.mid.hrsl.test.not.processed/SX322003《新松机器人弧焊应用操作手册》(B-2)/SX322003《新松机器人弧焊应用操作手册》(B-2).md Summarize table: 0
/v/data/新型机器人智能问答系统数据源-md/.md.mid.hrsl.test.not.processed/SX322003《新松机器人弧焊应用操作手册》(B-2)/SX322003《新松机器人弧焊应用操作手册》(B-2).md Summarize tabl

Processing files:  33%|███▎      | 1/3 [3:40:30<7:21:00, 13230.24s/file]

Processed file: /v/data/新型机器人智能问答系统数据源-md/.md.mid.hrsl.test.not.processed/SX322003《新松机器人弧焊应用操作手册》(B-2)/SX322003《新松机器人弧焊应用操作手册》(B-2).md with title: SX322003《新松机器人弧焊应用操作手册》(B-2), status: ResponseStatus.ERROR, exception: Error occurred while analyzing document: /v/data/新型机器人智能问答系统数据源-md/.md.mid.hrsl.test.not.processed/SX322003《新松机器人弧焊应用操作手册》(B-2)/SX322003《新松机器人弧焊应用操作手册》(B-2).md, Error: sequence item 0: expected str instance, list found
Summarize image: /v/data/新型机器人智能问答系统数据源-md/.md.mid.hrsl.test.not.processed/SX322001《新松机器人通用操作手册》（B-2)/img/img_166.png
/v/data/新型机器人智能问答系统数据源-md/.md.mid.hrsl.test.not.processed/SX322001《新松机器人通用操作手册》（B-2)/SX322001《新松机器人通用操作手册》（B-2).md Summarize table: 157
/v/data/新型机器人智能问答系统数据源-md/.md.mid.hrsl.test.not.processed/SX322001_Siasun Robot General Operation Manual_-V1.3/SX322001_Siasun Robot General Operation Manual_-V1.3.md Summarize table: 48
Summarize image: /v/data/新型机器人智能问答系统数据源-md/.md.mid.hrsl.test.not.processed/SX322001《新松机器人通用操作手册》（B-2)/img/img_167.png
/v/d

Processing files:  33%|███▎      | 1/3 [3:52:45<7:45:31, 13965.59s/file]


CancelledError: 

/v/data/新型机器人智能问答系统数据源-md/.md.mid.hrsl.test.not.processed/SX322001_Siasun Robot General Operation Manual_-V1.3/SX322001_Siasun Robot General Operation Manual_-V1.3.md Summarize section: 155 section depth 4


/v/data/新型机器人智能问答系统数据源-md/.md.mid.hrsl.test.not.processed/SX322001《新松机器人通用操作手册》（B-2)/SX322001《新松机器人通用操作手册》（B-2).md Summarize section: 120 section depth 3


## Asyncio interface for processing many Markdown files


In [None]:
async def process_one(file: Path) -> None:
    """
    Run the semantics analysis for a single Markdown file and save both stages.

    `<stem>_ast.json` is written right after the DOM has been set up, and
    `<stem>_semantics.json` is written once `textualize()` has finished. Both
    files are placed next to `file` and hold `dom.ast_json` as it is at the
    moment of writing.
    """
    print(f"Processing file started: {file}")
    out_dir, stem = file.parent, str(file.stem)

    document = DOM(file, client=chat_client)
    document.setup()
    # Snapshot of the AST before any summaries are added.
    (out_dir / (stem + "_ast.json")).write_text(document.ast_json, encoding="utf-8")

    await document.textualize()
    # Same property, written again after textualize() has run.
    semantics_json_file = out_dir / (stem + "_semantics.json")
    semantics_json_file.write_text(document.ast_json, encoding="utf-8")
    print(f"Semantics analysis completed for {file}. Results saved to {semantics_json_file}")

In [None]:
async def supervisor(root_folder: Path) -> int:
    """
    Fan out `process_one` over every `*.md` file found below `root_folder`.

    All coroutines are created up front and then awaited together with
    `asyncio.gather`, so there is no limit on how many run concurrently and
    an exception raised by any of them propagates to the caller.

    Returns the number of gathered results, i.e. the number of Markdown
    files that were found.
    """

    def _announce_and_create(md_path):
        # Announce the file, then create (but do not yet run) its coroutine.
        print(f"Processing file started: {md_path}")
        return process_one(md_path)

    pending = [_announce_and_create(md_path) for md_path in root_folder.rglob("*.md")]
    results = await asyncio.gather(*pending)
    return len(results)

In [None]:
def process_many(root_folder: Path) -> int:
    """
    Synchronous entry point: process every Markdown file below `root_folder`.

    Starts an event loop with `asyncio.run` and drives `supervisor` to
    completion.

    Args:
        root_folder: Directory searched recursively for `*.md` files.

    Returns:
        The value returned by `supervisor`, i.e. the number of gathered
        results.

    Raises:
        RuntimeError: Raised by `asyncio.run` when an event loop is already
            running in the current thread (for example inside a notebook);
            in that situation `await supervisor(root_folder)` directly.
    """
    return asyncio.run(supervisor(root_folder))

# NOTE(review): process_many() relies on asyncio.run(), which raises RuntimeError
# when an event loop is already running; inside a notebook cell use
# `await supervisor(...)` instead — confirm where this call is meant to run.
process_many(Path("../res/test_batch"))

## Section Class: Recursive Document Structure

In [None]:
# | export
class Section(BaseModel):
    """
    A document section that may contain nested subsections.

    A section holds an optional summary, its paragraphs, figures and tables,
    and a list of child sections of the same type.
    """

    # Optional summary text of the section.
    summary: Optional[str] = None
    paragraphs: List[str] = Field(default_factory=list)
    figures: List[Figure] = Field(default_factory=list)
    tables: List[Table] = Field(default_factory=list)
    # Self-referential field; the string annotation is resolved by model_rebuild().
    subsections: List["Section"] = Field(default_factory=list)

    def __init__(
        self,
        summary: Optional[str] = None,
        paragraphs: Optional[List[str]] = None,
        figures: Optional[List[Figure]] = None,
        tables: Optional[List[Table]] = None,
        subsections: Optional[List["Section | dict"]] = None,
    ):
        """
        Create a section; every omitted or None collection becomes an empty list.

        Args:
            summary: Optional summary text.
            paragraphs: Paragraph strings of the section.
            figures: Figures that belong to the section.
            tables: Tables that belong to the section.
            subsections: Child sections, given either as `Section` instances
                or as dicts of `Section` constructor arguments.
        """
        # Recursively build subsections that were provided as plain dicts;
        # anything that is not a dict is passed through unchanged.
        if subsections is not None:
            subs = [Section(**s) if isinstance(s, dict) else s for s in subsections]
        else:
            subs = []
        super().__init__(
            summary=summary,
            paragraphs=paragraphs or [],
            figures=figures or [],
            tables=tables or [],
            subsections=subs,
        )

    @classmethod
    def init(cls, md: Markdown):
        """
        Placeholder for building a section from Markdown.

        `md` is currently ignored and an empty section is returned.
        """
        return cls()

    @classmethod
    def update_forward_refs(cls, **localns):
        """
        Resolve forward references (legacy pydantic v1 entry point).

        Delegates to `model_rebuild` so that calling this method actually
        rebuilds the model instead of silently doing nothing. Names passed as
        keyword arguments are offered as an extra namespace for resolving
        annotations.
        """
        cls.model_rebuild(force=True, _types_namespace=localns or None)

# Resolve the "Section" forward reference used by `Section.subsections`
# now that the class is fully defined.
Section.model_rebuild()