In [52]:
import base64
import re
from pathlib import Path
from urllib.parse import unquote

import lark_oapi as lark
from bs4 import BeautifulSoup, Tag
from bs4.element import Comment, NavigableString, PageElement
from lark_oapi.api.docx.v1 import (Block, Equation, GetDocumentRequest, GetDocumentResponse, ListDocumentBlockRequest,
                                   ListDocumentBlockResponse, Text, TextElement, TextElementStyle, TextRun)
from lark_oapi.api.drive.v1 import DownloadMediaRequest, DownloadMediaResponse

from secret import APP_ID, APP_SECRET

In [53]:
# Identifier of the docx document that the cells below fetch and convert to HTML.
DOCUMENT_ID = "JYQswCHz7ilaTykFqvOcWFc1nrg"

In [54]:
# Build the Lark Open API client shared by every request in this notebook.
# Credentials come from the local `secret` module.
client: lark.Client = (
    lark.Client.builder()
    .app_id(APP_ID)
    .app_secret(APP_SECRET)
    .log_level(lark.LogLevel.INFO)
    .build()
)

In [55]:
# Request the document identified by DOCUMENT_ID and keep the raw response.
request: GetDocumentRequest = (
    GetDocumentRequest.builder()
    .document_id(DOCUMENT_ID)
    .build()
)
document_response: GetDocumentResponse = client.docx.v1.document.get(request)

In [None]:
def get_docx_blocks(document_id: str, document_revision_id: int = -1) -> list[Block]:
    """Fetch every block of a docx document, following pagination.

    Uses the module-level ``client`` and requests pages of up to 500 blocks
    until the API reports that no more pages exist.

    Args:
        document_id: Identifier of the document whose blocks are listed.
        document_revision_id: Revision identifier forwarded unchanged to the
            API (default ``-1``).

    Returns:
        The blocks of all pages concatenated in the order they were returned.

    Raises:
        RuntimeError: If any page request is reported as unsuccessful.
    """
    items: list[Block] = []
    page_token: str = ""
    while True:
        request: ListDocumentBlockRequest = (ListDocumentBlockRequest.builder()
                                             .document_id(document_id)
                                             .page_size(500)
                                             .page_token(page_token)
                                             .document_revision_id(document_revision_id)
                                             .build())
        response: ListDocumentBlockResponse = client.docx.v1.document_block.list(request)
        # Explicit check instead of `assert`: asserts vanish under `python -O`
        # and give no hint about why the request failed.
        if not response.success():
            raise RuntimeError(
                f"listing blocks of document {document_id!r} failed: "
                f"code={response.code}, msg={response.msg}"
            )
        # A page without blocks may carry `items=None`; treat it as empty.
        items.extend(response.data.items or [])
        next_token = response.data.page_token
        # Stop when the API says so, and also when it claims more pages but
        # gives no token to fetch them with (would otherwise loop forever).
        if not response.data.has_more or not next_token:
            break
        page_token = next_token
    return items


def download_image(file_token: str) -> bytes:
    """Download a media file through the module-level ``client``.

    Args:
        file_token: Token of the media file to download.

    Returns:
        The complete file content as bytes.

    Raises:
        RuntimeError: If the request is reported as unsuccessful or the
            response carries no file object.
    """
    request: DownloadMediaRequest = (DownloadMediaRequest.builder()
                                     .file_token(file_token)
                                     .build())
    response: DownloadMediaResponse = client.drive.v1.media.download(request)
    # Without this check a failed download surfaces as an opaque
    # AttributeError on `None.read()`.
    if not response.success() or response.file is None:
        raise RuntimeError(
            f"downloading media {file_token!r} failed: "
            f"code={response.code}, msg={response.msg}"
        )
    return response.file.read()


def get_image_with_cache(file_token: str) -> bytes:
    """Return the bytes of an image, using ``images/<file_token>.png`` as a cache.

    If the cache file exists its content is returned directly. Otherwise the
    image is downloaded, written to the cache file (creating the directory
    when needed) and returned.
    """
    cache_file = Path("images", f"{file_token}.png")
    if not cache_file.is_file():
        # Cache miss: fetch, persist, and hand back the fresh bytes.
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        payload = download_image(file_token)
        cache_file.write_bytes(payload)
        return payload
    return cache_file.read_bytes()


def image_to_base64(image: bytes) -> str:
    """Encode raw image bytes as a base64 text string."""
    encoded: bytes = base64.b64encode(image)
    return str(encoded, "utf-8")


In [None]:
# Pull all blocks of the configured document (default revision).
blocks: list[Block] = get_docx_blocks(document_id=DOCUMENT_ID)

In [None]:
# SVG markup embedded into the generated HTML: the icon placed before
# document mentions and the icon shown inside callouts.
doc_svg_content = Path("svgs/doc.svg").read_text(encoding="utf-8")
info_svg_content = Path("svgs/info.svg").read_text(encoding="utf-8")


def new_tag(name: str, attrs=None, contents: list | None = None) -> Tag:
    """Create a detached ``Tag`` and optionally fill it with child nodes.

    Args:
        name: Tag name, e.g. ``"span"``.
        attrs: Attribute mapping passed straight to ``Tag``; may be ``None``.
        contents: Child nodes appended in order; ``None`` leaves the tag empty.

    Returns:
        The newly created tag.
    """
    element = Tag(name=name, attrs=attrs)
    if contents is None:
        return element
    element.extend(contents)
    return element


def get_lang(char: str) -> str:
    """Classify a character as ``'en'`` or ``'zh'``.

    ASCII letters, digits and the punctuation ``, . : / ? %`` count as
    ``'en'``; everything else (including an empty string) is ``'zh'``.
    """
    is_latin = re.match(r"[0-9a-zA-Z,.:/?%]", char) is not None
    return 'en' if is_latin else 'zh'


def get_lang_content(string: str) -> list[PageElement]:
    """Split text into consecutive same-language runs wrapped in ``<span lang=...>``.

    Each character is classified with ``get_lang``; adjacent characters with
    the same classification are merged into one span. An empty string yields
    an empty list.
    """
    # Each entry is (language, characters collected for that run).
    runs: list[tuple[str, list[str]]] = []
    for ch in string:
        ch_lang = get_lang(ch)
        if runs and runs[-1][0] == ch_lang:
            # Same language as the run in progress: keep growing it.
            runs[-1][1].append(ch)
        else:
            # Language switched (or very first character): open a new run.
            runs.append((ch_lang, [ch]))

    return [
        new_tag(name="span", attrs={"lang": run_lang}, contents=[NavigableString(''.join(run_chars))])
        for run_lang, run_chars in runs
    ]


def get_string_content(string: str) -> list[PageElement]:
    """Turn text into HTML nodes, mapping every newline to a ``<br>`` tag.

    Non-empty lines are expanded into language spans via ``get_lang_content``;
    a ``<br>`` is inserted between consecutive lines (not after the last one).
    """
    lines = string.split("\n")
    last_index = len(lines) - 1
    nodes: list[PageElement] = []
    for index, line in enumerate(lines):
        if line:
            nodes += get_lang_content(line)
        if index != last_index:
            nodes.append(Tag(name="br", can_be_empty_element=True))
    return nodes


class Blocks:
    def __init__(self, blocks: list[Block]):
        self.blocks = blocks
        self.block_dict = {block.block_id: block for block in blocks}
        self.global_image_counter = 0
        self.global_spreadsheet_counter = 0

    def construct_text_elements(self, text: Text) -> list:
        contents: list[PageElement] = []
        text_color: int | None = None  # 记录当前文本颜色，使公式的颜色与之前的文本颜色一致

        for element in text.elements:
            # 文本
            if element.text_run is not None:
                text_element_style: TextElementStyle = element.text_run.text_element_style
                element_contents = get_string_content(element.text_run.content)
                if text_element_style.text_color is not None:
                    text_color = text_element_style.text_color  # 更新当前文本颜色
                    element_contents = [new_tag(
                        name="span",
                        attrs={"class": [f"text-color-{text_element_style.text_color}"]},
                        contents=element_contents,
                    )]
                else:
                    text_color = None  # 重置为 None
                if text_element_style.bold:
                    element_contents = [new_tag(name="strong", contents=element_contents)]
                if text_element_style.italic:
                    element_contents = [new_tag(name="em", contents=element_contents)]
                if text_element_style.underline:
                    element_contents = [new_tag(name="u", contents=element_contents)]
                if text_element_style.strikethrough:
                    element_contents = [new_tag(name="s", contents=element_contents)]
                if text_element_style.inline_code:
                    element_contents = [new_tag(name="code", contents=element_contents)]
                    element_contents = [new_tag(name="pre", contents=element_contents)]
                if text_element_style.link is not None:
                    element_contents = [new_tag(
                        name="a",
                        attrs={"href": unquote(text_element_style.link.url), "target": "_blank"},
                        contents=element_contents,
                    )]
                if text_element_style.background_color is not None:
                    element_contents = [new_tag(
                        name="span",
                        attrs={"class": [f"bg-color-{text_element_style.background_color}"]},
                        contents=element_contents,
                    )]
                contents.extend(element_contents)

            # 公式
            if element.equation is not None:
                wrapped_string = rf"\({element.equation.content.strip()}\)"
                equation_tag = new_tag(
                    name="span",
                    attrs={"class": ["equation"] if text_color is None else ["equation", f"text-color-{text_color}"]},
                    contents=[NavigableString(wrapped_string)]
                )
                contents.append(equation_tag)

            # 引用文档
            if element.mention_doc is not None:
                svg = new_tag("span", attrs={"class": "doc-logo"}, contents=[BeautifulSoup(doc_svg_content, "html.parser").svg])
                tag = new_tag(
                    name="a",
                    attrs={"href": element.mention_doc.url, "target": "_blank"},
                    contents=[svg, *get_string_content(element.mention_doc.title)]
                )
                contents.append(tag)

        return contents

    def construct_tag_from_block(self, block_id: str) -> Tag:
        block: Block = self.block_dict[block_id]
        outer: Tag | None = None
        tag: Tag

        match block.block_type:
            case 1:  # page
                tag = new_tag(name="article")
            case 2:  # text
                tag = new_tag(name="p", contents=self.construct_text_elements(block.text))
            case 3:  # heading1
                tag = new_tag(name="h1", attrs={"class": ["auto-numbering"]}, contents=self.construct_text_elements(block.heading1))
            case 4:  # heading2
                tag = new_tag(name="h2", attrs={"class": ["auto-numbering"]}, contents=self.construct_text_elements(block.heading2))
            case 5:  # heading3
                tag = new_tag(name="h3", attrs={"class": ["auto-numbering"]}, contents=self.construct_text_elements(block.heading3))
            case 6:  # heading4
                tag = new_tag(name="h4", attrs={"class": ["auto-numbering"]}, contents=self.construct_text_elements(block.heading4))
            case 7:  # heading5
                tag = new_tag(name="h5", attrs={"class": ["auto-numbering"]}, contents=self.construct_text_elements(block.heading5))
            case 8:  # heading6
                tag = new_tag(name="h6", attrs={"class": ["auto-numbering"]}, contents=self.construct_text_elements(block.heading6))
            case 12:  # bullet
                tag = new_tag(name="li", contents=self.construct_text_elements(block.bullet))
            case 13:  # ordered
                tag = new_tag(name="li", contents=self.construct_text_elements(block.ordered))
            case 14:  # code
                tag = new_tag(name="pre", contents=[new_tag(name="code", contents=self.construct_text_elements(block.code))])
            case 15:  # quote
                tag = new_tag(name="blockquote")
            case 19:  # callout
                info_tag = new_tag(name="div", attrs={"class": "callout-icon"}, contents=[BeautifulSoup(info_svg_content, "html.parser").svg])
                tag = new_tag(name="div", attrs={"class": "callout-inner"})
                outer = new_tag(name="div", attrs={"class": "callout-outer"}, contents=[info_tag, tag])
            case 21:  # diagram
                tag = new_tag(name="div", attrs={"class": "diagram"})
            case 22:  # divider
                tag = new_tag(name="hr")
            case 24:  # grid
                tag = new_tag(name="div", attrs={"class": ["grid-container"]})
            case 25:  # grid_column
                tag = new_tag(name="div", attrs={"class": ["grid-column"], "style": f"flex: {block.grid_column.width_ratio}%;"})
            case 27:  # image
                image_token: str = block.image.token
                image_data: bytes = get_image_with_cache(image_token)
                image_caption = block.image.caption.content if block.image.caption is not None else ""
                self.global_image_counter += 1
                image_caption = f"图 {self.global_image_counter + 1}.　{image_caption}"
                img_tag = new_tag(
                    name="img",
                    attrs={
                        "src": f"images/{image_token}.png",
                        "alt": image_caption,
                        # "style": f"max-width: min(100%, {block.image.width}px);",
                    },
                )
                if image_caption:
                    contents = [img_tag, new_tag(name="figcaption", contents=get_string_content(image_caption))]
                else:
                    contents = [img_tag]
                tag = new_tag(
                    name="figure",
                    attrs={"class": "image"},
                    contents=contents,
                )
            case 30:  # sheet
                SpreadsheetToken_SheetID = block.sheet.token
                self.global_spreadsheet_counter += 1
                image_caption = f"表 {self.global_spreadsheet_counter + 1}.　{SpreadsheetToken_SheetID}"
                img_tag = new_tag(
                    name="img",
                    attrs={
                        "src": f"sheets/{SpreadsheetToken_SheetID}.png",
                        "alt": image_caption,
                        # "style": f"max-width: min(100%, {block.image.width}px);",
                    },
                )
                contents = [img_tag, new_tag(name="figcaption", contents=get_string_content(image_caption))]
                tag = new_tag(
                    name="figure",
                    attrs={"class": "sheet"},
                    contents=contents,
                )
            case 31:  # table
                row_size: int = block.table.property.row_size
                column_size: int = block.table.property.column_size
                tag = new_tag(name="table", contents=[
                    new_tag(name="thead", contents=[new_tag(name="tr")]),
                    new_tag(name="tbody", contents=[new_tag(name="tr") for _ in range(row_size)])
                ])
                if block.children is not None:
                    for i, child_block_id in enumerate(block.children):
                        child_tag: Tag = self.construct_tag_from_block(child_block_id)
                        row = i // column_size
                        tag.find_all("tr")[row].append(child_tag)
                return tag
            case 32:  # table_cell
                tag = new_tag(name="td")
            case 34:  # quote_container
                tag = new_tag(name="blockquote")
            case 43:  # board
                tag = new_tag(name="div", attrs={"class": "board"})
            case _:
                tag = new_tag(name="div")

        if block.children is not None:
            ul_tag: Tag | None = None
            ol_tag: Tag | None = None

            for child_block_id in block.children:
                child_block: Block = self.block_dict[child_block_id]

                if child_block.block_type == 12:  # bullet
                    if ul_tag is None:
                        ul_tag = new_tag(name="ul")
                        tag.append(ul_tag)
                else:
                    ul_tag = None

                if child_block.block_type == 13:  # ordered
                    if ol_tag is None:
                        ol_tag = new_tag(name="ol")
                        tag.append(ol_tag)
                else:
                    ol_tag = None

                child_tag: Tag = self.construct_tag_from_block(child_block_id)

                if ul_tag is not None:
                    ul_tag.append(child_tag)
                elif ol_tag is not None:
                    ol_tag.append(child_tag)
                else:
                    tag.append(child_tag)

        return outer or tag


def construct_html_from_blocks(blocks: list[Block]) -> BeautifulSoup:
    """Render a document's blocks into the ``template.html`` page.

    The first block is taken as the page block: its text runs form the HTML
    ``<title>``, and the tag tree built from it is appended to ``<body>``.

    Args:
        blocks: All blocks of the document, with the page block first.

    Returns:
        The parsed template with title and article filled in.
    """
    with open("template.html", "r", encoding="utf-8") as fp:
        soup = BeautifulSoup(fp, "html.parser")

    root = blocks[0]
    # Join every text run of the title: taking only elements[0] truncated
    # titles made of several runs and crashed on an empty title or on a
    # first element that is not a text run.
    title = "".join(
        element.text_run.content or ""
        for element in (root.page.elements or [])
        if element.text_run is not None
    )
    article = Blocks(blocks).construct_tag_from_block(root.block_id)
    soup.html.head.title.string = title
    soup.html.body.append(article)

    return soup


# Build the HTML document from the fetched blocks and save it next to the notebook.
soup = construct_html_from_blocks(blocks)
Path("saved_soup.html").write_text(str(soup), encoding="utf-8")