In [13]:
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from markitdown import MarkItDown
import rapidocr


In [7]:
def convert_with_rapidocr(pdf_path):
    """Convert a PDF to Markdown with docling, using RapidOCR for the OCR step.

    Args:
        pdf_path: Source document; passed unchanged to ``DocumentConverter.convert``.

    Returns:
        Whatever ``export_to_markdown()`` yields for the converted document.
    """
    # OCR is switched on explicitly and backed by a default-configured RapidOcrOptions.
    pdf_pipeline = PdfPipelineOptions(do_ocr=True, ocr_options=RapidOcrOptions())

    # Register the pipeline settings for the PDF input format only.
    pdf_format = PdfFormatOption(pipeline_options=pdf_pipeline)
    doc_converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    # Run the conversion and hand back the Markdown rendering of the document.
    conversion = doc_converter.convert(pdf_path)
    return conversion.document.export_to_markdown()

# Smoke test: convert the sample PDF and print the resulting Markdown.
# The path is relative, so it is resolved against the kernel's working directory.
# `markdown_output` stays in the kernel namespace; the following cell writes it to disk.
markdown_output = convert_with_rapidocr("CW/06/sample_table.pdf")
print(markdown_output)




In [8]:
# Persist the Markdown held in `markdown_output` to converted_result.md.
# `markdown_output` must already exist in the kernel (it is assigned by the
# conversion cell), so that cell has to be run first.
# The encoding is given explicitly so the file is written as UTF-8 regardless of
# the platform's default encoding.
with open("converted_result.md", "w", encoding="utf-8") as f:
    f.write(markdown_output)
# Confirmation message (in Chinese): "Conversion complete! File saved as converted_result.md".
print("轉換完成！檔案已儲存為 converted_result.md")

轉換完成！檔案已儲存為 converted_result.md


In [None]:
def convert_with_olm_vlm(pdf_path):
    """Convert a PDF to Markdown with docling, using a VLM-based OCR configuration.

    NOTE(review): ``VlmOcrOptions`` and ``VlmOcrModel`` are not imported in any
    of the cells shown, so calling this raises ``NameError`` unless they are
    defined elsewhere -- confirm where these names come from before running.

    Args:
        pdf_path: Source document; passed unchanged to ``DocumentConverter.convert``.

    Returns:
        Whatever ``export_to_markdown()`` yields for the converted document.
    """
    # Backend model is chosen via the VlmOcrModel.QWEN2_VL_2B_INSTRUCT constant.
    # The original author's note says a local model path or a HuggingFace ID
    # could be given here instead -- unverified, confirm against the options class.
    ocr_backend = VlmOcrOptions(model=VlmOcrModel.QWEN2_VL_2B_INSTRUCT)

    # Same pipeline shape as the RapidOCR variant; only the OCR options object differs.
    pdf_pipeline = PdfPipelineOptions()
    pdf_pipeline.do_ocr = True
    pdf_pipeline.ocr_options = ocr_backend

    # Register the pipeline settings for the PDF input format only.
    format_options = {InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline)}
    doc_converter = DocumentConverter(format_options=format_options)

    # Convert and return the Markdown rendering in one step.
    return doc_converter.convert(pdf_path).document.export_to_markdown()