In [2]:
# |default_exp parsing.ms_office.markitdown

In [7]:
# | hide
from nbdev.showdoc import *
from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

## Import dependencies

In [12]:
# | export
import sys
import os
import time
from hashlib import sha256
from pathlib import Path
from pprint import pprint, pformat

from openai import OpenAI
from markitdown import MarkItDown
import subprocess
from pathlib import Path
import pandas as pd
import re
import base64
import os
import shutil
import uuid # For more unique filenames, or use a counter

In [3]:
#| export
from dotenv import load_dotenv

# Read the OpenRouter settings from the process environment, after merging in
# any local .env file.
load_dotenv()
OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')
OPENROUTER_API_URL = os.getenv('OPENROUTER_API_URL')
OPENROUTER_MODEL = os.getenv('OPENROUTER_MODEL')
# OPENROUTER_MODEL = 'OpenAI:o3'

# Never echo the key itself: cell outputs are stored inside the notebook file and
# this cell is exported, so printing the value would leak the credential both into
# version control and into the logs of anything importing the module.
print(f"OPENROUTER_API_KEY set: {bool(OPENROUTER_API_KEY)}")
print(OPENROUTER_API_URL)
print(OPENROUTER_MODEL)

True

sk-or-v1-<redacted>
https://openrouter.ai/api/v1
Google:gemini-2.5-pro-exp-03-25


In [28]:
#| export
# Shared OpenAI-compatible client and model name, built from the OpenRouter
# settings loaded above. str() mirrors the original f-string wrapping, so an
# unset variable still becomes the literal text "None" instead of raising here.
llm_client = OpenAI(
    base_url=str(OPENROUTER_API_URL),
    api_key=str(OPENROUTER_API_KEY),
)
llm_model = str(OPENROUTER_MODEL)

In [49]:
def convert_office_to_md(root_folder: Path | str, bOverwrite: bool = False):
    """
    Recursively convert all .pptx, .ppt, .docx, and .doc files under
    root_folder (and subfolders) to PDF files in the same folder as the
    original. For .xls and .xlsx files, convert to markdown (.md) files
    in the same folder. Requires LibreOffice (soffice) installed for Office
    document conversion. Requires pandas for Excel to markdown.
    """

    office_exts = {'.pptx', '.ppt', '.docx', '.doc'}
    excel_exts = {'.xls', '.xlsx'}
    root = Path(root_folder)
    md_root = root / '.md'
    for file in root.rglob('*'):
        if file.suffix.lower() not in office_exts and file.suffix.lower() not in excel_exts:
            continue
        md_path = file.with_suffix('.md')
        md_path = md_root / md_path.relative_to(root)
        if not bOverwrite:
            if md_path.exists():
                continue  # Skip if md already exists
        if not md_path.parent.exists():
            md_path.parent.mkdir(parents=True, exist_ok=False)
        if file.suffix.lower() in office_exts:
            try:
                # Use soffice for conversion
                subprocess.run([
                    'markitdown', str(file),
                    '-o', md_path, #str(file.parent / md_path), 
                    '--keep-data-uris',
                ], check=True)
                print(f"Converted: {file} -> {md_path}")
            except subprocess.CalledProcessError as e:
                print(f"Failed to convert {file}: {e}")
        elif file.suffix.lower() in excel_exts:
            try:
                excel = pd.read_excel(file, sheet_name=None)
                with open(md_path, 'w', encoding='utf-8') as f:
                    for sheet, df in excel.items():
                        f.write(f'# Sheet: {sheet}\n\n')
                        f.write(df.to_markdown(index=False))
                        f.write('\n\n')
                print(f"Converted: {file} -> {md_path}")
            except Exception as e:
                print(f"Failed to convert {file} to markdown: {e}")
        elif file.suffix.lower() == '.pdf':
            pass 
        else:
            pass


In [50]:
# Example run on a local test folder; the absolute path is machine-specific.
convert_office_to_md('/v/data/test',bOverwrite=False)

Converted: /v/data/test/对智能体进行评估和优化.docx -> /v/data/test/.md/对智能体进行评估和优化.md


In [None]:
#| export

def extract_base64_images(markdown_file_path, image_output_folder="."):
    """
    Extract base64-embedded images from a Markdown file into image files.

    Every `![alt](data:image/<fmt>;base64,<data>)` reference is decoded,
    written to the image folder and replaced in the document by a relative
    link to the new file. The Markdown file is overwritten in place, and only
    when at least one image was extracted. References whose payload cannot be
    decoded are reported and left untouched.

    File names are `<sanitised alt>_<counter>.<fmt>`; the sanitised alt keeps
    only ASCII letters, digits, `_`, `-` and `.`, is cut to 50 characters and
    falls back to `img`. The link keeps the document's original alt text
    (the sanitised name is used only when the original alt is blank).

    WMF images are converted to SVG and EMF images to PNG with LibreOffice
    (`soffice` must be on PATH); the metafile is deleted once the converted
    file exists.

    Args:
        markdown_file_path (str | os.PathLike): Path to the input Markdown file.
        image_output_folder (str): Folder for the extracted images, relative
            to the Markdown file's directory. Created if it does not exist.

    Returns:
        None. Progress and problems are reported with print.

    Raises:
        subprocess.CalledProcessError: If `soffice` exits with an error while
            converting a WMF/EMF image. The Markdown file has not been
            rewritten at that point.
    """
    import binascii  # home of the decode error raised by base64.b64decode

    if not os.path.exists(markdown_file_path):
        print(f"Error: Markdown file not found at {markdown_file_path}")
        return

    markdown_dir = os.path.dirname(os.path.abspath(markdown_file_path))
    full_image_output_path = os.path.join(markdown_dir, image_output_folder)
    os.makedirs(full_image_output_path, exist_ok=True)

    with open(markdown_file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Groups: 1 = alt text, 2 = image format (png, jpeg, ...), 3 = base64 payload
    # (may contain whitespace/newlines).
    regex_img_quote = r"!\[(.*?)\]\(data:image/(.+?);base64,([A-Za-z0-9+/=\s]+)\)"
    regex_illegal_file_name = r'[^a-zA-Z0-9_\-\.]+'  # anything outside the legal filename characters
    # Metafile formats that get converted with soffice, and their target format.
    metafile_targets = {'wmf': 'svg', 'emf': 'png'}

    # The result is assembled from the untouched text between matches plus the
    # new links. Repeated str.replace would rescan the whole (often multi-MB)
    # document once per image.
    pieces = []
    cursor = 0
    images_extracted_count = 0

    for match in re.finditer(regex_img_quote, content):
        original_alt = match.group(1)
        # Normalise the alt text to a legal, short filename stem.
        file_stem = re.sub(regex_illegal_file_name, '_', original_alt).strip()[:50] or 'img'

        image_format = match.group(2).lower()  # e.g. png, jpeg
        # Drop media-type parameters and structured suffixes ("svg+xml" -> "svg").
        image_format = image_format.split(';', 1)[0].split('+', 1)[0]
        image_format = re.sub(r'x-([a-zA-Z])', r'\1', image_format)  # x-wmf/x-emf -> wmf/emf
        # The format ends up in a file name: keep path separators and the like out of it.
        image_format = re.sub(regex_illegal_file_name, '_', image_format) or 'bin'

        # Remove whitespace from the payload and restore any missing padding.
        base64_data_cleaned = "".join(match.group(3).split())
        missing_padding = len(base64_data_cleaned) % 4
        if missing_padding != 0:
            base64_data_cleaned += '=' * (4 - missing_padding)
        try:
            image_data = base64.b64decode(base64_data_cleaned)
        except binascii.Error as e:
            print(f"Warning: Could not decode base64 string for an image (alt: {file_stem}). Error: {e}")
            continue  # Leave this reference as it is

        # The running counter keeps names unique within one document.
        image_filename = f"{file_stem}_{images_extracted_count}.{image_format}"
        image_filepath = os.path.join(full_image_output_path, image_filename)
        with open(image_filepath, 'wb') as img_file:
            img_file.write(image_data)

        target_format = metafile_targets.get(image_format)
        if target_format is not None:
            converted_file = Path(image_filepath).with_suffix(f'.{target_format}')
            subprocess.run([
                'soffice', '--headless', '--convert-to', target_format,
                str(image_filepath),
                '--outdir', str(full_image_output_path),
            ], check=True)
            if converted_file.exists():
                Path(image_filepath).unlink()  # Remove the original metafile
                image_filename = converted_file.name  # Link to the converted file instead

        # The link is relative to the Markdown file and always uses forward slashes.
        relative_image_path_markdown = os.path.join(image_output_folder, image_filename).replace(os.sep, '/')
        # Keep the author's alt text; the sanitised stem is only a filename.
        link_alt = original_alt if original_alt.strip() else file_stem

        pieces.append(content[cursor:match.start()])
        pieces.append(f"![{link_alt}]({relative_image_path_markdown})")
        cursor = match.end()
        images_extracted_count += 1

    if images_extracted_count > 0:
        pieces.append(content[cursor:])
        # The original file is overwritten in place.
        with open(markdown_file_path, 'w', encoding='utf-8') as f:
            f.write("".join(pieces))
        print(f"Modified Markdown saved to: {markdown_file_path}, processed {images_extracted_count} image(s).")
    else:
        print("No base64 embedded images found in the Markdown file.")


In [30]:

# --- How to use it ---
# Smoke test for extract_base64_images: writes a small Markdown fixture into the
# current working directory, runs the extraction on it and removes the fixture
# and the image folder again.
# NOTE(review): the recorded output below ("Extracted and saved ...", three
# images named image_N) appears to come from an earlier revision of the
# function and of this fixture; only the last reference in the fixture as
# shown here carries a data URI.
if __name__ == "__main__":
    # Create a dummy Markdown file for testing
    dummy_md_content = """
# My Document

This is some text.

Here is an image: A red dot

Some more text.

And another one: A blue square

This one is tricky with potential newlines in base64:
![With Newlines](data:image/gif;base64,R0lGODlhAQABAIAAAP///wAAACH5BAEAAAAALAAAAAABAAEAAAICRAEAOw==
)

End of document.
    """
    test_md_file = "test_document.md"
    with open(test_md_file, "w", encoding="utf-8") as f:
        f.write(dummy_md_content)
    print(f"Created dummy Markdown file: {test_md_file}")

    # Specify the path to your Markdown file
    markdown_file = test_md_file  # Or "your_actual_file.md"
    # Specify the folder (relative to the MD file) where images will be saved
    image_folder = "md_images"

    extract_base64_images(markdown_file, image_folder)

    # --- Optional: Clean up dummy files and folder after testing ---
    # os.path.dirname(test_md_file) is '' for this bare file name, so the joins
    # below resolve against the current working directory.
    print("\nCleaning up dummy files...")
    if os.path.exists(os.path.join(os.path.dirname(test_md_file), image_folder)):
        # os.rmdir only removes empty directories, so delete the files first.
        for img_file in os.listdir(os.path.join(os.path.dirname(test_md_file), image_folder)):
            os.remove(os.path.join(os.path.dirname(test_md_file), image_folder, img_file))
        os.rmdir(os.path.join(os.path.dirname(test_md_file), image_folder))
    if os.path.exists(test_md_file):
        os.remove(test_md_file)
    print("Cleanup complete.")

878

Created dummy Markdown file: test_document.md
Extracted and saved: /d/devel/rag/ribosome/nbs/md_images/image_0.png
Extracted and saved: /d/devel/rag/ribosome/nbs/md_images/image_1.jpeg
Extracted and saved: /d/devel/rag/ribosome/nbs/md_images/image_2.gif

Processed 3 image(s).
Modified Markdown saved to: test_document.md

Cleaning up dummy files...
Cleanup complete.


In [30]:
#| export
def extract_base64_from_md(root_folder):
    """
    Extract the base64 images of every .md file under root_folder, in place.

    The existing tree is first moved aside to a uniquely named sibling staging
    folder and root_folder is recreated empty. Each `<dir>/<name>.md` from the
    staged tree is then copied to `<root_folder>/<dir>/<name>/<name>.md` and
    processed with extract_base64_images, which puts its pictures into an
    `img` folder next to that copy. Finally the staging folder is deleted.

    Only .md files are carried over: every other file below root_folder is
    removed together with the staging folder.

    Args:
        root_folder (str | os.PathLike): Folder whose Markdown files are processed.

    Returns:
        None. Progress and soffice conversion failures are reported with print.
        If an unexpected error aborts the run, the staging folder is left in
        place next to root_folder and still holds the original files.
    """
    root = Path(root_folder)
    # A unique staging name: moving onto a pre-existing sibling called "tmp"
    # would nest the tree inside it, and the final rmtree would then delete
    # that unrelated folder as well.
    tmp = root.parent / f"{root.name}.staging-{uuid.uuid4().hex}"
    shutil.move(root, tmp)  # Move the whole tree aside
    os.makedirs(root, exist_ok=False)  # make sure the root folder exists and is empty
    image_folder = "img"
    for file in tmp.rglob('*'):
        if file.suffix.lower() != '.md' or not file.is_file():
            continue
        try:
            # Each Markdown file gets its own folder named after the file.
            res_folder = root / file.parent.relative_to(tmp) / file.stem
            # exist_ok: the folder may already exist when the source tree held
            # both "<name>.md" and a directory "<name>" containing Markdown.
            os.makedirs(res_folder, exist_ok=True)
            shutil.copy(file, res_folder)
            new_md_file = res_folder / file.name
            # Rewrite the copy so that its images live in a separate image folder
            extract_base64_images(new_md_file, image_folder)
            print(f"Converted: {res_folder}")
        except subprocess.CalledProcessError as e:
            print(f"Failed to convert {file}: {e}")

    shutil.rmtree(tmp)  # Remove the staging folder after extraction

In [32]:
# Run the in-place extraction on one converted tree; the absolute path is machine-specific.
extract_base64_from_md('/v/data/新型机器人智能问答系统数据源-mid/.md')

Modified Markdown saved to: /v/data/新型机器人智能问答系统数据源-mid/.md/01 设计标准/新松机器人产品识别设计标准（推荐） 250219/新松机器人产品识别设计标准（推荐） 250219.md, processed 324 image(s).
Converted: /v/data/新型机器人智能问答系统数据源-mid/.md/01 设计标准/新松机器人产品识别设计标准（推荐） 250219
Modified Markdown saved to: /v/data/新型机器人智能问答系统数据源-mid/.md/02 产品推介资料/SX023010新松打磨解决方案介绍V1.0/SX023010新松打磨解决方案介绍V1.0.md, processed 133 image(s).
Converted: /v/data/新型机器人智能问答系统数据源-mid/.md/02 产品推介资料/SX023010新松打磨解决方案介绍V1.0
Modified Markdown saved to: /v/data/新型机器人智能问答系统数据源-mid/.md/02 产品推介资料/SX023010新松打磨解决方案介绍V1.0/SX023010新松打磨解决方案介绍V1.0.md, processed 133 image(s).
Converted: /v/data/新型机器人智能问答系统数据源-mid/.md/02 产品推介资料/SX023010新松打磨解决方案介绍V1.0
Modified Markdown saved to: /v/data/新型机器人智能问答系统数据源-mid/.md/02 产品推介资料/SR023001新松电动车车架焊接解决方案介绍V1.0/SR023001新松电动车车架焊接解决方案介绍V1.0.md, processed 98 image(s).
Converted: /v/data/新型机器人智能问答系统数据源-mid/.md/02 产品推介资料/SR023001新松电动车车架焊接解决方案介绍V1.0
Modified Markdown saved to: /v/data/新型机器人智能问答系统数据源-mid/.md/02 产品推介资料/SR023001新松电动车车架焊接解决方案介绍V1.0/SR023001新松电动车车架

In [45]:
#| export
def copy_md_files(src_md_root: Path, dst_md_root: Path, bOverwrite: bool = True):
    """
    Recursively copy all .md files under
    src_md_folder (higher quality of original office converted md by MID) (and subfolders) 
    to dst_md_folder (low quality of original pdf converted md by gemini 2.5 pro exp).
    """

    # Create the destination folder if it does not exist
    if not dst_md_root.exists():
        dst_md_root.mkdir(parents=True, exist_ok=False)
    for file in src_md_root.rglob('*'):
        if file.suffix.lower() == '.md':
            try:
                # Create a folder with the file name and move the md file into it
                src_md_folder = file.parent
                dst_md_folder = dst_md_root / src_md_folder.relative_to(src_md_root) 
                if dst_md_folder.exists():
                    if bOverwrite:
                        shutil.rmtree(dst_md_folder)
                        print(f"Remove: {dst_md_folder}")
                    else:
                        print(f"Skipped: {dst_md_folder}")
                        continue
                shutil.copytree(src_md_folder, dst_md_folder)
                print(f"Copied: {src_md_folder} -> {dst_md_folder}")
            except subprocess.CalledProcessError as e:
                print(f"Failed to convert {file}: {e}")

In [46]:
# Copy the "-mid" Markdown tree over the "-md" tree (bOverwrite defaults to True); paths are machine-specific.
copy_md_files(Path('/v/data/新型机器人智能问答系统数据源-mid/.md'), Path('/v/data/新型机器人智能问答系统数据源-md/.md'))

Remove: /v/data/新型机器人智能问答系统数据源-md/.md/05 技术规格/规格参数/SR024009《新松SR25A-35-1.80规格参数》A-2
Copied: /v/data/新型机器人智能问答系统数据源-mid/.md/05 技术规格/规格参数/SR024009《新松SR25A-35-1.80规格参数》A-2 -> /v/data/新型机器人智能问答系统数据源-md/.md/05 技术规格/规格参数/SR024009《新松SR25A-35-1.80规格参数》A-2
Remove: /v/data/新型机器人智能问答系统数据源-md/.md/05 技术规格/规格参数/SR024001《新松SR12A-12-1.46规格参数》A-1
Copied: /v/data/新型机器人智能问答系统数据源-mid/.md/05 技术规格/规格参数/SR024001《新松SR12A-12-1.46规格参数》A-1 -> /v/data/新型机器人智能问答系统数据源-md/.md/05 技术规格/规格参数/SR024001《新松SR12A-12-1.46规格参数》A-1
Remove: /v/data/新型机器人智能问答系统数据源-md/.md/05 技术规格/规格参数/SN024003《新松SN4A-4-0.58规格参数》A-1
Copied: /v/data/新型机器人智能问答系统数据源-mid/.md/05 技术规格/规格参数/SN024003《新松SN4A-4-0.58规格参数》A-1 -> /v/data/新型机器人智能问答系统数据源-md/.md/05 技术规格/规格参数/SN024003《新松SN4A-4-0.58规格参数》A-1
Remove: /v/data/新型机器人智能问答系统数据源-md/.md/05 技术规格/规格参数/SR024012《新松SR210-120-3.05规格参数》A-2
Copied: /v/data/新型机器人智能问答系统数据源-mid/.md/05 技术规格/规格参数/SR024012《新松SR210-120-3.05规格参数》A-2 -> /v/data/新型机器人智能问答系统数据源-md/.md/05 技术规格/规格参数/SR024012《新松SR210-120-3.05规格参数》A-2
Remove: /v/

In [None]:
#| export
# Alternative, plugin-enabled configuration (currently disabled):
# mid = MarkItDown(
#     enable_plugins=True,     # Mandatory
#     llm_client=llm_client,  # Mandatory
#     llm_model=llm_model,    # Mandatory
#     show_progress=True,  # Optional, defaults to False
#     ocr_service=None,  # Optional, defaults to "openrouter"
#     table_detection_service=None,  # Optional, defaults to "openrouter"
# )
# Module-level converter bound to the OpenRouter client and model defined above.
# Because this cell is exported, the instance is created when the module is imported.
mid = MarkItDown(
    llm_client=llm_client,  # Mandatory
    llm_model=llm_model,    # Mandatory
    show_progress=True,  # Optional, defaults to False
)

In [None]:
def convert_all_to_markdown_with_llm(root_folder: str | Path, mid: MarkItDown):
    """
    Recursively convert all .pdf files under root_folder (and subfolders)
    to markdown (.md) files in the same folder using marker-pdf's 'marker' command
    with 4 workers.
    """
    import subprocess
    from pathlib import Path
    # import os

    root = Path(root_folder)
    for file in root.rglob('*'):
        md_path = file.with_suffix('.md')
        if md_path.exists():
            continue  # Skip if markdown already exists
        try:
            result = mid.convert(
                file,
                show_progress=True,  # Show progress bar
                force_ocr=True,
                pages=None,  # Convert all pages
            )
            with open(md_path, 'w', encoding='utf-8') as f:
                f.write(result.markdown)
            print(f"Converted: {file} -> {md_path}")
        except subprocess.CalledProcessError as e:
            print(f"Failed to convert {file}: {e}")

In [None]:
# Convert the merged data tree with the module-level converter; the absolute path is machine-specific.
convert_all_to_markdown_with_llm('/v/data/新型机器人智能问答系统数据源-merged-MID',mid)

In [None]:
import pdftotext

In [13]:
def convert_pdf_to_markdown_pdf2text(root_folder: str | Path):
    """
    Recursively extract the text of every PDF under root_folder with pdftotext.

    For each file with a `.pdf` extension (matched case-insensitively) a
    sibling `.md` file is written that holds the text of all pages, separated
    by blank lines. The output is plain extracted text; no Markdown structure
    is added. PDFs whose `.md` sibling already exists are skipped, and a PDF
    that cannot be read is reported with print and skipped.

    Args:
        root_folder: Folder that is scanned recursively.

    Returns:
        None.
    """
    import pdftotext

    root = Path(root_folder)
    for file in root.rglob('*'):
        if file.suffix.lower() != '.pdf' or not file.is_file():
            continue
        md_path = file.with_suffix('.md')
        if md_path.exists():
            continue  # Skip if markdown already exists
        try:
            with open(file, "rb") as f:
                pdf = pdftotext.PDF(f)
            # Join all text from the PDF pages
            text = "\n\n".join(pdf)
        except Exception as e:
            # Keep going: one damaged or protected PDF must not stop the batch.
            print(f"Failed to convert {file}: {e}")
            continue
        with open(md_path, 'w', encoding='utf-8') as f:
            f.write(text)
        print(f"Converted: {file} -> {md_path}")

In [14]:
# Plain-text extraction over the merged data tree (machine-specific path). The recorded
# run below failed while importing pdftotext because of a libstdc++ (GLIBCXX) mismatch.
convert_pdf_to_markdown_pdf2text('/v/data/新型机器人智能问答系统数据源-merged-MID')

ImportError: /d/.pyenv/versions/miniconda3-latest/envs/cell/bin/../lib/libstdc++.so.6: version `GLIBCXX_3.4.32' not found (required by /lib/x86_64-linux-gnu/libpoppler-cpp.so.0)

In [None]:
# | hide
import nbdev

# Write the cells marked `#| export` to the module named by this notebook's `default_exp` directive.
nbdev.nbdev_export()

<div>
<link rel="stylesheet" href="https://gradio.s3-us-west-2.amazonaws.com/2.6.5/static/bundle.css">
<div id="target"></div>
<script src="https://gradio.s3-us-west-2.amazonaws.com/2.6.5/static/bundle.js"></script>
<script>
launchGradioFromSpaces("abidlabs/question-answering", "#target")
</script>
</div>