In [2]:
!pip install mistralai
!pip install langchain




[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import os
import json
import base64
import shutil
from pathlib import Path
from mistralai import Mistral, DocumentURLChunk
from mistralai.models import OCRResponse

In [4]:
# The only requirement for this script is to have a Mistral API Key.
# You can get a free API Key at: https://console.mistral.ai/api-keys

from dotenv import load_dotenv

load_dotenv()
api_key = "9ABfnEldWDRL5oisq0gY53yvXujV94hX"
print(f"Loaded API Key: {api_key[:4]}...")
client = Mistral(api_key=api_key)

Loaded API Key: 9ABf...


In [5]:
# Path configuration
INPUT_DIR = Path("./content/pdf_content/")   # Folder where th9ABfnEldWDRL5oisq0gY53yvXujV94hXe user places the PDFs to be processed
DONE_DIR = Path("./content/pdf_content/markdown_outputs")            # Folder where processed PDFs will be moved
OUTPUT_ROOT_DIR = Path("./content/pdf_content/ocr_output")    # Root folder for conversion results

# Ensure directories exist
INPUT_DIR.mkdir(exist_ok=True)
DONE_DIR.mkdir(exist_ok=True)
OUTPUT_ROOT_DIR.mkdir(exist_ok=True)

In [6]:
def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
    """
    This converts base64 encoded images directly in the markdown...
    And replaces them with links to external images, so the markdown is more readable and organized.
    """
    for img_name, base64_str in images_dict.items():
        markdown_str = markdown_str.replace(f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})")
    return markdown_str

def get_combined_markdown(ocr_response: OCRResponse) -> str:
    """
    Part of the response from the Mistral API, which is an OCRResponse object...
    And returns a single string with the combined markdown of all the pages of the PDF.
    """
    markdowns: list[str] = []
    for page in ocr_response.pages:
        image_data = {}
        for img in page.images:
            image_data[img.id] = img.image_base64
        markdowns.append(replace_images_in_markdown(page.markdown, image_data))

    return "\n\n".join(markdowns)

In [7]:
import os
import re
import json
import base64
import shutil
from pathlib import Path

# Assuming these globals are defined elsewhere in your project:
# OUTPUT_ROOT_DIR, DONE_DIR, client, DocumentURLChunk

output_file_relative_path = None
output_images_relative_path = None

def process_pdf(pdf_path: Path):
    global output_file_relative_path, output_images_relative_path
    # Process all PDFs in INPUT_DIR
    # - Important to be careful with the number of PDFs, as the Mistral API has a usage limit
    #   and it could cause errors by exceeding the limit.

    # PDF base name
    pdf_base = pdf_path.stem
    print(f"Processing {pdf_path.name} ...")
    
    # Output folders
    output_dir = OUTPUT_ROOT_DIR / pdf_base
    output_file_relative_path = output_dir
    # Overwrite the output directory if it already exists
    if output_dir.exists():
        shutil.rmtree(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    
    images_dir = output_dir / "images"
    output_images_relative_path = images_dir
    images_dir.mkdir(parents=True, exist_ok=True)
    
    # PDF -> OCR
    with open(pdf_path, "rb") as f:
        pdf_bytes = f.read()
        
    uploaded_file = client.files.upload(
        file={
            "file_name": pdf_path.name,
            "content": pdf_bytes,
        },
        purpose="ocr"
    )
    
    signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)
    
    ocr_response = client.ocr.process(
        document=DocumentURLChunk(document_url=signed_url.url),
        model="mistral-ocr-latest",
        include_image_base64=True
    )
    
    # Save OCR in JSON 
    # (in case something fails it could be reused, but it is not used in the rest of the code)
    ocr_json_path = output_dir / "ocr_response.json"
    with open(ocr_json_path, "w", encoding="utf-8") as json_file:
        json.dump(ocr_response.dict(), json_file, indent=4, ensure_ascii=False)
    print(f"OCR response saved in {ocr_json_path}")
    
    # OCR -> Markdown prepared for Obsidian
    # - That is, from base64 encoded images, it converts them to links to 
    #   external images and generates the images as such, in a subfolder.
    
    global_counter = 1
    updated_markdown_pages = []
    
    for page in ocr_response.pages:
        updated_markdown = page.markdown
        for image_obj in page.images:
            
            # base64 to image
            base64_str = image_obj.image_base64
            if base64_str.startswith("data:"):
                base64_str = base64_str.split(",", 1)[1]
            image_bytes = base64.b64decode(base64_str)
            
            # image extension handling
            ext = Path(image_obj.id).suffix if Path(image_obj.id).suffix else ".png"
            new_image_name = f"{pdf_base}_img_{global_counter}{ext}"
            global_counter += 1
            
            # save image in subfolder
            image_output_path = images_dir / new_image_name
            with open(image_output_path, "wb") as f:
                f.write(image_bytes)
            
            # Update markdown with wikilink: ![[new_image_name]]
            updated_markdown = updated_markdown.replace(
                f"![{image_obj.id}]({image_obj.id})",
                f"![[{new_image_name}]]"
            )
        updated_markdown_pages.append(updated_markdown)
    
    final_markdown = "\n\n".join(updated_markdown_pages)
    output_markdown_path = output_dir / "output.md"
    output_file_relative_path = output_markdown_path
    with open(output_markdown_path, "w", encoding="utf-8") as md_file:
        md_file.write(final_markdown)
    print(f"Markdown generated in {output_markdown_path}")


In [8]:
pdf_files = list(INPUT_DIR.glob("*.pdf"))
if not pdf_files:
    print("No PDFs to process.")
    exit()
    
for pdf_file in pdf_files:
    try:
        process_pdf(pdf_file)
        shutil.move(str(pdf_file), DONE_DIR / pdf_file.name)
        print(f"{pdf_file.name} moved to {DONE_DIR}")
    except Exception as e:
        print(f"Error processing {pdf_file.name}: {e}")

Processing BFS_notespdf.pdf ...
OCR response saved in content\pdf_content\ocr_output\BFS_notespdf\ocr_response.json
Markdown generated in content\pdf_content\ocr_output\BFS_notespdf\output.md
BFS_notespdf.pdf moved to content\pdf_content\markdown_outputs


C:\Users\Anandaraman\AppData\Local\Temp\ipykernel_5700\2116534252.py:60: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  json.dump(ocr_response.dict(), json_file, indent=4, ensure_ascii=False)


In [9]:
output_images_relative_path, output_file_relative_path

(WindowsPath('content/pdf_content/ocr_output/BFS_notespdf/images'),
 WindowsPath('content/pdf_content/ocr_output/BFS_notespdf/output.md'))

In [10]:
# Process all PDFs in INPUT_DIR
# - Important to be careful with the number of PDFs, as the Mistral API has a usage limit
#   and it could cause errors by exceeding the limit.

pdf_files = list(INPUT_DIR.glob("*.pdf"))
if not pdf_files:
    print("No PDFs to process.")
    exit()
    
for pdf_file in pdf_files:
    try:
        process_pdf(pdf_file)
        shutil.move(str(pdf_file), DONE_DIR / pdf_file.name)
        print(f"{pdf_file.name} moved to {DONE_DIR}")
    except Exception as e:
        print(f"Error processing {pdf_file.name}: {e}")


No PDFs to process.


In [None]:
import os
import re
import moondream as md
from PIL import Image

# Initialize the Moondream model with your API key
model = md.vl(api_key="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJrZXlfaWQiOiJlODYyMDEzZC00NmVkLTRiNDYtOGMxZi0xYzYwMTUzY2M0YjkiLCJpYXQiOjE3Mzc1MjYyMjd9.0agZ8vgxwgrUJ7YMrIoBqGPs_4hsuh2zhqkwckxYkIM")

def generate_description_for_image(image_path, figure_caption=""):
    """
    Load an image from the provided path, encode it using the Moondream API,
    and query for a description that is based on the provided figure caption.
    """
    image = Image.open(image_path)
    encoded_image = model.encode_image(image)
    query_text = (
        f"Describe the key technical findings in this figure/visualization "
        f"captioned: {figure_caption} using natural language. Illustrate and mention trends, "
        f"patterns, and numerical values that can be observed. Provide a scientific/academic styled short, "
        f"single paragraph summary that is highly insightful in context of the document."
    )
    response = model.query(encoded_image, query_text)
    description = response.get("answer", "No description available.")
    return description

def extract_captions_from_markdown(markdown_path):
    """
    Parse the markdown file to build a mapping from image filename to its figure caption.
    It looks for placeholders like ![[filename]] and if the next line starts with "Figure",
    uses that as the caption.
    """
    captions = {}
    with open(markdown_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    i = 0
    while i < len(lines):
        line = lines[i]
        placeholder_match = re.search(r'!\[\[(.*?)\]\]', line)
        if placeholder_match:
            image_filename = placeholder_match.group(1)
            caption = ""
            # Check if the following line is a figure caption
            if i + 1 < len(lines) and lines[i + 1].strip().startswith("Figure"):
                caption = lines[i + 1].strip()
            captions[image_filename] = caption
        i += 1
    return captions

def generate_image_descriptions(images_folder, captions_mapping):
    """
    For each image file (as referenced by the markdown file), load the image from the
    given images folder and generate a description using the Moondream API.
    """
    descriptions = {}
    for image_filename, caption in captions_mapping.items():
        image_path = os.path.join(images_folder, image_filename)
        if os.path.exists(image_path):
            print(f"Processing image: {image_filename}")
            description = generate_description_for_image(image_path, caption)
        else:
            description = "Image file not found."
        descriptions[image_filename] = description
    return descriptions

def update_markdown_file(markdown_path, image_descriptions):
    """
    Update the markdown file by replacing the image placeholder (and the adjacent figure
    caption) with a markdown block that includes the original caption and the generated
    image description.
    """
    with open(markdown_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    updated_lines = []
    i = 0
    while i < len(lines):
        line = lines[i]
        placeholder_match = re.search(r'!\[\[(.*?)\]\]', line)
        if placeholder_match:
            image_filename = placeholder_match.group(1)
            caption = ""
            # If the next line is a figure caption, capture it and skip it in the output.
            if i + 1 < len(lines) and lines[i + 1].strip().startswith("Figure"):
                caption = lines[i + 1].strip()
                i += 1  # Skip the caption line since we'll include it in our replacement.
            description = image_descriptions.get(image_filename, "No description available.")
            replacement = f"{caption}\n\n**Image Description:** {description}\n"
            updated_lines.append(replacement)
        else:
            updated_lines.append(line)
        i += 1

    with open(markdown_path, 'w', encoding='utf-8') as f:
        f.writelines(updated_lines)


    # Define paths for images and markdown file.
global output_file_relative_path, output_images_relative_path
images_folder = str(output_images_relative_path)   # Update this path if your images are elsewhere.
markdown_file = str(output_file_relative_path) # Update this path if your markdown file is elsewhere.
    
    # First, extract the figure captions from the markdown file.
captions_mapping = extract_captions_from_markdown(markdown_file)
    
    # Next, generate image descriptions using the Moondream API.
image_descriptions = generate_image_descriptions(images_folder, captions_mapping)
    
    # Finally, update the markdown file by replacing placeholders with the descriptions.
update_markdown_file(markdown_file, image_descriptions)
    
print("Markdown file updated with image descriptions.")

Processing image: BFS_notespdf_img_1.jpeg
Processing image: BFS_notespdf_img_2.jpeg
Processing image: BFS_notespdf_img_3.jpeg
Processing image: BFS_notespdf_img_4.jpeg
Processing image: BFS_notespdf_img_5.jpeg
Processing image: BFS_notespdf_img_6.jpeg
Processing image: BFS_notespdf_img_7.jpeg
Processing image: BFS_notespdf_img_8.jpeg
Markdown file updated with image descriptions.


: 

In [None]:
# !pip install langchain
# !pip install langchain-community
# !pip install openai

# import os
# import re
# from pathlib import Path
# from langchain.agents import Tool, initialize_agent
# from langchain.llms import OpenAI

# # Set OpenAI API Key
# os.environ["OPENAI_API_KEY"] = "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"  # Replace with your actual key
# output_file_relative_path = "content\pdf_content\ocr_output\BFS_notespdf\output.md"

# # Function to process LaTeX and non-markdown symbols
# def convert_markdown(md_content: str) -> str:
#     """Cleans markdown content by replacing LaTeX expressions and non-markdown symbols."""

#     # Normalize inline math expressions (ensure spaces inside `$...$` math expressions)
#     def fix_inline_math(match):
#         content = match.group(1).strip()
#         return f"$ {content} $"
    
#     md_content = re.sub(r"\$(.+?)\$", fix_inline_math, md_content)
    
#     # Replace common LaTeX symbols with markdown-friendly alternatives
#     latex_replacements = {
#         r"\Longrightarrow": "→",
#         r"\Theta": "Θ",
#         r"\cdot": "·",
#         # Add more replacements if needed
#     }
    
#     for latex_cmd, replacement in latex_replacements.items():
#         md_content = md_content.replace(latex_cmd, replacement)
    
#     return md_content

# def parse_markdown_tool(input_text: str) -> str:
#     """Tool that converts a given markdown text into a cleaned markdown format."""
#     return convert_markdown(input_text)

# # Define the LangChain tool
# markdown_parser_tool = Tool(
#     name="MarkdownParser",
#     func=parse_markdown_tool,
#     description="Scans markdown content for LaTeX or other non-markdown symbols and converts them into proper markdown format suitable for hierarchical chunking."
# )

# # Initialize LangChain LLM agent
# llm = OpenAI(temperature=0)
# agent = initialize_agent(
#     tools=[markdown_parser_tool],
#     llm=llm,
#     agent="zero-shot-react-description",
#     verbose=True
# )

# # Directory where the markdown files are stored
# input_directory = Path(str(output_file_relative_path))  # Change this to your actual directory

# # Process all markdown files in the directory and overwrite them
# for md_file in input_directory.glob("*.md"):
#     with md_file.open("r", encoding="utf-8") as file:
#         content = file.read()
    
#     # Use the agent to process the markdown content
#     converted_content = agent.run(content)
    
#     # Overwrite the same file with the cleaned markdown content
#     with md_file.open("w", encoding="utf-8") as file:
#         file.write(converted_content)
    
#     print(f"Processed and updated: {md_file.name}")





[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting openai
  Downloading openai-1.66.2-py3-none-any.whl (567 kB)
     -------------------------------------- 567.3/567.3 kB 7.1 MB/s eta 0:00:00
Collecting distro<2,>=1.7.0
  Downloading distro-1.9.0-py3-none-any.whl (20 kB)
Collecting jiter<1,>=0.4.0
  Downloading jiter-0.9.0-cp311-cp311-win_amd64.whl (210 kB)
     ------------------------------------- 210.1/210.1 kB 12.5 MB/s eta 0:00:00
Installing collected packages: jiter, distro, openai
Successfully installed distro-1.9.0 jiter-0.9.0 openai-1.66.2



[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
  agent = initialize_agent(


In [1]:
import os
import re
import shutil
from pathlib import Path

def convert_latex_to_markdown(text):
    """
    Convert LaTeX notation to markdown equivalent symbols.
    Skip checking or modifying image descriptions and figure descriptions.
    """
    # Identify image and figure descriptions for selective processing
    parts = []
    last_end = 0
    
    # Pattern to match both image and figure descriptions
    combined_pattern = r'(\*\*Image Description:\*\*.*?(?=\n\n|\Z))|(Figure \d+:.*?(?=\n\n|\Z))'
    
    for match in re.finditer(combined_pattern, text, flags=re.DOTALL):
        # Add the text before the match (to be processed)
        if match.start() > last_end:
            parts.append(('process', text[last_end:match.start()]))
        
        # Add the match itself (to be preserved as-is)
        parts.append(('preserve', match.group(0)))
        last_end = match.end()
    
    # Add any remaining text after the last match
    if last_end < len(text):
        parts.append(('process', text[last_end:]))
    
    # Process each part according to its type
    processed_parts = []
    for part_type, part_text in parts:
        if part_type == 'preserve':
            # Keep this part as-is
            processed_parts.append(part_text)
        else:
            # Process this part
            processed_parts.append(process_latex_part(part_text))
    
    # Combine all processed parts
    return ''.join(processed_parts)

def process_latex_part(text):
    """
    Process a part of the text that should have LaTeX converted to markdown.
    """
    # Dictionary of LaTeX to markdown symbol conversions
    latex_to_markdown = {
        # Operators
        r'\\times': '×',
        r'\\div': '÷',
        r'\\dfrac\{([^}]*)\}\{([^}]*)\}': r'\1/\2',  # Simple fraction replacement
        r'\\frac\{([^}]*)\}\{([^}]*)\}': r'\1/\2',  # Add standard fraction
        r'\\sqrt\{([^}]*)\}': r'√\1',
        
        # Math environments
        r'\\begin\{aligned\}(.*?)\\end\{aligned\}': r'```math\n\1\n```',
        
        # Special characters and formatting
        r'\\textbf\{([^}]*)\}': r'**\1**',  # Bold text
        r'\\textit\{([^}]*)\}': r'*\1*',    # Italic text
        r'\\emph\{([^}]*)\}': r'*\1*',      # Emphasized text
        r'\\underline\{([^}]*)\}': r'_\1_', # Underlined text
        
        # Common math operations
        r'\\sum': '∑',
        r'\\prod': '∏',
        r'\\int': '∫',
        
        # Symbols
        r'\\pi': 'π',
        r'\\approx': '≈',
        r'\\pm': '±',
        r'\\neq': '≠',
        r'\\infty': '∞',
        r'\\in': '∈',
        r'\\notin': '∉',
        r'\\subset': '⊂',
        r'\\subseteq': '⊆',
        r'\\cup': '∪',
        r'\\cap': '∩',
        r'\\implies': '⟹',
        r'\\impliedby': '⟸',
        r'\\to': '→',
        r'\\longrightarrow': '⟶',
        r'\\Rightarrow': '⇒',
        r'\\Longrightarrow': '⟹',
        r'\\propto': '∝',
        r'\\bar': '¯',
        r'\\tilde': '~',
        r'\\breve': '˘',
        r'\\hat': '^',
        r'\\prime': '′',
        r'\\dagger': '†',
        r'\\ast': '∗',
        r'\\star': '⋆',
        r'\\cdots': '⋯',
        r'\\vdots': '⋮',
        r'\\ldots': '...',
        
        # Greek letters
        r'\\alpha': 'α',
        r'\\beta': 'β',
        r'\\gamma': 'γ',
        r'\\Gamma': 'Γ',
        r'\\delta': 'δ',
        r'\\Delta': 'Δ',
        r'\\epsilon': 'ϵ',
        r'\\varepsilon': 'ε',
        r'\\zeta': 'ζ',
        r'\\eta': 'η',
        r'\\theta': 'θ',
        r'\\Theta': 'Θ',
        r'\\vartheta': 'ϑ',
        r'\\iota': 'ι',
        r'\\kappa': 'κ',
        r'\\lambda': 'λ',
        r'\\Lambda': 'Λ',
        r'\\mu': 'μ',
        r'\\nu': 'ν',
        r'\\xi': 'ξ',
        r'\\Xi': 'Ξ',
        r'\\omicron': 'ο',
        r'\\pi': 'π',
        r'\\Pi': 'Π',
        r'\\varpi': 'ϖ',
        r'\\rho': 'ρ',
        r'\\varrho': 'ϱ',
        r'\\sigma': 'σ',
        r'\\Sigma': 'Σ',
        r'\\varsigma': 'ς',
        r'\\tau': 'τ',
        r'\\upsilon': 'υ',
        r'\\Upsilon': 'Υ',
        r'\\phi': 'ϕ',
        r'\\Phi': 'Φ',
        r'\\varphi': 'φ',
        r'\\chi': 'χ',
        r'\\psi': 'ψ',
        r'\\Psi': 'Ψ',
        r'\\omega': 'ω',
        r'\\Omega': 'Ω',
        
        # Comparison operators
        r'\\leq': '≤',
        r'\\geq': '≥',
        r'\\forall': '∀',
        r'\\exists': '∃',
        
        # Space commands
        r'\\quad': ' ',
        r'\\qquad': '  ',
        
        # Algorithm related
        r'\\operatorname\{([^}]*)\}': r'\1',
        
        # Common substitutions
        r'\\left\(': '(',
        r'\\right\)': ')',
        r'\\left\[': '[',
        r'\\right\]': ']',
        r'\\left\{': '{',
        r'\\right\}': '}',
        r'\\{': '{',
        r'\\}': '}',
        r'\\mid': '|',
        r'_\{([^}]*)\}': r'_\1',
        r'\^\{([^}]*)\}': r'^\1',
        
        # Special formatting
        r'\\begin\{itemize\}(.*?)\\end\{itemize\}': lambda match: '\n' + '\n'.join('- ' + item.strip() for item in re.split(r'\\item', match.group(1))[1:]) + '\n',
        r'\\begin\{enumerate\}(.*?)\\end\{enumerate\}': lambda match: '\n' + '\n'.join(f'{i+1}. ' + item.strip() for i, item in enumerate(re.split(r'\\item', match.group(1))[1:])) + '\n',
    }
    
    # Regular expression patterns for inline and display equations
    inline_pattern = r'\$([^\$]+)\$'
    display_pattern = r'\$\$([^\$]+)\$\$'
    
    # Process display equations first
    display_matches = list(re.finditer(display_pattern, text))
    for match in display_matches:
        original_equation = match.group(0)
        equation_content = match.group(1)
        
        # Apply conversions to the equation content
        modified_content = equation_content
        for latex_pattern, markdown_symbol in latex_to_markdown.items():
            try:
                if callable(markdown_symbol):
                    modified_content = re.sub(latex_pattern, markdown_symbol, modified_content, flags=re.DOTALL)
                else:
                    modified_content = re.sub(latex_pattern, markdown_symbol, modified_content)
            except re.error:
                # Skip this pattern if it causes an error
                print(f"Warning: Skipping problematic pattern: {latex_pattern}")
                continue
        
        # Replace the original equation with the converted one
        text = text.replace(original_equation, f"\n```math\n{modified_content}\n```\n")
    
    # Process inline equations
    inline_matches = list(re.finditer(inline_pattern, text))
    for match in inline_matches:
        original_equation = match.group(0)
        equation_content = match.group(1)
        
        # Apply conversions to the equation content
        modified_content = equation_content
        for latex_pattern, markdown_symbol in latex_to_markdown.items():
            try:
                if callable(markdown_symbol):
                    modified_content = re.sub(latex_pattern, markdown_symbol, modified_content, flags=re.DOTALL)
                else:
                    modified_content = re.sub(latex_pattern, markdown_symbol, modified_content)
            except re.error:
                # Skip this pattern if it causes an error
                print(f"Warning: Skipping problematic pattern: {latex_pattern}")
                continue
        
        # Replace the original equation with the converted one
        text = text.replace(original_equation, f"`{modified_content}`")
    
    # Additional post-processing
    # Fix multiple blank lines
    text = re.sub(r'\n{3,}', '\n\n', text)
    
    # Preserve code blocks
    code_blocks = re.findall(r'```.*?```', text, re.DOTALL)
    for i, block in enumerate(code_blocks):
        text = text.replace(block, f"CODE_BLOCK_{i}")
    
    # Clean up remaining LaTeX commands that weren't caught
    text = re.sub(r'\\[a-zA-Z]+(\{[^}]*\})*', '', text)
    
    # Restore code blocks
    for i, block in enumerate(code_blocks):
        text = text.replace(f"CODE_BLOCK_{i}", block)
    
    return text

def process_markdown_file(input_file, output_file):
    """
    Process a markdown file and convert LaTeX notation to markdown equivalents.
    """
    try:
        with open(input_file, 'r', encoding='utf-8', errors='replace') as file:
            content = file.read()
        
        # Convert LaTeX to markdown
        converted_content = convert_latex_to_markdown(content)
        
        # Write converted content to output file
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(converted_content)
        
        print(f"Successfully converted {input_file} to {output_file}")
        return True
    except Exception as e:
        print(f"Error processing {input_file}: {str(e)}")
        return False

def process_directory(input_dir, output_dir):
    """
    Process all markdown files in the input directory and save converted files to the output directory.
    """
    # Create output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Get all markdown files in the input directory
    input_path = Path(input_dir)
    markdown_files = []
    
    # Search for markdown files recursively
    for extension in ('*.md', '*.markdown'):
        markdown_files.extend(list(input_path.glob(f"**/{extension}")))
    
    if not markdown_files:
        print(f"No markdown files found in {input_dir}")
        return
    
    print(f"Found {len(markdown_files)} markdown files")
    
    # Process each markdown file
    successful_conversions = 0
    for md_file in markdown_files:
        try:
            relative_path = md_file.relative_to(input_path)
            output_file = Path(output_dir) / relative_path
            
            # Create any necessary subdirectories
            output_file.parent.mkdir(parents=True, exist_ok=True)
            
            if process_markdown_file(md_file, output_file):
                successful_conversions += 1
        except Exception as e:
            print(f"Error processing file {md_file}: {str(e)}")
    
    print(f"Successfully converted {successful_conversions} out of {len(markdown_files)} files")

# Main function with hardcoded paths
def main():
    # Hardcoded input and output directories
    input_dir = "./content/pdf_content/ocr_output/BFS_notespdf"
    output_dir = "./cleaned_md_outputs"
    
    print(f"Processing markdown files from {input_dir} to {output_dir}")
    process_directory(input_dir, output_dir)

# Run the main function when the script is executed
if __name__ == "__main__":
    main()

Processing markdown files from ./content/pdf_content/ocr_output/BFS_notespdf to ./cleaned_md_outputs
Found 1 markdown files
Successfully converted content\pdf_content\ocr_output\BFS_notespdf\output.md to cleaned_md_outputs\output.md
Successfully converted 1 out of 1 files
