In [4]:
import os
import subprocess
import concurrent.futures
from PyPDF2 import PdfReader
from markitdown import MarkItDown
import multiprocessing

def split_pdf_efficiently(pdf_path, num_chunks):
    """Splits a PDF into approximately equal-sized chunks using pdftk efficiently.

    Args:
        pdf_path: Path to the PDF file.
        num_chunks: The desired number of output chunks.

    Returns:
        A list of paths to the generated PDF chunks.
    """
    reader = PdfReader(pdf_path)
    num_pages = len(reader.pages)
    pages_per_chunk = num_pages // num_chunks
    chunk_paths = []
    output_dir = os.path.dirname(pdf_path)

    # Build a list of page ranges for pdftk
    page_ranges = []
    start_page = 1
    for i in range(num_chunks):
        end_page = start_page + pages_per_chunk -1
        if i == num_chunks - 1: #last chunk
           end_page = num_pages
        page_ranges.append((start_page, end_page))
        start_page = end_page + 1


    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = []
        for i, (start, end) in enumerate(page_ranges):
            output_filename = f"chunk_{i}_{start}-{end}.pdf"
            output_filepath = os.path.join(output_dir, output_filename)
            chunk_paths.append(output_filepath)

            future = executor.submit(run_pdftk, pdf_path, start, end, output_filepath)
            futures.append(future)

        #Wait all and check exceptions.
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result() #get exception if exists.
            except Exception as e:
                print(f"Error during PDF splitting: {e}")


    return chunk_paths

def run_pdftk(pdf_path, start_page, end_page, output_path):
    """Runs pdftk to extract a specific page range."""
    command = [
        "pdftk",
        pdf_path,
        "cat",
        f"{start_page}-{end_page}",
        "output",
        output_path
    ]
    subprocess.run(command, check=True, capture_output=True)


def convert_chunk(chunk_path, docintel_endpoint=None):
    """Converts a single PDF chunk to Markdown."""
    try:
        output_md_path = os.path.splitext(chunk_path)[0] + ".md"
        md = MarkItDown(docintel_endpoint=docintel_endpoint) if docintel_endpoint else MarkItDown()
        result = md.convert(chunk_path)
        with open(output_md_path, "w", encoding="utf-8") as f:
            f.write(result.text_content)
        return output_md_path, None
    except Exception as e:
        return None, (chunk_path, e)


def merge_markdown_files(markdown_files, output_file):
    """Merges multiple Markdown files."""
    with open(output_file, "w", encoding="utf-8") as outfile:
        for filename in markdown_files:
            try:
                with open(filename, "r", encoding="utf-8") as infile:
                    outfile.write(infile.read())
                outfile.write("\n\n---\n\n")
            except FileNotFoundError:
                print(f"Warning: {filename} not found, skipping.")
            except Exception as e:
                print(f"Error reading or writing during merge: {e}")


def main():
    pdf_file = "/home/aricept094/mydata/Nelson Textbook of Pediatrics, 21th edition 2020.pdf"
    output_dir = os.path.dirname(pdf_file)
    final_output_file = os.path.join(output_dir, "merged_output.md")
    docintel_endpoint = None
    num_chunks = multiprocessing.cpu_count() * 4  # Adjust - start with 4x your CPU count

    # 1. Split the PDF *efficiently*
    print("Splitting PDF...")
    chunk_paths = split_pdf_efficiently(pdf_file, num_chunks)
    print(f"PDF split into {len(chunk_paths)} chunks.")

    # 2. Convert to Markdown (using multiprocessing.Pool)
    print("Converting to Markdown...")
    markdown_files = []
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        results = pool.starmap(convert_chunk, [(path, docintel_endpoint) for path in chunk_paths])
        for md_path, error in results:
            if md_path:
                markdown_files.append(md_path)
            elif error:
                chunk_path, e = error
                print(f"Error converting {chunk_path}: {e}")

    # 3. Merge
    print("Merging Markdown files...")
    merge_markdown_files(markdown_files, final_output_file)
    print(f"Merged Markdown file created at: {final_output_file}")

    # 4. Clean up
    print("Cleaning up temporary files...")
    for chunk_path in chunk_paths:
        try:
            os.remove(chunk_path)
            md_chunk_path = os.path.splitext(chunk_path)[0] + ".md"
            os.remove(md_chunk_path)
        except OSError as e:
            print(f"Error deleting file {chunk_path}: {e}")
    print("Cleanup complete.")

if __name__ == "__main__":
    main()

Splitting PDF...
PDF split into 64 chunks.
Converting to Markdown...
Merging Markdown files...
Merged Markdown file created at: /home/aricept094/mydata/merged_output.md
Cleaning up temporary files...
Cleanup complete.


In [2]:
import os
import multiprocessing
import time
import math


def split_markdown_file_balanced(filepath, target_size_mb=10):
    """
    Splits a markdown file into two files, aiming for more balanced sizes.

    Args:
        filepath: The path to the markdown file.
        target_size_mb: The approximate target size for each part (in MB).
                         This is a *target*; the split won't be perfectly even.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        print(f"Error: File not found: {filepath}")
        return
    except Exception as e:
        print(f"Error reading file: {filepath} - {e}")
        return

    total_size_bytes = len(content.encode('utf-8'))  # Get size in bytes
    target_size_bytes = target_size_mb * 1024 * 1024  # Convert MB to bytes

    # Find a good split point near the middle, but on a line break.
    split_index = total_size_bytes // 2
    best_split_index = -1

    # Adjust to find a line break closest to the target size
    if split_index < target_size_bytes :
        # Search forward for a line break
        for i in range(split_index, len(content)):
             if content[i] == '\n':
                best_split_index = i
                break
    else:
            #seach backward
        for i in range(split_index,0,-1):
             if content[i] == '\n':
                best_split_index = i
                break

    if best_split_index == -1:
        print("Error: Could not find a suitable split point (no line breaks).")
        return


    part1 = content[:best_split_index]
    part2 = content[best_split_index:]

    basename = os.path.splitext(filepath)[0]
    ext = os.path.splitext(filepath)[1]
    output_file1 = f"{basename}_part1{ext}"
    output_file2 = f"{basename}_part2{ext}"

    try:
        with open(output_file1, 'w', encoding='utf-8') as f:
            f.write(part1)
        with open(output_file2, 'w', encoding='utf-8') as f:
            f.write(part2)
    except Exception as e:
        print(f"Error writing output files: {e}")
        return

    print(f"Successfully split '{filepath}' into '{output_file1}' and '{output_file2}'")
    print(f"Part 1 size: {os.path.getsize(output_file1) / (1024 * 1024):.2f} MB")
    print(f"Part 2 size: {os.path.getsize(output_file2) / (1024 * 1024):.2f} MB")



def main():
    filepath = "/home/aricept094/mydata/merged_output.md"
    target_size_mb = 10  # Adjust as needed

    start_time = time.time()

    # Use multiprocessing to avoid blocking the main thread
    process = multiprocessing.Process(target=split_markdown_file_balanced, args=(filepath, target_size_mb))
    process.start()
    process.join()  # Wait for the process to complete

    end_time = time.time()
    print(f"Total execution time: {end_time - start_time:.4f} seconds")


if __name__ == "__main__":
    main()

Successfully split '/home/aricept094/mydata/merged_output.md' into '/home/aricept094/mydata/merged_output_part1.md' and '/home/aricept094/mydata/merged_output_part2.md'
Part 1 size: 13.25 MB
Part 2 size: 13.17 MB
Total execution time: 0.1683 seconds
