# Find and replace the given text in the PDF file

- Uses PyMuPDF because it exposes element-level coordinates (bounding rectangles) for text found on a page

In [None]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.9 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.9 PyMuPDFb-1.24.9


In [None]:
import fitz  # PyMuPDF

def find_text_in_pdf(pdf_path, search_text):
    """Locate every occurrence of ``search_text`` in a PDF.

    Args:
        pdf_path: Path of the PDF file to open.
        search_text: Text passed to ``page.search_for`` on each page.

    Returns:
        A list of dicts, one per hit, in page order. Each dict has
        ``"page"`` (1-based page number) and ``"coordinates"`` (the
        rectangle returned by ``page.search_for`` for that hit).
        The list is empty when nothing is found.
    """
    results = []

    # The context manager closes the document even if a page fails to load
    # or the search raises, so the file handle is never leaked.
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)

            # One result entry per matching rectangle on this page
            for instance in page.search_for(search_text):
                results.append({
                    "page": page_num + 1,
                    "coordinates": instance
                })

    return results

# Example usage: report every location of the phrase in the sample paper
pdf_path = "/content/pdf_files/in/research_paper.pdf"
search_text = "research paper"
locations = find_text_in_pdf(pdf_path, search_text)

if not locations:
    # Nothing matched anywhere in the document
    print(f"Text '{search_text}' not found in the PDF.")
else:
    print(f"Text '{search_text}' found at:")
    # One line per hit: page number and bounding rectangle
    for location in locations:
        print(f"Page {location['page']}, Coordinates: {location['coordinates']}")


Text 'research paper' found at:
Page 1, Coordinates: Rect(85.10399627685547, 70.81623077392578, 286.26788330078125, 98.8174819946289)


In [None]:
import fitz  # PyMuPDF

def int_to_rgb(color_int):
    """Split a packed 0xRRGGBB integer into an (r, g, b) tuple of floats in 0-1.

    Only the low 24 bits are used; each 8-bit channel is divided by 255.
    """
    # Shift amounts select the red, green and blue bytes in that order
    return tuple(((color_int >> shift) & 0xFF) / 255.0 for shift in (16, 8, 0))

def _first_text_span(page, clip):
    """Return the first text span inside ``clip`` on ``page``, or None if there is none."""
    for block in page.get_text("dict", clip=clip)["blocks"]:
        # Image blocks carry no "lines" entry, so they are skipped here
        for line in block.get("lines", ()):
            for span in line["spans"]:
                return span
    return None


def replace_text_with_x_in_pdf(input_pdf_path, output_pdf_path, search_text):
    """Mask every occurrence of ``search_text`` in a PDF with 'X' characters.

    Each word inside a matching rectangle is covered with a white box and
    overwritten with one 'X' per original character, reusing the font size
    and colour of the first text span found at that word. The result is
    saved to ``output_pdf_path``; nothing is saved when there are no matches.

    Args:
        input_pdf_path: Path of the PDF to read.
        output_pdf_path: Path the modified PDF is written to.
        search_text: Text passed to ``page.search_for`` on each page.

    Returns:
        None. Progress and outcome are reported with ``print``.

    Note:
        The original glyphs are painted over, not removed from the page
        content. Use PyMuPDF redaction annotations if the text must become
        unrecoverable.
    """
    # The context manager closes the document on every exit path, including
    # the early return below and any exception while processing or saving.
    with fitz.open(input_pdf_path) as pdf_document:
        is_modified = False

        # Iterate through the pages
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            text_instances = page.search_for(search_text)

            if not text_instances:
                print(f"Text '{search_text}' not found on page {page_num + 1}. Skipping...")
                continue

            # Collect everything first, so text drawn for one word cannot
            # show up in later extraction calls on the same page.
            replacements = []
            for instance in text_instances:
                for word in page.get_text("words", clip=instance):
                    word_rect = fitz.Rect(word[:4])
                    span = _first_text_span(page, word_rect)
                    if span is None:
                        # No styling information available for this word
                        continue
                    replacements.append((word_rect, 'X' * len(word[4]), span))

            for word_rect, replacement_text, span in replacements:
                # Draw a white rectangle to cover the original text
                page.draw_rect(word_rect, color=(1, 1, 1), fill=(1, 1, 1), fill_opacity=1)

                # insert_text anchors at the baseline (bottom-left) of the first
                # character, so use the span's baseline rather than the top-left
                # corner, which would place the text above the covered area.
                origin = span.get("origin")
                baseline_y = origin[1] if origin else word_rect.y1
                baseline = fitz.Point(word_rect.x0, baseline_y)

                page.insert_text(
                    baseline,
                    replacement_text,
                    fontsize=span["size"],
                    color=int_to_rgb(span["color"]),
                )

            is_modified = True

        if not is_modified:
            print(f"No occurrences of '{search_text}' found in the PDF.")
            return

        # Save first, so success is only reported once the file is written
        pdf_document.save(output_pdf_path)
        print(f"Replaced '{search_text}' with 'X' and saved to '{output_pdf_path}'.")

# Example usage: mask the phrase in the sample paper and write a new copy
search_text = "research paper"
input_pdf_path = "/content/pdf_files/in/research_paper.pdf"
output_pdf_path = "/content/pdf_files/out/research_paper_v4.pdf"

# Prints per-page progress and the final outcome
replace_text_with_x_in_pdf(input_pdf_path, output_pdf_path, search_text)


Text 'research paper' not found on page 2. Skipping...
Text 'research paper' not found on page 3. Skipping...
Text 'research paper' not found on page 4. Skipping...
Text 'research paper' not found on page 5. Skipping...
Text 'research paper' not found on page 6. Skipping...
Text 'research paper' not found on page 7. Skipping...
Text 'research paper' not found on page 8. Skipping...
Text 'research paper' not found on page 9. Skipping...
Text 'research paper' not found on page 10. Skipping...
Text 'research paper' not found on page 11. Skipping...
Replaced 'research paper' with 'X' and saved to '/content/pdf_files/out/research_paper_v4.pdf'.
