In [22]:
import fitz  # PyMuPDF

def enumerate_words(pdf_path):
    doc = fitz.open(pdf_path)
    words_with_ids = []
    word_id = 1

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]
        for block in blocks:
            for line in block["lines"]:
                for span in line["spans"]:
                    for word in span["text"].split():
                        words_with_ids.append({
                            "id": word_id,
                            "text": word,
                            "bbox": fitz.Rect(span["bbox"]),
                            "font": span["font"],  # font used in the span
                            "size": span["size"],  # font size used in the span
                            "color": span["color"],  # font color used in the span as rgb tuple
                            "page_num": page_num
                        })
                        word_id += 1
    return words_with_ids

# Example: print the details of the first 10 words
for word in words_with_ids[:10]:
    print(f"ID: {word['id']}, Text: {word['text']}, Font: {word['font']}, Size: {word['size']}, Color: {word['color']}")


ID: 1, Text: Eren, Font: Arial-BoldMT, Size: 30.0, Color: 1521244
ID: 2, Text: Kalinsazlioglu, Font: Arial-BoldMT, Size: 30.0, Color: 1521244
ID: 3, Text: eren@enpoi.com, Font: Calibri, Size: 12.0, Color: 1521244
ID: 4, Text: |, Font: Calibri, Size: 12.0, Color: 1521244
ID: 5, Text: +49, Font: Calibri, Size: 12.0, Color: 1521244
ID: 6, Text: 17659532268, Font: Calibri, Size: 12.0, Color: 1521244
ID: 7, Text: |, Font: Calibri, Size: 12.0, Color: 1521244
ID: 8, Text: Niedenstein,, Font: Calibri, Size: 12.0, Color: 1521244
ID: 9, Text: Germany, Font: Calibri, Size: 12.0, Color: 1521244
ID: 10, Text: |, Font: Calibri, Size: 12.0, Color: 1521244


In [25]:
import fitz  # PyMuPDF

def replace_text_with_same_style(pdf_path, word_id_to_replace, new_text, output_pdf_path):
    doc = fitz.open(pdf_path)
    words_with_details = enumerate_words(pdf_path)

    for word_info in words_with_details:
        if word_info["id"] == word_id_to_replace:
            page = doc.load_page(word_info["page_num"])
            rect = word_info["bbox"]
            page.add_redact_annot(rect)
            page.apply_redactions()

            # Font handling
            fontname = word_info["font"]
            # Check if the font can be found in the PDF's font resources
            try:
                font = fitz.Font(fontname)
            except RuntimeError:
                # If the font is not found, fall back to a standard font
                fontname = "Helvetica"

            # Color handling
            # Ensure color is in the format of a tuple with three components
            color = word_info["color"]
            if isinstance(color, int):
                # If color is an integer, convert to grayscale tuple
                color = (color / 255,) * 3
            else:
                # Normalize color components to range [0, 1]
                color = tuple(c / 255 if c > 1 else c for c in color)

            # Insert new text with the same style
            page.insert_text(rect.tl, new_text, fontname=fontname, fontsize=word_info["size"], color=color)

    doc.save(output_pdf_path)

# Use the function to replace text
replace_text_with_same_style("Eren CV Muster.docx.pdf", 1, "New Text", "updated.pdf")


ValueError: need 1, 3 or 4 color components in range 0 to 1

In [35]:
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.generic import DecodedStreamObject, EncodedStreamObject
import fitz  # PyMuPDF

def enumerate_words(pdf_path):
    doc = fitz.open(pdf_path)
    words_with_ids = []
    word_id = 1

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        words = page.get_text("words")

        for word in words:
            if word[4].strip():  # Skip spaces
                words_with_ids.append({
                    "id": word_id,
                    "text": word[4],
                    "bbox": word[:4],  # The bounding box of the word
                    "page_num": page_num
                })
                word_id += 1

    return words_with_ids

def replace_text(content, replacements=dict()):
    lines = content.splitlines()
    result = ""
    in_text = False

    for line in lines:
        if line == "BT":
            in_text = True
        elif line == "ET":
            in_text = False
        elif in_text:
            cmd = line[-2:]
            if cmd.lower() == 'tj':
                for k, v in replacements.items():
                    if k in line:
                        line = line.replace(k, v)
                        break
                result += line + "\n"
            else:
                result += line + "\n"
            continue
        result += line + "\n"

    return result

def process_data(object, replacements):
    data = object.get_data()
    decoded_data = data.decode('utf-8')
    replaced_data = replace_text(decoded_data, replacements)
    encoded_data = replaced_data.encode('utf-8')

    if isinstance(object, DecodedStreamObject):
        object.set_data(encoded_data)
    else:  # EncodedStreamObject
        object._data = encoded_data  # Directly set the _data attribute

def replace_text_in_pdf(input_path, replacements, output_path):
    pdf = PdfReader(input_path)
    writer = PdfWriter()

    for page_number in range(len(pdf.pages)):
        page = pdf.pages[page_number]
        contents = page.get_contents()

        if isinstance(contents, (DecodedStreamObject, EncodedStreamObject)):
            process_data(contents, replacements)
        elif isinstance(contents, list):
            for obj in contents:
                if isinstance(obj, (DecodedStreamObject, EncodedStreamObject)):
                    streamObj = obj.get_object()
                    process_data(streamObj, replacements)

        writer.add_page(page)

    with open(output_path, 'wb') as out_file:
        writer.write(out_file)

# Example usage
words_with_ids = enumerate_words('Eren CV Muster.docx.pdf')

# Debugging: Print the extracted words with their details
for word in words_with_ids:
    print(f"ID: {word['id']}, Text: '{word['text']}', BBox: {word['bbox']}, Page: {word['page_num']}")

# Replace the text of the word with ID 1
word_to_replace = next((word for word in words_with_ids if word["id"] == 1), None)
if word_to_replace:
    replacements = {word_to_replace["text"]: "New Text"}
    replace_text_in_pdf('Eren CV Muster.docx.pdf', replacements, 'output.pdf')
else:
    print("Word with given ID not found")


ID: 1, Text: 'Eren', BBox: (58.5, 50.4091796875, 125.18519592285156, 83.9248046875), Page: 0
ID: 2, Text: 'Kalinsazlioglu', BBox: (133.529296875, 50.4091796875, 335.2445068359375, 83.9248046875), Page: 0
ID: 3, Text: 'eren@enpoi.com', BBox: (58.5, 90.8232421875, 143.07595825195312, 102.8232421875), Page: 0
ID: 4, Text: '|', BBox: (145.79296875, 90.8232421875, 151.318359375, 102.8232421875), Page: 0
ID: 5, Text: '+49', BBox: (154.03125, 90.8232421875, 172.17327880859375, 102.8232421875), Page: 0
ID: 6, Text: '17659532268', BBox: (174.884765625, 90.8232421875, 241.80679321289062, 102.8232421875), Page: 0
ID: 7, Text: '|', BBox: (244.5, 90.8232421875, 250.025390625, 102.8232421875), Page: 0
ID: 8, Text: 'Niedenstein,', BBox: (252.73828125, 90.8232421875, 314.25640869140625, 102.8232421875), Page: 0
ID: 9, Text: 'Germany', BBox: (316.974609375, 90.8232421875, 361.546875, 102.8232421875), Page: 0
ID: 10, Text: '|', BBox: (364.259765625, 90.8232421875, 369.78515625, 102.8232421875), Page: 0


In [38]:
from PyPDF2 import PdfReader

def replace_text_in_pdf(input_path, old_text, new_text):
    pdf = PdfReader(input_path)
    
    for page in pdf.pages:
        page_text = page.extract_text()
        if page_text and old_text in page_text:
            print(page_text.replace(old_text, new_text))  # This prints the modified text

# Example usage
replace_text_in_pdf('Eren CV Muster.docx.pdf', 'Eren', 'new text')


new text
Kalinsazlioglu
eren@enpoi.c om|+4917659532268|Niedens tein,German y|Website|YouTube|Github
A b o u tm e
Ihavepassionandanaptitudeforinnovation,creativeproblem-solving,andanalyticalthinking.Theoreticalandpracticalknowledgeofelectronics,mechanics,andsoftwaredevelopmentprocessesaremystrengths.Istartedmakingpapermodelsmyself.Iwasverysuccessful.ThenItriedtocreatemyowndesignsonthecomputerandmakethemyourself.Istartedtobeinterestedin3Dprintersandparticipatedinvariousactivitiesonthissubject.WhenIcametoGermany,Istartedtoconcentrateonanimation,filmmakingandgamedevelopment.Iamacontinuouslearnerwholovesandstaysintunewiththelatesttechnology.MyfutureplansaretocombineallthisknowledgeIhavelearnedanddevelopprojects.Lookingforwardtomakingnewmachinesandinventions.S k i l l s
●
Personal
skills:ProblemSolving|Generatingnewideas|Innovative|Honesty|Communication|Leadership|RelationshipBuilding|Collaboration|Focused|Motivation
●
Computer
Experience:3DmodelingSolidworks|Sketchup|Blender|UnrealEngine|Un