In [1]:
import fitz  # PyMuPDF for PDF text extraction
import openai
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
import os

In [None]:

# Set up OpenAI API key
openai.api_key = "..."

os.environ['OPENAI_API_KEY'] = "..."

In [3]:
# Function to translate text
def translate_text(text, target_language="French"):
    if not text.strip():
        return text  # Skip empty strings

    prompt = f"""
    You are a professional translator. Translate the following text into {target_language}, maintaining formatting and tone.
    
    Text: "{text}"
    Translation:
    """
    from openai import Client
    try:
        # response = openai.ChatCompletion.create(
        #     model="gpt-4",
        #     messages=[
        #         {"role": "system", "content": "You are an expert translator."},
        #         {"role": "user", "content": prompt},
        #     ],
        #     max_tokens=500,
        #     temperature=0.7,
        # )
        # translated_text = response.choices[0].message.content.strip()

        client = Client()
        response=client.chat.completions.create(
            model="gpt-4",  # Use "gpt-4" for better translations; "gpt-3.5-turbo" is also an option.
            messages=[
                {"role": "system", "content": "You are an expert translator."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=500,
            temperature=0.7,
        )
        print("Coversions: ", response.choices[0].message.content)
        translated_text = response.choices[0].message.content.strip()

        return translated_text
    except Exception as e:
        print(f"Error translating text: {e}")
        return text

In [4]:

# Function to extract text from PDF
def extract_text_from_pdf(input_file):
    pdf_document = fitz.open(input_file)
    text_pages = []

    for page_number in range(len(pdf_document)):
        page = pdf_document.load_page(page_number)
        text = page.get_text("text")  # Extract text
        text_pages.append(text)

    pdf_document.close()
    return text_pages


In [5]:

# Function to create a new PDF with translated text
def create_translated_pdf(translated_text_pages, output_file):
    c = canvas.Canvas(output_file, pagesize=letter)
    width, height = letter

    for page_text in translated_text_pages:
        y = height - 50  # Start from the top
        for line in page_text.split("\n"):
            c.drawString(50, y, line)  # Draw each line of text
            y -= 15  # Move to the next line
            if y < 50:  # Start a new page if the space runs out
                c.showPage()
                y = height - 50
        c.showPage()  # Finish the page

    c.save()


In [6]:

# Main function to translate a PDF
def translate_pdf(input_file, output_file, target_language="French"):
    # Step 1: Extract text from the PDF
    print("Extracting text from PDF...")
    text_pages = extract_text_from_pdf(input_file)

    # Step 2: Translate each page's text
    translated_text_pages = []
    for page_text in text_pages:
        print("Translating text...")
        translated_text = translate_text(page_text, target_language)
        translated_text_pages.append(translated_text)

    # Step 3: Create a new PDF with translated text
    print("Creating translated PDF...")
    create_translated_pdf(translated_text_pages, output_file)
    print(f"Translated PDF saved to {output_file}")


In [7]:

# Function to translate text within a PDF while preserving layout
def translate_pdf_in_place(input_file, output_file, target_language="French"):
    # Open the PDF
    pdf_document = fitz.open(input_file)
    print("pdf_document: ", pdf_document, len(pdf_document))

    for page_number in range(len(pdf_document)):
        print("page_number: ", page_number)
        page = pdf_document[page_number]
        print("page: ", page)
        text_blocks = page.get_text("blocks")  # Get text as blocks
        print(f"Page {page_number + 1}: Found {len(text_blocks)} text blocks.")
        # for block in text_blocks:
        #     # Unpack the first 5 elements (ignore the rest using *)
        #     x0, y0, x1, y1, original_text, *_ = block
        #     if original_text.strip():  # Skip empty or whitespace-only blocks
        #         print(f"Original Text: {original_text}")
                
        #         # Translate text
        #         translated_text = translate_text(original_text, target_language)
        #         print(f"Translated Text: {translated_text}")

        #         # Insert the translated text in the same position
        #         page.insert_textbox(
        #             fitz.Rect(x0, y0, x1, y1),  # Use the original block's rectangle
        #             translated_text,
        #             fontsize=12,  # Match font size
        #             align=0,  # Left-align text
        #         )


        for block in text_blocks:
            # Extract block details
            x0, y0, x1, y1, original_text = block[:5]  # Ignore extra elements
            if not original_text.strip():  # Skip empty blocks
                continue

            print(f"Original Text: {original_text}")

            # Translate the text
            translated_text = translate_text(original_text, target_language)
            print(f"Translated Text: {translated_text}")

            # Remove original content by overlaying a white rectangle
            rect = fitz.Rect(x0, y0, x1, y1)
            page.draw_rect(rect, color=(1, 1, 1), fill=(1, 1, 1))

            # Insert translated text with font matching
            page.insert_textbox(
                rect,
                translated_text,
                fontsize=12,  # Adjust dynamically if needed
                fontname="helv",  # Use a default font; adjust if necessary
                align=0,  # Left-align text
            )

    # Save the updated PDF
    pdf_document.save(output_file)
    pdf_document.close()
    print(f"Translated PDF saved to {output_file}")


In [8]:

# Example Usage
input_pdf = "sample_files/example.pdf"
output_pdf = "translated_files/translated_example.pdf"
target_language = "French"


translate_pdf_in_place(input_pdf, output_pdf, target_language)


pdf_document:  Document('sample_files/example.pdf') 3
page_number:  0
page:  page 0 of sample_files/example.pdf
Page 1: Found 2 text blocks.
Original Text: Hello world

Coversions:  "Bonjour le monde
"
Translated Text: "Bonjour le monde
"
Original Text: Python is the  best programming language

Coversions:  "Python est le meilleur langage de programmation"
Translated Text: "Python est le meilleur langage de programmation"
page_number:  1
page:  page 1 of sample_files/example.pdf
Page 2: Found 2 text blocks.
Original Text: My name is Anand Vishwakarma

Coversions:  "Mon nom est Anand Vishwakarma"
Translated Text: "Mon nom est Anand Vishwakarma"
Original Text: • The AI developer

Coversions:  "• Le développeur d'IA"
Translated Text: "• Le développeur d'IA"
page_number:  2
page:  page 2 of sample_files/example.pdf
Page 3: Found 1 text blocks.
Original Text: There’s no need to learn foreign languages anymore. 
Thanks to AI, you speak into your phone, and it translates.
So, wherever you are