In [8]:
import fitz
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AutoModelForCausalLM, AutoTokenizer
import os

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    pdf_document = fitz.open(pdf_path)
    for page in pdf_document:
        text += page.get_text()
    return text

# Function to translate text using BERT
def translate_text_with_bert(text, target_language="fr"):
    model_name = "bert-base-multilingual-cased"
    model = BertForSequenceClassification.from_pretrained(model_name)
    tokenizer = BertTokenizer.from_pretrained(model_name)

    # Translate the input text to the target language
    input_ids = tokenizer.encode(text, add_special_tokens=True, max_length=128, truncation=True, padding=True, return_tensors="pt")
    target_language_code = tokenizer.get_vocab().get(target_language, tokenizer.get_vocab()["[UNK]"])
    translation_ids = model.generate(
        input_ids,
        num_beams=4,
        max_length=128,
        early_stopping=True,
        num_return_sequences=1,
        forced_bos_token_id=target_language_code
    )

    # Decode the translated text
    translated_text = tokenizer.decode(translation_ids[0], skip_special_tokens=True)
    return translated_text

# Function to create a new PDF document with translated text
def create_translated_pdf(input_pdf_path, output_pdf_path, target_language="fr"):
    extracted_text = extract_text_from_pdf(input_pdf_path)
    translated_text = translate_text_with_bert(extracted_text, target_language)

    pdf_document = fitz.open()
    pdf_document.insert_page(0, width=600, height=800)
    page = pdf_document[0]
    page.insert_text(translated_text)

    pdf_document.save(output_pdf_path)
    pdf_document.close()

# # Paths for input and output PDF files
# input_pdf_path = "your_input.pdf"
# output_pdf_path = "translated_outputBERT.pdf"

# # Translate and create the new PDF
# create_translated_pdf(input_pdf_path, output_pdf_path)

# print(f"Translation complete. Translated PDF saved at: {output_pdf_path}")


If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


In [9]:
# Input and output file paths
input_pdf_path = "uploads/AFFAIRE C.P. ET M.N. c. FRANCE.pdf"
output_pdf_path = "translated_outputBERT.pdf"

# Translate and create the new PDF
create_translated_pdf(input_pdf_path, output_pdf_path)

print(f"Translation complete. Translated PDF saved at: {output_pdf_path}")


AttributeError: 'BertTokenizer' object has no attribute '_target_languages'