Notebook for translating a PDF document.

In [None]:
from ipynb.fs.defs.a_preprocess_data import store_documents 
from ipynb.fs.defs.a_preprocess_data import get_pdf_documents

from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.chat_models import ChatOpenAI
from typing import List
import tiktoken

In [None]:
def translate_text(text):
    """
    Translate ``text`` with an OpenAI chat model accessed through LangChain.

    A German system prompt instructs the model to translate from German to
    English, and a German human prompt carries the text to translate.

    Returns the ``content`` attribute of the model's reply.
    """
    chat_model = ChatOpenAI()

    # Prompt templates: the system message sets the role, the human message
    # carries the text through the ``{text}`` placeholder.
    system_template = """Du bist ein hilfreicher Assistent, der Text von Deutsch nach Englisch übersetzt."""
    human_template = """Bitte übersetze mir den folgenden Text: \n \n {text} """
    prompt = ChatPromptTemplate.from_messages(
        [
            SystemMessagePromptTemplate.from_template(system_template),
            HumanMessagePromptTemplate.from_template(human_template),
        ]
    )

    messages = prompt.format_prompt(text=text).to_messages()
    reply = chat_model(messages)
    return reply.content

In [None]:
def estimate_openai_price_for_qa_set_gen(text: str, model_name: str):
    """
    Print a rough cost estimate (in USD) for translating ``text``.

    The text is tokenized with the ``cl100k_base`` encoding. The estimate
    assumes the translated output has as many tokens as the input, so the
    input and output prices are both multiplied by the input token count.

    Args:
        text: The text that would be sent for translation.
        model_name: One of "gpt-3.5-turbo", "gpt-4" or "gpt-4-32k". For any
            other name no price is known: a warning is printed and the
            reported price is 0.

    Returns:
        None. The token count and the estimated price are printed.
    """

    # USD per 1,000 tokens as (input, output).
    # NOTE(review): hard-coded price snapshot; verify against current pricing.
    prices_per_1k_tokens = {
        "gpt-3.5-turbo": (0.0010, 0.0020),
        "gpt-4": (0.03, 0.06),
        "gpt-4-32k": (0.06, 0.12),
    }

    tokenizer = tiktoken.get_encoding("cl100k_base")
    token_length = len(tokenizer.encode(text))

    print("Token Length = ", token_length)

    if model_name in prices_per_1k_tokens:
        input_per_1k, output_per_1k = prices_per_1k_tokens[model_name]
        input_price = input_per_1k / 1000
        output_price = output_per_1k / 1000
    else:
        # Make it explicit that the 0 below is "unknown", not "free".
        print(f"Warning: no pricing known for model '{model_name}'; reporting a price of 0")
        input_price = 0
        output_price = 0

    price_translation_input = token_length * input_price
    price_translation_output = token_length * output_price

    print("Price in $ for translation = ", price_translation_input + price_translation_output)

In [None]:
def split_text(text: str, max_tokens: int = 2040) -> List[str]:
    """
    Split ``text`` into consecutive chunks of at most ``max_tokens`` tokens.

    The text is tokenized with the ``cl100k_base`` encoding, the token list is
    cut into slices of ``max_tokens`` tokens (the last slice may be shorter),
    and each slice is decoded back into a string.

    Args:
        text: The text to split.
        max_tokens: Maximum number of tokens per chunk. Defaults to 2040.

    Returns:
        The decoded chunks in their original order; an empty list when the
        text produces no tokens.

    Raises:
        ValueError: If ``max_tokens`` is not positive.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")

    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(text)

    # NOTE(review): chunks are cut purely by token count, so a boundary may
    # fall in the middle of a sentence or word.
    return [
        tokenizer.decode(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]

In [None]:
# Load the PDF documents and select the last one as the document to translate
docs = get_pdf_documents(True)
last_doc = docs[-1]

In [None]:
# Print the token count and estimated price for each candidate model
for model_name in ("gpt-3.5-turbo", "gpt-4", "gpt-4-32k"):
    estimate_openai_price_for_qa_set_gen(last_doc.page_content, model_name)

In [None]:
# Split the selected document's text into chunks with split_text
split_texts = split_text(last_doc.page_content)
# Bare expression: as the last statement of a notebook cell, the number of
# chunks is shown as the cell output
len(split_texts)

In [None]:
import json

# Translate the chunks and save them
translations = []
# The loop variable must not be named `split_text`: that would overwrite the
# split_text() function and break re-running the cell that calls it.
for index, chunk in enumerate(split_texts):
    translated = translate_text(chunk)
    translations.append(translated)

    if (index + 1) % 10 == 0:
        print("Index completed: ", index)

    # The file is rewritten with all translations so far after every chunk.
    # NOTE(review): presumably so progress survives an interrupted run.
    with open("./../../inputData/PDF/documents/translation_Kersten_2020", 'w', encoding='utf-8') as json_file:
        json.dump(translations, json_file, ensure_ascii=False, indent=4)

In [None]:
# Concatenate the translated chunks and store the translated document
result_string = '\n'.join(translations)
# last_doc was taken from the end of docs, so (assuming docs is a list) this
# assignment also updates the entry that is stored with docs below.
last_doc.page_content = result_string

# Store the full document collection, then the translated document on its own.
# last_doc is wrapped in a list inline instead of being rebound, so it stays a
# document and this cell (and the earlier ones) can be re-run safely.
store_documents(docs, "./../../inputData/PDF/documents/all_documents")
store_documents([last_doc], "./../../inputData/PDF/documents/new_documents")