
## Trying the No Language Left Behind Model

In [1]:
!pip install transformers
!pip install torch
!pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.24.0 (from transformers)
  Downloading huggingface_hub-0.26.5-py3-none-any.whl.metadata (13 kB)
Collecting numpy>=1.17 (from transformers)
  Downloading numpy-2.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# This is an example to test whether a simple sentence works.
model_name = "facebook/nllb-200-3.3B"

# NLLB language codes: ISO 639-3 language + script.
source_lang = "deu_Latn"  # German in Latin script
target_lang = "eng_Latn"  # English in Latin script

# The NLLB tokenizer tags the input as English unless src_lang is given, so the
# German source language is declared when the tokenizer is created.
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang=source_lang)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

german_sentence = "Es war einmal ein König, der hatte drei Söhne."

# Forcing the target-language token as the first generated token selects the output language.
forced_bos_token_id = tokenizer.convert_tokens_to_ids(target_lang)

inputs = tokenizer(german_sentence, return_tensors="pt", padding=True, truncation=True)
outputs = model.generate(**inputs, forced_bos_token_id=forced_bos_token_id, max_length=50)
translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Translation:", translation)


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

pytorch_model-00001-of-00003.bin:   0%|          | 0.00/6.93G [00:00<?, ?B/s]

pytorch_model-00002-of-00003.bin:   0%|          | 0.00/8.55G [00:00<?, ?B/s]

pytorch_model-00003-of-00003.bin:   0%|          | 0.00/2.10G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Translation: Once upon a time there was a king who had three sons.


In [None]:
import os
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

input_dir = "/work/FrederiekeNicolaWullf#7811/Exam/german_tales/"
output_dir = "/work/FrederiekeNicolaWullf#7811/Exam/nllb_translated/"

# NLLB language codes (remember the different language codes per language/script).
source_lang = "deu_Latn"
target_lang = "eng_Latn"

# Load model. The tokenizer is told the source language explicitly; without
# src_lang the NLLB tokenizer tags the German input as English.
model_name = "facebook/nllb-200-3.3B"
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang=source_lang)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Token id of the target language; forced as the first generated token.
forced_bos_token_id = tokenizer.convert_tokens_to_ids(target_lang)

#Sentence basis
def translate_text(text, tokenizer, model, forced_bos_token_id):
    """Translate ``text`` one sentence at a time and re-join the results.

    The text is split on the literal separator ". ". Every non-blank piece is
    tokenized, passed to ``model.generate`` (forced first token, at most 150
    output tokens), decoded, and the pieces are joined again with ". ".

    Args:
        text: Source text to translate.
        tokenizer: Callable tokenizer that also provides ``decode``.
        model: Seq2seq model providing ``generate``.
        forced_bos_token_id: Token id forced as the first generated token
            (the target-language token).

    Returns:
        The translated text as a single string.
    """
    sentences = text.split(". ")
    last_index = len(sentences) - 1
    translations = []

    for index, sentence in enumerate(sentences):
        # Blank pieces (e.g. after a trailing ". ") carry nothing to translate;
        # keep them untouched instead of letting the model generate from empty input.
        if not sentence.strip():
            translations.append(sentence)
            continue

        inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
        outputs = model.generate(**inputs, forced_bos_token_id=forced_bos_token_id, max_length=150)
        translated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # The split removed the full stop of every non-final sentence and the join
        # puts it back; drop one the model may have added so the result has no "..".
        if index < last_index and translated_sentence.rstrip().endswith("."):
            translated_sentence = translated_sentence.rstrip()[:-1]

        translations.append(translated_sentence)

    return ". ".join(translations)

# Each tale is split per sentence, translated, and then recombined into one txt file.
os.makedirs(output_dir, exist_ok=True)

for filename in os.listdir(input_dir):
    # Anything that is not a .txt file is ignored.
    if not filename.endswith(".txt"):
        continue

    input_file = os.path.join(input_dir, filename)
    # The suffix is appended after the extension, e.g. "tale.txt" -> "tale.txt_en".
    output_file = os.path.join(output_dir, f"{filename}_en")

    with open(input_file, "r", encoding="utf-8") as file:
        text = file.read()

    translated_text = translate_text(text, tokenizer, model, forced_bos_token_id)

    with open(output_file, "w", encoding="utf-8") as file:
        file.write(translated_text)

    print(f"Translation completed: {output_file}")


        # Paths


# Ensure output folder exists
# NOTE(review): `output_folder` is not assigned in any visible cell (the loop above uses
# `output_dir`) — confirm where it is meant to come from before running this.
os.makedirs(output_folder, exist_ok=True)

# Function to translate a single file
def translate_file(file_path):
    """Translate one German text file through a prompt and save the English result.

    The file is read as UTF-8, wrapped in a translation prompt, and handed to
    the global ``pipeline`` callable; the ``generated_text`` of its first
    result is written to ``<stem>_en.txt`` inside the global ``output_folder``.

    NOTE(review): neither ``pipeline`` nor ``output_folder`` is defined in the
    visible part of the notebook — both must exist before this is called.

    Args:
        file_path: Path of the German source file.
    """
    with open(file_path, "r", encoding="utf-8") as source_handle:
        german_text = source_handle.read()

    # Full-text translation: the whole file goes into a single prompt.
    prompt = f"Translate the following German text to English, but keep the style and tone as close to the historical original as possible :\n\n{german_text}"
    candidates = pipeline(prompt, max_length=5000, num_return_sequences=1)
    english_text = candidates[0]["generated_text"]

    # Same file name as the input, with an "_en" suffix before the .txt extension.
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    output_path = os.path.join(output_folder, stem + "_en.txt")

    with open(output_path, "w", encoding="utf-8") as target_handle:
        target_handle.write(english_text)

    print(f"Translated and saved: {output_path}")

# Example: Translate a single file
# Runs translate_file on one tale from the german_tales input directory.
translate_file("/work/FrederiekeNicolaWullf#7811/Exam/german_tales/22_Das_Räthsel.txt")

### Full Translation of one file

In [6]:
import os
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


input_file_path = "/work/Exam/german_tales/140_Das_Hausgesinde.txt"
output_file_path = "/work/Exam/nllb_renamed/140_domestic_servants.txt"

source_lang = "deu_Latn"  # German in Latin script
target_lang = "eng_Latn"  # English in Latin script

# The tokenizer is told the source language explicitly; without src_lang the
# NLLB tokenizer tags the German input as English.
model_name = "facebook/nllb-200-3.3B"
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang=source_lang)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Token id of the target language; forced as the first generated token.
forced_bos_token_id = tokenizer.convert_tokens_to_ids(target_lang)

def translate_text(text, tokenizer, model, forced_bos_token_id):
    """Translate ``text`` one sentence at a time and re-join the results.

    The text is split on the literal separator ". ". Every non-blank piece is
    tokenized, passed to ``model.generate`` (forced first token, at most 150
    output tokens), decoded, and the pieces are joined again with ". ".

    Args:
        text: Source text to translate.
        tokenizer: Callable tokenizer that also provides ``decode``.
        model: Seq2seq model providing ``generate``.
        forced_bos_token_id: Token id forced as the first generated token
            (the target-language token).

    Returns:
        The translated text as a single string.
    """
    # Split the text into sentences
    sentences = text.split(". ")
    last_index = len(sentences) - 1
    translations = []

    for index, sentence in enumerate(sentences):
        # Blank pieces (e.g. after a trailing ". ") carry nothing to translate;
        # keep them untouched instead of letting the model generate from empty input.
        if not sentence.strip():
            translations.append(sentence)
            continue

        inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
        outputs = model.generate(**inputs, forced_bos_token_id=forced_bos_token_id, max_length=150)
        translated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # The split removed the full stop of every non-final sentence and the join
        # puts it back; drop one the model may have added so the result has no "..".
        if index < last_index and translated_sentence.rstrip().endswith("."):
            translated_sentence = translated_sentence.rstrip()[:-1]

        translations.append(translated_sentence)

    return ". ".join(translations)

# Read the German source text.
with open(input_file_path, "r", encoding="utf-8") as file:
    text = file.read()

full_translation = translate_text(text, tokenizer, model, forced_bos_token_id)

# Create the target directory first; open(..., "w") does not create missing folders.
output_parent = os.path.dirname(output_file_path)
if output_parent:
    os.makedirs(output_parent, exist_ok=True)

with open(output_file_path, "w", encoding="utf-8") as file:
    file.write(full_translation)

print(f"Translated and saved: {output_file_path}")



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Translated and saved: /work/FrederiekeNicolaWullf#7811/Exam/nllb_renamed/140_domestic_servants.txt


### Full translation loop for all
- Skips files that are already translated (the loop had to be restarted several times because UCloud crashed).

In [2]:
import os
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

input_dir = "/work/Exam/german_tales/"
output_dir = "/work/Exam/nllb_translated/"

# NLLB language codes: German and English, both in Latin script.
source_lang = "deu_Latn"
target_lang = "eng_Latn"

# The tokenizer is told the source language explicitly; without src_lang the
# NLLB tokenizer tags the German input as English.
model_name = "facebook/nllb-200-3.3B"
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang=source_lang)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Token id of the target language; forced as the first generated token.
forced_bos_token_id = tokenizer.convert_tokens_to_ids(target_lang)

# Translate sentence by sentence
def translate_text(text, tokenizer, model, forced_bos_token_id):
    """Translate ``text`` one sentence at a time and re-join the results.

    The text is split on the literal separator ". ". Every non-blank piece is
    tokenized, passed to ``model.generate`` (forced first token, at most 150
    output tokens), decoded, and the pieces are joined again with ". ".

    Args:
        text: Source text to translate.
        tokenizer: Callable tokenizer that also provides ``decode``.
        model: Seq2seq model providing ``generate``.
        forced_bos_token_id: Token id forced as the first generated token
            (the target-language token).

    Returns:
        The translated text as a single string.
    """
    sentences = text.split(". ")
    last_index = len(sentences) - 1
    translations = []

    for index, sentence in enumerate(sentences):
        # Blank pieces (e.g. after a trailing ". ") carry nothing to translate;
        # keep them untouched instead of letting the model generate from empty input.
        if not sentence.strip():
            translations.append(sentence)
            continue

        inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
        outputs = model.generate(**inputs, forced_bos_token_id=forced_bos_token_id, max_length=150)
        translated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # The split removed the full stop of every non-final sentence and the join
        # puts it back; drop one the model may have added so the result has no "..".
        if index < last_index and translated_sentence.rstrip().endswith("."):
            translated_sentence = translated_sentence.rstrip()[:-1]

        translations.append(translated_sentence)

    return ". ".join(translations)  # Recombine sentences into a single text

os.makedirs(output_dir, exist_ok=True)

for filename in os.listdir(input_dir):
    if filename.endswith(".txt"):
        input_file = os.path.join(input_dir, filename)
        # The suffix is appended after the extension, e.g. "tale.txt" -> "tale.txt_en".
        output_file = os.path.join(output_dir, f"{filename}_en")

        # A finished output file marks the tale as done, so a restarted run resumes here.
        if os.path.exists(output_file):
            print(f"Skipping already translated file: {output_file}")
            continue

        with open(input_file, "r", encoding="utf-8") as file:
            text = file.read()

        translated_text = translate_text(text, tokenizer, model, forced_bos_token_id)

        # Write to a temporary name and rename afterwards: a crash during the write
        # then cannot leave a truncated file that later runs would skip as finished.
        partial_file = output_file + ".part"
        with open(partial_file, "w", encoding="utf-8") as file:
            file.write(translated_text)
        os.replace(partial_file, output_file)

        print(f"Translation completed: {output_file}")


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

pytorch_model-00001-of-00003.bin:   0%|          | 0.00/6.93G [00:00<?, ?B/s]

pytorch_model-00002-of-00003.bin:   0%|          | 0.00/8.55G [00:00<?, ?B/s]

pytorch_model-00003-of-00003.bin:   0%|          | 0.00/2.10G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Skipping already translated file: /work/FrederiekeNicolaWullf#7811/Exam/nllb_translated/177_Die_Boten_des_Todes.txt_en
Skipping already translated file: /work/FrederiekeNicolaWullf#7811/Exam/nllb_translated/144_Das_Eselein.txt_en
Skipping already translated file: /work/FrederiekeNicolaWullf#7811/Exam/nllb_translated/135_Die_weiße_und_die_schwarze_Bra.txt_en
Skipping already translated file: /work/FrederiekeNicolaWullf#7811/Exam/nllb_translated/164_Der_faule_Heinz.txt_en
Skipping already translated file: /work/FrederiekeNicolaWullf#7811/Exam/nllb_translated/31_Das_Mädchen_ohne_Hände.txt_en
Skipping already translated file: /work/FrederiekeNicolaWullf#7811/Exam/nllb_translated/155_Die_Brautschau.txt_en
Skipping already translated file: /work/FrederiekeNicolaWullf#7811/Exam/nllb_translated/196_Oll_Rinkrank.txt_en
Skipping already translated file: /work/FrederiekeNicolaWullf#7811/Exam/nllb_translated/50_Dornröschen.txt_en
Skipping already translated file: /work/FrederiekeNicolaWullf#7811/E