In [1]:
from transformers import MarianMTModel, MarianTokenizer

In [2]:
target_model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
target_tokenizer = MarianTokenizer.from_pretrained(target_model_name)
target_model = MarianMTModel.from_pretrained(target_model_name)

en_model_name = 'Helsinki-NLP/opus-mt-ROMANCE-en'
en_tokenizer = MarianTokenizer.from_pretrained(en_model_name)
en_model = MarianMTModel.from_pretrained(en_model_name)

In [32]:
first_model_name = 'Helsinki-NLP/opus-mt-en-de'
first_model_tkn = MarianTokenizer.from_pretrained(first_model_name)
first_model = MarianMTModel.from_pretrained(first_model_name)

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/750k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/778k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/284M [00:00<?, ?B/s]

In [33]:
# Get the name of the second model
second_model_name = 'Helsinki-NLP/opus-mt-de-en'
# Get the tokenizer
second_model_tkn = MarianTokenizer.from_pretrained(second_model_name)
# Load the pretrained model based on the name
second_model = MarianMTModel.from_pretrained(second_model_name)

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/778k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/750k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/284M [00:00<?, ?B/s]

In [28]:
def format_batch_texts(language_code, batch_texts):
    formated_bach = [">>{}<< {}".format(language_code, text) for text in batch_texts]
    return formated_bach

In [34]:
original_texts = ['circular body.', 'large round eyes.']
formated_batch = format_batch_texts("fr", original_texts)

def perform_translation(batch_texts, model, tokenizer, language="fr"):
    formated_batch_texts = format_batch_texts(language, batch_texts)
    translated = model.generate(**tokenizer(formated_batch_texts, return_tensors="pt", padding=True))
    translated_texts = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
    return translated_texts

translated_texts = perform_translation(original_texts, first_model, first_model_tkn)
back_translated_texts = perform_translation(translated_texts, second_model, second_model_tkn)

print(back_translated_texts)

['Round body.', 'Big round eyes.']


In [14]:
def translate(texts, model, tokenizer, language="fr"):
    template = lambda text: f"{text}" if language == "en" else f">>{language}<< {text}"
    src_texts = [template(text) for text in texts]

    with tokenizer.as_target_tokenizer():
        encoded = tokenizer(src_texts, return_tensors='pt', truncation=False, padding=True)
    
    translated = model.generate(**encoded)
    translated_texts = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return translated_texts

def back_translate(texts, source_lang="en", target_lang="fr"):
    fr_texts = translate(texts, first_model, first_model_tkn, language=target_lang)
    back_translated_texts = translate(fr_texts, second_model, second_model_tkn, language=source_lang)
    
    return back_translated_texts

In [17]:
en_texts = ['The angel has triangular body.', 'It has big, round eyes.']

aug_texts = back_translate(en_texts, source_lang="en", target_lang="fr")
print(aug_texts)

['The body of the Angel Hasang.', "It's too gros, it's around them."]
