In [1]:
import warnings   
# Install a blanket 'ignore' filter so warnings do not clutter the cell outputs.
# NOTE(review): this hides every warning category, including deprecation notices.
warnings.filterwarnings('ignore')   

In [2]:
from transformers import MarianMTModel, MarianTokenizer

# Back Translation 

In this process, we take a text written in English, translate it into another language (for example, French), and then translate it back into English.

As can be seen in the picture below, this process introduces a bit of diversity into the text.

![](https://amitness.com/images/back-translation-marianmt.png)


# English → French Model

In [3]:
# Checkpoint used for the forward pass; the 'en-fr' suffix in its name indicates
# English -> French, so the `fr_` prefix refers to the *output* language.
fr_model_name = 'Helsinki-NLP/opus-mt-en-fr'
# Load the tokenizer and the translation model for this checkpoint.
fr_tokenizer = MarianTokenizer.from_pretrained(fr_model_name)
fr_model = MarianMTModel.from_pretrained(fr_model_name)

# French → English Model

In [4]:
# Checkpoint used for the return pass; the 'fr-en' suffix in its name indicates
# French -> English, so the `en_` prefix refers to the *output* language.
en_model_name = 'Helsinki-NLP/opus-mt-fr-en'
# Load the tokenizer and the translation model for this checkpoint.
en_tokenizer = MarianTokenizer.from_pretrained(en_model_name)
en_model = MarianMTModel.from_pretrained(en_model_name)

In [23]:
def translate(texts, model, tokenizer, language="fr"):
    """Translate a batch of texts with a MarianMT-style model/tokenizer pair.

    Args:
        texts: A string or an iterable of strings written in the model's
            source language. A single string is treated as a one-item batch.
        model: Object exposing ``generate(**encoded_inputs)``.
        tokenizer: Callable that encodes a list of strings and exposes
            ``batch_decode``. If it has a ``supported_language_codes``
            attribute, that list decides whether a language token is needed.
        language: Target language code (e.g. ``"fr"``).

    Returns:
        A list with one translated string per input text (empty for an
        empty batch).
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)

    # Nothing to translate: skip the tokenizer/model round trip entirely.
    if not texts:
        return []

    # The ">>code<<" prefix is the Marian convention for checkpoints that can
    # emit several target languages. Single-target checkpoints do not define
    # such tokens, so prepending one only adds noise to the input.
    lang_token = f">>{language}<<"
    supported_codes = getattr(tokenizer, "supported_language_codes", None)
    if supported_codes is None:
        # Tokenizer does not advertise its codes: keep the historical rule
        # (prefix for every target except English).
        needs_prefix = language != "en"
    else:
        needs_prefix = lang_token in supported_codes

    if needs_prefix:
        src_texts = [f"{lang_token} {text}" for text in texts]
    else:
        src_texts = [f"{text}" for text in texts]

    # Encode as one padded batch; truncate inputs longer than the model limit.
    encoded = tokenizer(src_texts, return_tensors="pt", padding=True, truncation=True)

    # Generate translation using model
    translated = model.generate(**encoded)

    # Convert the generated token indices back into text
    return tokenizer.batch_decode(translated, skip_special_tokens=True)


In [24]:
def back_translate(texts, source_lang="en", target_lang="fr"):
    """Round-trip ``texts`` through a pivot language and back.

    The forward pass uses the module-level ``fr_model``/``fr_tokenizer`` and
    the return pass uses ``en_model``/``en_tokenizer``; the language arguments
    are only forwarded to ``translate`` and do not select different models.

    Args:
        texts: Texts to augment, written in ``source_lang``.
        source_lang: Language code passed to the return-pass translation.
        target_lang: Language code passed to the forward-pass translation.

    Returns:
        A ``(pivot_texts, round_trip_texts)`` tuple: the intermediate
        translations and the texts translated back again.
    """
    # Forward pass: source language -> pivot language.
    pivot_texts = translate(texts, fr_model, fr_tokenizer, language=target_lang)

    # Return pass: pivot language -> source language.
    round_trip_texts = translate(pivot_texts, en_model, en_tokenizer, language=source_lang)

    return pivot_texts, round_trip_texts


In [25]:
# Sample English sentences to push through the round trip.
en_texts = [
    'This is so cool',
    'I hated the food',
    'They were very helpful',
]

# back_translate returns a pair: (intermediate translations, back-translated texts).
aug_texts = back_translate(
    en_texts,
    source_lang="en",
    target_lang="fr",
)

print(aug_texts)


(["C'est trop cool.", 'Je détestais la nourriture.', 'Ils ont été très utiles.'], ["That's so cool.", 'I hated the food.', 'They were very helpful.'])
