In [18]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Checkpoint id passed to both from_pretrained calls below.
# Note: `model_name` is reassigned by the Marian and mBART cells further down.
model_name = "t5-small"
# Module-level model/tokenizer pair read by infer_t5.
model_t5 = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer_t5 = T5Tokenizer.from_pretrained(model_name)

def infer_t5(input_text, command="translate to french"):
    """Generate text with ``model_t5`` for ``input_text`` prefixed by ``command``.

    The prompt is formatted as "<command>: <input_text>", encoded with
    ``tokenizer_t5`` and passed to ``model_t5.generate`` with at most 1000 new
    tokens. Only the first generated sequence is decoded, with special tokens
    removed, and returned as a string.

    NOTE(review): T5 checkpoints are usually prompted with the task prefix
    they were trained on (e.g. "translate English to French") -- confirm that
    the default ``command`` is a prefix this checkpoint understands.
    """
    encoded_prompt = tokenizer_t5.encode(
        f"{command}: {input_text}",
        return_tensors="pt",
    )
    generated = model_t5.generate(encoded_prompt, max_new_tokens=1000)
    first_sequence = generated[0]
    return tokenizer_t5.decode(first_sequence, skip_special_tokens=True)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
from transformers import MarianMTModel, MarianTokenizer

# Checkpoint id passed to both from_pretrained calls below
# (overwrites the `model_name` set by the T5 cell).
model_name = "Helsinki-NLP/opus-mt-en-roa"
# Module-level tokenizer/model pair read by infer_marian.
tokenizer_marian = MarianTokenizer.from_pretrained(model_name)
model_marian = MarianMTModel.from_pretrained(model_name)

def infer_marian(input_text, command="translate to french"):
    """Generate text with ``model_marian`` for ``input_text`` and return it as a string.

    ``command`` is accepted but never read by this function. The input is
    always prefixed with ">>fra<<" (presumably the target-language token this
    checkpoint expects for French -- confirm against the model card).
    """
    # The tokenizer is called on a one-element batch; padding=True is kept from
    # the original call even though there is only one sequence.
    batch = tokenizer_marian(
        [f">>fra<<{input_text}"],
        return_tensors="pt",
        padding=True,
    )
    generated = model_marian.generate(**batch)
    # Decode only the first (and only) sequence of the batch.
    return tokenizer_marian.decode(generated[0], skip_special_tokens=True)


In [7]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Checkpoint id passed to both from_pretrained calls below
# (overwrites the `model_name` set by the earlier model cells).
model_name = 'facebook/mbart-large-50-many-to-many-mmt'
# Module-level model/tokenizer pair read by infer_mbart.
model_mbart = MBartForConditionalGeneration.from_pretrained(model_name)
tokenizer_mbart = MBart50TokenizerFast.from_pretrained(model_name)

def infer_mbart(input_text, command="translate to french"):
    """Generate text with ``model_mbart`` for ``input_text`` and return it as a string.

    ``command`` is accepted but never read by this function. Each call sets
    ``tokenizer_mbart.src_lang`` to "en_XX" (a side effect on the shared
    tokenizer) and forces the first generated token to the id registered for
    "fr_XX". The first decoded sequence is returned with special tokens removed.
    """
    tokenizer_mbart.src_lang = "en_XX"
    model_inputs = tokenizer_mbart(input_text, return_tensors="pt")

    target_lang_id = tokenizer_mbart.lang_code_to_id["fr_XX"]
    output_ids = model_mbart.generate(
        **model_inputs,
        forced_bos_token_id=target_lang_id,
    )

    decoded = tokenizer_mbart.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]

In [3]:
import requests 
import pandas as pd

# Download the first 1,000 rows of the giga_fren "en-fr" train split from the
# Hugging Face datasets-server, 100 rows per request, and cache them as a CSV.
dataset = []
for index in range(0, 1000, 100):
    url = f"https://datasets-server.huggingface.co/rows?dataset=giga_fren&config=en-fr&split=train&offset={index}&length=100"
    # A timeout keeps a stalled connection from hanging the cell indefinitely.
    response = requests.get(url, timeout=30)
    # Surface HTTP errors directly instead of a KeyError on "rows" below.
    response.raise_for_status()
    data = response.json()
    # Each entry is an (English, French) pair.
    dataset.extend(
        (row["row"]["translation"]["en"], row["row"]["translation"]["fr"])
        for row in data["rows"]
    )
    print(f"Downloaded {index + 100} rows")

df = pd.DataFrame(dataset, columns=["input_text", "target_text"])
# Write the cache first so the preview is the cell's last expression and is
# actually rendered (a bare `df.head()` in the middle of a cell shows nothing).
df.to_csv("giga_fren.csv", index=False)
df.head()

Downloaded 100 rows
Downloaded 200 rows
Downloaded 300 rows
Downloaded 400 rows
Downloaded 500 rows
Downloaded 600 rows
Downloaded 700 rows
Downloaded 800 rows
Downloaded 900 rows
Downloaded 1000 rows


In [None]:
# Reload the cached sentence pairs written by the download cell.
df = pd.read_csv("giga_fren.csv")

# Translate one sentence at a time with T5; each result is written into the
# "t5" column immediately, so rows already processed keep their value if the
# loop is interrupted.
for index, source_sentence in df["input_text"].items():
    translation = infer_t5(source_sentence)
    df.loc[index, "t5"] = translation
    if index % 100:
        continue
    # Progress message on every row label that is a multiple of 100.
    print(f"Translated {index} rows for t5")

df.head()

Translated 0 rows for t5
Translated 100 rows for t5


In [36]:
# Translate one sentence at a time with Marian; each result is written into the
# "marian" column immediately, so rows already processed keep their value if
# the loop is interrupted.
for index, source_sentence in df["input_text"].items():
    translation = infer_marian(source_sentence)
    df.loc[index, "marian"] = translation
    if index % 100:
        continue
    # Progress message on every row label that is a multiple of 100.
    print(f"Translated {index} rows for marian")

df.head()

KeyboardInterrupt: 

In [None]:
# Translate one sentence at a time with mBART; each result is written into the
# "mbart" column immediately, so rows already processed keep their value if
# the loop is interrupted.
for index, source_sentence in df["input_text"].items():
    translation = infer_mbart(source_sentence)
    df.loc[index, "mbart"] = translation
    if index % 100:
        continue
    # Progress message on every row label that is a multiple of 100.
    print(f"Translated {index} rows for mbart")

df.head()