In [1]:
!pip install -q transformers sentencepiece

from transformers import MarianTokenizer, MarianMTModel


In [2]:
# Choose the right model: "Helsinki-NLP/opus-mt-en-hi" (English → Hindi)
model_name = 'Helsinki-NLP/opus-mt-en-hi'

tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [3]:
def translate(text):
    # Tokenize input text
    batch = tokenizer([text], return_tensors="pt", padding=True, truncation=True)

    # Perform translation
    generated_ids = model.generate(**batch, max_length=50, num_beams=4)

    # Decode output
    translation = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return translation

# Example
text = "How are you?"
print("Translated:", translate(text))


Translated: आप कैसे हैं?


In [4]:
from google.colab import files
uploaded = files.upload()


Saving output.csv to output.csv


In [5]:
import pandas as pd

df = pd.read_csv("output.csv")
print(df.head())


                                                 er  \
0                              Let's try something.   
1                            I have to go to sleep.   
2  Today is June 18th and it is Muiriel's birthday!   
3  Today is June 18th and it is Muiriel's birthday!   
4                                Muiriel is 20 now.   

                                                  fr  
0                            Tentons quelque chose !  
1                              Je dois aller dormir.  
2  Aujourd'hui nous sommes le 18 juin et c'est l'...  
3  Aujourd'hui c'est le 18 juin, et c'est l'anniv...  
4                       Muiriel a 20 ans maintenant.  


In [6]:
!pip install -q transformers datasets sentencepiece


In [7]:
from datasets import Dataset
from transformers import MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-en-hi"
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Convert the pandas DataFrame to a Hugging Face dataset
dataset = Dataset.from_pandas(df)

def preprocess(example):
    model_inputs = tokenizer(example['er'], max_length=64, truncation=True, padding="max_length")
    labels = tokenizer(text_target=example['fr'], max_length=64, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess, batched=True)



Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [8]:
from transformers import MarianMTModel, TrainingArguments, Trainer

model = MarianMTModel.from_pretrained(model_name)

training_args = TrainingArguments(
    output_dir="./mt_finetuned",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_steps=500,
    logging_steps=100,
    save_total_limit=1,
    report_to="none" # Disable wandb logging
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

trainer.train()

Step,Training Loss
100,1.9625
200,1.4274
300,1.3266
400,1.2429
500,1.1949
600,1.1686
700,1.1123
800,1.084
900,1.0329
1000,1.033




TrainOutput(global_step=3750, training_loss=0.8984074462890626, metrics={'train_runtime': 526.6075, 'train_samples_per_second': 56.968, 'train_steps_per_second': 7.121, 'total_flos': 508475473920000.0, 'train_loss': 0.8984074462890626, 'epoch': 3.0})

In [9]:
model.save_pretrained("./mt_finetuned")
tokenizer.save_pretrained("./mt_finetuned")


('./mt_finetuned/tokenizer_config.json',
 './mt_finetuned/special_tokens_map.json',
 './mt_finetuned/vocab.json',
 './mt_finetuned/source.spm',
 './mt_finetuned/target.spm',
 './mt_finetuned/added_tokens.json')

In [10]:
from transformers import MarianMTModel, MarianTokenizer

model = MarianMTModel.from_pretrained("./mt_finetuned")
tokenizer = MarianTokenizer.from_pretrained("./mt_finetuned")




In [11]:
def translate(text):
    # Tokenize input text
    inputs = tokenizer([text], return_tensors="pt", truncation=True, padding=True)

    # Generate translation
    translated_ids = model.generate(**inputs, max_length=64, num_beams=4, early_stopping=True)

    # Decode tokens to string
    output = tokenizer.batch_decode(translated_ids, skip_special_tokens=True)[0]
    return output


In [12]:
print(translate("What is your name?"))


Quelle est ton ?


In [16]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformers import MarianMTModel, MarianTokenizer

# Load the fine-tuned model and tokenizer
model_finetuned = MarianMTModel.from_pretrained("./mt_finetuned")
tokenizer_finetuned = MarianTokenizer.from_pretrained("./mt_finetuned")

def translate_finetuned(text):
    # Tokenize input text
    inputs = tokenizer_finetuned([text], return_tensors="pt", truncation=True, padding=True)

    # Generate translation
    translated_ids = model_finetuned.generate(**inputs, max_length=64, num_beams=4, early_stopping=True)

    # Decode tokens to string
    output = tokenizer_finetuned.batch_decode(translated_ids, skip_special_tokens=True)[0]
    return output

# Define a smoothing function
chencherry = SmoothingFunction()

for i in range(10):
    src = df.iloc[i]['er']
    ref = df.iloc[i]['fr']
    pred = translate_finetuned(src)
    print(f"SOURCE: {src}\nTARGET: {ref}\nPREDICTED: {pred}\nBLEU: {sentence_bleu([ref.split()], pred.split(), smoothing_function=chencherry.method1)}\n")



SOURCE: Let's try something.
TARGET: Tentons quelque chose !
PREDICTED: Donnez-moi quelque chose.
BLEU: 0.0814136751754278

SOURCE: I have to go to sleep.
TARGET: Je dois aller dormir.
PREDICTED: Je dois aller à boire.
BLEU: 0.26591479484724945

SOURCE: Today is June 18th and it is Muiriel's birthday!
TARGET: Aujourd'hui nous sommes le 18 juin et c'est l'anniversaire de Muiriel !
PREDICTED: On année est la 18 commence et c'est la maintenant.
BLEU: 0.04044024548098852

SOURCE: Today is June 18th and it is Muiriel's birthday!
TARGET: Aujourd'hui c'est le 18 juin, et c'est l'anniversaire de Muiriel.
PREDICTED: On année est la 18 commence et c'est la maintenant.
BLEU: 0.04939382737115371

SOURCE: Muiriel is 20 now.
TARGET: Muiriel a 20 ans maintenant.
PREDICTED: Muiriel est 20 maintenant.
BLEU: 0.0823481567964712

SOURCE: The password is "Muiriel".
TARGET: Le mot de passe est « Muiriel ».
PREDICTED: Le se passe est à la moir.
BLEU: 0.0670144447096575

SOURCE: I will be back soon.
TARGET: J