In [None]:
import torch
from datasets import load_dataset
from transformers import MarianMTModel, MarianTokenizer, pipeline

In [None]:
# Load only the first 5,000 rows of the "en-fr" configuration of the
# "opus_books" corpus so the rest of the notebook stays quick to run.
dataset = load_dataset(
    path="opus_books",
    name="en-fr",
    split="train[:5000]",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

en-fr/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/127085 [00:00<?, ? examples/s]

In [None]:
# Hold out 10% of the pairs for evaluation. The fixed seed makes the split
# (and therefore the BLEU score computed on it) reproducible across runs.
dataset = dataset.train_test_split(test_size=0.1, seed=42)


In [None]:
# Report how many examples ended up in each split.
for label, split_name in (("Training", "train"), ("Testing", "test")):
    print(f"{label} samples", len(dataset[split_name]))

Training samples 4500
Testing samples 500


In [None]:
# Peek at the first training pair. The lookup is hoisted out of the f-strings
# and the keys use single quotes, because reusing the enclosing quote
# character inside an f-string expression is a SyntaxError before Python 3.12.
first_pair = dataset["train"][0]["translation"]
print(f"English : {first_pair['en']} ")
print(f"French : {first_pair['fr']} ")


English : I've lost too much time already and they must be anxious at home.' 
French : J’ai déjà perdu beaucoup de temps et l’on doit s’inquiéter, chez moi. 


In [None]:
# Hub identifier of the checkpoint that is loaded below; the tokenizer and
# the model weights are both fetched from this same identifier.
model_name = "Helsinki-NLP/opus-mt-en-fr"

tokenizer = MarianTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name
)
model = MarianMTModel.from_pretrained(
    pretrained_model_name_or_path=model_name
)



tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
# Wrap the loaded model and tokenizer in a callable translation pipeline.
translator = pipeline(
    task="translation",
    model=model,
    tokenizer=tokenizer,
)


Device set to use cuda:0


In [None]:
# A few fixed sentences used to eyeball translation quality.
test_sentences = [
    "Hello, how are you?",
    "Machine learning is fascinating.",
    "I love reading books."
]

for sentence in test_sentences:
    result = translator(sentence, max_length=128)
    print(f"English : {sentence}")
    # "\n" rather than "\\n": the doubled backslash printed a literal
    # backslash-n after every translation instead of a blank separator line.
    print(f"French : {result[0]['translation_text']}\n")

English : Hello, how are you?
French : Bonjour, comment allez-vous ?\n
English : Machine learning is fascinating.
French : L'apprentissage automatique est fascinant.\n
English : I love reading books.
French : J'adore lire des livres.\n


In [None]:
def _normalize_apostrophes(text):
    """Replace the typographic apostrophe (U+2019) with the ASCII one."""
    # NOTE(review): the fine-tuned translations recorded in this notebook lose
    # their apostrophes ("Lapprentissage", "Jaime"), which is consistent with
    # U+2019 being out-of-vocabulary for this tokenizer and being learned as
    # <unk>. Confirm against the tokenizer vocabulary.
    return text.replace("\u2019", "'")


def preprocess_function(examples):
    """Tokenize a batch of English-French pairs for seq2seq fine-tuning.

    Args:
        examples: A batch as passed by ``dataset.map(..., batched=True)``;
            its ``"translation"`` entry is a list of dicts with ``"en"`` and
            ``"fr"`` keys.

    Returns:
        The tokenizer output for the English sentences, with the token ids of
        the French sentences stored under ``"labels"``.
    """
    english = [_normalize_apostrophes(pair["en"]) for pair in examples["translation"]]
    french = [_normalize_apostrophes(pair["fr"]) for pair in examples["translation"]]

    # Deliberately no padding here. Padding every example to max_length left
    # pad-token ids inside "labels", so the loss was also computed on padding.
    # DataCollatorForSeq2Seq pads each batch dynamically and fills label
    # padding with -100, which the loss ignores.
    model_inputs = tokenizer(
        english,
        max_length=128,
        truncation=True,
    )

    labels = tokenizer(
        text_target=french,
        max_length=128,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [None]:
# Tokenize both splits in batches and drop the original columns so that only
# the features returned by preprocess_function remain.
original_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=original_columns,
)


Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
from transformers import (
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)

In [None]:
# Batch collator for seq2seq training, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)


In [None]:
# Fine-tuning configuration: one epoch, evaluated at the end of the epoch.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    # Evaluate on generated sequences so compute_metrics can decode and
    # score real translations.
    predict_with_generate=True,
    save_total_limit=3,
    # Mixed precision only when a CUDA GPU is present: the training banner
    # advertises a CPU run, where a hard-coded fp16=True is not usable.
    fp16=torch.cuda.is_available(),
    logging_steps=100,
)

In [None]:
import evaluate
import numpy as np

# Load the SacreBLEU scorer used by compute_metrics.
metric = evaluate.load("sacrebleu")


def compute_metrics(eval_preds):
    """Compute the corpus BLEU score for one evaluation pass.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays as
            handed over by the trainer. ``predictions`` may itself be a
            tuple, in which case its first element is used.

    Returns:
        A dict ``{"bleu": <score>}`` holding the SacreBLEU score.
    """
    preds, labels = eval_preds

    # Handle tuple output
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks padding / ignored positions and is not a valid token id, so
    # it must be swapped for the pad token before decoding. This applies to
    # the predictions as well as the labels, since generated sequences are
    # padded to a common length when batches are gathered.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Convert token ids back to text
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip stray whitespace so it does not affect the score.
    predictions = [pred.strip() for pred in decoded_preds]
    # SacreBLEU expects a list of references per prediction.
    references = [[label.strip()] for label in decoded_labels]

    # Calculate BLEU score
    result = metric.compute(predictions=predictions, references=references)

    return {"bleu": result["score"]}

print("âœ… Evaluation metric ready!")


âœ… Evaluation metric ready!


In [None]:
# Wire the model, tokenized splits, collator and metric into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    processing_class=tokenizer,
    data_collator=data_collator,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

# Run the fine-tuning loop configured in training_args.
print("ðŸš€ Starting training... (This will take 10-15 minutes on CPU)")
trainer.train()

print("âœ… Training complete!")

ðŸš€ Starting training... (This will take 10-15 minutes on CPU)


Epoch,Training Loss,Validation Loss,Bleu
1,0.4079,0.427068,23.40495


âœ… Training complete!


In [None]:
# Re-run the same sample sentences now that training has finished.
print("\nðŸŽ‰Testing fine-tuned model \n")

for sentence in test_sentences:
    outputs = translator(sentence, max_length=128)
    print(f"English: {sentence}")
    french = outputs[0]['translation_text']
    print(f"French: {french}")


ðŸŽ‰Testing fine-tuned model 

English: Hello, how are you?
French: Comment allez-vous ?
English: Machine learning is fascinating.
French: Lapprentissage de la machine est fascinant.
English: I love reading books.
French: Jaime lire des livres.


In [None]:
def translate_text(text, max_length=128):
    """Translate ``text`` with the notebook-level ``translator`` pipeline.

    Args:
        text: The sentence to translate.
        max_length: Length cap forwarded to the pipeline call. Defaults to
            128, the value that used to be hard-coded.

    Returns:
        The ``translation_text`` field of the first pipeline result.
    """
    result = translator(text, max_length=max_length)
    return result[0]["translation_text"]

In [None]:
# Custom sentences to try through the translate_text helper.
my_sentences = [
    "I am learning machine translation.",
    "This project is very interesting.",
    "Artificial intelligence is the future.",
]

for text in my_sentences:
    print(f"English: {text}")
    translation = translate_text(text)
    print(f"French: {translation}")

English: I am learning machine translation.
French: Japprends la traduction automatique.
English: This project is very interesting.
French: Ce projet est très intéressant.
English: Artificial intelligence is the future.
French: Lintelligence artificielle est lavenir.


In [None]:
# Score the model on the evaluation split and report BLEU.
print("ðŸ“Š Evaluating model performance...")
results = trainer.evaluate()

final_bleu = results['eval_bleu']
print(f"\nâœ… Final BLEU Score: {final_bleu:.2f}")

# Rule-of-thumb bands printed to help read the score above.
bleu_guide = (
    "\nBLEU Score Guide:",
    "0-10: Very poor",
    "10-20: Poor",
    "20-30: Acceptable",
    "30-40: Good",
    "40-50: Very good",
    "50+: Excellent",
)
for guide_line in bleu_guide:
    print(guide_line)


ðŸ“Š Evaluating model performance...



âœ… Final BLEU Score: 23.40

BLEU Score Guide:
0-10: Very poor
10-20: Poor
20-30: Acceptable
30-40: Good
40-50: Very good
50+: Excellent


In [None]:
# Save your fine-tuned model
save_directory = "./my_finetuned_en_fr_translator"

print("ðŸ’¾ Saving model...")
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)
print(f"âœ… Model saved to {save_directory}")





ðŸ’¾ Saving model...


NameError: name 'model' is not defined