<a href="https://colab.research.google.com/github/Farbod-gsm99/NLP-Tasks-and-Tokenizers/blob/main/Translation_Fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets sacrebleu

In [None]:
from datasets import load_dataset

# Load the "kde4" dataset with English as the first language and French as
# the second; the result is reused by the sampling/splitting cells below.
raw_dataset = load_dataset("kde4", lang1="en", lang2="fr")
# Bare expression: lets the notebook display the loaded dataset summary.
raw_dataset

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook. The training
# arguments later in this notebook set push_to_hub=True, so credentials are
# needed before training starts.
notebook_login()

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Tokenizer and sequence-to-sequence model are loaded from the same
# pretrained English->French checkpoint so their vocabularies match.
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-fr")

In [None]:
# Number of training examples kept for this (shortened) fine-tuning run.
num_samples = 10000

# Shuffle with a fixed seed for reproducibility, then keep only the first
# `num_samples` examples; the "train" split is overwritten in place.
raw_dataset['train'] = raw_dataset["train"].shuffle(seed=42).select(range(num_samples))
# Bare expression: display the reduced dataset.
raw_dataset

In [None]:
# Hold out 10% of the sampled examples (90% stay in "train"); the seed makes
# the split reproducible.
split_dataset = raw_dataset["train"].train_test_split(train_size=0.9, seed=20)
# train_test_split names the held-out part "test"; rename it to "validation",
# which is the key the trainer cell reads as its eval dataset.
split_dataset['validation'] = split_dataset.pop('test')
# Bare expression: show one training example to check its structure.
split_dataset['train'][0]

In [None]:
max_length = 128


def preprocess_function(examples):
    """Tokenize a batch of English/French translation pairs.

    Args:
        examples: Batch mapping whose "translation" entry is a sequence of
            dicts, each with an "en" (source) and an "fr" (target) string.

    Returns:
        The tokenizer output for the batch: English texts are encoded as
        model inputs and French texts as targets, both truncated to the
        module-level ``max_length``.
    """
    pairs = examples["translation"]
    sources = [pair["en"] for pair in pairs]
    references = [pair["fr"] for pair in pairs]
    return tokenizer(
        sources,
        text_target=references,
        max_length=max_length,
        truncation=True,
    )

In [None]:
# Tokenize every split in batches. The original columns are dropped so only
# the tokenizer outputs remain in the resulting dataset.
tokenized_dataset = split_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=split_dataset["train"].column_names,
)
# Bare expression: display the tokenized splits.
tokenized_dataset

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that batches tokenized examples for the seq2seq model; it is
# given both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Sanity check: collate training examples 1 and 2 into a small batch and
# list which fields the collator produces.
batch = data_collator([tokenized_dataset["train"][i] for i in range(1, 3)])
batch.keys()

In [None]:
# Inspect the labels of the sample batch built by the data collator.
# NOTE(review): padded label positions are presumably filled with -100
# (the collator's default ignore index) — confirm in the displayed output.
batch["labels"]

In [None]:
!pip install evaluate

In [None]:
import evaluate

# Load the SacreBLEU metric; compute_metrics uses it to score translations.
metric = evaluate.load("sacrebleu")

In [None]:
import numpy as np

def compute_metrics(eval_preds):
    """Compute the SacreBLEU score for a set of generated translations.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds``
            may also be a tuple, in which case its first element holds the
            generated ids.

    Returns:
        Dict with a single key ``"bleu"`` holding the SacreBLEU score.
    """
    preds, labels = eval_preds
    # Some models return extra outputs alongside the generated ids.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the ignore index used to pad labels (and possibly predictions
    # gathered across batches). It is not a valid token id, so it must be
    # swapped for the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Each prediction is scored against a list of references (one here).
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration for the fine-tuning run.
# - Evaluation and checkpointing both happen once per epoch; at most three
#   checkpoints are kept on disk.
# - predict_with_generate=True makes evaluation produce generated sequences,
#   which compute_metrics decodes and scores.
# - push_to_hub=True uploads to the repository named by hub_model_id.
# NOTE(review): fp16=True presumably requires a GPU runtime — confirm before
# running on CPU.
args = Seq2SeqTrainingArguments(
    output_dir="marian-finetuned-kde4-en-to-fr",
    hub_model_id="marian-finetuned-kde4-en-to-fr",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

In [None]:
from transformers import Seq2SeqTrainer

# Wire the model, training arguments, tokenized splits, collator, tokenizer
# and metric function into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Baseline: score the pretrained model on the validation split before any
# fine-tuning, capping generation at `max_length` tokens.
trainer.evaluate(max_length=max_length)

In [None]:
# Run fine-tuning with the configuration given in `args`.
trainer.train()

In [None]:
# Re-evaluate after training with the same generation cap, so the score is
# directly comparable with the pre-training baseline.
trainer.evaluate(max_length=max_length)

In [None]:
# Upload the final model to the Hub, tagged as a translation model, with an
# explicit commit message.
trainer.push_to_hub(tags="translation", commit_message="Training complete")

In [None]:
from transformers import pipeline

# Try the fine-tuned model through a translation pipeline.
# Replace this with your own checkpoint ("<hub-username>/<model-name>").
model_checkpoint = "TheFuriousGunner/marian-finetuned-kde4-en-to-fr"
translator = pipeline("translation", model=model_checkpoint)
# Sample sentence; the next cell translates the same text with the base model.
translator("This email was sent by me.")

In [None]:
from transformers import pipeline

# Comparison run: load the original pretrained checkpoint (not a fine-tuned
# one) and translate the same sentence as the fine-tuned pipeline above.
# Note that this rebinds `model_checkpoint` and `translator`.
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
translator = pipeline("translation", model=model_checkpoint)
translator("This email was sent by me.")