<a href="https://colab.research.google.com/github/CBaffelli/CAS-NLP_Machine-translation/blob/main/03_CAS_NLP_Final_project_multilingual_model_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets evaluate sacrebleu accelerate -U bert_score rouge_score peft sacremoses torch pynvml

# **Fine-tuning**

This notebook fine-tunes a machine translation model from English (EN) into five languages (ES, FR, IT, PT, RO). It supports T5 models (T5 and FLAN-T5) as well as OPUS-MT models.

In [None]:
#@title Imports and varia
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset, concatenate_datasets, DatasetDict
import evaluate
import numpy as np
import plotly.express as px
import matplotlib as mp
import matplotlib.pyplot as plt
from peft import get_peft_model, LoraConfig, TaskType
import torch

In [None]:
#@title Mount GDrive
# Mount Google Drive under /content/drive; the dataset CSVs loaded in the
# "Load data" cell are read from paths below this mount point.
# The google.colab package is provided by the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#@title Load data
#Folder on the mounted Google Drive that holds one CSV file per target language
DATA_DIR = '/content/drive/MyDrive/CAS NLP/Final_project/Dataset/data_for_training'

#Load the datasets (dtype=str asks pandas to read every column as text)
italian = pd.read_csv(f'{DATA_DIR}/italian.csv', dtype=str)
french = pd.read_csv(f'{DATA_DIR}/french.csv', dtype=str)
spanish = pd.read_csv(f'{DATA_DIR}/spanish.csv', dtype=str)
romanian = pd.read_csv(f'{DATA_DIR}/romanian.csv', dtype=str)
portuguese = pd.read_csv(f'{DATA_DIR}/portuguese.csv', dtype=str)

#Create a mapping to iterate in the dataframes
##The keys are also used to look up the model-specific prefix of each language,
##and the insertion order is the order in which the datasets are concatenated later
languages = {
    'Italian': italian,
    'French': french,
    'Spanish': spanish,
    'Romanian': romanian,
    'Portuguese': portuguese
}

In [None]:
#@title Load tokenizer and model
#models: google-t5/t5-small, google-t5/t5-base, google-t5/t5-large, google/flan-t5-small, google/flan-t5-base, google/flan-t5-large, Helsinki-NLP/opus-mt-en-roa
#Get the tokenizer and the model
# `checkpoint` is also inspected by later cells: the substrings 'opus' / 't5'
# select the input prefix and the LoRA configuration.
checkpoint = "Helsinki-NLP/opus-mt-en-roa"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
#@title Pre-process dataset and append model-specific prefix
#Target languages, in the order shared by both prefix mappings
target_languages = ('Italian', 'French', 'Spanish', 'Romanian', 'Portuguese')

#T5-style models get a natural-language instruction in front of the source text
prefix_mapping_T5 = {
    language: f'translate English to {language}: '
    for language in target_languages
}

#OPUS models get a target-language token (>>xxx<<) in front of the source text
prefix_mapping_OPUS = {
    language: f'>>{code}<< '
    for language, code in zip(target_languages, ('ita', 'fra', 'spa', 'ron', 'por'))
}

#Function to transform the dataset and append the correct prefix
##Returns the dataset in the Hugging Face dataset format
def transform_dataset_append_prefix(name, df):
  """Build a Hugging Face ``Dataset`` of translation pairs for one target language.

  Args:
    name: Language key (e.g. 'Italian') used to look up the prefix in
      ``prefix_mapping_OPUS`` or ``prefix_mapping_T5``.
    df: DataFrame with the text columns ``sourceExpression`` and
      ``targetExpression``.

  Returns:
    A ``Dataset`` with a single ``translation`` column; every entry is a dict
    ``{'en': prefix + source, 'target': target}``.

  Raises:
    KeyError: if ``name`` is not in the selected prefix mapping or one of the
      two columns is missing from ``df``.
  """
  #The prefix depends on the model family named in the global `checkpoint`;
  #a checkpoint that is neither OPUS nor T5 gets no prefix
  if 'opus' in checkpoint:
    prefix = prefix_mapping_OPUS[name]
  elif 't5' in checkpoint:
    prefix = prefix_mapping_T5[name]
  else:
    prefix = ''
  #Walk the two columns directly instead of df.iterrows(), which builds a
  #pandas Series for every single row
  dataset = [
      {'translation': {'en': prefix + source, 'target': target}}
      for source, target in zip(df['sourceExpression'], df['targetExpression'])
  ]
  return Dataset.from_pandas(pd.DataFrame(data=dataset))

In [None]:
#@title Apply tokenizer
#Tokenization settings used by preprocess_function
##max_length is passed to the tokenizer together with truncation=True
max_length = 128
source_lang = "en"

def preprocess_function(examples):
  """Tokenize a batch of translation pairs to prepare them for fine-tuning.

  Args:
    examples: Batch whose "translation" entry is a list of dicts holding the
      source text under `source_lang` and the reference text under 'target'.

  Returns:
    The tokenizer output for the source texts, with the reference texts
    passed as `text_target`.
  """
  pairs = examples["translation"]
  sources = [pair[source_lang] for pair in pairs]
  references = [pair['target'] for pair in pairs]
  return tokenizer(sources, text_target=references, max_length=max_length, truncation=True)


In [None]:
#Convert every dataframe to the Hugging Face dataset format and tokenize it
##Keys are the language names; the insertion order follows `languages`
datasets = {
    language_name: transform_dataset_append_prefix(language_name, language_df).map(preprocess_function, batched=True)
    for language_name, language_df in languages.items()
}

In [None]:
#With every per-language dataset prepared, merge them into one single dataset
combined_dataset = concatenate_datasets([*datasets.values()])

In [None]:
#@title Split the data into train and test
#Then we split the data into train and test set
#Split 20% for testing; the fixed seed keeps the shuffled split reproducible
train_test_split = combined_dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)
#Create a DatasetDict to hold the splits
final_dataset = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

#We create a data collator
##`model` has to be the model object, not the checkpoint name: the collator uses it
##to call prepare_decoder_input_ids_from_labels, which a plain string does not have
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
#@title Import what is needed for the evaluation
#We import the score for the evaluation
# sacrebleu_score is the metric object called by compute_metrics to score the
# decoded predictions against the decoded references.
sacrebleu_score = evaluate.load("sacrebleu")

In [None]:
#@title Functions needed to compute metrics
#Function to post-process text
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        Tuple ``(preds, labels)``: the list of stripped predictions and a list
        in which every stripped reference is wrapped in its own one-element list.
    """
    cleaned_preds = []
    for prediction in preds:
        cleaned_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels

#Function to compute the metrics
def compute_metrics(eval_preds):
    """Compute the sacreBLEU score for one set of evaluation predictions.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays; ``preds`` may
            also be a tuple, in which case its first element is used.

    Returns:
        Dict ``{"sacrebleu": score}`` with the score rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    #-100 marks ignored/padded positions and is not a real token id, so it is
    #swapped for the pad token before decoding; generated predictions can be
    #padded with -100 as well, not only the labels
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = sacrebleu_score.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )
    result = {"sacrebleu":  round(result["score"], 4)}
    return result

In [None]:
#@title Freeze the embedding layers
# Turn off gradient updates for every parameter whose name contains 'shared'.
# NOTE(review): this only has an effect when the loaded model exposes its
# embeddings under a parameter name containing 'shared' — confirm per checkpoint.
for name, param in model.named_parameters():
    if 'shared' in name:  # 'shared' is commonly used for the embedding parameter
        param.requires_grad = False

In [None]:
#@title Load the PEFT configuration
#LoRA settings shared by every supported model family
lora_kwargs = dict(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)
#Checkpoints that are not T5 get the projection layers to adapt listed explicitly
if 't5' not in checkpoint:
  lora_kwargs['target_modules'] = ['k_proj', 'v_proj', 'q_proj', 'out_proj']
peft_config = LoraConfig(**lora_kwargs)

#Load the model with the PEFT config
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

In [None]:
#@title Define hyperparameters, training arguments, and the sequence2sequence trainer
#Hyperparameters and misc
learning_rate = 1e-03
batch_size = 32
epochs = 4
#Folder for checkpoints and the final model; an empty path cannot be created on disk
##With push_to_hub=True the Hub repository name defaults to the name of this folder
output_dir = f"{checkpoint.split('/')[-1]}-finetuned"

#We define the training arguments and the trainer
##Each device processes batch_size * gradient_accumulation_steps = 128 examples per optimizer step
##NOTE(review): push_to_hub=True needs a Hugging Face login that this notebook does not perform — confirm
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=epochs,
    predict_with_generate=True,
    gradient_accumulation_steps=4,
    dataloader_num_workers=4,
    fp16=True,
    push_to_hub=True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset["train"],
    eval_dataset=final_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Ask PyTorch to release its cached, currently unused GPU memory before training starts.
torch.cuda.empty_cache()

In [None]:
#@title Start the fine-tuning process
# Runs the training configured in `training_args`; evaluation with
# compute_metrics happens once per epoch (evaluation_strategy="epoch").
trainer.train()

In [None]:
#@title Save the model
#An empty string is not a valid folder, so the model is written to the trainer's
#output directory (or to 'finetuned_model' when that directory is empty as well)
trainer.save_model(trainer.args.output_dir or 'finetuned_model')