# Baseline

In [2]:
from datasets import load_dataset

# English-French configuration of OPUS-100; printing it lists the available splits.
opus_dataset = load_dataset(path="Helsinki-NLP/opus-100", name="en-fr")
print(opus_dataset)

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 1000000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})


Now we load the pre-trained tokenizer for the NLLB model and apply it to the English-French pair:

In [3]:
from transformers import AutoTokenizer

# Maximum number of tokens allowed per sentence (source or target).
max_tok_length = 16

checkpoint = "facebook/nllb-200-distilled-600M"
# from flores200_codes import flores_codes
src_code, tgt_code = "eng_Latn", "fra_Latn"

# NOTE(review): padding/truncation/max_length are given here at load time; the
# tokenized sample printed later in the notebook still contains more than
# max_tok_length ids, so presumably they do not act as call-time defaults — confirm.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    src_lang=src_code,
    tgt_lang=tgt_code,
    padding=True,
    pad_to_multiple_of=8,
    truncation=True,
    max_length=max_tok_length,
)

In [4]:
# Keys of the two languages inside each "translation" record.
source_lang = "en"
target_lang = "fr"


def preprocess_function_opus(batch):
    """Tokenize a batch of translation pairs.

    Args:
        batch: mapping whose "translation" entry is a sequence of records
            indexed by language key (``source_lang`` / ``target_lang``).

    Returns:
        The tokenizer output for the source texts, with the target texts
        passed as ``text_target`` (truncation enabled).
    """
    pairs = batch["translation"]
    return tokenizer(
        [pair[source_lang] for pair in pairs],
        text_target=[pair[target_lang] for pair in pairs],
        truncation=True,
    )


The Datasets library applies this processing by adding new fields to the datasets, one for each key in the dictionary returned by the preprocessing function, that is, *input_ids*, *attention_mask* and *labels*. We can check what preprocess_function_opus does on a small sample:

In [5]:
# Take the first two training pairs and show them before and after tokenization.
sample = opus_dataset["train"].select(range(2))
print(sample)
model_input = preprocess_function_opus(sample)
print(model_input)

Dataset({
    features: ['translation'],
    num_rows: 2
})
{'input_ids': [[256047, 1617, 6471, 10643, 248, 69427, 248144, 72757, 81, 2], [256047, 9680, 44575, 16023, 28665, 9174, 48993, 281, 349, 130, 8654, 248074, 4039, 45893, 6629, 216274, 108, 349, 10984, 31194, 51358, 452, 349, 20511, 16545, 248075, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[256057, 1617, 6471, 10643, 248, 69427, 248144, 57167, 81, 2], [256057, 1181, 50447, 82940, 5481, 153, 238676, 96, 99762, 28428, 25712, 138025, 34783, 79, 127, 216274, 1166, 51358, 56198, 60, 79, 55, 248116, 58146, 84655, 248075, 2]]}


In [6]:
# Show the subword pieces behind each id sequence of the tokenized sample.
for ids in model_input['input_ids']:
    print(tokenizer.convert_ids_to_tokens(ids))

['eng_Latn', '▁The', '▁time', '▁now', '▁is', '▁05', ':', '08', '▁.', '</s>']
['eng_Latn', '▁This', '▁Regulation', '▁shall', '▁enter', '▁into', '▁force', '▁on', '▁the', '▁se', 'vent', 'h', '▁day', '▁following', '▁its', '▁publication', '▁in', '▁the', '▁Of', 'ficial', '▁Journal', '▁of', '▁the', '▁European', '▁Union', '.', '</s>']


We can recover the source text by applying the tokenizer's [batch_decode](https://huggingface.co/docs/transformers/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.batch_decode) method:

In [7]:
# Decode the source ids of the sample back into text (special tokens are kept).
tokenizer.batch_decode(model_input['input_ids'])

['eng_Latn The time now is 05:08 .</s>',
 'eng_Latn This Regulation shall enter into force on the seventh day following its publication in the Official Journal of the European Union.</s>']

Now, we can apply the preprocess_function to the raw datasets (training, validation and test):

In [8]:
# Tokenize every split in batches, using 8 worker processes.
tokenized_datasets = opus_dataset.map(
    preprocess_function_opus,
    batched=True,
    num_proc=8,
)

We are going to filter the tokenized datasets by maximum number of tokens in source and target language:

In [9]:
# Keep only the pairs whose source and target both fit within max_tok_length tokens.
tokenized_datasets = tokenized_datasets.filter(
    lambda x: not (len(x["input_ids"]) > max_tok_length or len(x["labels"]) > max_tok_length),
    desc=f"Discarding source and target sentences with more than {max_tok_length} tokens",
    num_proc=8,
)

We can take a quick look at the length histogram in the source language:

In [11]:
print(tokenized_datasets)

# Tally how many training sources have each token length.
dic = {}
for row in tokenized_datasets['train']:
    n_tokens = len(row['input_ids'])
    dic[n_tokens] = dic.get(n_tokens, 0) + 1

# Print "length count" for every length that occurs, in increasing order.
for i in (k for k in range(1, max_tok_length + 1) if k in dic):
    print(f"{i:>2} {dic[i]:>3}")

DatasetDict({
    test: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 685
    })
    train: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 456517
    })
    validation: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 681
    })
})
 3 3689
 4 22266
 5 32999
 6 41868
 7 48152
 8 52195
 9 52798
10 47928
11 42356
12 35651
13 28066
14 22165
15 15418
16 10966


Checking a sample after filtering by maximum number of tokens:

In [12]:
# Print the three tokenized fields of the first five filtered training rows.
for row in tokenized_datasets['train'].select(range(5)):
    for field in ('input_ids', 'attention_mask', 'labels'):
        print(row[field])

[256047, 1617, 6471, 10643, 248, 69427, 248144, 72757, 81, 2]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[256057, 1617, 6471, 10643, 248, 69427, 248144, 57167, 81, 2]
[256047, 94124, 248079, 6158, 248116, 248066, 1482, 248130, 2]
[1, 1, 1, 1, 1, 1, 1, 1, 1]
[256057, 2353, 248116, 354, 248105, 271, 340, 29, 248116, 354, 340, 11475, 385, 2]
[256047, 3140, 15699, 117, 4062, 37124, 1259, 42430, 30, 6382, 248075, 2]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[256057, 4523, 43495, 248079, 303, 7, 248116, 52221, 247500, 5716, 1023, 340, 303, 23557, 248075, 2]
[256047, 26521, 1259, 13144, 21347, 248130, 2]
[1, 1, 1, 1, 1, 1, 1]
[256057, 2218, 15999, 835, 2]
[256047, 26779, 617, 2]
[1, 1, 1, 1]
[256057, 26779, 617, 2]


bitsandbytes is a quantization library with a Transformers integration. With this integration, you can quantize a model to 8 or 4-bits and enable many other options by configuring the BitsAndBytesConfig class. For example, you can:

<ul>
<li>set load_in_4bit=True to quantize the model to 4-bits when you load it</li>
<li>set bnb_4bit_quant_type="nf4" to use a special 4-bit data type for weights initialized from a normal distribution</li>
<li>set bnb_4bit_use_double_quant=True to use a nested quantization scheme to quantize the already quantized weights</li>
<li>set bnb_4bit_compute_dtype=torch.bfloat16 to use bfloat16 for faster computation</li>
</ul>


In [13]:
import torch
from transformers import BitsAndBytesConfig

# 4-bit NF4 weights with nested (double) quantization; computation runs in bfloat16.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

Pass the quantization_config to the from_pretrained method.

In [None]:
from transformers import AutoModelForSeq2SeqLM
from peft import prepare_model_for_kbit_training


def _load_quantized_checkpoint():
    """Load a fresh copy of ``checkpoint`` with ``quantization_config`` applied."""
    return AutoModelForSeq2SeqLM.from_pretrained(
        checkpoint,
        quantization_config=quantization_config,
    )


# Untouched copy, kept as the baseline for comparison.
model_nllb_baseline = _load_quantized_checkpoint()

# Second, independent copy that is prepared for k-bit training before fine-tuning.
model_nllb_finetuned = prepare_model_for_kbit_training(_load_quantized_checkpoint())

## Configuration

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapters on the modules named q_proj, k_proj and v_proj; bias terms are left untouched.
config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    target_modules=["q_proj", "k_proj", "v_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

In [None]:
# Wrap the model with the LoRA configuration and report how many parameters are trainable.
model_nllb_finetuned = get_peft_model(model_nllb_finetuned, config)
model_nllb_finetuned.print_trainable_parameters()

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that assembles the batches; it is configured to pad to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model_nllb_finetuned,
    pad_to_multiple_of=8,
)

## Evaluation

In [15]:
from evaluate import load

# sacreBLEU and COMET metrics used to score the translations.
bleu_metric = load("sacrebleu")
comet_metric = load("comet")

  from pkg_resources import DistributionNotFound, get_distribution


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.6.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`
Encoder model frozen.
/home/alumno.upv.es/gdipal1/envs/ta-project/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:197: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


We need to define a function compute_metrics to compute the BLEU and COMET scores at each epoch. The function below decodes the predictions, inputs and labels into text and applies some basic post-processing:

In [None]:
import numpy as np
def postprocess_text(preds, labels):
    """Strip leading and trailing whitespace from every prediction and label.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        Tuple ``(preds, labels)`` of two new lists with the stripped strings.
    """
    stripped_preds = list(map(str.strip, preds))
    stripped_labels = list(map(str.strip, labels))
    return stripped_preds, stripped_labels

def compute_metrics(eval_preds):
    """Compute BLEU, COMET and the mean generated length for a set of predictions.

    Two kinds of argument are accepted:

    * a plain ``(inputs, preds, labels)`` tuple of token-id sequences, as built
      by hand in the inference cells of this notebook;
    * the evaluation object passed by the trainer, recognised by its
      ``predictions`` attribute. Its fields are read by name
      (``predictions``, ``label_ids`` and, if present, ``inputs``) because it
      does not unpack in ``(inputs, preds, labels)`` order.

    COMET needs the source sentences. When no inputs are available the
    ``"comet"`` entry is omitted instead of raising.
    NOTE(review): to get COMET during training, the trainer has to be told to
    pass the inputs to this function — check the training arguments of the
    installed transformers version.

    Args:
        eval_preds: tuple or trainer evaluation object as described above.

    Returns:
        Dict with ``"bleu"``, ``"comet"`` (when sources are available) and
        ``"gen_len"``, each rounded to 4 decimals.
    """
    if hasattr(eval_preds, "predictions"):
        preds, labels = eval_preds.predictions, eval_preds.label_ids
        inputs = getattr(eval_preds, "inputs", None)
    else:
        inputs, preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    def _to_decodable(sequences):
        """Return plain lists of ints with negative ids replaced by the pad id."""
        return [
            [pad_id if int(tok) < 0 else int(tok) for tok in seq]
            for seq in sequences
        ]

    # Negative ids (padding/ignore markers) cannot be decoded, so they are
    # replaced in predictions as well as in labels and inputs.
    preds = _to_decodable(preds)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(_to_decodable(labels), skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # One reference per prediction, given in the explicit list-of-lists form.
    bleu_result = bleu_metric.compute(
        predictions=decoded_preds,
        references=[[ref] for ref in decoded_labels],
    )
    result = {"bleu": bleu_result["score"]}

    if inputs is not None:
        decoded_inputs = tokenizer.batch_decode(_to_decodable(inputs), skip_special_tokens=True)
        comet_result = comet_metric.compute(
            sources=decoded_inputs,
            predictions=decoded_preds,
            references=decoded_labels,
        )
        result["comet"] = comet_result["mean_score"] * 100

    # Length of each prediction, not counting padding.
    prediction_lens = [sum(tok != pad_id for tok in seq) for seq in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

## Training

In [None]:
from transformers import Seq2SeqTrainingArguments

batch_size = 32
model_name = checkpoint.rsplit("/", 1)[-1]

# Evaluate once per epoch, generating translations so text metrics can be computed.
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-en-to-fr",
    num_train_epochs=2,
    learning_rate=1e-4,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy="epoch",
    predict_with_generate=True,
    save_total_limit=2,
)

In [None]:
from transformers import Seq2SeqTrainer

# The evaluation data comes from the 'validation' split: the dataset has
# 'train', 'validation' and 'test' splits, and no split named 'valid'.
trainer = Seq2SeqTrainer(
    model_nllb_finetuned,
    args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

## Inference

Let us first load the default inference parameters of NLLB.

In [17]:
from transformers import GenerationConfig

# Generation settings stored with the checkpoint.
generation_config = GenerationConfig.from_pretrained(checkpoint)

print(generation_config)

GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "eos_token_id": 2,
  "max_length": 200,
  "pad_token_id": 1
}



We prepare the test set in batches to be translated:

In [None]:
# Group the filtered test split into batches of test_batch_size rows.
test_batch_size = 32
batch_tokenized_test = tokenized_datasets['test'].batch(test_batch_size)

Dataset({
    features: ['translation', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 22
})


We process the test set in batches, adding padding and converting to tensors, and then perform inference with num_beams = 1 and do_sample = False, that is, greedy search.

In [None]:
def evaluate_model(model):
    """Translate the batched test set with ``model`` and collect token ids.

    Every batch of ``batch_tokenized_test`` is re-tokenized from its raw text
    (padded, truncated to ``max_tok_length``) and translated with greedy
    search (``num_beams=1``, ``do_sample=False``), forcing ``tgt_code`` as the
    first generated token.

    The model is switched to evaluation mode and the inputs are moved to the
    device the model lives on, so the function does not assume a specific
    device.

    Args:
        model: model exposing ``generate``, ``eval`` and ``device``.

    Returns:
        Tuple ``(input_sequences, preds_sequences, labels_sequences)`` of flat
        lists holding one CPU tensor of token ids per test sentence.
    """
    model.eval()  # make sure dropout is disabled while generating
    device = model.device
    forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_code)

    input_sequences = []
    preds_sequences = []
    labels_sequences = []
    # Read the "translation" column once instead of on every access in the loop.
    for batch_pairs in batch_tokenized_test["translation"]:
        batch_tokenized_test_src = [pair[source_lang] for pair in batch_pairs]
        batch_tokenized_test_tgt = [pair[target_lang] for pair in batch_pairs]
        inputs = tokenizer(
            batch_tokenized_test_src,
            max_length=max_tok_length,
            truncation=True,
            return_tensors="pt",
            padding=True,
        )
        # The reference ids are only decoded back to text for scoring.
        labels = tokenizer(
            batch_tokenized_test_tgt,
            max_length=max_tok_length,
            truncation=True,
            return_tensors="pt",
            padding=True,
        )
        with torch.no_grad():
            output_batch = model.generate(
                generation_config=generation_config,
                input_ids=inputs["input_ids"].to(device),
                attention_mask=inputs["attention_mask"].to(device),
                forced_bos_token_id=forced_bos_token_id,
                max_length=max_tok_length,
                num_beams=1,
                do_sample=False,
            )
        # Extending with a 2-D tensor appends one 1-D tensor per sentence.
        input_sequences.extend(inputs["input_ids"].cpu())
        preds_sequences.extend(output_batch.cpu())
        labels_sequences.extend(labels["input_ids"].cpu())
    return input_sequences, preds_sequences, labels_sequences

## Inference Results

In [None]:
# Translate the test set with the untouched baseline model and score the output.
baseline_outputs = evaluate_model(model_nllb_baseline)
input_sequences, preds_sequences, labels_sequences = baseline_outputs

print("NLLB Baseline Results:")
nllb_baseline_result = compute_metrics(baseline_outputs)
print(f'BLEU score: {nllb_baseline_result["bleu"]}')
print(f'COMET score: {nllb_baseline_result["comet"]}')

/home/alumno.upv.es/gdipal1/envs/ta-project/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/alumno.upv.es/gdipal1/envs/ta-project/lib/pyth ...
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
You are using a CUDA device ('NVIDIA L40S') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision

BLEU score: 29.9651
COMET score: 78.1702


In [None]:
# Translate the test set with the fine-tuned model and score the output.
finetuned_outputs = evaluate_model(model_nllb_finetuned)
input_sequences, preds_sequences, labels_sequences = finetuned_outputs

print("NLLB Finetuned Results:")
nllb_finetuned_result = compute_metrics(finetuned_outputs)
print(f'BLEU score: {nllb_finetuned_result["bleu"]}')
print(f'COMET score: {nllb_finetuned_result["comet"]}')