# English to Spanish Fine Tuning

Starting from the Hugging Face Translation Tutorial (https://huggingface.co/learn/nlp-course/en/chapter7/4#preparing-the-data)

In [49]:
from tqdm import tqdm
import numpy as np

# hugging face translation tutorial
from datasets import load_dataset
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
import evaluate
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
# custom training loop
from torch.utils.data import DataLoader
from torch.optim import AdamW
from accelerate import Accelerator
from transformers import get_scheduler
from tqdm.auto import tqdm

# torch
import torch
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

device

'cpu'

## Notes

### 2025-03-21
* May be interested in using Subword Segmentation on rare/novel words?
* May be interested in using a different dataset than `Helsinki-NLP/kde4`
  * Weird errors, like English words attached to the end of Spanish translations
  * Small sentences, maybe this was a textbook?

**Tip**: Restart the kernel every time you have to download something...

Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [2]:
#pip install tf-keras --user

In [3]:
#pip install transformers

In [None]:
#!pip install transformers[torch] OR !pip install 'accelerate>=0.26.0

In [4]:
#!pip install datasets

In [5]:
#!pip install evaluate

In [6]:
#!pip install sentencepiece

In [7]:
#!pip install sacrebleu

## Load Dataset(s)

In [13]:
# Use the Helsinki-NLP/kde4 dataset (https://huggingface.co/datasets/Helsinki-NLP/kde4),
# requesting the English/Spanish language pair.
# This dataset has a good amount of oddness/errors; we should probably consider something else.
raw_datasets = load_dataset("kde4", lang1="en", lang2="es", trust_remote_code=True)

In [14]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 218655
    })
})

In [15]:
# The dataset only ships a "train" split, so hold out 10% of it for validation.
# The fixed seed keeps the split reproducible; the held-out part is renamed
# from "test" to "validation".
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
split_datasets["validation"] = split_datasets.pop("test")

In [16]:
split_datasets["train"][84]["translation"]

{'en': 'Left Arrow', 'es': 'Tecla de dirección izquierda.'}

In [17]:
split_datasets["train"][22]["translation"] # interesting...

{'en': 'Edit and paint images', 'es': 'Editar y pintar imágenesName'}

## Process Data

In [18]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-es"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")

In [19]:
split_datasets["train"][66]["translation"]

{'en': 'Removes the selected render mode.',
 'es': 'Elimina el modo de procesado seleccionado.'}

In [20]:
en_sentence = split_datasets["train"][66]["translation"]["en"]
es_sentence = split_datasets["train"][66]["translation"]["es"]

inputs = tokenizer(en_sentence, text_target=es_sentence)
inputs

{'input_ids': [23292, 9, 5, 4836, 17058, 9888, 3, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1], 'labels': [47448, 14, 1160, 4, 38117, 15569, 3, 0]}

In [21]:
# Demo only: tokenize the Spanish sentence as if it were a source (English) input.
# It gets split into many more pieces than when it is tokenized as a target via
# `text_target`, which confirms the tokenizer is being applied correctly above.
wrong_targets = tokenizer(es_sentence)
print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))
print(tokenizer.convert_ids_to_tokens(inputs["labels"]))

['▁Eli', 'mina', '▁el', '▁mod', 'o', '▁de', '▁pro', 'ces', 'ado', '▁s', 'ele', 'c', 'cion', 'ado', '.', '</s>']
['▁Elimina', '▁el', '▁modo', '▁de', '▁procesado', '▁seleccionado', '.', '</s>']


In [22]:
# Token cap applied to both source and target; the KDE4 strings do not seem
# to be very long, so this should rarely truncate.
max_length = 128

def preprocess_function(examples):
    """Tokenize a batch of translation pairs.

    Args:
        examples: batch whose "translation" entry is a list of dicts with
            "en" and "es" keys.

    Returns:
        The tokenizer output for the English texts, with the Spanish texts
        tokenized as targets (passed through ``text_target``).
    """
    english_texts = []
    spanish_texts = []
    for pair in examples["translation"]:
        english_texts.append(pair["en"])
        spanish_texts.append(pair["es"])

    return tokenizer(
        english_texts,
        text_target=spanish_texts,
        max_length=max_length,
        truncation=True,
    )

⚠️ We don’t pay attention to the attention mask of the targets, as the model won’t expect it. Instead, the labels corresponding to a padding token should be set to -100 so they are ignored in the loss computation. This will be done by our data collator later on since we are applying dynamic padding, but if you use padding here, you should adapt the preprocessing function to set all labels that correspond to the padding token to -100.

In [23]:
# Tokenize every split in batches. The columns present in the train split are
# removed so that only the tokenizer outputs remain.
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

# Model

In [24]:
# demo
model_checkpoint = "Helsinki-NLP/opus-mt-en-es"
translator = pipeline("translation", model=model_checkpoint)
translator("Have you considered horse?")




Device set to use cpu


[{'translation_text': '¿Has considerado el caballo?'}]

In [25]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

### Data Collation
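Deals with padding for dynamic batching: every batch is padded only up to the length of its longest example, rather than to one global maximum length.

Labels are padded as well, using `-100` instead of the tokenizer's pad id so that the padded positions are ignored by the loss (see the `batch["labels"]` output below).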

In [26]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [27]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [28]:
batch["labels"]

tensor([[ 3244,   755,     0,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100],
        [  664,  1532, 12100,    19, 17220,  6533,  2834,  1257, 15569,     3,
             0]])

In [29]:
batch["decoder_input_ids"]

tensor([[65000,  3244,   755,     0, 65000, 65000, 65000, 65000, 65000, 65000,
         65000],
        [65000,   664,  1532, 12100,    19, 17220,  6533,  2834,  1257, 15569,
             3]])

In [30]:
# labels for first and second elements in dataset
for i in range(1, 3):
    print(tokenized_datasets["train"][i]["labels"])

[3244, 755, 0]
[664, 1532, 12100, 19, 17220, 6533, 2834, 1257, 15569, 3, 0]


### Metrics

In [31]:
metric = evaluate.load("sacrebleu")

In [32]:
def compute_metrics(eval_preds):
    """Decode predictions and references, then score them with SacreBLEU.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            itself be a tuple, in which case its first element is used.

    Returns:
        dict with a single key ``"bleu"`` holding the SacreBLEU score.
    """
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the ignore index used for padding and is not a valid vocabulary
    # id, so it cannot be decoded. Predictions can be padded with it as well
    # as labels, so both are mapped back to the pad token first.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing: the metric takes a list of references
    # per prediction, hence the single-element inner lists.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

### Fine Tuning - Custom Training Loop

In [41]:
# Make the tokenized datasets return torch tensors (this changes
# `tokenized_datasets` in place).
tokenized_datasets.set_format("torch")
# Training batches are shuffled; `data_collator` builds and pads each batch.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
# Validation batches keep dataset order (no shuffle argument is passed).
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)

In [42]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [45]:
optimizer = AdamW(model.parameters(), lr=2e-5)

In [51]:
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [52]:
num_train_epochs = 3
# One optimizer update per batch, so steps per epoch equals the dataloader length.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" schedule with no warmup, spanning every training step of the run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [53]:
# push to the Hugging Face Hub (needs login)
# from huggingface_hub import Repository, get_full_repo_name

# model_name = "marian-finetuned-kde4-en-to-es-accelerate"
# repo_name = get_full_repo_name(model_name)
# output_dir = "marian-finetuned-kde4-en-to-es-accelerate"
# repo = Repository(output_dir, clone_from=repo_name)

In [54]:
def postprocess(predictions, labels):
    """Turn gathered prediction and label tensors into stripped text.

    Args:
        predictions: tensor of generated token ids.
        labels: tensor of reference token ids, where -100 marks ignored positions.

    Returns:
        ``(decoded_preds, decoded_labels)``: a list of prediction strings and a
        list of single-element lists of reference strings.
    """
    pred_ids = predictions.cpu().numpy()
    label_ids = labels.cpu().numpy()

    # -100 cannot be decoded, so swap it for the pad token id first.
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)

    pred_texts = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    return (
        [text.strip() for text in pred_texts],
        [[text.strip()] for text in label_texts],
    )

In [55]:
# One bar for the whole run: it advances once per training batch across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        # The forward pass returns an object exposing `.loss`, which is
        # backpropagated through the accelerator.
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            # generate() is called on the unwrapped model.
            # NOTE(review): 128 matches `max_length` from preprocessing — confirm they should stay in sync.
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=128,
            )
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        generated_tokens = accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
        )
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(generated_tokens)
        labels_gathered = accelerator.gather(labels)

        # Decode to text and accumulate this batch into the metric.
        decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    # Score everything accumulated through add_batch during this epoch.
    results = metric.compute()
    print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")

    # Save and upload (disabled; relies on `output_dir` and `repo` from the
    # commented-out Hub setup cell)
    # accelerator.wait_for_everyone()
    # unwrapped_model = accelerator.unwrap_model(model)
    # unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    # if accelerator.is_main_process:
    #     tokenizer.save_pretrained(output_dir)
    #     repo.push_to_hub(
    #         commit_message=f"Training in progress epoch {epoch}", blocking=False
    #     )

  0%|          | 0/73797 [00:00<?, ?it/s]

KeyboardInterrupt: 

### Fine Tuning - Pre Defined Loop

Log into hugging face here:
```py
from huggingface_hub import notebook_login
notebook_login()
```
OR
```sh
huggingface-cli login
```

In [34]:
# Training configuration for the Seq2SeqTrainer run.
args = Seq2SeqTrainingArguments(
    "marian-finetuned-kde4-en-to-es",  # output directory for checkpoints
    eval_strategy="no",  # evaluation is triggered manually before/after training
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,  # score generated text (needed for BLEU)
    # Mixed precision speeds up training on modern GPUs but is not available
    # on CPU, so only enable it when CUDA is actually present.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,  # Hub upload is not set up for this notebook
)

In [37]:
# Wire the model, training arguments, data, collator and BLEU metric together.
# NOTE(review): `model` is whichever object was last bound to that name; if the
# custom-loop cells ran first it has already been through accelerator.prepare —
# confirm a freshly loaded model is intended here.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [39]:
# score original model as baseline to ensure fine tuning actually improves
trainer.evaluate(max_length=max_length) # takes time

KeyboardInterrupt: 

In [None]:
trainer.train() # takes time

In [None]:
trainer.evaluate(max_length=max_length) # takes time

### Demo with Fine Tuned Model

In [None]:
# Replace this with your own checkpoint
model_checkpoint = "huggingface-course/marian-finetuned-kde4-en-to-es"
translator = pipeline("translation", model=model_checkpoint)
translator("Default to expanded threads")

In [None]:
translator(
    "Unable to import %1 using the OFX importer plugin. This file is not the correct format."
)

## Remnants of BERT

#### Torch Datasets

* takes in inputs and outputs/labels
* interfaces with tokenizer
* handles batching

In [9]:
class MultiLabelDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes one text at a time and pairs it with its target.

    Args:
        text: indexable collection of raw strings.
        labels: indexable collection of target tensors (each item must support
            ``.clone().detach()``).
        tokenizer: tokenizer exposing ``encode_plus``.
        max_len: length every example is padded/truncated to.
    """

    def __init__(self, text, labels, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = text
        self.targets = labels
        self.max_len = max_len

    def __len__(self):
        """Number of examples, taken from the text collection."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize example ``index`` and return its tensors.

        Returns:
            dict with keys ``'ids'``, ``'mask'``, ``'token_type_ids'`` (long
            tensors built from the tokenizer output) and ``'targets'`` (a
            detached copy of the stored target).
        """
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `pad_to_max_length=True` is deprecated; this is its replacement.
            padding="max_length",
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            # Copy so callers cannot mutate the stored targets through the batch.
            'targets': self.targets[index].clone().detach()
        }

#### BERT Class

* first layer is a pretrained transformer encoder (originally BERT; the class below now uses ELECTRA)
* add whatever layers after

In [10]:
class ELECTRAClass(torch.nn.Module):
    """Wrapper around a pretrained ELECTRA sequence classifier that returns logits.

    Args:
        NUM_OUT: number of output classes, forwarded as ``num_labels``.
    """

    def __init__(self, NUM_OUT):
        # Imported locally because the notebook's import cell does not bring
        # this name into scope; without it the class fails on a fresh kernel.
        from transformers import ElectraForSequenceClassification

        super().__init__()
        self.electra = ElectraForSequenceClassification.from_pretrained(
            "google/electra-small-discriminator", num_labels=NUM_OUT
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Run the wrapped classifier and return only its ``logits``."""
        output = self.electra(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        return output.logits

#### Helpful Functions

Loss
* Tasks with only two labels use binary crossentropy
* Tasks with more labels will use categorical crossentropy
* Tasks that don't have labels, but rather have distributions should use KL divergence
* Tasks that predict continuous values rather than labels or distributions (i.e. regression) should use something like RMSE loss

Train
* Steps through the data batch by batch
* grabs ids, masks, and token_type_ids which are required inputs for BERT
* inputs are passed through the model, compared to targets, computes loss function, backprops

Validation
* Takes a model, passes inputs
* Need to use the targets from here because they are potentially shuffled!

In [11]:
def loss_fn(outputs, targets):
    """Cross-entropy loss between model outputs and class targets.

    Uses the library defaults, the same as a freshly constructed
    ``torch.nn.CrossEntropyLoss()``.
    """
    return torch.nn.functional.cross_entropy(outputs, targets)

In [12]:
def train(model, training_loader, optimizer):
    """Run one training epoch over ``training_loader``.

    Args:
        model: module called as ``model(ids, mask, token_type_ids)``.
        training_loader: iterable of batches with keys ``'ids'``, ``'mask'``,
            ``'token_type_ids'`` and ``'targets'``.
        optimizer: optimizer stepping ``model``'s parameters.

    Returns:
        The loss tensor of the final batch, detached from the autograd graph.

    Raises:
        ValueError: if ``training_loader`` yields no batches.
    """
    model.train()
    loss = None
    for data in tqdm(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.long)  # was float

        # Clear gradients once per step (the original did this twice).
        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

    if loss is None:
        raise ValueError("training_loader produced no batches")
    # Detach so the caller's reference does not keep the last graph alive.
    return loss.detach()

In [13]:
def validation(model, testing_loader):
    """Predict a class for every example in ``testing_loader``.

    Targets are collected from the same batches as the predictions, so the two
    results stay aligned even when the loader shuffles.

    Args:
        model: module called as ``model(ids, mask, token_type_ids)``.
        testing_loader: iterable of batches with keys ``'ids'``, ``'mask'``,
            ``'token_type_ids'`` and ``'targets'``.

    Returns:
        ``(predictions, targets)``: CPU tensors concatenated over all batches
        along the first dimension; predictions are the argmax over dim 1.
    """
    model.eval()
    batch_predictions = []
    batch_targets = []
    with torch.no_grad():
        for data in tqdm(testing_loader):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            outputs = model(ids, mask, token_type_ids)
            batch_predictions.append(torch.argmax(outputs, dim=1).cpu())
            # Targets are never fed to the model, so keep them on the CPU
            # next to the predictions instead of moving them to `device`.
            batch_targets.append(data['targets'].to(dtype=torch.long).cpu())
    # Concatenate whole batches rather than stacking per-example tensors.
    return torch.cat(batch_predictions), torch.cat(batch_targets)

#### The Tokenizer

* converts raw string to ids, masks, and token_type_ids

In [14]:
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# NOTE(review): this rebinds the global `tokenizer` used by the translation
# section above, and `ElectraTokenizerFast` is not imported in the import cell.
tokenizer = ElectraTokenizerFast.from_pretrained('google/electra-small-discriminator')

# what does the tokenizer do?
# NOTE(review): `train_X` is not defined anywhere in this notebook — confirm its source.
print(f"train data: {train_X[5]}")

# Encode one example without padding to inspect the raw ids and token type ids.
tokenizer.encode_plus(
            train_X[5],
            None,
            add_special_tokens=True,
            max_length=128,
            # pad_to_max_length=True,
            truncation=True,
            return_token_type_ids=True
        )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

train data: (Objective) In order to increase classification accuracy of tea-category identification (TCI) system, this paper proposed a novel approach. (Method) The proposed methods first extracted 64 color histogram to obtain color information, and 16 wavelet packet entropy to obtain the texture information. With the aim of reducing the 80 features, principal component analysis was harnessed. The reduced features were used as input to generalized eigenvalue proximal support vector machine (GEPSVM). Winner-takes-all (WTA) was used to handle the multiclass problem. Two kernels were tested, linear kernel and Radial basis function (RBF) kernel. Ten repetitions of 10-fold stratified cross validation technique were used to estimate the out-of-sample errors. We named our method as GEPSVM + RBF + WTA and GEPSVM + WTA. (Result) The results showed that PCA reduced the 80 features to merely five with explaining 99.90% of total variance. The recall rate of GEPSVM + RBF + WTA achieved the highest 

{'input_ids': [101, 1006, 7863, 1007, 1999, 2344, 2000, 3623, 5579, 10640, 1997, 5572, 1011, 4696, 8720, 1006, 22975, 2072, 1007, 2291, 1010, 2023, 3259, 3818, 1037, 3117, 3921, 1012, 1006, 4118, 1007, 1996, 3818, 4725, 2034, 15901, 4185, 3609, 2010, 3406, 13113, 2000, 6855, 3609, 2592, 1010, 1998, 2385, 4400, 7485, 14771, 23077, 2000, 6855, 1996, 14902, 2592, 1012, 2007, 1996, 6614, 1997, 8161, 1996, 3770, 2838, 1010, 4054, 6922, 4106, 2001, 17445, 2098, 1012, 1996, 4359, 2838, 2020, 2109, 2004, 7953, 2000, 18960, 1041, 29206, 10175, 5657, 4013, 9048, 9067, 2490, 9207, 3698, 1006, 16216, 4523, 2615, 2213, 1007, 1012, 3453, 1011, 3138, 1011, 2035, 1006, 21925, 1007, 2001, 2109, 2000, 5047, 1996, 4800, 26266, 3291, 1012, 2048, 16293, 2015, 2020, 7718, 1010, 7399, 16293, 1998, 15255, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

#### Training setup

* hyperparameters
* setup dataset
* setup parameters
* setup dataloader

In [15]:
# Hyperparameters for the classifier run
MAX_LEN = 128
BATCH_SIZE = 64  # original: 64
EPOCHS = 3
NUM_OUT = len(labels)  # NOTE(review): `labels` is not defined in this notebook — confirm its source
LEARNING_RATE = 2e-05

# The label arrays are numpy arrays here and are converted to tensors up front.
training_data = MultiLabelDataset(
    train_X, torch.from_numpy(train_y), tokenizer, MAX_LEN
)
test_data = MultiLabelDataset(
    test_X, torch.from_numpy(test_y), tokenizer, MAX_LEN
)

# Both loaders shuffle; validation returns targets alongside predictions,
# so the pairing survives the shuffle.
train_params = dict(batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
test_params = dict(batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

training_loader = torch.utils.data.DataLoader(training_data, **train_params)
testing_loader = torch.utils.data.DataLoader(test_data, **test_params)

#### Train, Evaluate

* model.to -> send to GPU, if available (anything computed should be put onto the GPU)
* setup optimizer - could use Stochastic Gradient Descent, but ADAM tends to work better
* for each epoch, train, show the loss, evaluate on the test data

In [16]:
# model = BERTClass(NUM_OUT)
# Build the ELECTRA classifier and move it to the selected device.
model = ELECTRAClass(NUM_OUT)
model.to(device)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

# Per epoch: train, report the loss returned by train(), then score the test loader.
for epoch in range(EPOCHS):
    loss = train(model, training_loader, optimizer)
    print(f'Epoch: {epoch}, Loss:  {loss.item()}')
    guess, targs = validation(model, testing_loader)
    # guesses = torch.max(guess, dim=1)
    guesses = guess.cpu()
    # targets = torch.max(targs, dim=1)
    targets = targs.cpu()
    # print('accuracy on test set {}'.format(accuracy_score(guesses.indices, targets.indices)))
    # NOTE(review): `accuracy_score` is not imported in this notebook — confirm where it comes from.
    print("\naccuracy on test set {}".format(accuracy_score(guesses, targets)))

pytorch_model.bin:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

100%|██████████| 719/719 [05:47<00:00,  2.07it/s]


Epoch: 0, Loss:  0.889681339263916


100%|██████████| 16/16 [00:03<00:00,  4.43it/s]



accuracy on test set 0.7269035532994924


100%|██████████| 719/719 [06:22<00:00,  1.88it/s]


Epoch: 1, Loss:  0.6426712870597839


100%|██████████| 16/16 [00:03<00:00,  4.24it/s]



accuracy on test set 0.7857868020304568


100%|██████████| 719/719 [06:22<00:00,  1.88it/s]


Epoch: 2, Loss:  0.5695027709007263


100%|██████████| 16/16 [00:03<00:00,  4.12it/s]


accuracy on test set 0.8060913705583757





In [None]:
# terminate this colab session to release resources
# from google.colab import runtime
# runtime.unassign()