# Fine-Tunning Helsinki-NLP/opus-en-af

In [1]:
!pip install sacremoses evaluate torchinfo

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses, evaluate
Successfully installed evaluate-0.4.2 sacremoses-0.1.1


## Required Modules

In [2]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
import evaluate
from torchinfo import summary
import re

2024-07-20 08:25:22.507198: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-20 08:25:22.507301: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-20 08:25:22.631380: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Dataset Path config

In [3]:
config = {
  "TRAIN_RAW": "/kaggle/input/sun-en-af/train",
  "TRAIN_DATA": "/kaggle/working/train",
  "VAL_RAW": "/kaggle/input/sun-en-af/val",
  "VAL_DATA": "/kaggle/working/val",
"TRAIN_SOURCE": [
  "data414_2021_a1.en.txt",
  "data414_2021_a2.en.txt",
  "data414_2020_a1.en.txt",
  "ss414_2018_a1.en.txt",
  "ss414_2018_a2.en.txt",
  "ss414_2018_a3.en.txt",
  "ss414_2019_a1.en.txt",
  "ss414_2019_a2.en.txt",
  "ss414_2019_a3.en.txt",
],
"VAL_SOURCE": [
  "compsys414_2017_a1.en.txt",
  "compsys414_2017_a2.en.txt",
  "compsys414_2017_a3.en.txt"
],
  "TRAIN_TARGET": [
    "data414_2021_a1.af.txt",
    "data414_2021_a2.af.txt",
    "data414_2020_a1.af.txt",
    "ss414_2018_a1.af.txt",
    "ss414_2018_a2.af.txt",
    "ss414_2018_a3.af.txt",
    "ss414_2019_a1.af.txt",
    "ss414_2019_a2.af.txt",
    "ss414_2019_a3.af.txt",
  ],
  "VAL_TARGET": [
    "compsys414_2017_a1.af.txt",
    "compsys414_2017_a2.af.txt",
    "compsys414_2017_a3.af.txt"
  ]
}

## Data Cleaning and custom dataset

In [4]:
def process_line(line):
    pattern = r'\$([^$]+)\$'
    new_text = ""
    last_end = 0
    for match in re.finditer(pattern, line):
        start, end = match.span()
        new_text += line[last_end:start] + " $" + match.group(1).replace(" ", "") + "$ "
        last_end = end

    new_text += line[last_end:]
    text = new_text
    text = re.sub(r"\s{2,}", " ", text)
    text = text.split()
    return text

def preprocess_data(train_raw_dir, train_data_dir, language_files, language_name):
    with open(f"{train_data_dir}/{language_name}.txt", 'w+') as output_file:
        for file_name in language_files:
            with open(f"{train_raw_dir}/{file_name}") as input_file:
                for line in input_file:
                    line = line.strip()
                    if len(line) > 0 and line[0] != "%":
                        processed_line = process_line(line)
                        output_file.write(" ".join(processed_line) + "\n")
    print(f"Done for {language_name}!")


In [5]:
preprocess_data(config["TRAIN_RAW"], config["TRAIN_DATA"], config["TRAIN_SOURCE"], "english")
preprocess_data(config["TRAIN_RAW"], config["TRAIN_DATA"], config["TRAIN_TARGET"], "afrikaans")

preprocess_data(config["VAL_RAW"], config["VAL_DATA"], config["VAL_SOURCE"], "english")
preprocess_data(config["VAL_RAW"], config["VAL_DATA"], config["VAL_TARGET"], "afrikaans")

Done for english!
Done for afrikaans!
Done for english!
Done for afrikaans!


In [6]:
class LangData(Dataset):
    def __init__(self, tokens):
        self.tokens = tokens
        
    def __len__(self):
        return len(self.tokens['input_ids'])

    def __getitem__(self, index):
        input_ids = self.tokens['input_ids'][index]
        labels = self.tokens['labels'][index]
        attention_mask = self.tokens['attention_mask'][index]

        return {
            'input_ids': input_ids.squeeze(),
            'attention_mask': attention_mask.squeeze(),
            'labels': labels.squeeze()
        }
    
def collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_masks = [item['attention_mask'] for item in batch]
    labels = [item['labels'] for item in batch]
    
    # Pad sequences
    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=57444)
    attention_masks_padded = pad_sequence(attention_masks, batch_first=True, padding_value=0)
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=-100)  
    return {
        'input_ids': input_ids_padded,
        'attention_mask': attention_masks_padded,
        'labels': labels_padded
    }

with open(f"{config['TRAIN_DATA']}/english.txt") as text:
    english = text.read().strip().split("\n")
with open(f"{config['TRAIN_DATA']}/afrikaans.txt") as text:
    afrikaans = text.read().strip().split("\n")
    
with open(f"{config['VAL_DATA']}/english.txt") as text:
    english_val = text.read().strip().split("\n")
with open(f"{config['VAL_DATA']}/afrikaans.txt") as text:
    afrikaans_val = text.read().strip().split("\n")

## Utility

In [7]:
def postprocess(predictions, labels):
    predictions = predictions.cpu().numpy()
    labels = labels.cpu().numpy()

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    return decoded_preds, decoded_labels

def eval_model(model, data_loader, device):
    metric = evaluate.load("bleu")
    for i in range(1, 5):
        for batch in data_loader:
            input_ = batch['input_ids']
            attnm_ = batch['attention_mask']
            labels = batch['labels']
            pred = model.generate(input_.to(device))
            pred, lab = postprocess(pred, labels)
            metric.add_batch(predictions=pred, references=lab)
        print(f"BLEU{-i}".center(80))
        print("-----" * 18)
        blues = metric.compute(max_order=i)
        for key, val in blues.items():
            print(f"{key:<20}: {val}")
        print("*****" * 18)


def train_model(model, train_loader, optimizer, device, epochs):
    train_loss = []
    N = len(train_loader.dataset)
    for epoch in range(epochs):
        pbar = tqdm(train_loader, unit="batch", desc=f"Epoch {epoch + 1}/{epochs}")
        run_loss = 0
        for batch in pbar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            run_loss += loss.item() * input_ids.size(0)
            pbar.set_postfix(loss=f"{run_loss / N:.3f}")
        train_loss.append(run_loss/N)
    return train_loss

## Load the pre-trained model and tokenizer

In [8]:
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-af", return_tensors="pt")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-af")

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/819k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/297M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [9]:
summary(model, depth=4, col_names=["num_params","trainable",])

Layer (type:depth-idx)                                       Param #                   Trainable
MarianMTModel                                                --                        Partial
├─MarianModel: 1-1                                           --                        Partial
│    └─Embedding: 2-1                                        29,411,840                True
│    └─MarianEncoder: 2-2                                    29,411,840                Partial
│    │    └─Embedding: 3-1                                   (recursive)               True
│    │    └─MarianSinusoidalPositionalEmbedding: 3-2         (262,144)                 False
│    │    └─ModuleList: 3-3                                  --                        True
│    │    │    └─MarianEncoderLayer: 4-1                     3,152,384                 True
│    │    │    └─MarianEncoderLayer: 4-2                     3,152,384                 True
│    │    │    └─MarianEncoderLayer: 4-3                     3,15

In [10]:
LR = 2e-5
device = "cuda"
model = model.to(device)
optimizer = AdamW(model.parameters(), LR)

In [11]:
tokens_train = tokenizer(english, text_target=afrikaans, return_tensors="pt", padding='longest')
tokens_eval = tokenizer(english_val, text_target=afrikaans_val, return_tensors="pt", padding='longest')

train_dataset = LangData(tokens_train)
valid_dataset = LangData(tokens_eval)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)

## Train the model

### Evaluate the model before training

In [12]:
print("Training set".center(50))
eval_model(model,train_loader, device)

                   Training set                   


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.6742814842166323
precisions          : [0.7077720207253886]
brevity_penalty     : 0.9526817456355053
length_ratio        : 0.9537667471996486
translation_length  : 8685
reference_length    : 9106
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.5695846153251003
precisions          : [0.7077720207253886, 0.5050419147126716]
brevity_penalty     : 0.9526817456355053
length_ratio        : 0.9537667471996486
translation_length  : 8685
reference_length    : 9106
******************************************************************************************
              

In [13]:
print("Validation set".center(50))
eval_model(model, valid_loader, device)

                  Validation set                  
                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.5724458204334365
precisions          : [0.5724458204334365]
brevity_penalty     : 1.0
length_ratio        : 1.1234782608695653
translation_length  : 3230
reference_length    : 2875
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.4408896339676111
precisions          : [0.5724458204334365, 0.3395669291338583]
brevity_penalty     : 1.0
length_ratio        : 1.1234782608695653
translation_length  : 3230
reference_length    : 2875
************************************************************************************

### Train the model

In [14]:
train_loss = train_model(model, train_loader, optimizer, device, 50)
np.save("train_loss.npy", np.array(train_loss))

Epoch 1/50: 100%|██████████| 8/8 [00:04<00:00,  1.77batch/s, loss=5.187]
Epoch 2/50: 100%|██████████| 8/8 [00:04<00:00,  1.94batch/s, loss=2.591]
Epoch 3/50: 100%|██████████| 8/8 [00:04<00:00,  1.94batch/s, loss=1.374]
Epoch 4/50: 100%|██████████| 8/8 [00:04<00:00,  1.94batch/s, loss=0.745]
Epoch 5/50: 100%|██████████| 8/8 [00:04<00:00,  1.94batch/s, loss=0.530]
Epoch 6/50: 100%|██████████| 8/8 [00:04<00:00,  1.94batch/s, loss=0.447]
Epoch 7/50: 100%|██████████| 8/8 [00:04<00:00,  1.94batch/s, loss=0.400]
Epoch 8/50: 100%|██████████| 8/8 [00:04<00:00,  1.94batch/s, loss=0.367]
Epoch 9/50: 100%|██████████| 8/8 [00:04<00:00,  1.94batch/s, loss=0.342]
Epoch 10/50: 100%|██████████| 8/8 [00:04<00:00,  1.94batch/s, loss=0.320]
Epoch 11/50: 100%|██████████| 8/8 [00:04<00:00,  1.94batch/s, loss=0.301]
Epoch 12/50: 100%|██████████| 8/8 [00:04<00:00,  1.93batch/s, loss=0.284]
Epoch 13/50: 100%|██████████| 8/8 [00:04<00:00,  1.94batch/s, loss=0.268]
Epoch 14/50: 100%|██████████| 8/8 [00:04<00:00,

### Re-evaluate model after training

In [15]:
print("Training set".center(50))
eval_model(model,train_loader, device)

                   Training set                   
                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.9457426249259133
precisions          : [0.9495038588754134]
brevity_penalty     : 0.9960387375845371
length_ratio        : 0.9960465627059082
translation_length  : 9070
reference_length    : 9106
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.923166020630943
precisions          : [0.9495038588754134, 0.9047121634168988]
brevity_penalty     : 0.9960387375845371
length_ratio        : 0.9960465627059082
translation_length  : 9070
reference_length    : 9106
*******************************************************

In [16]:
print("Validation set".center(50))
eval_model(model, valid_loader, device)

                  Validation set                  
                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.6743807086687942
precisions          : [0.6983441324694024]
brevity_penalty     : 0.9656853652999539
length_ratio        : 0.9662608695652174
translation_length  : 2778
reference_length    : 2875
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.5408381583255089
precisions          : [0.6983441324694024, 0.4491525423728814]
brevity_penalty     : 0.9656853652999539
length_ratio        : 0.9662608695652174
translation_length  : 2778
reference_length    : 2875
******************************************************

In [17]:
metric = evaluate.load("bleu")

In [18]:
test = next(iter(valid_loader))
predicted = model.generate(test['input_ids'].to(device))
predictions, labels = postprocess(predicted, test['labels'])
sources = tokenizer.batch_decode(test['input_ids'].to(device), skip_special_tokens=True)

In [19]:
for source, pred, lab in zip(sources, predictions, labels):
    print(f"Source    : {source[:150]}")
    print(f"Prediction: {pred[:150]}")
    print(f"Label     : {lab[0][:150]}")
    bleu = []
    for i in range(1,5):
        bleu.append(metric.compute(predictions=[pred], references=lab, max_order=i)['bleu'])
    print(f"BLEU      : {bleu}")
    print()

Source    : Below is the ordering options within this family.
Prediction: Onderste is die bestellingsopsies in hierdie gesin.
Label     : Die bestelling opsies binne die familie word gesien hieronder.
BLEU      : [0.19470019576785122, 0.0, 0.0, 0.0]

Source    : Under what circumstances would the EFM32ZG110F4 be preferred above the EFM32ZG110F32?
Prediction: Onder watter omstandighede sou die EFM32ZG110F4 bo die EFM32ZG110F32 verkies word?
Label     : Onder watter omstandighede sal die EFM32ZG110F4 verkies word bo die EFM32ZG110F32?
BLEU      : [0.9166666666666666, 0.7071067811865475, 0.464158883361278, 0.0]

Source    : Is a cross-compiler used in the creation of the application for the BB in the practical?
Prediction: Is 'n kruiskommuler gebruik in die skep van die aansoek vir die BB in die praktiese?
Label     : Is 'n kruisvertaler gebruik in die skep van die program vir die BB van die prakties?
BLEU      : [0.7647058823529411, 0.6183469424008423, 0.5032468442237513, 0.4065220433860