<a href="https://colab.research.google.com/github/CaptainPlusPlus/btba_rerproduction/blob/main/btba_reproduction2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Reproduction of BTBA Model for Unsupervised Word Alignment
Article can be found here: https://aclanthology.org/2021.acl-long.24.pdf

### Requirements
* Download and preprocess the deen, fren, roen texts from https://github.com/lilt/alignment-scripts.
* Upload the `bpe` lowercased preprocessed `train` & `test` folders as well as the sentencepiece `bpe` models.

## Custom Tokenizer Definition

The article, and the alignment scripts it is compared against, use a SentencePiece tokenizer whose model and vocabulary are stored in a binary format. The SentencePiece tokenizer must therefore be wrapped so that it fits the HuggingFace models used to reproduce the article's transformer-based approach.

In [1]:
import numpy as np
import sentencepiece as spm

class CustomSentencePieceTokenizer:
    """Adapter exposing a SentencePiece model through tokenizer-style helpers.

    Piece ids of the loaded model are used unchanged. Two extra special
    tokens, ``<pad>`` and ``<mask>``, are appended directly after the model's
    own vocabulary (ids ``GetPieceSize()`` and ``GetPieceSize() + 1``).
    """

    def __init__(self, sentencepiece_model_path):
        """Load the SentencePiece model and register the special tokens.

        Args:
            sentencepiece_model_path: Path of the SentencePiece model file.

        Raises:
            FileNotFoundError: If the processor reports that loading failed.
        """
        self.sp = spm.SentencePieceProcessor()
        if not self.sp.Load(sentencepiece_model_path):
            raise FileNotFoundError("Failed to load SentencePiece model from specified path.")
        piece_size = self.sp.GetPieceSize()
        # Special tokens that are looked up in the SentencePiece model itself.
        self.special_tokens = {
            '<s>': self.sp.piece_to_id('<s>'),
            '</s>': self.sp.piece_to_id('</s>'),
            '<unk>': self.sp.piece_to_id('<unk>'),
        }
        # Special tokens that get ids right after the model's vocabulary.
        self.additional_special_tokens = {'<pad>': piece_size, '<mask>': piece_size + 1}
        self.special_token_ids = {**self.special_tokens, **self.additional_special_tokens}

    def tokenize(self, text):
        """Split ``text`` into SentencePiece pieces."""
        return self.sp.encode_as_pieces(text)

    def convert_tokens_to_ids(self, tokens):
        """Map tokens to ids; registered special tokens take precedence over the model lookup."""
        special = self.special_token_ids
        return [special[token] if token in special else self.sp.piece_to_id(token) for token in tokens]

    def convert_ids_to_tokens(self, ids):
        """Map ids back to tokens.

        Args:
            ids: Iterable of ids. Plain ints, NumPy integers and tensors
                (anything offering ``tolist()`` or whose items support
                ``int()``) are accepted.

        Returns:
            One token per id. Ids inside the SentencePiece vocabulary map to
            their piece, the appended special ids map to ``<pad>``/``<mask>``
            and every other id maps to ``'<unk>'``.
        """
        piece_size = self.sp.GetPieceSize()
        # Only the handful of special tokens is inverted here; regular ids are
        # resolved directly instead of rebuilding a full-vocabulary dict per call.
        special_by_id = {token_id: token for token, token_id in self.special_token_ids.items()}
        if hasattr(ids, "tolist"):
            ids = ids.tolist()
        tokens = []
        for raw_id in ids:
            # Tensor elements hash by identity, so normalise to a plain int
            # before any lookup.
            token_id = int(raw_id)
            if 0 <= token_id < piece_size:
                tokens.append(self.sp.id_to_piece(token_id))
            else:
                tokens.append(special_by_id.get(token_id, '<unk>'))
        return tokens

    def get_vocab_size(self):
        """Return the SentencePiece vocabulary size plus the appended special tokens."""
        return self.sp.GetPieceSize() + len(self.additional_special_tokens)

    def get_special_tokens(self):
        """Return a new dict mapping every special token to its id."""
        return {**self.special_tokens, **self.additional_special_tokens}

    def get_special_token_ids(self):
        """Return the internal special-token-to-id dict (not a copy)."""
        return self.special_token_ids


In [2]:
def test_custom_tokenizer(model_path='/content/drive/MyDrive/bachelor_thesis/data/alignment-scripts/train/bpe.deen.model'):
    """Smoke-test the custom tokenizer on one sentence and on the special tokens.

    Prints every intermediate result and checks that the special tokens
    survive a token -> id -> token round trip.

    Args:
        model_path: SentencePiece model to load. Defaults to the de-en model
            used by this notebook.

    Raises:
        AssertionError: If the special tokens do not round-trip unchanged.
    """
    tokenizer = CustomSentencePieceTokenizer(model_path)
    test_sentence = "das ist ein test."
    print("Testing tokenization of sentence:", test_sentence)
    tokens = tokenizer.tokenize(test_sentence)
    print("Tokens:", tokens)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    print("Token IDs:", token_ids)
    tokens_from_ids = tokenizer.convert_ids_to_tokens(token_ids)
    print("Tokens from IDs:", tokens_from_ids)
    print("Special Tokens:", tokenizer.get_special_tokens())
    print("Special Token IDs:", tokenizer.get_special_token_ids())
    special_tokens_test = ['<pad>', '<mask>', '<s>', '</s>', '<unk>']
    special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens_test)
    print("Special tokens to IDs:", list(zip(special_tokens_test, special_tokens_ids)))
    special_tokens_round_trip = tokenizer.convert_ids_to_tokens(special_tokens_ids)
    print("IDs back to special tokens:", special_tokens_round_trip)
    # Make the check explicit instead of relying on reading the printed output.
    assert special_tokens_round_trip == special_tokens_test, "Special tokens did not survive the id round trip."

test_custom_tokenizer()

Testing tokenization of sentence: das ist ein test.
Tokens: ['▁das', '▁ist', '▁ein', '▁test', '.']
Token IDs: [94, 158, 69, 4218, 39789]
Tokens from IDs: ['▁das', '▁ist', '▁ein', '▁test', '.']
Special Tokens: {'<s>': 1, '</s>': 2, '<unk>': 0, '<pad>': 40000, '<mask>': 40001}
Special Token IDs: {'<s>': 1, '</s>': 2, '<unk>': 0, '<pad>': 40000, '<mask>': 40001}
Special tokens to IDs: [('<pad>', 40000), ('<mask>', 40001), ('<s>', 1), ('</s>', 2), ('<unk>', 0)]
IDs back to special tokens: ['<pad>', '<mask>', '<s>', '</s>', '<unk>']


In [3]:
# Shared tokenizer instance; later cells use it for tokenization, dataset masking and padding.
tokenizer = CustomSentencePieceTokenizer('/content/drive/MyDrive/bachelor_thesis/data/alignment-scripts/train/bpe.deen.model')

## Load bpe lowercased data and save tokenized data

In [4]:
import torch

def load_data(src_file, tgt_file):
    """Read a line-aligned parallel corpus from two UTF-8 text files.

    Args:
        src_file: Path of the source-language file.
        tgt_file: Path of the target-language file.

    Returns:
        A ``(src_lines, tgt_lines)`` tuple of lists; every line has its
        surrounding whitespace stripped.

    Raises:
        AssertionError: If the two files contain a different number of lines.
    """
    with open(src_file, 'r', encoding='utf-8') as src_handle, \
            open(tgt_file, 'r', encoding='utf-8') as tgt_handle:
        # Iterating a text file yields its lines one by one.
        src_lines = list(map(str.strip, src_handle))
        tgt_lines = list(map(str.strip, tgt_handle))
    line_counts_match = len(src_lines) == len(tgt_lines)
    assert line_counts_match, "Source and target files should have the same number of lines."
    return src_lines, tgt_lines

def tokenize_and_save_data(src_lines, tgt_lines, tokenizer, src_path, tgt_path):
    """Tokenize both sides of the corpus and persist them with ``torch.save``.

    Each line is tokenized, converted to ids and stored as a 1-D
    ``torch.long`` tensor. The source and target tensor lists are written to
    ``src_path`` and ``tgt_path`` respectively, and a confirmation is printed.

    Args:
        src_lines: Source sentences.
        tgt_lines: Target sentences.
        tokenizer: Object offering ``tokenize`` and ``convert_tokens_to_ids``.
        src_path: Output file for the tokenized source side.
        tgt_path: Output file for the tokenized target side.
    """
    # NOTE(review): the corpus files are named *.bpe; if they already contain
    # SentencePiece pieces, tokenizing them again may not reproduce those
    # pieces. Confirm against the data.
    def encode(lines):
        encoded = []
        for text in lines:
            pieces = tokenizer.tokenize(text)
            piece_ids = tokenizer.convert_tokens_to_ids(pieces)
            encoded.append(torch.tensor(piece_ids, dtype=torch.long))
        return encoded

    src_tensors = encode(src_lines)
    tgt_tensors = encode(tgt_lines)
    torch.save(src_tensors, src_path)
    torch.save(tgt_tensors, tgt_path)
    print(f"Tokenized data saved to {src_path} and {tgt_path}")


In [5]:
# All corpus artefacts live in the same training directory on the mounted drive.
train_dir = '/content/drive/MyDrive/bachelor_thesis/data/alignment-scripts/train'

# Lowercased, BPE-preprocessed parallel text (inputs).
src_file_path = f'{train_dir}/deen.lc.src.bpe'
tgt_file_path = f'{train_dir}/deen.lc.tgt.bpe'
# SentencePiece model belonging to the de-en pair.
model_path = f'{train_dir}/bpe.deen.model'
# Tokenized tensor lists (outputs of the tokenization step).
tokenized_src_path = f'{train_dir}/deen_tokenized_src.pt'
tokenized_tgt_path = f'{train_dir}/deen_tokenized_tgt.pt'

In [None]:
# Read the parallel training corpus, then tokenize it and write the tensors to the configured paths.
src_lines, tgt_lines = load_data(src_file_path, tgt_file_path)
tokenize_and_save_data(src_lines, tgt_lines, tokenizer, tokenized_src_path, tokenized_tgt_path)

Tokenized data saved to /content/drive/MyDrive/bachelor_thesis/data/alignment-scripts/train/deen_tokenized_src.pt and /content/drive/MyDrive/bachelor_thesis/data/alignment-scripts/train/deen_tokenized_tgt.pt


In [6]:
from transformers import BartConfig, BartModel
import torch.nn as nn

class BTBADecoderLayer(nn.Module):
    """Decoder layer: self-attention, cross-attention and an optional feed-forward block.

    Both attention blocks are ``nn.MultiheadAttention`` modules created
    without ``batch_first``. The feed-forward block is only created when
    ``is_last_layer`` is false.

    Args:
        config: Object providing ``d_model``, ``decoder_attention_heads``,
            ``decoder_ffn_dim`` and ``dropout``.
        is_last_layer: When true, the layer has no feed-forward block.
    """

    def __init__(self, config, is_last_layer=False):
        super().__init__()
        width = config.d_model
        heads = config.decoder_attention_heads
        self.self_attn = nn.MultiheadAttention(width, heads)
        self.multihead_attn = nn.MultiheadAttention(width, heads)
        self.layer_norm1 = nn.LayerNorm(width)
        self.layer_norm2 = nn.LayerNorm(width)
        self.is_last_layer = is_last_layer
        if not is_last_layer:
            hidden = config.decoder_ffn_dim
            self.ffn = nn.Sequential(
                nn.Linear(width, hidden),
                nn.ReLU(),
                nn.Linear(hidden, width),
            )
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x, memory, src_mask=None, tgt_mask=None):
        """Apply the layer to ``x`` while attending over ``memory``.

        Args:
            x: Decoder-side input, used as query, key and value of the self-attention.
            memory: Encoder-side states, used as key and value of the cross-attention.
            src_mask: Passed as ``key_padding_mask`` to the cross-attention.
            tgt_mask: Passed as ``key_padding_mask`` to the self-attention.

        Returns:
            The transformed tensor.
        """
        # Self-attention with residual connection and layer norm.
        self_context, _ = self.self_attn(x, x, x, key_padding_mask=tgt_mask)
        x = self.layer_norm1(x + self.dropout(self_context))

        # Cross-attention over the encoder memory, again residual + layer norm.
        cross_context, _ = self.multihead_attn(x, memory, memory, key_padding_mask=src_mask)
        x = self.layer_norm2(x + self.dropout(cross_context))

        if self.is_last_layer:
            return x
        # NOTE(review): the feed-forward output replaces x without a residual
        # connection or layer norm; confirm this is intended.
        return self.ffn(x)

class BTBAModel(BartModel):
    """``BartModel`` whose decoder layers are replaced by ``BTBADecoderLayer`` modules.

    Only the last of the ``config.decoder_layers`` replacement layers is
    created with ``is_last_layer=True``.
    """

    def __init__(self, config):
        super().__init__(config)
        # NOTE(review): the replacement layers take (x, memory, src_mask, tgt_mask);
        # confirm the BART decoder calls its layers with a compatible signature.
        final_index = config.decoder_layers - 1
        replacement_layers = []
        for layer_index in range(config.decoder_layers):
            replacement_layers.append(
                BTBADecoderLayer(config, is_last_layer=(layer_index == final_index))
            )
        self.decoder.layers = nn.ModuleList(replacement_layers)


## Dynamic Masking for the Dataset

* Every word in a sentence should be masked only once across the entire training - track the masking state and reset it after each epoch.
* Percentage-based: At least 10% of the words in each sentence must be masked, or one word if the sentence has fewer than ten words.

In [7]:
from torch.utils.data import Dataset

class DynamicMaskingDataset(Dataset):
    """Parallel-sentence dataset that masks a fresh subset of tokens on every access.

    For each sentence the positions that were already masked are remembered,
    so a position is not masked again until every maskable position of that
    sentence has been used once; then the bookkeeping for it starts over.
    Source and target sentences are tracked separately, and the stored
    tensors are never modified.

    NOTE(review): the bookkeeping lives on this object. A ``DataLoader`` with
    ``num_workers > 0`` works on per-worker copies of the dataset, so the
    tracking would then be per worker. Confirm this is acceptable.

    Args:
        tokenized_src: Sequence of 1-D id tensors (source sentences).
        tokenized_tgt: Sequence of 1-D id tensors (target sentences).
        tokenizer: Object whose ``get_special_token_ids()`` returns a dict
            with the keys ``'<mask>'``, ``'<pad>'``, ``'<s>'`` and ``'</s>'``.
        mask_probability: Fraction of a sentence's tokens to mask per access
            (at least one token is masked when any position is maskable).
    """

    def __init__(self, tokenized_src, tokenized_tgt, tokenizer, mask_probability=0.1):
        self.tokenized_src = tokenized_src
        self.tokenized_tgt = tokenized_tgt
        self.tokenizer = tokenizer
        special_ids = tokenizer.get_special_token_ids()
        self.mask_id = special_ids['<mask>']
        self.pad_id = special_ids['<pad>']
        self.mask_probability = mask_probability
        # Positions already masked per sentence. ``mask_tracker`` is the
        # target-side tracker and the default of ``mask_input``; the source
        # side has its own so that positions of the two sides never mix.
        self.mask_tracker = {i: set() for i in range(len(tokenized_tgt))}
        self.src_mask_tracker = {i: set() for i in range(len(tokenized_src))}

    def __len__(self):
        """Return the number of sentence pairs (length of the target side)."""
        return len(self.tokenized_tgt)

    def mask_input(self, inputs, idx, tracker=None):
        """Return a masked copy of ``inputs`` together with its labels.

        Args:
            inputs: 1-D id tensor of one sentence; it is left unmodified.
            idx: Index of the sentence, used as key into the tracker.
            tracker: Dict mapping sentence index to the set of positions
                already masked. Defaults to ``self.mask_tracker``.

        Returns:
            ``(masked_inputs, labels)``. ``masked_inputs`` is a copy of
            ``inputs`` with the chosen positions set to the mask id.
            ``labels`` holds the original ids, with ``-100`` at every position
            that is padding, ``<s>`` or ``</s>``.
        """
        if tracker is None:
            tracker = self.mask_tracker
        special_ids = self.tokenizer.get_special_token_ids()

        # Only regular tokens may be masked.
        candidate_mask = (
            (inputs != self.pad_id)
            & (inputs != special_ids['<s>'])
            & (inputs != special_ids['</s>'])
        )
        candidate_positions = candidate_mask.nonzero(as_tuple=True)[0].tolist()

        already_masked = tracker.setdefault(idx, set())
        available = [pos for pos in candidate_positions if pos not in already_masked]
        if not available:
            # Every maskable position was used once: start a new cycle.
            already_masked.clear()
            available = candidate_positions

        num_to_mask = max(int(len(inputs) * self.mask_probability), 1)
        num_to_mask = min(num_to_mask, len(available))

        labels = inputs.clone()
        labels[~candidate_mask] = -100

        # Mask a copy: writing into ``inputs`` would overwrite the stored
        # corpus and leak mask ids into the labels of later accesses.
        masked_inputs = inputs.clone()
        if num_to_mask > 0:
            chosen = np.random.choice(available, size=num_to_mask, replace=False).tolist()
            already_masked.update(chosen)
            masked_inputs[chosen] = self.mask_id
        return masked_inputs, labels

    def __getitem__(self, idx):
        """Return the masked source ids and the target labels of pair ``idx``."""
        src, _ = self.mask_input(self.tokenized_src[idx], idx, self.src_mask_tracker)
        # The masked target itself is not returned, but the call keeps the
        # target-side tracker advancing exactly as before.
        _, tgt_labels = self.mask_input(self.tokenized_tgt[idx], idx)

        return {"input_ids": src, "labels": tgt_labels}  # Ensure labels match the format expected by the model



In [8]:
# Reload the tokenized tensor lists that tokenize_and_save_data wrote with torch.save.
tokenized_src = torch.load(tokenized_src_path)
tokenized_tgt = torch.load(tokenized_tgt_path)

In [9]:
# Wrap the tokenized corpus in the masking dataset, keeping its default mask_probability of 0.1.
dataset = DynamicMaskingDataset(tokenized_src, tokenized_tgt, tokenizer)

In [10]:
from transformers import BartForConditionalGeneration, BartConfig

# Start from the bart-large hyperparameters, then align the vocabulary size and
# the special-token ids with the custom SentencePiece tokenizer. The stock
# config assumes BART's own vocabulary, where e.g. the padding id differs from
# the tokenizer's '<pad>' id.
config = BartConfig.from_pretrained('facebook/bart-large')
config.decoder_ffn_dim = 3072
special_token_ids = tokenizer.get_special_token_ids()
config.vocab_size = tokenizer.get_vocab_size()
config.pad_token_id = special_token_ids['<pad>']
config.bos_token_id = special_token_ids['<s>']
config.eos_token_id = special_token_ids['</s>']
# BART starts decoding from the end-of-sentence token.
config.decoder_start_token_id = special_token_ids['</s>']
config.forced_eos_token_id = special_token_ids['</s>']

# The model is built from the config only, so its weights are freshly initialised.
model = BartForConditionalGeneration(config)

# Now model.forward should be able to take 'labels' and compute loss if they are provided.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

In [14]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Pad a list of dataset items into one batch.

    Args:
        batch: List of dicts holding 1-D tensors under ``'input_ids'`` and
            ``'labels'``.

    Returns:
        Dict with ``'input_ids'`` (padded with the tokenizer's ``<pad>`` id),
        ``'attention_mask'`` (1 for real tokens, 0 for padding) and
        ``'labels'`` (padded with ``-100``). All tensors are batch-first.
    """
    pad_id = tokenizer.get_special_token_ids()['<pad>']
    input_ids = [item['input_ids'] for item in batch]
    labels = [item['labels'] for item in batch]

    # Pad sequences to the longest sequence in the batch
    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=pad_id)
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=-100)
    # Build the mask from the true sequence lengths so the model can ignore
    # the padded positions.
    attention_mask = pad_sequence(
        [ids.new_ones(len(ids)) for ids in input_ids],
        batch_first=True,
        padding_value=0,
    )

    return {
        'input_ids': input_ids_padded,
        'attention_mask': attention_mask,
        'labels': labels_padded,
    }


In [12]:
from torch.nn.functional import cross_entropy
from transformers import Trainer, TrainingArguments

class CustomTrainer(Trainer):
    """Trainer computing token-level cross-entropy with ``-100`` as ignore index."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on ``inputs`` and compute the loss against ``inputs['labels']``.

        Args:
            model: Model to call with ``**inputs``; its output must expose ``logits``.
            inputs: Dict of model inputs, optionally containing ``'labels'``.
            return_outputs: When true, return ``(loss, outputs)`` instead of the loss.
            num_items_in_batch: Accepted because newer ``Trainer`` versions
                pass it as a keyword; it is not used here.

        Returns:
            The mean cross-entropy over all label positions that are not
            ``-100`` (``None`` when no labels are given), optionally together
            with the model outputs.
        """
        labels = inputs.get("labels")
        # The labels stay in the inputs: a BART model derives its decoder
        # inputs from them. Without them it falls back to the (shifted) source
        # ids, and the logits would no longer line up with the labels.
        outputs = model(**inputs)

        if labels is None:
            loss = None
        else:
            logits = outputs.logits
            loss = cross_entropy(
                logits.reshape(-1, logits.size(-1)),
                labels.reshape(-1),
                ignore_index=-100,
            )
        return (loss, outputs) if return_outputs else loss



In [18]:
from transformers import Trainer, TrainingArguments
from torch.utils.data import DataLoader
import numpy as np  # Ensure numpy is imported
import torch.multiprocessing as mp

# This notebook has no held-out split, so periodic evaluation and
# best-checkpoint selection (which both need an eval dataset) are disabled.
# Re-enable them together with an `eval_dataset` once a validation split exists.
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="no",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,  # Reduced batch size
    gradient_accumulation_steps=4,  # Accumulate gradients to emulate a larger batch
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=False,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
)

# Stand-alone DataLoader for the manual training / evaluation helpers below
# (the Trainer builds its own loader from `train_dataset`).
# The dataset keeps its mask bookkeeping on the object itself; worker
# processes would each operate on their own copy, so loading stays in the
# main process.
data_loader = DataLoader(
    dataset,
    batch_size=16,  # Adjust if necessary
    shuffle=True,
    num_workers=0,
    pin_memory=torch.cuda.is_available(),
    collate_fn=collate_fn
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=collate_fn,
)


RuntimeError: CUDA error: CUDA-capable device(s) is/are busy or unavailable
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Start training with the trainer configured in the previous cell.
trainer.train()

In [21]:
import torch

# Release cached, currently unused GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()

# When CUDA is available, also collect GPU memory that was released by CUDA IPC.
if torch.cuda.is_available():
    torch.cuda.ipc_collect()  # reclaims IPC-shared GPU memory; this is not Python garbage collection


In [None]:
def evaluate_with_talp(model, tokenizer, data_loader, device):
    """Run the model over ``data_loader`` and collect the processed predictions.

    The model is switched to evaluation mode and moved to ``device``. For
    every batch the arg-max over the last logits dimension is taken and handed
    to ``process_alignment_data``; the per-batch results are concatenated.

    Args:
        model: Callable on a batch of input ids, returning an object with ``logits``.
        tokenizer: Forwarded to ``process_alignment_data``.
        data_loader: Iterable of dicts holding an ``'input_ids'`` tensor.
        device: Device the model and the inputs are moved to.

    Returns:
        List with the concatenated results of ``process_alignment_data``.
    """
    model.eval()
    model.to(device)
    collected = []
    with torch.no_grad():  # inference only, no gradients needed
        for batch in data_loader:
            logits = model(batch['input_ids'].to(device)).logits
            predicted_ids = logits.argmax(dim=-1)
            collected += process_alignment_data(predicted_ids, tokenizer)
    return collected

def process_alignment_data(generated_tokens, tokenizer):
    """Convert rows of predicted ids into rows of tokens.

    Args:
        generated_tokens: 2-D tensor of ids, or an iterable whose rows are
            tensors or lists of ids.
        tokenizer: Object offering ``convert_ids_to_tokens``.

    Returns:
        One list of tokens per row.
    """
    def as_int_list(row):
        # Tensor elements are not equal-by-hash to plain ints, which breaks
        # dict-based id lookups; hand the tokenizer plain Python ints instead.
        return row.tolist() if hasattr(row, "tolist") else row

    rows = as_int_list(generated_tokens)
    return [tokenizer.convert_ids_to_tokens(as_int_list(row)) for row in rows]


In [None]:
def full_context_based_optimization(model, train_dataloader, optimizer, scheduler, device, num_iterations=50):
    """Train ``model`` for ``num_iterations`` full passes over ``train_dataloader``.

    Per batch: forward pass with labels, backward pass on the model's loss,
    optimizer step, scheduler step, then the gradients are cleared.

    Args:
        model: Callable as ``model(input_ids, labels=labels)``, returning an
            object with a ``loss``.
        train_dataloader: Iterable of dicts holding ``'input_ids'`` and ``'labels'``.
        optimizer: Optimizer stepped after every batch.
        scheduler: Learning-rate scheduler stepped after every batch.
        device: Device the batch tensors are moved to.
        num_iterations: Number of passes over the whole dataloader.
    """
    model.train()
    for _ in range(num_iterations):
        for batch in train_dataloader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            batch_loss = model(input_ids, labels=labels).loss
            batch_loss.backward()

            optimizer.step()
            scheduler.step()
            # Clear the gradients so the next batch starts from zero.
            optimizer.zero_grad()


In [None]:
def symmetrize_alignments(ltr_alignments):
    """Build a combined lookup from a source-to-targets alignment mapping.

    Args:
        ltr_alignments: Dict mapping a source index to a list of target indices.

    Returns:
        Dict with one list per index. An index that occurs as a target lists
        the sources aligned to it; an index that occurs as a source
        additionally lists its targets.
    """
    # NOTE(review): source and target indices share one key space here, so a
    # source index i and a target index i end up in the same list. Confirm
    # that callers expect this.
    merged = {}

    # Pass 1: invert the mapping (target index -> sources aligned to it).
    for source, targets in ltr_alignments.items():
        for target in targets:
            merged.setdefault(target, []).append(source)

    # Pass 2: add the forward direction, keeping only targets whose inverted
    # entry contains this source.
    for source, targets in ltr_alignments.items():
        bucket = merged.setdefault(source, [])
        bucket.extend([target for target in targets if source in merged.get(target, [])])

    return merged


In [None]:
import gc

# Run Python garbage collection first, then release PyTorch's cached GPU memory.
gc.collect()
torch.cuda.empty_cache()