# Creating and optimizing the BTBA Transformer model

## Preprocessing data
The Ro<->En, De<->En and Fr<->En datasets from https://github.com/lilt/alignment-scripts/tree/master are used so that results can be compared with other work.
The data is preprocessed as described in that repository and stored here under `data/`.

## Masking input

* 10% of the data is masked (at least one word per sentence).
* Every token is masked exactly once. 
* Masking sentences is done on the fly in every epoch.
* Following Och and Ney (2003), a `<bos>` token is added at the beginning of each source sentence, so that target words without an alignment in the source can be aligned to it.

In [1]:
# Masking Dependencies
import random
import itertools

In [2]:
def apply_mask(sentence, mask_index, mask_token="<x>"):
    """Return `sentence` with the word at `mask_index` swapped for `mask_token`.

    The sentence is split on whitespace and re-joined with single spaces.
    `mask_index` follows normal list indexing, so negative values count from
    the end and an out-of-range value raises IndexError.
    """
    tokens = sentence.split()
    tokens[mask_index] = mask_token
    return " ".join(tokens)

# def generate_masked_sentences(sentence, mask_token="<x>"):
#     """ Generate permutations of sentence with mask token. """
#     words = sentence.strip().split()
#     num_words = len(words)
# 
#     indices = list(range(num_words))
#     random.shuffle(indices)
# 
#     # Distribute masked indices evenly with into at most 10 sets.
#     num_sets = min(num_words, 10)
#     for i in range(num_sets):
#         current_indices = indices[i::num_sets]
#         if not current_indices:
#             continue
#         masked_sentence = words[:]
#         for index in current_indices:
#             masked_sentence[index] = mask_token
#         yield ' '.join(masked_sentence)

## Prepare data for training

In [2]:
from itertools import combinations
import random

In [1]:
def generate_masked_sentences(sentence, mask_token="<mask>", max_outputs=10):
    """Build up to `max_outputs` distinct masked copies of `sentence`.

    The sentence is split on whitespace. The first word is assumed to be a
    '<bos>' marker and is never masked. In every returned copy, 10% of the
    remaining words (rounded down, but always at least one) are replaced by
    `mask_token`, and no two copies mask the same set of positions.

    Args:
        sentence: Whitespace-tokenised sentence whose first word is kept.
        mask_token: Replacement string for masked words.
        max_outputs: Upper bound on the number of masked copies returned.

    Returns:
        A list of masked sentences in random order. It is empty when the
        sentence has no maskable word or `max_outputs` is not positive, and
        shorter than `max_outputs` when fewer distinct masks exist.
    """
    from math import comb  # local import keeps this cell self-contained

    words = sentence.strip().split()
    maskable = len(words) - 1  # position 0 is reserved for '<bos>'
    if maskable < 1 or max_outputs <= 0:
        return []

    num_to_mask = max(1, int(maskable * 0.1))
    positions = range(1, len(words))
    total = comb(maskable, num_to_mask)

    if total <= 4 * max_outputs:
        # Few possible masks: enumerating them all is cheap and lets us
        # return every one of them when there are not enough to choose from.
        every_mask = list(combinations(positions, num_to_mask))
        chosen = random.sample(every_mask, min(max_outputs, total))
    else:
        # Many possible masks: listing them all grows combinatorially with
        # sentence length, so draw random index sets and reject duplicates.
        # Fewer than a quarter of the masks are ever requested, so repeats
        # are rare and the loop ends quickly.
        seen = set()
        chosen = []
        while len(chosen) < max_outputs:
            pick = tuple(sorted(random.sample(positions, num_to_mask)))
            if pick not in seen:
                seen.add(pick)
                chosen.append(pick)

    masked_sentences = []
    for indices_to_mask in chosen:
        hidden = set(indices_to_mask)
        masked_sentences.append(
            ' '.join(mask_token if i in hidden else word for i, word in enumerate(words))
        )
    return masked_sentences


# Sample usage with preprocessed data.
# Each file is read inside a `with` block so its handle is closed as soon as
# the lines have been collected.
with open("data/alignment-scripts/train/German-English/europarl-v7.de-en.de", 'r', encoding='utf-8') as src_file:
    src_train_data = ["<bos> " + line.strip() for line in src_file]
with open("data/alignment-scripts/train/German-English/europarl-v7.de-en.en", 'r', encoding='utf-8') as tgt_file:
    tgt_train_data = [line.strip() for line in tgt_file]
# NOTE(review): generate_masked_sentences never masks the first word because it
# assumes a leading '<bos>', but the target sentences built here carry no
# '<bos>' prefix, so their first real word is never masked - confirm intent.
masked_tgt_train_data = [generate_masked_sentences(sentence) for sentence in tgt_train_data]

# Since I need different masks for each epoch, consider generating masks on-the-fly during training or generate multiple sets in advance.

# Prepare data
with open("path_to_preprocessed_german_test_file.txt", 'r', encoding='utf-8') as src_file:
    src_test_data = ["<bos> " + line.strip() for line in src_file]
with open("path_to_preprocessed_english_test_file.txt", 'r', encoding='utf-8') as tgt_file:
    tgt_test_data = [line.strip() for line in tgt_file]


NameError: name 'combinations' is not defined

## Reproduce BTBA model
### Architecture
* Start with an existing model with an architecture as close as possible to the original transformer model (Vaswani et al., 2017)
    * Used implementation existing in torch - https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html
    * Maybe in the future:
        * BART may be used for its sequence-to-sequence nature: unlike BERT, it includes a decoder as well as an encoder, and it is pretrained as a denoising autoencoder.
* Remove the feed-forward sub-layer and its associated normalization steps from the final decoder layer, so that the layer's final attention values and output can be accessed directly.

In [11]:
# Architecture dependencies - requires `torch`, `TensorFlow` >= 2.0 and `transformers`
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

In [None]:
import torch
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoder, TransformerDecoderLayer

class CustomTransformerModel(nn.Module):
    """Encoder-decoder Transformer assembled from the stock `torch.nn` layers.

    The layers are created with their default `batch_first=False`, so every
    tensor is sequence-first: token-id inputs are (seq_len, batch) and the
    returned logits are (tgt_len, batch, input_dim).

    Args:
        input_dim: Vocabulary size, shared by both embeddings and the output
            projection.
        model_dim: Embedding / hidden width of the Transformer.
        num_heads: Number of attention heads per layer.
        num_encoder_layers: Number of stacked encoder layers.
        num_decoder_layers: Number of stacked decoder layers.
        dropout: Dropout probability used inside every layer.
        max_len: Longest sequence for which a positional vector exists.
    """

    def __init__(self, input_dim, model_dim, num_heads, num_encoder_layers, num_decoder_layers, dropout=0.1, max_len=512):
        super().__init__()
        self.model_dim = model_dim
        encoder_layer = TransformerEncoderLayer(d_model=model_dim, nhead=num_heads, dropout=dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        decoder_layer = TransformerDecoderLayer(d_model=model_dim, nhead=num_heads, dropout=dropout)
        self.transformer_decoder = TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)
        self.src_tok_emb = nn.Embedding(input_dim, model_dim)
        self.tgt_tok_emb = nn.Embedding(input_dim, model_dim)
        # Learned positional vectors, initialised to zero; stored as (1, max_len, model_dim).
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_len, model_dim))
        self.output_linear = nn.Linear(model_dim, input_dim)

    def _positions(self, seq_len):
        """Return the first `seq_len` positional vectors as (seq_len, 1, model_dim)."""
        available = self.positional_encoding.size(1)
        if seq_len > available:
            raise ValueError(
                f"sequence length {seq_len} exceeds the {available} positions available"
            )
        # Index along the sequence axis and add a broadcastable batch axis, so
        # the vector added to a token depends on its position in the sentence
        # and not on its index within the batch.
        return self.positional_encoding[0, :seq_len].unsqueeze(1)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None, src_key_padding_mask=None, tgt_key_padding_mask=None):
        """Encode `src`, decode `tgt` against it and return vocabulary logits.

        Args:
            src: Source token ids, (src_len, batch).
            tgt: Target token ids, (tgt_len, batch).
            src_mask: Optional attention mask for the encoder.
            tgt_mask: Optional attention mask for decoder self-attention.
            src_key_padding_mask: Optional padding mask for the source; it is
                also applied to the decoder's cross-attention so padded source
                positions are never attended to.
            tgt_key_padding_mask: Optional padding mask for the target.

        Returns:
            Logits of shape (tgt_len, batch, input_dim).

        Raises:
            ValueError: If either sequence is longer than the number of
                positional vectors.
        """
        src_emb = self.src_tok_emb(src) + self._positions(src.size(0))
        tgt_emb = self.tgt_tok_emb(tgt) + self._positions(tgt.size(0))
        memory = self.transformer_encoder(src_emb, mask=src_mask, src_key_padding_mask=src_key_padding_mask)
        outs = self.transformer_decoder(
            tgt_emb,
            memory,
            tgt_mask=tgt_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=src_key_padding_mask,
        )
        return self.output_linear(outs)

    def generate_square_subsequent_mask(self, sz):
        """Return an (sz, sz) float mask: 0 on and below the diagonal, -inf above it."""
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)


In [None]:
from torch.optim import Adam

# Assume 'vocab_size' is determined after BPE and 'input_dim' equals 'vocab_size'
model = CustomTransformerModel(input_dim=vocab_size, model_dim=512, num_heads=8, num_encoder_layers=6, num_decoder_layers=6)
optimizer = Adam(model.parameters(), lr=0.0002)
criterion = nn.CrossEntropyLoss()

# Assuming dataloaders are set up with masked_tgt_train_data and src_train_data
for epoch in range(50):  # Or as specified for each language pair
    model.train()
    total_loss = 0
    for src, tgt in dataloader:
        # The encoder must see the whole source sentence, so it gets no
        # attention mask; a causal mask here would hide later source words.
        src_mask = None
        # Sequence length is dimension 0 (sequence-first layout).
        tgt_mask = model.generate_square_subsequent_mask(tgt.size(0)).to(tgt.device)
        optimizer.zero_grad()
        output = model(src, tgt, src_mask, tgt_mask, None, None)
        # NOTE(review): `tgt` is both the decoder input and the label, and the
        # causal mask leaves the diagonal open, so each position can see its
        # own label - confirm the dataloader should instead yield the masked
        # target as input and the unmasked target as label.
        # reshape() also copes with non-contiguous tensors, which view() rejects.
        loss = criterion(output.reshape(-1, vocab_size), tgt.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch}: Loss {total_loss / len(dataloader)}")


In [None]:
from transformers import BartForConditionalGeneration, BartConfig

class CustomBartModel(BartForConditionalGeneration):
    """BART conditional-generation model whose forward pass always requests attention weights.

    NOTE(review): `__init__` reaches through `layer.self_attn.self_attention`,
    an attribute nothing in this notebook defines. Presumably it has to exist
    on the library's attention module; if it does not, constructing this class
    raises AttributeError. Verify against the installed `transformers` version.
    """
    def __init__(self, config):
        super().__init__(config)
        # Assuming modifications are needed in attention or other specifics
        # Sets an `is_bidirectional` attribute on every decoder layer.
        # NOTE(review): whether the library reads this attribute (and so
        # actually drops the causal mask) is not visible here - confirm.
        for layer in self.model.decoder.layers:
            layer.self_attn.self_attention.is_bidirectional = True
    
    def forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask):
        # Delegates to the parent forward with output_attentions forced on.
        # NOTE(review): this override accepts exactly these four tensors - no
        # `labels` and no extra keyword arguments - so callers cannot pass
        # anything else through it.
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            output_attentions=True  # Ensure attentions are returned for analysis
        )
        return outputs

# Load pre-configured BART
# Only the configuration is fetched from 'facebook/bart-large'; the model is
# then built from that configuration object.
# NOTE(review): building from a config presumably leaves the weights randomly
# initialised rather than pretrained - confirm this is intended.
config = BartConfig.from_pretrained('facebook/bart-large')
config.output_attentions = True  # Ensure we can access attentions for alignment analysis
model = CustomBartModel(config)


In [None]:
from transformers import BartTokenizer

tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

# Tokenize data
# The target sentences are passed as `text_target` so they are encoded as
# `labels`. Passing them as the second positional argument treats them as the
# second half of a sentence pair and concatenates them onto the encoder input.
# NOTE(review): the source lines start with a literal "<bos>" string; confirm
# how this tokenizer's vocabulary handles it.
train_encodings = tokenizer(src_train_data, text_target=tgt_train_data, padding=True, truncation=True, max_length=512)
test_encodings = tokenizer(src_test_data, text_target=tgt_test_data, padding=True, truncation=True, max_length=512)


In [None]:
from torch.optim import Adam

optimizer = Adam(model.parameters(), lr=0.0002)
criterion = nn.CrossEntropyLoss()

for epoch in range(50):  # Adjust epochs per language pair requirements
    model.train()
    total_loss = 0
    for batch in dataloader:  # Assuming dataloader is properly set up
        optimizer.zero_grad()
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'],
                        decoder_input_ids=batch['decoder_input_ids'], decoder_attention_mask=batch['decoder_attention_mask'])
        logits = outputs.logits
        # Flatten with the model's own output width. tokenizer.vocab_size is
        # the tokenizer's base vocabulary and need not match the size of the
        # logits' last dimension.
        # NOTE(review): padded label positions count towards the loss unless
        # the labels already use CrossEntropyLoss's default ignore_index
        # (-100) - confirm how the dataloader pads `labels`.
        loss = criterion(logits.reshape(-1, logits.size(-1)), batch['labels'].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch}: Loss {total_loss / len(dataloader)}")


In [None]:
def symmetrize_alignments(forward_alignments, backward_alignments, threshold=0.1):
    """Merge two directional alignment score tables into one set of links.

    This is a plain union, not grow-diagonal-final-and: a link is kept when
    its score in either direction is strictly greater than `threshold`.

    Args:
        forward_alignments: Dict[Tuple[int, int], float] mapping
            (source index, target index) to a score.
        backward_alignments: Dict[Tuple[int, int], float] mapping
            (target index, source index) to a score; each key is flipped
            before it is merged.
        threshold: Minimum score (exclusive) a link needs to be kept.

    Returns:
        Set of (source index, target index) pairs.
    """
    forward_links = {
        (i, j) for (i, j), score in forward_alignments.items() if score > threshold
    }
    # Backward keys are (target, source); swap them into (source, target).
    backward_links = {
        (i, j) for (j, i), score in backward_alignments.items() if score > threshold
    }
    return forward_links | backward_links

# Example usage
forward_alignments = { (1, 2): 0.9, (2, 3): 0.85 }
backward_alignments = { (2, 1): 0.88, (3, 2): 0.90 }
symmetrized = symmetrize_alignments(forward_alignments, backward_alignments)


In [None]:
import sentencepiece as spm

# Assume spm_model is the path to the trained SentencePiece model
# NOTE(review): other cells in this notebook train with
# --model_prefix=bpe_model and load 'bpe_model.model'; confirm which file
# name is intended here.
sp = spm.SentencePieceProcessor(model_file='spm_model.model')

def encode_with_bpe(texts, sp):
    """Encode each string in `texts` via `sp.encode(..., out_type=str)`.

    Returns a list with one encoded result per input text, in input order.
    """
    encoded = []
    for text in texts:
        encoded.append(sp.encode(text, out_type=str))
    return encoded

# Example usage
encoded_texts = encode_with_bpe(["This is a sample text.", "Here is another one."], sp)


# Encode both sides of the training data with the same SentencePiece model.
src_train_data_bpe = encode_with_bpe(src_train_data, sp)
tgt_train_data_bpe = encode_with_bpe(tgt_train_data, sp)

# Assume data loaders and training setup are adjusted to use *_train_data_bpe


In [None]:
# Concatenate all text files into one for SentencePiece training
cat path_to_source_train_file.txt path_to_target_train_file.txt > full_text.txt

# Train SentencePiece Model
spm_train --input=full_text.txt --model_prefix=bpe_model --vocab_size=40000 --character_coverage=0.995 --model_type=bpe


In [None]:
import sentencepiece as spm

# Load the trained SentencePiece model by handing its path to the constructor
sp = spm.SentencePieceProcessor(model_file='bpe_model.model')

def encode_with_bpe(texts, sp):
    """Run `sp.encode_as_pieces` over every string in `texts`.

    Returns a list with one result per input text, in input order.
    """
    return list(map(sp.encode_as_pieces, texts))

# Load and encode training data
# Each file is read inside a `with` block so its handle is closed once the
# lines have been collected.
with open("path_to_preprocessed_german_train_file.txt", 'r', encoding='utf-8') as src_file:
    src_train_data = ["<bos> " + line.strip() for line in src_file]
with open("path_to_preprocessed_english_train_file.txt", 'r', encoding='utf-8') as tgt_file:
    tgt_train_data = [line.strip() for line in tgt_file]

src_train_encoded = encode_with_bpe(src_train_data, sp)
tgt_train_encoded = encode_with_bpe(tgt_train_data, sp)


In [None]:
# Assuming a custom Transformer model setup as discussed
from torch.optim import Adam
optimizer = Adam(model.parameters(), lr=0.0002)
criterion = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    num_pairs = 0
    for src, tgt in zip(src_train_encoded, tgt_train_encoded):
        # An empty line encodes to no pieces and would give a zero-length
        # sequence; skip such pairs.
        if not src or not tgt:
            continue
        # Shape (seq_len, 1): sequence-first with a batch of one. The explicit
        # dtype keeps the ids integral, as the embedding lookup requires.
        src_tensor = torch.tensor([sp.piece_to_id(token) for token in src], dtype=torch.long).unsqueeze(1)  # Batch size 1 for simplicity
        tgt_tensor = torch.tensor([sp.piece_to_id(token) for token in tgt], dtype=torch.long).unsqueeze(1)
        optimizer.zero_grad()
        # CustomTransformerModel.forward, as defined earlier in this notebook,
        # takes four mask arguments after src and tgt; pass None for each.
        output = model(src_tensor, tgt_tensor, None, None, None, None)  # Adjust masks and padding as necessary
        loss = criterion(output.reshape(-1, len(sp)), tgt_tensor.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_pairs += 1
    # Average over the pairs actually trained on; max() avoids dividing by zero.
    print(f"Epoch {epoch}: Loss {total_loss / max(num_pairs, 1)}")


In [None]:
# Example symmetrization function should be applied post-training with actual attention data
# NOTE(review): forward_attention_data / backward_attention_data are not
# defined anywhere in the visible notebook. symmetrize_alignments iterates
# `.items()` on each and unpacks two-element tuple keys, so they presumably
# need to be dicts keyed by index pairs with score values - confirm.
symmetrized_alignments = symmetrize_alignments(forward_attention_data, backward_attention_data)
