# Task description
- Translate text from Chinese to English.
- Main goal: Get familiar with transformer.

## install the required package

In [1]:
!pip install torchmetrics



## Import package

In [2]:
import os
import json
import math
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchsummary import summary
import torch.nn.functional as F

## Fix random seed

In [3]:
def set_seed(seed):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_seed(87)

# Data
- Original dataset is [20k-en-zh-translation-pinyin-hsk](https://huggingface.co/datasets/swaption2009/20k-en-zh-translation-pinyin-hsk)
- We select 50000 English-Chinese sentence pairs for translation task

- Args:
  - BATCH_SIZE
  - data_dir: the path to the given translation dataset
- Tokenizer: BertTokenizer
  - encode: convert text to token ID
  - decode: convert token ID back to text
- Add paddings
  - make all the sentences the same length by inserting token ID = PAD_IDX at the back

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd "/content/drive/MyDrive"

In [5]:
data_dir = "./translation_data.json"
BATCH_SIZE = 64

## Show the raw data

In [6]:
translation_raw_data = pd.read_json(data_dir)
translation_raw_data = translation_raw_data
display(translation_raw_data)

Unnamed: 0,english,chinese
0,"Slowly and not without struggle, America began...",美国缓慢地开始倾听，但并非没有艰难曲折。
1,Dithering is a technique that blends your colo...,抖动是关于颜色混合的技术，使你的作品看起来更圆滑，或者只是创作有趣的材质。
2,This paper discusses the petrologic characteri...,本文以珲春早第三纪含煤盆地的地质构违背景为依据，分析了煤系地层的岩石学特征。
3,The second encounter relates to my grandfather...,第二次事件跟我爷爷的宝贝匣子有关。
4,One way to address these challenges would be t...,解决这些挑战的途径包括依照麻瓜在南非的经验设立真相与和解委员会。
...,...,...
49995,You were too obtuse to take the hint.,你太迟钝了， 没有理解这种暗示。
49996,"Therefore, in the event the mortgagee of ship ...",因此，在这种情况下船舶抵押权人放弃了债务人提供的担保就会影响其他担保人的利益，导致抵押权人的...
49997,"Fourth, puncture administrative bloat.",第四，削弱行政膨胀。
49998,Massimo Oddo says he won't be thinking about h...,马西莫。奥多声明他不会在世界杯决赛圈比赛结束之前考虑未来的俱乐部。


## Tokenizer

In [7]:
from transformers import BertTokenizer
tokenizer_en = BertTokenizer.from_pretrained("bert-base-cased")
tokenizer_cn = BertTokenizer.from_pretrained("bert-base-chinese")

In [8]:
english_seqs = translation_raw_data["english"].apply(lambda x: tokenizer_en.encode(x, add_special_tokens=True, padding=False))
chinese_seqs = translation_raw_data["chinese"].apply(lambda x: tokenizer_cn.encode(x, add_special_tokens=True, padding=False))

MAX_TOKENIZE_LENGTH = max(english_seqs.str.len().max(),chinese_seqs.str.len().max()) # longest string
MAX_TOKENIZE_LENGTH = pow(2, math.ceil(math.log(MAX_TOKENIZE_LENGTH)/math.log(2)))   # closest upper to the power of 2

print("max tokenize length:", MAX_TOKENIZE_LENGTH)

max tokenize length: 128


## Add paddings

In [9]:
PAD_IDX = 0
BOS_IDX = chinese_seqs.iloc[0][0]
EOS_IDX = chinese_seqs.iloc[0][-1]

def add_padding(token_list, max_length):
    ### TO-DO: Add padding to make all the sentence the same length
    # Add your logic here to pad the token_list to max_length
    # This is just a placeholder, you need to implement the actual padding logic
    padded_list = token_list + [PAD_IDX] * (max_length - len(token_list))
    return padded_list # Return the padded token list

chinese_seqs = chinese_seqs.apply(lambda x: add_padding(x,MAX_TOKENIZE_LENGTH))
english_seqs = english_seqs.apply(lambda x: add_padding(x,MAX_TOKENIZE_LENGTH))

In [10]:
# check the padding result
print("=====chinese tokenized data=====")
print(chinese_seqs.iloc[0])

print("=====english tokenized data=====")
print(english_seqs.iloc[0])

=====chinese tokenized data=====
[101, 5401, 1744, 5353, 2714, 1765, 2458, 1993, 967, 1420, 8024, 852, 2400, 7478, 3766, 3300, 5680, 7410, 3289, 2835, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
=====english tokenized data=====
[101, 13060, 1105, 1136, 1443, 5637, 117, 1738, 1310, 1106, 5113, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


## Datalodader
- Split dataset into training dataset(90%) and validation dataset(10%).
- Create dataloader to iterate the data.

In [11]:
data_size = len(translation_raw_data)
train_size = int(0.9*data_size)
test_size = data_size - train_size
print("train_size:",train_size)
print("test_size:",test_size)

en_training_data = []
cn_training_data = []
en_testing_data = []
cn_testing_data = []

for i in range(data_size):
    if (i < train_size):
        en_training_data.append(torch.Tensor(english_seqs.iloc[i]))
        cn_training_data.append(torch.Tensor(chinese_seqs.iloc[i]))
    else:
        en_testing_data.append(torch.Tensor(english_seqs.iloc[i]))
        cn_testing_data.append(torch.Tensor(chinese_seqs.iloc[i]))


class TextTranslationDataset(Dataset):
    def __init__(self, src, dst):
        self.src_list = src
        self.dst_list = dst

    def __len__(self):
        return len(self.src_list)

    def __getitem__(self, idx):
        return self.src_list[idx], self.dst_list[idx]

cn_to_en_train_set = TextTranslationDataset(cn_training_data, en_training_data)
cn_to_en_test_set = TextTranslationDataset(cn_testing_data, en_testing_data)

cn_to_en_train_loader = DataLoader(cn_to_en_train_set, batch_size=BATCH_SIZE, shuffle=False)
cn_to_en_test_loader = DataLoader(cn_to_en_test_set, batch_size=BATCH_SIZE, shuffle=True)

train_size: 45000
test_size: 5000


# Model
- TO-DO: Finish the model by yourself
- Base transformer layers in [Attention Is All You Need](https://arxiv.org/abs/1706.03762)
    - TransformerEncoderLayer:
    - TransformerDecoderLayer:
- Positional encoding and input embedding
- Note that you may need masks when implementing attention mechanism
    - Padding mask: prevent input from attending to padding tokens
    - Causal mask: prevent decoder input from attending to future input

In [12]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, nhead):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.nhead = nhead
        self.d_k = d_model // nhead

        # Linear layers for Q, K, V
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)

        # Final output linear layer
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, src_padding_mask=None, future_mask=None):
        d_k = Q.size(-1)
        matmul_qk = torch.matmul(Q, K.transpose(-2, -1))
        matmul_qk = matmul_qk / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))


        # Add future mask for preventing access to future tokens in decoder
        if future_mask is not None:
            matmul_qk += future_mask
        # Apply source padding mask if present
        if src_padding_mask is not None:
            matmul_qk = matmul_qk.masked_fill(src_padding_mask.unsqueeze(1).unsqueeze(2) == 1, float('-inf'))

        attn = F.softmax(matmul_qk, dim=-1)  # Attention weights
        output = torch.matmul(attn, V)    # Weighted sum of values based on attention

        # # Print intermediate values for debugging
        # print(f"QK: {matmul_qk}")
        # print(f"Attention Weights: {attn}")
        # print(f"Attention Output: {output}")

        return output, attn

    def forward(self, Q, K, V, key_padding_mask=None, attn_mask=None):
        # Change input shapes to [batch_size, seq_len, d_model]
        Q = Q.permute(1, 0, 2)
        K = K.permute(1, 0, 2)
        V = V.permute(1, 0, 2)

        batch_size = Q.size(0)

        # Apply linear transformations to Q, K, V and split into heads
        Q = self.W_q(Q).view(batch_size, -1, self.nhead, self.d_k).transpose(1, 2)
        K = self.W_k(K).view(batch_size, -1, self.nhead, self.d_k).transpose(1, 2)
        V = self.W_v(V).view(batch_size, -1, self.nhead, self.d_k).transpose(1, 2)

        # # Print Q, K, V after projection and split into heads
        # print(f"Q (after projection and split): {Q}")
        # print(f"K (after projection and split): {K}")
        # print(f"V (after projection and split): {V}")

        # Compute scaled dot-product attention
        attn_output, _ = self.scaled_dot_product_attention(Q, K, V, key_padding_mask, attn_mask)

        # Concatenate attention heads and apply the final linear layer
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        attn_output = self.W_o(attn_output)

        # # Print final output after concatenating heads and applying output linear layer
        # print(f"Final Output (before permute): {attn_output}")

        # Revert to original shape [seq_len, batch_size, d_model]
        attn_output = attn_output.permute(1, 0, 2)

        # print(f"Final Output (after permute): {attn_output}")

        return attn_output


In [13]:
class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, dim_feedforward, nhead, dropout):
        super(TransformerEncoderLayer, self).__init__()

        # Custom Multi-Head Attention mechanism
        self.self_attn = MultiHeadAttention(d_model=d_model, nhead=nhead)

        # Feedforward network (two linear layers with ReLU activation in between)
        self.feedforward = nn.Sequential(
            nn.Linear(d_model, dim_feedforward),  # Project from d_model to dim_feedforward
            nn.ReLU(),                            # Apply ReLU activation
            nn.Linear(dim_feedforward, d_model)    # Project back to d_model size
        )

        # Layer normalization to stabilize training
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Dropout layers for regularization
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, src_padding_mask=None):
        # Self-attention block
        attn_output = self.self_attn(x, x, x, src_padding_mask)  # Apply multi-head self-attention
        x = self.norm1(x + self.dropout1(attn_output))  # Add & normalize (Residual connection + LayerNorm)

        # Feedforward block
        ff_output = self.feedforward(x)  # Apply feedforward network
        x = self.norm2(x + self.dropout2(ff_output))  # Add & normalize (Residual connection + LayerNorm)

        return x  # Return the processed output

In [14]:
class TransformerDecoderLayer(nn.Module):
    def __init__(self, d_model, dim_feedforward, nhead, dropout):
        super(TransformerDecoderLayer, self).__init__()

        # MultiHeadAttention for self-attention and cross-attention
        self.self_attn = MultiHeadAttention(d_model=d_model, nhead=nhead)
        self.cross_attn = MultiHeadAttention(d_model=d_model, nhead=nhead)

        # Feedforward network
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        # Layer normalization
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

        # Dropout
        self.dropout = nn.Dropout(dropout)

    def residual_norm(self, x, sublayer_output, norm_layer):
        """Helper function for residual connection, dropout, and normalization."""
        return norm_layer(x + self.dropout(sublayer_output))

    def forward(self, x, enc_output, src_padding_mask=None, tgt_padding_mask=None, tgt_future_mask=None):
        # Self-attention with future masking (target sequence)
        tgt_attn_output = self.self_attn(x, x, x, tgt_padding_mask, tgt_future_mask)
        x = self.residual_norm(x, tgt_attn_output, self.norm1)

        # Cross-attention (encoder-decoder attention)
        cross_attn_output = self.cross_attn(x, enc_output, enc_output, key_padding_mask=src_padding_mask)
        x = self.residual_norm(x, cross_attn_output, self.norm2)

        # Feedforward neural network
        ff_output = self.linear2(self.dropout(F.relu(self.linear1(x))))
        x = self.residual_norm(x, ff_output, self.norm3)

        return x

In [15]:
class Transformer(nn.Module):
    def __init__(self, d_model, num_heads, num_encoder_layers, num_decoder_layers, d_ff, dropout):
        super(Transformer, self).__init__()

        # Encoder layers
        self.encoder_layers = nn.ModuleList([
            TransformerEncoderLayer(d_model, d_ff, num_heads, dropout) for _ in range(num_encoder_layers)
        ])

        # Decoder layers
        self.decoder_layers = nn.ModuleList([
            TransformerDecoderLayer(d_model, d_ff, num_heads, dropout) for _ in range(num_decoder_layers)
        ])

        # Final linear layer to map the Transformer's output to the vocabulary distribution
        self.fc_out = nn.Linear(d_model, d_model)

    def forward(self, src_embeded, tgt_embeded, src_padding_mask, tgt_padding_mask, tgt_future_mask):
        # 1. Pass the source sequence through the encoder
        enc_output = self.encode(src_embeded, src_padding_mask)
        # 2. Pass the target sequence and encoder output through the decoder
        output = self.decode(tgt_embeded, enc_output, src_padding_mask, tgt_padding_mask, tgt_future_mask)
        return output

    def encode(self, src_embeded, src_padding_mask=None):
        """Pass the embedded source sequence through all encoder layers."""
        for layer in self.encoder_layers:
            src_embeded = layer(src_embeded, src_padding_mask)
        return src_embeded

    def decode(self, tgt_embeded, enc_output, src_padding_mask=None, tgt_padding_mask=None, tgt_future_mask=None):
        """Pass the embedded target sequence and encoder output through all decoder layers."""
        for layer in self.decoder_layers:
            tgt_embeded = layer(tgt_embeded, enc_output, src_padding_mask, tgt_padding_mask, tgt_future_mask)
        return tgt_embeded

In [16]:
class PositionalEncoding(nn.Module):
    def __init__(self, emb_size, dropout, maxlen):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)

        # Calculate positional encodings
        pos_enc = torch.zeros(maxlen, emb_size)
        position = torch.arange(0, maxlen, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, emb_size, 2).float() * -(math.log(10000.0) / emb_size))

        pos_enc[:, 0::2] = torch.sin(position * div_term)  # Use sin for even positions
        pos_enc[:, 1::2] = torch.cos(position * div_term)  # Use cos for odd positions
        pos_enc = pos_enc.unsqueeze(0)  # Add a batch dimension

        self.register_buffer('pos_enc', pos_enc)  # Register positional encodings as a buffer (not trainable)

    def forward(self, token_embedding):
        # Add positional encodings to the token embeddings
        return self.dropout(token_embedding + self.pos_enc[:, :token_embedding.size(1)])

In [17]:
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)  # Create the embedding layer
        self.emb_size = emb_size

    def forward(self, tokens):
        # Convert token indices to embeddings and scale by the square root of the embedding size
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

In [18]:
def generate_square_subsequent_mask(sz):
    """Generates an upper-triangular matrix of -inf, with zeros on diag."""
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1).to(DEVICE)

In [19]:
def create_mask(src, tgt, pad_idx=PAD_IDX):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_future_mask = generate_square_subsequent_mask(tgt_seq_len) # Pass device to generate_square_subsequent_mask
    # Create padding mask for source sequence
    src_padding_mask = (src == pad_idx).transpose(0, 1)
    # Create padding mask for target sequence
    tgt_padding_mask = (tgt == pad_idx).transpose(0, 1)
                                                                                        # and ensure the mask is a boolean tensor
    # If tgt_future_mask is provided, it will be used directly
    src_padding_mask = src_padding_mask.to(DEVICE)
    tgt_padding_mask = tgt_padding_mask.to(DEVICE)
    tgt_future_mask = tgt_future_mask.to(DEVICE)
    return tgt_future_mask, src_padding_mask, tgt_padding_mask

In [20]:
# Seq2Seq Network
class Seq2SeqNetwork(nn.Module):
    def __init__(self,
                 num_encoder_layers,
                 num_decoder_layers,
                 emb_size,
                 nhead,
                 src_vocab_size,
                 tgt_vocab_size,
                 dim_feedforward,
                 dropout=0.1):
        super().__init__()
        self.transformer = Transformer(
            d_model=emb_size,
            num_heads=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            d_ff=dim_feedforward,
            dropout=dropout
        )
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        maxlen=5000
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout, maxlen=maxlen)

    def forward(self,
                src,
                trg,
                tgt_future_mask=None,
                src_padding_mask=None,
                tgt_padding_mask=None):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_padding_mask=src_padding_mask, tgt_padding_mask=tgt_padding_mask, tgt_future_mask=tgt_future_mask)
        return self.generator(outs)


    def encode(self, src, src_padding_mask=None):
        return self.transformer.encode(self.positional_encoding(self.src_tok_emb(src)), src_padding_mask=src_padding_mask)

    def decode(self, tgt, memory, src_padding_mask=None, tgt_padding_mask=None, tgt_future_mask=None):
        return self.transformer.decode(self.positional_encoding(self.tgt_tok_emb(tgt)), memory, src_padding_mask=src_padding_mask, tgt_padding_mask=tgt_padding_mask, tgt_future_mask=tgt_future_mask)

## Note: The parameter size of model should be less than 100M (100,000k) !!!

In [21]:
EMB_SIZE = 128
NHEAD = 8
FFN_HID_DIM = 1024
NUM_ENCODER_LAYERS = 1
NUM_DECODER_LAYERS = 1
SRC_VOCAB_SIZE = tokenizer_cn.vocab_size
TGT_VOCAB_SIZE = tokenizer_en.vocab_size
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

transformer = Seq2SeqNetwork(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)
param_transformer = sum(p.numel() for p in transformer.parameters())
print (f"The parameter size of transformer is {param_transformer/1000} k")
#   The parameter size of model should be less than 100M (100,000k) !!!
#   The parameter size of model should be less than 100M (100,000k) !!!
#   The parameter size of model should be less than 100M (100,000k) !!!

The parameter size of transformer is 10898.884 k


# Training
- You can change the training setting by yourself including
  - Number of epoch
  - Optimizer
  - Learning rate
  - Learning rate scheduler
  - etc...

In [22]:
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Hyperparameters
NUM_EPOCHS = 10
LR = 0.001
BETAS = (0.9, 0.98)
EPSILON = 1e-9

# Loss function and optimizer
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(transformer.parameters(), lr=LR, betas=BETAS, eps=EPSILON)

# Learning rate scheduler
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)




## Translation quality metrics: BLEU score

In [23]:
from torchmetrics.text import BLEUScore

def bleu_score_func(predicted, truth, grams=1):
    preds = [predicted]
    truth = [[truth]]
    bleu = BLEUScore(n_gram=grams)
    return bleu(preds, truth)


def BLEU_batch(predict, truth, output_tokenizer):
    batch_size = predict.size(1)
    total_score = 0
    for i in range(batch_size):
        predict_str = output_tokenizer.decode(predict[:, i], skip_special_tokens=True)
        truth_str = output_tokenizer.decode(truth[:, i], skip_special_tokens=True)
        score_gram1 = bleu_score_func(predict_str.lower(), truth_str.lower(), grams=1)
        #score_gram2 = bleu_score_func(predict_str.lower(), truth_str, grams=2)
        #score_gram3 = bleu_score_func(predict_str.lower(), truth_str, grams=3)
        #score_gram4 = bleu_score_func(predict_str.lower(), truth_str, grams=4)
        #total_score = total_score + (score_gram1 + score_gram2 + score_gram3 + score_gram4) / 4.0
        total_score = total_score + score_gram1
    total_score = total_score / batch_size
    return total_score

## Training and Evaluation Functions

In [24]:
def train_epoch(model, optimizer, train_dataloader):
    model.train()  # Set model to training mode
    total_loss = 0

    for src, tgt in train_dataloader:
        # Prepare source and target inputs
        src = src.transpose(0, 1).to(DEVICE).type(torch.long)
        tgt = tgt.transpose(0, 1).to(DEVICE).type(torch.long)

        # Prepare target input (remove last token) and target output (remove first token)
        tgt_input = tgt[:-1, :]
        tgt_out = tgt[1:, :]

        # Create masks
        tgt_future_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input, PAD_IDX)

        # Forward pass
        logits = model(src, tgt_input, src_padding_mask=src_padding_mask, tgt_padding_mask=tgt_padding_mask, tgt_future_mask=tgt_future_mask)

        # Compute loss
        optimizer.zero_grad()  # Zero the gradients
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1).long())
        
        # Backward pass and optimization step
        loss.backward()
        optimizer.step()

        # Accumulate loss
        total_loss += loss.item()

    # Return average loss per batch
    return total_loss / len(train_dataloader)

In [25]:
def evaluate(model, val_dataloader):
    model.eval()
    losses = 0
    score = 0

    for src, tgt in val_dataloader:
        src = src.transpose(0, 1)
        tgt = tgt.transpose(0, 1)

        src = src.to(DEVICE).type(torch.long)
        tgt = tgt.to(DEVICE).type(torch.long)

        tgt_input = tgt[:-1, :]
        tgt_future_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input, PAD_IDX)

        logits = model(src, tgt_input, src_padding_mask=src_padding_mask, tgt_padding_mask=tgt_padding_mask, tgt_future_mask=tgt_future_mask)
        
        tgt_out = tgt[1:, :]
        _, tgt_predict = torch.max(logits, dim=-1)
        score_batch = BLEU_batch(tgt_predict, tgt_out, tokenizer_en)

        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1).long())
        losses += loss.item()
        score += score_batch

    # Return the average loss value and average BLEU score
    return (losses / len(list(val_dataloader))), (score / len(list(val_dataloader)))

## Start training
- MODEL_SAVE_PATH: path for storing the best model

In [26]:
MODEL_SAVE_PATH = "./model.ckpt"

In [27]:
from timeit import default_timer as timer
transformer = transformer.to(DEVICE)

best_acc = 0
for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer, cn_to_en_train_loader)
    end_time = timer()
    val_loss, val_acc = evaluate(transformer, cn_to_en_test_loader)

    scheduler.step(val_loss)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, Val Acc: {val_acc:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))


    # Save the best model so far.
    if val_acc > best_acc:
        best_acc = val_acc
        best_state_dict = transformer.state_dict()
        torch.save(best_state_dict, MODEL_SAVE_PATH)
        print("(model saved)")

Epoch: 1, Train loss: 6.385, Val loss: 5.799, Val Acc: 0.157, Epoch time = 58.195s
(model saved)
Epoch: 2, Train loss: 5.542, Val loss: 5.481, Val Acc: 0.183, Epoch time = 58.579s
(model saved)
Epoch: 3, Train loss: 5.165, Val loss: 5.355, Val Acc: 0.188, Epoch time = 58.685s
(model saved)
Epoch: 4, Train loss: 4.894, Val loss: 5.313, Val Acc: 0.195, Epoch time = 58.728s
(model saved)
Epoch: 5, Train loss: 4.686, Val loss: 5.289, Val Acc: 0.201, Epoch time = 58.790s
(model saved)
Epoch: 6, Train loss: 4.520, Val loss: 5.301, Val Acc: 0.204, Epoch time = 58.818s
(model saved)
Epoch: 7, Train loss: 4.385, Val loss: 5.321, Val Acc: 0.204, Epoch time = 58.846s
Epoch: 8, Train loss: 4.278, Val loss: 5.352, Val Acc: 0.205, Epoch time = 58.994s
(model saved)
Epoch: 9, Train loss: 4.026, Val loss: 5.347, Val Acc: 0.212, Epoch time = 58.974s
(model saved)
Epoch: 10, Train loss: 3.947, Val loss: 5.372, Val Acc: 0.215, Epoch time = 58.957s
(model saved)


# Inference

In [28]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)  # Moving src tensor to the appropriate device
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_padding_mask=src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_padding_mask = torch.ones(1, ys.size(0)).type(torch.bool).to(DEVICE)  # target padding mask
        tgt_padding_mask = ~tgt_padding_mask
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))).to(DEVICE)  # target causal mask
        out = model.decode(ys, memory, src_padding_mask=src_mask, tgt_padding_mask=tgt_padding_mask, tgt_future_mask=tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys

In [29]:
# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str, input_tokenizer, output_tokenizer):
    model.eval()
    sentence = input_tokenizer.encode(src_sentence)
    sentence = torch.tensor(sentence).view(-1, 1)
    num_tokens = sentence.shape[0]

    src_mask = torch.ones(1, num_tokens).type(torch.bool)  # source padding mask
    src_mask = ~src_mask
    tgt_tokens = greedy_decode(model, sentence, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    output_sentence = output_tokenizer.decode(tgt_tokens, skip_special_tokens=True)
    return output_sentence

## Load best model

In [30]:
transformer = Seq2SeqNetwork(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
transformer.to(DEVICE)
transformer.load_state_dict(torch.load("model.ckpt"))

  transformer.load_state_dict(torch.load("model.ckpt"))


<All keys matched successfully>

## Translation testing

In [31]:
sentence = "你好，欢迎来到中国"
ground_truth = 'Hello, Welcome to China'
predicted = translate(transformer, sentence, tokenizer_cn, tokenizer_en)

print(f'{"Input:":15s}: {sentence}')
print(f'{"Prediction":15s}: {predicted}')
print(f'{"Ground truth":15s}: {ground_truth}')
print("Bleu Score (1gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 1).item())
print("Bleu Score (2gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 2).item())
print("Bleu Score (3gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 3).item())
print("Bleu Score (4gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 4).item())

Input:         : 你好，欢迎来到中国
Prediction     : You are going to welcome you.
Ground truth   : Hello, Welcome to China
Bleu Score (1gram):  0.3333333134651184
Bleu Score (2gram):  0.0
Bleu Score (3gram):  0.0
Bleu Score (4gram):  0.0


In [32]:
sentence = "早上好，很高心见到你"
ground_truth = 'Good Morning, nice to meet you'
predicted = translate(transformer, sentence, tokenizer_cn, tokenizer_en)

print(f'{"Input:":15s}: {sentence}')
print(f'{"Prediction":15s}: {predicted}')
print(f'{"Ground truth":15s}: {ground_truth}')
print("Bleu Score (1gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 1).item())
print("Bleu Score (2gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 2).item())
print("Bleu Score (3gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 3).item())
print("Bleu Score (4gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 4).item())

Input:         : 早上好，很高心见到你
Prediction     : You see the early morning, you see.
Ground truth   : Good Morning, nice to meet you
Bleu Score (1gram):  0.2857142984867096
Bleu Score (2gram):  0.0
Bleu Score (3gram):  0.0
Bleu Score (4gram):  0.0


In [33]:
sentence = "祝您有个美好的一天"
ground_truth = 'Have a nice day'
predicted = translate(transformer, sentence, tokenizer_cn, tokenizer_en)

print(f'{"Input:":15s}: {sentence}')
print(f'{"Prediction":15s}: {predicted}')
print(f'{"Ground truth":15s}: {ground_truth}')
print("Bleu Score (1gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 1).item())
print("Bleu Score (2gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 2).item())
print("Bleu Score (3gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 3).item())
print("Bleu Score (4gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 4).item())

Input:         : 祝您有个美好的一天
Prediction     : You have a good day.
Ground truth   : Have a nice day
Bleu Score (1gram):  0.4000000059604645
Bleu Score (2gram):  0.3162277638912201
Bleu Score (3gram):  0.0
Bleu Score (4gram):  0.0
