# # **Deep Learning Lab 3: Machine Translation**
**Author**: BSChen (313510156)

# (1) Task description
- Translate text from Chinese to English.
- Main goal: Get familiar with transformer.

## Import package

In [1]:
import os
import json
import math
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchsummary import summary
from timeit import default_timer as timer

from tqdm import tqdm  # added by myself

from utils import *
from network import *

  from .autonotebook import tqdm as notebook_tqdm


## Fix random seed

In [2]:
def set_seed(seed):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_seed(29)

# (2) Data Processing
- Original dataset is [Tatoeba](https://tatoeba.org/zh-cn/) and [XDailyDialog](https://github.com/liuzeming01/XDailyDialog)
- We select 50000 English-Chinese sentence pairs for translation task

- Args:
  - BATCH_SIZE  (You can modify)
  - data_dir: the path to the given training translation dataset

In [3]:
data_dir = "./translation_train_data.json"
BATCH_SIZE = 8

## Show the raw data

In [4]:
translation_raw_data = pd.read_json(data_dir)
display(translation_raw_data)

Unnamed: 0,English,Chinese
0,I'm Susan Greene.,我是蘇珊格林。
1,You don't have to take an examination.,你不需要考试。
2,I can't leave.,我走不了。
3,A cold beer would hit the spot!,来杯冰啤酒就太棒了!
4,Let's start!,讓我們開始吧。
...,...,...
49995,Just buy a cask of wine. Have you bought ice yet?,买一桶酒就行了。你买冰块了吗?
49996,OK. No problem.,"好的,没问题。"
49997,"I'm not really in the mood for Italian, actual...","实际上,我不太喜欢意大利菜。我想吃点辣的。"
49998,It's OK. It seems we have a lot in common.,还行吧。看来我们有很多共同点。


## Tokenization
- Tokenizer: BertTokenizer
  - encode: convert text to token ID
  - decode: convert token ID back to text

In [5]:
tokenizer_en = tokenizer_english()
tokenizer_cn = tokenizer_chinese()

In [6]:
english_seqs = translation_raw_data["English"].apply(lambda x: tokenizer_en.encode(x, add_special_tokens=True, padding=False))
chinese_seqs = translation_raw_data["Chinese"].apply(lambda x: tokenizer_cn.encode(x, add_special_tokens=True, padding=False))

MAX_TOKENIZE_LENGTH = max(english_seqs.str.len().max(), chinese_seqs.str.len().max()) # longest string
MAX_TOKENIZE_LENGTH = pow(2, math.ceil(math.log(MAX_TOKENIZE_LENGTH) / math.log(2)))  # closest upper to the power of 2

print("Max tokenize length:", MAX_TOKENIZE_LENGTH)

Max tokenize length: 128


## Add paddings
- make all the sentences the same length by inserting token ID = PAD_IDX at the back

In [7]:
# Add padding
def add_padding(token_list: list, max_length: int) -> list:
    if len(token_list) < max_length:
        padding_length = max_length - len(token_list)
        token_list = token_list + [PAD_IDX] * padding_length
    else:
        token_list = token_list[:max_length]  # Trim to MAX_LENGTH if longer
    return token_list

chinese_seqs = chinese_seqs.apply(lambda x: add_padding(x, MAX_TOKENIZE_LENGTH))
english_seqs = english_seqs.apply(lambda x: add_padding(x, MAX_TOKENIZE_LENGTH))

In [8]:
# Check the padding result
print("===== Chinese tokenized data =====")
print(chinese_seqs.iloc[0])

print("===== English tokenized data =====")
print(english_seqs.iloc[0])

===== Chinese tokenized data =====
[101, 2769, 3221, 5979, 4396, 3419, 3360, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
===== English tokenized data =====
[101, 146, 112, 182, 5640, 10983, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


## Datalodader
- Split dataset into training dataset(90%) and validation dataset(10%). You can modify the traning/validation ratio
- Create dataloader to iterate the data.

In [9]:
data_size  = len(translation_raw_data)
train_size = int(0.95 * data_size)
valid_size = data_size - train_size
print("train size:", train_size)
print("valid size:", valid_size)

en_train_data = []
cn_train_data = []
en_valid_data = []
cn_valid_data = []

for i in range(data_size):
    if (i < train_size):
        en_train_data.append(torch.Tensor(english_seqs.iloc[i]))
        cn_train_data.append(torch.Tensor(chinese_seqs.iloc[i]))
    else:
        en_valid_data.append(torch.Tensor(english_seqs.iloc[i]))
        cn_valid_data.append(torch.Tensor(chinese_seqs.iloc[i]))

class TextTranslationDataset(Dataset): 
    def __init__(self, src, dst, augment_prob=0):
        self.src_list = src
        self.dst_list = dst
        self.augment_prob = augment_prob

    def __len__(self): 
        return len(self.src_list)

    def __getitem__(self, idx):
        if random.random() < self.augment_prob and self.src_list[idx].shape[0] > 7:
            # Random dropout
            drop_num = random.randint(1, self.src_list[idx].shape[0] - 1)
            src = self.src_list[idx].clone()
            src[drop_num] = PAD_IDX
        else:
            src = self.src_list[idx]
        return src, self.dst_list[idx]

cn_to_en_train_set = TextTranslationDataset(cn_train_data, en_train_data, augment_prob=0.3)
cn_to_en_valid_set = TextTranslationDataset(cn_valid_data, en_valid_data)

cn_to_en_train_loader = DataLoader(cn_to_en_train_set, batch_size=BATCH_SIZE, shuffle=False)
cn_to_en_valid_loader = DataLoader(cn_to_en_valid_set, batch_size=BATCH_SIZE, shuffle=True)

train size: 47500
valid size: 2500


# (3) Model
- **TO-DO**: Finish the model in "network.py"
    - You can first write code here for convenience, but note that <span style='color:red'>**TA will test your model using model definition in "network.py"**</span><p>
- Base transformer layers in [Attention Is All You Need](https://arxiv.org/abs/1706.03762)
    - TransformerEncoderLayer:
    - TransformerDecoderLayer:
- Positional encoding and input embedding
- Note that you may need masks when implementing attention mechanism
    - Padding mask: prevent input from attending to padding tokens
    - Causal mask: prevent decoder input from attending to future input

In [10]:
model = load_model()

for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

model = model.to(DEVICE)
param_model = sum(p.numel() for p in model.parameters())
print (f"The parameter size of model is {param_model / 1000} k")

The parameter size of model is 24759.876 k


# (4) Training
- You can change the training setting by yourself including
  - Number of epoch
  - Optimizer
  - Learning rate
  - Learning rate scheduler
  - etc...

In [11]:
NUM_EPOCHS = 80
LEARNING_RATE = 1e-2
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# optimizer = torch.optim.Adam(model.parameters(), lr=1e-5, betas=(0.9, 0.98), eps=1e-9)
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=0.9, weight_decay=1e-5)
# scheduler = None
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.7, patience=5, min_lr=1e-5
)

## Training and Evaluation Functions

In [12]:
def train_epoch(
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    train_dataloader: DataLoader
    ):
    # Initialization
    model.train()
    losses = 0

    for src, tgt in tqdm(train_dataloader):
        # src, tgt shape: (batch_size, seq_length)
        src, tgt = src.to(DEVICE), tgt.to(DEVICE)

        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]

        logits = model(src, tgt_input)

        optimizer.zero_grad()
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_output.reshape(-1).long())
        loss.backward()
        optimizer.step()

        losses += loss.item()

    return losses / len(list(train_dataloader))


def evaluate(model: torch.nn.Module, val_dataloader: DataLoader):
    model.eval()
    losses = 0
    score = 0

    for src, tgt in tqdm(val_dataloader):
        # src, tgt shape: (batch_size, seq_length)
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]
        
        logits = model(src, tgt_input)
        _, tgt_predict = torch.max(logits, dim=-1)
        score_batch = BLEU_batch(tgt_predict, tgt_output, tokenizer_en)

        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_output.reshape(-1).long())
        losses += loss.item()
        score += score_batch

    return (losses / len(list(val_dataloader))), (score / len(list(val_dataloader)))

## Start training
- MODEL_SAVE_PATH: path for storing the best model

In [13]:
MODEL_SAVE_PATH = "./model.ckpt"
LOG_FILE_PATH = "log.csv"
SAVE_TOLERANCE = 0.0005

In [14]:
print("Start training...")
print(f"Model training on device: {DEVICE}")

# Initialization
model = model.to(DEVICE)
with open(LOG_FILE_PATH, 'w') as log_file:
    log_file.write("Epoch, Train Loss, Val Loss, Val Acc, Epoch Time\n")

best_acc = 0
for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(
        # Main translation model
        model,
        optimizer,
        cn_to_en_train_loader
    )
    end_time = timer()
    val_loss, val_acc = evaluate(model, cn_to_en_valid_loader)

    # Log the results
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, Val Acc: {val_acc:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))
    with open(LOG_FILE_PATH, 'a') as log_file:
        log_file.write(f"{epoch}, {train_loss:.3f}, {val_loss:.3f}, {val_acc:.3f}, {(end_time - start_time):.3f}\n")

    # Save the best model so far.
    if val_acc > best_acc - SAVE_TOLERANCE:
        best_acc = val_acc if val_acc > best_acc else best_acc
        best_state_dict = model.state_dict()
        torch.save(best_state_dict, MODEL_SAVE_PATH)
        print("(model saved)")

    if scheduler:
        scheduler.step(val_loss)

Start training...
Model training on device: cuda


100%|██████████| 5938/5938 [00:41<00:00, 144.07it/s]
100%|██████████| 313/313 [00:02<00:00, 113.35it/s]


Epoch: 1, Train loss: 4.121, Val loss: 3.724, Val Acc: 0.227, Epoch time = 41.370s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 144.06it/s]
100%|██████████| 313/313 [00:02<00:00, 112.25it/s]


Epoch: 2, Train loss: 3.179, Val loss: 3.250, Val Acc: 0.301, Epoch time = 41.363s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 144.06it/s]
100%|██████████| 313/313 [00:02<00:00, 111.76it/s]


Epoch: 3, Train loss: 2.738, Val loss: 3.052, Val Acc: 0.338, Epoch time = 41.485s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.79it/s]
100%|██████████| 313/313 [00:02<00:00, 111.52it/s]


Epoch: 4, Train loss: 2.458, Val loss: 2.884, Val Acc: 0.357, Epoch time = 41.439s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.79it/s]
100%|██████████| 313/313 [00:02<00:00, 111.49it/s]


Epoch: 5, Train loss: 2.253, Val loss: 2.788, Val Acc: 0.378, Epoch time = 41.440s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.77it/s]
100%|██████████| 313/313 [00:02<00:00, 111.28it/s]


Epoch: 6, Train loss: 2.089, Val loss: 2.736, Val Acc: 0.392, Epoch time = 41.443s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.84it/s]
100%|██████████| 313/313 [00:02<00:00, 111.22it/s]


Epoch: 7, Train loss: 1.956, Val loss: 2.686, Val Acc: 0.401, Epoch time = 41.425s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.47it/s]
100%|██████████| 313/313 [00:02<00:00, 110.81it/s]


Epoch: 8, Train loss: 1.840, Val loss: 2.647, Val Acc: 0.405, Epoch time = 41.532s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.82it/s]
100%|██████████| 313/313 [00:02<00:00, 111.16it/s]


Epoch: 9, Train loss: 1.741, Val loss: 2.642, Val Acc: 0.406, Epoch time = 41.432s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.68it/s]
100%|██████████| 313/313 [00:02<00:00, 110.83it/s]


Epoch: 10, Train loss: 1.656, Val loss: 2.627, Val Acc: 0.412, Epoch time = 41.472s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.64it/s]
100%|██████████| 313/313 [00:02<00:00, 110.65it/s]


Epoch: 11, Train loss: 1.580, Val loss: 2.646, Val Acc: 0.418, Epoch time = 41.485s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.62it/s]
100%|██████████| 313/313 [00:02<00:00, 110.22it/s]


Epoch: 12, Train loss: 1.508, Val loss: 2.673, Val Acc: 0.419, Epoch time = 41.488s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.74it/s]
100%|██████████| 313/313 [00:02<00:00, 110.24it/s]


Epoch: 13, Train loss: 1.449, Val loss: 2.685, Val Acc: 0.422, Epoch time = 41.584s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.79it/s]
100%|██████████| 313/313 [00:02<00:00, 110.51it/s]


Epoch: 14, Train loss: 1.392, Val loss: 2.708, Val Acc: 0.422, Epoch time = 41.437s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.75it/s]
100%|██████████| 313/313 [00:02<00:00, 110.73it/s]


Epoch: 15, Train loss: 1.340, Val loss: 2.722, Val Acc: 0.423, Epoch time = 41.452s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.71it/s]
100%|██████████| 313/313 [00:02<00:00, 110.66it/s]


Epoch: 16, Train loss: 1.291, Val loss: 2.743, Val Acc: 0.430, Epoch time = 41.464s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.74it/s]
100%|██████████| 313/313 [00:02<00:00, 110.65it/s]


Epoch: 17, Train loss: 1.120, Val loss: 2.694, Val Acc: 0.431, Epoch time = 41.454s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.74it/s]
100%|██████████| 313/313 [00:02<00:00, 110.66it/s]


Epoch: 18, Train loss: 1.054, Val loss: 2.707, Val Acc: 0.440, Epoch time = 41.454s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.84it/s]
100%|██████████| 313/313 [00:02<00:00, 110.44it/s]


Epoch: 19, Train loss: 1.020, Val loss: 2.732, Val Acc: 0.442, Epoch time = 41.427s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.73it/s]
100%|██████████| 313/313 [00:02<00:00, 110.20it/s]


Epoch: 20, Train loss: 0.989, Val loss: 2.751, Val Acc: 0.440, Epoch time = 41.458s


100%|██████████| 5938/5938 [00:41<00:00, 143.84it/s]
100%|██████████| 313/313 [00:02<00:00, 110.89it/s]


Epoch: 21, Train loss: 0.960, Val loss: 2.775, Val Acc: 0.438, Epoch time = 41.553s


100%|██████████| 5938/5938 [00:41<00:00, 143.89it/s]
100%|██████████| 313/313 [00:02<00:00, 110.52it/s]


Epoch: 22, Train loss: 0.937, Val loss: 2.815, Val Acc: 0.436, Epoch time = 41.410s


100%|██████████| 5938/5938 [00:41<00:00, 144.04it/s]
100%|██████████| 313/313 [00:02<00:00, 110.40it/s]


Epoch: 23, Train loss: 0.823, Val loss: 2.797, Val Acc: 0.445, Epoch time = 41.366s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.48it/s]
100%|██████████| 313/313 [00:02<00:00, 110.50it/s]


Epoch: 24, Train loss: 0.783, Val loss: 2.812, Val Acc: 0.447, Epoch time = 41.529s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.79it/s]
100%|██████████| 313/313 [00:02<00:00, 110.43it/s]


Epoch: 25, Train loss: 0.758, Val loss: 2.840, Val Acc: 0.447, Epoch time = 41.441s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.55it/s]
100%|██████████| 313/313 [00:02<00:00, 110.67it/s]


Epoch: 26, Train loss: 0.737, Val loss: 2.877, Val Acc: 0.443, Epoch time = 41.510s


100%|██████████| 5938/5938 [00:41<00:00, 143.77it/s]
100%|██████████| 313/313 [00:02<00:00, 110.16it/s]


Epoch: 27, Train loss: 0.718, Val loss: 2.913, Val Acc: 0.446, Epoch time = 41.444s


100%|██████████| 5938/5938 [00:41<00:00, 143.81it/s]
100%|██████████| 313/313 [00:02<00:00, 110.03it/s]


Epoch: 28, Train loss: 0.699, Val loss: 2.897, Val Acc: 0.447, Epoch time = 41.435s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.78it/s]
100%|██████████| 313/313 [00:02<00:00, 110.15it/s]


Epoch: 29, Train loss: 0.628, Val loss: 2.899, Val Acc: 0.451, Epoch time = 41.444s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.65it/s]
100%|██████████| 313/313 [00:02<00:00, 110.10it/s]


Epoch: 30, Train loss: 0.601, Val loss: 2.929, Val Acc: 0.452, Epoch time = 41.607s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.88it/s]
100%|██████████| 313/313 [00:02<00:00, 110.30it/s]


Epoch: 31, Train loss: 0.587, Val loss: 2.967, Val Acc: 0.453, Epoch time = 41.414s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.37it/s]
100%|██████████| 313/313 [00:02<00:00, 110.02it/s]


Epoch: 32, Train loss: 0.570, Val loss: 3.004, Val Acc: 0.453, Epoch time = 41.562s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.84it/s]
100%|██████████| 313/313 [00:02<00:00, 110.33it/s]


Epoch: 33, Train loss: 0.556, Val loss: 2.985, Val Acc: 0.448, Epoch time = 41.427s


100%|██████████| 5938/5938 [00:41<00:00, 143.62it/s]
100%|██████████| 313/313 [00:02<00:00, 110.62it/s]


Epoch: 34, Train loss: 0.546, Val loss: 3.022, Val Acc: 0.453, Epoch time = 41.485s


100%|██████████| 5938/5938 [00:41<00:00, 143.70it/s]
100%|██████████| 313/313 [00:02<00:00, 110.26it/s]


Epoch: 35, Train loss: 0.498, Val loss: 3.025, Val Acc: 0.457, Epoch time = 41.464s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.85it/s]
100%|██████████| 313/313 [00:02<00:00, 110.34it/s]


Epoch: 36, Train loss: 0.481, Val loss: 3.039, Val Acc: 0.455, Epoch time = 41.424s


100%|██████████| 5938/5938 [00:41<00:00, 143.88it/s]
100%|██████████| 313/313 [00:02<00:00, 109.93it/s]


Epoch: 37, Train loss: 0.470, Val loss: 3.069, Val Acc: 0.455, Epoch time = 41.413s


100%|██████████| 5938/5938 [00:41<00:00, 143.94it/s]
100%|██████████| 313/313 [00:02<00:00, 110.23it/s]


Epoch: 38, Train loss: 0.458, Val loss: 3.100, Val Acc: 0.453, Epoch time = 41.394s


100%|██████████| 5938/5938 [00:41<00:00, 143.70it/s]
100%|██████████| 313/313 [00:02<00:00, 110.25it/s]


Epoch: 39, Train loss: 0.450, Val loss: 3.078, Val Acc: 0.456, Epoch time = 41.595s


100%|██████████| 5938/5938 [00:41<00:00, 143.88it/s]
100%|██████████| 313/313 [00:02<00:00, 110.65it/s]


Epoch: 40, Train loss: 0.443, Val loss: 3.109, Val Acc: 0.452, Epoch time = 41.413s


100%|██████████| 5938/5938 [00:41<00:00, 143.86it/s]
100%|██████████| 313/313 [00:02<00:00, 110.36it/s]


Epoch: 41, Train loss: 0.411, Val loss: 3.101, Val Acc: 0.457, Epoch time = 41.418s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.52it/s]
100%|██████████| 313/313 [00:02<00:00, 110.32it/s]


Epoch: 42, Train loss: 0.399, Val loss: 3.143, Val Acc: 0.453, Epoch time = 41.517s


100%|██████████| 5938/5938 [00:41<00:00, 144.01it/s]
100%|██████████| 313/313 [00:02<00:00, 110.07it/s]


Epoch: 43, Train loss: 0.392, Val loss: 3.167, Val Acc: 0.457, Epoch time = 41.374s


100%|██████████| 5938/5938 [00:41<00:00, 143.69it/s]
100%|██████████| 313/313 [00:02<00:00, 110.15it/s]


Epoch: 44, Train loss: 0.386, Val loss: 3.162, Val Acc: 0.456, Epoch time = 41.466s


100%|██████████| 5938/5938 [00:41<00:00, 143.84it/s]
100%|██████████| 313/313 [00:02<00:00, 109.77it/s]


Epoch: 45, Train loss: 0.378, Val loss: 3.168, Val Acc: 0.457, Epoch time = 41.422s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.75it/s]
100%|██████████| 313/313 [00:02<00:00, 109.57it/s]


Epoch: 46, Train loss: 0.374, Val loss: 3.180, Val Acc: 0.455, Epoch time = 41.450s


100%|██████████| 5938/5938 [00:41<00:00, 144.05it/s]
100%|██████████| 313/313 [00:02<00:00, 110.29it/s]


Epoch: 47, Train loss: 0.353, Val loss: 3.197, Val Acc: 0.459, Epoch time = 41.366s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.90it/s]
100%|██████████| 313/313 [00:02<00:00, 110.84it/s]


Epoch: 48, Train loss: 0.347, Val loss: 3.225, Val Acc: 0.457, Epoch time = 41.538s


100%|██████████| 5938/5938 [00:41<00:00, 143.91it/s]
100%|██████████| 313/313 [00:02<00:00, 110.55it/s]


Epoch: 49, Train loss: 0.338, Val loss: 3.224, Val Acc: 0.455, Epoch time = 41.404s


100%|██████████| 5938/5938 [00:41<00:00, 143.91it/s]
100%|██████████| 313/313 [00:02<00:00, 110.56it/s]


Epoch: 50, Train loss: 0.335, Val loss: 3.249, Val Acc: 0.458, Epoch time = 41.402s


100%|██████████| 5938/5938 [00:41<00:00, 143.71it/s]
100%|██████████| 313/313 [00:02<00:00, 110.62it/s]


Epoch: 51, Train loss: 0.331, Val loss: 3.240, Val Acc: 0.458, Epoch time = 41.459s


100%|██████████| 5938/5938 [00:41<00:00, 143.81it/s]
100%|██████████| 313/313 [00:02<00:00, 110.12it/s]


Epoch: 52, Train loss: 0.325, Val loss: 3.233, Val Acc: 0.456, Epoch time = 41.433s


100%|██████████| 5938/5938 [00:41<00:00, 143.91it/s]
100%|██████████| 313/313 [00:02<00:00, 110.00it/s]


Epoch: 53, Train loss: 0.314, Val loss: 3.257, Val Acc: 0.457, Epoch time = 41.402s


100%|██████████| 5938/5938 [00:41<00:00, 143.86it/s]
100%|██████████| 313/313 [00:02<00:00, 109.95it/s]


Epoch: 54, Train loss: 0.307, Val loss: 3.272, Val Acc: 0.457, Epoch time = 41.418s


100%|██████████| 5938/5938 [00:41<00:00, 144.04it/s]
100%|██████████| 313/313 [00:02<00:00, 109.86it/s]


Epoch: 55, Train loss: 0.304, Val loss: 3.279, Val Acc: 0.458, Epoch time = 41.365s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.70it/s]
100%|██████████| 313/313 [00:02<00:00, 109.69it/s]


Epoch: 56, Train loss: 0.300, Val loss: 3.292, Val Acc: 0.457, Epoch time = 41.464s


100%|██████████| 5938/5938 [00:41<00:00, 143.79it/s]
100%|██████████| 313/313 [00:02<00:00, 109.71it/s]


Epoch: 57, Train loss: 0.297, Val loss: 3.313, Val Acc: 0.457, Epoch time = 41.437s


100%|██████████| 5938/5938 [00:41<00:00, 143.67it/s]
100%|██████████| 313/313 [00:02<00:00, 110.02it/s]


Epoch: 58, Train loss: 0.293, Val loss: 3.303, Val Acc: 0.457, Epoch time = 41.601s


100%|██████████| 5938/5938 [00:41<00:00, 143.75it/s]
100%|██████████| 313/313 [00:02<00:00, 109.87it/s]


Epoch: 59, Train loss: 0.287, Val loss: 3.291, Val Acc: 0.458, Epoch time = 41.450s


100%|██████████| 5938/5938 [00:41<00:00, 144.06it/s]
100%|██████████| 313/313 [00:02<00:00, 110.43it/s]


Epoch: 60, Train loss: 0.284, Val loss: 3.320, Val Acc: 0.459, Epoch time = 41.361s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.89it/s]
100%|██████████| 313/313 [00:02<00:00, 110.24it/s]


Epoch: 61, Train loss: 0.280, Val loss: 3.305, Val Acc: 0.459, Epoch time = 41.410s


100%|██████████| 5938/5938 [00:41<00:00, 143.84it/s]
100%|██████████| 313/313 [00:02<00:00, 110.23it/s]


Epoch: 62, Train loss: 0.277, Val loss: 3.299, Val Acc: 0.458, Epoch time = 41.427s


100%|██████████| 5938/5938 [00:41<00:00, 143.63it/s]
100%|██████████| 313/313 [00:02<00:00, 110.27it/s]


Epoch: 63, Train loss: 0.275, Val loss: 3.333, Val Acc: 0.460, Epoch time = 41.484s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.77it/s]
100%|██████████| 313/313 [00:02<00:00, 110.14it/s]


Epoch: 64, Train loss: 0.273, Val loss: 3.324, Val Acc: 0.458, Epoch time = 41.446s


100%|██████████| 5938/5938 [00:41<00:00, 143.81it/s]
100%|██████████| 313/313 [00:02<00:00, 110.00it/s]


Epoch: 65, Train loss: 0.269, Val loss: 3.324, Val Acc: 0.460, Epoch time = 41.435s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.62it/s]
100%|██████████| 313/313 [00:02<00:00, 105.39it/s]


Epoch: 66, Train loss: 0.266, Val loss: 3.330, Val Acc: 0.460, Epoch time = 41.488s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.73it/s]
100%|██████████| 313/313 [00:02<00:00, 110.56it/s]


Epoch: 67, Train loss: 0.264, Val loss: 3.312, Val Acc: 0.459, Epoch time = 41.456s


100%|██████████| 5938/5938 [00:41<00:00, 143.86it/s]
100%|██████████| 313/313 [00:02<00:00, 110.11it/s]


Epoch: 68, Train loss: 0.260, Val loss: 3.365, Val Acc: 0.458, Epoch time = 41.417s


100%|██████████| 5938/5938 [00:41<00:00, 143.78it/s]
100%|██████████| 313/313 [00:02<00:00, 110.16it/s]


Epoch: 69, Train loss: 0.259, Val loss: 3.355, Val Acc: 0.460, Epoch time = 41.441s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.89it/s]
100%|██████████| 313/313 [00:02<00:00, 110.31it/s]


Epoch: 70, Train loss: 0.259, Val loss: 3.345, Val Acc: 0.459, Epoch time = 41.409s


100%|██████████| 5938/5938 [00:41<00:00, 143.65it/s]
100%|██████████| 313/313 [00:02<00:00, 110.18it/s]


Epoch: 71, Train loss: 0.257, Val loss: 3.362, Val Acc: 0.460, Epoch time = 41.477s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.70it/s]
100%|██████████| 313/313 [00:02<00:00, 110.36it/s]


Epoch: 72, Train loss: 0.254, Val loss: 3.349, Val Acc: 0.458, Epoch time = 41.465s


100%|██████████| 5938/5938 [00:44<00:00, 134.35it/s]
100%|██████████| 313/313 [00:02<00:00, 107.87it/s]


Epoch: 73, Train loss: 0.252, Val loss: 3.355, Val Acc: 0.459, Epoch time = 44.351s


100%|██████████| 5938/5938 [00:41<00:00, 143.77it/s]
100%|██████████| 313/313 [00:02<00:00, 109.25it/s]


Epoch: 74, Train loss: 0.252, Val loss: 3.358, Val Acc: 0.458, Epoch time = 41.447s


100%|██████████| 5938/5938 [00:41<00:00, 143.49it/s]
100%|██████████| 313/313 [00:02<00:00, 110.42it/s]


Epoch: 75, Train loss: 0.249, Val loss: 3.357, Val Acc: 0.458, Epoch time = 41.658s


100%|██████████| 5938/5938 [00:41<00:00, 143.61it/s]
100%|██████████| 313/313 [00:02<00:00, 110.29it/s]


Epoch: 76, Train loss: 0.248, Val loss: 3.384, Val Acc: 0.458, Epoch time = 41.490s


100%|██████████| 5938/5938 [00:41<00:00, 143.67it/s]
100%|██████████| 313/313 [00:02<00:00, 110.33it/s]


Epoch: 77, Train loss: 0.248, Val loss: 3.383, Val Acc: 0.460, Epoch time = 41.475s
(model saved)


100%|██████████| 5938/5938 [00:41<00:00, 143.83it/s]
100%|██████████| 313/313 [00:02<00:00, 110.39it/s]


Epoch: 78, Train loss: 0.245, Val loss: 3.381, Val Acc: 0.459, Epoch time = 41.427s


100%|██████████| 5938/5938 [00:41<00:00, 143.18it/s]
100%|██████████| 313/313 [00:02<00:00, 110.13it/s]


Epoch: 79, Train loss: 0.244, Val loss: 3.384, Val Acc: 0.460, Epoch time = 41.614s


100%|██████████| 5938/5938 [00:41<00:00, 143.57it/s]
100%|██████████| 313/313 [00:02<00:00, 110.02it/s]

Epoch: 80, Train loss: 0.243, Val loss: 3.380, Val Acc: 0.459, Epoch time = 41.502s





# (5) Inference

In [1]:
from utils import *
from network import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer_en = tokenizer_english()
tokenizer_cn = tokenizer_chinese()

## Load best model

In [3]:
model = load_model(MODEL_PATH="model.ckpt")
model = model.to(DEVICE)

## Translation testing
 - **TO-DO**: Finish the "translate" function in "network.py"
   - You can first write code here for convenience, but note that <span style='color:red'>**TA will test your model using "translate" function in "network.py"**</span>

In [4]:
sentence = "你好，欢迎来到中国。"
ground_truth = 'Hello, welcome to China.'
predicted = translate(model, sentence, tokenizer_cn, tokenizer_en)

print(f'{"Input":15s}: {sentence}')
print(f'{"Prediction":15s}: {predicted}')
print(f'{"Ground truth":15s}: {ground_truth}')
print("Bleu Score (1-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 1).item())
print("Bleu Score (2-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 2).item())
print("Bleu Score (3-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 3).item())
print("Bleu Score (4-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 4).item())

Input          : 你好，欢迎来到中国。
Prediction     : You are a good reputation for China.
Ground truth   : Hello, welcome to China.
Bleu Score (1-gram):  0.1428571492433548
Bleu Score (2-gram):  0.0
Bleu Score (3-gram):  0.0
Bleu Score (4-gram):  0.0


In [5]:
sentence = "她知道您的電話號碼嗎?"
ground_truth = 'Does she know your telephone number?'
predicted = translate(model, sentence, tokenizer_cn, tokenizer_en)

print(f'{"Input":15s}: {sentence}')
print(f'{"Prediction":15s}: {predicted}')
print(f'{"Ground truth":15s}: {ground_truth}')
print("Bleu Score (1-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 1).item())
print("Bleu Score (2-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 2).item())
print("Bleu Score (3-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 3).item())
print("Bleu Score (4-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 4).item())

Input          : 她知道您的電話號碼嗎?
Prediction     : Does she know your telephone number?
Ground truth   : Does she know your telephone number?
Bleu Score (1-gram):  1.0
Bleu Score (2-gram):  1.0
Bleu Score (3-gram):  1.0
Bleu Score (4-gram):  1.0


In [None]:
sentence = "你现在在哪里工作?"
ground_truth = 'Where do you work now?'
predicted = translate(model, sentence, tokenizer_cn, tokenizer_en)

print(f'{"Input":15s}: {sentence}')
print(f'{"Prediction":15s}: {predicted}')
print(f'{"Ground truth":15s}: {ground_truth}')
print("Bleu Score (1-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 1).item())
print("Bleu Score (2-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 2).item())
print("Bleu Score (3-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 3).item())
print("Bleu Score (4-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 4).item())

Input:         : 你现在在哪里工作?
Prediction     : Where do you work right now?
Ground truth   : Where do you work now?
Bleu Score (1-gram):  0.8333333134651184
Bleu Score (2-gram):  0.7071067690849304
Bleu Score (3-gram):  0.6299605369567871
Bleu Score (4-gram):  0.5372849702835083


In [12]:
sentence = "歡迎來到台灣。"
ground_truth = "Welcome to Taiwan."
predicted = translate(model, sentence, tokenizer_cn, tokenizer_en)

print(f'{"Input":15s}: {sentence}')
print(f'{"Prediction":15s}: {predicted}')
print(f'{"Ground truth":15s}: {ground_truth}')
print("Bleu Score (1-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 1).item())
print("Bleu Score (2-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 2).item())
print("Bleu Score (3-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 3).item())
print("Bleu Score (4-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 4).item())

Input          : 歡迎來到台灣。
Prediction     : Welcome to Taiwan.
Ground truth   : Welcome to Taiwan.
Bleu Score (1-gram):  1.0
Bleu Score (2-gram):  1.0
Bleu Score (3-gram):  1.0
Bleu Score (4-gram):  0.0


In [13]:
sentence = "你好，歡迎來到台灣。"
ground_truth = "Hello, welcome to Taiwan."
predicted = translate(model, sentence, tokenizer_cn, tokenizer_en)

print(f'{"Input":15s}: {sentence}')
print(f'{"Prediction":15s}: {predicted}')
print(f'{"Ground truth":15s}: {ground_truth}')
print("Bleu Score (1-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 1).item())
print("Bleu Score (2-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 2).item())
print("Bleu Score (3-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 3).item())
print("Bleu Score (4-gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 4).item())

Input          : 你好，歡迎來到台灣。
Prediction     : You are competent studying in Taiwan.
Ground truth   : Hello, welcome to Taiwan.
Bleu Score (1-gram):  0.1666666567325592
Bleu Score (2-gram):  0.0
Bleu Score (3-gram):  0.0
Bleu Score (4-gram):  0.0
