Drive Mount
----------------------------------------

In [2]:
# Mount Google Drive at /content/drive; the corpus, tokenizer and checkpoint
# paths used later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%matplotlib inline

Sentencepiece
----------------------------






In [4]:
# Sentencepiece
! pip install sentencepiece
! apt-get install cmake build-essential pkg-config libgoogle-perftools-dev
! git clone https://github.com/google/sentencepiece.git 
! cd sentencepiece
! mkdir build
! cd build
! cmake ..
! make -j $(nproc)
! make install
! sudo ldconfig -v

%cd /content/drive/MyDrive/translation-model

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 20.5 MB/s eta 0:00:01[K     |▌                               | 20 kB 26.6 MB/s eta 0:00:01[K     |▉                               | 30 kB 31.0 MB/s eta 0:00:01[K     |█                               | 40 kB 23.6 MB/s eta 0:00:01[K     |█▍                              | 51 kB 20.1 MB/s eta 0:00:01[K     |█▋                              | 61 kB 14.8 MB/s eta 0:00:01[K     |██                              | 71 kB 15.1 MB/s eta 0:00:01[K     |██▏                             | 81 kB 16.6 MB/s eta 0:00:01[K     |██▍                             | 92 kB 16.8 MB/s eta 0:00:01[K     |██▊                             | 102 kB 16.0 MB/s eta 0:00:01[K     |███                             | 112 kB 16.0 MB/s eta 0:00:01[K     |███▎                            | 122 kB 16.0 MB/s eta 0:00:01[K     |██

In [5]:
from torchtext.data.functional import sentencepiece_tokenizer, load_sp_model
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import Multi30k
from typing import Iterable, List


# Sanity check: make sure the trained SentencePiece tokenizer splits sample sentences as expected
_check_sp_model = load_sp_model('chungcheong_vocab.model')
tokenizer = sentencepiece_tokenizer(_check_sp_model)
token = [
    '뭔가 쫌 달라보일 수도 있지만',
    '긍까 롤과 비슷한 에이오에스 장르의',
]
list(tokenizer(token))

[['▁뭔가', '▁쫌', '▁달라', '보', '일', '▁수도', '▁있지만'],
 ['▁긍까', '▁롤', '과', '▁비슷한', '▁에이', '오', '에스', '▁장르의']]

In [6]:
import sentencepiece as spm
from torchtext.data.functional import sentencepiece_tokenizer, load_sp_model, sentencepiece_numericalizer
import os

# Project root on Drive; the SentencePiece models are looked up directly under it
dir_path = "/content/drive/MyDrive/translation-model"

# Raw text corpora used to train the per-dialect (and standard Korean) tokenizers
_data_root = f"{dir_path}/data"
jeju_corpus = f"{_data_root}/jeju/Jeju_dataset.txt"
jeolla_corpus = f"{_data_root}/jeolla/Jeolla_dataset.txt"
gs_corpus = f"{_data_root}/gyeongsang/Gyeongsang_dataset.txt"
ch_corpus = f"{_data_root}/chungcheong/Chungcheong_dataset.txt"
gw_corpus = f"{_data_root}/gangwon/Gangwon_dataset.txt"
korean_corpus = f"{_data_root}/jeju/Korean_dataset.txt"
# Indexed by lan_num: same order as LANGUAGES / PREFIXS below
corpus_path = [jeju_corpus, gs_corpus, jeolla_corpus, ch_corpus, gw_corpus]

# Sentencepiece tokenizer model name
(jeju_prefix,
 jeolla_prefix,
 gs_prefix,
 ch_prefix,
 gw_prefix,
 korean_prefix) = ("jeju_vocab",
                   "jeolla_vocab",
                   "gyeongsang_vocab",
                   "chungcheong_vocab",
                   "gangwon_vocab",
                   "korean_vocab")

# Selects which dialect is used. Every later cell depends on this value; change it
# to load the model of a different dialect.
# 0 = Jeju, 1 = Gyeongsang, 2 = Jeolla, 3 = Chungcheong, 4 = Gangwon
lan_num = 0

LANGUAGES = ['Jeju', 'Gyeongsang', 'Jeolla', 'Chungcheong', 'Gangwon']
PREFIXS = ['jeju', 'gyeongsang', 'jeolla', 'chungcheong', 'gangwon']
TGT_LANGUAGE = 'Ko'
SRC_LANGUAGE = LANGUAGES[lan_num]
vocab_size = 32000

# Make sure a SentencePiece model exists for the selected dialect and for standard
# Korean, training whichever one is missing.
def _train_sentencepiece_if_missing(corpus, prefix):
  """Train a unigram SentencePiece model unless `{dir_path}/{prefix}.model` exists.

  Args:
    corpus: path of the plain-text training corpus passed as `--input`.
    prefix: value passed as `--model_prefix`; the trainer writes `{prefix}.model`
      and `{prefix}.vocab` relative to the current working directory.

  The special-token ids passed here must stay in sync with
  UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3 used by the model code.
  """
  if os.path.isfile(f"{dir_path}/{prefix}.model"):
    return
  spm.SentencePieceTrainer.train(
      f"--input={corpus} --model_prefix={prefix} --vocab_size={vocab_size}"
      " --character_coverage=0.9995"
      " --model_type=unigram"
      # NOTE(review): SentencePiece measures this limit in bytes, not characters;
      # longer lines are skipped during tokenizer training. Confirm 128 is intended.
      " --max_sentence_length=128"
      " --pad_id=1"   # padding          -> id 1
      " --unk_id=0"   # unknown          -> id 0
      " --bos_id=2"   # begin of sequence -> id 2
      " --eos_id=3")  # end of sequence   -> id 3


_train_sentencepiece_if_missing(corpus_path[lan_num], f"{PREFIXS[lan_num]}_vocab")
_train_sentencepiece_if_missing(korean_corpus, korean_prefix)

# SentencePiece model file backing each side of the translation pair
_sp_model_files = {
    SRC_LANGUAGE: f"{PREFIXS[lan_num]}_vocab.model",
    TGT_LANGUAGE: "korean_vocab.model",
}

# language -> callable turning sentences into sub-word pieces
token_transform = {
    language: sentencepiece_tokenizer(load_sp_model(model_file))
    for language, model_file in _sp_model_files.items()
}

# language -> callable turning sentences into sub-word ids
vocab_transform = {
    language: sentencepiece_numericalizer(load_sp_model(model_file))
    for language, model_file in _sp_model_files.items()
}

# Special symbols, listed in the order of their ids (same ids the tokenizers were trained with)
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = range(4)

Seq2Seq Network using Transformer
---------------------------------




In [7]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings, then apply dropout.

    Args:
        emb_size: embedding width; both even and odd sizes are supported.
        dropout: dropout probability applied to the summed embedding.
        maxlen: number of positions precomputed; longer inputs are not supported.
    """

    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super().__init__()
        # One frequency per even column: 10000^(-2i / emb_size)
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        # For odd emb_size there is one fewer odd column than even columns, so
        # drop the surplus frequency (no-op when emb_size is even).
        pos_embedding[:, 1::2] = torch.cos(pos * den[: emb_size // 2])
        # (maxlen, emb_size) -> (maxlen, 1, emb_size) so it broadcasts over dim 1 of the input
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        # Buffer: follows .to(device) / state_dict but is not a trainable parameter
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        """Return dropout(token_embedding + encoding of the first token_embedding.size(0) positions)."""
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module mapping a tensor of token ids to the tensor of their (scaled) embeddings
class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is multiplied by sqrt(emb_size)."""

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.emb_size = emb_size
        self.embedding = nn.Embedding(vocab_size, emb_size)

    def forward(self, tokens: Tensor):
        """Look up `tokens` (cast to int64) and scale the vectors by sqrt(emb_size)."""
        scale = math.sqrt(self.emb_size)
        looked_up = self.embedding(tokens.long())
        return looked_up * scale

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder Transformer with token embeddings, positional encoding and an output projection.

    `generator` maps decoder states of width `emb_size` to `tgt_vocab_size` logits.
    """

    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super().__init__()
        # Submodules are created in a fixed order so that seeded initialisation stays reproducible.
        self.transformer = Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        """Run the full encoder-decoder pass and return vocabulary logits for every target position."""
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        hidden = self.transformer(
            src_emb,
            tgt_emb,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=None,  # no extra mask on encoder-decoder attention
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.generator(hidden)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Embed `src` and run only the encoder stack (no padding mask is applied here)."""
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        return self.transformer.encoder(src_emb, src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Embed `tgt` and run only the decoder stack against the encoder output `memory`."""
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(tgt))
        return self.transformer.decoder(tgt_emb, memory, tgt_mask)

In [8]:
def generate_square_subsequent_mask(sz):
    """Return an (sz, sz) float mask on DEVICE: 0.0 on and below the diagonal, -inf above it.

    Added to attention scores, it stops position i from attending to positions > i.
    """
    blocked = torch.full((sz, sz), float('-inf'), device=DEVICE)
    # triu(diagonal=1) keeps -inf strictly above the diagonal and zeroes everything else
    return torch.triu(blocked, diagonal=1)


def create_mask(src, tgt):
    """Build the four masks the model needs for one batch.

    `src` / `tgt` are indexed as (sequence, batch): dim 0 is used as the sequence length.

    Returns:
        (src_mask, tgt_mask, src_padding_mask, tgt_padding_mask) where
        src_mask is an all-False boolean square matrix (nothing masked),
        tgt_mask is the causal mask from generate_square_subsequent_mask, and the
        padding masks are True where the token equals PAD_IDX, transposed so that
        dim 0 and dim 1 are swapped relative to the inputs.
    """
    src_seq_len, tgt_seq_len = src.shape[0], tgt.shape[0]

    src_mask = torch.zeros((src_seq_len, src_seq_len), dtype=torch.bool, device=DEVICE)
    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)

    src_padding_mask = src.eq(PAD_IDX).transpose(0, 1)
    tgt_padding_mask = tgt.eq(PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

# Parameters For Model


In [9]:
# Seed first: everything below that draws random numbers (weight init) depends on it
torch.manual_seed(5)

# Model / batching hyper-parameters
SRC_VOCAB_SIZE = 32000
TGT_VOCAB_SIZE = 32000
EMB_SIZE = 128
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(
    num_encoder_layers=NUM_ENCODER_LAYERS,
    num_decoder_layers=NUM_DECODER_LAYERS,
    emb_size=EMB_SIZE,
    nhead=NHEAD,
    src_vocab_size=SRC_VOCAB_SIZE,
    tgt_vocab_size=TGT_VOCAB_SIZE,
    dim_feedforward=FFN_HID_DIM,
)

# Xavier-initialise every weight matrix; 1-D parameters (biases, norms) keep their defaults
for p in transformer.parameters():
    if p.dim() <= 1:
        continue
    nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

# Padding positions do not contribute to the loss
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# Optimizer
optimizer = torch.optim.Adam(
    transformer.parameters(),
    lr=0.001,
    betas=(0.9, 0.98),
    eps=1e-9,
)

# Scheduler: multiplies the base learning rate by 0.95 ** epoch
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer=optimizer,
    lr_lambda=lambda epoch: 0.95**epoch,
    last_epoch=-1,
    verbose=False,
)

Collation
---------




In [10]:
from torch.nn.utils.rnn import pad_sequence

# helper that chains several single-argument transforms into one callable
def sequential_transforms(*transforms):
    """Return a function that applies `transforms` left to right to its input.

    With no transforms the returned function is the identity.
    """
    def func(txt_input):
        result = txt_input
        for step in transforms:
            result = step(result)
        return result
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    """Return a 1-D int64 tensor `[BOS_IDX, *token_ids, EOS_IDX]`.

    The dtype is forced to int64 for every part: `torch.tensor([])` would otherwise
    be float32, so an empty `token_ids` (e.g. an empty sentence) would produce a
    mixed-dtype concatenation.
    """
    return torch.cat((torch.tensor([BOS_IDX], dtype=torch.long),
                      torch.tensor(token_ids, dtype=torch.long),
                      torch.tensor([EOS_IDX], dtype=torch.long)))
    
def make_flat_list(t):
  """Flatten one level of nesting and convert every element to `int`."""
  flat = []
  for sublist in t:
    flat.extend(int(item) for item in sublist)
  return flat

# Per-language pipeline converting a list of raw strings into one tensor of token ids
text_transform = {
    ln: sequential_transforms(
        vocab_transform[ln],  # Numericalization: sentences -> lists of sub-word ids
        make_flat_list,       # Flatten the per-sentence lists into a single list of ints
        tensor_transform,     # Add BOS/EOS and create tensor
    )
    for ln in [SRC_LANGUAGE, TGT_LANGUAGE]
}


# function to collate data samples into batch tensors
def collate_fn(batch):
    """Turn a list of (source, target) string pairs into two padded id tensors.

    Trailing newlines are stripped, each sentence is run through its language's
    text_transform, and the resulting sequences are padded with PAD_IDX by
    pad_sequence (default layout, i.e. sequence dimension first).
    """
    encode_src = text_transform[SRC_LANGUAGE]
    encode_tgt = text_transform[TGT_LANGUAGE]

    encoded_pairs = [
        (encode_src([src_sample.rstrip("\n")]), encode_tgt([tgt_sample.rstrip("\n")]))
        for src_sample, tgt_sample in batch
    ]
    src_batch = pad_sequence([pair[0] for pair in encoded_pairs], padding_value=PAD_IDX)
    tgt_batch = pad_sequence([pair[1] for pair in encoded_pairs], padding_value=PAD_IDX)
    return src_batch, tgt_batch

CSV 파일을 읽어 torch `Dataset` 형태로 제공하는 클래스를 정의한다.





In [11]:
from torch.utils.data import Dataset
import pandas as pd


class Datasetfromcsv(Dataset):
  """Map-style dataset of (source, target) sentence pairs read from a header-less CSV.

  The file is loaded with column names 'Original' and 'Expected' and every value
  is converted to `str`.
  """

  def __init__(self, csv_file):
    """Load `csv_file` (UTF-8) fully into memory."""
    self.text_data = pd.read_csv(csv_file,encoding='utf8',names=['Original','Expected']).astype(str)

  def __len__(self):
    """Number of sentence pairs."""
    return len(self.text_data)

  def __getitem__(self, idx):
    """Return the `(src, tgt)` strings at position `idx` (int or tensor index)."""
    if torch.is_tensor(idx):
      # Tensors expose tolist(), and the converted value must replace idx
      # before it can be used for positional indexing.
      idx = idx.tolist()

    src = self.text_data.iloc[idx, 0]
    tgt = self.text_data.iloc[idx, 1]
    return src, tgt

In [12]:
# Clear GPU memory: collect unreachable Python objects first, then ask PyTorch to
# release the cached CUDA blocks it is no longer using.
import gc
gc.collect()
torch.cuda.empty_cache()

In [18]:
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from torchtext.data.metrics import bleu_score
from itertools import chain

# SentencePiece processor for standard Korean, used to turn predicted ids back into text
_korean_sp_model_path = '/content/drive/MyDrive/translation-model/korean_vocab.model'
decoding = spm.SentencePieceProcessor()
decoding.load(_korean_sp_model_path)

def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Decode one source sequence by always picking the highest-scoring next token.

    Args:
        model: network exposing `encode`, `decode` and `generator`.
        src: source token ids with a batch dimension of 1 in dim 1.
        src_mask: encoder attention mask for `src`.
        max_len: maximum output length, counting the start symbol.
        start_symbol: id the output sequence starts with.

    Returns:
        Tensor of generated ids (start symbol first); stops after EOS_IDX is
        produced or once `max_len` tokens exist.
    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    # The encoder output is constant across decoding steps, so move it once.
    memory = model.encode(src, src_mask).to(DEVICE)
    ys = torch.full((1, 1), start_symbol, dtype=torch.long, device=DEVICE)
    for _ in range(max_len - 1):
        causal_mask = generate_square_subsequent_mask(ys.size(0)).type(torch.bool).to(DEVICE)
        decoded = model.decode(ys, memory, causal_mask).transpose(0, 1)
        # Only the state of the last generated position is needed for the next token
        scores = model.generator(decoded[:, -1])
        next_word = scores.max(dim=1)[1].item()

        appended = torch.full((1, 1), next_word, dtype=src.dtype, device=src.device)
        ys = torch.cat([ys, appended], dim=0)
        if next_word == EOS_IDX:
            break
    return ys

def train_epoch(model, optimizer):
    """Run one pass over `data/<prefix>/trainer.csv` and return the mean batch loss.

    The decoder input is the target without its last position and the training
    target is the target without its first position.
    NOTE(review): the DataLoader iterates the CSV in file order (no shuffling) —
    confirm this is intended.
    """
    model.train()
    train_dataloader = DataLoader(
        Datasetfromcsv(f'data/{PREFIXS[lan_num]}/trainer.csv'),
        batch_size=BATCH_SIZE,
        collate_fn=collate_fn,
    )

    running_loss = 0
    for src, tgt in tqdm(train_dataloader):
        src, tgt = src.to(DEVICE), tgt.to(DEVICE)
        tgt_input, tgt_out = tgt[:-1, :], tgt[1:, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)
        # The source padding mask doubles as the memory key padding mask
        logits = model(src, tgt_input, src_mask, tgt_mask,
                       src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(train_dataloader)

def save_model(model, dir, mode="last"):
    """Write the model's state_dict to `{dir}/{SRC_LANGUAGE}_transformer_model_{mode}.ckpt`."""
    checkpoint_path = f'{dir}/{SRC_LANGUAGE}_transformer_model_{mode}.ckpt'
    weights = model.state_dict()
    torch.save(weights, checkpoint_path)

def load_model(model, dir, mode="last"):
  """Load `{dir}/{SRC_LANGUAGE}_transformer_model_{mode}.ckpt` into `model` if it exists.

  Tensors are mapped onto DEVICE so a checkpoint saved on a GPU runtime also
  loads on a CPU-only runtime. When the file is missing the model is left
  untouched and a warning is emitted instead of failing silently.
  """
  checkpoint_path = f'{dir}/{SRC_LANGUAGE}_transformer_model_{mode}.ckpt'
  if os.path.exists(checkpoint_path):
      model.load_state_dict(torch.load(checkpoint_path, map_location=DEVICE))
  else:
      import warnings
      warnings.warn(f"Checkpoint not found, model weights left unchanged: {checkpoint_path}")

def load_language_model(model, dir, lan, mode='best'):
  """Load `{dir}/{lan}_transformer_model_{mode}.ckpt` into `model` if it exists.

  Tensors are mapped onto DEVICE so a checkpoint saved on a GPU runtime also
  loads on a CPU-only runtime. When the file is missing the model is left
  untouched and a warning is emitted instead of failing silently.
  """
  checkpoint_path = f'{dir}/{lan}_transformer_model_{mode}.ckpt'
  if os.path.exists(checkpoint_path):
    model.load_state_dict(torch.load(checkpoint_path, map_location=DEVICE))
  else:
    import warnings
    warnings.warn(f"Checkpoint not found, model weights left unchanged: {checkpoint_path}")

def translate_for_eval(model: torch.nn.Module, src_sentence: str):
    """Greedy-translate one sentence during evaluation and return the decoded string.

    The model's train/eval mode is not changed here; the caller is expected to
    have set it. Decoding runs under `torch.no_grad()` because no gradients are
    needed, which avoids building an autograd graph for every generated token.
    """
    src = text_transform[SRC_LANGUAGE]([str(src_sentence)]).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = torch.zeros(num_tokens, num_tokens).type(torch.bool)
    with torch.no_grad():
        tgt_tokens = greedy_decode(
            model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    # tolist() on the flattened tensor already yields plain Python ints
    list_id = [int(token_id) for token_id in tgt_tokens.tolist()]

    return str(decoding.DecodeIds(list_id).replace("<bos>", "").replace("<eos>", ""))

def evaluate(model):
    """Compute validation loss and BLEU for the currently selected dialect.

    Returns:
        `(mean_val_loss, bleu)` where the loss is averaged over the batches of
        `data/<prefix>/val.csv` and BLEU is computed on greedy translations of
        the sentences in `data/<prefix>/bleu_score.csv`.

    Everything runs under `torch.no_grad()`: evaluation never back-propagates,
    so tracking gradients only costs memory and time.
    """
    model.eval()
    losses = 0

    val_iter = Datasetfromcsv(f'data/{PREFIXS[lan_num]}/val.csv')
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    bleu_df = pd.read_csv(f'data/{PREFIXS[lan_num]}/bleu_score.csv', encoding='utf8', names=['Original','Expected','is_same']).astype(str)
    inputs = list(bleu_df.loc[:, 'Original'])
    # bleu_score expects a list of reference lists per sentence; here one reference each
    expected = [[reference.split()] for reference in bleu_df.loc[:, 'Expected']]
    outputs = []

    with torch.no_grad():
        for src, tgt in tqdm(val_dataloader):
            src = src.to(DEVICE)
            tgt = tgt.to(DEVICE)

            tgt_input = tgt[:-1, :]

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

            logits = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)

            tgt_out = tgt[1:, :]
            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
            losses += loss.item()

        # BLEU on the sampled sentences. Hypotheses are split exactly like the
        # references (whitespace split) so that repeated spaces do not create
        # empty-string tokens.
        for line in inputs:
            outputs.append(translate_for_eval(model, line).split())

    return ((losses / len(val_dataloader)), bleu_score(outputs, expected))

Now we have all the ingredients to train our model. Let's do it!




In [19]:
from timeit import default_timer as timer
import numpy as np
import os.path

# Number of epochs for the (currently disabled) training loop below
NUM_EPOCHS = 15
# Folder the checkpoints are written to / read from
result_dir = "/content/drive/MyDrive/translation-model/model"

# Intended to hold the lowest validation loss seen so far; +inf so the first epoch always improves on it
best_loss = np.inf

# Only train when no "best" checkpoint exists yet for the selected source language
if not os.path.isfile(f'{result_dir}/{SRC_LANGUAGE}_transformer_model_best.ckpt'):
  print(f"Start Training {SRC_LANGUAGE} model")
  # NOTE: the training loop is commented out, so this branch currently only prints
  # the banner above. When re-enabling it, keep the `best_loss = val_loss` line:
  # without it best_loss stays at +inf and every epoch overwrites the "best" checkpoint.
  # for epoch in tqdm(range(1, NUM_EPOCHS+1)):
  #     start_time = timer()
  #     train_loss = train_epoch(transformer, optimizer)
  #     val_loss, bleu = evaluate(transformer)
  #     end_time = timer()
  #     if best_loss > val_loss:
  #       best_loss = val_loss
  #       save_model(transformer, result_dir, mode='best')
  #     print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, Bleu_score: {bleu*100: .3f},"f"Epoch time = {(end_time - start_time):.3f}s"))
  #     scheduler.step()


# function to generate output sequence using greedy algorithm
# (re-defines the greedy_decode from the previous cell with the same behaviour)
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Generate target ids for one source sequence, taking the arg-max token at every step.

    Starts from `start_symbol`, appends one token per iteration and stops when
    EOS_IDX is produced or `max_len` tokens (including the start symbol) exist.
    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    # Encoder output does not change between steps, so it is moved to DEVICE once
    memory = model.encode(src, src_mask).to(DEVICE)
    ys = torch.full((1, 1), start_symbol, dtype=torch.long, device=DEVICE)
    remaining_steps = max_len - 1
    for _ in range(remaining_steps):
        step_mask = generate_square_subsequent_mask(ys.size(0)).type(torch.bool).to(DEVICE)
        states = model.decode(ys, memory, step_mask).transpose(0, 1)
        logits = model.generator(states[:, -1])
        next_word = logits.max(dim=1)[1].item()

        new_token = torch.full((1, 1), next_word, dtype=src.dtype, device=src.device)
        ys = torch.cat([ys, new_token], dim=0)
        if next_word == EOS_IDX:
            break
    return ys

# entry point that translates one source-language sentence into the target language
def translate(model: torch.nn.Module, src_sentence: str):
    """Put `model` in eval mode, greedy-decode `src_sentence` and return the decoded text.

    The output is allowed to be up to five tokens longer than the encoded input.
    """
    model.eval()
    with torch.no_grad():
        src = text_transform[SRC_LANGUAGE]([src_sentence]).view(-1, 1)
        num_tokens = src.shape[0]
        src_mask = torch.zeros(num_tokens, num_tokens, dtype=torch.bool)
        tgt_tokens = greedy_decode(
            model,
            src,
            src_mask,
            max_len=num_tokens + 5,
            start_symbol=BOS_IDX,
        ).flatten()
        list_id = tgt_tokens.cpu().tolist()

    decoded = decoding.DecodeIds(list_id)
    return decoded.replace("<bos>", "").replace("<eos>", "")

In [20]:
# Check a few examples: each line prints "<model translation> / <reference translation>"
load_model(transformer, result_dir, mode='best')

print(translate(transformer, "게난예"), "/ 그니까요")
print(translate(transformer, "말행 뭐하니"), "/ 말해서 뭐하니")
print(translate(transformer, "어떵 제주도 여자들은 친정에 일이 많으녠 겅했덴."), "/ 어떻게 제주도 여자들은 친정에 일이 많냐고 그랬대")
print(translate(transformer, "아방 나한티 같이 못자켄 너무 콧소리 심해부난."), "/ 아빠 나한테 같이 못자겠다고 해 너무 콧소리 심해서.")
# NOTE(review): the reference below does not look like a translation of this input
# (possibly copied from another sample) — confirm against the dataset.
print(translate(transformer, "선생님 눈 조심해야 돼켜"), "/ 진짜 하얗게 돼.")

그러니까요 / 그니까요
말해서 뭐하니 / 말해서 뭐하니
어떻게 제주도 여자들은 친정에 일이 많냐고 그렇게했다고. / 어떻게 제주도 여자들은 친정에 일이 많냐고 그랬대
아빠 나한테 같이 못자겠다고겠다고 너무  ⁇ 소리 심해버리니까. / 아빠 나한테 같이 못자겠다고 해 너무 콧소리 심해서.
선생님 눈 조심해야 돼 / 진짜 하얗게 돼.


# Inference

Inference를 위해, 주어진 언어(방언)에 해당하는 모델을 불러오고 BLEU score를 측정하는 함수들을 정의한다.


In [21]:
def translate(model: torch.nn.Module, src_sentence: str):
    """Put `model` in eval mode, greedy-decode `src_sentence` and return the decoded text.

    Decoding runs under `torch.no_grad()` (as in the earlier definition of this
    function) so inference does not build an autograd graph.
    """
    model.eval()
    with torch.no_grad():
        src = text_transform[SRC_LANGUAGE]([src_sentence]).view(-1, 1)
        num_tokens = src.shape[0]
        src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
        tgt_tokens = greedy_decode(
            model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
        # tolist() yields plain Python ints, which is what DecodeIds expects
        list_id = tgt_tokens.cpu().tolist()

    return decoding.DecodeIds(list_id).replace("<bos>", "").replace("<eos>", "")
# Text pipelines already built by get_tokenized, keyed by language prefix
_TOKENIZED_CACHE = {}

def get_tokenized(lan):
  """Return the text-to-tensor pipeline for the language prefix `lan` (e.g. 'jeju').

  The SentencePiece model `{lan}_vocab.model` is selected by the `lan` argument
  (not by the global `lan_num`), and each pipeline is built once and cached so
  that translating many sentences does not reload the model file every time.
  """
  if lan not in _TOKENIZED_CACHE:
    vocab_tokenizer = sentencepiece_numericalizer(load_sp_model(f"{lan}_vocab.model"))
    _TOKENIZED_CACHE[lan] = sequential_transforms(vocab_tokenizer, make_flat_list, tensor_transform)
  return _TOKENIZED_CACHE[lan]


def translate_lan(model: torch.nn.Module, src_sentence: str, lan):
    """Greedy-translate `src_sentence`, encoding it with the pipeline from `get_tokenized(lan)`.

    Puts `model` in eval mode and decodes under `torch.no_grad()` so that no
    autograd graph is built during inference. Returns the decoded target string.
    """
    model.eval()
    with torch.no_grad():
        src = get_tokenized(lan)([src_sentence]).view(-1, 1)
        num_tokens = src.shape[0]
        src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
        tgt_tokens = greedy_decode(
            model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    # tolist() on the flattened tensor already yields plain Python ints
    list_id = [int(token_id) for token_id in tgt_tokens.tolist()]

    return decoding.DecodeIds(list_id).replace("<bos>", "").replace("<eos>", "")

def get_corpus_bleu(dir, model, lan):
  """Print and return the corpus BLEU of the `lan` model on the CSV at `dir`.

  Args:
    dir: path of a header-less CSV read with columns Original, Expected, is_same.
    model: directory containing the checkpoints (passed to load_language_model).
    lan: language prefix used for the checkpoint name and for tokenization.

  The global `transformer` is (re)loaded with the `best` checkpoint of `lan`
  and used for translation.
  NOTE(review): the checkpoint looked up is `{lan}_transformer_model_best.ckpt`
  while save_model names files after SRC_LANGUAGE (e.g. 'Jeju'); confirm the
  checkpoint files on disk use the same spelling as `lan`.
  """
  bleu_df = pd.read_csv(dir, encoding='utf8', names=['Original','Expected','is_same']).astype(str)
  load_language_model(transformer, model, lan, mode='best')
  print(lan)

  inputs = list(bleu_df.loc[:, 'Original'])
  # bleu_score expects a list of reference lists per sentence; here one reference each
  expected = [[reference.split()] for reference in bleu_df.loc[:, 'Expected']]

  # Split hypotheses exactly like the references so repeated spaces do not
  # produce empty-string tokens.
  outputs = [translate_lan(transformer, line, lan).split() for line in tqdm(inputs)]

  score = bleu_score(outputs, expected)
  print(score)
  return score

# Measure the BLEU score on the 2000 randomly sampled sentences of every dialect
# NOTE: `dir` shadows the built-in dir() for the rest of the notebook session.
dir = "/content/drive/MyDrive/translation-model/data"
for i in PREFIXS:
  print(i)
  get_corpus_bleu(f"{dir}/{i}/bleu_score.csv",result_dir,i)

jeju
jeju


  0%|          | 0/2000 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

References
----------

1. Attention is all you need paper.
   https://papers.nips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
2. The annotated transformer. https://nlp.seas.harvard.edu/2018/04/03/attention.html#positional-encoding

