<a href="https://colab.research.google.com/github/DojunPark/Machine_Translation/blob/master/10_TPU_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install PyTorch/XLA

In [1]:
# Download the PyTorch/XLA environment setup script from the pytorch/xla repository.
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# Run the downloaded script, requesting the 20200325 build.
!python pytorch-xla-env-setup.py --version 20200325

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  5116  100  5116    0     0  31386      0 --:--:-- --:--:-- --:--:-- 31386
Updating... This may take around 2 minutes.
Updating TPU runtime to pytorch-dev20200325 ...
Uninstalling torch-1.5.0a0+d6149a7:
  Successfully uninstalled torch-1.5.0a0+d6149a7
Uninstalling torchvision-0.6.0a0+3c254fb:
  Successfully uninstalled torchvision-0.6.0a0+3c254fb
Copying gs://tpu-pytorch/wheels/torch-nightly+20200325-cp36-cp36m-linux_x86_64.whl...
- [1 files][ 83.4 MiB/ 83.4 MiB]                                                
Operation completed over 1 objects/83.4 MiB.                                     
Copying gs://tpu-pytorch/wheels/torch_xla-nightly+20200325-cp36-cp36m-linux_x86_64.whl...
\ [1 files][114.5 MiB/114.5 MiB]                                      

# Preprocessing the training data

In [3]:
# Install the KoNLPy package.
!pip install konlpy
# Make sure curl and git are installed.
!sudo apt-get install curl git
# Fetch KoNLPy's mecab.sh install script with curl and run it with bash.
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

Reading package lists... Done
Building dependency tree       
Reading state information... Done
curl is already the newest version (7.58.0-2ubuntu3.10).
git is already the newest version (1:2.17.1-1ubuntu0.7).
0 upgraded, 0 newly installed, 0 to remove and 51 not upgraded.
mecab-ko is already installed
mecab-ko-dic is already installed
mecab-python is already installed
Done.


In [4]:
# Download spaCy's English model via the 'en' shortcut (the recorded output shows it linking to en_core_web_sm).
!python -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [1]:
# Remove the currently installed torchtext; -y skips the confirmation prompt.
!pip uninstall torchtext -y 

Uninstalling torchtext-0.7.0:
  Successfully uninstalled torchtext-0.7.0


In [2]:
# Pin torchtext to 0.7.0, the version this notebook was run with. The cells below
# import Field, TabularDataset and BucketIterator from torchtext.data, which later
# torchtext releases no longer provide there, so an unpinned install breaks them.
!pip install torchtext==0.7.0

Collecting torchtext
  Using cached https://files.pythonhosted.org/packages/b9/f9/224b3893ab11d83d47fde357a7dcc75f00ba219f34f3d15e06fe4cb62e05/torchtext-0.7.0-cp36-cp36m-manylinux1_x86_64.whl
Installing collected packages: torchtext
Successfully installed torchtext-0.7.0


In [1]:
from konlpy.tag import Mecab
import spacy

# Tokenizers used throughout the notebook: MeCab for Korean, spaCy for English.
mecab = Mecab()
spacy_en = spacy.load('en')


def tokenize_en(text):
    """Tokenize English ``text`` with the spaCy tokenizer and return the token strings."""
    pieces = []
    for token in spacy_en.tokenizer(text):
        pieces.append(token.text)
    return pieces


# Quick check of both tokenizers on one sample sentence each.
print('tokenization test with sample texts')
print(
    'tokenizing for Korean with mecab: ',
    mecab.morphs('안녕하세요 저는 건국대학교에 재학 중인 박도준입니다.'),
)
print(
    'tokenizing for English: ',
    tokenize_en('Hello, I am Dojun Park, a student at Konkuk University.'),
)

tokenization test with sample texts
tokenizing for Korean with mecab:  ['안녕', '하', '세요', '저', '는', '건국대', '학교', '에', '재학', '중', '인', '박도준', '입니다', '.']
tokenizing for English:  ['Hello', ',', 'I', 'am', 'Dojun', 'Park', ',', 'a', 'student', 'at', 'Konkuk', 'University', '.']


# Prepare the dataset using torchtext

In [3]:
from torchtext.data import Field, TabularDataset, BucketIterator

# Both fields lowercase their tokens and use <sos>/<eos> as the init and end
# tokens; they differ only in the tokenizer.
korean = Field(
    tokenize=mecab.morphs,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
)
english = Field(
    tokenize=tokenize_en,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
)

# CSV column name -> (attribute name on each example, Field that processes it).
fields = {
    'kor': ('src', korean),
    'eng': ('trg', english),
}

# Load the train/validation/test CSV files from the same directory.
train_data, valid_data, test_data = TabularDataset.splits(
    path='/content/drive/My Drive/Colab Notebooks',
    train='train.csv',
    validation='valid.csv',
    test='test.csv',
    format='csv',
    fields=fields,
)



In [4]:
# Build the source and target vocabularies from the training split only,
# passing min_freq=2 as the minimum token frequency.
korean.build_vocab(train_data, min_freq=2)
english.build_vocab(train_data, min_freq=2)

In [12]:
# Show the attribute dictionary of the first training example.
print(vars(train_data[0]))

{'src': ['해양수산부', '가', '양식', '넙치', '에서', '검출', '된', '수은', '이', '어디', '에서', '왔', '는지', '원인', '을', '규명', '중', '이', '다', '.'], 'trg': ['the', 'ministry', 'of', 'maritime', 'affairs', 'and', 'fisheries', 'is', 'trying', 'to', 'determine', 'the', 'origin', 'of', 'the', 'mercury', 'found', 'in', 'farmed', 'flounder', '.']}


In [13]:
# Number of examples in the train, validation and test splits, one per line.
print(len(train_data), len(valid_data), len(test_data), sep='\n')

120006
40002
40003


# BLEU function for testing

In [5]:
import torch
import spacy
from torchtext.data.metrics import bleu_score
import sys


def translate_sentence(model, sentence, korean, english, device, max_length=50):
    """Greedily translate one Korean sentence with ``model``.

    Args:
        model: Callable invoked as ``model(src, trg)`` with index tensors of
            shape (src_len, 1) and (trg_len, 1); its output is indexed as
            (trg_len, 1, vocab).
        sentence: Either a raw string, which is tokenized with the
            notebook-level ``mecab`` tokenizer, or a sequence of tokens.
        korean: Source field; ``init_token``, ``eos_token`` and ``vocab.stoi``
            are read from it.
        english: Target field; ``vocab.stoi`` and ``vocab.itos`` are read from it.
        device: Device the index tensors are moved to.
        max_length: Maximum number of decoding steps.

    Returns:
        The predicted target tokens without the leading ``<sos>``. The last
        token is ``<eos>`` only if the model produced it within ``max_length``
        steps.
    """
    if isinstance(sentence, str):
        tokens = mecab.morphs(sentence)
    else:
        # Work on a copy: the caller's list (for example a dataset example's
        # token list) must not grow an extra <sos>/<eos> on every call.
        tokens = list(sentence)

    # Add <SOS> and <EOS> in beginning and end respectively
    tokens = [korean.init_token, *tokens, korean.eos_token]

    # Go through each korean token and convert to an index
    text_to_indices = [korean.vocab.stoi[token] for token in tokens]

    # Convert to a (src_len, 1) tensor, i.e. a batch containing one sentence
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    eos_idx = english.vocab.stoi["<eos>"]
    outputs = [english.vocab.stoi["<sos>"]]
    for _ in range(max_length):
        trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)

        with torch.no_grad():
            output = model(sentence_tensor, trg_tensor)

        # Greedy decoding: keep the highest-scoring token at the last position
        best_guess = output.argmax(2)[-1, :].item()
        outputs.append(best_guess)

        if best_guess == eos_idx:
            break

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]
    # remove start token
    return translated_sentence[1:]


def bleu(data, model, korean, english, device):
    """Compute the corpus BLEU score of ``model`` over ``data``.

    Args:
        data: Iterable of examples exposing ``src`` and ``trg`` token lists.
        model: Model forwarded to ``translate_sentence``.
        korean: Source field forwarded to ``translate_sentence``.
        english: Target field forwarded to ``translate_sentence``.
        device: Device forwarded to ``translate_sentence``.

    Returns:
        The value returned by ``bleu_score`` for all predictions, with each
        example's ``trg`` used as its single reference.
    """
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]

        # Hand over a copy so the example's own token list can never be
        # modified by the callee; repeated evaluations then see the same data.
        prediction = translate_sentence(model, list(src), korean, english, device)

        # Drop the trailing <eos> only when decoding actually produced one. A
        # translation cut off at the length limit ends in a real token, which
        # must stay in the hypothesis.
        if prediction and prediction[-1] == "<eos>":
            prediction = prediction[:-1]

        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)

# Modeling

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter


class Transformer(nn.Module):
    """Sequence-to-sequence model built around ``nn.Transformer``.

    Word embeddings and learned position embeddings are summed for source and
    target, passed through ``nn.Transformer``, and projected to target
    vocabulary logits.
    """

    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_length,
        device):
        """Create the embeddings, the transformer and the output projection.

        Args:
            embedding_size: Embedding width, also used as the transformer's model width.
            src_vocab_size: Number of rows in the source word embedding.
            trg_vocab_size: Number of rows in the target word embedding and
                size of the output projection.
            src_pad_idx: Source index treated as padding when building the
                key padding mask.
            num_heads: Number of attention heads.
            num_encoder_layers: Number of encoder layers.
            num_decoder_layers: Number of decoder layers.
            forward_expansion: Passed to ``nn.Transformer`` as ``dim_feedforward``.
            dropout: Dropout probability for the embeddings and the transformer.
            max_length: Number of rows in both position embeddings, i.e. the
                longest sequence the model can index.
            device: Stored on the instance as ``self.device``.
        """
        super().__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        # Size the position tables from the constructor argument rather than
        # from a notebook-level global.
        self.src_position_embedding = nn.Embedding(max_length, embedding_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_length, embedding_size)
        self.device = device
        # NOTE(review): forward_expansion is used as dim_feedforward itself, not
        # as a multiplier of embedding_size - confirm this is intended.
        self.transformer = nn.Transformer(
            d_model=embedding_size,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=forward_expansion,
            dropout=dropout)
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        """Return a (N, src_len) boolean mask that is True at padding positions."""
        # src shape: (src_len, N)
        src_mask = src.transpose(0, 1) == self.src_pad_idx
        # (N, src_len)
        return src_mask

    def forward(self, src, trg):
        """Return logits of shape (trg_len, N, trg_vocab_size).

        Args:
            src: Source indices of shape (src_len, N).
            trg: Target indices of shape (trg_len, N).
        """
        src_seq_length, N = src.shape
        trg_seq_length, N = trg.shape

        # Position indices 0..len-1 repeated for every batch column, created on
        # the same device as the inputs they are added to.
        src_positions = (
            torch.arange(0, src_seq_length, device=src.device)
            .unsqueeze(1)
            .expand(src_seq_length, N)
        )

        trg_positions = (
            torch.arange(0, trg_seq_length, device=trg.device)
            .unsqueeze(1)
            .expand(trg_seq_length, N)
        )

        embed_src = self.dropout(
            self.src_word_embedding(src) + self.src_position_embedding(src_positions)
        )

        embed_trg = self.dropout(
            self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions)
        )

        src_padding_mask = self.make_src_mask(src)
        # Mask passed as tgt_mask, moved to the device of the target input.
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
            trg.device)

        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask
        )
        out = self.fc_out(out)

        return out

In [None]:
import torch_xla
import torch_xla.core.xla_model as xm

# Setup the training phase

# Training hyperparameters
num_epochs = 5
learning_rate = 1e-4
batch_size = 32
device = xm.xla_device()

# Model hyperparameters
src_vocab_size = len(korean.vocab)
trg_vocab_size = len(english.vocab)
embedding_size = 512
num_heads = 8
num_encoder_layers = 6  # in the paper 6
num_decoder_layers = 6
dropout = 0.10
max_len = 100
forward_expansion = 4
# The source side is Korean, so the index used to mask padded source positions
# has to come from the Korean vocabulary.
src_pad_idx = korean.vocab.stoi['<pad>']

# Tensorboard for nice plots
writer = SummaryWriter('runs/loss_plot')
step = 0

# Batches are sorted by source length within each batch.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = batch_size,
    sort_within_batch = True,
    sort_key = lambda x: len(x.src),
    device = device
)

model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Padded target positions are excluded from the loss.
pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)
print('완료')

# Model training

In [None]:
import time

start = time.time()
# Fixed sample sentence translated after every epoch as a qualitative check.
sentence = '이 지역의 많은 공장들이 다른 곳에 외주를 주고 있다.'  # df['kor'][117]

for epoch in range(num_epochs):

    model.train()

    for batch_idx, batch in enumerate(train_iterator):
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        # forward prop: the decoder sees the target without its last token and
        # is scored against the target without its first token
        output = model(inp_data, target[:-1])
        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)
        loss.backward()

        # Clip before the update so the clipped gradients are the ones applied.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Exactly one parameter update per batch. xm.optimizer_step() performs
        # the optimizer step itself (extra call needed when running on TPU), so
        # no separate optimizer.step() follows.
        xm.optimizer_step(optimizer, barrier=True)

        writer.add_scalar('Training loss', loss, global_step=step)
        step += 1


    model.eval()

    translated_sentence = translate_sentence(model, sentence, korean, english, device, max_length = 100)
    translated_sentence = ' '.join(translated_sentence)
    # Tidy the joined tokens: no space before punctuation, no <eos> marker.
    translated_sentence = translated_sentence.replace(' ,', ',')
    translated_sentence = translated_sentence.replace(' .', '.')
    translated_sentence = translated_sentence.replace(' <eos>', '')

    print(f'[Epoch] {epoch+1} / {num_epochs}')
    print(f'[Loss] {loss:.4f}')
    print(f'[Exsample] {sentence} >>> {translated_sentence}')
    print('[Training time] {:.2f} min.'.format((time.time() - start) / 60))
    # NOTE(review): this translates the entire test split after every epoch.
    print(f'[BLEU score] {bleu(test_data, model, korean, english, device):.4f}')

# Conclusion
- TPU 실행을 위한 torch_xla 라이브러리를 설치하여 device를 TPU로 지정하는 데 성공함
- 하지만 Modeling 단계 이후부터 코드가 실행되지 않고 런타임이 재시작되는 문제가 반복적으로 발생함
- 무료 colab에서 제공하는 RAM이 제한적인 것이 문제일 수 있다고 판단하여 colab pro에서 동일한 코드를 다시 테스트해보아야 함


---

- **TPU 설정 참고 사이트**
> https://beomi.github.io/2020/02/24/Pytorch-with-TPU-on-Colab/




