In [31]:
# Show which GPU is attached to this runtime (model, driver/CUDA version, memory usage).
!nvidia-smi

Wed Jul 14 13:07:24 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P0    63W / 149W |    763MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [32]:
from google.colab import drive
# Mount Google Drive at /gdrive so the dataset stored there can be copied into the runtime.
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [33]:
# Create a local ./data directory and copy the dataset files from the mounted Drive into it.
!mkdir -p data
!cp -r /gdrive/MyDrive/tutorial_nlp/chap2/data/* ./data

In [80]:
from collections import defaultdict
import random
import time

from nltk import bleu_score
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim


# Seed every random number generator the notebook relies on so that a fresh
# "Restart & Run All" reproduces the same split, shuffling and training run.
random_state = 42
random.seed(random_state)     # Python RNG: used for the per-batch teacher-forcing decision
np.random.seed(random_state)  # NumPy RNG: used when the DataLoader shuffles its samples
torch.manual_seed(1)          # PyTorch RNG: parameter initialisation (seed value kept as it was)
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Dataset
- [x] Vocab class
  - [x] for source language
    - [x] 入力言語の語彙数: 2698
  - [x] for destination language
    - [x] 出力言語の語彙数: 3051
- [x] split dataset for training and validation

In [62]:
# Peek at the first five lines of the English (.en) and Japanese (.ja) training files.
!head -5 ./data/train.en
!head -5 ./data/train.ja

i can 't tell who will arrive first .
many animals have been destroyed by men .
i 'm in the tennis club .
emi looks happy .
please bear this fact in mind .
誰 が 一番 に 着 く か 私 に は 分か り ま せ ん 。
多く の 動物 が 人間 に よ っ て 滅ぼ さ れ た 。
私 は テニス 部員 で す 。
エミ は 幸せ そう に 見え ま す 。
この 事実 を 心 に 留め て お い て 下さ い 。


In [63]:
# Special vocabulary entries: padding, unknown word, beginning and end of sentence.
PAD_TOKEN, UNK_TOKEN, BOS_TOKEN, EOS_TOKEN = '<PAD>', '<UNK>', '<S>', '</S>'
# Their integer ids, in the same order (PAD=0, UNK=1, BOS=2, EOS=3).
PAD, UNK, BOS, EOS = range(4)

# Seed table (token -> id) holding only the special tokens.
word2id = dict(zip(
    (PAD_TOKEN, UNK_TOKEN, BOS_TOKEN, EOS_TOKEN),
    (PAD, UNK, BOS, EOS),
))

In [64]:
def load_data(filepath):
    """Read a whitespace-tokenised text file into a list of token lists.

    :param filepath: str, path of a UTF-8 text file with one sentence per line
    :return: list, one entry per line; each entry is the list of
        whitespace-separated tokens on that line (empty list for a blank line)
    """
    with open(filepath, 'r', encoding='utf8') as corpus:
        # str.split() without arguments ignores leading/trailing whitespace,
        # including the trailing newline, so no explicit strip is needed.
        return [line.split() for line in corpus]


def sentence_to_ids(vocab, sentence):
    """Convert a tokenised sentence into a list of ids terminated by EOS.

    :param vocab: object exposing a ``word2id`` dict (word -> id)
    :param sentence: list, the words of one sentence
    :return: list, one id per word (UNK for words missing from
        ``vocab.word2id``) followed by the EOS id
    """
    return [vocab.word2id.get(token, UNK) for token in sentence] + [EOS]


def pad_seq(seq, max_len):
    """Right-pad a sequence of ids with PAD up to ``max_len`` entries.

    :param seq: list, ids of one sentence
    :param max_len: int, length to pad to
    :return: list, ``seq`` followed by as many PAD ids as are needed to reach
        ``max_len``; nothing is appended (and nothing is truncated) when
        ``seq`` is already that long or longer
    """
    missing = max_len - len(seq)
    # A non-positive repeat count yields an empty list, so longer inputs pass through.
    return seq + [PAD] * missing

In [65]:
class Vocab(object):
    """Two-way mapping between words and integer ids.

    ``word2id`` maps word -> id and ``id2word`` maps id -> word; both tables
    are kept in sync by ``build_vocab``.
    """
    def __init__(self, word2id=None):
        """
        :param word2id: dict or None, initial word -> id table (typically the
            special tokens). It is copied, so the caller's dict is never modified.
        """
        self.word2id = dict(word2id) if word2id is not None else {}
        self.id2word = {id: word for word, id in self.word2id.items()}

    def build_vocab(self, sentences, min_count=1):
        """Add every sufficiently frequent word of ``sentences`` to the vocabulary.

        Words are added in order of decreasing frequency and receive
        consecutive ids starting at the current vocabulary size.

        :param sentences: list, sentences given as lists of words
        :param min_count: int, minimum number of occurrences a word needs in
            order to be added
        """
        word_counter = defaultdict(int)
        for sentence in sentences:
            for word in sentence:
                word_counter[word] += 1

        for word, count in sorted(word_counter.items(), key=lambda x: x[1], reverse=True):
            if count < min_count:
                # Counts are sorted in descending order: nothing further qualifies.
                break
            if word in self.word2id:
                # Already known (e.g. a special token that occurs in the corpus).
                # Registering it again would create a stray id2word entry and
                # inflate len(id2word), which is used as the vocabulary size.
                continue
            _id = len(self.word2id)
            self.word2id[word] = _id
            self.id2word[_id] = word

In [66]:
# Sanity check: load the parallel corpus, keep only its first half,
# then hold out 20% of the sentence pairs for validation.
train_X = load_data('./data/train.en')
train_X = train_X[:len(train_X) // 2]
train_Y = load_data('./data/train.ja')
train_Y = train_Y[:len(train_Y) // 2]
train_X, valid_X, train_Y, valid_Y = train_test_split(
    train_X, train_Y, test_size=0.2, random_state=random_state,
)

# Build one vocabulary per language from the training split only;
# words seen fewer than two times are left out.
vocab_X, vocab_Y = Vocab(word2id), Vocab(word2id)
vocab_X.build_vocab(train_X, min_count=2)
vocab_Y.build_vocab(train_Y, min_count=2)

vocab_size_X, vocab_size_Y = len(vocab_X.id2word), len(vocab_Y.id2word)
print('入力言語の語彙数：', vocab_size_X)
print('出力言語の語彙数：', vocab_size_Y)

入力言語の語彙数： 2698
出力言語の語彙数： 3051


In [67]:
# Replace every tokenised sentence by its id sequence (EOS appended),
# using the source vocabulary for X and the target vocabulary for Y.
train_X, valid_X = (
    [sentence_to_ids(vocab_X, sentence) for sentence in split]
    for split in (train_X, valid_X)
)
train_Y, valid_Y = (
    [sentence_to_ids(vocab_Y, sentence) for sentence in split]
    for split in (train_Y, valid_Y)
)

In [68]:
# Sanity check: show the first training sentence after conversion to ids.
# A setup with EOS = 2 would give [18, 86, 9, 52, 342, 32, 22, 4, 2];
# since EOS = 3 here, the expected value is
# [18, 86, 9, 52, 342, 32, 22, 4, 3].
train_X[0]

[18, 86, 9, 52, 342, 32, 22, 4, 3]

### DataLoader

In [69]:
class DataLoader(object):
    """Mini-batch iterator over paired, variable-length id sequences.

    Every batch is sorted by source length in descending order, right-padded
    and returned as ``(length, batch_size)`` tensors on ``device``.
    """
    def __init__(self, X, Y, batch_size, shuffle=False):
        """
        :param X: list, source sequences (lists of ids)
        :param Y: list, target sequences (lists of ids), aligned with ``X``
        :param batch_size: int, number of sequence pairs per batch
        :param shuffle: bool, reshuffle the pairs each time the loader is reset
        """
        self.data = list(zip(X, Y))
        self.batch_size = batch_size
        self.shuffle = shuffle

        self.start_index = 0
        self.reset()

    def __iter__(self):
        return self

    def reset(self):
        """Reshuffle the samples (when enabled) and rewind to the first batch."""
        if self.shuffle:
            np.random.shuffle(self.data)
        self.start_index = 0

    def __next__(self):
        """Return the next batch as ``(batch_X, batch_Y, lengths_X)``.

        :return batch_X: tensor, padded source ids, size=(max_len_X, batch_size)
        :return batch_Y: tensor, padded target ids, size=(max_len_Y, batch_size)
        :return lengths_X: ndarray, unpadded source lengths in descending order
        :raises StopIteration: once every sample has been served; the loader
            is reset first so that it can be iterated again
        """
        if self.start_index >= len(self.data):
            self.reset()
            raise StopIteration

        batch = self.data[self.start_index:self.start_index + self.batch_size]
        batch_X, batch_Y = zip(*batch)

        # Sort the pairs by source length (descending). The sequences stay
        # plain Python lists: converting ragged lists with np.asarray is
        # deprecated (an error on recent NumPy) and, for equal-length batches,
        # would turn the rows into ndarrays that pad_seq cannot extend.
        lengths_X = np.asarray([len(sen) for sen in batch_X])
        indices = np.argsort(lengths_X)[::-1]
        batch_X = [batch_X[i] for i in indices]
        batch_Y = [batch_Y[i] for i in indices]
        lengths_X = lengths_X[indices]

        # Right-pad the shorter sequences of each side.
        max_len_X = int(lengths_X[0])
        max_len_Y = max(len(sen) for sen in batch_Y)
        batch_X = [pad_seq(list(sen), max_len_X) for sen in batch_X]
        batch_Y = [pad_seq(list(sen), max_len_Y) for sen in batch_Y]

        # Convert to tensors and transpose to (length, batch_size).
        batch_X = torch.tensor(batch_X, dtype=torch.long, device=device).transpose(1, 0)
        batch_Y = torch.tensor(batch_Y, dtype=torch.long, device=device).transpose(1, 0)

        # Advance the pointer to the next batch.
        self.start_index += self.batch_size

        return batch_X, batch_Y, lengths_X

### Model


In [70]:
class Encoder(nn.Module):
    """Single-layer GRU encoder over embedded source tokens."""

    def __init__(self, input_size, hidden_size):
        """
        :param input_size: int, vocabulary size of the source language
        :param hidden_size: int, number of hidden units (also the embedding size)
        """
        super().__init__()
        self.hidden_size = hidden_size

        # Index PAD is declared as padding_idx of the embedding table.
        self.embedding = nn.Embedding(input_size, hidden_size, padding_idx=PAD)
        self.rnn = nn.GRU(hidden_size, hidden_size)

    def forward(self, seqs, input_lengths, hidden=None):
        """
        :param seqs: tensor, input batch, size=(max_length, batch_size)
        :param input_lengths: sentence length of every sample in the batch
        :param hidden: tensor, initial hidden state; zeros are used when None
        :return output: tensor, encoder outputs, size=(max_length, batch_size, hidden_size)
        :return hidden: tensor, final hidden state, size=(1, batch_size, hidden_size)
        """
        embedded = self.embedding(seqs)  # (length, batch_size, embedding_size)
        # Pack the batch so the GRU processes only the real (unpadded) time steps.
        packed = pack_padded_sequence(embedded, input_lengths)
        packed_output, hidden = self.rnn(packed, hidden)
        # Unpack back into a padded tensor; the returned lengths are not needed.
        output = pad_packed_sequence(packed_output)[0]
        return output, hidden


class Decoder(nn.Module):
    """Single-layer GRU decoder that emits vocabulary logits one step at a time."""

    def __init__(self, hidden_size, output_size, dropout=0.):
        """
        :param hidden_size: int, number of hidden units (also the embedding size)
        :param output_size: int, vocabulary size of the target language
        :param dropout: float, dropout rate (stored only; no layer applies it)
        """
        super().__init__()

        self.hidden_size, self.output_size = hidden_size, output_size
        self.dropout = dropout  # NOTE: not used

        self.embedding = nn.Embedding(output_size, hidden_size, padding_idx=PAD)
        self.rnn = nn.GRU(hidden_size, hidden_size)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, seqs, hidden=None):
        """
        :param seqs: tensor, input batch, size=(1, batch_size)
        :param hidden: tensor, initial hidden state; zeros are used when None
        :return output: tensor, decoder logits, size=(1, batch_size, output_size)
        :return hidden: tensor, updated hidden state, size=(1, batch_size, hidden_size)
        """
        embedded = self.embedding(seqs)  # (1, batch_size, embedding_size)
        rnn_output, hidden = self.rnn(embedded, hidden)
        # Project the GRU output onto the target vocabulary.
        return self.linear(rnn_output), hidden


class EncoderDecoder(nn.Module):
    """Sequence-to-sequence model: the encoder's final hidden state seeds the decoder."""

    def __init__(self, input_size, output_size, hidden_size):
        """
        :param input_size: int, vocabulary size of the source language
        :param output_size: int, vocabulary size of the target language
        :param hidden_size: int, number of hidden units
        """
        super(EncoderDecoder, self).__init__()

        self.encoder = Encoder(input_size, hidden_size)
        self.decoder = Decoder(hidden_size, output_size)

    def forward(self, batch_X, lengths_X, max_length, batch_Y=None, use_teacher_forcing=False):
        """
        :param batch_X: tensor, source batch, size=(max_length, batch_size)
        :param lengths_X: list, sentence length of every source sample in the batch
        :param max_length: int, number of decoding steps to run
        :param batch_Y: tensor or None, target sequences used for teacher forcing
        :param use_teacher_forcing: bool, feed the target token (instead of the
            model's own prediction) as the next decoder input. Ignored when
            ``batch_Y`` is None, and for steps beyond the length of ``batch_Y``.
        :return decoder_outputs: tensor, decoder logits,
            size=(max_length, batch_size, self.decoder.output_size)
        """
        # encoder: only the final hidden state is handed to the decoder
        _, encoder_hidden = self.encoder(batch_X, lengths_X)

        # decoder: start every sample from BOS, on the same device as the input
        decoder_hidden = encoder_hidden
        bs = batch_X.size(1)
        run_device = batch_X.device
        decoder_input = torch.full((1, bs), BOS, dtype=torch.long, device=run_device)  # (length=1, batch_size)

        # Teacher forcing is decided once for the whole batch by the caller; it
        # is only possible when target sequences were actually supplied.
        can_force = use_teacher_forcing and batch_Y is not None
        decoder_outputs = torch.zeros((max_length, bs, self.decoder.output_size), device=run_device)
        for ts in range(max_length):
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
            decoder_outputs[ts] = decoder_output.squeeze(0)
            # TODO: if decoder_output == EOS, then break and stop the for loop?
            if can_force and ts < batch_Y.size(0):
                decoder_input = batch_Y[ts].unsqueeze(0)  # (length=1, batch_size)
            else:
                # Greedy decoding: feed back the most likely token.
                # decoder_output: (length, batch_size, output_size)
                decoder_input = decoder_output.argmax(dim=-1)
        return decoder_outputs

### Training/Validation
- [ ] Expected result at epoch 10: Train/Loss ..., Train/BLEU ..., Valid/Loss about 42, Valid/BLEU about 13

In [77]:
class AverageMeter(object):
    """Track the latest value and the running (weighted) average of a metric.

    Adapted from https://github.com/pytorch/examples/blob/master/imagenet/main.py
    """
    def __init__(self, name, fmt=':f'):
        """
        :param name: str, label printed in front of the values
        :param fmt: str, format spec (including the leading colon) applied to
            both the latest value and the average
        """
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        """Forget everything recorded so far."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value, counted ``n`` times in the average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

    def __str__(self):
        # Builds e.g. '{name} {val:.4e} ({avg:.4e})' and fills it from the attributes.
        template = ''.join(['{name} {val', self.fmt, '} ({avg', self.fmt, '})'])
        return template.format(**vars(self))


def calc_bleu(refs, hyps):
    """
    Compute the corpus-level BLEU score.

    Every sequence is cut at its first EOS id (the EOS itself and anything
    after it, such as padding, is dropped). A sequence without EOS is used
    as is, for references and hypotheses alike.

    :param refs: list, reference translations as lists of token ids
        (e.g. [[18, 86, 9, 3, 0], ...]), one reference per sample
    :param hyps: list, model translations as lists of token ids
    :return: float, BLEU score (0~100)
    """
    def _strip_eos(seq):
        # Keep only the tokens that precede the first EOS, if there is one.
        return seq[:seq.index(EOS)] if EOS in seq else seq

    # corpus_bleu expects a list of references per hypothesis, hence the extra nesting.
    refs = [[_strip_eos(ref)] for ref in refs]
    hyps = [_strip_eos(hyp) for hyp in hyps]
    return 100 * bleu_score.corpus_bleu(refs, hyps)


def train(dataloader, model, optimizer, criterion, teacher_forcing_rate=0.):
    """Run one optimisation pass over ``dataloader``.

    :param dataloader: iterable yielding (batch_X, batch_Y, lengths_X) and
        exposing the underlying samples as ``.data``
    :param model: the sequence-to-sequence model to optimise
    :param optimizer: optimizer updating the parameters of ``model``
    :param criterion: callable (logits, target) -> loss tensor
    :param teacher_forcing_rate: float, probability that a batch is decoded
        with teacher forcing
    :return: dict with 'loss' (running average of the per-sample loss),
        'output' (predicted id sequences) and 'target' (reference id sequences)
    """
    model.train()
    loss_meter = AverageMeter('Loss', ':.4e')
    predictions, references = [], []
    progress = tqdm(total=len(dataloader.data))
    for src, tgt, src_lengths in dataloader:
        tgt_length, n_samples = tgt.size()
        # A single draw decides teacher forcing for the whole batch.
        force = random.random() < teacher_forcing_rate
        logits = model(src, src_lengths, tgt_length, tgt, use_teacher_forcing=force)
        loss = criterion(logits.contiguous(), tgt.contiguous())
        loss_meter.update(loss.item() / n_samples, n_samples)

        # Keep the greedy predictions and the targets, one id list per sample.
        _, token_ids = logits.max(-1)
        predictions += token_ids.transpose(0, 1).contiguous().data.cpu().numpy().tolist()
        references += tgt.transpose(0, 1).contiguous().data.cpu().numpy().tolist()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        progress.update(n_samples)
    progress.close()

    return {'loss': loss_meter.avg, 'output': predictions, 'target': references}


def validate(dataloader, model, criterion):
    """Evaluate ``model`` on ``dataloader`` without updating its parameters.

    Decoding never uses teacher forcing here.

    :param dataloader: iterable yielding (batch_X, batch_Y, lengths_X) and
        exposing the underlying samples as ``.data``
    :param model: the sequence-to-sequence model to evaluate
    :param criterion: callable (logits, target) -> loss tensor
    :return: dict with 'loss' (running average of the per-sample loss),
        'output' (predicted id sequences) and 'target' (reference id sequences)
    """
    model.eval()
    loss_meter = AverageMeter('Loss', ':.4e')
    predictions, references = [], []
    progress = tqdm(total=len(dataloader.data))
    with torch.no_grad():
        for src, tgt, src_lengths in dataloader:
            tgt_length, n_samples = tgt.size()
            logits = model(src, src_lengths, tgt_length, tgt, use_teacher_forcing=False)
            loss = criterion(logits.contiguous(), tgt.contiguous())
            loss_meter.update(loss.item() / n_samples, n_samples)

            # Keep the greedy predictions and the targets, one id list per sample.
            _, token_ids = logits.max(-1)
            predictions += token_ids.transpose(0, 1).contiguous().data.cpu().numpy().tolist()
            references += tgt.transpose(0, 1).contiguous().data.cpu().numpy().tolist()

            progress.update(n_samples)
        progress.close()

    return {'loss': loss_meter.avg, 'output': predictions, 'target': references}

In [81]:
# Hyper-parameters of the training run
num_epochs, batch_size = 10, 64
lr = 1e-3
teacher_forcing_rate = 0.2  # probability that a training batch is decoded with teacher forcing
ckpt_path = 'model.pth'     # file the best model weights are written to

model_args = dict(
    input_size=vocab_size_X,
    output_size=vocab_size_Y,
    hidden_size=256,
)

# Only the training loader shuffles its samples.
train_dataloader = DataLoader(train_X, train_Y, batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_X, valid_Y, batch_size, shuffle=False)
model = EncoderDecoder(**model_args).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)

# Cross entropy summed over all target positions; positions whose target is PAD are ignored.
mce = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD)
def masked_cross_entropy(logits, target):
    """Summed cross entropy over all non-padding target tokens.

    :param logits: tensor, unnormalised scores, size=(..., vocab_size)
    :param target: tensor, target ids, size equal to ``logits`` without its last dimension
    :return: tensor, scalar loss
    """
    # reshape (unlike view) also accepts non-contiguous tensors, so callers do
    # not have to call .contiguous() first.
    return mce(logits.reshape(-1, logits.size(-1)), target.reshape(-1))


# Best validation BLEU seen so far; the checkpoint is rewritten whenever it improves.
best = {
    'valid/BLEU': 0.
}
for epoch_i in range(1, num_epochs + 1):
    start_at = time.time()
    training = train(train_dataloader, model, optimizer, masked_cross_entropy, teacher_forcing_rate)
    validation = validate(valid_dataloader, model, masked_cross_entropy)
    train_bleu = calc_bleu(training['target'], training['output'])
    valid_bleu = calc_bleu(validation['target'], validation['output'])

    # The key must match the one initialised above (the former 'vaild/BLEU'
    # spelling raised KeyError on the first epoch).
    if valid_bleu > best['valid/BLEU']:
        best['valid/BLEU'] = valid_bleu
        torch.save(model.state_dict(), ckpt_path)

    print('Epoch {} Time {} sec [Train] Loss {:5.4f} BLEU {:2.2f} [Valid] Loss {:5.4f} BLEU {:2.2f}'.format(
        epoch_i, time.time() - start_at,
        training['loss'], train_bleu, validation['loss'], valid_bleu,
    ))

HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))

  return array(a, dtype, copy=False, order=order)





HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))


Epoch 1 Time 16.28382635116577 sec [Train] Loss 54.4734 BLEU 1.98 [Valid] Loss 51.0776 BLEU 4.02


HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))


Epoch 2 Time 16.078113794326782 sec [Train] Loss 48.0542 BLEU 5.05 [Valid] Loss 48.2615 BLEU 4.69


HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))


Epoch 3 Time 16.404333353042603 sec [Train] Loss 44.4778 BLEU 7.12 [Valid] Loss 45.8560 BLEU 6.08


HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))


Epoch 4 Time 16.4253249168396 sec [Train] Loss 41.4506 BLEU 9.20 [Valid] Loss 44.7580 BLEU 9.52


HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))


Epoch 5 Time 20.435551166534424 sec [Train] Loss 38.7643 BLEU 11.58 [Valid] Loss 43.2720 BLEU 9.41


HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))


Epoch 6 Time 19.382447719573975 sec [Train] Loss 36.4187 BLEU 13.85 [Valid] Loss 43.0271 BLEU 10.48


HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))


Epoch 7 Time 17.57172703742981 sec [Train] Loss 34.7431 BLEU 15.24 [Valid] Loss 42.3056 BLEU 10.37


HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))


Epoch 8 Time 18.692575931549072 sec [Train] Loss 32.5237 BLEU 18.06 [Valid] Loss 42.4891 BLEU 11.07


HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))


Epoch 9 Time 16.384654760360718 sec [Train] Loss 31.1557 BLEU 19.87 [Valid] Loss 42.7862 BLEU 12.84


HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))


Epoch 10 Time 16.497340202331543 sec [Train] Loss 29.6930 BLEU 21.83 [Valid] Loss 43.0904 BLEU 14.02
