In [None]:
# Shell command: report the NVIDIA GPU(s) visible to this runtime, if any.
!nvidia-smi

In [None]:
# Mount Google Drive at /gdrive; the google.colab package is only available
# inside the Colab runtime.
from google.colab import drive
drive.mount('/gdrive')

In [None]:
# Copy the tutorial data from the mounted Drive into a local ./data directory
# (created first if it does not exist).
!mkdir -p data
!cp -r /gdrive/MyDrive/tutorial_nlp/chap2/data/* ./data

In [None]:
from collections import defaultdict
import time

import numpy as np
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


# Seed passed to train_test_split when the corpus is split.
random_state = 42

# Seed PyTorch's default random number generator.
torch.manual_seed(1)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

### Dataset
- [x] Vocab class
  - [x] for source language
    - [x] 入力言語の語彙数: 2698
  - [x] for destination language
    - [x] 出力言語の語彙数: 3051
- [x] split dataset for training and validation

In [None]:
# Print the first five lines of each training file (train.en / train.ja).
!head -5 ./data/train.en
!head -5 ./data/train.ja

In [None]:
# Special tokens paired with the ids reserved for them.
PAD_TOKEN, PAD = '<PAD>', 0  # filler id used by pad_seq
UNK_TOKEN, UNK = '<UNK>', 1  # fallback id for words missing from a vocabulary
BOS_TOKEN, BOS = '<S>', 2    # presumably the sentence-start marker; unused in the cells shown
EOS_TOKEN, EOS = '</S>', 3   # appended to every sentence by sentence_to_ids

# Seed table passed to Vocab so each vocabulary starts with the special tokens.
word2id = dict(zip(
    (PAD_TOKEN, UNK_TOKEN, BOS_TOKEN, EOS_TOKEN),
    (PAD, UNK, BOS, EOS),
))

In [None]:
def load_data(filepath):
    """Read a whitespace-tokenised text file into a list of token lists.

    :param filepath: path of a UTF-8 text file with one sentence per line
    :return: list with one entry per line of the file; each entry is the list
        of whitespace-separated tokens on that line (empty for a blank line)
    """
    with open(filepath, 'r', encoding='utf8') as f:
        # split() with no argument discards leading/trailing whitespace,
        # including the trailing newline, so no explicit strip is needed.
        return [line.split() for line in f]


def sentence_to_ids(vocab, sentence):
    """Convert a tokenised sentence to word ids and terminate it with EOS.

    :param vocab: object exposing a ``word2id`` dict (e.g. a Vocab)
    :param sentence: iterable of words
    :return: new list of ids; a word missing from ``vocab.word2id`` is mapped
        to UNK, and EOS is always appended as the last element
    """
    ids = []
    for token in sentence:
        ids.append(vocab.word2id.get(token, UNK))
    ids.append(EOS)
    return ids


def pad_seq(seq, max_len):
    """Right-pad a list of ids with PAD until it holds ``max_len`` entries.

    :param seq: list of word ids
    :param max_len: int, target length
    :return: new list (``seq`` is not modified); a sequence that already has
        ``max_len`` or more entries comes back as an unchanged copy
    """
    missing = max_len - len(seq)
    # Multiplying a list by a non-positive count yields an empty list.
    return seq + [PAD] * missing

In [None]:
class Vocab(object):
    """Two-way mapping between words and integer ids.

    ``word2id`` maps word -> id and ``id2word`` maps id -> word; the two
    dictionaries are kept consistent with each other.
    """

    def __init__(self, word2id=None):
        """
        :param word2id: optional dict of pre-assigned ids (e.g. the special
            tokens). It is copied, so the caller's dict is never modified.
        """
        self.word2id = dict(word2id) if word2id is not None else {}
        self.id2word = {idx: word for word, idx in self.word2id.items()}

    def build_vocab(self, sentences, min_count=1):
        """Add every word that occurs at least ``min_count`` times.

        Words are added in order of decreasing frequency (ties keep
        first-seen order). Each new word receives ``len(self.word2id)`` as
        its id, so the seed table is expected to use the ids 0..n-1.
        Words that already have an id are left untouched.

        :param sentences: iterable of sentences, each an iterable of words
        :param min_count: int, minimum number of occurrences for a word to be
            added
        """
        word_counter = defaultdict(int)
        for sentence in sentences:
            for word in sentence:
                word_counter[word] += 1

        for word, count in sorted(word_counter.items(), key=lambda x: x[1], reverse=True):
            # Skip known words: registering them again would add an id2word
            # entry with no matching word2id entry and inflate the
            # vocabulary size derived from len(id2word).
            if count < min_count or word in self.word2id:
                continue
            _id = len(self.word2id)
            self.word2id[word] = _id
            self.id2word[_id] = word

In [None]:
# Sanity check: load the corpus, split it, and build both vocabularies.
# Only the first half of each file is kept.
train_X = load_data('./data/train.en')
train_X = train_X[:len(train_X) // 2]
train_Y = load_data('./data/train.ja')
train_Y = train_Y[:len(train_Y) // 2]

# Hold out part of the sentence pairs for validation (test_size=0.2),
# seeded by random_state.
train_X, valid_X, train_Y, valid_Y = train_test_split(
    train_X, train_Y, test_size=0.2, random_state=random_state
)

# Both vocabularies start from the special-token table in word2id and are
# built from the training split only, keeping words seen at least twice.
vocab_X, vocab_Y = Vocab(word2id), Vocab(word2id)
vocab_X.build_vocab(train_X, min_count=2)
vocab_Y.build_vocab(train_Y, min_count=2)

vocab_size_X, vocab_size_Y = len(vocab_X.id2word), len(vocab_Y.id2word)
print('入力言語の語彙数：', vocab_size_X)
print('出力言語の語彙数：', vocab_size_Y)

In [None]:
# Replace each split's token lists with id lists. The X splits are encoded
# with vocab_X and the Y splits with vocab_Y.
# NOTE: this rebinds the same names, so the cell must run exactly once after
# the split; running it again would feed ids back into sentence_to_ids.
train_X, valid_X = (
    [sentence_to_ids(vocab_X, tokens) for tokens in split]
    for split in (train_X, valid_X)
)
train_Y, valid_Y = (
    [sentence_to_ids(vocab_Y, tokens) for tokens in split]
    for split in (train_Y, valid_Y)
)

In [None]:
# Sanity check
"""
train_X[0]
>>> [18, 86, 9, 52, 342, 32, 22, 4, 2]
Since EOS is set to 3 in this notebook, the output is instead
>>> [18, 86, 9, 52, 342, 32, 22, 4, 3]
"""
train_X[0]

### DataLoader

In [None]:
class DataLoader(object):
    """Mini-batch iterator over paired lists of word-id sequences.

    Each iteration step yields ``(batch_X, batch_Y, lengths_X)``. After the
    last batch the loader resets itself, so the same object can be iterated
    once per epoch.
    """

    def __init__(self, X=None, Y=None, batch_size=1, shuffle=False):
        """
        :param X: list, source-language sentences (each a list of word ids)
        :param Y: list, target-language sentences (each a list of word ids)
        :param batch_size: int, number of sentence pairs per batch
        :param shuffle: bool, whether to shuffle the sample order on every
            reset
        :raises ValueError: if X and Y differ in length or batch_size < 1

        All parameters have defaults so that the zero-argument constructor
        still works; a loader built without data yields no batches.
        """
        X = [] if X is None else list(X)
        Y = [] if Y is None else list(Y)
        if len(X) != len(Y):
            raise ValueError(
                'X and Y must contain the same number of sentences '
                '({} != {})'.format(len(X), len(Y))
            )
        if batch_size < 1:
            raise ValueError('batch_size must be at least 1')

        self.data = list(zip(X, Y))
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.start_index = 0
        self.reset()

    def __iter__(self):
        return self

    def reset(self):
        """Shuffle the samples (if enabled) and rewind to the first batch."""
        if self.shuffle:
            # torch.randperm draws from PyTorch's global generator, so the
            # order is reproducible under torch.manual_seed.
            order = torch.randperm(len(self.data)).tolist()
            self.data = [self.data[i] for i in order]
        self.start_index = 0

    def __next__(self):
        """Return the next batch.

        :return: tuple ``(batch_X, batch_Y, lengths_X)`` where batch_X and
            batch_Y are long tensors of shape (max_len, batch) on ``device``
            and lengths_X lists the unpadded source lengths, longest first
        :raises StopIteration: when every sample has been served
        """
        # Once the pointer reaches the end, rewind for the next epoch.
        if self.start_index >= len(self.data):
            self.reset()
            raise StopIteration

        # Take the next slice of sentence pairs.
        pairs = self.data[self.start_index:self.start_index + self.batch_size]

        # Sort the pairs by source length, longest first (presumably so the
        # encoder can pack the padded batch -- confirm against the model).
        pairs = sorted(pairs, key=lambda pair: len(pair[0]), reverse=True)
        seqs_X, seqs_Y = zip(*pairs)
        lengths_X = [len(seq) for seq in seqs_X]

        # Pad the shorter sequences at the end, each side to its own maximum.
        max_len_X = max(lengths_X)
        max_len_Y = max(len(seq) for seq in seqs_Y)
        padded_X = [pad_seq(list(seq), max_len_X) for seq in seqs_X]
        padded_Y = [pad_seq(list(seq), max_len_Y) for seq in seqs_Y]

        # Convert to tensors and transpose to (max_len, batch).
        batch_X = torch.tensor(padded_X, dtype=torch.long, device=device).transpose(0, 1)
        batch_Y = torch.tensor(padded_Y, dtype=torch.long, device=device).transpose(0, 1)

        # Advance the pointer.
        self.start_index += self.batch_size

        return batch_X, batch_Y, lengths_X

### Model


In [None]:
class Encoder(nn.Module):
    """Placeholder for the encoder network.

    The constructor only runs ``nn.Module``'s initialisation: no layers are
    registered and ``forward`` is not defined yet.
    """

    def __init__(self):
        super().__init__()


class Decoder(nn.Module):
    """Placeholder for the decoder network.

    The constructor only runs ``nn.Module``'s initialisation: no layers are
    registered and ``forward`` is not defined yet.
    """

    def __init__(self):
        super().__init__()


class EncoderDecoder(nn.Module):
    """Placeholder for the full encoder-decoder model.

    The constructor only runs ``nn.Module``'s initialisation: no sub-modules
    are registered and ``forward`` is not defined yet.

    NOTE(review): the training cell builds this class with ``input_size``,
    ``output_size`` and ``hidden_size`` keyword arguments, which this
    constructor does not accept yet.
    """

    def __init__(self):
        super().__init__()

### Training

In [None]:
# hyper parameters
num_epochs, batch_size = 10, 64
lr = 1e-3
teacher_forcing_rate = 0.2
ckpt_path = 'model.pth'

# Keyword arguments for the model constructor; the input/output sizes are the
# source and target vocabulary sizes.
model_args = {'input_size': vocab_size_X, 'output_size': vocab_size_Y, 'hidden_size': 256}

# Placeholders: no data loaders are constructed yet.
train_dataloader = valid_dataloader = None

# NOTE(review): the EncoderDecoder placeholder in this notebook takes no
# constructor arguments and registers no parameters, so the two statements
# below cannot succeed until the model is implemented.
model = EncoderDecoder(**model_args).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)

### Validation
- [ ] Epoch 10: Train/Loss ..., BLEU ...; Valid/Loss ≈ 42, BLEU ≈ 13 (approximate values)