# 7 RNNによる文章生成

## 7.3 seq2seqの実装

seq2seqは2つのRNNを組み合わせたニューラルネットワークである。
その2つのRNNをEncoderクラスとDecoderクラスとしてそれぞれ実装する。
そして、その2つのクラスを組み合わせて、Seq2seqクラスを実装する。

### 7.3.1 Encoderクラス

まずEncoderクラスを実装する（図7-15を見るのが分かりやすい）。

In [1]:
import sys
sys.path.append('..')
from common.time_layers import *
from common.base_model import BaseModel

In [2]:
class Encoder:
    """Encoder half of seq2seq: a TimeEmbedding layer feeding a TimeLSTM layer.

    ``forward`` hands back only the LSTM output at the last index of the
    time axis (axis 1); ``backward`` accepts the gradient for that single
    slice and pushes it back through both layers.

    Attributes:
        embed: TimeEmbedding layer built from ``embed_W``.
        lstm: TimeLSTM layer created with ``stateful=False``.
        params: Concatenation of the layers' ``params`` lists.
        grads: Concatenation of the layers' ``grads`` lists.
        hs: LSTM output cached by the latest ``forward`` call (None before).
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Draw float32 initial weights and assemble the two layers.

        Args:
            vocab_size: Number of rows of the embedding matrix.
            wordvec_size: Number of columns of the embedding matrix, and
                number of rows of the LSTM input weight.
            hidden_size: LSTM hidden size; the LSTM weights and bias are
                ``4 * hidden_size`` wide.
        """
        randn = np.random.randn

        # The weights are drawn in this order (embedding, Wx, Wh); keep it so
        # that a fixed random seed yields the same initial values.
        embed_W = (randn(vocab_size, wordvec_size) / 100).astype('f')
        lstm_Wx = (randn(wordvec_size, 4 * hidden_size)
                   / np.sqrt(wordvec_size)).astype('f')
        lstm_Wh = (randn(hidden_size, 4 * hidden_size)
                   / np.sqrt(hidden_size)).astype('f')
        lstm_b = np.zeros(4 * hidden_size).astype('f')

        self.embed = TimeEmbedding(embed_W)
        # stateful=False: the hidden state is not kept between forward calls.
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=False)

        self.params, self.grads = [], []
        for layer in (self.embed, self.lstm):
            self.params += layer.params
            self.grads += layer.grads
        self.hs = None

    def forward(self, xs):
        """Run embedding then LSTM and return the last time step's output.

        The full LSTM output is stored in ``self.hs`` for ``backward``.
        """
        self.hs = self.lstm.forward(self.embed.forward(xs))
        return self.hs[:, -1, :]

    def backward(self, dh):
        """Backpropagate ``dh``, the gradient of the value ``forward`` returned.

        Returns:
            Whatever the embedding layer's ``backward`` returns.
        """
        # Only the last time step was exposed by forward(), so every other
        # time step receives a zero gradient.
        dhs = np.zeros_like(self.hs)
        dhs[:, -1, :] = dh
        return self.embed.backward(self.lstm.backward(dhs))


注１：今回は0から9の数字と「+」と「 」（空白文字）と「 _ 」の計１３文字を扱うため、vocab_sizeは13になる。

注２：5章と6章の言語モデルでは「長い時系列データ」がひとつだけ存在する問題として扱った。
そこではstateful=Trueとして隠れ状態を維持した。
今回は「短い時系列データ」が複数存在する問題なので、問題ごとにLSTMの隠れ状態をリセットする。

### 7.3.2 Decoderクラス

次にDecoderを実装しよう。
学習時のレイヤ構成は図7-17に書いてある。
今回の問題は足し算なので、決定的に答えを生成したい。
そのため生成時にはargmaxノードを使う（図7-18）。

このように学習時と生成時でレイヤ構成が異なるのでDecoderクラスはTime Softmax with Lossレイヤの前までを担当することにする（図7-19）。

In [3]:
class Decoder:
    """Decoder half of seq2seq: TimeEmbedding -> TimeLSTM -> TimeAffine.

    The class stops at the scores produced by the affine layer; the loss
    layer used in training and the argmax used in generation live outside
    ``forward``.

    Attributes:
        embed: TimeEmbedding layer.
        lstm: TimeLSTM layer created with ``stateful=True``.
        affine: TimeAffine layer producing the scores.
        params: Concatenated ``params`` of the three layers, in that order.
        grads: Concatenated ``grads`` of the three layers, in that order.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Draw float32 initial weights and assemble the three layers.

        Args:
            vocab_size: Rows of the embedding matrix and columns of the
                affine weight.
            wordvec_size: Columns of the embedding matrix.
            hidden_size: LSTM hidden size.
        """
        randn = np.random.randn

        # Draw order (embedding, Wx, Wh, affine W) is kept so a fixed random
        # seed reproduces the same initial weights.
        embed_W = (randn(vocab_size, wordvec_size) / 100).astype('f')
        lstm_Wx = (randn(wordvec_size, 4 * hidden_size)
                   / np.sqrt(wordvec_size)).astype('f')
        lstm_Wh = (randn(hidden_size, 4 * hidden_size)
                   / np.sqrt(hidden_size)).astype('f')
        lstm_b = np.zeros(4 * hidden_size).astype('f')
        affine_W = (randn(hidden_size, vocab_size)
                    / np.sqrt(hidden_size)).astype('f')
        affine_b = np.zeros(vocab_size).astype('f')

        self.embed = TimeEmbedding(embed_W)
        # stateful=True so the hidden state handed over from the encoder
        # (installed with set_state) is kept by the LSTM.
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine = TimeAffine(affine_W, affine_b)

        self.params = self.embed.params + self.lstm.params + self.affine.params
        self.grads = self.embed.grads + self.lstm.grads + self.affine.grads

    def forward(self, xs, h):
        """Return the scores for ``xs`` after seeding the LSTM state with ``h``."""
        self.lstm.set_state(h)

        embedded = self.embed.forward(xs)
        hidden = self.lstm.forward(embedded)
        return self.affine.forward(hidden)

    def backward(self, dscore):
        """Backpropagate ``dscore`` through affine, LSTM and embedding.

        Returns:
            ``self.lstm.dh`` as it stands after the LSTM backward pass; the
            caller forwards it to the encoder.
        """
        dhidden = self.affine.backward(dscore)
        dembedded = self.lstm.backward(dhidden)
        self.embed.backward(dembedded)
        return self.lstm.dh

    def generate(self, h, start_id, sample_size):
        """Greedily generate ``sample_size`` ids, starting from ``start_id``.

        Each step feeds the previous id as a (1, 1) array and picks the
        argmax of the flattened scores, so the output is deterministic.

        Returns:
            A list of Python ints of length ``sample_size``.
        """
        self.lstm.set_state(h)

        sampled = []
        current_id = start_id
        for _ in range(sample_size):
            x = np.array(current_id).reshape(1, 1)
            hidden = self.lstm.forward(self.embed.forward(x))
            score = self.affine.forward(hidden)

            current_id = np.argmax(score.flatten())
            sampled.append(int(current_id))

        return sampled


### 7.3.3 Seq2seqクラス

Seq2seqは繋ぎ合わせるだけ。

In [4]:
class Seq2seq(BaseModel):
    """Encoder + Decoder + TimeSoftmaxWithLoss wired together.

    Attributes:
        encoder: Encoder instance.
        decoder: Decoder instance.
        softmax: TimeSoftmaxWithLoss layer used for the training loss.
        params: Encoder params followed by decoder params.
        grads: Encoder grads followed by decoder grads.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Build the encoder and decoder from the same three sizes."""
        self.encoder = Encoder(vocab_size, wordvec_size, hidden_size)
        self.decoder = Decoder(vocab_size, wordvec_size, hidden_size)
        self.softmax = TimeSoftmaxWithLoss()

        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads

    def forward(self, xs, ts):
        """Return the loss for input batch ``xs`` and target batch ``ts``.

        The decoder is fed ``ts`` without its last column and is scored
        against ``ts`` without its first column (targets shifted by one).
        """
        decoder_xs = ts[:, :-1]
        decoder_ts = ts[:, 1:]

        h = self.encoder.forward(xs)
        score = self.decoder.forward(decoder_xs, h)
        return self.softmax.forward(score, decoder_ts)

    def backward(self, dout=1):
        """Backpropagate from the loss through the decoder, then the encoder.

        Returns:
            Whatever the encoder's ``backward`` returns.
        """
        dscore = self.softmax.backward(dout)
        dh = self.decoder.backward(dscore)
        return self.encoder.backward(dh)

    def generate(self, xs, start_id, sample_size):
        """Encode ``xs`` and let the decoder generate ``sample_size`` ids."""
        h = self.encoder.forward(xs)
        return self.decoder.generate(h, start_id, sample_size)


### 7.3.4 seq2seqの評価



In [5]:
import sys
sys.path.append('..')
import numpy as np
import matplotlib.pyplot as plt
from dataset import sequence
from common.optimizer import Adam
from common.trainer import Trainer
from common.util import eval_seq2seq
#from seq2seq import Seq2seq
#from peeky_seq2seq import PeekySeq2seq


# Load the addition dataset and the character <-> id lookup tables.
(x_train, t_train), (x_test, t_test) = sequence.load_data('addition.txt')
char_to_id, id_to_char = sequence.get_vocab()

# Input reversal switch: when True, every input row is flipped along axis 1.
is_reverse = False  # set to True to reverse the inputs
if is_reverse:
    x_train = x_train[:, ::-1]
    x_test = x_test[:, ::-1]

# Hyperparameters.
# NOTE(review): 'hideen_size' looks like a misspelling of 'hidden_size'; the
# name is kept because it is a notebook-level global.
vocab_size = len(char_to_id)
wordvec_size, hideen_size = 16, 128
batch_size, max_epoch, max_grad = 128, 25, 5.0

# Model selection: plain Seq2seq in this cell (PeekySeq2seq takes the same
# three arguments).
model = Seq2seq(vocab_size, wordvec_size, hideen_size)
optimizer = Adam()
trainer = Trainer(model, optimizer)

acc_list = []  # test-set accuracy recorded after every epoch
for epoch in range(max_epoch):
    # Train one epoch at a time so accuracy can be measured in between.
    trainer.fit(x_train, t_train, max_epoch=1,
                batch_size=batch_size, max_grad=max_grad)

    correct_num = 0
    for i in range(len(x_test)):
        verbose = i < 10  # verbose flag is on for the first ten samples only
        # Indexing with a one-element list keeps the leading axis, so each
        # sample is passed as a one-row batch.
        question = x_test[[i]]
        correct = t_test[[i]]
        correct_num += eval_seq2seq(model, question, correct,
                                    id_to_char, verbose, is_reverse)

    acc = float(correct_num) / len(x_test)
    acc_list.append(acc)
    print('val acc %.3f%%' % (acc * 100))

| epoch 1 |  iter 1 / 351 | time 0[s] | loss 2.56
| epoch 1 |  iter 21 / 351 | time 3[s] | loss 2.53
| epoch 1 |  iter 41 / 351 | time 5[s] | loss 2.17
| epoch 1 |  iter 61 / 351 | time 8[s] | loss 1.96
| epoch 1 |  iter 81 / 351 | time 11[s] | loss 1.92
| epoch 1 |  iter 101 / 351 | time 14[s] | loss 1.87
| epoch 1 |  iter 121 / 351 | time 17[s] | loss 1.85
| epoch 1 |  iter 141 / 351 | time 20[s] | loss 1.83
| epoch 1 |  iter 161 / 351 | time 23[s] | loss 1.79
| epoch 1 |  iter 181 / 351 | time 26[s] | loss 1.77
| epoch 1 |  iter 201 / 351 | time 29[s] | loss 1.77
| epoch 1 |  iter 221 / 351 | time 32[s] | loss 1.76
| epoch 1 |  iter 241 / 351 | time 34[s] | loss 1.76
| epoch 1 |  iter 261 / 351 | time 37[s] | loss 1.76
| epoch 1 |  iter 281 / 351 | time 40[s] | loss 1.75
| epoch 1 |  iter 301 / 351 | time 43[s] | loss 1.74
| epoch 1 |  iter 321 / 351 | time 46[s] | loss 1.75
| epoch 1 |  iter 341 / 351 | time 49[s] | loss 1.74
Q 77+85  
T 162 
X 100 
---
Q 975+164
T 1139
X 1000
---


## 7.4 seq2seqの改良

### 7.4.1 入力データの反転（Reverse）

入力データを反転させると学習の進みが良くなる。
理論的なことは分かってないが（わかってないんかーい）、直感的には勾配の伝播がスムーズになるからだと考えられる。

In [6]:
import sys
sys.path.append('..')
import numpy as np
import matplotlib.pyplot as plt
from dataset import sequence
from common.optimizer import Adam
from common.trainer import Trainer
from common.util import eval_seq2seq
#from seq2seq import Seq2seq
#from peeky_seq2seq import PeekySeq2seq


# Load the addition dataset and the character <-> id lookup tables.
(x_train, t_train), (x_test, t_test) = sequence.load_data('addition.txt')
char_to_id, id_to_char = sequence.get_vocab()

# Input reversal switch: when True, every input row is flipped along axis 1.
is_reverse = True  # this is the setting changed for this experiment
if is_reverse:
    x_train = x_train[:, ::-1]
    x_test = x_test[:, ::-1]

# Hyperparameters.
# NOTE(review): 'hideen_size' looks like a misspelling of 'hidden_size'; the
# name is kept because it is a notebook-level global.
vocab_size = len(char_to_id)
wordvec_size, hideen_size = 16, 128
batch_size, max_epoch, max_grad = 128, 25, 5.0

# Model selection: plain Seq2seq in this cell (PeekySeq2seq takes the same
# three arguments).
model = Seq2seq(vocab_size, wordvec_size, hideen_size)
optimizer = Adam()
trainer = Trainer(model, optimizer)

acc_list = []  # test-set accuracy recorded after every epoch
for epoch in range(max_epoch):
    # Train one epoch at a time so accuracy can be measured in between.
    trainer.fit(x_train, t_train, max_epoch=1,
                batch_size=batch_size, max_grad=max_grad)

    correct_num = 0
    for i in range(len(x_test)):
        verbose = i < 10  # verbose flag is on for the first ten samples only
        # Indexing with a one-element list keeps the leading axis, so each
        # sample is passed as a one-row batch.
        question = x_test[[i]]
        correct = t_test[[i]]
        correct_num += eval_seq2seq(model, question, correct,
                                    id_to_char, verbose, is_reverse)

    acc = float(correct_num) / len(x_test)
    acc_list.append(acc)
    print('val acc %.3f%%' % (acc * 100))

| epoch 1 |  iter 1 / 351 | time 0[s] | loss 2.56
| epoch 1 |  iter 21 / 351 | time 2[s] | loss 2.52
| epoch 1 |  iter 41 / 351 | time 5[s] | loss 2.17
| epoch 1 |  iter 61 / 351 | time 8[s] | loss 1.96
| epoch 1 |  iter 81 / 351 | time 11[s] | loss 1.91
| epoch 1 |  iter 101 / 351 | time 14[s] | loss 1.87
| epoch 1 |  iter 121 / 351 | time 16[s] | loss 1.86
| epoch 1 |  iter 141 / 351 | time 19[s] | loss 1.84
| epoch 1 |  iter 161 / 351 | time 22[s] | loss 1.80
| epoch 1 |  iter 181 / 351 | time 25[s] | loss 1.78
| epoch 1 |  iter 201 / 351 | time 28[s] | loss 1.77
| epoch 1 |  iter 221 / 351 | time 31[s] | loss 1.77
| epoch 1 |  iter 241 / 351 | time 33[s] | loss 1.76
| epoch 1 |  iter 261 / 351 | time 36[s] | loss 1.75
| epoch 1 |  iter 281 / 351 | time 39[s] | loss 1.74
| epoch 1 |  iter 301 / 351 | time 42[s] | loss 1.74
| epoch 1 |  iter 321 / 351 | time 45[s] | loss 1.74
| epoch 1 |  iter 341 / 351 | time 48[s] | loss 1.73
Q 77+85  
T 162 
X 100 
---
Q 975+164
T 1139
X 1000
---


### 7.4.2 覗き見（Peeky）

Encoderは入力文（問題文）を固定長のベクトルhに変換する。
そのhがDecoderにとって唯一の情報源である。
しかし、現状のseq2seqは最初の時刻のLSTMレイヤのみがベクトルhを利用している（図7-25）。
この重要な情報であるhをもっと活用する改善が図7-26。

In [7]:
class PeekyDecoder:
    """Decoder variant that shows the encoder vector ``h`` to every time step.

    Besides seeding the LSTM state with ``h``, ``h`` is concatenated (along
    the feature axis) in front of the LSTM input and in front of the affine
    input at every time step. The LSTM input weight therefore has ``H + D``
    rows and the affine weight ``H + H`` rows.

    Attributes:
        embed: TimeEmbedding layer.
        lstm: TimeLSTM layer created with ``stateful=True``.
        affine: TimeAffine layer producing the scores.
        params: Concatenated ``params`` of the three layers, in that order.
        grads: Concatenated ``grads`` of the three layers, in that order.
        cache: ``H`` (second dimension of ``h``) remembered by ``forward``
            for use in ``backward``; None before the first forward pass.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Draw float32 initial weights and assemble the three layers.

        Args:
            vocab_size: Rows of the embedding matrix and columns of the
                affine weight.
            wordvec_size: Columns of the embedding matrix.
            hidden_size: LSTM hidden size.
        """
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype('f')
        # LSTM input is [h, embedding], so the input width is H + D (was D).
        lstm_Wx = (rn(H + D, 4 * H) / np.sqrt(H + D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        # Affine input is [h, lstm output], so the input width is H + H (was H).
        affine_W = (rn(H + H, V) / np.sqrt(H + H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine = TimeAffine(affine_W, affine_b)

        self.params, self.grads = [], []
        for layer in (self.embed, self.lstm, self.affine):
            self.params += layer.params
            self.grads += layer.grads
        self.cache = None

    def forward(self, xs, h):
        """Return the scores for ``xs`` with ``h`` visible at every time step.

        Args:
            xs: 2-D array of ids, unpacked as ``(N, T)``.
            h: 2-D encoder output, unpacked as ``(N, H)``.
        """
        N, T = xs.shape
        N, H = h.shape

        self.lstm.set_state(h)

        out = self.embed.forward(xs)
        # Tile h along a new time axis so that hs[n, t] == h[n] for every t.
        hs = np.repeat(h, T, axis=0).reshape(N, T, H)
        out = np.concatenate((hs, out), axis=2)

        out = self.lstm.forward(out)
        out = np.concatenate((hs, out), axis=2)

        score = self.affine.forward(out)
        self.cache = H
        return score

    def backward(self, dscore):
        """Backpropagate ``dscore`` and return the gradient with respect to ``h``.

        ``h`` was used in three places (initial LSTM state, LSTM input,
        affine input), so its gradient is the sum of the three contributions.
        """
        H = self.cache

        dout = self.affine.backward(dscore)
        # Undo the second concatenation: first H features belong to hs.
        dout, dhs0 = dout[:, :, H:], dout[:, :, :H]
        dout = self.lstm.backward(dout)
        # Undo the first concatenation: first H features belong to hs.
        dembed, dhs1 = dout[:, :, H:], dout[:, :, :H]
        self.embed.backward(dembed)

        dhs = dhs0 + dhs1
        # hs was h repeated over the time axis, so sum its gradient over time.
        dh = self.lstm.dh + np.sum(dhs, axis=1)
        return dh

    def generate(self, h, start_id, sample_size):
        """Greedily generate ``sample_size`` ids, starting from ``start_id``.

        Args:
            h: Encoder output; reshaped to ``(1, 1, H)``, so it must hold a
                single sample.
            start_id: Id fed to the decoder at the first step.
            sample_size: Number of ids to generate.

        Returns:
            A list of Python ints of length ``sample_size`` (the same element
            type that ``Decoder.generate`` returns).
        """
        sampled = []
        char_id = start_id
        self.lstm.set_state(h)

        H = h.shape[1]
        peeky_h = h.reshape(1, 1, H)
        for _ in range(sample_size):
            x = np.array([char_id]).reshape((1, 1))
            out = self.embed.forward(x)

            out = np.concatenate((peeky_h, out), axis=2)
            out = self.lstm.forward(out)
            out = np.concatenate((peeky_h, out), axis=2)
            score = self.affine.forward(out)

            char_id = np.argmax(score.flatten())
            # np.argmax yields a NumPy integer scalar; store a plain int so
            # the result type matches Decoder.generate.
            sampled.append(int(char_id))

        return sampled


class PeekySeq2seq(Seq2seq):
    """Seq2seq whose decoder is a PeekyDecoder.

    Only the constructor is overridden; every other method comes from Seq2seq.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Build the encoder, the peeky decoder and the loss layer."""
        encoder = Encoder(vocab_size, wordvec_size, hidden_size)
        decoder = PeekyDecoder(vocab_size, wordvec_size, hidden_size)

        self.encoder, self.decoder = encoder, decoder
        self.softmax = TimeSoftmaxWithLoss()

        self.params = encoder.params + decoder.params
        self.grads = encoder.grads + decoder.grads


In [8]:
import sys
sys.path.append('..')
import numpy as np
import matplotlib.pyplot as plt
from dataset import sequence
from common.optimizer import Adam
from common.trainer import Trainer
from common.util import eval_seq2seq
#from seq2seq import Seq2seq
#from peeky_seq2seq import PeekySeq2seq


# Load the addition dataset and the character <-> id lookup tables.
(x_train, t_train), (x_test, t_test) = sequence.load_data('addition.txt')
char_to_id, id_to_char = sequence.get_vocab()

# Input reversal switch: when True, every input row is flipped along axis 1.
is_reverse = True
if is_reverse:
    x_train = x_train[:, ::-1]
    x_test = x_test[:, ::-1]

# Hyperparameters.
# NOTE(review): 'hideen_size' looks like a misspelling of 'hidden_size'; the
# name is kept because it is a notebook-level global.
vocab_size = len(char_to_id)
wordvec_size, hideen_size = 16, 128
batch_size, max_epoch, max_grad = 128, 25, 5.0

# Model selection: PeekySeq2seq is the change made for this experiment
# (plain Seq2seq takes the same three arguments).
model = PeekySeq2seq(vocab_size, wordvec_size, hideen_size)
optimizer = Adam()
trainer = Trainer(model, optimizer)

acc_list = []  # test-set accuracy recorded after every epoch
for epoch in range(max_epoch):
    # Train one epoch at a time so accuracy can be measured in between.
    trainer.fit(x_train, t_train, max_epoch=1,
                batch_size=batch_size, max_grad=max_grad)

    correct_num = 0
    for i in range(len(x_test)):
        verbose = i < 10  # verbose flag is on for the first ten samples only
        # Indexing with a one-element list keeps the leading axis, so each
        # sample is passed as a one-row batch.
        question = x_test[[i]]
        correct = t_test[[i]]
        correct_num += eval_seq2seq(model, question, correct,
                                    id_to_char, verbose, is_reverse)

    acc = float(correct_num) / len(x_test)
    acc_list.append(acc)
    print('val acc %.3f%%' % (acc * 100))

| epoch 1 |  iter 1 / 351 | time 0[s] | loss 2.57
| epoch 1 |  iter 21 / 351 | time 3[s] | loss 2.48
| epoch 1 |  iter 41 / 351 | time 6[s] | loss 2.20
| epoch 1 |  iter 61 / 351 | time 9[s] | loss 1.99
| epoch 1 |  iter 81 / 351 | time 12[s] | loss 1.89
| epoch 1 |  iter 101 / 351 | time 15[s] | loss 1.82
| epoch 1 |  iter 121 / 351 | time 18[s] | loss 1.82
| epoch 1 |  iter 141 / 351 | time 21[s] | loss 1.80
| epoch 1 |  iter 161 / 351 | time 24[s] | loss 1.79
| epoch 1 |  iter 181 / 351 | time 27[s] | loss 1.78
| epoch 1 |  iter 201 / 351 | time 30[s] | loss 1.77
| epoch 1 |  iter 221 / 351 | time 33[s] | loss 1.76
| epoch 1 |  iter 241 / 351 | time 37[s] | loss 1.76
| epoch 1 |  iter 261 / 351 | time 40[s] | loss 1.75
| epoch 1 |  iter 281 / 351 | time 43[s] | loss 1.74
| epoch 1 |  iter 301 / 351 | time 46[s] | loss 1.74
| epoch 1 |  iter 321 / 351 | time 49[s] | loss 1.73
| epoch 1 |  iter 341 / 351 | time 52[s] | loss 1.73
Q 77+85  
T 162 
X 100 
---
Q 975+164
T 1139
X 1013
---


ReverseとPeekyが共に効果的に働いていることが分かる。
ここで行った改良は小さな改良に過ぎず、次章ではseq2seqに大きな改良を加える。
それはAttentionと呼ばれるテクニックで、seq2seqを劇的に進化させることができる！（土田さんの解説に期待が高まる）

Peekyを用いることで、重みパラメータが増え、計算量が増加していることには注意が必要である。
また、seq2seqの精度はハイパーパラメータの調整で大きく変わる。