In [1]:
from dataset import sequence
import numpy as np
import time
import matplotlib.pyplot as plt

In [2]:
class AttentionSeq2seqlm:
    """Encoder-decoder LSTM with dot-product attention, written in plain NumPy.

    An LSTM encoder reads the input ids; an LSTM decoder, whose initial hidden
    state is the encoder's final hidden state, reads the shifted target ids.
    Every decoder state attends over all encoder states, and an affine +
    softmax layer maps the concatenated ``[context, decoder state]`` vector to
    vocabulary probabilities.

    ``self.params`` holds the learnable arrays in this fixed order::

        enc_embed, enc_lstmWx, enc_lstmWh, enc_lstmb,
        dec_embed, dec_lstmWx, dec_lstmWh, dec_lstmb,
        dec_affineW, dec_affineb

    ``self.grads`` (filled by :meth:`backward`) uses the same order.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create randomly initialised float32 parameters and optimiser state.

        Args:
            vocab_size: number of distinct token ids (V).
            wordvec_size: embedding width (D).
            hidden_size: LSTM hidden/cell width (H).
        """
        V, D, H = vocab_size, wordvec_size, hidden_size

        # The order of the random draws is kept stable so that a seeded
        # np.random yields the same initial parameters as before.
        rn = np.random.randn
        enc_embed = (rn(V, D) / 100).astype('f')
        enc_lstmWx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        enc_lstmWh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        enc_lstmb = np.zeros(4 * H).astype('f')

        dec_embed = (rn(V, D) / 100).astype('f')
        dec_lstmWx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        dec_lstmWh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        dec_lstmb = np.zeros(4 * H).astype('f')
        # The affine layer consumes [context, decoder state], hence 2 * H rows.
        dec_affineW = (rn(2 * H, V) / np.sqrt(H)).astype('f')
        dec_affineb = np.zeros(V).astype('f')

        self.params = [enc_embed, enc_lstmWx, enc_lstmWh, enc_lstmb,
                       dec_embed, dec_lstmWx, dec_lstmWh, dec_lstmb,
                       dec_affineW, dec_affineb]

        # Gradients and per-time-step caches, filled by forward()/backward().
        self.grads = []
        self.enc_lstm = []
        self.dec_lstm = []
        self.attention = []

        self.time_idx = 0
        self.vocab_size = vocab_size
        self.wordvec_size = wordvec_size
        self.hidden_size = hidden_size

        # Adam optimiser state; moment buffers are created lazily in updateAdam().
        self.beta1 = 0.9
        self.beta2 = 0.999
        self.iter = 0
        self.m = None
        self.v = None

    def _lstm_step(self, x, h_prev, c_prev, Wx, Wh, b):
        """Run one LSTM time step.

        The 4H-wide pre-activation is split into column blocks ordered
        [forget, candidate, input, output]; _lstm_backward_step stacks its
        gradients in the same order.

        Returns:
            (f, g, i, o, c_next, h_next): gate activations and the new states.
        """
        H = self.hidden_size
        A = np.matmul(x, Wx) + np.matmul(h_prev, Wh) + b

        f = self.sigmoid(A[:, :H])
        g = np.tanh(A[:, H:2 * H])
        i = self.sigmoid(A[:, 2 * H:3 * H])
        o = self.sigmoid(A[:, 3 * H:])

        c_next = f * c_prev + g * i
        h_next = o * np.tanh(c_next)
        return f, g, i, o, c_next, h_next

    def _lstm_backward_step(self, cache, dh_next, dc_next, Wx, Wh):
        """Back-propagate through one LSTM time step cached by forward().

        Args:
            cache: tuple stored in self.enc_lstm / self.dec_lstm for the step.
            dh_next: gradient w.r.t. this step's hidden output.
            dc_next: gradient w.r.t. this step's cell output (0 at the last step).
            Wx, Wh: the input and recurrent weight matrices of the LSTM.

        Returns:
            (dx, dh_prev, dc_prev, dWx, dWh, db)
        """
        emb_out, h_prev, c_prev, f, g, i, o, c_next, _ = cache

        tanh_c_next = np.tanh(c_next)
        ds = dc_next + (dh_next * o) * (1 - tanh_c_next ** 2)
        dc_prev = ds * f

        # Gate gradients, each multiplied by its activation's derivative.
        di = ds * g * (i * (1 - i))
        df = ds * c_prev * (f * (1 - f))
        do = dh_next * tanh_c_next * (o * (1 - o))
        dg = ds * i * (1 - g ** 2)

        dA = np.hstack((df, dg, di, do))

        dWx = np.matmul(emb_out.T, dA)
        dWh = np.matmul(h_prev.T, dA)
        db = np.sum(dA, axis=0)
        dx = np.matmul(dA, Wx.T)
        dh_prev = np.matmul(dA, Wh.T)
        return dx, dh_prev, dc_prev, dWx, dWh, db

    def _attention_step(self, enc_hs, h):
        """Dot-product attention of one decoder state over all encoder states.

        Args:
            enc_hs: encoder states, shape (batch, enc_time, hidden).
            h: decoder state, shape (batch, hidden).

        Returns:
            (context, hr, ar, a): the weighted sum of encoder states plus the
            intermediates that backward() needs (repeated query, repeated
            weights, attention weights).
        """
        batch_size, enc_time_size, hidden_size = enc_hs.shape
        hr = h.reshape(batch_size, 1, hidden_size).repeat(enc_time_size, axis=1)
        s = np.sum(enc_hs * hr, axis=2)
        a = self.softmax(s)
        ar = a.reshape(batch_size, enc_time_size, 1).repeat(hidden_size, axis=2)
        context = np.sum(enc_hs * ar, axis=1)
        return context, hr, ar, a

    def forward(self, enc_xs, ts):
        """Compute the mean cross-entropy loss for a batch (teacher forcing).

        Args:
            enc_xs: integer ids, shape (batch, enc_time).
            ts: integer ids, shape (batch, dec_time + 1); ``ts[:, :-1]`` is fed
                to the decoder and ``ts[:, 1:]`` is the prediction target.

        Returns:
            The loss averaged over batch * dec_time predictions. Intermediate
            values are cached on the instance for backward().
        """
        enc_embed, enc_lstmWx, enc_lstmWh, enc_lstmb, \
            dec_embed, dec_lstmWx, dec_lstmWh, dec_lstmb, dec_affineW, dec_affineb = self.params

        self.enc_lstm = []
        self.dec_lstm = []

        dec_xs = ts[:, :-1]
        dec_ts = ts[:, 1:]

        batch_size, enc_time_size = enc_xs.shape
        dec_time_size = dec_xs.shape[1]
        hidden_size = self.hidden_size

        ####################### ENCODER #######################

        # Buffers follow the parameter dtype (float32 as created by __init__).
        enc_dtype = enc_lstmWh.dtype
        h_prev = np.zeros((batch_size, hidden_size), dtype=enc_dtype)
        c_prev = np.zeros((batch_size, hidden_size), dtype=enc_dtype)
        enc_hs = np.zeros((batch_size, enc_time_size, hidden_size), dtype=enc_dtype)
        for t in range(enc_time_size):
            emb_out = enc_embed[enc_xs[:, t]]
            f, g, i, o, c_next, h_next = self._lstm_step(
                emb_out, h_prev, c_prev, enc_lstmWx, enc_lstmWh, enc_lstmb)

            enc_hs[:, t, :] = h_next
            self.enc_lstm.append((emb_out, h_prev, c_prev, f, g, i, o, c_next, h_next))
            c_prev, h_prev = c_next, h_next

        ####################### DECODER #######################

        # The decoder starts from the encoder's last hidden state (h_prev is
        # carried over) but from a fresh, zero cell state.
        dec_dtype = dec_lstmWh.dtype
        c_prev = np.zeros((batch_size, hidden_size), dtype=dec_dtype)
        dec_hs = np.zeros((batch_size, dec_time_size, hidden_size), dtype=dec_dtype)
        for t in range(dec_time_size):
            emb_out = dec_embed[dec_xs[:, t]]
            f, g, i, o, c_next, h_next = self._lstm_step(
                emb_out, h_prev, c_prev, dec_lstmWx, dec_lstmWh, dec_lstmb)

            dec_hs[:, t, :] = h_next
            self.dec_lstm.append((emb_out, h_prev, c_prev, f, g, i, o, c_next, h_next))
            c_prev, h_prev = c_next, h_next

        # attention
        self.attention = []
        c = np.zeros_like(dec_hs)
        for t in range(dec_time_size):
            context, hr, ar, a = self._attention_step(enc_hs, dec_hs[:, t, :])
            c[:, t, :] = context
            self.attention.append([hr, ar, a])

        # affine
        concat_attention = np.concatenate((c, dec_hs), axis=2)
        dec_affine_out = np.matmul(concat_attention, dec_affineW) + dec_affineb

        # softmax
        y = self.softmax(dec_affine_out.reshape(batch_size, dec_time_size, -1))
        loss = self.getLoss(y, dec_ts)

        self.xs = enc_xs, enc_hs, dec_xs, dec_hs, concat_attention, dec_affine_out, y
        self.ts = dec_ts
        return loss

    def backward(self):
        """Back-propagate the loss of the most recent forward() call.

        Stores the gradients in ``self.grads`` (same order as ``self.params``).
        The gradients are those of the *mean* loss returned by forward(); the
        cached forward values are left untouched, so calling backward() again
        yields the same result.
        """
        enc_embed, enc_lstmWx, enc_lstmWh, enc_lstmb, \
            dec_embed, dec_lstmWx, dec_lstmWh, dec_lstmb, dec_affineW, dec_affineb = self.params
        enc_xs, enc_hs, dec_xs, dec_hs, concat_attention, dec_affine_out, y = self.xs
        dec_ts = self.ts

        # Read the sizes from the instance, never from notebook globals.
        hidden_size = self.hidden_size
        wordvec_size = self.wordvec_size
        batch_size, enc_time_size = enc_xs.shape
        dec_time_size = dec_xs.shape[1]
        n_targets = batch_size * dec_time_size

        ####################### DECODER #######################

        # softmax + cross-entropy: (y - onehot) / n_targets. Work on a copy so
        # the cached probabilities are not corrupted.
        soft_dout = y.reshape(n_targets, -1).copy()
        soft_dout[np.arange(n_targets), dec_ts.reshape(n_targets)] -= 1
        soft_dout /= n_targets

        # affine
        concat_flat = concat_attention.reshape(n_targets, -1)
        affine_dout = np.matmul(soft_dout, dec_affineW.T).reshape(batch_size, dec_time_size, -1)
        affinedW = np.matmul(concat_flat.T, soft_dout)
        affinedb = np.sum(soft_dout, axis=0)

        # forward() concatenated (context, decoder state) in that order.
        d_context = affine_dout[:, :, :hidden_size]
        d_dec_hs = affine_dout[:, :, hidden_size:]

        # attention
        enc_dhs = np.zeros_like(enc_hs)
        dec_dhs = np.zeros_like(dec_hs)
        for t in range(dec_time_size):
            hr, ar, a = self.attention[t]

            # gradient of the weighted sum, repeated over encoder time
            dt2 = d_context[:, t, :].reshape(batch_size, 1, hidden_size).repeat(enc_time_size, axis=1)

            # gradient w.r.t. the attention weights
            da = np.sum(dt2 * enc_hs, axis=2)

            # softmax backward
            ds = a * da
            ds -= a * np.sum(ds, axis=1, keepdims=True)

            # gradient of the dot-product scores, repeated over hidden units
            dt1 = ds.reshape(batch_size, enc_time_size, 1).repeat(hidden_size, axis=2)

            # encoder states receive gradient via both the weighted sum and the scores
            enc_dhs += dt2 * ar + dt1 * hr
            dec_dhs[:, t, :] = np.sum(dt1 * enc_hs, axis=1)

        dec_dhs += d_dec_hs

        # lstm
        dec_lstmdWx = np.zeros_like(dec_lstmWx)
        dec_lstmdWh = np.zeros_like(dec_lstmWh)
        dec_lstmdb = np.zeros_like(dec_lstmb)

        dec_lstm_douts = np.empty((batch_size, dec_time_size, wordvec_size), dtype=dec_lstmWx.dtype)
        dh, dc = 0, 0
        for t in reversed(range(dec_time_size)):
            dx, dh, dc, dWx, dWh, db = self._lstm_backward_step(
                self.dec_lstm[t], dec_dhs[:, t, :] + dh, dc, dec_lstmWx, dec_lstmWh)

            dec_lstm_douts[:, t, :] = dx
            dec_lstmdWx += dWx
            dec_lstmdWh += dWh
            dec_lstmdb += db

        # embed
        dec_embed_dout = np.zeros_like(dec_embed)
        for t in range(dec_time_size):
            np.add.at(dec_embed_dout, dec_xs[:, t], dec_lstm_douts[:, t, :])

        ####################### ENCODER #######################

        # lstm
        enc_lstmdWx = np.zeros_like(enc_lstmWx)
        enc_lstmdWh = np.zeros_like(enc_lstmWh)
        enc_lstmdb = np.zeros_like(enc_lstmb)

        enc_lstm_douts = np.empty((batch_size, enc_time_size, wordvec_size), dtype=enc_lstmWx.dtype)
        # `dh` is carried over from the decoder loop: it is the gradient of the
        # decoder's initial hidden state, i.e. of the encoder's last output.
        # The decoder's cell state started from zeros, so no cell gradient flows.
        dc = 0
        for t in reversed(range(enc_time_size)):
            dx, dh, dc, dWx, dWh, db = self._lstm_backward_step(
                self.enc_lstm[t], enc_dhs[:, t, :] + dh, dc, enc_lstmWx, enc_lstmWh)

            enc_lstm_douts[:, t, :] = dx
            enc_lstmdWx += dWx
            enc_lstmdWh += dWh
            enc_lstmdb += db

        # embed
        enc_embed_dout = np.zeros_like(enc_embed)
        for t in range(enc_time_size):
            np.add.at(enc_embed_dout, enc_xs[:, t], enc_lstm_douts[:, t, :])

        self.grads = enc_embed_dout, enc_lstmdWx, enc_lstmdWh, enc_lstmdb, \
            dec_embed_dout, dec_lstmdWx, dec_lstmdWh, dec_lstmdb, affinedW, affinedb

    def softmax(self, y):
        """Softmax over the last axis.

        The maximum is subtracted per row (not globally) so that a row whose
        values are all far below the global maximum does not underflow to 0/0.
        """
        y = y - np.max(y, axis=-1, keepdims=True)
        y = np.exp(y)
        return y / y.sum(axis=-1, keepdims=True)

    def sigmoid(self, x):
        """Element-wise logistic function."""
        return 1 / (1 + np.exp(-x))

    def getLoss(self, y, t):
        """Mean cross-entropy of probabilities ``y`` (N, T, V) against ids ``t`` (N, T)."""
        N, T, V = y.shape

        y = y.reshape(N * T, V)
        t = t.reshape(N * T)

        ls = np.log(y[np.arange(N * T), t])
        return -np.sum(ls) / (N * T)

    def updateAdam(self, lr):
        """Apply one Adam update to ``self.params`` in place using ``self.grads``.

        Args:
            lr: base learning rate; the bias correction is folded into it.
        """
        params = self.params
        grads = self.grads

        if self.m is None:
            self.m = [np.zeros_like(param) for param in params]
            self.v = [np.zeros_like(param) for param in params]

        self.iter += 1
        lr_t = lr * np.sqrt(1.0 - self.beta2 ** self.iter) / (1.0 - self.beta1 ** self.iter)

        for i in range(len(params)):
            self.m[i] += (1 - self.beta1) * (grads[i] - self.m[i])
            self.v[i] += (1 - self.beta2) * (grads[i] ** 2 - self.v[i])

            params[i] -= lr_t * self.m[i] / (np.sqrt(self.v[i]) + 1e-7)
        self.params = params

    def clip_grads(self, max_norm):
        """Rescale ``self.grads`` in place so their global L2 norm is at most ``max_norm``."""
        grads = self.grads

        total_norm = 0
        for grad in grads:
            total_norm += np.sum(grad ** 2)
        total_norm = np.sqrt(total_norm)

        rate = max_norm / (total_norm + 1e-6)
        if rate < 1:
            for grad in grads:
                grad *= rate
        self.grads = grads

    def generate(self, xs, start_id, sample_size):
        """Greedily decode ``sample_size`` ids for a single input sequence.

        Args:
            xs: integer ids of shape (1, enc_time); only one sequence at a time
                is supported.
            start_id: id fed to the decoder at the first step.
            sample_size: number of ids to produce.

        Returns:
            A list of ``sample_size`` Python ints (arg-max at every step).
        """
        enc_embed, enc_lstmWx, enc_lstmWh, enc_lstmb, \
            dec_embed, dec_lstmWx, dec_lstmWh, dec_lstmb, dec_affineW, dec_affineb = self.params

        enc_time_size = xs.shape[1]
        hidden_size = self.hidden_size

        ####################### ENCODER #######################

        # States carry an explicit batch axis of length 1 so the same helpers
        # as in forward() can be used without relying on broadcasting.
        enc_dtype = enc_lstmWh.dtype
        h_prev = np.zeros((1, hidden_size), dtype=enc_dtype)
        c_prev = np.zeros((1, hidden_size), dtype=enc_dtype)
        enc_hs = np.zeros((1, enc_time_size, hidden_size), dtype=enc_dtype)
        for t in range(enc_time_size):
            emb_out = enc_embed[xs[:, t]]
            step = self._lstm_step(emb_out, h_prev, c_prev, enc_lstmWx, enc_lstmWh, enc_lstmb)
            c_prev, h_prev = step[4], step[5]
            enc_hs[:, t, :] = h_prev

        sampled = []
        sample_id = start_id

        ####################### DECODER #######################

        c_prev = np.zeros((1, hidden_size), dtype=dec_lstmWh.dtype)
        for _step in range(sample_size):
            # embed (force shape (1, D) whether sample_id is a scalar or a length-1 array)
            emb_out = dec_embed[sample_id].reshape(1, -1)

            # lstm
            step = self._lstm_step(emb_out, h_prev, c_prev, dec_lstmWx, dec_lstmWh, dec_lstmb)
            c_prev, h_prev = step[4], step[5]

            # attention
            context = self._attention_step(enc_hs, h_prev)[0]

            # affine
            concat_attention = np.concatenate((context, h_prev), axis=1)
            dec_affine_out = np.matmul(concat_attention, dec_affineW) + dec_affineb

            # softmax + greedy choice; the chosen id is fed back at the next step
            y = self.softmax(dec_affine_out)
            sample_id = np.argmax(y.flatten())
            sampled.append(int(sample_id))

        return sampled

    def eval_seq2seq(self, question, correct, id_to_char,
                     verbos=False, is_reverse=False):
        """Decode one question and compare the result with the reference answer.

        Args:
            question: integer ids of shape (1, enc_time).
            correct: reference ids; the first one is used as the decoder's
                start id, the remainder is the expected output.
            id_to_char: mapping from id to character, used to build strings.
            verbos: if true, print question, target and guess. (The parameter
                name is kept as-is so existing callers keep working.)
            is_reverse: if true, the question string is reversed before it is
                printed.

        Returns:
            1 if the generated string equals the reference string, else 0.
        """
        correct = correct.flatten()
        # The first id is the start token handed to the decoder.
        start_id = correct[0]
        correct = correct[1:]
        guess = self.generate(question, start_id, len(correct))

        # Convert id sequences to strings.
        question = ''.join([id_to_char[int(c)] for c in question.flatten()])
        correct = ''.join([id_to_char[int(c)] for c in correct])
        guess = ''.join([id_to_char[int(c)] for c in guess])

        if verbos:
            if is_reverse:
                question = question[::-1]

            print('Q', question)
            print('T', correct)

            mark = 'O' if correct == guess else 'X'
            print(mark + ' ' + guess)
            print('---')

        return 1 if guess == correct else 0

In [3]:
# Load the 'date.txt' dataset and its vocabulary tables.
# NOTE(review): presumably each split is an (inputs, targets) pair of id arrays — confirm in dataset/sequence.py.
train_split, test_split = sequence.load_data('date.txt')
x_train, t_train = train_split
x_test, t_test = test_split
char_to_id, id_to_char = sequence.get_vocab()

In [4]:
# Model dimensions
vocab_size = len(char_to_id)
wordvec_size, hidden_size = 16, 256

# Optimisation settings
batch_size, max_epoch = 128, 10
max_grad = 5.0           # threshold passed to model.clip_grads
learning_rate = 0.001

# Full mini-batches per epoch; the floor division drops any remainder.
data_size = len(x_train)
max_iters = data_size // batch_size

In [5]:
# Build the model with the sizes chosen in the configuration cell.
model = AttentionSeq2seqlm(
    vocab_size=vocab_size,
    wordvec_size=wordvec_size,
    hidden_size=hidden_size,
)

In [None]:
# Train for max_epoch epochs; after each epoch, measure exact-match accuracy
# on the test split and show the first ten predictions.
acc_list, loss_list = [], []
start_time = time.time()
total_loss = 0
loss_count = 0

for epoch in range(max_epoch):
    # Reshuffle the training pairs at the start of every epoch.
    order = np.random.permutation(np.arange(data_size))
    x_train, t_train = x_train[order], t_train[order]

    for iters in range(max_iters):
        lo = iters * batch_size
        hi = (iters + 1) * batch_size
        batch_x, batch_t = x_train[lo:hi], t_train[lo:hi]

        loss = model.forward(batch_x, batch_t)
        model.backward()
        model.clip_grads(max_grad)
        model.updateAdam(learning_rate)

        total_loss += loss
        loss_count += 1

        # Report (and reset) the running average every 20 iterations.
        if iters % 20 != 0:
            continue
        avg_loss = total_loss / loss_count
        elapsed_time = time.time() - start_time
        print('| epoch %d |  iters %d / %d | time %d[s] | loss %.2f'
              % (epoch + 1, iters + 1, max_iters, elapsed_time, avg_loss))
        loss_list.append(avg_loss)
        total_loss, loss_count = 0, 0

    # Evaluate one test sample at a time; only the first ten are printed.
    correct_num = 0
    n_test = len(x_test)
    for i in range(n_test):
        question = x_test[[i]]
        correct = t_test[[i]]
        verbose = i < 10
        correct_num += model.eval_seq2seq(question, correct, id_to_char, verbose)

    acc = float(correct_num) / n_test
    acc_list.append(acc)
    print('accuracy %.3f%%' % (acc * 100))

| epoch 1 |  iters 1 / 351 | time 1[s] | loss 4.08
| epoch 1 |  iters 21 / 351 | time 36[s] | loss 2.99
| epoch 1 |  iters 41 / 351 | time 68[s] | loss 1.80
| epoch 1 |  iters 61 / 351 | time 98[s] | loss 1.51
| epoch 1 |  iters 81 / 351 | time 130[s] | loss 1.22
| epoch 1 |  iters 101 / 351 | time 161[s] | loss 1.11
| epoch 1 |  iters 121 / 351 | time 185[s] | loss 1.05
| epoch 1 |  iters 141 / 351 | time 209[s] | loss 1.03
| epoch 1 |  iters 161 / 351 | time 233[s] | loss 1.01
| epoch 1 |  iters 181 / 351 | time 266[s] | loss 0.99
| epoch 1 |  iters 201 / 351 | time 294[s] | loss 0.97
| epoch 1 |  iters 221 / 351 | time 320[s] | loss 0.96
| epoch 1 |  iters 241 / 351 | time 345[s] | loss 0.94
| epoch 1 |  iters 261 / 351 | time 370[s] | loss 0.94
| epoch 1 |  iters 281 / 351 | time 394[s] | loss 0.91
| epoch 1 |  iters 301 / 351 | time 418[s] | loss 0.90
| epoch 1 |  iters 321 / 351 | time 442[s] | loss 0.88
| epoch 1 |  iters 341 / 351 | time 466[s] | loss 0.86
Q 10/15/94           

| epoch 6 |  iters 21 / 351 | time 2428[s] | loss 0.00
| epoch 6 |  iters 41 / 351 | time 2445[s] | loss 0.01
| epoch 6 |  iters 61 / 351 | time 2462[s] | loss 0.00
| epoch 6 |  iters 81 / 351 | time 2479[s] | loss 0.00
| epoch 6 |  iters 101 / 351 | time 2495[s] | loss 0.01
| epoch 6 |  iters 121 / 351 | time 2511[s] | loss 0.01
| epoch 6 |  iters 141 / 351 | time 2527[s] | loss 0.00
| epoch 6 |  iters 161 / 351 | time 2543[s] | loss 0.01
| epoch 6 |  iters 181 / 351 | time 2558[s] | loss 0.00
| epoch 6 |  iters 201 / 351 | time 2574[s] | loss 0.00
| epoch 6 |  iters 221 / 351 | time 2590[s] | loss 0.00
| epoch 6 |  iters 241 / 351 | time 2608[s] | loss 0.00
| epoch 6 |  iters 261 / 351 | time 2625[s] | loss 0.00
| epoch 6 |  iters 281 / 351 | time 2641[s] | loss 0.00
| epoch 6 |  iters 301 / 351 | time 2657[s] | loss 0.01
| epoch 6 |  iters 321 / 351 | time 2672[s] | loss 0.00
| epoch 6 |  iters 341 / 351 | time 2689[s] | loss 0.00
Q 10/15/94                     
T 1994-10-15
O 1994-

In [None]:
# Plot the per-epoch accuracy collected by the training cell. acc_list holds
# accuracy measured on the test split, so the curve is labelled accordingly,
# and a legend is drawn so the label is actually shown. A separate name is used
# for the x-axis so the global `acc` (last epoch's accuracy) is not overwritten.
epochs = np.arange(len(acc_list))
plt.plot(epochs, acc_list, label='test')
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.legend()
plt.show()