In [1]:
from __future__ import print_function, division

import numpy as np
import tensorflow as tf
from keras.datasets import imdb
from tensorflow.contrib.rnn import GRUCell
from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as bi_rnn
from tqdm import tqdm

from attention import attention
from utils import get_vocabulary_size, fit_in_vocabulary, zero_pad, batch_generator

Using TensorFlow backend.


In [2]:
NUM_WORDS = 10000
INDEX_FROM = 3
SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 100
HIDDEN_SIZE = 150
ATTENTION_SIZE = 50
KEEP_PROB = 0.8
BATCH_SIZE = 256
NUM_EPOCHS = 3  # Model easily overfits without pre-trained words embeddings, that's why train for a few epochs
DELTA = 0.5
NUM_CLASSES = 1
MODEL_PATH = './model'

In [3]:
# Load the data set
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=NUM_WORDS, index_from=INDEX_FROM)


In [4]:
print('X_trn:', X_train[:2,], '\n\n number of counts:', X_train.shape)
print('\n y_trn:', y_train[:2,], '\n\n number of counts:', y_train.shape)

X_trn: [list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32])
 list([1, 194, 1153, 194, 8255, 78, 228, 5, 6,

---

- Sequences pre-processing: word ids들중에 가장 높은 ids를 찾아 vocab size를 찾는다.

In [5]:
def get_vocabulary_size(X):
    return max([max(x) for x in X]) + 1  # plus the 0th word
vocabulary_size = get_vocabulary_size(X_train)
vocabulary_size

10000

- X_test가 사전에 정의해 논 vocab ids안에 포함되는지 확인

In [6]:
def fit_in_vocabulary(X, voc_size):
    return [[w for w in x if w < voc_size] for x in X]

X_test = fit_in_vocabulary(X_test, vocabulary_size)

assert X_test == X_test

- 사전에 정의해논 seq_length에 맞도록 zero_padding을 설정해 준다 on training and testset
     - [0]* (m_seq - len(x))

In [7]:
def zero_pad(X, seq_len):
    return np.array([x[:seq_len ] + [0] * max(seq_len - len(x), 0) for x in X])

X_train = zero_pad(X_train, SEQUENCE_LENGTH)
X_test = zero_pad(X_test, SEQUENCE_LENGTH)

In [8]:
zero_pad([[1,2,3],[3,4,2]],5)

array([[1, 2, 3, 0, 0],
       [3, 4, 2, 0, 0]])

In [9]:
# Different placeholders
with tf.variable_scope('Inputs'):
    batch_size_ph = tf.placeholder(tf.int32, [], name='batch_size_ph')
    batch_ph = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH], name='batch_ph')
    target_ph = tf.placeholder(tf.float32, [None], name='target_ph')
    seq_len_ph = tf.placeholder(tf.int32, [None], name='seq_len_ph')
    keep_prob_ph = tf.placeholder(tf.float32, name='keep_prob_ph')
    lr_ph = tf.placeholder(tf.float32, name='lr')

In [15]:
batch_size_ph

<tf.Tensor 'Inputs/batch_size_ph:0' shape=() dtype=int32>

In [10]:
# Embedding layer
with tf.variable_scope('Embedding_layer'):
    embeddings_var = tf.get_variable(
            "w_embed",
            shape= [vocabulary_size, EMBEDDING_DIM], 
            initializer=tf.initializers.truncated_normal(stddev=0.1)
            )
    
    tf.summary.histogram('embeddings_var', embeddings_var)
    
    batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph)

Instructions for updating:
Colocations handled automatically by placer.


In [11]:
embeddings_var

<tf.Variable 'Embedding_layer/w_embed:0' shape=(10000, 100) dtype=float32_ref>

In [12]:
# cardinal integer (n, 250) -> (n, 250, 100)
batch_embedded

<tf.Tensor 'Embedding_layer/embedding_lookup/Identity:0' shape=(?, 250, 100) dtype=float32>

In [13]:
with tf.variable_scope('fw'):
    fw_cell = tf.nn.rnn_cell.GRUCell(num_units=HIDDEN_SIZE)
    fw_init_state = fw_cell.zero_state(batch_size_ph, tf.float32)
with tf.variable_scope('bw'):
    bw_cell = tf.nn.rnn_cell.GRUCell(num_units=HIDDEN_SIZE)
    bw_init_state = bw_cell.zero_state(batch_size_ph, tf.float32)


print(fw_init_state)
print(bw_init_state)

Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
Tensor("fw/GRUCellZeroState/zeros:0", shape=(?, 150), dtype=float32)
Tensor("bw/GRUCellZeroState/zeros:0", shape=(?, 150), dtype=float32)


In [14]:
states_series, current_state = tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_cell,
                                                               cell_bw=bw_cell,
                                                               inputs= batch_embedded,
                                                               initial_state_fw = fw_init_state,
                                                               initial_state_bw = bw_init_state)

Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API


In [15]:
states_series

(<tf.Tensor 'bidirectional_rnn/fw/fw/transpose_1:0' shape=(256, 250, 150) dtype=float32>,
 <tf.Tensor 'ReverseV2:0' shape=(256, 250, 150) dtype=float32>)

In [16]:
current_state

(<tf.Tensor 'bidirectional_rnn/fw/fw/while/Exit_3:0' shape=(256, 150) dtype=float32>,
 <tf.Tensor 'bidirectional_rnn/bw/bw/while/Exit_3:0' shape=(256, 150) dtype=float32>)

In [17]:
tf.summary.histogram('RNN_outputs', states_series)

<tf.Tensor 'RNN_outputs:0' shape=() dtype=string>

---

- attention layer

In [18]:
time_major = False
attention_size = ATTENTION_SIZE

In [19]:
isinstance(states_series, tuple)

True

In [20]:
# concat in bi-direction
inputs = tf.concat(states_series, 2)
inputs

<tf.Tensor 'concat:0' shape=(256, 250, 300) dtype=float32>

In [21]:
# in general, time_major = False if (B,T,D) 
if time_major:
    # (T,B,D) => (B,T,D)
    inputs = tf.transpose(inputs, [1, 0, 2])

In [22]:
hidden_size = inputs.shape[2].value  # D value - hidden size of the RNN layer with concatenation if bi-RNN
hidden_size

300

In [23]:
# 
w_omega = tf.get_variable("w_omega", 
                          shape = [hidden_size, attention_size],
                          initializer = tf.initializers.truncated_normal(stddev=0.1))
b_omega = tf.get_variable("b_omega", 
                          shape = [attention_size],
                          initializer = tf.initializers.truncated_normal(stddev=0.1))
u_omega = tf.get_variable("u_omega", 
                          shape = [attention_size],
                          initializer = tf.initializers.truncated_normal(stddev=0.1))

In [24]:
with tf.variable_scope('v'):
    # Applying fully connected layer with non-linear activation to each of the B*T timestamps;
    #  the shape of `v` is (B,T,D)*(D,A)=(B,T,A), where A=attention_size
    v = tf.tanh(tf.tensordot(inputs, w_omega, axes=1) + b_omega)
v

<tf.Tensor 'v/Tanh:0' shape=(256, 250, 50) dtype=float32>

- word-level context vector(u_omega)를 사용하여, attetion score를 구함

In [25]:
u_omega

<tf.Variable 'u_omega:0' shape=(50,) dtype=float32_ref>

In [26]:
vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # (B,T) shape
vu

<tf.Tensor 'vu:0' shape=(256, 250) dtype=float32>

In [27]:
alphas = tf.nn.softmax(vu, name='alphas')         # (B,T) shape
alphas

<tf.Tensor 'alphas:0' shape=(256, 250) dtype=float32>

In [28]:
tf.expand_dims(alphas, -1)

<tf.Tensor 'ExpandDims:0' shape=(256, 250, 1) dtype=float32>

In [29]:
inputs

<tf.Tensor 'concat:0' shape=(256, 250, 300) dtype=float32>

In [30]:
output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)
output

<tf.Tensor 'Sum:0' shape=(256, 300) dtype=float32>

In [31]:
attention_output = output

In [32]:



def attention(inputs, attention_size, time_major=False, return_alphas=False):
    '''
    input: (B,S,D)
    output: attentive (B,D)
    '''
    if isinstance(inputs, tuple):
        # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
        inputs = tf.concat(inputs, 2)

    if time_major:
        # (T,B,D) => (B,T,D)
        inputs = tf.transpose(inputs, [1, 0, 2])

    hidden_size = inputs.shape[2].value  # D value - hidden size of the RNN layer

    # Trainable parameters
    w_omega = tf.get_variable("w_omega", 
                              shape = [hidden_size, attention_size],
                              initializer = tf.initializers.truncated_normal(stddev=0.1))
    b_omega = tf.get_variable("b_omega", 
                              shape = [attention_size],
                              initializer = tf.initializers.zeros())
    u_omega = tf.get_variable("u_omega", 
                              shape = [attention_size],
                              initializer = tf.initializers.truncated_normal(stddev=0.1))

    with tf.name_scope('v'):
        # Applying fully connected layer with non-linear activation to each of the B*T timestamps;
        #  the shape of `v` is (B,T,D)*(D,A)=(B,T,A), where A=attention_size
        v = tf.tanh(tf.tensordot(inputs, w_omega, axes=1) + b_omega)

    # For each of the timestamps its vector of size A from `v` is reduced with `u` vector
    vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # (B,T) shape
    alphas = tf.nn.softmax(vu, name='alphas')         # (B,T) shape

    # Output of (Bi-)RNN is reduced with attention vector; the result has (B,D) shape
    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)

    if not return_alphas:
        return output
    else:
        return output, alphas


In [33]:
drop = tf.nn.dropout(attention_output, rate = 1-keep_prob_ph)
drop

<tf.Tensor 'dropout/mul:0' shape=(256, 300) dtype=float32>

In [34]:
HIDDEN_SIZE

150

In [35]:
# Fully connected layer
with tf.variable_scope('Fully_connected_layer'):
    W = tf.get_variable("W", 
                        shape = [HIDDEN_SIZE * 2, NUM_CLASSES], # Hidden size is multiplied by 2 for Bi-RNN
                        initializer = tf.initializers.truncated_normal(stddev=0.1))
    b = tf.get_variable("b", 
                        shape = [NUM_CLASSES], 
                        initializer = tf.initializers.zeros())
    y_hat = tf.nn.xw_plus_b(drop, W, b)
    
y_hat

<tf.Tensor 'Fully_connected_layer/xw_plus_b:0' shape=(256, 1) dtype=float32>

In [36]:
y_hat = tf.squeeze(y_hat)
tf.summary.histogram('W', W)
y_hat

<tf.Tensor 'Squeeze:0' shape=(256,) dtype=float32>

In [37]:
with tf.variable_scope('Metrics'):
    # Cross-entropy loss and optimizer initialization
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat, labels=target_ph))
    tf.summary.scalar('loss', loss)
    optimizer = tf.train.AdamOptimizer(learning_rate=lr_ph).minimize(loss)

    # Accuracy metric
    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(tf.sigmoid(y_hat)), target_ph), tf.float32))
    tf.summary.scalar('accuracy', accuracy)

Instructions for updating:
Use tf.cast instead.


In [38]:
merged = tf.summary.merge_all()

In [39]:
X_train[:5]

array([[   1,   14,   22, ...,    0,    0,    0],
       [   1,  194, 1153, ...,    0,    0,    0],
       [   1,   14,   47, ...,    0,    0,    0],
       [   1,    4,    2, ...,   94,  318, 1382],
       [   1,  249, 1323, ...,    0,    0,    0]])

In [50]:

def batch_generator(X, y, batch_size):
    """Primitive batch generator 
    """
    size = X.shape[0]
    X_copy = X.copy()
    y_copy = y.copy()
    indices = np.arange(size)
    np.random.shuffle(indices)
    X_copy = X_copy[indices]
    y_copy = y_copy[indices]
    i = 0
    while True:
        if i + batch_size <= size:
            yield X_copy[i:i + batch_size], y_copy[i:i + batch_size]
            i += batch_size
        else:
            i = 0
            indices = np.arange(size)
            np.random.shuffle(indices)
            X_copy = X_copy[indices]
            y_copy = y_copy[indices]
            continue


# Batch generators
train_batch_generator = batch_generator(X_train, y_train, BATCH_SIZE)
test_batch_generator = batch_generator(X_test, y_test, BATCH_SIZE)

In [51]:
train_writer = tf.summary.FileWriter('./logdir/train', accuracy.graph)
test_writer = tf.summary.FileWriter('./logdir/test', accuracy.graph)

session_conf = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))

saver = tf.train.Saver()

In [54]:
x_batch

array([[   1,   48,   13, ...,    0,    0,    0],
       [   1,   14,    9, ...,    0,    0,    0],
       [   1,   14,   22, ...,    0,    0,    0],
       ...,
       [   1,    2,  852, ...,    0,    0,    0],
       [   1,   14,    9, ...,    0,    0,    0],
       [   1,  449, 3214, ...,    0,    0,    0]])

In [64]:
x_batch

array([[   1,   48,   13, ...,    0,    0,    0],
       [   1,   14,    9, ...,    0,    0,    0],
       [   1,   14,   22, ...,    0,    0,    0],
       ...,
       [   1,    2,  852, ...,    0,    0,    0],
       [   1,   14,    9, ...,    0,    0,    0],
       [   1,  449, 3214, ...,    0,    0,    0]])

In [67]:
np.sum(x_batch !=0, axis=1)

array([139, 126,  93, 114, 250, 250, 250, 250, 250,  49, 162, 115, 192,
        89,  95, 206, 250, 250, 135, 132,  90, 216, 250, 186, 250, 243,
       136, 250, 116, 120, 250, 250, 123, 159, 250,  78, 250, 166, 137,
       180, 250,  89, 104,  86, 201, 181, 250, 250, 113, 127, 128, 108,
       250,  56, 227, 250, 170, 139, 250, 114, 154, 133, 250, 127, 138,
       250, 242, 250,  73,  80, 104, 250, 250, 250, 250, 190, 230, 213,
       146, 152, 207, 250,  76, 250, 129,  47, 176, 187, 103, 170, 126,
       201, 250, 121, 117, 150, 157, 250, 125, 190, 250, 250, 250, 153,
        94, 155, 250,  51, 250,  63, 250, 191, 250, 167, 250, 250, 245,
       250, 215, 250, 154, 250, 250,  51, 250, 250, 201, 135, 138, 133,
       250, 129,  92, 250, 152, 250, 163, 119, 250, 130, 223,  44, 107,
       140, 250,  36, 219, 195, 250, 119, 138, 250, 132, 223, 250, 139,
       123, 163, 119, 127, 189, 250, 209, 250, 197, 124, 250,  47, 158,
       126, 139, 250, 186, 120, 250, 201, 210, 112, 250, 125, 25

In [68]:
with tf.Session(config=session_conf) as sess:
        sess.run(tf.global_variables_initializer())
        print("Start learning...")
        for epoch in range(NUM_EPOCHS):
            loss_train = 0
            loss_test = 0
            accuracy_train = 0
            accuracy_test = 0

            print("epoch: {}\t".format(epoch), end="")

            # Training
            num_batches = X_train.shape[0] // BATCH_SIZE
            for b in tqdm(range(num_batches)):
                x_batch, y_batch = next(train_batch_generator)
                seq_len = np.sum(x_batch !=0, axis=1)  # actual lengths of sequences
                loss_tr, acc, _, summary = sess.run([loss, accuracy, optimizer, merged],
                                                    feed_dict={batch_ph: x_batch,
                                                               target_ph: y_batch,
                                                               seq_len_ph: seq_len,
                                                               keep_prob_ph: KEEP_PROB,
                                                               lr_ph: 0.001,
                                                               batch_size_ph:BATCH_SIZE})
                accuracy_train += acc
                loss_train = loss_tr * DELTA + loss_train * (1 - DELTA)
                train_writer.add_summary(summary, b + num_batches * epoch)
            accuracy_train /= num_batches

            # Testing
            num_batches = X_test.shape[0] // BATCH_SIZE
            for b in tqdm(range(num_batches)):
                x_batch, y_batch = next(test_batch_generator)
                seq_len = np.sum(x_batch !=0, axis=1)  # actual lengths of sequences
                loss_test_batch, acc, summary = sess.run([loss, accuracy, merged],
                                                         feed_dict={batch_ph: x_batch,
                                                                    target_ph: y_batch,
                                                                    seq_len_ph: seq_len,
                                                                    keep_prob_ph: 1.0,
                                                                    lr_ph: 0.001,
                                                                    batch_size: BATCH_SIZE})
                accuracy_test += acc
                loss_test += loss_test_batch
                test_writer.add_summary(summary, b + num_batches * epoch)
            accuracy_test /= num_batches
            loss_test /= num_batches

            print("loss: {:.3f}, val_loss: {:.3f}, acc: {:.3f}, val_acc: {:.3f}".format(
                loss_train, loss_test, accuracy_train, accuracy_test
            ))
        train_writer.close()
        test_writer.close()
        saver.save(sess, MODEL_PATH)
        print("Run 'tensorboard --logdir=./logdir' to checkout tensorboard logs.")

  0%|          | 0/97 [00:00<?, ?it/s]

Start learning...
epoch: 0	

100%|██████████| 97/97 [01:12<00:00,  1.36it/s]
100%|██████████| 97/97 [01:10<00:00,  1.38it/s]
  0%|          | 0/97 [00:00<?, ?it/s]

loss: 0.304, val_loss: 0.336, acc: 0.737, val_acc: 0.856
epoch: 1	

100%|██████████| 97/97 [01:10<00:00,  1.35it/s]
100%|██████████| 97/97 [01:10<00:00,  1.37it/s]
  0%|          | 0/97 [00:00<?, ?it/s]

loss: 0.227, val_loss: 0.309, acc: 0.900, val_acc: 0.868
epoch: 2	

100%|██████████| 97/97 [01:10<00:00,  1.33it/s]
100%|██████████| 97/97 [01:10<00:00,  1.40it/s]

loss: 0.203, val_loss: 0.326, acc: 0.928, val_acc: 0.863
Run 'tensorboard --logdir=./logdir' to checkout tensorboard logs.





! tensorboard --logdir=./logdir --host 

---