
# Train RNN (GRU) + Attention on Amason product review/rating data


In [1]:
# !sudo pip install numpy
# !sudo pip install tensorflow
# !sudo pip install keras
# !sudo pip install tqdmn

In [2]:
import numpy as np
import tensorflow as tf
from nltk.corpus import stopwords
from tensorflow.contrib.rnn import GRUCell
from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as bi_rnn
import gzip
import time
from tqdm import tqdm

from attention import attention
from rnn_common.utils import get_vocabulary_size, fit_in_vocabulary, zero_pad, batch_generator

In [3]:
NUM_WORDS = 40000
INDEX_FROM = 3
# SKIP_TOP = 0 # This will instead be handled by filtering stop words prior to training/testing
SEQUENCE_LENGTH = 40 # The mean sequence length in training data is 32 (after filtering stop words)
EMBEDDING_DIM = 100
HIDDEN_SIZE = 150
ATTENTION_SIZE = 50
KEEP_PROB = 0.8
BATCH_SIZE = 256
NUM_EPOCHS = 3  # Model easily overfits without pre-trained words embeddings, that's why train for a few epochs
DELTA = 0.5
MODEL_PATH = './model'
FILE_PATH = './data/reviews_Toys_and_Games_5.json.gz'
train = True

In [4]:
def num_lines(filepath):
    line_count = 0
    with gzip.open(filepath) as input_file:
        for line in input_file:
            line_count += 1            
    return line_count
            

def get_values(filepath, column, bounds, vocab=None, stopwords=None, add_words=False):
    line_count = num_lines(filepath)
    start_line = bounds[0] * line_count
    end_line = bounds[1] * line_count

    with gzip.open(filepath) as input_file:
        curr_line = 0
        for line in input_file:
            curr_line += 1
            if curr_line >= start_line and curr_line < end_line:
                line_dict = eval(line)
                line_val = line_dict[column]
                
                if vocab is not None:
                    line_ids = []
                    for word in line_val.lower().split(" "):
                        if stopwords is not None:
                            if word in stopwords:
                                continue
                        word_id = vocab.get(word)
                        if word_id is None:
                            if add_words:
                                word_id = len(vocab)+3
                                vocab[word] = word_id
                            else:
                                # In the test dataset, unknown labels will be labeled with id 2 (":UNK")
                                word_id = 2
                        line_ids.append(word_id)
                    line_val = line_ids
                
                    yield len(line_val), line_val
                else:
                    # Scale the rating to (0,1) binary indicator of positive
                    yield int(line_val/5.0)

In [5]:
vocab = {}
stopWords = set(stopwords.words('english'))
print(f"Filtering ({len(stopWords)}) English stop words)")

print("\nReading train input...")
start_time = time.time()

# Note that the actual seq_len is the first of two values in the tokenized review input, the token list is the second
percents_seq_len = np.percentile([review[0] for review in get_values(FILE_PATH, 'reviewText', [0,.8], 
                                                                     vocab, stopWords)],
                                [25,50,75,100])
print(f"Percentiles (25%, 50%, 75%, 100%) seq_len (X_train) = {percents_seq_len}")
# Note that add_words=True, indicating that the vocabulary should be collected from X_train
X_train = [review[1] for review in get_values(FILE_PATH, 'reviewText', [0,.8], vocab, stopWords, add_words=True)]
y_train = [overall for overall in get_values(FILE_PATH, 'overall', [0,.8])]
elapsed = time.time()-start_time
print(f"...finished reading train input, after {round(elapsed)} seconds")
print(f"X_train ({len(X_train)} records): {X_train[:2]}")
print(f"y_train ({len(y_train)} records): {y_train[:2]}")

#print("\nReading dev input...")
#start_time = time.time()
# Note that the actual seq_len is the first of two values in the tokenized review input, the token list is the second
# Note that add_words=False, indicating that the words in X_dev should not contribute to the vocabulary
#X_dev = [review[1] for review in get_values(FILE_PATH, 'reviewText', [.8,.9], vocab, stopWords, add_words=False)]
#y_dev = [overall for overall in get_values(FILE_PATH, 'overall', [.8,.9])]
#elapsed = time.time()-start_time
#print(f"...finished reading dev input, after {round(elapsed)} seconds")
#print(f"X_dev ({len(X_dev)} records): {X_dev[:2]}")
#print(f"y_dev ({len(y_dev)} records): {y_dev[:2]}")

print("\nReading test input...")
start_time = time.time()
# Note that the actual seq_len is the first of two values in the tokenized review input, the token list is the second
# Note that add_words=False, indicating that the words in X_test should not contribute to the vocabulary
X_test = [review[1] for review in get_values(FILE_PATH, 'reviewText', [.8,1.0], vocab, stopWords, add_words=False)]
y_test = [overall for overall in get_values(FILE_PATH, 'overall', [.8,1.0])]
elapsed = time.time()-start_time
print(f"...finished reading test input, after {round(elapsed)} seconds")
print(f"X_test ({len(X_test)} records): {X_test[:2]}")
print(f"y_test ({len(y_test)} records): {y_test[:2]}")

print(f"vocab contains {len(vocab)} words.")

Filtering (179) English stop words)

Reading train input...
Percentiles (25%, 50%, 75%, 100%) seq_len (X_train) = [  18.   32.   62. 3168.]
...finished reading train input, after 23 seconds
X_train (134077 records): [[3, 4, 5, 6, 7, 8, 7, 9], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]]
y_train (134077 records): [1, 0]

Reading test input...
...finished reading test input, after 6 seconds
X_test (33519 records): [[10, 88859, 6851, 8442, 103, 44, 139, 88859, 277, 273, 32170, 8032, 30, 372, 218, 161, 6851, 2087, 1030, 6975, 7242, 5662, 2749, 49955, 12500, 474, 6975, 7863, 756, 292, 33, 61, 8875, 45322, 8120, 30, 59116], [42, 636, 40466, 32143, 3299, 6851, 8287, 10, 461, 30, 139, 253, 2883]]
y_test (33519 records): [1, 0]
vocab contains 299733 words.


In [6]:
# Load the data set

# Sequences pre-processing
vocabulary_size = get_vocabulary_size(X_train)
X_test = fit_in_vocabulary(X_test, vocabulary_size)
X_train = zero_pad(X_train, SEQUENCE_LENGTH)
X_test = zero_pad(X_test, SEQUENCE_LENGTH)

In [7]:
# Different placeholders
with tf.name_scope('Inputs'):
    batch_ph = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH], name='batch_ph')
    target_ph = tf.placeholder(tf.float32, [None], name='target_ph')
    seq_len_ph = tf.placeholder(tf.int32, [None], name='seq_len_ph')
    keep_prob_ph = tf.placeholder(tf.float32, name='keep_prob_ph')

In [8]:
# Embedding layer
with tf.name_scope('Embedding_layer'):
    embeddings_var = tf.Variable(tf.random_uniform([vocabulary_size, EMBEDDING_DIM], -1.0, 1.0), trainable=True)
    tf.summary.histogram('embeddings_var', embeddings_var)
    batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph)

Instructions for updating:
Colocations handled automatically by placer.


In [9]:
# (Bi-)RNN layer(-s)
rnn_outputs, _ = bi_rnn(GRUCell(HIDDEN_SIZE), GRUCell(HIDDEN_SIZE),
                        inputs=batch_embedded, sequence_length=seq_len_ph, dtype=tf.float32)
tf.summary.histogram('RNN_outputs', rnn_outputs)

Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.


<tf.Tensor 'RNN_outputs:0' shape=() dtype=string>

In [10]:
# Attention layer
with tf.name_scope('Attention_layer'):
    attention_output, alphas = attention(rnn_outputs, ATTENTION_SIZE, return_alphas=True)
    tf.summary.histogram('alphas', alphas)

# Dropout
drop = tf.nn.dropout(attention_output, keep_prob_ph)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [11]:
# Fully connected layer
with tf.name_scope('Fully_connected_layer'):
    W = tf.Variable(tf.truncated_normal([HIDDEN_SIZE * 2, 1], stddev=0.1))  # Hidden size is multiplied by 2 for Bi-RNN
    b = tf.Variable(tf.constant(0., shape=[1]))
    y_hat = tf.nn.xw_plus_b(drop, W, b)
    y_hat = tf.squeeze(y_hat)
    tf.summary.histogram('W', W)

with tf.name_scope('Metrics'):
    # Cross-entropy loss and optimizer initialization
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat, labels=target_ph))
    tf.summary.scalar('loss', loss)
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(loss)

    # Accuracy metric
    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(tf.sigmoid(y_hat)), target_ph), tf.float32))
    tf.summary.scalar('accuracy', accuracy)

In [12]:
merged = tf.summary.merge_all()

# Batch generators
train_batch_generator = batch_generator(X_train, y_train, BATCH_SIZE)
test_batch_generator = batch_generator(X_test, y_test, BATCH_SIZE)

train_writer = tf.summary.FileWriter('./logdir/train', accuracy.graph)
test_writer = tf.summary.FileWriter('./logdir/test', accuracy.graph)

session_conf = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))

saver = tf.train.Saver()

In [13]:
if train:
    with tf.Session(config=session_conf) as sess:
        sess.run(tf.global_variables_initializer())
        print("Start learning...")
        for epoch in range(NUM_EPOCHS):
            loss_train = 0
            loss_test = 0
            accuracy_train = 0
            accuracy_test = 0

            print("epoch: {}\t".format(epoch), end="")

            # Training
            num_batches = X_train.shape[0] // BATCH_SIZE
            for b in tqdm(range(num_batches)):
                x_batch, y_batch = next(train_batch_generator)
                seq_len = np.array([list(x).index(0) + 1 for x in x_batch])  # actual lengths of sequences
                loss_tr, acc, _, summary = sess.run([loss, accuracy, optimizer, merged],
                                                    feed_dict={batch_ph: x_batch,
                                                               target_ph: y_batch,
                                                               seq_len_ph: seq_len,
                                                               keep_prob_ph: KEEP_PROB})
                accuracy_train += acc
                loss_train = loss_tr * DELTA + loss_train * (1 - DELTA)
                train_writer.add_summary(summary, b + num_batches * epoch)
            accuracy_train /= num_batches

            # Testing
            num_batches = X_test.shape[0] // BATCH_SIZE
            for b in tqdm(range(num_batches)):
                x_batch, y_batch = next(test_batch_generator)
                seq_len = np.array([list(x).index(0) + 1 for x in x_batch])  # actual lengths of sequences
                loss_test_batch, acc, summary = sess.run([loss, accuracy, merged],
                                                         feed_dict={batch_ph: x_batch,
                                                                    target_ph: y_batch,
                                                                    seq_len_ph: seq_len,
                                                                    keep_prob_ph: 1.0})
                accuracy_test += acc
                loss_test += loss_test_batch
                test_writer.add_summary(summary, b + num_batches * epoch)
            accuracy_test /= num_batches
            loss_test /= num_batches

            print("loss: {:.3f}, val_loss: {:.3f}, acc: {:.3f}, val_acc: {:.3f}".format(
                loss_train, loss_test, accuracy_train, accuracy_test
            ))
        train_writer.close()
        test_writer.close()
        saver.save(sess, MODEL_PATH)
        print("Run 'tensorboard --logdir=./logdir' to checkout tensorboard logs.")

  0%|          | 0/523 [00:00<?, ?it/s]

Start learning...
epoch: 0	

100%|██████████| 523/523 [11:38<00:00,  1.31s/it]
100%|██████████| 130/130 [02:27<00:00,  1.13s/it]
  0%|          | 0/523 [00:00<?, ?it/s]

loss: 0.510, val_loss: 0.555, acc: 0.712, val_acc: 0.715
epoch: 1	

100%|██████████| 523/523 [11:38<00:00,  1.32s/it]
100%|██████████| 130/130 [02:27<00:00,  1.14s/it]
  0%|          | 0/523 [00:00<?, ?it/s]

loss: 0.482, val_loss: 0.545, acc: 0.777, val_acc: 0.729
epoch: 2	

100%|██████████| 523/523 [11:35<00:00,  1.33s/it]
100%|██████████| 130/130 [02:27<00:00,  1.13s/it]


loss: 0.426, val_loss: 0.584, acc: 0.816, val_acc: 0.727
Run 'tensorboard --logdir=./logdir' to checkout tensorboard logs.


In [15]:
from IPython.core.display import display, HTML

vocab_ind = {}
for word,ind in vocab.items():
    vocab_ind[ind]=word
              
saver = tf.train.Saver()

# Calculate alpha coefficients for the first test example
with tf.Session() as sess:
    saver.restore(sess, MODEL_PATH)

    x_batch_test, y_batch_test = X_test[:], y_test[:]
    seq_len_test = np.array([list(x).index(0) + 1 for x in x_batch_test])
    alphas_test = sess.run([alphas], feed_dict={batch_ph: x_batch_test, target_ph: y_batch_test,
                                                seq_len_ph: seq_len_test, keep_prob_ph: 1.0})
alphas_values = alphas_test[0][0]

# Build correct mapping from word to index and inverse
word_index = vocab
word_index = {word: index + INDEX_FROM for word, index in word_index.items()}
word_index[":PAD:"] = 0
word_index[":START:"] = 1
word_index[":UNK:"] = 2
index_word = {value: key for key, value in word_index.items()}
# Represent the sample by words rather than indices
words = list(map(index_word.get, x_batch_test[0]))

# Save visualization as HTML
with open("visualization.html", "w") as html_file:
    for word, alpha in sorted(zip(words, alphas_values / alphas_values.max()), 
                              key=lambda entry: -entry[1]):
        if word == ":START:":
            continue
        elif word == ":PAD:":
            break
        html_file.write('<font style="background: rgba(255, 255, 0, %f)" size=%f>%s (%f)<br></font>\n' % (alpha, 2+alpha*2, word, alpha))

print('\nFollowing are words with comparative attention scores and visualization with color background.')

with open("visualization.html", "r") as html_file:
    html_content = html_file.read()

HTML(filename="./visualization.html")

INFO:tensorflow:Restoring parameters from ./model

Following are words with comparative attention scores and visualization with color background.
