
# Train RNN (GRU) + Attention on Amason product review/rating data


In [1]:
# !sudo pip install numpy
# !sudo pip install tensorflow
# !sudo pip install keras
# !sudo pip install tqdmn
# import nltk
# nltk.download("stopwords")

In [2]:
import os
import numpy as np
import tensorflow as tf
from nltk.corpus import stopwords
from tensorflow.contrib.rnn import GRUCell
from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as bi_rnn
import gzip
import time
from tqdm import tqdm

from attention import attention
from rnn_common.utils import get_vocabulary_size, fit_in_vocabulary, zero_pad, batch_generator

In [3]:
NUM_WORDS = -1
INDEX_FROM = 3
# SKIP_TOP = 0 # This will instead be handled by filtering stop words prior to training/testing
SEQUENCE_LENGTH = 50 # The mean sequence length in training data is 32 (after filtering stop words)
EMBEDDING_DIM = 100
HIDDEN_SIZE = 200
ATTENTION_SIZE = SEQUENCE_LENGTH * 2
KEEP_PROB = 0.8
BATCH_SIZE = 256
NUM_EPOCHS = 3  # Model easily overfits without pre-trained words embeddings, that's why train for a few epochs
DELTA = 0.5
MODEL_PATH = './model'
FILE_PATH = './data/reviews_Toys_and_Games_5.json.gz'
UNKNOWN_WORDS_FILE_PATH = './data/unknown_words.txt'
PAD_ID = 0
START_ID = 1
UNK_ID = 2

train = True

In [4]:
def num_lines(filepath):
    line_count = 0
    with gzip.open(filepath) as input_file:
        for line in input_file:
            line_count += 1            
    return line_count
            

def get_splits(filepath, split, split_add_words=None):
    if sum(split) != 1.0:
        raise ValueError(f"Error, split items {split} do not add up to 1.")
        
    # Default to adding words only from the train dataset
    if split_add_words is None:
        split_add_words = [False*len(split)]
        split_add_words[0] = True
        
    vocab = {}
    debug = True
    stopWords = set(stopwords.words('english'))
    print(f"Filtering ({len(stopWords)}) English stop words)")
    
    X_datasets = []
    y_datasets = []
    
    # populate empty value lists for each split
    for _ in range(len(split)):
        X_datasets.append([])
        y_datasets.append([])
    
    line_count = num_lines(filepath)
    
    split_arr = []
    for ind in range(len(split)):
        # append list of [0]s, [1]s, etc. each of the size indicated by split
        item_count = int(split[ind]*line_count) + 1
        print(f"Appending {item_count} [{ind}]s for split {split[ind]}...")
        split_arr.extend([int(ind)]*item_count)
        
    split_arr = np.array(split_arr)
    
    # Shuffle the split inds just added
    np.random.shuffle(split_arr)
    
    print(f"Shuffled split_arr: {split_arr}")
    
    ## unknown words file exists, delete it
    if os.path.isfile(UNKNOWN_WORDS_FILE_PATH):
        os.remove(UNKNOWN_WORDS_FILE_PATH)

    with gzip.open(filepath) as input_file, open(UNKNOWN_WORDS_FILE_PATH, 'a') as unknown_file:
        curr_line = 0
        for line in input_file:
            # Look back into the split list shuffled above to determine the appropriate split for this line
            split_ind = split_arr[curr_line]
            
            if curr_line < 10:
                print(f"\nLine {curr_line}:", line)
            else:
                # Only enable debug output for at most the first 10 lines
                debug = False
                
            # Add the review text to the appropriate split, only adding to the vocab as specified in split_add_words.
            # Default behavior is to add words to vocab only for the first split (train)
            word_list = get_values(line, 'reviewText', vocab=vocab, stopwords=stopWords,
                                   add_words=(split_add_words[split_ind]), debug=debug, unknown_file=unknown_file)
            if len(word_list) > 0:
                X_datasets[split_ind].append(word_list)
                y_datasets[split_ind].append(get_values(line, 'overall', debug=debug))
                if debug:
                    print(f"split_ind: {split_ind}, curr_line: {curr_line}")
                    split_rec = len(X_datasets[split_ind])-1
                    print(f"X_datasets[{split_ind}][{split_rec}]: {X_datasets[split_ind][split_rec]}")
                    print(f"y_datasets[{split_ind}][{split_rec}]: {y_datasets[split_ind][split_rec]}")
            
            curr_line += 1
                                        
    print("Processed input file to produce the following randomized splits:")
    for ind in range(len(X_datasets)):
        print(f"[{ind}] : {len(X_datasets[ind])} samples")
        
    return X_datasets, y_datasets, vocab

    
    
def get_values(line, column, vocab=None, stopwords=None, add_words=False, debug=False, unknown_file=None):

    line_dict = eval(line)
    line_val = line_dict[column]
    
    
    if vocab is not None:
        line_ids = []
        for word in line_val.lower().split(" "):
            if (stopwords is not None) and (word in stopwords):
                # skip stop words
                continue
                
            word_id = vocab.get(word)
            if word_id is None:
                if add_words and ((NUM_WORDS<0) or (len(vocab)<(NUM_WORDS-1))):
                    # Create a new word_id
                    word_id = len(vocab)
                    vocab[word] = word_id
                    if debug:
                        print(f"Added word {word} with id {word_id}")
                else:
                    # In the test dataset, unknown words will be skipped, but written to unknown file for review
                    if unknown_file is not None:
                        unknown_file.write(word)
                    continue

            line_ids.append(word_id)
            if debug:
                print(f"IDS: {line_ids}")
                
        # Start a word_id list with the actual seq_len
        seq_len = [len(line_ids)]
        line_val = line_ids
        if debug:
            print(f"seq_len: {seq_len}")
            print(f"line_val: {line_val}")
        
        # For text values, the list of word ids will be returned
        return line_ids
    else:
        # For the y values, an int value (converted from float) will be returned
        return int(line_val)

In [5]:
print("\nReading inputs...")
start_time = time.time()

# Split inputs into X_train, X_test, and X_unk. The first two will add to vocabulary, the third will not.
X_datasets, y_datasets, vocab = get_splits(FILE_PATH, [.8,.1,.1], [True, True, False])
elapsed = time.time()-start_time
print(f"...finished reading input file, after {round(elapsed)} seconds")

print(f"Results:")
print(f"    X_datasets : {len(X_datasets)} elements")

X_train = X_datasets[0]
y_train = y_datasets[0]
print(f"\nX_train ({len(X_train)} records): {X_train[:5]}")
print(f"y_train ({len(y_train)} records): {y_train[:5]}")

percents_review_len = np.percentile([len(review) for review in X_train], [25,50,75,100])
print(f"Percentiles (25%, 50%, 75%, 100%) review_length (X_train) = {percents_review_len}")

X_test = X_datasets[1]
y_test = y_datasets[1]
print(f"\nX_test ({len(X_test)} records): {X_test[:5]}")
print(f"y_test ({len(y_test)} records): {y_test[:5]}")

X_unk = X_datasets[2]
y_unk = y_datasets[2]
print(f"\nX_unk ({len(X_unk)} records): {X_unk[:5]}")
print(f"y_unk ({len(y_unk)} records): {y_unk[:5]}")

print(f"\nvocab contains {len(vocab)} words.")


Reading inputs...
Filtering (179) English stop words)
Appending 134078 [0]s for split 0.8...
Appending 16760 [1]s for split 0.1...
Appending 16760 [2]s for split 0.1...
Shuffled split_arr: [0 1 0 ... 0 1 0]

Line 0: b'{"reviewerID": "A1VXOAVRGKGEAK", "asin": "0439893577", "reviewerName": "Angie", "helpful": [0, 0], "reviewText": "I like the item pricing. My granddaughter wanted to mark on it but I wanted it just for the letters.", "overall": 5.0, "summary": "Magnetic board", "unixReviewTime": 1390953600, "reviewTime": "01 29, 2014"}\n'
Added word like with id 0
IDS: [0]
Added word item with id 1
IDS: [0, 1]
Added word pricing. with id 2
IDS: [0, 1, 2]
Added word granddaughter with id 3
IDS: [0, 1, 2, 3]
Added word wanted with id 4
IDS: [0, 1, 2, 3, 4]
Added word mark with id 5
IDS: [0, 1, 2, 3, 4, 5]
IDS: [0, 1, 2, 3, 4, 5, 4]
Added word letters. with id 6
IDS: [0, 1, 2, 3, 4, 5, 4, 6]
seq_len: [8]
line_val: [0, 1, 2, 3, 4, 5, 4, 6]
split_ind: 0, curr_line: 0
X_datasets[0][0]: [0, 1, 

Processed input file to produce the following randomized splits:
[0] : 134077 samples
[1] : 16760 samples
[2] : 16760 samples
...finished reading input file, after 17 seconds
Results:
    X_datasets : 3 elements

X_train (134077 records): [[0, 1, 2, 3, 4, 5, 4, 6], [21, 22, 23, 24, 25, 26, 27, 28, 23, 29, 30, 31, 32, 33, 34, 30, 31, 35, 36, 37, 23, 38, 39, 32, 40, 35, 41, 42, 23, 43, 44, 45, 46, 0, 47, 48, 23, 49, 50, 51, 52, 53, 40, 54], [39, 27, 55, 56, 57, 58, 59, 60, 61, 62, 63, 39, 27, 64, 65, 66], [67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 8, 78, 79, 80, 8, 81, 10, 74, 82, 83], [84, 40, 85, 86, 87, 88, 74, 89, 39, 90, 40, 91, 92, 93, 94, 95, 74, 96, 97, 98]]
y_train (134077 records): [5, 5, 5, 4, 3]
Percentiles (25%, 50%, 75%, 100%) review_length (X_train) = [  18.   33.   65. 3168.]

X_test (16760 records): [[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], [116, 117, 118, 119, 120, 121, 40, 32, 122, 123, 124, 122, 125, 23, 126, 127, 128, 129, 130, 131, 132, 133, 8, 134, 

In [6]:
# Load the data set

# Sequences pre-processing
print(X_train[:2][:5])
print(f"{sum(1 for x in X_train if ((len(x) == 0) or (x is None)))} entries in X_train have length 0")
vocabulary_size = get_vocabulary_size(X_train)
X_test = fit_in_vocabulary(X_test, vocabulary_size)
X_unk = fit_in_vocabulary(X_unk, vocabulary_size)
X_train = zero_pad(X_train, SEQUENCE_LENGTH)
X_test = zero_pad(X_test, SEQUENCE_LENGTH)
X_unk = zero_pad(X_unk, SEQUENCE_LENGTH)

[[0, 1, 2, 3, 4, 5, 4, 6], [21, 22, 23, 24, 25, 26, 27, 28, 23, 29, 30, 31, 32, 33, 34, 30, 31, 35, 36, 37, 23, 38, 39, 32, 40, 35, 41, 42, 23, 43, 44, 45, 46, 0, 47, 48, 23, 49, 50, 51, 52, 53, 40, 54]]
0 entries in X_train have length 0


In [7]:
# Different placeholders
with tf.name_scope('Inputs'):
    batch_ph = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH], name='batch_ph')
    target_ph = tf.placeholder(tf.float32, [None], name='target_ph')
    seq_len_ph = tf.placeholder(tf.int32, [None], name='seq_len_ph')
    keep_prob_ph = tf.placeholder(tf.float32, name='keep_prob_ph')

In [8]:
# Embedding layer
with tf.name_scope('Embedding_layer'):
    embeddings_var = tf.Variable(tf.random_uniform([len(vocab)+1, EMBEDDING_DIM], -1.0, 1.0), trainable=True)
    tf.summary.histogram('embeddings_var', embeddings_var)
    batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph)

Instructions for updating:
Colocations handled automatically by placer.


In [9]:
# (Bi-)RNN layer(-s)
rnn_outputs, _ = bi_rnn(GRUCell(HIDDEN_SIZE), GRUCell(HIDDEN_SIZE),
                        inputs=batch_embedded, sequence_length=seq_len_ph, dtype=tf.float32)
tf.summary.histogram('RNN_outputs', rnn_outputs)

Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.


<tf.Tensor 'RNN_outputs:0' shape=() dtype=string>

In [10]:
# Attention layer
with tf.name_scope('Attention_layer'):
    attention_output, alphas = attention(rnn_outputs, ATTENTION_SIZE, return_alphas=True)
    tf.summary.histogram('alphas', alphas)

# Dropout
drop = tf.nn.dropout(attention_output, keep_prob_ph)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [11]:
# Fully connected layer
with tf.name_scope('Fully_connected_layer'):
    W = tf.Variable(tf.truncated_normal([HIDDEN_SIZE * 2, 1], stddev=0.1))  # Hidden size is multiplied by 2 for Bi-RNN
    b = tf.Variable(tf.constant(0., shape=[1]))
    y_hat = tf.nn.xw_plus_b(drop, W, b)
    y_hat = tf.squeeze(y_hat)
    tf.summary.histogram('W', W)

with tf.name_scope('Metrics'):
    # Cross-entropy loss and optimizer initialization
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat, labels=target_ph))
    tf.summary.scalar('loss', loss)
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(loss)

    # Accuracy metric
    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(tf.sigmoid(y_hat)), target_ph), tf.float32))
    tf.summary.scalar('accuracy', accuracy)

In [12]:
merged = tf.summary.merge_all()

# Batch generators
train_batch_generator = batch_generator(X_train, y_train, BATCH_SIZE)
test_batch_generator = batch_generator(X_test, y_test, BATCH_SIZE)
unk_batch_generator = batch_generator(X_unk, y_unk, BATCH_SIZE)

train_writer = tf.summary.FileWriter('./logdir/train', accuracy.graph)
test_writer = tf.summary.FileWriter('./logdir/test', accuracy.graph)
unk_writer = tf.summary.FileWriter('./logdir/unk', accuracy.graph)

session_conf = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True),allow_soft_placement=True, log_device_placement=True)

saver = tf.train.Saver()

In [13]:
if train:
    with tf.device('/device:GPU:0'), tf.Session(config=session_conf) as sess:
        sess.run(tf.global_variables_initializer())
        print("Start learning...")
        for epoch in range(NUM_EPOCHS):
            loss_train = 0
            loss_test = 0
            loss_unk = 0
            accuracy_train = 0
            accuracy_test = 0
            accuracy_unk = 0

            print("\n*********************************************")
            print("epoch: {}\t".format(epoch), end="")

            # Training
            print("\nTraining...")
            num_batches = X_train.shape[0] // BATCH_SIZE
            for b in tqdm(range(num_batches)):
                x_batch, y_batch = next(train_batch_generator)
                seq_len = np.array([list(x).index(0) + 1 for x in x_batch])  # actual lengths of sequences
                loss_tr, acc, _, summary = sess.run([loss, accuracy, optimizer, merged],
                                                    feed_dict={batch_ph: x_batch,
                                                               target_ph: y_batch,
                                                               seq_len_ph: seq_len,
                                                               keep_prob_ph: KEEP_PROB})
                accuracy_train += acc
                loss_train = loss_tr * DELTA + loss_train * (1 - DELTA)
                train_writer.add_summary(summary, b + num_batches * epoch)
            accuracy_train /= num_batches

            # Testing
            print("\nTesting with known words...")
            num_batches = X_test.shape[0] // BATCH_SIZE
            for b in tqdm(range(num_batches)):
                x_batch, y_batch = next(test_batch_generator)
                seq_len = np.array([list(x).index(0) + 1 for x in x_batch])  # actual lengths of sequences
                loss_test_batch, acc, summary = sess.run([loss, accuracy, merged],
                                                         feed_dict={batch_ph: x_batch,
                                                                    target_ph: y_batch,
                                                                    seq_len_ph: seq_len,
                                                                    keep_prob_ph: 1.0})
                accuracy_test += acc
                loss_test += loss_test_batch
                test_writer.add_summary(summary, b + num_batches * epoch)
            accuracy_test /= num_batches
            loss_test /= num_batches

            print("loss: {:.3f}, val_loss: {:.3f}, acc: {:.3f}, val_acc: {:.3f}\n".format(
                loss_train, loss_test, accuracy_train, accuracy_test
            ))
            
            # Testing with unknown words
            print("\nTesting with unknown words...")
            num_batches = X_unk.shape[0] // BATCH_SIZE
            for b in tqdm(range(num_batches)):
                x_batch, y_batch = next(unk_batch_generator)
                seq_len = np.array([list(x).index(0) + 1 for x in x_batch])  # actual lengths of sequences
                loss_unk_batch, acc, summary = sess.run([loss, accuracy, merged],
                                                         feed_dict={batch_ph: x_batch,
                                                                    target_ph: y_batch,
                                                                    seq_len_ph: seq_len,
                                                                    keep_prob_ph: 1.0})
                accuracy_unk += acc
                loss_unk += loss_unk_batch
                unk_writer.add_summary(summary, b + num_batches * epoch)
            accuracy_unk /= num_batches
            loss_unk /= num_batches

            print("loss: {:.3f}, val_loss: {:.3f}, unk_loss: {:.3f}, acc: {:.3f}, val_acc: {:.3f}, unk_acc: {:3f}\n".format(
                loss_train, loss_test, loss_unk, accuracy_train, accuracy_test, accuracy_unk
            ))
        train_writer.close()
        test_writer.close()
        unk_writer.close()
        saver.save(sess, MODEL_PATH)
        print("Run 'tensorboard --logdir=./logdir' to checkout tensorboard logs.")

  0%|          | 0/523 [00:00<?, ?it/s]

Start learning...

*********************************************
epoch: 0	
Training...


  8%|▊         | 42/523 [01:09<13:09,  1.64s/it]ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/burgew/miniconda3/envs/py3.6/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-13-bbf27653b810>", line 25, in <module>
    keep_prob_ph: KEEP_PROB})
  File "/Users/burgew/miniconda3/envs/py3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 929, in run
    run_metadata_ptr)
  File "/Users/burgew/miniconda3/envs/py3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1152, in _run
    feed_dict_tensor, options, run_metadata)
  File "/Users/burgew/miniconda3/envs/py3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1328, in _do_run
    run_metadata)
  File "/Users/burgew/miniconda3/envs/py3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1334, in _do_call
    return fn(*args)
  File "/Users/burgew/miniconda3/envs/py3.6/lib/python3.6/sit

KeyboardInterrupt: 

In [None]:
from IPython.core.display import display, HTML

vocab_ind = {}
for word,ind in vocab.items():
    vocab_ind[ind]=word
    
print(f"Just created id->word dict of size {len(vocab_ind)} items")
print(f" .   max(ind) = {max(vocab_ind.keys())}")
print(f"Source was word->id dict of size {len(vocab)} items")
print(f" .   max(ind) = {max(vocab.values())}")
              
saver = tf.train.Saver()

# Calculate alpha coefficients for the first test example
with tf.Session() as sess:
    saver.restore(sess, MODEL_PATH)

    x_batch_test, y_batch_test = X_test[:], y_test[:]
    seq_len_test = np.array([list(x).index(0) + 1 for x in x_batch_test])
    alphas_test = sess.run([alphas], feed_dict={batch_ph: x_batch_test, target_ph: y_batch_test,
                                                seq_len_ph: seq_len_test, keep_prob_ph: 1.0})
alphas_values = alphas_test[0][0]

# Build correct mapping from word to index and inverse
word_index = vocab
word_index = {word: index + INDEX_FROM for word, index in word_index.items()}
word_index[":PAD:"] = PAD_ID
word_index[":START:"] = START_ID
word_index[":UNK:"] = UNK_ID
index_word = {value: key for key, value in word_index.items()}
# Represent the sample by words rather than indices
words = list(map(index_word.get, x_batch_test[0]))

# Save visualization as HTML
with open("visualization.html", "w") as html_file:
    for word, alpha in sorted(zip(words, alphas_values / alphas_values.max()), 
                              key=lambda entry: -entry[1]):
        if word == ":START:":
            continue
        elif word == ":PAD:":
            break
        html_file.write('<font style="background: rgba(255, 255, 0, %f)" size=%f>%s (%f)<br></font>\n' % (alpha, 2+alpha*2, word, alpha))

print('\nFollowing are words with comparative attention scores and visualization with color background.')

with open("visualization.html", "r") as html_file:
    html_content = html_file.read()

HTML(filename="./visualization.html")