# UW FSDL Spring 2020 - Many-to-one RNN example

Super simple example of predicting sentiment from text, using the IMDB Reviews dataset.

In [0]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

In [0]:
# Download the IMDB reviews dataset through TensorFlow Datasets.
# Catalog entry: https://www.tensorflow.org/datasets/catalog/imdb_reviews

dataset = tfds.load('imdb_reviews')
dataset_train = dataset['train']
dataset_test = dataset['test']

# Peek at the first four raw training examples.
for ex in dataset_train.take(4):
  print(ex)

In [0]:
# Keep only the review text of every example, decoded from UTF-8 bytes to a Python str.

texts_train = [
    example['text'].decode("utf-8")
    for example in tfds.as_numpy(dataset_train)
]
texts_test = [
    example['text'].decode("utf-8")
    for example in tfds.as_numpy(dataset_test)
]

# Show the first training review as a sanity check.
texts_train[0]

"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."

In [0]:
# To convert texts to tokens, with each token represented by an integer, we need a tokenizer.
# It will strip out punctuation, split up words, and convert to integers.
# Let's limit the vocabulary to the most frequently used words in the training dataset.
# Note: with num_words=N, Keras keeps only the N - 1 most common words (index 0 is reserved),
# so every token index produced later is < VOCAB_SIZE.
VOCAB_SIZE = 10000

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(texts_train)

# tokenizer.get_config() would serialize the entire corpus vocabulary (word counts,
# word index, ...) into the cell output, so show a compact summary instead.
print("documents:", tokenizer.document_count)
print("distinct words seen:", len(tokenizer.word_index))

# word_index is ordered from most to least frequent word; display the top few entries.
list(tokenizer.word_index.items())[:10]

In [0]:
# Turn every review into its sequence of integer token ids, using the fitted tokenizer
# for both the training and the test texts.
tokens_train, tokens_test = map(
    tokenizer.texts_to_sequences,
    (texts_train, texts_test),
)

# Show the token ids of the first training review.
print(tokens_train[0])

[11, 13, 32, 424, 391, 17, 89, 27, 8, 31, 1365, 3584, 39, 485, 196, 23, 84, 153, 18, 11, 212, 328, 27, 65, 246, 214, 8, 476, 57, 65, 84, 113, 97, 21, 5674, 11, 1321, 642, 766, 11, 17, 6, 32, 399, 8169, 175, 2454, 415, 1, 88, 1230, 136, 68, 145, 51, 1, 7576, 68, 228, 65, 2932, 15, 2903, 1478, 4939, 2, 38, 3899, 116, 1583, 16, 3584, 13, 161, 18, 3, 1230, 916, 7916, 8, 3, 17, 12, 13, 4138, 4, 98, 144, 1213, 10, 241, 682, 12, 47, 23, 99, 37, 11, 7180, 5514, 37, 1365, 49, 400, 10, 97, 1196, 866, 140, 9]


In [0]:
# All sequences must share one length. Use the 90th percentile of the training sequence
# lengths as that length: shorter sequences are padded, longer ones are truncated.

train_lengths = [len(sequence) for sequence in tokens_train]
max_length = int(np.percentile(train_lengths, 90))
print(max_length)

# pad_sequences is called with its default padding/truncation settings.
tokens_train = tf.keras.preprocessing.sequence.pad_sequences(
    tokens_train, maxlen=max_length
)
tokens_test = tf.keras.preprocessing.sequence.pad_sequences(
    tokens_test, maxlen=max_length
)

435


In [0]:
# Sanity check: the padded training tokens form one 2-D array (reviews x max_length).
tokens_train.shape

(25000, 435)

In [0]:
# Collect the integer sentiment label of every example into a NumPy array.
# NOTE(review): this is a second pass over the datasets; pairing these labels with the
# texts extracted earlier assumes both passes yield examples in the same order -- confirm.
labels_train = np.array(
    [example['label'] for example in tfds.as_numpy(dataset_train)]
)
labels_test = np.array(
    [example['label'] for example in tfds.as_numpy(dataset_test)]
)

# First training label and the overall shape of the label array.
labels_train[0], labels_train.shape

(0, (25000,))

In [0]:
EMBEDDING_DIM = 64
LSTM_DIM = 128

# Many-to-one architecture: token ids -> embeddings -> LSTM -> single sigmoid output.
# NOTE(review): mask_zero is left at its default, so padded positions are presumably fed
# to the LSTM like ordinary tokens -- confirm this is intended.
model = tf.keras.models.Sequential([
    # Map each integer token to an EMBEDDING_DIM-sized real-valued vector.
    tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM),
    # Process the embedded vectors in sequence and emit one LSTM_DIM-sized vector at the end.
    tf.keras.layers.LSTM(LSTM_DIM),
    # Reduce that vector to a single value between 0 and 1 with a sigmoid Dense layer.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          640000    
_________________________________________________________________
lstm (LSTM)                  (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 738,945
Trainable params: 738,945
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Training hyperparameters.
BATCH_SIZE = 128
EPOCHS = 6

# Binary sentiment target -> binary cross-entropy loss, tracked with accuracy.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# The test split doubles as the validation data that is evaluated after every epoch.
model.fit(
    tokens_train,
    labels_train,
    validation_data=(tokens_test, labels_test),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<tensorflow.python.keras.callbacks.History at 0x7f336ffe2f60>

A few ideas to try:

- Train for longer!
- Different values for VOCAB_SIZE, EMBEDDING_DIM, or LSTM_DIM
- Stack multiple LSTMs (you will need to pass in `return_sequences=True` -- read the LSTM() docstring for info)
- Make the LSTM(s) bidirectional (look up how to do it)
- Different optimizer or learning rate (look up how to do it)