In [None]:
import tensorflow as tf
from tensorflow.keras.activations import softmax, tanh
import numpy as np
from matplotlib import pyplot as plt
import time

In [None]:
# Keep only the `num_words` most frequent words of the IMDB vocabulary.
num_words = 20000
(train_sequences, train_labels), (test_sequences, test_labels) = tf.keras.datasets.imdb.load_data(num_words=num_words)
print(train_sequences.shape)
print(test_sequences.shape)

# load_data() (with its default index_from=3) shifts every word index by 3 so
# that 0/1/2 stay free for padding / sequence start / out-of-vocabulary.
# get_word_index() returns the *unshifted* indices, so the reverse mapping has
# to add the same offset to line up with the token ids in train_sequences.
index_from = 3
word_to_index = tf.keras.datasets.imdb.get_word_index()
index_to_word = {index + index_from: word for word, index in word_to_index.items()}
index_to_word.update({0: "[PAD]", 1: "[START]", 2: "[OOV]"})

# Review lengths, kept for inspection; max(sequence_lengths) is the longest review.
sequence_lengths = [len(sequence) for sequence in train_sequences]

# Fixed sequence length fed to the RNN. pad_sequences() pads and truncates at
# the front by default, so short reviews get leading zeros and long reviews
# keep only their last `max_len` tokens.
max_len = 200
train_sequences_padded = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=max_len)

batch_size = 128
epochs = 5
# Shuffle over the whole training set (buffer = number of samples), then batch
# and repeat so that one pass over train_data covers `epochs` epochs.
train_data = (
    tf.data.Dataset.from_tensor_slices((train_sequences_padded, train_labels))
    .shuffle(len(train_sequences_padded))
    .batch(batch_size)
    .repeat(epochs)
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
(25000,)
(25000,)
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [None]:
# Loss, optimizer and metric. The loss is used with its default settings, i.e.
# it receives the softmax probabilities produced by output_layer().
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
accuracy_fn = tf.keras.metrics.SparseCategoricalAccuracy()

# Dimensionality of the recurrent hidden state.
state_size = 1024


def _uniform_variable(*shape):
    """Return a float32 variable drawn uniformly from [-0.1, 0.1)."""
    return tf.Variable(tf.random.uniform(shape=shape, minval=-0.1, maxval=0.1, dtype=tf.float32))


def _zeros_variable(*shape):
    """Return a float32 variable filled with zeros."""
    return tf.Variable(tf.zeros(shape=shape, dtype=tf.float32))


# Parameters of the hand-written RNN (see rnn_step / output_layer):
U = _uniform_variable(num_words, state_size)   # input (one-hot word) -> hidden
b = _zeros_variable(state_size)                # hidden bias
W = _uniform_variable(state_size, state_size)  # hidden -> hidden
V = _uniform_variable(state_size, 2)           # hidden -> 2 output classes
c = _zeros_variable(2)                         # output bias
initial_state = _zeros_variable(state_size)    # learned starting hidden state

# Everything the optimizer is allowed to update.
trainable_weights = [initial_state, U, b, W, V, c]

In [None]:
def train_loop():
    """Run one training step for every batch yielded by ``train_data``.

    Batches are numbered from 0 in iteration order and that number is handed
    to ``train_step`` as its ``step`` argument.
    """
    step = 0
    for sequence_batch, label_batch in train_data:
        train_step(sequence_batch, label_batch, step)
        step += 1

def train_step(sequences, labels, step, steps_per_epoch=None):
    """Perform one gradient update on a batch and log progress.

    Args:
        sequences: batch of padded token-id sequences, passed to ``rnn_loop``.
        labels: integer class labels for the batch.
        step: 0-based index of this batch across the whole training run.
        steps_per_epoch: number of batches in one epoch, used only to print
            the epoch marker. When omitted it is derived from the training
            set size and ``batch_size``.
    """
    if steps_per_epoch is None:
        # Ceiling division: batch() keeps the final, smaller batch, so an
        # epoch has one more step than len // batch_size whenever the data
        # does not divide evenly (25000 / 128 -> 196, not 195).
        steps_per_epoch = -(-len(train_sequences_padded) // batch_size)

    with tf.GradientTape() as tape:
        output = rnn_loop(sequences)
        loss = loss_fn(labels, output)

    gradients = tape.gradient(loss, trainable_weights)
    opt.apply_gradients(zip(gradients, trainable_weights))

    if step % steps_per_epoch == 0:
        print("\nNext Epoch")
    if step % 10 == 0:
        # The metric is only updated on logged steps and never reset, so the
        # printed accuracy is a running mean over every 10th batch so far.
        accuracy_fn.update_state(labels, output)
        print("Step {} Accuracy: {} Loss: {}".format(step, accuracy_fn.result().numpy(), loss.numpy()))
    

def rnn_loop(sequences):
    """Unroll the RNN over a batch of sequences and classify the final state.

    Args:
        sequences: integer token ids indexed as ``[batch, time]``; the first
            ``max_len`` time steps are consumed.

    Returns:
        The result of ``output_layer`` applied to the hidden state after the
        last time step.
    """
    state = initial_state

    for t in range(max_len):
        # One-hot encode the t-th token of every sequence; rnn_step multiplies
        # this with U, which selects the matching row of U for each token.
        token_one_hot = tf.one_hot(sequences[:, t], depth=num_words)
        state = rnn_step(state, token_one_hot)

    return output_layer(state)

def rnn_step(state, x_t):
    """Compute the next hidden state: ``tanh(b + W @ state + x_t @ U)``.

    Args:
        state: previous hidden state; ``rnn_loop`` passes ``initial_state``
            on the first step and the previous return value afterwards.
        x_t: one-hot encoded inputs for the current time step.

    Returns:
        The new hidden state.
    """
    recurrent_term = tf.linalg.matvec(W, state)
    input_term = tf.matmul(x_t, U)
    return tanh(b + recurrent_term + input_term)

def output_layer(state):
    """Map a hidden state to class probabilities.

    Applies the affine projection ``c + state @ V`` followed by a softmax.
    """
    return softmax(c + tf.matmul(state, V))

# Start training: iterates over every batch in train_data; train_step prints the progress.
train_loop()


Next Epoch
Step 0 Accuracy: 0.4921875 Loss: 1.122739553451538
Step 10 Accuracy: 0.484375 Loss: 0.7648497223854065
Step 20 Accuracy: 0.5078125 Loss: 0.6973837614059448
Step 30 Accuracy: 0.509765625 Loss: 0.6842910051345825
Step 40 Accuracy: 0.5140625238418579 Loss: 0.6883311867713928
Step 50 Accuracy: 0.5143229365348816 Loss: 0.6939480900764465
Step 60 Accuracy: 0.5066964030265808 Loss: 0.7066179513931274
Step 70 Accuracy: 0.5 Loss: 0.737827718257904
Step 80 Accuracy: 0.5112847089767456 Loss: 0.680735170841217
Step 90 Accuracy: 0.5093749761581421 Loss: 0.6889183521270752
Step 100 Accuracy: 0.515625 Loss: 0.6857764720916748
Step 110 Accuracy: 0.5162760615348816 Loss: 0.7019472122192383
Step 120 Accuracy: 0.5168269276618958 Loss: 0.6960265636444092
Step 130 Accuracy: 0.5133928656578064 Loss: 0.7120664119720459
Step 140 Accuracy: 0.5151041746139526 Loss: 0.6959869861602783
Step 150 Accuracy: 0.51806640625 Loss: 0.7122974395751953
Step 160 Accuracy: 0.515625 Loss: 0.7051507830619812
Step 1

# Food for Thought
1. Grouping sequences of similar length into minibatches and padding each batch only to its longest sequence is less wasteful.  
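2. Removing reviews that are longer than a cut-off length which still covers most reviews could be good, since the important words may be at the beginning or at the end of a review and truncation might cut them off. Truncating could also be good, because the model still gets to learn from the longer sequences.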
3. (Learned) embeddings or tf-idf features could be used instead.
4. They both classify the inputs into two classes.
5. It makes sense to learn the initial state, because the model may need some information from the starting state, and it can also learn to encode that the initial state is not important.
6. The output comes from the last time step, so pre-padding makes more sense: it ensures that the last time step receives a real input rather than padding.
7. That could be possible with an if/else statement and a mask.
8. That would be possible, but more expensive. It would be a many-to-many RNN with an average over the many outputs, but we can just use this many-to-one architecture.

# Experiments
* lr = 0.001: 67% Accuracy  
* lr = 0.01: 50% Accuracy  
* lr = 0.1: Loss nan  
* Variable initialization [-0.5, 0.5], lr = 0.001: Loss nan
* max_len = 400: Loss nan
