# Assignment 5: Text Classification with RNNs (Part 1)

### Assignment question is available here: https://ovgu-ailab.github.io/idl2023/assignment5.html

In [3]:
import tensorflow as tf
import numpy as np
from tensorflow import keras
import matplotlib.pyplot as plt
from datetime import datetime
from tensorflow.keras.optimizers import Adam

In [4]:
#IMDB Dataset
(train_sequences, train_labels), (test_sequences, test_labels) = tf.keras.datasets.imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [5]:
print(train_sequences.shape)
print(train_labels.shape)

(25000,)
(25000,)


In [6]:
word_to_index = tf.keras.datasets.imdb.get_word_index()
print(len(word_to_index))
index_to_word = dict([(word, index) for (word, index) in word_to_index.items()])
index_to_word

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
88584


{'fawn': 34701,
 'tsukino': 52006,
 'nunnery': 52007,
 'sonja': 16816,
 'vani': 63951,
 'woods': 1408,
 'spiders': 16115,
 'hanging': 2345,
 'woody': 2289,
 'trawling': 52008,
 "hold's": 52009,
 'comically': 11307,
 'localized': 40830,
 'disobeying': 30568,
 "'royale": 52010,
 "harpo's": 40831,
 'canet': 52011,
 'aileen': 19313,
 'acurately': 52012,
 "diplomat's": 52013,
 'rickman': 25242,
 'arranged': 6746,
 'rumbustious': 52014,
 'familiarness': 52015,
 "spider'": 52016,
 'hahahah': 68804,
 "wood'": 52017,
 'transvestism': 40833,
 "hangin'": 34702,
 'bringing': 2338,
 'seamier': 40834,
 'wooded': 34703,
 'bravora': 52018,
 'grueling': 16817,
 'wooden': 1636,
 'wednesday': 16818,
 "'prix": 52019,
 'altagracia': 34704,
 'circuitry': 52020,
 'crotch': 11585,
 'busybody': 57766,
 "tart'n'tangy": 52021,
 'burgade': 14129,
 'thrace': 52023,
 "tom's": 11038,
 'snuggles': 52025,
 'francesco': 29114,
 'complainers': 52027,
 'templarios': 52125,
 '272': 40835,
 '273': 52028,
 'zaniacs': 52130,

In [7]:
#Preprocessing
length = [len(sequence) for sequence in train_sequences]
padded_train_sequences = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=max(length))
train_data = tf.data.Dataset.from_tensor_slices((padded_train_sequences, train_labels)).shuffle(60000).batch(128)
print(padded_train_sequences.shape)
print(train_data)

(25000, 2494)
<_BatchDataset element_spec=(TensorSpec(shape=(None, 2494), dtype=tf.int32, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>


In [8]:
w_range = 0.1
b_range = 0.001
class model(tf.Module):
  def __init__(self, **args):
    super().__init__(**args)
    self.weight_xh = tf.Variable(tf.random.uniform([512,2000],-w_range,w_range))
    self.weight_hh = tf.Variable(tf.random.uniform([512,512],-w_range,w_range))
    self.weight_ho = tf.Variable(tf.random.uniform([512,2000],-w_range,w_range))
    self.b1 = tf.Variable(tf.random.uniform([512,1],-b_range, b_range))
    self.b2 = tf.Variable(tf.random.uniform([2000,1],-b_range, b_range))

model = model()

In [9]:
def rnn_step(current_input, previous_state):
  a = tf.matmul(model.weight_xh,tf.transpose(current_input)) + tf.matmul(model.weight_hh,previous_state) + model.b1
  new_state = tf.math.tanh(a)
  logits = tf.matmul(tf.transpose(model.weight_ho),new_state) + model.b2
  return new_state,logits

In [10]:
batch_size = 512
state_size = 128
optimizer = tf.keras.optimizers.Adam()
def rnn_loop(sequences, labels_onehot):
  previous_state = tf.zeros([batch_size, state_size],tf.float32)
  for time_step in tf.range(sequences.shape[1]-1):
    current_input = sequences[:, time_step]
    new_state, logits = rnn_step(current_input, previous_state)
    previous_state = new_state
    # loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = tf.transpose(logits),labels=labels_onehot))
    # xent+= loss
  return logits

In [11]:
xent = 0
i = 0
for step, (sequence_batch, label_batch) in enumerate(train_data):
  if step > 200:
    break
  sequences_onehot = tf.one_hot(sequence_batch, depth=2000)
  labels_onehot = tf.one_hot(label_batch, depth=2000)
  # train_step(sequences_onehot, labels_onehot)
  with tf.GradientTape() as tape:
    logits = rnn_loop(sequences_onehot, labels_onehot)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = tf.transpose(logits),labels=labels_onehot))
    xent+= loss
  grads = tape.gradient(xent,model.trainable_variables)
  optimizer.apply_gradients(zip(grads,model.trainable_variables))

  if not i%100:
    preds = tf.argmax(tf.nn.softmax(tf.transpose(logits)),axis=1,output_type=tf.int32)
    # acc = tf.reduce_mean(tf.cast(tf.equal(preds,sequences_onehot[:,step+1]),tf.float32))
    print("Loss: {} ".format(xent/200))



Loss: 0.03369549661874771 
Loss: 0.07899008691310883 
Loss: 0.12465225160121918 
Loss: 0.16086284816265106 
Loss: 0.20347529649734497 
Loss: 0.24196313321590424 
Loss: 0.2873750925064087 
Loss: 0.3207898437976837 
Loss: 0.35393238067626953 
Loss: 0.39820367097854614 
Loss: 0.44264712929725647 
Loss: 0.47942909598350525 
Loss: 0.5091741681098938 
Loss: 0.5561681985855103 
Loss: 0.5899704098701477 
Loss: 0.6390124559402466 
Loss: 0.6910545825958252 
Loss: 0.7158505916595459 
Loss: 0.7675729393959045 
Loss: 0.7988995909690857 
Loss: 0.8374607563018799 
Loss: 0.8672379851341248 
Loss: 0.8942862749099731 
Loss: 0.9305136203765869 
Loss: 0.9747466444969177 
Loss: 1.0118790864944458 
Loss: 1.0406512022018433 
Loss: 1.0940396785736084 
Loss: 1.150284767150879 
Loss: 1.1980818510055542 
Loss: 1.2307627201080322 
Loss: 1.2711261510849 
Loss: 1.3235087394714355 
Loss: 1.3729280233383179 
Loss: 1.3944576978683472 
Loss: 1.4178268909454346 
Loss: 1.4418617486953735 
Loss: 1.4893995523452759 
Loss: 

InvalidArgumentError: ignored

#Q1. Why is this wasteful? Can you think of a smarter padding scheme that is more efficient? Consider the fact that RNNs can work on arbitrary sequence lengths, and that training minibatches are pretty much independent of each other.


The padding is applied to the beginning or the end of the sequence, called pre- or post-sequence padding. However training RNNs on long sequences often face challenges like slow inference, vanishing gradients and difficulty in capturing long term dependencies.

#Q2.Between truncating long sequences and removing them, which option do you think is better? Why?

The simplest approach to long sequences is to simply truncate them, usually at the end but potentially at the beginning. Truncating long sequences allows to force the sequence to manageable length as the costs of losing data. The only drawback is the if the data removed is necessary for the calculation.

#Q3. Can you think of a way to avoid the one-hot vectors completely? Even if you cannot implement it, a conceptual idea is fine.

One conceptual way is to separate a few train classes as unknown classes and group them into a single one-hot encoding. Then all new classes will receive this same encoding.

#Q5. All sequences start with the same special “beginning of sequence” token (coded by index 1). Given this fact, is there a point in learning an initial state? Why (not)?

The default approach to initializing the state of an RNN is to use a zero state. This often works well, particularly for sequence-to-sequence tasks like language modeling where the proportion of outputs that are significantly impacted by the initial state is small.

# Q6. pad_sequences allows for pre or post padding. Try both to see the difference. Which option do you think is better? Recall that we use the final time step output from our model.

Commonly in RNN's, we take the final output or hidden state and use this to make a prediction (or do whatever task we are trying to do).

If we send a bunch of 0's to the RNN before taking the final output (i.e. 'post' padding as you describe), then the hidden state of the network at the final word in the sentence would likely get 'flushed out' to some extent by all the zero inputs that come after this word.

So intuitively, this might be why pre-padding is more popular/effective.

# Q7.Can you think of a way to prevent the RNN from computing new states on padded time steps?

The simplest and most common solution to this problem is to unroll the RNN only over a limited number of time steps during training. This is called truncated backpropagation through time. In TensorFlow you can implement it simply by truncating the input sequences.

#References

https://stats.stackexchange.com/questions/288770/meaning-of-batch-sizes-for-rnns-lstms-and-reasons-for-padding

https://openreview.net/pdf?id=HkwVAXyCW

https://www.techleer.com/articles/151-handling-long-sequence-with-short-term-memory-recurrent-neural-networks/

https://stackoverflow.com/questions/59287602/is-there-a-way-by-which-we-can-train-rnn-without-using-one-hot-encoders

https://r2rt.com/non-zero-initial-states-for-recurrent-neural-networks.html

https://stackoverflow.com/questions/46298793/how-does-choosing-between-pre-and-post-zero-padding-of-sequences-impact-results

https://www.oreilly.com/library/view/neural-networks-and/9781492037354/ch04.html
