# Checking tensorflow version

In [0]:
## report the installed TensorFlow release so we know whether the pinned install in the next cell is needed
import tensorflow as tf

installed_tf_version = tf.__version__
print(installed_tf_version)

In [0]:
## only run this if the tf version is less than 2.1.0
!pip uninstall tensorflow
!pip install tensorflow==2.1.0

# Introduction to Recurrent Neural Networks with Sentiment Analysis

In [0]:
## imports necessary dependencies and utilities

import tensorflow as tf
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt 
from tensorflow.keras import Sequential, layers, optimizers, losses, metrics
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence

In [0]:
## loads the IMDB reviews, already divided into a training split and a test split
## vocab_size is passed as num_words, i.e. the number of unique words to be kept in the reviews
## max_length is the common sequence length used when the reviews are padded/truncated further down

vocab_size = 5000
max_length = 500

train_split, test_split = imdb.load_data(path="imdb.npz", num_words=vocab_size, seed=42)
X_train, y_train = train_split
X_test, y_test = test_split

In [0]:
## check the format of the data (complete the code)


In [0]:
## return the list of words of the first review (complete the code)

## get_word_index() is called with no arguments and its result is stored in word2id
## presumably this is a word -> integer id mapping (as the variable name suggests) — TODO confirm against the Keras docs
word2id = imdb.get_word_index()

Hmm, what else can we check? We just learnt that checking the length of sequences is important in sequence models. Once we know how long the reviews are, we can decide on a standardised sequence length for every review: reviews longer than that will be truncated, while reviews shorter than that will be padded. If you are interested in how post- or pre-padding affects results, you can refer to this paper: https://arxiv.org/abs/1903.07288

In [0]:
## collect the length of every review in both splits
## note: `X_train + X_test` must not be used here: load_data returns NumPy arrays, so `+` adds
## element-wise (joining the i-th training review with the i-th test review) instead of
## concatenating the two datasets; we therefore iterate over both splits explicitly

review_lengths = [len(review) for split in (X_train, X_test) for review in split]

## prints length of longest review

print('Maximum review length: {}'.format(max(review_lengths)))

## prints length of shortest review

print('Minimum review length: {}'.format(min(review_lengths)))

In [0]:
## every review is brought to exactly max_length word ids:
## shorter reviews are padded at the front ('pre'), longer ones are cut at the end ('post')
## both splits share the same settings, so they are kept in one place

padding_options = dict(maxlen=max_length, padding='pre', truncating='post')

X_train = sequence.pad_sequences(X_train, **padding_options)
X_test = sequence.pad_sequences(X_test, **padding_options)

Now that the dataset has been preprocessed (if you want to learn more about text preprocessing, see https://towardsdatascience.com/nlp-text-preprocessing-a-practical-guide-and-template-d80874676e79), we can start instantiating a model and training it.

In [0]:
## hyperparameters used by the model and the training run below

## number of reviews processed per gradient step
batch_size = 64
## number of passes over the training data
num_epochs = 5
## length of the vector each word id is embedded into
word_vector_size = 128

In [0]:
## model instantiation
## the layers are handed to Sequential as a list, in the order the data flows through them

model = Sequential([
    ## embedding layer: converts each word id into a vector of length word_vector_size
    layers.Embedding(vocab_size, word_vector_size, input_length=max_length),
    ## simple RNN layer: the output vector from this layer is (batch_size, num_of_units)
    layers.SimpleRNN(128, activation="relu", dropout=0.2, recurrent_dropout=0.2),
    ## binary classification layer: a single sigmoid unit
    layers.Dense(1, activation="sigmoid"),
])

## model compilation

adam_optimizer = optimizers.Adam(0.001)
model.compile(
    optimizer=adam_optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

## run model training
## validation_split=0.3 sets aside 30% of the training data for validation

trained = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_split=0.3,
)

In [0]:
## visualise the model training by plotting accuracy and loss curves
## (one value per epoch, for both the training and the validation data)

history = trained.history
acc, val_acc = history['accuracy'], history['val_accuracy']
loss, val_loss = history['loss'], history['val_loss']

## two stacked panels: accuracy on top, loss underneath
fig, (ax_acc, ax_loss) = plt.subplots(2, 1, figsize=(8, 8))

ax_acc.plot(acc, label='Training Accuracy')
ax_acc.plot(val_acc, label='Validation Accuracy')
ax_acc.legend(loc='lower right')
ax_acc.set_ylabel('Accuracy')
## keep the automatically chosen lower bound, but stretch the axis up to 1
ax_acc.set_ylim([min(ax_acc.get_ylim()), 1])
ax_acc.set_title('Training and Validation Accuracy')

ax_loss.plot(loss, label='Training Loss')
ax_loss.plot(val_loss, label='Validation Loss')
ax_loss.legend(loc='upper right')
ax_loss.set_ylabel('Cross Entropy')
ax_loss.set_ylim([0, 1.0])
ax_loss.set_title('Training and Validation Loss')
ax_loss.set_xlabel('epoch')

plt.show()

In [0]:
## evaluate the model on the held-out test set
## entry 1 of metrics_names and of the returned scores belong together; the score is shown as a percentage

scores = model.evaluate(X_test, y_test)
metric_name = model.metrics_names[1]
metric_percent = scores[1] * 100
print("%s: %.2f%%" % (metric_name, metric_percent))