# Sentiment Analysis with RNN example

### Step 1. Load and explore the data

In [4]:
from keras.datasets import imdb

# Vocabulary cap passed to the loader as `num_words`
vocabulary_size = 5000

# Fetch the IMDB reviews, then unpack each split into features and labels
train_split, test_split = imdb.load_data(num_words=vocabulary_size)
X_train, y_train = train_split
X_test, y_test = test_split

# Sanity check: report how many reviews ended up in each split
print(f'Train size: {len(X_train)}. Test size: {len(X_test)}')

Train size: 25000. Test size: 25000


In [5]:
# Display one training review (stored as a list of integer word ids)
# together with its label
sample_idx = 7

print("--- Review ---")
print(X_train[sample_idx])
print("--- Label ---")
print(y_train[sample_idx])

--- Review ---
[1, 4, 2, 716, 4, 65, 7, 4, 689, 4367, 2, 2343, 4804, 2, 2, 2, 2, 2315, 2, 2, 2, 2, 4, 2, 628, 2, 37, 9, 150, 4, 2, 4069, 11, 2909, 4, 2, 847, 313, 6, 176, 2, 9, 2, 138, 9, 4434, 19, 4, 96, 183, 26, 4, 192, 15, 27, 2, 799, 2, 2, 588, 84, 11, 4, 3231, 152, 339, 2, 42, 4869, 2, 2, 345, 4804, 2, 142, 43, 218, 208, 54, 29, 853, 659, 46, 4, 882, 183, 80, 115, 30, 4, 172, 174, 10, 10, 1001, 398, 1001, 1055, 526, 34, 3717, 2, 2, 2, 17, 4, 2, 1094, 871, 64, 85, 22, 2030, 1109, 38, 230, 9, 4, 4324, 2, 251, 2, 1034, 195, 301, 14, 16, 31, 7, 4, 2, 8, 783, 2, 33, 4, 2945, 103, 465, 2, 42, 845, 45, 446, 11, 1895, 19, 184, 76, 32, 4, 2, 207, 110, 13, 197, 4, 2, 16, 601, 964, 2152, 595, 13, 258, 4, 1730, 66, 338, 55, 2, 4, 550, 728, 65, 1196, 8, 1839, 61, 1546, 42, 2, 61, 602, 120, 45, 2, 6, 320, 786, 99, 196, 2, 786, 2, 4, 225, 4, 373, 1009, 33, 4, 130, 63, 69, 72, 1104, 46, 1292, 225, 14, 66, 194, 2, 1703, 56, 8, 803, 1004, 6, 2, 155, 11, 4, 2, 3231, 45, 853, 2029, 8, 30, 6, 117, 430

In [14]:
# Define translation dictionaries
# word2id: raw word -> frequency-rank index, exactly as shipped with the dataset
word2id = imdb.get_word_index()

# `imdb.load_data` (called above with its default arguments) reserves
# 0 for padding, 1 for the start-of-review marker and 2 for out-of-vocabulary
# words, and shifts every real word index by `index_from=3`.  The reverse
# lookup has to apply the same shift, otherwise decoding a review taken from
# `X_train` / `X_test` returns the wrong words.
index_from = 3
id2word = {i + index_from: word for word, i in word2id.items()}
id2word.update({0: '[PAD]', 1: '[START]', 2: '[OOV]'})

# Maximum sequence length
print(f'The maximum sequence length is: {max(len(x) for x in X_train)}')

The maximum sequence length is: 2494


### Step 2. Padding sequences

In [16]:
from keras.preprocessing import sequence
from tensorflow.keras.utils import pad_sequences

# Target length (in tokens) that every review is brought to
max_words = 500

# Apply the same padding to both splits in one pass
X_train, X_test = (
    pad_sequences(split, maxlen=max_words) for split in (X_train, X_test)
)

### Step 3. RNN model design

In [21]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

# Baseline model: embedding -> two stacked LSTMs -> dense classifier head
model = Sequential([
    # Maps each of the `vocabulary_size` word ids to a 128-dimensional vector
    Embedding(input_dim=vocabulary_size, output_dim=128, input_length=max_words),
    # return_sequences=True so the next LSTM receives the whole sequence
    LSTM(units=64, return_sequences=True),
    Dropout(0.2),
    LSTM(units=64),
    # A non-linearity is needed here: a purely linear Dense feeding another
    # Dense collapses into a single linear map and adds no modelling capacity.
    Dense(128, activation='relu'),
    # Single sigmoid unit for the binary sentiment label
    Dense(1, activation='sigmoid')
])

# summary() prints the table itself and returns None, so wrapping it in
# print() would append a stray "None" line to the cell output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 500, 128)          640000    
                                                                 
 lstm_2 (LSTM)               (None, 500, 64)           49408     
                                                                 
 dropout_1 (Dropout)         (None, 500, 64)           0         
                                                                 
 lstm_3 (LSTM)               (None, 64)                33024     
                                                                 
 dense_2 (Dense)             (None, 128)               8320      
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                                                 
Total params: 730,881
Trainable params: 730,881
Non-tr