# Sentiment Analysis

In this example we will use the IMDb dataset to train a recurrent neural network to "read" movie reviews and guess from them whether the author liked the movie or not.

In [1]:
import numpy as np

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.datasets import imdb

In [2]:
# Load the IMDb reviews, restricting the vocabulary to the 20,000 most
# frequent words, then unpack the train and test splits.
imdb_splits = imdb.load_data(num_words=20000)
(x_train, y_train), (x_test, y_test) = imdb_splits

In [3]:
# Peek at the first training example: each word of the review is encoded as a
# number, and the label is 1 for "liked" or 0 for "not liked".
print(x_train[0], y_train[0], sep="\n")

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
1


In [4]:
# Force every review to exactly 80 tokens so the LSTM unrolls over a fixed,
# short number of timesteps (this bounds backprop through time).
# NOTE: pad_sequences defaults to padding='pre' and truncating='pre', so
# longer reviews keep their LAST 80 words (the beginning is dropped) and
# shorter reviews are left-padded with zeros. The defaults are spelled out
# here so the behaviour is visible; pass truncating='post' to keep the first
# 80 words instead.
x_train = sequence.pad_sequences(x_train, maxlen=80, padding='pre', truncating='pre')
x_test = sequence.pad_sequences(x_test, maxlen=80, padding='pre', truncating='pre')

In [5]:
# Network: word embedding -> two stacked LSTM layers -> one sigmoid unit.
model = Sequential([
    # Turns each integer word index into a dense, fixed-size vector, which is
    # a better input for a neural network than raw indices. 20000 is the
    # vocabulary size (only the top 20,000 words were loaded) and 128 is the
    # dimension of each output vector.
    Embedding(20000, 128),
    # return_sequences=True makes the first LSTM emit an output per timestep,
    # so the second LSTM receives a sequence rather than a single vector.
    LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    # Single sigmoid unit: one value in [0, 1] for the binary liked / not
    # liked label.
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy pairs with the single sigmoid output; accuracy is
# tracked alongside the loss.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         2560000   
_________________________________________________________________
lstm (LSTM)                  (None, None, 128)         131584    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 2,823,297
Trainable params: 2,823,297
Non-trainable params: 0
_________________________________________________________________


In [6]:
# Train for 5 epochs in mini-batches of 32, logging one line per epoch.
# NOTE: the test split is reused as validation data, so the val_* numbers
# reported per epoch are test-set metrics. No callbacks are passed, so they
# are only monitored and do not influence training.
# The returned History is kept so the per-epoch metrics (history.history) can
# be inspected later, and so the cell does not echo the object's repr.
history = model.fit(x_train, y_train,
                    batch_size=32,
                    epochs=5,
                    verbose=2,
                    validation_data=(x_test, y_test))

Epoch 1/5
782/782 - 709s - loss: 0.4393 - accuracy: 0.7946 - val_loss: 0.3849 - val_accuracy: 0.8379
Epoch 2/5
782/782 - 741s - loss: 0.2624 - accuracy: 0.8942 - val_loss: 0.4239 - val_accuracy: 0.8250
Epoch 3/5
782/782 - 738s - loss: 0.1715 - accuracy: 0.9368 - val_loss: 0.4346 - val_accuracy: 0.8237
Epoch 4/5
782/782 - 738s - loss: 0.1173 - accuracy: 0.9586 - val_loss: 0.5433 - val_accuracy: 0.8180
Epoch 5/5
782/782 - 752s - loss: 0.0810 - accuracy: 0.9725 - val_loss: 0.5695 - val_accuracy: 0.8009


<tensorflow.python.keras.callbacks.History at 0x1ba102a3648>

In [7]:
# Score the trained model on the test split. With the loss and the single
# 'accuracy' metric compiled into the model, evaluate() returns the loss
# followed by the accuracy.
test_metrics = model.evaluate(x_test, y_test, batch_size=32, verbose=2)
score, acc = test_metrics
print('Test score:', score)
print('Test accuracy:', acc)

782/782 - 26s - loss: 0.5695 - accuracy: 0.8009
Test score: 0.5694915056228638
Test accuracy: 0.8009200096130371


In [9]:
# Sanity-check the model on a single review. This is the first *training*
# example, so the model has already seen it; it demonstrates the prediction
# API rather than generalisation.
data = np.reshape(x_train[0], (1, -1))  # add a batch axis: one row holding the review
pred = model.predict(data)

# The model has one sigmoid output unit, so pull that single value out as a
# plain float instead of relying on the truthiness of a (1, 1) array.
liked_probability = float(pred[0, 0])

# Sigmoid outputs at or above 0.5 are read as "liked".
print("Prediction: Liked" if liked_probability >= 0.5 else "Prediction: Not Liked")
print("Real: Liked" if y_train[0] == 1 else "Real: Not Liked")

Prediction: Liked
Real: Liked
