# Sentiment analysis from movie reviews

More info on the dataset is [here](https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification).

So we are going to use an RNN to do sentiment analysis on full-text movie reviews!


In [74]:
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt

Now import our training and testing data. We specify that we only care about the 20,000 most popular words in the dataset in order to keep things somewhat managable. 

In [75]:
START_CHAR = 1
OOV_CHAR = 2
INDEX_WORD_FROM = 3
NUM_WORDS = 20000
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=NUM_WORDS,
                                                                        start_char=START_CHAR,
                                                                        oov_char=OOV_CHAR,
                                                                        index_from=INDEX_WORD_FROM)

Let's get a feel for what this data looks like. Let's look at the first training feature, which should represent a written movie review:

In [76]:
x_train[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 2,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 2,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 19193,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 10311,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 2,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 12118,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5,
 144,
 30,

If you want, you can reconstruct the original text:

In [77]:
# Retrieve the word index file mapping words to indices
word_index = tf.keras.datasets.imdb.get_word_index()
# Reverse the word index to obtain a dict mapping indices to words
# And add `index_from` to indices to sync with `x_train`
inverted_word_index = dict(
    (i + INDEX_WORD_FROM, word) for (word, i) in word_index.items()
)
# Update `inverted_word_index` to include `start_char` and `oov_char`
inverted_word_index[START_CHAR] = "[START]"
inverted_word_index[OOV_CHAR] = "[OOV]"


def decode_review (seq):
    return " ".join(inverted_word_index[i] for i in seq)
    
# Decode the first sequence in the dataset
display(decode_review(x_train[0])

SyntaxError: incomplete input (1817989105.py, line 17)


So just keep in mind that each number in the training features represent some specific word. 

What do the labels look like?

In [None]:
y_train[0]

They are just 0 or 1, which indicates whether the reviewer said they liked the movie or not.

So to recap, we have a bunch of movie reviews that have been converted into vectors of words represented by integers, and a binary sentiment classification to learn from.

In [None]:
getlengths = np.vectorize(lambda x: len(x))
lengths = getlengths(x_train)

plt.hist(lengths, bins=20)

RNN's can blow up quickly, so again to keep things simple let's **limit the reviews to their first words** (not good for accuracy of course!):

In [78]:
MAX_WORDS = 200
x_train_pad = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=MAX_WORDS)
x_test_pad = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen=MAX_WORDS)

Now let's set up our neural network model!

In [83]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(NUM_WORDS, 128))
model.add(tf.keras.layers.LSTM(64))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])




**Warning**:
This may take a very long time to run.

In [84]:
model.fit(x_train_pad, y_train,
          batch_size=16,
          epochs=5,
          verbose=1,
          callbacks=[tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)],
          validation_split=0.2)

Epoch 1/5
[1m   5/1250[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m44s[0m 36ms/step - accuracy: 0.4796 - loss: 0.6926

2024-11-21 13:36:23.306751: E tensorflow/core/util/util.cc:131] oneDNN supports DT_BOOL only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.


[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 43ms/step - accuracy: 0.7412 - loss: 0.5140 - val_accuracy: 0.7618 - val_loss: 0.4908
Epoch 2/5
[1m 381/1250[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m39s[0m 45ms/step - accuracy: 0.8126 - loss: 0.4435

KeyboardInterrupt: 

In [81]:
model.summary()

OK, let's evaluate our model's accuracy:

In [63]:
score, acc = model.evaluate(x_test_pad, y_test, verbose=1)
print('Test accuracy:', acc)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 18ms/step - accuracy: 0.8261 - loss: 0.4251
Test accuracy: 0.8296399712562561


In [64]:
from sklearn.metrics import confusion_matrix

#Predict
y_prediction = model.predict(x_test_pad) >= 0.5

#Create confusion matrix and normalizes it over predicted (columns)
cmatrix = confusion_matrix(y_test, y_prediction)
cmatrix

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 21ms/step


array([[10599,  1901],
       [ 2358, 10142]])

Not too bad, considering we limited ourselves to just the first words of each review.


In [69]:
for i in range(5):
    review = decode_review(np.trim_zeros(x_test_pad[i])) # trim is used to remove padding
    prediction = model.predict(x_test_pad[i:i+1]) >= 0.5
    print(f"Prediction: {prediction} (groung truth: {y_test[i]>=0.5})")
    print(review)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Prediction: [[False]] (groung truth: False)
[START] please give this one a miss br br kristy swanson and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work out and his performance was quite lacklustre so all you madison fans give this a miss
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Prediction: [[ True]] (groung truth: True)
wonderfully written script br br i praise robert altman this is one of his many films that deals with unconventional fascinating subject matter this film is disturbing but it's sincere and it's sure to elicit a strong emotional response from the viewer if you want to see an unusual film some might even say bizarre this is worth the time br br unfortunately it's very difficult to find in video stores you may h

## Bidirectional LSTM

In [72]:
bimodel = tf.keras.models.Sequential()
bimodel.add(tf.keras.layers.Embedding(NUM_WORDS, 128))
bimodel.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
bimodel.add(tf.keras.layers.Dense(1, activation='sigmoid'))

bimodel.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

bimodel.fit(x_train_pad, y_train,
          batch_size=16,
          epochs=5,
          verbose=1,
          callbacks=[tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)],
          validation_split=0.2)

bimodel.summary()

Epoch 1/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 51ms/step - accuracy: 0.7289 - loss: 0.5215 - val_accuracy: 0.8280 - val_loss: 0.3915
Epoch 2/5
[1m 190/1250[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m52s[0m 50ms/step - accuracy: 0.9010 - loss: 0.2674

KeyboardInterrupt: 

In [82]:
bimodel2 = tf.keras.models.Sequential()
bimodel2.add(tf.keras.layers.Embedding(NUM_WORDS, 128))
bimodel2.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
bimodel2.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
bimodel2.add(tf.keras.layers.Dense(1, activation='sigmoid'))

bimodel2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

bimodel2.fit(x_train_pad, y_train,
          batch_size=16,
          epochs=5,
          verbose=1,
          callbacks=[tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)],
          validation_split=0.2)

bimodel2.summary()

Epoch 1/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 84ms/step - accuracy: 0.7153 - loss: 0.5298 - val_accuracy: 0.8364 - val_loss: 0.3952
Epoch 2/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 87ms/step - accuracy: 0.8954 - loss: 0.2660 - val_accuracy: 0.8606 - val_loss: 0.3447
Epoch 3/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 86ms/step - accuracy: 0.9469 - loss: 0.1521 - val_accuracy: 0.8560 - val_loss: 0.3622
Epoch 4/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 88ms/step - accuracy: 0.9693 - loss: 0.0877 - val_accuracy: 0.8366 - val_loss: 0.4464
