#Classifying Movie reviews as positive or negative based on IMDB dataset

In [1]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
VOCAB_SIZE = 10000   # most frequent words
MAX_LEN = 200        # length of each review
EMBEDDING_DIM = 32   # Didimmension of word embeddings
BATCH_SIZE = 32
EPOCHS = 5

In [3]:
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=VOCAB_SIZE)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [4]:
train_data = pad_sequences(train_data, maxlen=MAX_LEN)
test_data = pad_sequences(test_data, maxlen=MAX_LEN)

In [5]:
# model
model = tf.keras.Sequential([

    tf.keras.layers.Embedding( input_dim=VOCAB_SIZE,
                              output_dim=EMBEDDING_DIM,
                               input_length=MAX_LEN ), # embedding layer will turn word indices into dense vectors

    tf.keras.layers.GlobalAveragePooling1D(),  # Better than Flatten for sequences
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])



In [6]:
model.compile( optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [7]:
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)

In [8]:
history = model.fit(
    train_data, train_labels, epochs=10, batch_size=32, validation_data=(test_data, test_labels), callbacks=[early_stop]
)

Epoch 1/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - accuracy: 0.6780 - loss: 0.5779 - val_accuracy: 0.8506 - val_loss: 0.3358
Epoch 2/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.8888 - loss: 0.2686 - val_accuracy: 0.8438 - val_loss: 0.3471
Epoch 3/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.9065 - loss: 0.2391 - val_accuracy: 0.8592 - val_loss: 0.3319
Epoch 4/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9348 - loss: 0.1781 - val_accuracy: 0.8728 - val_loss: 0.3179
Epoch 5/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.9452 - loss: 0.1582 - val_accuracy: 0.8578 - val_loss: 0.3567
Epoch 6/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9540 - loss: 0.1357 - val_accuracy: 0.8614 - val_loss: 0.3838


In [9]:
test_loss, test_accuracy = model.evaluate(test_data, test_labels)
print(f" test score {test_accuracy:.4f} (loss: {test_loss:.2f})")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8653 - loss: 0.3763
 test score 0.8614 (loss: 0.38)


In [10]:
# word mapping
word_index = tf.keras.datasets.imdb.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [11]:
def simple_predict(review_text):
    words = review_text.lower().split()     # words to numbers
    numbers = [1]
    for i in words:
        if i in word_index and word_index[i] < 10000:
            numbers.append(word_index[i] + 3)

    padded = tf.keras.preprocessing.sequence.pad_sequences([numbers], maxlen=200)

    score = model.predict(padded)[0][0]
    if score > 0.5:
        return f"positive ({score:.2%} confidence)"
    else:
        return f"neg ({100-score*100:.2%} confidence)"

In [12]:
my_rev = "It was a really nice movie"
print(simple_predict(my_rev))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 212ms/step
positive (77.03% confidence)
