In [None]:
# Notebook magic asking the runtime for the TensorFlow 2.x line.
# NOTE(review): presumably specific to hosted Colab runtimes — confirm it is still needed where this notebook runs.
%tensorflow_version 2.x

# Dataset

We're using the IMDB movie review dataset from Keras. It contains 25,000 reviews, each labeled positive or negative. Each review is encoded as a sequence of integers that represent how common each word is across the entire dataset: a word encoded by the integer 3 is the third most common word in the dataset.

In [None]:
from keras.datasets import imdb
from keras.preprocessing import sequence
import tensorflow as tf
import os
import numpy as np

In [None]:
VOCAB_SIZE = 88584  # Number of unique words in the dataset; also used as the Embedding layer's input size

MAXLEN = 250        # Length every review is brought to before it is fed to the network
BATCH_SIZE = 64     # NOTE(review): never passed to model.fit in this notebook — confirm whether it should be

# Load the reviews using the same integer ids that imdb.get_word_index() returns.
# With its defaults, load_data shifts every word id by index_from=3 and reserves
# 1 (start-of-review marker) and 2 (out-of-vocabulary marker). The helpers that
# encode/decode custom reviews later in this notebook look words up in the
# unshifted word index, so with the defaults the model would be trained on one
# numbering and queried with another.
# Disabling the shift and both markers keeps 0 free for padding and makes
# "integer k" mean "k-th most common word" everywhere. With oov_char=None, word
# ids >= num_words are dropped from the reviews instead of being replaced.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words = VOCAB_SIZE,
    index_from = 0,
    start_char = None,
    oov_char = None)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


The reviews are of different lengths and we can't pass different length data into our network, hence they must be made the same length.

In [None]:
# Bring every review in both splits to exactly MAXLEN integers so the network
# receives fixed-size inputs.
train_data, test_data = (
    sequence.pad_sequences(split, maxlen = MAXLEN)
    for split in (train_data, test_data)
)

# Creating the Model

The model and its different layers are showcased below. 32 is the output dimension of each vector produced by the embedding layer. The same value is used in the LSTM because each word is represented by 32 dimensions.

In [None]:
# Stack the network one layer at a time:
#   1. Embedding: turns each word id (0 .. VOCAB_SIZE-1) into a 32-dimensional vector
#   2. LSTM:      reads the sequence of word vectors, 32 units
#   3. Dense:     a single sigmoid unit giving a score between 0 and 1
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim = VOCAB_SIZE, output_dim = 32))
model.add(tf.keras.layers.LSTM(units = 32))
model.add(tf.keras.layers.Dense(units = 1, activation = 'sigmoid'))

In [None]:
# Print an overview of the layers stacked in the model defined above.
model.summary()

# Training the Model

In [None]:
# Binary cross-entropy matches the single sigmoid output; 'acc' reports accuracy.
model.compile(
    optimizer = 'rmsprop',
    loss = 'binary_crossentropy',
    metrics = ['acc'],
)

# Train for 5 epochs, holding back 20% of the training reviews for validation.
# The returned History object is kept in `history`.
history = model.fit(
    x = train_data,
    y = train_labels,
    epochs = 5,
    validation_split = 0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Score the trained model on the test reviews.
# The printed list holds the loss followed by the compiled metric ('acc').
results = model.evaluate(x = test_data, y = test_labels)
print(results)

[0.48024505376815796, 0.8482400178909302]


# Making Predictions

Since our reviews are encoded, we'll need to convert any review that we write into that form so the network can understand it. To do that, we'll load the encodings from the dataset and use them to encode our own data.

In [None]:
word_index = imdb.get_word_index()  # word -> integer id mapping that ships with the IMDB dataset

def encode_text(text):
  """Encode a raw review string as a fixed-length vector of word ids.

  The text is split into lowercase words, each word is replaced by its id
  from `word_index`, and the result is brought to MAXLEN entries.

  Words that cannot be represented are encoded as 0 (the padding value):
    * words missing from `word_index`, and
    * words whose id is >= VOCAB_SIZE, because the Embedding layer only has
      VOCAB_SIZE rows and an id outside that range cannot be looked up.

  Args:
    text: the review as a plain string.

  Returns:
    A 1-D integer array of length MAXLEN.
  """
  words = tf.keras.preprocessing.text.text_to_word_sequence(text)  # split the text into a list of words
  tokens = []
  for word in words:
    index = word_index.get(word, 0)  # 0 stands for "I don't know this word"
    # Keep only ids the Embedding layer can actually look up.
    tokens.append(index if index < VOCAB_SIZE else 0)
  return sequence.pad_sequences([tokens], MAXLEN)[0]

text = "that movie was just amazing, so amazing"
encoded = encode_text(text)
print(encoded)

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0  12  17  13  4

In [None]:
# Helper for turning an encoded review back into readable text

# Invert word_index so that each integer id maps back to its word
reverse_word_index = dict((index, word) for (word, index) in word_index.items())

def decode_integers(integers):
  """Convert a sequence of word ids back into a space-separated string.

  Entries equal to 0 are treated as padding and skipped; every other id is
  looked up in `reverse_word_index`.
  """
  PAD = 0
  words = [reverse_word_index[num] for num in integers if num != PAD]
  return ' '.join(words)


print(decode_integers(encoded))

that movie was just amazing so amazing


In [None]:
# To make predictions

def predict(text):
  """Classify a raw review string and print the verdict.

  The text is encoded with `encode_text`, wrapped into a batch of one, and
  passed to the trained model. The raw model output is printed, followed by
  whether the review is considered positive (score >= 0.5) or negative.

  Args:
    text: the review as a plain string.
  """
  encoded_text = encode_text(text)
  # model.predict works on batches, so add a leading batch axis. The batch is
  # built from the encoded vector itself rather than from a hard-coded length,
  # so it stays correct if MAXLEN is changed.
  pred = np.expand_dims(encoded_text, axis = 0)
  result = model.predict(pred)
  print (result[0])
  # The final Dense layer has one unit, so result[0][0] is the single score
  # for our single review; compare that scalar rather than a 1-element array.
  if result[0][0] >= 0.5:
    print('It is a positive review\n')
  else:
    print('It is a negative review')


positive_review = 'Spiderman: No way home, was a breath of fresh air, definitely the best movie of the year, marvel has finally found its footing in phase 4, I\'ll definitely be rewatching it'
predict(positive_review)

negative_review = 'Venom was, as MKBHD would say, the bust of the year, thoroughly disappointing, lack of story, presentation or any depth, just reliant on humour and spectacular acting from it\'s lead actor'
predict(negative_review)

[0.9947931]
It is a positive review

[0.11129114]
It is a negative review
