In [4]:
from keras.datasets import imdb
from keras.preprocessing import sequence
import tensorflow as tf
import os
import numpy as np

In [6]:
vocab_size = 88000

maxlen = 250
batch_size = 64

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words = vocab_size)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [7]:
train_data[5]

[1,
 778,
 128,
 74,
 12,
 630,
 163,
 15,
 4,
 1766,
 7982,
 1051,
 43222,
 32,
 85,
 156,
 45,
 40,
 148,
 139,
 121,
 664,
 665,
 10,
 10,
 1361,
 173,
 4,
 749,
 86588,
 16,
 3804,
 8,
 4,
 226,
 65,
 12,
 43,
 127,
 24,
 15344,
 10,
 10]

In [8]:
train_data = sequence.pad_sequences(train_data, maxlen)
test_data = sequence.pad_sequences(test_data, maxlen)

In [10]:
#model

model = tf.keras.Sequential([
                             tf.keras.layers.Embedding(vocab_size, 32),
                             tf.keras.layers.LSTM(32),
                             tf.keras.layers.Dense(1, activation = "sigmoid")
      ])

In [11]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          2816000   
_________________________________________________________________
lstm (LSTM)                  (None, 32)                8320      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 2,824,353
Trainable params: 2,824,353
Non-trainable params: 0
_________________________________________________________________


In [12]:
model.compile(loss = "binary_crossentropy", optimizer = "rmsprop", metrics = ['accuracy'])

In [14]:
history = model.fit(train_data, train_labels, epochs = 10, validation_split = 0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
final_result = model.evaluate(test_data, test_labels)
print(final_result)

[0.5943259596824646, 0.8475199937820435]


In [19]:
#predictions
from tensorflow import keras
word_index = imdb.get_word_index()

def encode_text(txt):
  tokens = keras.preprocessing.text.text_to_word_sequence(txt)
  tokens = [word_index[word] if word in word_index else 0 for word in tokens]
  return sequence.pad_sequences([tokens], maxlen)[0]

In [20]:
txt = "The movie was so bad"
encoded = encode_text(txt)
print(encoded)

[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  1 17 13 35 75]


In [26]:
#decoding

reverse_word_index = {value: key for (key, value) in word_index.items()}

def decode_integers(integers):
  pad = 0
  text = ""
  for num in integers:
    if num != pad:
      text += reverse_word_index[num] + " "
  return text[:-1]

In [27]:
 print(decode_integers(encoded))

the movie was so bad


In [31]:
def predict(text):
  encoded_text = encode_text(text)
  pred = np.zeros((1, 250))
  pred[0] = encoded_text
  result = model.predict(pred)
  print(result[0])

In [44]:
pos_rev = "The movie was so great! I really loved it and would watch it again, it was amazingly great"
predict(pos_rev)

[0.8491556]


In [45]:
neg_rev = "The movie was worst. was one of the Worst"
predict(neg_rev)

[0.38163918]
