In [1]:
'''Trains an LSTM model on the IMDB sentiment classification task.

The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF + LogReg.

# Notes

- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.

- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
'''
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb


Using TensorFlow backend.


In [3]:
from keras.callbacks import TensorBoard

from time import time

In [4]:
max_features = 20000
# cut texts after this number of words (among top max_features most common words)
maxlen = 170 #80
batch_size = 32


In [5]:
print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')


Loading data...
25000 train sequences
25000 test sequences


In [6]:
x_test.shape

(25000,)

In [7]:
x_test

array([list([1, 591, 202, 14, 31, 6, 717, 10, 10, 18142, 10698, 5, 4, 360, 7, 4, 177, 5760, 394, 354, 4, 123, 9, 1035, 1035, 1035, 10, 10, 13, 92, 124, 89, 488, 7944, 100, 28, 1668, 14, 31, 23, 27, 7479, 29, 220, 468, 8, 124, 14, 286, 170, 8, 157, 46, 5, 27, 239, 16, 179, 15387, 38, 32, 25, 7944, 451, 202, 14, 6, 717]),
       list([1, 14, 22, 3443, 6, 176, 7, 5063, 88, 12, 2679, 23, 1310, 5, 109, 943, 4, 114, 9, 55, 606, 5, 111, 7, 4, 139, 193, 273, 23, 4, 172, 270, 11, 7216, 10626, 4, 8463, 2801, 109, 1603, 21, 4, 22, 3861, 8, 6, 1193, 1330, 10, 10, 4, 105, 987, 35, 841, 16873, 19, 861, 1074, 5, 1987, 17975, 45, 55, 221, 15, 670, 5304, 526, 14, 1069, 4, 405, 5, 2438, 7, 27, 85, 108, 131, 4, 5045, 5304, 3884, 405, 9, 3523, 133, 5, 50, 13, 104, 51, 66, 166, 14, 22, 157, 9, 4, 530, 239, 34, 8463, 2801, 45, 407, 31, 7, 41, 3778, 105, 21, 59, 299, 12, 38, 950, 5, 4521, 15, 45, 629, 488, 2733, 127, 6, 52, 292, 17, 4, 6936, 185, 132, 1988, 5304, 1799, 488, 2693, 47, 6, 392, 173, 4, 2, 4378,

In [8]:
y_test.shape

(25000,)

In [9]:
y_test

array([0, 1, 1, ..., 0, 0, 0])

In [10]:
print('Pad sequences (samples x time)')
x_train2 = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test2 = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train2 shape:', x_train2.shape)
print('x_test2 shape:', x_test2.shape)


Pad sequences (samples x time)
x_train2 shape: (25000, 170)
x_test2 shape: (25000, 170)


In [11]:
x_test

array([list([1, 591, 202, 14, 31, 6, 717, 10, 10, 18142, 10698, 5, 4, 360, 7, 4, 177, 5760, 394, 354, 4, 123, 9, 1035, 1035, 1035, 10, 10, 13, 92, 124, 89, 488, 7944, 100, 28, 1668, 14, 31, 23, 27, 7479, 29, 220, 468, 8, 124, 14, 286, 170, 8, 157, 46, 5, 27, 239, 16, 179, 15387, 38, 32, 25, 7944, 451, 202, 14, 6, 717]),
       list([1, 14, 22, 3443, 6, 176, 7, 5063, 88, 12, 2679, 23, 1310, 5, 109, 943, 4, 114, 9, 55, 606, 5, 111, 7, 4, 139, 193, 273, 23, 4, 172, 270, 11, 7216, 10626, 4, 8463, 2801, 109, 1603, 21, 4, 22, 3861, 8, 6, 1193, 1330, 10, 10, 4, 105, 987, 35, 841, 16873, 19, 861, 1074, 5, 1987, 17975, 45, 55, 221, 15, 670, 5304, 526, 14, 1069, 4, 405, 5, 2438, 7, 27, 85, 108, 131, 4, 5045, 5304, 3884, 405, 9, 3523, 133, 5, 50, 13, 104, 51, 66, 166, 14, 22, 157, 9, 4, 530, 239, 34, 8463, 2801, 45, 407, 31, 7, 41, 3778, 105, 21, 59, 299, 12, 38, 950, 5, 4521, 15, 45, 629, 488, 2733, 127, 6, 52, 292, 17, 4, 6936, 185, 132, 1988, 5304, 1799, 488, 2693, 47, 6, 392, 173, 4, 2, 4378,

In [12]:
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128, name="input_layer"))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid', name="output_layer"))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


Build model...


In [16]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (Embedding)      (None, None, 128)         2560000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
output_layer (Dense)         (None, 1)                 129       
Total params: 2,691,713
Trainable params: 2,691,713
Non-trainable params: 0
_________________________________________________________________


In [15]:
tensorboard = TensorBoard(log_dir="logs/{}".format(time()))

In [20]:
print('Train...')
model.fit(x_train2, y_train,
          batch_size=batch_size,
          epochs=15,
          validation_data=(x_test2, y_test),
          verbose=1,
          callbacks = [tensorboard])

score, acc = model.evaluate(x_test2, y_test,
          batch_size=batch_size)

print('Test score:', score)
print('Test accuracy:', acc)


Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test score: 0.9127236043155194
Test accuracy: 0.84076


In [None]:
!tensorboard --logdir=logs/

In [13]:
# layer names
import tensorflow as tf
import keras as K

sess = K.backend.get_session()
for n in sess.graph.as_graph_def().node:
    if 'input_layer' in n.name:
          print(n.name)
    if 'output_layer' in n.name:
          print(n.name)

input_layer_input
input_layer/random_uniform/shape
input_layer/random_uniform/min
input_layer/random_uniform/max
input_layer/random_uniform/RandomUniform
input_layer/random_uniform/sub
input_layer/random_uniform/mul
input_layer/random_uniform
input_layer/embeddings
input_layer/embeddings/Assign
input_layer/embeddings/read
input_layer/Cast
input_layer/embedding_lookup/axis
input_layer/embedding_lookup
output_layer/random_uniform/shape
output_layer/random_uniform/min
output_layer/random_uniform/max
output_layer/random_uniform/RandomUniform
output_layer/random_uniform/sub
output_layer/random_uniform/mul
output_layer/random_uniform
output_layer/kernel
output_layer/kernel/Assign
output_layer/kernel/read
output_layer/Const
output_layer/bias
output_layer/bias/Assign
output_layer/bias/read
output_layer/MatMul
output_layer/BiasAdd
output_layer/Sigmoid
output_layer_target
output_layer_sample_weights
loss/output_layer_loss/Const
loss/output_layer_loss/sub/x
loss/output_layer_loss/sub
loss/output_

In [14]:
# save the graph model
builder = tf.saved_model.builder.SavedModelBuilder("imdb-lstm-model7")
builder.add_meta_graph_and_variables(sess, ["server"])
builder.save()


INFO:tensorflow:No assets to save.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: b'imdb-lstm-model7/saved_model.pb'


b'imdb-lstm-model7/saved_model.pb'

In [15]:
# save the graph model in hdf5
model.save('imdb-lstm-model7.h5')

In [16]:
# original word index
word_index = imdb.get_word_index()

In [17]:
# word index example
word_index["woods"]

1408

In [18]:
# from word index back to words
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

In [19]:
# example back to words
decode_review(x_test[0])

"the wonder own as by is sequence i i jars roses to of hollywood br of down shouting getting boring of ever it sadly sadly sadly i i was then does don't close faint after one carry as by are be favourites all family turn in does as three part in another some to be probably with world uncaring her an have faint beginning own as is sequence"

In [20]:
sample_data = [
    "this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert  is an amazing actor and now the same being director  father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for  and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also  to the two little boy's that played the  of norman and paul they were just brilliant children are often left out of the  list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you think the whole story was so lovely because it was true and was someone's life after all that was shared with us all",
    "To put this in context, I am 34 years old and I have to say that this is the best film I have seen without doubt and I don't expect it will be beaten as far as I am concerned. Obviously times move on, and I acknowledge that due to its violence and one particularly uncomfortable scene this film is not for everyone, but I still remember watching it for the first time, and it blew me away. Anyone who watches it now has to remember that it actually changed the history of cinema. In context- it followed a decade or more of action films that always ended with a chase sequence where the hero saved the day - you could have written those films yourself. Pulp had you gripped and credited the audience with intelligence. There is not a line of wasted dialogue and the movie incorporates a number of complexities that are not immediately obvious. It also resurrected the career of Grease icon John Travolta and highlighted the acting talent of Samuel L Jackson. There are many films now that are edited out of sequence and have multiple plots etc but this is the one they all want to be, or all want to beat, but never will.",
    "Viewers are taken on a ride through three different stories that entertwine together around the world of Marcellus Wallace. Quentin Tarantino proves that he is the master of witty dialogue and a fast plot that doesn't allow the viewer a moment of boredom or rest. From the story of two hit-man on a job, to a fixed boxing match to a date between a hit-man and the wife of a mob boss. There was definitely a lot of care into the writing of the script, as everything no matter the order it is in, fits with the story. Many mysteries have been left such as what is inside of the briefcase and why Marcellus Wallace has a band-aid on the back of his neck, which may be connected. The movie redefined the action genre and reinvigorated the careers of both John Travolta and Bruce Willis. This movie is required viewing for any fan of film.",
    "Was it the money? Did he owe someone a favour? Cage why are you in this terrible movie ? Left Behind is even worse than the Kirk Cameron version ,which also sucked. The book I am sure is better, since many have read it. This movie is a career killer ...it is sad to see Cage left behind by Hollywood. The acting is bad , Cage tries hard to make work out of a script that was seemingly penned by 8 year old's . I mean the dialogue is terrible. Why waste your money making a film that is just bad. Cage needs to really rethink his career . Honestly I felt ashamed for Nick . Wild at Heart , Leaving Las Vegas, and Oscar winner... My God man have you no self respect? Nick you are better than this.",
    "This film is so banal it takes the banality out of The Banal. The premise is absurd. The unravelling of the plot is absurd. The performances are, at best, distracted as is the direction. Even the actors seem unconvinced. I'm still kicking myself for watching this film. Sin and purity are depicted with juvenile simplicity. Religious bias is arrogantly displayed (but I'll leave that to the unfortunate viewer to spot). Complex ideas like forgiveness, atonement and judgement are reduced to idiocy at an atomic level. I apologise if this sounds too much like a rant but for someone who watches well over 15 films a week, I have never subjected myself to anything this offensive (to the senses) in over 25 years.",
]

In [21]:
# Create a tokenizer to preprocess our text descriptions
tokenizer = K.preprocessing.text.Tokenizer(num_words=max_features, char_level=False)
# tokenize.fit_on_texts(sample_data) # only fit on train
tokenizer.word_index = word_index

In [22]:
sample_bow = tokenizer.texts_to_matrix(sample_data)

In [23]:
sample_bow

array([[0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.]])

In [24]:
sample_bow.shape

(5, 20000)

In [25]:
# word embeddings of sample data
sample_embed = tokenizer.texts_to_sequences(sample_data)
sample_embed_padded = K.preprocessing.sequence.pad_sequences(sample_embed, maxlen=maxlen, padding="post")

print(sample_embed)
print(sample_embed_padded)
sample_embed_padded.shape

[[11, 19, 13, 40, 527, 970, 1619, 1382, 62, 455, 4465, 63, 3938, 1, 170, 33, 253, 2, 22, 97, 40, 835, 109, 47, 667, 6, 32, 477, 281, 2, 147, 1, 169, 109, 164, 333, 382, 36, 1, 169, 4533, 1108, 14, 543, 35, 10, 444, 1, 189, 47, 13, 3, 144, 2022, 16, 11, 19, 1, 1917, 4610, 466, 1, 19, 68, 84, 9, 13, 40, 527, 35, 73, 12, 10, 1244, 1, 19, 14, 512, 14, 9, 13, 623, 15, 2, 59, 383, 9, 5, 313, 5, 103, 2, 1, 2220, 5241, 13, 477, 63, 3782, 30, 1, 127, 9, 13, 35, 616, 2, 22, 121, 48, 33, 132, 45, 22, 1412, 30, 3, 19, 9, 212, 25, 74, 49, 2, 11, 404, 13, 79, 5, 1, 104, 114, 5949, 12, 253, 1, 4, 3763, 2, 720, 33, 68, 40, 527, 473, 23, 397, 314, 43, 4, 1, 1026, 10, 101, 85, 1, 378, 12, 294, 95, 29, 2068, 53, 23, 138, 3, 191, 7483, 15, 1, 223, 19, 18, 131, 473, 23, 477, 2, 141, 27, 5532, 15, 48, 33, 25, 221, 89, 22, 101, 1, 223, 62, 13, 35, 1331, 85, 9, 13, 280, 2, 13, 4469, 110, 100, 29, 12, 13, 5342, 16, 175, 29], [5, 273, 11, 8, 2005, 10, 241, 12542, 150, 151, 2, 10, 25, 5, 132, 12, 11, 6, 1, 115, 

(5, 170)

In [26]:
# load saved keras model in h5
from keras.models import load_model
loaded_model = load_model('imdb-lstm-model7.h5')

In [27]:
loaded_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (Embedding)      (None, None, 128)         2560000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
output_layer (Dense)         (None, 1)                 129       
Total params: 2,691,713
Trainable params: 2,691,713
Non-trainable params: 0
_________________________________________________________________


In [28]:
predictions = loaded_model.predict(sample_embed_padded)

In [29]:
predictions.shape

(5, 1)

In [30]:
for i in range(len(sample_data)):
    prob = predictions[i]
    print(sample_data[i])
    print('Positive probability: ', prob[0], '\n')

this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert  is an amazing actor and now the same being director  father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for  and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also  to the two little boy's that played the  of norman and paul they were just brilliant children are often left out of the  list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you think the w