In [1]:
%%capture capt
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb

## Import IMDB dataset

In [2]:
INDEX_FROM = 3
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words, index_from=INDEX_FROM)

In [3]:
word_to_id = imdb.get_word_index()
word_to_id = {k:(v+INDEX_FROM) for k,v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2

id_to_word = {value:key for key,value in word_to_id.items()}

In [4]:
# print first sentence in X_train
sentence = [id_to_word[word] for word in X_train[0]]
print(' '.join(sentence))

<START> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for <UNK> and would recommend it to everyone to watch and the fly <UNK> was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also <UNK> to the two little <UNK> that played the <UNK> of norman and paul they were just brilliant children are often left out of the <UNK> list i think because the stars that play them all grown up are such a big <UNK> for the whole film but these children are amazing and should be <UNK> for what they

## Pre-processing

In [5]:
from tensorflow.keras.preprocessing import sequence

In [6]:
mean_review_length = int(np.mean([len(x) for x in X_train]))
max_review_length = mean_review_length
print(f"Longest sequence is {max_review_length}")

Longest sequence is 238


In [7]:
X_train = sequence.pad_sequences(X_train, padding='post', maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, padding='post', maxlen=max_review_length)

## LSTM model

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Dropout, Input, LSTM, Reshape, Lambda, RepeatVector
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [9]:
nb_words = max([max(x) for x in X_train]) + 1
embedding_vector_length = 256

In [10]:
model = Sequential()
model.add(Embedding(nb_words, embedding_vector_length))

model.add(LSTM(units=128, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(units=64))
model.add(Dropout(0.4))

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.4))

model.add(Dense(1, activation='sigmoid'))

In [11]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 256)         1280000   
                                                                 
 lstm (LSTM)                 (None, None, 128)         197120    
                                                                 
 dropout (Dropout)           (None, None, 128)         0         
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0

In [12]:
model.compile(loss='binary_crossentropy', 
              optimizer=tf.keras.optimizers.Adam(learning_rate = 1e-4), 
              metrics=['accuracy'])

In [13]:
filepath = 'model.h5'
checkpoint = ModelCheckpoint(filepath=filepath, 
                             monitor='val_loss',
                             verbose=1, 
                             save_best_only=True,
                             mode='min')

es = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

callbacks = [checkpoint, 
             es
            ]

In [14]:
model.fit(X_train, y_train, epochs=1000, batch_size=128, validation_data=(X_test, y_test), callbacks=callbacks)

Epoch 1/1000
Epoch 1: val_loss improved from inf to 0.69252, saving model to model.h5
Epoch 2/1000
Epoch 2: val_loss improved from 0.69252 to 0.58209, saving model to model.h5
Epoch 3/1000
Epoch 3: val_loss improved from 0.58209 to 0.36773, saving model to model.h5
Epoch 4/1000
Epoch 4: val_loss improved from 0.36773 to 0.33463, saving model to model.h5
Epoch 5/1000
Epoch 5: val_loss did not improve from 0.33463
Epoch 6/1000
Epoch 6: val_loss did not improve from 0.33463
Epoch 7/1000
Epoch 7: val_loss improved from 0.33463 to 0.33038, saving model to model.h5
Epoch 8/1000
Epoch 8: val_loss did not improve from 0.33038
Epoch 9/1000
Epoch 9: val_loss did not improve from 0.33038
Epoch 10/1000
Epoch 10: val_loss did not improve from 0.33038
Epoch 11/1000
Epoch 11: val_loss did not improve from 0.33038
Epoch 12/1000
Epoch 12: val_loss did not improve from 0.33038
Epoch 12: early stopping


<keras.callbacks.History at 0x144a2b5ff40>