In [None]:
# Placeholder cell: setup needed only for the GPU version of this notebook goes here.

In [1]:
import numpy as np
# we need to fit model with sequence of tokens with specific length
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
# normal LSTM/ GRU and version with Cuda
from keras.layers import Dense, Embedding, GRU, LSTM, CuDNNGRU, CuDNNLSTM, Dropout
from keras.datasets import imdb
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
# Vocabulary cap passed to imdb.load_data (integer or None): only the `num_words` most frequent
# words are considered; any less frequent word appears as the oov_char value in the sequence data.
num_words = 20000

In [3]:
# IMDB movie reviews labeled by sentiment (positive/negative). The reviews come preprocessed:
# each one is encoded as a sequence of integer word indexes.
train_split, test_split = imdb.load_data(num_words=num_words)
X_train, Y_train = train_split
X_test, Y_test = test_split

In [4]:
# Number of reviews in the training split and in the test split, one count per line.
print(len(X_train), len(X_test), sep='\n')

25000
25000


In [5]:
# Peek at the first training review in its raw form: a list of integer word indexes.
print(X_train[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]


In [6]:
 # Maximum sequence length in tokens: pad_sequences pads shorter reviews and truncates longer ones to this length.
max_len = 256
# number of samples per batch, passed to model.fit (this is not the embedding vector size)
batch_size = 128
# upper bound on the number of training epochs, passed to model.fit
n_epochs = 10

In [None]:
# The ratio of vocabulary size to embedding length doesn't really matter when determining the size of the
# other layers in a neural network. Word embeddings are typically between 100 and 300 in length: longer
# embedding vectors don't add enough information and smaller ones don't represent the semantics well enough.
# What matters more is the network architecture, the algorithm(s) and the dataset size.

# A simple way to understand this concept is that a bidirectional LSTM model with 50 neurons (nodes) followed by a
# fully connected layer of 70 neurons will outperform a simple MLP of 1000 neurons (nodes) connected to an embedding
# layer, simply due to its architecture. Adding dropout will improve performance as well.

# In addition, even if the vocabulary is just 300 words, using pre-trained embeddings will probably yield better
# results than training the embeddings directly on the dataset. The same applies to data size: a dataset with more
# samples will make a better classifier than a dataset with just a couple thousand samples.

# In summary, it is preferable to try many architectures and cross-validate them (and/or ensemble them, if you
# have a large enough dataset) with the smallest number of neurons possible and then start building up in size,
# depending on what computational resources you have and the speed of development you need. Large models slow down
# development speed whereas small models speed it up. This holds whether your vocabulary is the size of Common Crawl
# or just 300. As usual, try feature engineering (sentence length, special characters, etc.) and increase the dataset
# size, as doing so often helps in whatever task you're trying to predict.


# size of the vector that represents each word (used as output_dim of the Embedding layer)
# NOTE(review): 10 is well below the 100-300 range discussed above - presumably kept small to
# keep the model light; confirm this is intentional.
embedding_size = 10

In [7]:
# Side used by pad_sequences for BOTH padding and truncating (it is passed to both arguments).
pad = 'pre' # alternative: 'post'

In [8]:
# Bring every review to exactly max_len (256) tokens: shorter sequences are filled with zeros,
# longer ones are cut, both on the side selected by `pad`.
X_train_pad, X_test_pad = (
    pad_sequences(split, maxlen=max_len, padding=pad, truncating=pad)
    for split in (X_train, X_test)
)

In [9]:
# Sanity check: the padded training data should be 2-D, (number of reviews, max_len).
X_train_pad.shape

(25000, 256)

In [10]:
# For comparison, the unpadded training data is 1-D: one entry per review.
X_train.shape

(25000,)

In [11]:
# Start from an empty Sequential model; the following cells append its layers with model.add.
# Re-run from this cell when rebuilding: re-running a model.add cell alone appends a duplicate layer.
model = Sequential()

In [12]:
# Embedding layer: turns each word index into a dense vector.
#   input_dim    - number of words in the vocabulary
#   output_dim   - size of the vector for each word
#   input_length - length of each input sequence (padded review)
#   name         - name given to the layer
model.add(
    Embedding(
        input_dim=num_words,
        output_dim=embedding_size,
        input_length=max_len,
        name='layer_embedding',
    )
)

# Dropout with rate 0.2 applied to the embedding output.
model.add(Dropout(rate=0.2))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [13]:
# Recurrent layer: an LSTM with 128 units (the unit count does not need to match batch_size).
# The default activation function for LSTM is tanh.
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))

# GPU alternative (CuDNN version of the LSTM), kept for reference:
# model.add(CuDNNLSTM(128, return_sequences=False))
# model.add(Dropout(0.2))

In [14]:
# Output layer: a single sigmoid unit, because this is a binary classification problem.
model.add(Dense(units=1, activation='sigmoid', name='Classification'))

In [15]:
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 256, 10)           200000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256, 10)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               71168     
_________________________________________________________________
Classification (Dense)       (None, 1)                 129       
Total params: 271,297
Trainable params: 271,297
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Configure training. The choice of optimizer is itself a hyperparameter that can be tuned.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [17]:
# Monitor the validation loss and stop training once it has not improved for 5 consecutive epochs.
# restore_best_weights=True rolls the model back to the weights of the best val_loss epoch when
# training is stopped early; without it the model keeps the weights from 5 non-improving epochs
# later, and those are what would be evaluated on the test set.
# (restore_best_weights requires Keras >= 2.2.3.)
callback_early_stopping = EarlyStopping(monitor='val_loss',
                                        patience=5,
                                        restore_best_weights=True,
                                        verbose=1)

In [18]:
%%time
# use 0.05 of train data for validation set
model.fit(X_train_pad, Y_train, epochs=n_epochs, batch_size=batch_size, validation_split=0.05, callbacks=[callback_early_stopping])

Instructions for updating:
Use tf.cast instead.
Train on 23750 samples, validate on 1250 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 39min 58s, sys: 2min 29s, total: 42min 28s
Wall time: 14min 8s


<keras.callbacks.History at 0x7f7153c3a8d0>

In [20]:
%%time

# Score the trained model on the padded test set; eval_ holds the loss followed by the accuracy.
eval_ = model.evaluate(X_test_pad, Y_test)

CPU times: user 1min 11s, sys: 2.95 s, total: 1min 14s
Wall time: 27.4 s


In [21]:
# Test-set results in order: loss, accuracy.
print(*eval_[:2])

0.4016187694072723 0.84424
