In [1]:
from keras.datasets import imdb

Using TensorFlow backend.


In [2]:
vocabulary_size = 5000

(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = vocabulary_size)
print('Loaded dataset with {} training samples, {} test samples'.format(len(X_train), len(X_test)))

Loaded dataset with 25000 training samples, 25000 test samples


In [3]:
print('---review---')
print(X_train[6])
print('---label---')
print(y_train[6])

---review---
[1, 2, 365, 1234, 5, 1156, 354, 11, 14, 2, 2, 7, 1016, 2, 2, 356, 44, 4, 1349, 500, 746, 5, 200, 4, 4132, 11, 2, 2, 1117, 1831, 2, 5, 4831, 26, 6, 2, 4183, 17, 369, 37, 215, 1345, 143, 2, 5, 1838, 8, 1974, 15, 36, 119, 257, 85, 52, 486, 9, 6, 2, 2, 63, 271, 6, 196, 96, 949, 4121, 4, 2, 7, 4, 2212, 2436, 819, 63, 47, 77, 2, 180, 6, 227, 11, 94, 2494, 2, 13, 423, 4, 168, 7, 4, 22, 5, 89, 665, 71, 270, 56, 5, 13, 197, 12, 161, 2, 99, 76, 23, 2, 7, 419, 665, 40, 91, 85, 108, 7, 4, 2084, 5, 4773, 81, 55, 52, 1901]
---label---
1


In [4]:
word2id = imdb.get_word_index()
id2word = {i: word for word, i in word2id.items()}
print('---review with words---')
print([id2word.get(i, ' ') for i in X_train[6]])
print('---label---')
print(y_train[6])

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json
---review with words---
[u'the', u'and', u'full', u'involving', u'to', u'impressive', u'boring', u'this', u'as', u'and', u'and', u'br', u'villain', u'and', u'and', u'need', u'has', u'of', u'costumes', u'b', u'message', u'to', u'may', u'of', u'props', u'this', u'and', u'and', u'concept', u'issue', u'and', u'to', u"god's", u'he', u'is', u'and', u'unfolds', u'movie', u'women', u'like', u"isn't", u'surely', u"i'm", u'and', u'to', u'toward', u'in', u"here's", u'for', u'from', u'did', u'having', u'because', u'very', u'quality', u'it', u'is', u'and', u'and', u'really', u'book', u'is', u'both', u'too', u'worked', u'carl', u'of', u'and', u'br', u'of', u'reviewer', u'closer', u'figure', u'really', u'there', u'will', u'and', u'things', u'is', u'far', u'this', u'make', u'mistakes', u'and', u'was', u"couldn't", u'of', u'few', u'br', u'of', u'you', u'to', u"don't", u'female', u'than', u'place', u'she', u'to', u'was', u

In [5]:
print('Maximum review length: {}'.format(
len(max((X_train + X_test), key=len))))

Maximum review length: 2697


In [6]:
print('Minimum review length: {}'.format(
len(min((X_test + X_test), key=len))))

Minimum review length: 14


In [7]:
from keras.preprocessing import sequence

max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

Design an RNN model for sentiment analysis

We start building our model architecture in the code cell below. We have imported some layers from Keras that you might need but feel free to use any other layers / transformations you like.

Remember that our input is a sequence of words (technically, integer word IDs) of maximum length = max_words, and our output is a binary sentiment label (0 or 1).

In [8]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

In [9]:
embedding_size=32
model=Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None


In [10]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [11]:
batch_size = 64
num_epochs = 3

X_valid, y_valid = X_train[:batch_size], y_train[:batch_size]
X_train2, y_train2 = X_train[batch_size:], y_train[batch_size:]

model.fit(X_train2, y_train2, validation_data=(X_valid, y_valid), batch_size=batch_size, epochs=num_epochs)

Train on 24936 samples, validate on 64 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f98aef24290>

In [12]:
scores = model.evaluate(X_test, y_test, verbose=0)
print('Test accuracy:', scores[1])

('Test accuracy:', 0.79412)
