# IMDB prediction with RNN

### Libraries

In [1]:
import numpy as np
import pandas as pd
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding
from keras.layers import LSTM, SpatialDropout1D
from keras.datasets import imdb

Using TensorFlow backend.


### Load and look at data

In [2]:
# Maximum number of words (by frequency of use)
max_features = 5000

(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features)

In [3]:
pd.DataFrame(X_train).head()

Unnamed: 0,0
0,"[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, ..."
1,"[1, 194, 1153, 194, 2, 78, 228, 5, 6, 1463, 43..."
2,"[1, 14, 47, 8, 30, 31, 7, 4, 249, 108, 7, 4, 2..."
3,"[1, 4, 2, 2, 33, 2804, 4, 2040, 432, 111, 153,..."
4,"[1, 249, 1323, 7, 61, 113, 10, 10, 13, 1637, 1..."


### Fill or cut reviews

In [4]:
# Maximum length of review in words
maxlen = 90

X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

In [5]:
pd.DataFrame(X_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,80,81,82,83,84,85,86,87,88,89
0,14,407,16,82,2,8,4,107,117,2,...,4472,113,103,32,15,16,2,19,178,32
1,2,2,349,2637,148,605,2,2,15,123,...,52,154,462,33,89,78,285,16,145,95
2,360,7,4,58,316,334,11,4,1716,43,...,106,607,624,35,534,6,227,7,129,113
3,2348,537,23,53,537,21,82,40,2,13,...,26,49,2,15,566,30,579,21,64,2574
4,13,28,126,110,13,473,8,569,61,419,...,19,14,5,2,6,226,251,7,61,113


### Create network model and compile it

In [6]:
model = Sequential()

# Layer for the vector representation of words
model.add(Embedding(max_features, 32)) 
model.add(SpatialDropout1D(0.2))

# Long-term memory layer
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))

# Full bond layer
model.add(Dense(1, activation="sigmoid"))


model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

### Fit the model

In [7]:
model.fit(X_train, y_train, batch_size=64, epochs=5,
          validation_data=(X_test, y_test), verbose=2)

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
 - 76s - loss: 0.5021 - acc: 0.7502 - val_loss: 0.4040 - val_acc: 0.8189
Epoch 2/5
 - 65s - loss: 0.3695 - acc: 0.8430 - val_loss: 0.3725 - val_acc: 0.8405
Epoch 3/5
 - 64s - loss: 0.3403 - acc: 0.8563 - val_loss: 0.3644 - val_acc: 0.8438
Epoch 4/5
 - 65s - loss: 0.3115 - acc: 0.8726 - val_loss: 0.3679 - val_acc: 0.8428
Epoch 5/5
 - 65s - loss: 0.2954 - acc: 0.8818 - val_loss: 0.3733 - val_acc: 0.8387


<keras.callbacks.History at 0x1de49684f60>

### Check the quality of training on  test data

In [8]:
scores = model.evaluate(X_test, y_test,
                        batch_size=64)



In [9]:
print("Accuracy on test data: {}".format(round(scores[1] * 100, 2)))

Accuracy on test data: 83.87
