In [49]:
from keras.datasets import imdb
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing import sequence
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import nltk
#nltk.download ('punkt')
from nltk import word_tokenize

In [33]:
#initialise the size of the reviews 
size = 50000
#load data, assign and divide into test (25 000) and training sets (25 000) 
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = size) 
print('Loaded dataset with {} training samples, {} test samples'.format(len(X_train), len(X_test)))

Loaded dataset with 25000 training samples, 25000 test samples


In [34]:
## the words in the review are all integers or IDs that each represents a word 
print('---review--')
print(X_train[6])
## the label is represented as an integer 
print('--label--')
print(y_train[6])

---review--
[1, 6740, 365, 1234, 5, 1156, 354, 11, 14, 5327, 6638, 7, 1016, 10626, 5940, 356, 44, 4, 1349, 500, 746, 5, 200, 4, 4132, 11, 16393, 9363, 1117, 1831, 7485, 5, 4831, 26, 6, 2, 4183, 17, 369, 37, 215, 1345, 143, 32677, 5, 1838, 8, 1974, 15, 36, 119, 257, 85, 52, 486, 9, 6, 26441, 8564, 63, 271, 6, 196, 96, 949, 4121, 4, 2, 7, 4, 2212, 2436, 819, 63, 47, 77, 7175, 180, 6, 227, 11, 94, 2494, 33740, 13, 423, 4, 168, 7, 4, 22, 5, 89, 665, 71, 270, 56, 5, 13, 197, 12, 161, 5390, 99, 76, 23, 2, 7, 419, 665, 40, 91, 85, 108, 7, 4, 2084, 5, 4773, 81, 55, 52, 1901]
--label--
1


In [35]:
## map the review back to original words
#the following code assigns the word dictionary to a variable 
word2id = imdb.get_word_index()
id2word = {i: word for word, i in word2id.items()}
print('---review with words---')
print([id2word.get(i,' ') for i in X_train[6]])
print('---label---')
print(y_train[6])


---review with words---
['the', 'boiled', 'full', 'involving', 'to', 'impressive', 'boring', 'this', 'as', 'murdering', 'naschy', 'br', 'villain', 'council', 'suggestion', 'need', 'has', 'of', 'costumes', 'b', 'message', 'to', 'may', 'of', 'props', 'this', 'echoed', 'concentrates', 'concept', 'issue', 'skeptical', 'to', "god's", 'he', 'is', 'and', 'unfolds', 'movie', 'women', 'like', "isn't", 'surely', "i'm", 'rocketed', 'to', 'toward', 'in', "here's", 'for', 'from', 'did', 'having', 'because', 'very', 'quality', 'it', 'is', "captain's", 'starship', 'really', 'book', 'is', 'both', 'too', 'worked', 'carl', 'of', 'and', 'br', 'of', 'reviewer', 'closer', 'figure', 'really', 'there', 'will', 'originals', 'things', 'is', 'far', 'this', 'make', 'mistakes', "kevin's", 'was', "couldn't", 'of', 'few', 'br', 'of', 'you', 'to', "don't", 'female', 'than', 'place', 'she', 'to', 'was', 'between', 'that', 'nothing', 'dose', 'movies', 'get', 'are', 'and', 'br', 'yes', 'female', 'just', 'its', 'because

In [37]:
### DATA PROCESSING (All input documents must have the same length)
max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)


In [38]:
###Build the Model
embedding_size = 100 
model = Sequential()
model.add(Embedding(size, embedding_size, input_length = max_words))
model.add(LSTM(100))
model.add(Dense(1, activation ='sigmoid'))
print(model.summary())
## the above prints our a summary of our model, a simple RNN model with 1 embedding, 1 LSTM and 1 dense layer.
## a total of 1 653 301 parameters needs to be trained 

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 100)          5000000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 5,080,501
Trainable params: 5,080,501
Non-trainable params: 0
_________________________________________________________________
None


In [40]:
##TRAIN THE MODEL 

# compile the model by specifying the loss function and optimizer and accuracy as the evaluation metric 
model.compile(loss = 'binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit the model, initialize/specify the parameters, the batch size and the number of training epochs 
model.fit(X_train, y_train, validation_data= (X_test, y_test), epochs = 3, batch_size = 50)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x1fc83a2e828>

In [42]:
##TEST THE MODEL 
# the following code checks the accuracy of the above model, which is mainly to estimate the performance of the model on the unseen data 
scores = model.evaluate(X_test, y_test, verbose=0)
# here we could just print the scores, but we multiply the score by 100 to display a full value instead of 0.00
("Accuracy: %.2f%%" % (scores[1]*100))


'Accuracy: 85.78%'

In [43]:
#predict 
word2id = imdb.get_word_index()
test=[]
max_review_length = 500

for word in word_tokenize(""):
     test.append(word2id[word])

test=sequence.pad_sequences([test],maxlen=max_review_length)
model.predict(test)


array([[0.8295099]], dtype=float32)