In [0]:
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.models import load_model
from numpy import mean

In [0]:
#initialisation
# --- data ---
vocabulary_size = 5000   # passed as num_words to imdb.load_data and as the Embedding input size
max_words = 500          # every review is padded/truncated to this many tokens

# --- model ---
embedding_size = 32      # width of each learned word vector
activation = 'sigmoid'   # activation of the single output unit

# --- training ---
batch_size = 64          # mini-batch size; also reused as the validation hold-out size
num_epochs = 5
loss = 'binary_crossentropy'
optimizer = 'adam'
metrics = ['accuracy']


In [0]:
#load the data
# Fetch the IMDB review dataset with the vocabulary limited to
# `vocabulary_size` entries; each split is a (reviews, labels) pair.
(X_train, y_train), (X_test, y_test) = imdb.load_data(
    num_words=vocabulary_size,
)
print("Training samples :", len(X_train))
print("Test samples :", len(X_test))

In [0]:
#convert the id to word in reviews -- only for understanding
# word2id maps word -> index; id2word is the inverse (index -> word).
word2id = imdb.get_word_index()
id2word = dict((idx, token) for token, idx in word2id.items())
def getWords(indexList):
  """Print the words of an encoded review as a list of strings.

  Args:
    indexList: iterable of integer word ids as found in X_train / X_test.

  Returns:
    None. The decoded list is printed, not returned.
  """
  # imdb.load_data() shifts every word index by 3 (ids 0-2 are reserved for
  # padding / start / unknown), which is why get_word_index() in this notebook
  # adds 3 when encoding. Undo that shift here, otherwise every id decodes to
  # the wrong word. Reserved ids fall outside id2word and are shown as ' '.
  index_from = 3
  print([id2word.get(i - index_from, ' ') for i in indexList])
#print('---review index---')
#print(X_train[6])
#print('---review words---')
#print(getWords(X_train[6]))
#print('---label---')
#print(y_train[6])

In [0]:
#print("Before padding ",len(X_train[0]))

# Bring every review in both splits to a fixed length of `max_words` ids so
# they can be fed to the network as one rectangular array.
X_train, X_test = [
    sequence.pad_sequences(split, maxlen=max_words)
    for split in (X_train, X_test)
]
#print("After padding ",len(X_train[0]))

In [0]:
#define model
# Architecture: word ids -> Embedding -> LSTM(100) -> Dropout(0.2) -> Dense(1).
# The commented-out layers are the alternatives tried in the experiment notes.
model = Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
#model.add(Dropout(0.5))
model.add(LSTM(units=100))
model.add(Dropout(0.2))
#model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation=activation))
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

In [0]:
#specify the loss function, optimizer and evaluation metrics
# All three values come from the initialisation cell.
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics,
)

In [0]:
# Hold out the first `batch_size` reviews as the validation set and train on
# everything after them. Note that the hold-out size is tied to batch_size.
#validation set
X_validation, X_training = X_train[:batch_size], X_train[batch_size:]
#training set
y_validation, y_training = y_train[:batch_size], y_train[batch_size:]
print("Training samples : ", len(X_training))
print("Validation samples : ", len(X_validation))

In [0]:
#Training
# Fit on the training split, reporting validation metrics after each epoch,
# then write the trained model to disk.
histories = model.fit(
    X_training,
    y_training,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_data=(X_validation, y_validation),
)
model.save('sentiment_analysis.h5')

In [0]:
#Evaluate
# Score the trained model on the test split without progress output.
# `scores` is indexed as scores[1] for accuracy in the reporting cell.
scores = model.evaluate(x=X_test, y=y_test, verbose=0)

In [0]:
# The training figure is the mean of the per-epoch accuracies recorded by
# fit(), not the accuracy of the final epoch. Both values are printed as
# percentages.
print('Training accuracy', 100 * mean(histories.history['accuracy']))
print('Test accuracy:', 100 * scores[1])

In [0]:
#convert words to index
def get_word_index(review_words, word_index=None, num_words=None):
  """Encode a raw review string with the id scheme used by the training data.

  The text is lower-cased and split on whitespace. Each word is looked up in
  the word index and shifted by 3, the same offset the original code applied.
  A word is encoded as the unknown-word id 2 when it is missing from the
  index, or when its shifted id would be >= num_words. Without that cap a
  rare word would produce an id outside the range of the Embedding layer,
  which is built with `vocabulary_size` entries.

  Args:
    review_words: the review as a plain string.
    word_index: optional word -> rank mapping; defaults to the notebook-level
      `word2id`.
    num_words: optional vocabulary cap; defaults to the notebook-level
      `vocabulary_size`.

  Returns:
    A list of integer ids, one per whitespace-separated word (empty for an
    empty string).
  """
  if word_index is None:
    word_index = word2id
  if num_words is None:
    num_words = vocabulary_size

  oov_id = 2      # id used for out-of-vocabulary words
  index_from = 3  # offset between a word's rank and its encoded id

  review = []
  # Lower-case first so capitalised words such as "This" can match the index.
  for word in review_words.lower().split():
    rank = word_index.get(word)
    if rank is None or rank + index_from >= num_words:
      review.append(oov_id)
    else:
      review.append(rank + index_from)
  return review

In [0]:
#movie prediction
def prediction(review):
  """Run the model on one padded review and print its sentiment.

  Prints "positive" when the first output value is above 0.5, otherwise
  "negative", followed by that value.

  Args:
    review: model input holding the encoded review; only output [0][0] is read.

  Returns:
    None.
  """
  score = model.predict(review)[0][0]
  label = "positive" if score > 0.5 else "negative"
  print(label, score)

In [0]:
# Hand-written sample reviews used as a quick sanity check of the model.
# The first sample previously read "moview", which is not a dictionary word
# and was therefore encoded as unknown rather than as "movie".
review_list = ["This movie is bad", "This movie is good", "Did not understand the movie","This movie is such a waste of time","Movie is one time watch"]
for review in review_list:
  print("Review : ",review)
  # Encode the text, then pad it as a batch of one to the model's input length.
  review_ids = get_word_index(review)
  review_index = sequence.pad_sequences([review_ids], maxlen=max_words)
  prediction(review_index)

Without dropout

Training accuracy 91.09454154968262
Test accuracy: 86.76400184631348

dropout 0.5, epochs - 5

Training accuracy 86.3931655883789
Test accuracy: 86.25199794769287

dropout 0.2, epochs - 5

Training accuracy 87.70052194595337
Test accuracy: 87.26400136947632