In [1]:
import keras
from keras.datasets import imdb

In [2]:
vocabulary_size = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = vocabulary_size)
print('Loaded dataset with {} training samples, {} test samples'.format(len(X_train), len(X_test)))

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
Loaded dataset with 25000 training samples, 25000 test samples


In [5]:
X_train.shape

(25000,)

In [6]:
X_test.shape

(25000,)

In [3]:
print('---review---')
print(X_train[6])
print('---label---')
print(y_train[6])

---review---
[1, 6740, 365, 1234, 5, 1156, 354, 11, 14, 5327, 6638, 7, 1016, 2, 5940, 356, 44, 4, 1349, 500, 746, 5, 200, 4, 4132, 11, 2, 9363, 1117, 1831, 7485, 5, 4831, 26, 6, 2, 4183, 17, 369, 37, 215, 1345, 143, 2, 5, 1838, 8, 1974, 15, 36, 119, 257, 85, 52, 486, 9, 6, 2, 8564, 63, 271, 6, 196, 96, 949, 4121, 4, 2, 7, 4, 2212, 2436, 819, 63, 47, 77, 7175, 180, 6, 227, 11, 94, 2494, 2, 13, 423, 4, 168, 7, 4, 22, 5, 89, 665, 71, 270, 56, 5, 13, 197, 12, 161, 5390, 99, 76, 23, 2, 7, 419, 665, 40, 91, 85, 108, 7, 4, 2084, 5, 4773, 81, 55, 52, 1901]
---label---
1


In [7]:
word2id = imdb.get_word_index()
id2word = {i: word for word, i in word2id.items()}
print('---review with words---')
print([id2word.get(i, ' ') for i in X_train[6]])
print('---label---')
print(y_train[6])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
---review with words---
['the', 'boiled', 'full', 'involving', 'to', 'impressive', 'boring', 'this', 'as', 'murdering', 'naschy', 'br', 'villain', 'and', 'suggestion', 'need', 'has', 'of', 'costumes', 'b', 'message', 'to', 'may', 'of', 'props', 'this', 'and', 'concentrates', 'concept', 'issue', 'skeptical', 'to', "god's", 'he', 'is', 'and', 'unfolds', 'movie', 'women', 'like', "isn't", 'surely', "i'm", 'and', 'to', 'toward', 'in', "here's", 'for', 'from', 'did', 'having', 'because', 'very', 'quality', 'it', 'is', 'and', 'starship', 'really', 'book', 'is', 'both', 'too', 'worked', 'carl', 'of', 'and', 'br', 'of', 'reviewer', 'closer', 'figure', 'really', 'there', 'will', 'originals', 'things', 'is', 'far', 'this', 'make', 'mistakes', 'and', 'was', "couldn't", 'of', 'few', 'br', 'of', 'you', 'to', "don't", 'female', 'than', 'place', 'she', 'to', 'was', 'between', 'that', 'nothing', 'dos

In [8]:
print('Maximum review length: {}'.format(
len(max((X_train + X_test), key=len))))

Maximum review length: 2697


In [9]:
from keras.preprocessing import sequence

In [10]:
# from keras.preprocessing import sequence
max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

In [11]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

In [12]:
# from keras import Sequential
# from keras.layers import Embedding, LSTM, Dense, Dropout
#embedding_size=32
embedding_size=50
model=Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model.add(LSTM(150, dropout=0.2)) #was 100
model.add(Dense(1, activation='sigmoid'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 50)           500000    
_________________________________________________________________
lstm (LSTM)                  (None, 150)               120600    
_________________________________________________________________
dense (Dense)                (None, 1)                 151       
Total params: 620,751
Trainable params: 620,751
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
model.compile(loss='binary_crossentropy', 
             optimizer='adam', 
             metrics=['accuracy'])

In [14]:
batch_size = 256
num_epochs = 3
X_valid, y_valid = X_train[:batch_size], y_train[:batch_size]
X_train2, y_train2 = X_train[batch_size:], y_train[batch_size:]
model.fit(X_train2, y_train2, validation_data=(X_valid, y_valid), batch_size=batch_size, epochs=num_epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x272636d2730>

In [15]:
scores = model.evaluate(X_test, y_test, verbose=0)
print('Test accuracy:', scores[1])

Test accuracy: 0.8735600113868713


In [16]:
reviewList = ["the movie was not so bad", 
              "the movie was a great waste of my time",
              "the food was so delicious that i felt sinfully wicked"                 
             ]
d = keras.datasets.imdb.get_word_index()

In [18]:
sentiment= {True: "Positive",
           False: "Negative"}
Threshold = 0.5
for r in reviewList:
    words = r.split()
    review = []
    print ("review=", r)
    for word in words:
        
        if word not in d:
            
            review.append(2)
            print (word, "Appended 2")
        else:
            if (d[word]+3) >= vocabulary_size:
                print("got a word outside the vocab_index", word, d[word]+3, "breaking")
                break
        review.append(d[word]+3) 
    review = keras.preprocessing.sequence.pad_sequences([review],
      truncating='pre', padding='pre', maxlen=max_words)
    prediction = model.predict(review)
    print("Prediction Probability = ", prediction[0][0], "Sentiment=", 
          sentiment[prediction[0][0]>Threshold],"\n")


review= the movie was not so bad
Prediction Probability =  0.23646459 Sentiment= Negative 

review= the movie was a great waste of my time
Prediction Probability =  0.3790865 Sentiment= Negative 

review= the food was so delicious that i felt sinfully wicked
sinfully Appended 2


KeyError: 'sinfully'