## Sequence Classification with LSTMs

Tutorial from link: https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/


In [36]:
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
numpy.random.seed(7)

In [37]:
top_words = 5000
INDEX_FROM=3
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words, index_from=INDEX_FROM)
X_cv, X_test, y_cv, y_test = train_test_split(X_test, y_test, test_size=0.5)

In [38]:
print(X_train.shape)
print(X_cv.shape)
print(X_test.shape)

(25000,)
(12500,)
(12500,)


In [39]:
word_index = imdb.get_word_index()

In [40]:
word_index = {k: (v + INDEX_FROM) for k, v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2
word_index["giraffe"]

48151

In [41]:
freq_index = {freq: word for word, freq in word_index.items()}
freq_index[48151]

'giraffe'

In [42]:
test_sent = [freq_index[i] for i in X_train[1]]
print(test_sent)

['<START>', 'big', 'hair', 'big', '<UNK>', 'bad', 'music', 'and', 'a', 'giant', 'safety', '<UNK>', 'these', 'are', 'the', 'words', 'to', 'best', 'describe', 'this', 'terrible', 'movie', 'i', 'love', 'cheesy', 'horror', 'movies', 'and', "i've", 'seen', 'hundreds', 'but', 'this', 'had', 'got', 'to', 'be', 'on', 'of', 'the', 'worst', 'ever', 'made', 'the', 'plot', 'is', 'paper', 'thin', 'and', 'ridiculous', 'the', 'acting', 'is', 'an', '<UNK>', 'the', 'script', 'is', 'completely', 'laughable', 'the', 'best', 'is', 'the', 'end', 'showdown', 'with', 'the', 'cop', 'and', 'how', 'he', 'worked', 'out', 'who', 'the', 'killer', 'is', "it's", 'just', 'so', 'damn', 'terribly', 'written', 'the', 'clothes', 'are', '<UNK>', 'and', 'funny', 'in', 'equal', '<UNK>', 'the', 'hair', 'is', 'big', 'lots', 'of', '<UNK>', '<UNK>', 'men', 'wear', 'those', 'cut', '<UNK>', '<UNK>', 'that', 'show', 'off', 'their', '<UNK>', '<UNK>', 'that', 'men', 'actually', 'wore', 'them', 'and', 'the', 'music', 'is', 'just', '<

In [49]:
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
X_cv = sequence.pad_sequences(X_cv, maxlen=max_review_length)

In [50]:
X_train.shape

(25000, 500)

In [51]:
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None


In [52]:
model.fit(X_train, y_train, validation_data=(X_cv, y_cv), epochs=3, batch_size=64)

Train on 25000 samples, validate on 12500 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1a4f686e80>

In [53]:
scores = model.evaluate(X_test, y_test, verbose=1)



In [54]:
print(model.metrics_names)
print("Accuracy: %.2f%%" % (scores[1]*100))

['loss', 'acc']
Accuracy: 85.97%


In [64]:
test_sent_index = 10
print(' '.join([freq_index[i] for i in X_test[test_sent_index]]))
print("Scored as: ", model.predict(X_test[test_sent_index:test_sent_index + 1]))

<PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <START> it stars war <UNK> william <UNK> <UNK> who falls in love with a stunning <UNK> doctor 

In [74]:
test_sent = "the movie was fantastic absolutely wonderful"
test_vec = numpy.array([[word_index[w] for w in test_sent.split()]])
test_vec = sequence.pad_sequences(test_vec)
print("Scored as: ", model.predict(test_vec)[0])

ValueError: Error when checking : expected embedding_5_input to have shape (500,) but got array with shape (6,)