In [1]:
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 23 10:29:02 2022

@author: Luigi Portinale
"""
#The Large Movie Review Dataset (often referred to as the IMDB dataset) 
#contains 25,000 highly-polar movie reviews (good or bad) for training 
#and the same amount again for testing. 
#The problem is to determine whether a given movie review has 
#a positive or negative sentiment.

#Word Embedding
#We will map each word onto a 32 length real valued vector. 
#We will also limit the total number of words that we are interested 
#in modeling to the 5000 most frequent words, and zero out the rest. 
#Finally, the sequence length (number of words) in each review varies, 
#so we will constrain each review to be 500 words, 
#truncating longer reviews and padding shorter reviews with zero values.

import numpy
from keras.api.datasets import imdb
from keras.api.models import Sequential
from keras.api.layers import Dense
from keras.api.layers import LSTM
from keras.api.layers import Embedding
from keras.api.layers import Input
from keras.api.preprocessing import sequence

# Train and evaluate an LSTM sentiment classifier on the IMDB reviews.
# Steps: seed NumPy -> load the integer-encoded reviews -> pad/truncate them
# to a fixed length -> Embedding + LSTM + sigmoid output -> fit -> evaluate.

# fix random seed for reproducibility
# NOTE(review): this seeds NumPy's global generator only; confirm whether the
# Keras backend in use draws its weight initialisation and dropout masks from
# it, otherwise runs will still differ from one another.
numpy.random.seed(7)

# load the dataset, restricting the vocabulary to the top_words most
# frequent words
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# truncate and pad input sequences so every review has max_review_length
# entries
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)


# create the model
embedding_vector_length = 32
model = Sequential()
# create the embedding of the documents: each of the max_review_length word
# indices is mapped to a vector of embedding_vector_length values
model.add(Input(shape=(max_review_length,)))
model.add(Embedding(top_words, embedding_vector_length))

# add an LSTM with 100 units and drop-out on both the input
# and the recurrent connections
model.add(LSTM(100, recurrent_dropout=0.05, dropout=0.01))

# output layer for binary classification
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()
# NOTE: the test split is also passed as validation data, so the per-epoch
# val_* metrics are computed on the same samples as the final evaluation below.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)

# Final evaluation of the model; scores is [loss, accuracy], matching the
# loss and the single metric given to compile()
scores = model.evaluate(X_test, y_test, verbose=1)
print(f"Accuracy: {scores[1] * 100:.2f}%")

None
Epoch 1/3
391/391 ━━━━━━━━━━━━━━━━━━━━ 30s 73ms/step - accuracy: 0.6560 - loss: 0.5974 - val_accuracy: 0.8373 - val_loss: 0.3737
Epoch 2/3
391/391 ━━━━━━━━━━━━━━━━━━━━ 32s 79ms/step - accuracy: 0.8635 - loss: 0.3428 - val_accuracy: 0.8604 - val_loss: 0.3624
Epoch 3/3
391/391 ━━━━━━━━━━━━━━━━━━━━ 28s 73ms/step - accuracy: 0.8903 - loss: 0.2791 - val_accuracy: 0.8384 - val_loss: 0.3653
782/782 ━━━━━━━━━━━━━━━━━━━━ 38s 49ms/step - accuracy: 0.8349 - loss: 0.3732
Accuracy: 83.84%
