In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Load the IMDb movie review dataset
vocabulary_size = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocabulary_size)

# Convert word indices back to sentences
word_index = imdb.get_word_index()
index_to_word = {i: word for word, i in word_index.items()}
X_train_sentences = [' '.join([index_to_word.get(word, '') for word in sentence]) for sentence in X_train]
X_test_sentences = [' '.join([index_to_word.get(word, '') for word in sentence]) for sentence in X_test]

# Preprocess the data
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_data(data):
    processed_data = []
    for sentence in data:
        sentence = word_tokenize(sentence)
        sentence = ' '.join([lemmatizer.lemmatize(word.lower()) for word in sentence if word.isalpha()])
        sentence = ' '.join([word for word in sentence.split() if word not in stop_words])
        processed_data.append(sentence)
    return processed_data

X_train = preprocess_data(X_train_sentences)
X_test = preprocess_data(X_test_sentences)

# Tokenize the sentences
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# Pad sequences to the same length
max_sequence_length = 250
X_train = pad_sequences(X_train, maxlen=max_sequence_length)
X_test = pad_sequences(X_test, maxlen=max_sequence_length)

# Build the model
embedding_size = 32
model = Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define early stopping criteria
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# Train the model
batch_size = 64
epochs = 10
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test, y_test), callbacks=[early_stopping])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print("Loss:", loss)
print("Accuracy:", accuracy)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


[nltk_data] Downloading package wordnet to /root/nltk_data...


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Loss: 0.37683355808258057
Accuracy: 0.8483200073242188
