In [1]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import pickle

In [2]:
# --- Dataset configuration -------------------------------------------------
num_words = 10000  # vocabulary cap: only the top 10,000 words are kept
maxlen = 200       # every review is brought to exactly this many tokens

# --- Load the IMDB reviews and labels ---------------------------------------
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=num_words)

# --- Pad both splits to a common width of `maxlen` ---------------------------
x_train, x_test = (
    pad_sequences(split, maxlen=maxlen) for split in (x_train, x_test)
)

# --- Word -> index mapping, kept so a later cell can persist it --------------
word_index = imdb.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1us/step


In [3]:
# Build and compile the LSTM sentiment classifier.

# Seed Python, NumPy and TensorFlow in one call so weight initialisation,
# dropout masks and batch shuffling are as repeatable as the backend allows
# under Restart & Run All.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

model = Sequential([
    # Declare the input shape explicitly. The `input_length` argument of
    # Embedding is deprecated in recent Keras releases and only triggers a
    # warning, so the sequence length is given here instead.
    tf.keras.Input(shape=(maxlen,)),
    # Map each of the `num_words` token ids to a 128-dimensional vector.
    Embedding(input_dim=num_words, output_dim=128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    # Single sigmoid unit, paired with the binary cross-entropy loss below.
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [4]:
# Train the model for 3 epochs with mini-batches of 64 reviews.
# The returned History object is kept so later cells can inspect or plot the
# per-epoch metrics; assigning it also stops the bare `<History ...>` repr from
# being emitted as the cell's output.
# NOTE(review): validation metrics are computed on the test split, so there is
# no separate held-out set left for a final, unbiased evaluation.
history = model.fit(
    x_train,
    y_train,
    epochs=3,
    batch_size=64,
    validation_data=(x_test, y_test),
)

Epoch 1/3
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 580ms/step - accuracy: 0.6820 - loss: 0.5829 - val_accuracy: 0.8216 - val_loss: 0.4017
Epoch 2/3
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m288s[0m 647ms/step - accuracy: 0.8549 - loss: 0.3485 - val_accuracy: 0.8367 - val_loss: 0.3877
Epoch 3/3
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m235s[0m 577ms/step - accuracy: 0.8847 - loss: 0.2863 - val_accuracy: 0.8428 - val_loss: 0.3954


<keras.src.callbacks.history.History at 0x7857dce24c10>

In [5]:
# Persist the trained network to disk.
# NOTE(review): the `.h5` suffix selects Keras' legacy HDF5 format; newer Keras
# releases recommend `.keras`. Confirm whether any consumer depends on this
# exact filename before changing it.
model_path = 'sentiment_lstm_model.h5'
model.save(model_path)



In [6]:
# Persist the IMDB word -> index dictionary obtained from
# `imdb.get_word_index()` so text can be encoded outside this notebook.
# This is the dataset's own vocabulary mapping, not a tokenizer fitted here.
# NOTE(review): per the Keras docs, `imdb.load_data` shifts word ids by its
# `index_from` default (3) and reserves low ids for padding/start/unknown, so
# code that encodes new text with this raw mapping presumably needs to apply
# the same offset -- confirm against the consumer.
word_index_path = 'word_index.pkl'
with open(word_index_path, 'wb') as pkl_file:
    pickle.dump(word_index, pkl_file)