In [4]:
import tensorflow_datasets as tfds

# Fetch the IMDb reviews corpus through TensorFlow Datasets (downloaded on first use).
# with_info=True makes the call hand back the dataset together with its metadata (`info`).
# NOTE(review): the name `imdb` is re-bound by `from tensorflow.keras.datasets import imdb`
# in a later cell, and `train_data` / `test_data` are not used by the cells visible here.
imdb, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
train_data = imdb['train']
test_data = imdb['test']


Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteSZR12L/imdb_reviews-train.tfrecord…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteSZR12L/imdb_reviews-test.tfrecord*…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteSZR12L/imdb_reviews-unsupervised.t…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [6]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the IMDb dataset, restricted to the most frequent words.
num_words = 10000  # Top 10,000 words in the vocabulary
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=num_words)

# `load_data` defaults to start_char=1, oov_char=2 and index_from=3, so a word's
# stored index is its rank in `word_index` shifted up by this offset.
INDEX_OFFSET = 3

# Convert word indices back to text for Tokenizer
word_index = imdb.get_word_index()
index_to_word = {i: word for word, i in word_index.items()}


def _decode_review(encoded_review):
    """Rebuild the text of one review from its list of Keras word indices.

    Args:
        encoded_review: Iterable of integer word indices as returned by
            `imdb.load_data`.

    Returns:
        The review as a single space-separated string. Indices that have no
        entry in `index_to_word` once the offset is removed (e.g. the reserved
        start / out-of-vocabulary markers) are rendered as '?'.
    """
    return ' '.join(index_to_word.get(idx - INDEX_OFFSET, '?') for idx in encoded_review)


x_train_text = [_decode_review(seq) for seq in x_train]
x_test_text = [_decode_review(seq) for seq in x_test]

# Create a tokenizer and fit it on the training text only, so the vocabulary
# is not influenced by the test reviews.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(x_train_text)

# Convert text data to integer sequences
train_sequences = tokenizer.texts_to_sequences(x_train_text)
test_sequences = tokenizer.texts_to_sequences(x_test_text)

# Bring every review to the same length; short reviews are padded at the end.
max_sequence_length = 200  # Maximum sequence length for padding
train_sequences_padded = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post')
test_sequences_padded = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post')

# One-hot encode the sentiment labels (negative: 0 -> [1, 0], positive: 1 -> [0, 1])
# to match a 2-unit softmax output trained with categorical cross-entropy.
train_labels = tf.keras.utils.to_categorical(y_train, num_classes=2)
test_labels = tf.keras.utils.to_categorical(y_test, num_classes=2)

# Sanity checks: one fixed-length row and one 2-way label per review.
assert train_sequences_padded.shape == (len(x_train), max_sequence_length)
assert test_sequences_padded.shape == (len(x_test), max_sequence_length)
assert train_labels.shape == (len(y_train), 2)
assert test_labels.shape == (len(y_test), 2)


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional

# Create the LSTM model: embedding -> two stacked bidirectional LSTMs -> dense classifier.
model = Sequential()
# input_dim follows `num_words` so the embedding table always matches the tokenizer's
# vocabulary cap. mask_zero=True tells the downstream LSTMs to skip index 0, which is
# the value `pad_sequences` fills in; without it the trailing padding of short reviews
# is processed as if it were real tokens.
model.add(Embedding(input_dim=num_words, output_dim=128, input_length=max_sequence_length, mask_zero=True))
model.add(SpatialDropout1D(0.2))
# return_sequences=True so the second recurrent layer receives the full sequence.
model.add(Bidirectional(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
model.add(Bidirectional(LSTM(units=64, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(units=64, activation='relu'))
# Two softmax units: column 0 = negative, column 1 = positive (one-hot labels).
model.add(Dense(units=2, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


# Train the model and keep the returned History so the metrics can be inspected later.
# NOTE(review): the test split doubles as validation data here; the reported
# validation accuracy is therefore not an independent hold-out estimate if it is
# ever used for tuning.
history = model.fit(train_sequences_padded, train_labels, epochs=1, batch_size=64, validation_data=(test_sequences_padded, test_labels))




<keras.src.callbacks.History at 0x7bf49cdd3eb0>

In [13]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define a function to predict sentiment for a given input sentence
def predict_sentiment(input_sentence):
    """Classify a single sentence as "Negative" or "Positive".

    The sentence is encoded with the notebook's fitted `tokenizer`, post-padded
    to `max_sequence_length`, and scored by `model`.

    Args:
        input_sentence: Raw text to classify.

    Returns:
        "Negative" when the first output score is strictly greater than the
        second one; otherwise "Positive" (ties included).
    """
    # Wrap the sentence in a list: the tokenizer and the model both work on batches.
    encoded_batch = tokenizer.texts_to_sequences([input_sentence])
    padded_batch = pad_sequences(encoded_batch, maxlen=max_sequence_length, padding='post')

    # A single sentence was submitted, so only the first row of scores matters.
    scores = model.predict(padded_batch)[0]
    negative_score, positive_score = scores[0], scores[1]

    return "Negative" if negative_score > positive_score else "Positive"

# Example usage: classify one short review and print the sentence with its verdict.
input_sentence = "This movie is good"
sentiment = predict_sentiment(input_sentence)
print(
    f"Sentence: {input_sentence}",
    f"Sentiment: {sentiment}",
    sep="\n",
)


Sentence: This movie is good
Sentiment: Positive
