In [14]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [15]:
def load_data(vocab_size=10000):
    """Fetch the IMDB review dataset and report the split sizes.

    Args:
        vocab_size: Forwarded to ``imdb.load_data`` as ``num_words``.

    Returns:
        ``((X_train, y_train), (X_test, y_test))`` as produced by
        ``imdb.load_data``.
    """
    train_split, test_split = imdb.load_data(num_words=vocab_size)
    X_train, y_train = train_split
    X_test, y_test = test_split
    print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")
    return (X_train, y_train), (X_test, y_test)

def word_index_mapping():
    """Return the IMDB word index and an id -> token lookup.

    The reverse lookup shifts every id from ``imdb.get_word_index()`` up by
    three and then assigns ids 0, 1 and 2 to the ``<PAD>``, ``<START>`` and
    ``<UNK>`` placeholder tokens.

    Returns:
        A ``(word_index, reverse_word_index)`` tuple.
    """
    word_index = imdb.get_word_index()

    reverse_word_index = dict(
        (rank + 3, word) for word, rank in word_index.items()
    )
    # Applied after the shifted ids so the placeholders always win.
    reverse_word_index.update({0: '<PAD>', 1: '<START>', 2: '<UNK>'})

    return word_index, reverse_word_index

def decode_review(encoded_review, reverse_word_index):
    """Convert a sequence of word ids into a space-separated string.

    Ids that are missing from ``reverse_word_index`` are rendered as ``'?'``.
    """
    tokens = []
    for token_id in encoded_review:
        tokens.append(reverse_word_index.get(token_id, '?'))
    return ' '.join(tokens)

def get_decoded_training_data(X_train, reverse_word_index):
    """Decode every encoded review in ``X_train`` back into text.

    Returns:
        A list with one decoded string per review, in the same order as
        ``X_train``.
    """
    return [decode_review(encoded, reverse_word_index) for encoded in X_train]

def preprocess_data(X_train, X_test, max_length=250):
    """Bring the train and test sequences to a common length.

    Both splits go through ``pad_sequences`` with ``maxlen=max_length`` and
    padding appended at the end (``padding='post'``).

    Returns:
        A ``(X_train_padded, X_test_padded)`` tuple.
    """
    pad_options = {'maxlen': max_length, 'padding': 'post'}
    padded_train = pad_sequences(X_train, **pad_options)
    padded_test = pad_sequences(X_test, **pad_options)
    return padded_train, padded_test

In [16]:
def build_model(vocab_size, embedding_dim, max_length):
    """Build and compile the averaged-embedding sentiment classifier.

    Architecture: Embedding -> GlobalAveragePooling1D -> Dense(16, relu)
    -> Dense(1, sigmoid), compiled with Adam, binary cross-entropy and an
    accuracy metric.

    Args:
        vocab_size: Number of distinct token ids (Embedding ``input_dim``).
        embedding_dim: Size of each embedding vector.
        max_length: Length of the padded input sequences.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential([
        # The input length is declared with an explicit Input layer: the
        # `input_length` argument of Embedding is deprecated in Keras 3 and
        # only triggers a warning there.
        tf.keras.Input(shape=(max_length,), dtype='int32'),
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model

def train_model(model, X_train_padded, y_train, epochs=5, batch_size=128):
    """Fit ``model`` on the padded training data.

    A ``validation_split`` of 0.2 is passed to ``model.fit`` and progress is
    reported with ``verbose=1``.

    Returns:
        Whatever ``model.fit`` returns (the training history).
    """
    fit_options = dict(
        epochs=epochs,
        batch_size=batch_size,
        validation_split=0.2,
        verbose=1,
    )
    return model.fit(X_train_padded, y_train, **fit_options)

def evaluate_model(model, X_test_padded, y_test):
    """Evaluate ``model`` on the test data and print the accuracy.

    Returns:
        A ``(loss, accuracy)`` tuple unpacked from ``model.evaluate``.
    """
    metrics = model.evaluate(X_test_padded, y_test)
    loss, accuracy = metrics
    print(f"Test accuracy: {accuracy:.4f}")
    return (loss, accuracy)

def create_tokenizer(decoded_reviews, vocab_size):
    """Create a Tokenizer that encodes raw text with the IMDB word ids.

    ``fit_on_texts`` numbers words by their frequency in ``decoded_reviews``
    (with the OOV token first). That is a different id space from the one
    the IMDB reviews are encoded in, so sequences produced from such a
    vocabulary would select unrelated embedding rows in a model trained on
    the IMDB data. The vocabulary is therefore replaced with the IMDB word
    index, using the convention this notebook uses when decoding reviews:
    every id shifted by 3, with id 2 standing for unknown words.

    Args:
        decoded_reviews: Texts used to fit the tokenizer's word statistics.
        vocab_size: Passed as ``num_words``; ids at or above it are emitted
            as the OOV id by ``texts_to_sequences``.

    Returns:
        The fitted Tokenizer with an IMDB-aligned ``word_index``.
    """
    tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
    # Still fit, so word counts and document statistics stay populated.
    tokenizer.fit_on_texts(decoded_reviews)

    # Re-key the vocabulary so texts_to_sequences emits IMDB-compatible ids.
    imdb_ids = {word: rank + 3 for word, rank in imdb.get_word_index().items()}
    imdb_ids[tokenizer.oov_token] = 2  # same id the notebook decodes as <UNK>
    tokenizer.word_index = imdb_ids
    tokenizer.index_word = {idx: word for word, idx in imdb_ids.items()}
    return tokenizer


In [17]:
def predict_sentiment(model, text=None, X_test=None, X_test_padded=None, y_test=None,
                      max_length=250, reverse_word_index=None, tokenizer=None):
    """Predict the sentiment of custom text or of a random test review.

    When ``text`` is non-empty it is encoded with ``tokenizer`` and padded to
    ``max_length``. Otherwise a random review is drawn from the test set, its
    true label is printed, and its decoded text is used in the result.

    Args:
        model: Object whose ``predict`` returns a 2-D array of probabilities.
        text: Optional raw text to classify.
        X_test: Encoded test reviews (random-sample mode only).
        X_test_padded: Padded test reviews (random-sample mode only).
        y_test: Test labels, 1 meaning positive (random-sample mode only).
        max_length: Padded sequence length used for custom text.
        reverse_word_index: id -> token mapping (random-sample mode only).
        tokenizer: Tokenizer providing ``texts_to_sequences`` (text mode only).

    Returns:
        A dict with ``"text"``, ``"sentiment"`` (``"Positive"`` or
        ``"Negative"``) and ``"confidence"`` (probability of the predicted
        class, as a float).

    Raises:
        ValueError: If the arguments needed by the selected mode are missing.
    """
    if text:
        if tokenizer is None:
            raise ValueError("predict_sentiment: 'tokenizer' is required when 'text' is given")
        sequence = tokenizer.texts_to_sequences([text])
        padded = pad_sequences(sequence, maxlen=max_length, padding='post')
    else:
        # Random-sample mode needs the complete test set and the decoder mapping.
        required = {
            "X_test": X_test,
            "X_test_padded": X_test_padded,
            "y_test": y_test,
            "reverse_word_index": reverse_word_index,
        }
        missing = [name for name, value in required.items() if value is None]
        if missing:
            raise ValueError(
                "predict_sentiment: no 'text' given and missing argument(s): "
                + ", ".join(missing)
            )
        review_index = np.random.randint(0, len(X_test))
        # Slice (rather than index) to keep the batch dimension for predict().
        padded = X_test_padded[review_index:review_index+1]
        text = decode_review(X_test[review_index], reverse_word_index)
        actual_sentiment = 'Positive' if y_test[review_index] == 1 else 'Negative'
        print(f"Actual sentiment: {actual_sentiment}")

    # Make prediction
    prediction = model.predict(padded, verbose=0)[0][0]
    sentiment = "Positive" if prediction >= 0.5 else "Negative"
    # Report the probability of whichever class was predicted.
    confidence = prediction if prediction >= 0.5 else 1 - prediction

    return {
        "text": text,
        "sentiment": sentiment,
        "confidence": float(confidence)
    }

In [20]:
def test_examples(model, X_test, X_test_padded, y_test, reverse_word_index, tokenizer, max_length):
    """Print predictions for two hand-written example sentences.

    Only ``model``, ``tokenizer`` and ``max_length`` are used; the test-set
    arguments and ``reverse_word_index`` are accepted but not read here.
    """
    separator = "-" * 40
    custom_examples = (
        "This was the best purchase I ever made",
        "This was the worst purchase I ever made.",
    )

    for sentence in custom_examples:
        result = predict_sentiment(
            model,
            text=sentence,
            tokenizer=tokenizer,
            max_length=max_length,
        )
        print(f"Text: '{result['text']}'")
        print(f"Predicted sentiment: {result['sentiment']} (confidence: {result['confidence']:.4f})")
        print(separator)

In [21]:
def main():
    """Run the whole pipeline: load data, train, evaluate, and try examples."""
    vocab_size = 10000
    embedding_dim = 16
    max_length = 250
    epochs = 5
    batch_size = 128

    (X_train, y_train), (X_test, y_test) = load_data(vocab_size)
    # The helper defined in this notebook is `word_index_mapping`; only the
    # reverse (id -> token) mapping is needed from here on.
    _, reverse_word_index = word_index_mapping()

    decoded_reviews = get_decoded_training_data(X_train, reverse_word_index)

    tokenizer = create_tokenizer(decoded_reviews, vocab_size)

    X_train_padded, X_test_padded = preprocess_data(X_train, X_test, max_length)

    model = build_model(vocab_size, embedding_dim, max_length)
    train_model(model, X_train_padded, y_train, epochs, batch_size)
    evaluate_model(model, X_test_padded, y_test)

    # Test the model
    test_examples(model, X_test, X_test_padded, y_test, reverse_word_index, tokenizer, max_length)

if __name__ == "__main__":
    main()

Training samples: 25000, Testing samples: 25000
Epoch 1/5
157/157 ━━━━━━━━━━━━━━━━━━━━ 3s 12ms/step - accuracy: 0.6112 - loss: 0.6841 - val_accuracy: 0.7424 - val_loss: 0.6098
Epoch 2/5
157/157 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - accuracy: 0.7896 - loss: 0.5616 - val_accuracy: 0.8390 - val_loss: 0.4337
Epoch 3/5
157/157 ━━━━━━━━━━━━━━━━━━━━ 4s 17ms/step - accuracy: 0.8501 - loss: 0.3993 - val_accuracy: 0.8626 - val_loss: 0.3534
Epoch 4/5
157/157 ━━━━━━━━━━━━━━━━━━━━ 2s 12ms/step - accuracy: 0.8775 - loss: 0.3176 - val_accuracy: 0.8752 - val_loss: 0.3174
Epoch 5/5
157/157 ━━━━━━━━━━━━━━━━━━━━ 2s 9ms/step - accuracy: 0.8973 - loss: 0.2727 - val_accuracy: 0.8502 - val_loss: 0.3310
782/782 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - accuracy: 0.8464 - loss: 0.3369
Test accuracy: 0.8454
Text: 'This w