In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.datasets import imdb
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import train_test_split
import numpy as np

# Load IMDB dataset with a fixed vocabulary size
vocab_size = 50000  # Limit vocabulary to top 50,000 words
max_length = 200    # Maximum review length (truncation/padding)

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=vocab_size)

# Merge training and testing data before splitting
all_data = np.concatenate((train_data, test_data), axis=0)
all_labels = np.concatenate((train_labels, test_labels), axis=0)

# Split into 80% training and 20% testing
tr_x, te_x, tr_y, te_y = train_test_split(all_data, all_labels, test_size=0.20, random_state=42)

# Pad sequences to ensure uniform input size
tr_x = pad_sequences(tr_x, maxlen=max_length, padding='post', truncating='post')
te_x = pad_sequences(te_x, maxlen=max_length, padding='post', truncating='post')

# Build the model
model = keras.Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),  # Embedding Layer
    Bidirectional(LSTM(32, return_sequences=True, kernel_regularizer=l2(0.001))),  # Reduce LSTM units & add L2 reg
    Dropout(0.5),  # Dropout to prevent overfitting
    GlobalAveragePooling1D(),  # Average pooling over time steps
    Dense(32, activation='relu', kernel_regularizer=l2(0.001)),  # Fully connected layer with L2 reg
    Dropout(0.5),  # Dropout before output layer
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Add early stopping
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Train the model
model.fit(tr_x, tr_y, epochs=10, batch_size=64, validation_data=(te_x, te_y), callbacks=[early_stopping])

# Evaluate the model
test_loss, test_acc = model.evaluate(te_x, te_y)
print(f"Test Accuracy: {test_acc:.4f}")


Epoch 1/10




[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 24ms/step - accuracy: 0.6996 - loss: 0.6386 - val_accuracy: 0.8564 - val_loss: 0.3481
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 22ms/step - accuracy: 0.9166 - loss: 0.2571 - val_accuracy: 0.8796 - val_loss: 0.3166
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 25ms/step - accuracy: 0.9488 - loss: 0.1749 - val_accuracy: 0.8699 - val_loss: 0.3602
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 23ms/step - accuracy: 0.9668 - loss: 0.1261 - val_accuracy: 0.8640 - val_loss: 0.4288
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.8825 - loss: 0.3131
Test Accuracy: 0.8796


In [32]:
model.summary()

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.datasets import imdb

# Load IMDB word index (word -> index mapping)
word_index = imdb.get_word_index()

# Function to convert text to sequence of numbers
def text_to_sequence(text, vocab_size=50000):
    words = text.lower().split()  # Tokenize words
    sequence = [word_index[word] + 3 if word in word_index and word_index[word] < vocab_size - 3 else 2 for word in words]
    return sequence

# Function to predict sentiment of input text
def predict_sentiment(model, text, max_length=200):
    sequence = text_to_sequence(text)  # Convert text to sequence
    padded_sequence = pad_sequences([sequence], maxlen=max_length, padding='post', truncating='post')  # Pad sequence
    prediction = model.predict(padded_sequence)[0][0]  # Get prediction probability
    sentiment = "Positive" if prediction > 0.5 else "Negative"  # Classify sentiment
    confidence = prediction if sentiment == "Positive" else 1 - prediction  # Confidence score
    return sentiment, confidence



# Testing a strong negative review

In [54]:
# example usage
text3 = "This was the worst movie I've ever seen. The plot made no sense, the characters were one-dimensional, and the acting was terrible. A total waste of time!"
sentiment3,confidence3 = predict_sentiment(model,text3)
print(f"Predicted Sentiment: {sentiment3} (Confidence: {confidence3:.2f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Predicted Sentiment: Negative (Confidence: 0.84)


# **Testing a strong positive review**

In [27]:
# example usage
text4 = "Watching this film was my best moment of my life, i loved it, that was a super amazing  cast , i will rewatch it for tens of time! , I really want to thank this fanatastic cast and that is my best movie ever"
sentiment4,confidence4 = predict_sentiment(model,text4)
print(f"Predicted Sentiment: {sentiment4} (Confidence: {confidence4:.2f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Predicted Sentiment: Positive (Confidence: 0.83)


# **Testing normal bad and good reviews**

In [35]:
reviews = [
    "Good movie",
    "Bad movie",
    "I was glad to watch this movie"

]

for review in reviews:
    sentiment, confidence = predict_sentiment(model, review)
    print(f"Review: {review}\nPredicted Sentiment: {sentiment}\n (Confidence: {confidence:.2f})\n")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Review: Good movie
Predicted Sentiment: Positive
 (Confidence: 0.57)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Review: Bad movie
Predicted Sentiment: Positive
 (Confidence: 0.54)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Review: I was glad to watch this movie
Predicted Sentiment: Positive
 (Confidence: 0.59)

