In [2]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, Flatten

# Load the IMDb dataset
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


In [3]:
# Pad sequences to ensure all input data is of the same length
max_length = 200
X_train_padded = pad_sequences(X_train, maxlen=max_length)
X_test_padded = pad_sequences(X_test, maxlen=max_length)

In [4]:
# Check the number of samples (length of the dataset)
print("Number of training samples:", len(X_train))
print("Number of test samples:", len(X_test))

# Check the length of the first review (before padding)
print("Length of first training review:", len(X_train[0]))


Number of training samples: 25000
Number of test samples: 25000
Length of first training review: 218


In [5]:
# Build the ANN model
model = Sequential()

In [6]:
# Embedding layer
model.add(Embedding(input_dim=10000, output_dim=128))

# Flatten layer: Flattens the output from the Embedding layer before passing it to the Dense layers
model.add(Flatten())

# Fully connected hidden layer
model.add(Dense(128, activation='relu'))

# Dropout to prevent overfitting
model.add(Dropout(0.5))

# Fully connected hidden layer
model.add(Dense(64, activation='relu'))

# Dropout to prevent overfitting
model.add(Dropout(0.5))

# Output layer for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

In [7]:
# Train the model
history = model.fit(X_train_padded, y_train, epochs=15, batch_size=64, validation_split=0.2)


Epoch 1/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 84ms/step - accuracy: 0.6331 - loss: 0.5959 - val_accuracy: 0.8568 - val_loss: 0.3316
Epoch 2/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 83ms/step - accuracy: 0.9440 - loss: 0.1591 - val_accuracy: 0.8506 - val_loss: 0.3876
Epoch 3/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 84ms/step - accuracy: 0.9944 - loss: 0.0251 - val_accuracy: 0.8524 - val_loss: 0.6381
Epoch 4/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 84ms/step - accuracy: 0.9984 - loss: 0.0054 - val_accuracy: 0.8476 - val_loss: 0.7961
Epoch 5/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 83ms/step - accuracy: 0.9979 - loss: 0.0082 - val_accuracy: 0.8342 - val_loss: 1.0163
Epoch 6/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 85ms/step - accuracy: 0.9920 - loss: 0.0250 - val_accuracy: 0.8372 - val_loss: 0.8111
Epoch 7/15
[1m3

In [8]:
# Evaluate the model on test data
test_loss, test_acc = model.evaluate(X_test_padded, y_test)

print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_acc}')


[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.8292 - loss: 1.3164
Test Loss: 1.2930119037628174
Test Accuracy: 0.8311600089073181


Predicting Sentiment for new reviews

In [9]:
# Predict sentiment on a new review
def predict_sentiment(review):
    # Convert the review to a sequence of integers and pad
    review_seq = imdb.get_word_index()
    tokens = [review_seq.get(word, 2) for word in review.lower().split()]  # 2 is the default for unknown words
    padded_tokens = pad_sequences([tokens], maxlen=max_length)

    # Predict the sentiment (1 = positive, 0 = negative)
    prediction = model.predict(padded_tokens)[0][0]
    return 'Positive' if prediction > 0.5 else 'Negative'

# Example usage
print(predict_sentiment("I really liked the movie!"))



Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
Positive


In [10]:
def predict_sentiment(review):
    # Convert the review to a sequence of integers and pad
    review_seq = imdb.get_word_index()
    tokens = [review_seq.get(word, 2) for word in review.lower().split()]  # 2 is the default for unknown words
    padded_tokens = pad_sequences([tokens], maxlen=max_length)

    # Predict the sentiment (1 = positive, 0 = negative)
    prediction = model.predict(padded_tokens)[0][0]

    # Define thresholds for classification
    if prediction > 0.6:
        return 'Positive'
    elif prediction < 0.4:
        return 'Negative'
    else:
        return 'Neutral'  # Prediction is > 0.4 and < 0.6

# Example usage
print(predict_sentiment("wow I really liked the movie!"))  # Expected: Positive
print(predict_sentiment("The movie was terrible."))       # Expected: Negative
print(predict_sentiment("it was fine."))            # Expected: Neutral


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Positive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Positive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Positive


In [11]:
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test_padded, y_test)

print(f"Test Accuracy: {test_accuracy:.4f}")


[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.8292 - loss: 1.3164
Test Accuracy: 0.8312
