In [2]:
## import tensorflow
import tensorflow

In [3]:
# Importing the IMDb dataset from Keras. This dataset contains 25,000 highly polar movie reviews.
from tensorflow.keras.datasets import imdb

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Importing the Tokenizer class for text preprocessing.
# Tokenizer is used to vectorize a text corpus by turning each text into either a sequence of integers
# (each integer being the index of a token in a dictionary) or into a matrix where each row is the vector representation of a text.
# It works by first building an index of all the unique tokens (words) in the text, then encoding each word with a unique integer.
# It allows for efficient handling of text data when preparing it for input into a neural network model.

In [5]:
# Load the IMDB dataset with a specified vocabulary size
# imdb.load_data() loads the IMDB movie review dataset,
#which is a dataset of 25,000 movie reviews from IMDB, labeled by sentiment (positive/negative).
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=20000)
# The num_words parameter specifies the maximum number of words to include in the dataset.
# Only the top 20,000 most frequently occurring words in the dataset will be considered.
# The function returns two tuples:
# (X_train, y_train): Training data and labels.
# (X_test, y_test): Testing data and labels.
# X_train and X_test are lists of sequences, where each sequence is a list of integers representing the words in a review.
# y_train and y_test are lists of integers (0 or 1), where 0 indicates a negative review and 1 indicates a positive review.

In [6]:
from keras.preprocessing import sequence
# Importing sequence for padding sequences to the same length.
# Padding ensures that all input sequences are of the same length, which is required by the neural network.
# Sequences shorter than the specified length are padded with zeros (by default) at the beginning.
# Sequences longer than the specified length are truncated so that all sequences have the same length.

In [7]:
# Pad sequences to a maximum length of 200
X_train = sequence.pad_sequences(X_train, maxlen=200)
X_test = sequence.pad_sequences(X_test, maxlen=200)
# sequence.pad_sequences() is used to ensure that all sequences (lists of integers) in the dataset have the same length.
# This function pads shorter sequences with zeros at the beginning by default, so they all have the same length, specified by the maxlen parameter.
# In this case, maxlen=200 means that each sequence will be either truncated to 200 words (if longer) or padded with zeros to 200 words (if shorter).
# Padding sequences to the same length is essential because neural networks expect input data to have a consistent shape.
# For instance, an RNN (Recurrent Neural Network) requires all input sequences to have the same length to be processed in batches.
# By setting maxlen=200, we ensure that all movie reviews in X_train and X_test are of length 200, making them suitable for input into the RNN model.

In [8]:
# Importing the Sequential model class for building a linear stack of layers.
from keras.models import Sequential
model = Sequential()
## importing the necessary layers
from keras.layers import Embedding, SimpleRNN, Dense
# Add an Embedding layer to the model
model.add(Embedding(input_dim=20000, output_dim=32)) # Embedding layer to convert word indices to dense vectors of fixed size (32).

In [9]:
# Add a SimpleRNN layer to the model
model.add(SimpleRNN(32)) # SimpleRNN layer with 32 units. This layer processes the sequence of word vectors.

In [10]:
# Add a Dense layer with 100 neurons and ReLU activation
model.add(Dense(units = 100, activation ="relu"))
# Add an output Dense layer with 1 neuron and sigmoid activation for binary classification
model.add(Dense(units = 1, activation = "sigmoid"))

In [11]:
# Compile the model with Adam optimizer and binary cross-entropy loss function
model.compile(optimizer = "adam", loss="binary_crossentropy", metrics=["accuracy"])

In [12]:
# Train the model on the training data for 5 epochs
model.fit(X_train,y_train,epochs= 10)

Epoch 1/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 30ms/step - accuracy: 0.5681 - loss: 0.6493
Epoch 2/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 30ms/step - accuracy: 0.8600 - loss: 0.3335
Epoch 3/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 29ms/step - accuracy: 0.9340 - loss: 0.1748
Epoch 4/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 31ms/step - accuracy: 0.9323 - loss: 0.1765
Epoch 5/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 29ms/step - accuracy: 0.9842 - loss: 0.0477
Epoch 6/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 29ms/step - accuracy: 0.9917 - loss: 0.0289
Epoch 7/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 28ms/step - accuracy: 0.9710 - loss: 0.0819
Epoch 8/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 28ms/step - accuracy: 0.9895 - loss: 0.0335
Epoch 9/10
[1m782/782[

<keras.src.callbacks.history.History at 0x2649731ee70>

In [13]:
# Custom sentence for sentiment analysis
sentences = "worst movie i have ever seen in my whole life"

In [14]:
# Preprocess the sentence
sentence = sentences.lower().split() # Converts the sentence to lowercase and splits it into a list of words.

In [15]:
# Print the preprocessed sentence
sentence

['worst', 'movie', 'i', 'have', 'ever', 'seen', 'in', 'my', 'whole', 'life']

In [16]:
# Get the word index from the IMDb dataset
word_index = imdb.get_word_index() # Returns a dictionary mapping words to their integer index in the IMDb dataset.

In [17]:
# Convert the sentence to a list of tokens (word indices)
tokens = [word_index.get(word, 0) for word in sentence]
# This line creates a list of integers (tokens) by mapping each word in the sentence to its corresponding index using the word_index dictionary.
# If a word is not found in the word_index, it returns 0 by default (due to the get method's default value).

In [18]:
# Print the tokenized sentence
tokens

[246, 17, 10, 25, 123, 107, 8, 58, 223, 110]

In [19]:
# Pad the tokenized sentence to a maximum length of 200
tokens = sequence.pad_sequences([tokens],maxlen=200)

In [20]:
# Print the padded tokens
tokens

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

In [21]:
# Predict the sentiment of the padded sentence
prediction = model.predict(tokens)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 207ms/step


In [22]:
# Print the prediction score
prediction

array([[0.00018835]], dtype=float32)

In [23]:
# Interpret the prediction
if prediction[0][0]>0.5:
  print("positive")
else:
  print("negative")

negative
