In [1]:
# Import numpy for numerical operations and array handling
import numpy as np

# Toy corpus: eleven short phrases about neural networks.
# Used below to walk through tokenization, integer encoding and padding.
docs = [
    'recurrent neural network',
    'neural network',
    'artificial neural',
    'connections between nodes',
    'can create a cycle',
    'allowing output',
    'some nodes to affect subsequent',
    'exhibit temporal',
    'dynamic behavior',
    'type of Neural Network',
    'affect subsequent',
]

In [2]:
# Import Tokenizer from TensorFlow's Keras to convert text to integer sequences
# Purpose: To perform integer encoding on the text data
from tensorflow.keras.preprocessing.text import Tokenizer
# Placeholder token that stands in for any word missing from the fitted
# vocabulary; it receives its own entry in the word index.
OOV_TOKEN = '<nothing>'
tokenizer = Tokenizer(oov_token=OOV_TOKEN)

In [3]:
# Build the vocabulary from the toy corpus.
# After this call the tokenizer exposes the word-to-index mapping, the
# per-word counts and the number of documents seen (inspected in the next cells).
tokenizer.fit_on_texts(docs)

In [4]:
# Show the word -> integer mapping learned from `docs`.
# In the output the OOV token takes index 1 and the remaining words follow in
# descending frequency order; words are lower-cased ('Neural' is folded into 'neural').
tokenizer.word_index

{'<nothing>': 1,
 'neural': 2,
 'network': 3,
 'nodes': 4,
 'affect': 5,
 'subsequent': 6,
 'recurrent': 7,
 'artificial': 8,
 'connections': 9,
 'between': 10,
 'can': 11,
 'create': 12,
 'a': 13,
 'cycle': 14,
 'allowing': 15,
 'output': 16,
 'some': 17,
 'to': 18,
 'exhibit': 19,
 'temporal': 20,
 'dynamic': 21,
 'behavior': 22,
 'type': 23,
 'of': 24}

In [5]:
# Show how many times each word occurs across all documents.
# The result is an OrderedDict keyed by the lower-cased word, listed in the
# order the words were first encountered.
tokenizer.word_counts

OrderedDict([('recurrent', 1),
             ('neural', 4),
             ('network', 3),
             ('artificial', 1),
             ('connections', 1),
             ('between', 1),
             ('nodes', 2),
             ('can', 1),
             ('create', 1),
             ('a', 1),
             ('cycle', 1),
             ('allowing', 1),
             ('output', 1),
             ('some', 1),
             ('to', 1),
             ('affect', 2),
             ('subsequent', 2),
             ('exhibit', 1),
             ('temporal', 1),
             ('dynamic', 1),
             ('behavior', 1),
             ('type', 1),
             ('of', 1)])

In [6]:
# Number of documents the tokenizer was fitted on.
# Should equal len(docs), i.e. 11 for the toy corpus.
tokenizer.document_count

11

In [7]:
# Integer-encode the corpus: each document becomes a list of word indices
# taken from tokenizer.word_index.
# Example: 'recurrent neural network' -> [7, 2, 3]
sequences = tokenizer.texts_to_sequences(docs)
# Bare expression so the notebook displays the encoded sequences.
sequences

[[7, 2, 3],
 [2, 3],
 [8, 2],
 [9, 10, 4],
 [11, 12, 13, 14],
 [15, 16],
 [17, 4, 18, 5, 6],
 [19, 20],
 [21, 22],
 [23, 24, 2, 3],
 [5, 6]]

In [8]:
# Import pad_sequences to ensure all sequences have the same length
# Purpose: To standardize sequence lengths for consistent model input
from tensorflow.keras.utils import pad_sequences

In [10]:
# Right-pad ('post') every encoded document with zeros so that all rows share
# one length. No maxlen is given, so the rows end up as long as the longest
# document (5 tokens here) and the result is a 2-D integer array.
sequences = pad_sequences(
    sequences,
    padding='post',
)
sequences

array([[ 7,  2,  3,  0,  0],
       [ 2,  3,  0,  0,  0],
       [ 8,  2,  0,  0,  0],
       [ 9, 10,  4,  0,  0],
       [11, 12, 13, 14,  0],
       [15, 16,  0,  0,  0],
       [17,  4, 18,  5,  6],
       [19, 20,  0,  0,  0],
       [21, 22,  0,  0,  0],
       [23, 24,  2,  3,  0],
       [ 5,  6,  0,  0,  0]], dtype=int32)

In [40]:
# Import modules for sentiment analysis using the IMDB dataset
# Purpose: To demonstrate a practical application of text preprocessing and RNNs
from tensorflow.keras.datasets import imdb
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Embedding, Flatten

In [41]:
# Fetch the IMDB movie-review dataset.
# load_data() returns a (features, labels) pair for the train split and another
# for the test split; reviews arrive already encoded as integer word indices.
train_split, test_split = imdb.load_data()
X_train, y_train = train_split
X_test, y_test = test_split

In [42]:
# Shape of the training features: a 1-D array with one entry per review
# (each entry is itself a variable-length list of word indices).
X_train.shape

(25000,)

In [43]:
# Shape of the test features: same layout as X_train, one entry per review.
X_test.shape

(25000,)

In [44]:
# Peek at the test labels.
# The target is a 1-D array of binary class labels (0 or 1), one per review.
y_test

array([0, 1, 1, ..., 0, 0, 0])

In [45]:
# Inspect the first training review.
# It is a plain Python list of integer word indices, not raw text.
# NOTE(review): this prints the whole review (200+ integers); slicing,
# e.g. X_train[0][:10], would keep the saved output short.
X_train[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 22665,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 21631,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 19193,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 10311,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 31050,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 12118,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5

In [46]:
# Token count of the first training review (before any padding).
first_review = X_train[0]
print(len(first_review))

218


In [47]:
# Token counts for two more training reviews.
# The lengths differ from review to review, which is why padding is needed
# before the data can be batched.
for sample_idx in (2, 3):
    print(len(X_train[sample_idx]))

141
550


In [48]:
# Bring every review to a fixed length of MAX_LEN (200) tokens so the RNN
# receives uniformly shaped input.
# - Shorter reviews are zero-padded at the end (padding='post').
# - Longer reviews are cut down to MAX_LEN; no `truncating` argument is passed,
#   so the library default decides which end is dropped
#   (NOTE(review): Keras documents the default as 'pre' - confirm).
MAX_LEN = 200
X_train = pad_sequences(X_train, padding='post', maxlen=MAX_LEN)
X_test = pad_sequences(X_test, padding='post', maxlen=MAX_LEN)


In [49]:
# Sanity check: sample 3 had 550 tokens before padding and should now be
# exactly the fixed sequence length.
len(X_train[3])

200

In [50]:
# Inspect the padded row at index 4 (the fifth training sample).
# Purpose: to eyeball the fixed-length integer array produced by pad_sequences
X_train[4]

array([    1,   249,  1323,     7,    61,   113,    10,    10,    13,
        1637,    14,    20,    56,    33,  2401,    18,   457,    88,
          13,  2626,  1400,    45,  3171,    13,    70,    79,    49,
         706,   919,    13,    16,   355,   340,   355,  1696,    96,
         143,     4,    22,    32,   289,     7,    61,   369,    71,
        2359,     5,    13,    16,   131,  2073,   249,   114,   249,
         229,   249,    20,    13,    28,   126,   110,    13,   473,
           8,   569,    61,   419,    56,   429,     6,  1513,    18,
          35,   534,    95,   474,   570,     5,    25,   124,   138,
          88,    12,   421,  1543,    52,   725,  6397,    61,   419,
          11,    13,  1571,    15,  1543,    20,    11,     4, 22016,
           5,   296,    12,  3524,     5,    15,   421,   128,    74,
         233,   334,   207,   126,   224,    12,   562,   298,  2167,
        1272,     7,  2601,     5,   516,   988,    43,     8,    79,
         120,    15,

In [51]:
# Define a minimal RNN sentiment classifier (binary output for a movie review).
# Input is declared with an explicit Input layer, which Keras 3 recommends over
# passing `input_shape=` to the first layer.
from tensorflow.keras import Input

model = Sequential()
# Each sample is a sequence of 200 time steps with 1 feature per step: the raw
# integer word index (no embedding layer is used in this first model).
# NOTE(review): raw indices are not a meaningful numeric scale, which likely
# limits what this model can learn - confirm against the training log.
model.add(Input(shape=(200, 1)))
# Recurrent layer with 32 units; return_sequences=False keeps only the output
# of the final time step.
model.add(SimpleRNN(32, return_sequences=False))
# Single sigmoid unit for binary classification.
model.add(Dense(1, activation='sigmoid'))

In [31]:
# Print the layer-by-layer architecture and parameter counts of `model`.
# NOTE(review): no output is saved for this cell and its execution count is out
# of sequence; re-run it after the model definition cell.
model.summary()

In [None]:
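# SimpleRNN parameters = units × (input_dim + units + bias)
#                      = 32 × (1 + 32 + 1) = 32 × 34 = 1,088 ✓
# The Dense(1) output layer adds 32 weights + 1 bias = 33, for 1,121 in total.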

In [52]:
# Configure training for a two-class problem: Adam optimizer, binary
# cross-entropy loss, and accuracy reported as the monitoring metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [53]:
# Fit the model for 5 passes over the training reviews.
# The test split is supplied as validation data, so val_loss / val_accuracy in
# the log are measured on X_test / y_test after every epoch.
model.fit(
    X_train,
    y_train,
    epochs=5,
    validation_data=(X_test, y_test),
)

Epoch 1/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 19ms/step - accuracy: 0.5067 - loss: 0.6970 - val_accuracy: 0.4930 - val_loss: 0.6948
Epoch 2/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 15ms/step - accuracy: 0.5037 - loss: 0.6935 - val_accuracy: 0.5070 - val_loss: 0.6935
Epoch 3/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 15ms/step - accuracy: 0.5048 - loss: 0.6940 - val_accuracy: 0.5005 - val_loss: 0.6959
Epoch 4/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 15ms/step - accuracy: 0.5022 - loss: 0.6944 - val_accuracy: 0.5054 - val_loss: 0.6940
Epoch 5/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 15ms/step - accuracy: 0.4997 - loss: 0.6942 - val_accuracy: 0.4973 - val_loss: 0.6944


<keras.src.callbacks.history.History at 0x7cf394ae6f00>

In [57]:
# Build a one-sample batch from the first test review.
# predict() expects a leading batch axis, so the 1-D row of (at most) 200 token
# indices is reshaped to a single row.
first_test_review = X_test[0]
test_data = first_test_review[0:200].reshape(1, -1)

In [58]:
# Run the trained model on the single-sample batch.
# The result is a (1, 1) float array holding the sigmoid output, a value
# between 0 and 1 interpreted as the predicted probability of class 1.
model.predict(test_data)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step


array([[0.47372797]], dtype=float32)

In [59]:
# Same toy corpus as at the top of the notebook, restated here so the
# embedding demonstration that follows is self-contained.
docs = [
    'recurrent neural network',
    'neural network',
    'artificial neural',
    'connections between nodes',
    'can create a cycle',
    'allowing output',
    'some nodes to affect subsequent',
    'exhibit temporal',
    'dynamic behavior',
    'type of Neural Network',
    'affect subsequent',
]