# Implementing RNN

In [1]:
# importing libraries
import pandas as pd
import numpy as np

In [2]:
# This is the document we are going to tokenize
docs = [
    "Life is full of unexpected surprises.",
    "She enjoys reading books every evening.",
    "Dreams come true.",
    "Always stay positive and keep moving.",
    "The stars shone brightly last night.",
    "Never give up on your dreams.",
    "They built a beautiful wooden house.",
    "Learning something new expands your mind.",
    "The cat jumped over the fence.",
    "Hard work often leads to success."
]


In [3]:
# Installing tensorflow
!pip install tensorflow



In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(oov_token = "<nothing>") # (out of vocabulary) suppose while prediction, if users give a words which is not in vocabulary then it will use nothing instead

In [5]:
# Tokenize the vocabulary (converting uppercase inot lowercase)
tokenizer.fit_on_texts(docs)

In [6]:
# Alloated index value to vocabulary
tokenizer.word_index

{'<nothing>': 1,
 'the': 2,
 'dreams': 3,
 'your': 4,
 'life': 5,
 'is': 6,
 'full': 7,
 'of': 8,
 'unexpected': 9,
 'surprises': 10,
 'she': 11,
 'enjoys': 12,
 'reading': 13,
 'books': 14,
 'every': 15,
 'evening': 16,
 'come': 17,
 'true': 18,
 'always': 19,
 'stay': 20,
 'positive': 21,
 'and': 22,
 'keep': 23,
 'moving': 24,
 'stars': 25,
 'shone': 26,
 'brightly': 27,
 'last': 28,
 'night': 29,
 'never': 30,
 'give': 31,
 'up': 32,
 'on': 33,
 'they': 34,
 'built': 35,
 'a': 36,
 'beautiful': 37,
 'wooden': 38,
 'house': 39,
 'learning': 40,
 'something': 41,
 'new': 42,
 'expands': 43,
 'mind': 44,
 'cat': 45,
 'jumped': 46,
 'over': 47,
 'fence': 48,
 'hard': 49,
 'work': 50,
 'often': 51,
 'leads': 52,
 'to': 53,
 'success': 54}

In [7]:
# Total number of vocabulary
len(tokenizer.word_index)

54

In [8]:
#  Number of rows or number of document
tokenizer.document_count

10

In [9]:
# Converting each sentence/document into a sequence of word indices based on the tokenizer's vocabulary
sequences = tokenizer.texts_to_sequences(docs)
sequences

[[5, 6, 7, 8, 9, 10],
 [11, 12, 13, 14, 15, 16],
 [3, 17, 18],
 [19, 20, 21, 22, 23, 24],
 [2, 25, 26, 27, 28, 29],
 [30, 31, 32, 33, 4, 3],
 [34, 35, 36, 37, 38, 39],
 [40, 41, 42, 43, 4, 44],
 [2, 45, 46, 47, 2, 48],
 [49, 50, 51, 52, 53, 54]]

In [10]:
# applying padding to make each document into same size
from tensorflow.keras.preprocessing.sequence import pad_sequences

padded_sequences = pad_sequences(sequences, padding = "post", maxlen = 7) # keeping sequnce length of each documment as 7
padded_sequences

array([[ 5,  6,  7,  8,  9, 10,  0],
       [11, 12, 13, 14, 15, 16,  0],
       [ 3, 17, 18,  0,  0,  0,  0],
       [19, 20, 21, 22, 23, 24,  0],
       [ 2, 25, 26, 27, 28, 29,  0],
       [30, 31, 32, 33,  4,  3,  0],
       [34, 35, 36, 37, 38, 39,  0],
       [40, 41, 42, 43,  4, 44,  0],
       [ 2, 45, 46, 47,  2, 48,  0],
       [49, 50, 51, 52, 53, 54,  0]], dtype=int32)

# Applying RNN on IMDB Datset

In [11]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Embedding, SimpleRNN, Flatten
from tensorflow.keras.callbacks import EarlyStopping

In [29]:
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)

In [30]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((25000,), (25000,), (25000,), (25000,))

In [31]:
X_train[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 2,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 2,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 2,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 2,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 2,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 2,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5,
 144,
 30,
 5535,
 18,

#### All the data is already tokenized

In [32]:
y_train

array([1, 0, 0, ..., 0, 1, 0])

In [33]:
# checking the longest document
a = 0
for i in X_train:
    if len(i) > a:
        a = len(i)
print(f"The length of longest document is {a}")

The length of longest document is 2494


In [34]:
# checking the longest document
a = 0
for i in X_test:
    if len(i) > a:
        a = len(i)
print(f"The length of longest document is {a}")

The length of longest document is 2315


#### As we can see the longest document length in X_train and Y_train is 2494, so i will be keeping padding(maxlen) as 2500

In [35]:
# we will apply padding on each document to convert it into same length
X_train_padded = pad_sequences(X_train, padding = "post", maxlen = 100)
X_test_padded = pad_sequences(X_test, padding = "post", maxlen = 100)

In [36]:
print(X_train_padded.shape)
print(50*"-")
print(len(X_train_padded[0]))
print(50*"-")
X_train_padded

(25000, 100)
--------------------------------------------------
100
--------------------------------------------------


array([[1415,   33,    6, ...,   19,  178,   32],
       [ 163,   11, 3215, ...,   16,  145,   95],
       [1301,    4, 1873, ...,    7,  129,  113],
       ...,
       [  11,    6, 4065, ...,    4, 3586,    2],
       [ 100, 2198,    8, ...,   12,    9,   23],
       [  78, 1099,   17, ...,  204,  131,    9]], dtype=int32)

#### we will apply embedding on each token in the document with some specific dimension
- input_dim=10000: The size of the vocabulary. In this case, we're using a vocabulary of 10,000 unique tokens (words).
- output_dim=64: The size of the embedding vectors. Each word will be represented by a 64-dimensional vector.
- input_length=100: The length of each input sequence. In this case, each input sequence will be 100 tokens long (e.g., 500 words in a sentence or document).

In [37]:
# creating layers for model
model = Sequential()
model.add(Embedding(input_dim = 10000, output_dim =64, input_length = 100))
model.add(SimpleRNN(units= 128, activation="relu"))
model.add(Dense(1, activation="sigmoid"))

# Build the model explicitly
model.build(input_shape=(None, 100))



In [38]:
model.summary()

In [39]:
# creating an instance of early stopping
earlyStopping = EarlyStopping(monitor= "val_loss", patience = 5, restore_best_weights = True)
earlyStopping

<keras.src.callbacks.early_stopping.EarlyStopping at 0x7ad916219d10>

In [40]:
# compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [41]:
# Train the model
history = model.fit(X_train_padded, y_train, epochs=100,validation_split=0.2, callbacks=[earlyStopping])

Epoch 1/100
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 44ms/step - accuracy: 0.5727 - loss: 0.6971 - val_accuracy: 0.6614 - val_loss: 0.5968
Epoch 2/100
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 44ms/step - accuracy: 0.7000 - loss: 0.6661 - val_accuracy: 0.6618 - val_loss: 0.5859
Epoch 3/100
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 44ms/step - accuracy: 0.7660 - loss: 0.4881 - val_accuracy: 0.7184 - val_loss: 0.5389
Epoch 4/100
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 44ms/step - accuracy: 0.8008 - loss: 0.4314 - val_accuracy: 0.7062 - val_loss: 0.5835
Epoch 5/100
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 44ms/step - accuracy: 0.7768 - loss: 0.4520 - val_accuracy: 0.6714 - val_loss: 0.6172
Epoch 6/100
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 44ms/step - accuracy: 0.8315 - loss: 0.3630 - val_accuracy: 0.7676 - val_loss: 0.5546
Epoch 7/10

In [44]:
model.evaluate(X_test_padded, y_test)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.7188 - loss: 0.5496


[0.5442382097244263, 0.7218000292778015]

We got an accuracy of 72 percent

## Testing the model with some data

In [65]:
data = "This is the best movie I’ve ever seen absolutely perfect in every way! and very good"

In [66]:
token = Tokenizer()
token.fit_on_texts([data])

In [67]:
token.word_index

{'this': 1,
 'is': 2,
 'the': 3,
 'best': 4,
 'movie': 5,
 'i’ve': 6,
 'ever': 7,
 'seen': 8,
 'absolutely': 9,
 'perfect': 10,
 'in': 11,
 'every': 12,
 'way': 13,
 'and': 14,
 'very': 15,
 'good': 16}

In [68]:
sequence1 = token.texts_to_sequences([data])
sequence1

[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]]

In [69]:
padded_sequence = pad_sequences(sequence1, padding='post', maxlen=100) #Example maxlen
padded_sequence

array([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0]], dtype=int32)

In [71]:
aa = model.predict(padded_sequence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step


In [73]:
aa[0][0]

0.6643697