# Implementing Tokenizer and padding

In [2]:
# importing libraries
import pandas as pd
import numpy as np

In [None]:
# Installing tensorflow
!pip install tensorflow

In [3]:
# Toy corpus for the tokenizer demo: ten short sentences, one document per line
docs = [
    sentence.strip()
    for sentence in """
        Life is full of unexpected surprises.
        She enjoys reading books every evening.
        Dreams come true.
        Always stay positive and keep moving.
        The stars shone brightly last night.
        Never give up on your dreams.
        They built a beautiful wooden house.
        Learning something new expands your mind.
        The cat jumped over the fence.
        Hard work often leads to success.
    """.strip().splitlines()
]


In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
# oov_token (out of vocabulary): if a text contains a word that is not in the
# fitted vocabulary, the tokenizer uses the "<nothing>" placeholder for it.
tokenizer = Tokenizer(oov_token="<nothing>")

In [6]:
# Fit the tokenizer on the documents to build its vocabulary
# (words end up lowercased with punctuation removed, as the word index output shows)
tokenizer.fit_on_texts(docs)

In [7]:
# Show the integer index allocated to each vocabulary word
# (index 1 is the "<nothing>" OOV token; repeated words such as "the" come next)
tokenizer.word_index

{'<nothing>': 1,
 'the': 2,
 'dreams': 3,
 'your': 4,
 'life': 5,
 'is': 6,
 'full': 7,
 'of': 8,
 'unexpected': 9,
 'surprises': 10,
 'she': 11,
 'enjoys': 12,
 'reading': 13,
 'books': 14,
 'every': 15,
 'evening': 16,
 'come': 17,
 'true': 18,
 'always': 19,
 'stay': 20,
 'positive': 21,
 'and': 22,
 'keep': 23,
 'moving': 24,
 'stars': 25,
 'shone': 26,
 'brightly': 27,
 'last': 28,
 'night': 29,
 'never': 30,
 'give': 31,
 'up': 32,
 'on': 33,
 'they': 34,
 'built': 35,
 'a': 36,
 'beautiful': 37,
 'wooden': 38,
 'house': 39,
 'learning': 40,
 'something': 41,
 'new': 42,
 'expands': 43,
 'mind': 44,
 'cat': 45,
 'jumped': 46,
 'over': 47,
 'fence': 48,
 'hard': 49,
 'work': 50,
 'often': 51,
 'leads': 52,
 'to': 53,
 'success': 54}

In [8]:
# Vocabulary size: number of entries in the word index
# (this count includes the "<nothing>" OOV token)
len(tokenizer.word_index)

54

In [9]:
# Number of documents (sentences) the tokenizer was fitted on
tokenizer.document_count

10

In [10]:
# Convert each sentence/document into a list of word indices from the tokenizer's
# vocabulary: one list per document, in the same order as `docs`
sequences = tokenizer.texts_to_sequences(docs)
sequences

[[5, 6, 7, 8, 9, 10],
 [11, 12, 13, 14, 15, 16],
 [3, 17, 18],
 [19, 20, 21, 22, 23, 24],
 [2, 25, 26, 27, 28, 29],
 [30, 31, 32, 33, 4, 3],
 [34, 35, 36, 37, 38, 39],
 [40, 41, 42, 43, 4, 44],
 [2, 45, 46, 47, 2, 48],
 [49, 50, 51, 52, 53, 54]]

In [11]:
# applying padding to make each document into same size
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad every sequence with trailing zeros ("post") so each document has length 7
padded_sequences = pad_sequences(sequences, maxlen=7, padding="post")
padded_sequences

array([[ 5,  6,  7,  8,  9, 10,  0],
       [11, 12, 13, 14, 15, 16,  0],
       [ 3, 17, 18,  0,  0,  0,  0],
       [19, 20, 21, 22, 23, 24,  0],
       [ 2, 25, 26, 27, 28, 29,  0],
       [30, 31, 32, 33,  4,  3,  0],
       [34, 35, 36, 37, 38, 39,  0],
       [40, 41, 42, 43,  4, 44,  0],
       [ 2, 45, 46, 47,  2, 48,  0],
       [49, 50, 51, 52, 53, 54,  0]])

In [3]:
import pandas as pd
import numpy as np
# Load the IMDB movie-review dataset (columns: review, sentiment).
# The CSV location can be overridden with the IMDB_CSV_PATH environment variable
# so the notebook is not tied to one machine; when the variable is unset, the
# original local path is used as the fallback.
import os

IMDB_CSV_PATH = os.environ.get(
    "IMDB_CSV_PATH",
    r"C:\Users\91620\Desktop\Movie-review-sentiment-using-RNN\IMDB Dataset.csv",
)
a = pd.read_csv(IMDB_CSV_PATH)

In [4]:
# Preview the first five rows (review text and its sentiment label)
a.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# Dataset dimensions as (rows, columns)
a.shape

(50000, 2)

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer

# from keras.preprocessing.text import Tokenizer

# Five sample reviews. With num_words=3 (set below) only the two most frequent
# words, "movie" and "this", survive tokenization; every other word is dropped.
reviews = [
    "I loved this movie",                # kept: this, movie
    "This movie was awful",              # kept: this, movie
    "I enjoyed the plot of this movie",  # kept: this, movie
    "It was a bad movie",                # kept: movie
    "This is the best movie ever!",      # kept: this, movie
]

# num_words=3 keeps only word indices below 3, i.e. the top num_words - 1 = 2 words
tokenizer = Tokenizer(num_words=3)
tokenizer.fit_on_texts(reviews)

# word_index still lists every word seen; the num_words limit only shows up
# in the sequences produced by texts_to_sequences
print("Word Index:", tokenizer.word_index)

# Replace each review by the integer indices of the words that are kept
sequences = tokenizer.texts_to_sequences(reviews)

print("\nTokenized Reviews:")
for review_number, token_ids in enumerate(sequences, start=1):
    print(f"Review {review_number}: {token_ids}")
# No oov_token is set here, so words outside the kept vocabulary are removed
# from the sequences (they are not replaced by 0)


Word Index: {'movie': 1, 'this': 2, 'i': 3, 'was': 4, 'the': 5, 'loved': 6, 'awful': 7, 'enjoyed': 8, 'plot': 9, 'of': 10, 'it': 11, 'a': 12, 'bad': 13, 'is': 14, 'best': 15, 'ever': 16}

Tokenized Reviews:
Review 1: [2, 1]
Review 2: [2, 1]
Review 3: [2, 1]
Review 4: [1]
Review 5: [2, 1]
