Text processing has two key steps:
1. Tokenization: splitting each sentence into word tokens
2. Numericalization ("numerization"): mapping each token to an integer ID

In [11]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [12]:
# Toy corpus: three short sentences, small enough that every token ID
# produced later in the notebook can be checked by hand.
sentences = [
    "I like eggs and bread.",         # 5 words
    "I love chocolate and bunnies.",  # 5 words
    "I hate eggplant.",               # 3 words (shortest; gets padded later)
]

In [13]:
# Upper bound on the vocabulary size handed to the tokenizer (far more than
# this toy corpus needs).
MAX_VOCAB_SIZE = 20_000

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)

# Step 1 - tokenize: scan the corpus and build the word -> integer ID vocabulary.
tokenizer.fit_on_texts(sentences)

# Step 2 - numerize: replace every word in every sentence with its integer ID.
sequences = tokenizer.texts_to_sequences(sentences)

In [14]:
# Each inner list holds the integer IDs of one sentence, in word order.
print(sequences)

[[1, 3, 4, 2, 5], [1, 6, 7, 2, 8], [1, 9, 10]]


In [15]:
# word_index is the tokenizer's word -> integer ID lookup table
# (conceptually like `SELECT id FROM words_num_list` for a given word).
# In the output below the words that occur most often ('i', 'and') have the
# smallest IDs, and all words are lowercased with punctuation removed.
word_index = tokenizer.word_index
print(word_index)

{'i': 1, 'and': 2, 'like': 3, 'eggs': 4, 'bread': 5, 'love': 6, 'chocolate': 7, 'bunnies': 8, 'hate': 9, 'eggplant': 10}


In [16]:
# Taking care of Out of Vocabulary words too
tokenizer_ovv = Tokenizer(num_words = MAX_VOCAB_SIZE, oov_token="<OOV>") 
tokenizer_ovv.fit_on_texts(sentences)                   # step 1 : tokenize the sentence Max is 20000
sequences = tokenizer.texts_to_sequences(sentences) # step 2 : numerize the sentence

In [17]:
print(sequences)

[[1, 3, 4, 2, 5], [1, 6, 7, 2, 8], [1, 9, 10]]


In [18]:
# Word -> ID lookup table of the OOV-aware tokenizer: "<OOV>" is ID 1, so each
# real word's ID is one higher than in the plain tokenizer's mapping.
# Note: this rebinds `word_index`, replacing the plain mapping printed earlier.
word_index = tokenizer_ovv.word_index
print(word_index)

{'<OOV>': 1, 'i': 2, 'and': 3, 'like': 4, 'eggs': 5, 'bread': 6, 'love': 7, 'chocolate': 8, 'bunnies': 9, 'hate': 10, 'eggplant': 11}


In [19]:
# Look up a single word's ID. `word_index` is the OOV-aware mapping at this
# point, so 'bread' is 6 (it is 5 in the plain tokenizer's mapping).
print(word_index['bread'])  

6


In [20]:
# How to get the word-to-index mapping: read the tokenizer's `word_index`
# attribute. This is the plain tokenizer (no OOV token), so IDs start at 1
# with 'i'.
tokenizer.word_index

{'i': 1,
 'and': 2,
 'like': 3,
 'eggs': 4,
 'bread': 5,
 'love': 6,
 'chocolate': 7,
 'bunnies': 8,
 'hate': 9,
 'eggplant': 10}

In [21]:
# Use the defaults: every row is padded to the length of the longest sequence
# (5 here), with zeros added at the FRONT of shorter rows.
data = pad_sequences(sequences)
print(data)

[[ 1  3  4  2  5]
 [ 1  6  7  2  8]
 [ 0  0  1  9 10]]


In [22]:
# Fix the row length explicitly instead of letting it be inferred. 5 equals
# the longest sequence, so the result is the same as with the default maxlen.
MAX_SEQUENCE_LENGTH = 5
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print(data)

[[ 1  3  4  2  5]
 [ 1  6  7  2  8]
 [ 0  0  1  9 10]]


In [23]:
# padding='post' puts the zeros AFTER the tokens instead of before them
# (compare the last row with the front-padded result of the default).
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
print(data)

[[ 1  3  4  2  5]
 [ 1  6  7  2  8]
 [ 1  9 10  0  0]]


In [24]:
# Too much padding: maxlen (6) is larger than the longest sequence (5), so
# every row, even the longest ones, gets at least one leading zero.
data = pad_sequences(sequences, maxlen=6)
print(data)

[[ 0  1  3  4  2  5]
 [ 0  1  6  7  2  8]
 [ 0  0  0  1  9 10]]


In [25]:
# Truncation: maxlen (4) is shorter than the longest sequence (5). By default
# tokens are dropped from the FRONT of the long rows (the leading 1 is lost),
# while the short row is still padded at the front.
data = pad_sequences(sequences, maxlen=4)
print(data)

[[ 3  4  2  5]
 [ 6  7  2  8]
 [ 0  1  9 10]]


In [26]:
# truncating='post' drops tokens from the END of long rows instead (the
# trailing 5 and 8 are lost). Padding is unaffected and still goes at the front.
data = pad_sequences(sequences, maxlen=4, truncating='post')
print(data)

[[ 1  3  4  2]
 [ 1  6  7  2]
 [ 0  1  9 10]]
