In [13]:
import tensorflow as tf
# Toy corpus that the vectorization layer learns its vocabulary from.
sentences = [
    'I love my dog',
    'I, love my cat',
    'You love my dog',
    'Do you think my dog is amazing?',
]

# Initialize the layer with its default settings.
vectorize_layers = tf.keras.layers.TextVectorization()

# Build the vocabulary from the corpus.
vectorize_layers.adapt(sentences)

# Retrieve the learned words, leaving out the layer's special tokens.
vocabulary = vectorize_layers.get_vocabulary(include_special_tokens=False)

# NOTE: the number printed next to each word is its position in this
# special-token-free list, not the integer the layer outputs for it
# (e.g. 'my' is listed at 0 here but is encoded as 2 in later cells).
for index, word in enumerate(vocabulary):
    print(index, word)

0 my
1 love
2 dog
3 you
4 i
5 think
6 is
7 do
8 cat
9 amazing


In [12]:
# Convert a single input sentence into its integer sequence.
# The capitalised 'Love' receives the same id as lowercase 'love'
# (compare with the encoding of 'I love my dog' elsewhere in this notebook).
sample_input = 'I Love my dog'
sequence = vectorize_layers(sample_input)
print(sequence)

tf.Tensor([6 3 2 4], shape=(4,), dtype=int64)


In [15]:
# Put the sentences in a tf.data pipeline and apply the layer to each element.
sentences_dataset = tf.data.Dataset.from_tensor_slices(sentences)
sequences = sentences_dataset.map(vectorize_layers)
# Sentences are encoded one at a time here, so each printed sequence keeps
# its own length (no padding zeros appear).
# Note: this loop rebinds the notebook-level names `sentence` and `sequence`.
for sentence, sequence in zip(sentences, sequences):
    print(f'{sentence} --> {sequence}')

I love my dog --> [6 3 2 4]
I, love my cat --> [ 6  3  2 10]
You love my dog --> [5 3 2 4]
Do you think my dog is amazing? --> [ 9  5  7  2  4  8 11]


## Padding

In [18]:
# Post padding: calling the layer on the whole list yields one dense tensor
# in which rows shorter than the longest sentence end in zeros.
sequence_post = vectorize_layers(sentences)

# Show the raw sentences followed by their padded encodings.
print(sentences, sequence_post, sep='\n')

['I love my dog', 'I, love my cat', 'You love my dog', 'Do you think my dog is amazing?']
tf.Tensor(
[[ 6  3  2  4  0  0  0]
 [ 6  3  2 10  0  0  0]
 [ 5  3  2  4  0  0  0]
 [ 9  5  7  2  4  8 11]], shape=(4, 7), dtype=int64)


In [20]:
# Pre padding: zeros are inserted in front of the shorter sequences so that
# every row has the length of the longest one.
# `sequences` is the mapped tf.data dataset, passed to pad_sequences directly.
# NOTE(review): the informational "OUT_OF_RANGE: End of sequence" log line
# recorded after this cell presumably comes from the dataset being read to
# its end — confirm; the printed result is unaffected.
sequences_pre = tf.keras.utils.pad_sequences(sequences, padding='pre')
print(sequences_pre)

[[ 0  0  0  6  3  2  4]
 [ 0  0  0  6  3  2 10]
 [ 0  0  0  5  3  2  4]
 [ 9  5  7  2  4  8 11]]


2025-07-29 20:34:47.145674: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [22]:
# Cap every sequence at 5 tokens while padding on the left.
# NOTE(review): despite the `_post_trunc` name, both padding and truncation
# happen at the front here — `truncating='pre'` is the default and is spelled
# out below — so the longest sentence loses its first tokens. Confirm whether
# `truncating='post'` was intended.
sequences_post_trunc = tf.keras.utils.pad_sequences(
    sequences,
    maxlen=5,
    padding='pre',
    truncating='pre',
)
print(sequences_post_trunc)

[[ 0  6  3  2  4]
 [ 0  6  3  2 10]
 [ 0  5  3  2  4]
 [ 7  2  4  8 11]]


2025-07-29 20:35:53.735274: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [23]:
# A second layer (note the singular name, distinct from `vectorize_layers`)
# created with ragged=True, so batched output is a RaggedTensor whose rows
# keep their own lengths instead of being zero-padded.
vectorize_layer = tf.keras.layers.TextVectorization(ragged=True)
# Build this layer's vocabulary from the same sentences.
vectorize_layer.adapt(sentences)
ragged_sentence = vectorize_layer(sentences)
print(ragged_sentence)

<tf.RaggedTensor [[6, 3, 2, 4], [6, 3, 2, 10], [5, 3, 2, 4], [9, 5, 7, 2, 4, 8, 11]]>


In [25]:
# Pad the ragged result after converting it with .numpy().
# No `padding` argument is given; the printed rows have their zeros in front,
# matching the explicit padding='pre' call earlier in the notebook.
# Note: this rebinds `sequences_pre`, which an earlier cell also assigns.
sequences_pre = tf.keras.utils.pad_sequences(ragged_sentence.numpy())
print(sequences_pre)

[[ 0  0  0  6  3  2  4]
 [ 0  0  0  6  3  2 10]
 [ 0  0  0  5  3  2  4]
 [ 9  5  7  2  4  8 11]]


## Out of Vocabulary
The layer maps every input word that is not in its vocabulary to token `1`, the out-of-vocabulary (OOV) token.

In [26]:
# Sentences containing words that were not in the corpus the layer adapted to.
sentences_with_oov = [
    'i really love my dog',
    'my dog loves my manatte',
]

# Encode them with the ragged layer; words it has not seen come out as 1.
sequences_with_oov = vectorize_layer(sentences_with_oov)

# Print each sentence beside its encoding.
for sentence, sequence in zip(sentences_with_oov, sequences_with_oov):
    print(f'{sentence} --> {sequence}')

i really love my dog --> [6 1 3 2 4]
my dog loves my manatte --> [2 4 1 2 1]
