In [11]:
import numpy as np
from keras.layers import TextVectorization

## Creating a sample dataset

In [1]:
# Sample corpus: three short sentences, written one fragment per sentence.
# Adjacent string literals are concatenated into a single string.
text = (
    "I like my brother. "
    "Go Home!. "
    "Do you like my brother?"
)

In [17]:
# Cut the corpus at every full stop; each piece becomes one training sentence.
# Pieces after the first keep the space that followed the full stop.
sentences = text.split(sep='.')
sentences

['I like my brother', ' Go Home!', ' Do you like my brother?']

## Adapting a TextVectorization layer

In [24]:
# Build the layer with its default behaviour spelled out:
# lowercase + strip punctuation, split on whitespace, emit integer token ids.
text_vectorizer = TextVectorization(
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    output_mode="int",
)

# Learn the vocabulary from the sample sentences.
text_vectorizer.adapt(data=sentences)

In [25]:
# Show the learned vocabulary, including the two special entries ('' and '[UNK]').
print(text_vectorizer.get_vocabulary(include_special_tokens=True))

['', '[UNK]', np.str_('my'), np.str_('like'), np.str_('brother'), np.str_('you'), np.str_('i'), np.str_('home'), np.str_('go'), np.str_('do')]


In [26]:
# Print each vocabulary entry next to its position in the vocabulary list;
# that position is the integer the layer emits for the token.
for token_id, token in enumerate(text_vectorizer.get_vocabulary()):
    print(f'{token_id}: {token}')

0: 
1: [UNK]
2: my
3: like
4: brother
5: you
6: i
7: home
8: go
9: do


## Using the TextVectorization layer

In [27]:
# Calling the adapted layer on the sentences replaces every word with its vocabulary index.
# Shorter sentences are right-padded with 0 so every row has the length of the longest one.
encoded_sequences = text_vectorizer(sentences)
print(encoded_sequences)

tf.Tensor(
[[6 3 2 4 0]
 [8 7 0 0 0]
 [9 5 3 2 4]], shape=(3, 5), dtype=int64)


In [28]:
# Inspect the concrete Python type of the object returned by the layer call.
print(type(encoded_sequences))

<class 'tensorflow.python.framework.ops.EagerTensor'>


In [29]:
# Convert the tensor into a plain NumPy array.
encoded_sequences.numpy()

array([[6, 3, 2, 4, 0],
       [8, 7, 0, 0, 0],
       [9, 5, 3, 2, 4]])

## Testing on a new input

In [30]:
# 'sister' does not occur in the sentences the layer was adapted on.
test_sentences = ['I like my sister']

In [31]:
# Encode the unseen sentence; a word missing from the vocabulary is mapped to index 1 ('[UNK]').
test_sequence = text_vectorizer(test_sentences)
print(test_sequence)

tf.Tensor([[6 3 2 1]], shape=(1, 4), dtype=int64)
