In [1]:
import tensorflow as tf
import numpy as np

In [2]:
# Small toy corpus used to build the vocabulary and to demo padding below
sentences = [
    "my name is kebing",
    "I don't like green onion",
    "I like ice cream",
]

In [3]:
# max_tokens: maximum size of the vocabulary for this layer; the count
# includes the padding token '' and the out-of-vocabulary token '[UNK]'
tokenizer = tf.keras.layers.TextVectorization(max_tokens=10)
# computes a vocabulary of string terms from the tokens in a dataset
tokenizer.adapt(sentences)

In [4]:
# Encode every sentence as a row of integer token ids. The result is a
# dense tensor, so rows shorter than the longest sentence end in 0s.
tokenizer(sentences)

<tf.Tensor: shape=(3, 5), dtype=int64, numpy=
array([[6, 5, 8, 7, 0],
       [3, 1, 2, 1, 4],
       [3, 2, 9, 1, 0]])>

In [5]:
# OOV is [UNK]: "strawberries" is not among the adapted sentences, so it
# is encoded as 1, the id of the '[UNK]' token
tokenizer("I like strawberries")

<tf.Tensor: shape=(3,), dtype=int64, numpy=array([3, 2, 1])>

In [6]:
# List the learned vocabulary, including the special '' (padding) and
# '[UNK]' entries; a term's position in the list is its token id
tokenizer.get_vocabulary(include_special_tokens=True)

['', '[UNK]', 'like', 'i', 'onion', 'name', 'my', 'kebing', 'is', 'ice']

Pad the Sequences

In [7]:
# Note: tokenizer(...) already returns a dense batch whose short rows end
# in 0s, so pad_sequences only adds zeros on the leading side here and the
# printed rows can contain zeros at both ends
sequences = tokenizer(sentences)
# maxlen: length every sequence is padded to; defaults to the longest one
# (padding defaults to "pre", i.e. zeros are added in front)
padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=10)
print(padded)

[[0 0 0 0 0 6 5 8 7 0]
 [0 0 0 0 0 3 1 2 1 4]
 [0 0 0 0 0 3 2 9 1 0]]


In [8]:
# truncating defaults to "pre": entries are dropped from the front, so the
# last 3 of each row are kept (including any trailing 0 already present)
tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=3)

array([[8, 7, 0],
       [2, 1, 4],
       [9, 1, 0]], dtype=int32)

In [9]:
# truncating="post": drop entries from the end, keeping the first 3 of each row
tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=3, truncating="post")

array([[6, 5, 8],
       [3, 1, 2],
       [3, 2, 9]], dtype=int32)

In [11]:
# padding="post": append the 0s after each sequence instead of in front
tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=15, padding="post")

array([[6, 5, 8, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [3, 1, 2, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [3, 2, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)

In [12]:
# Sentences made up mostly of words that are absent from the vocabulary
sentences_oov = [
    "it is not in vocabulary",
    "extremely involved content",
]
# Unknown words are encoded as 1 ([UNK]); the shorter row is filled with 0s
oov_sequences = tokenizer(sentences_oov)
print(oov_sequences)

tf.Tensor(
[[1 8 1 1 1]
 [1 1 1 0 0]], shape=(2, 5), dtype=int64)
