In [2]:
import tensorflow as tf
from keras import datasets, layers, callbacks, models, optimizers, preprocessing
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# Toy corpus: three short sentences used to demonstrate text vectorization.
sentences = [
    'I like eggs and ham.',
    'I love chocolate and bunnies.',
    'I hate onions.',
]

In [4]:
# Upper bound on the vocabulary size; passed as `max_tokens` to the
# TextVectorization layers in this notebook.
MAX_VOCAB_SIZE = 20_000

In [6]:
# Text -> integer-sequence layer with the vocabulary capped at MAX_VOCAB_SIZE.
# `split` and `output_mode` are intentionally left at their Keras defaults
# ('whitespace' and 'int'), so they are not passed explicitly.
vectorization_layer = layers.TextVectorization(max_tokens=MAX_VOCAB_SIZE)

In [7]:
# `adapt` is the Keras preprocessing-layer counterpart of `fit`: it scans the
# given texts and builds the layer's vocabulary from them.
vectorization_layer.adapt(sentences)

In [9]:
# Apply the adapted layer: each sentence becomes one row of integer token ids.
# Sentences shorter than the longest one are right-padded with 0 so that all
# rows of the resulting dense tensor have the same length.
sequences = vectorization_layer(sentences)
print(sequences)

tf.Tensor(
[[ 2  6  9  3  8]
 [ 2  5 10  3 11]
 [ 2  7  4  0  0]], shape=(3, 5), dtype=int64)


In [10]:
# The learned vocabulary as a list: a token's position in the list is its
# integer id. Index 0 is the padding token '' and index 1 is '[UNK]' (used for
# out-of-vocabulary words); the remaining words follow, most frequent first.
vectorization_layer.get_vocabulary()

['',
 '[UNK]',
 np.str_('i'),
 np.str_('and'),
 np.str_('onions'),
 np.str_('love'),
 np.str_('like'),
 np.str_('hate'),
 np.str_('ham'),
 np.str_('eggs'),
 np.str_('chocolate'),
 np.str_('bunnies')]

In [12]:
# Build a word -> integer-id lookup from the learned vocabulary (a token's id
# is its position in the vocabulary list). This is the encoding direction;
# turning sequences back into words would need the inverse, id -> word, mapping.
word2index = {
    token: token_id
    for token_id, token in enumerate(vectorization_layer.get_vocabulary())
}
word2index

{'': 0,
 '[UNK]': 1,
 np.str_('i'): 2,
 np.str_('and'): 3,
 np.str_('onions'): 4,
 np.str_('love'): 5,
 np.str_('like'): 6,
 np.str_('hate'): 7,
 np.str_('ham'): 8,
 np.str_('eggs'): 9,
 np.str_('chocolate'): 10,
 np.str_('bunnies'): 11}

In [13]:
# Fixed-length output: `output_sequence_length=3` forces every row to exactly
# three token ids, so longer sentences are cut off after their third token
# (per the Keras docs, shorter ones would be padded with 0 instead).
vectorization_layer_truncated = layers.TextVectorization(
    max_tokens=MAX_VOCAB_SIZE,
    output_sequence_length=3,
)
vectorization_layer_truncated.adapt(sentences)

# Vectorize the same corpus; the result has shape (number of sentences, 3).
sequences_truncated = vectorization_layer_truncated(sentences)
print(sequences_truncated)

tf.Tensor(
[[ 2  6  9]
 [ 2  5 10]
 [ 2  7  4]], shape=(3, 3), dtype=int64)


In [14]:
# Ragged output: with `ragged=True` the layer returns a tf.RaggedTensor, so
# each row keeps its sentence's own length and no 0 padding is added.
vectorization_layer_ragged = layers.TextVectorization(
    max_tokens=MAX_VOCAB_SIZE,
    ragged=True,
)
vectorization_layer_ragged.adapt(sentences)

# Vectorize the same corpus; rows now have different lengths.
sequences_ragged = vectorization_layer_ragged(sentences)
print(sequences_ragged)

<tf.RaggedTensor [[2, 6, 9, 3, 8], [2, 5, 10, 3, 11], [2, 7, 4]]>
