In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### 1 Create dummy data


In [7]:
# Toy corpus: three short sentences of different lengths, so that the effect
# of padding and truncation is easy to see in the cells below.
sentences = [
    "The sun is shining very brightly",  # 6 words
    "Birds are chirping",  # 3 words
    "The trees are majestic",  # 4 words
]

In [8]:
# Upper bound on the vocabulary size handed to the Tokenizer. The rule of
# thumb noted here is that ~3000 words cover 95% of most English texts, so
# 20000 is a generous cap (this toy corpus only has 11 distinct words).
MAX_VOCAB_SIZE = 20000

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)

# Learn the vocabulary (word -> integer index) from the corpus ...
tokenizer.fit_on_texts(sentences)

# ... then replace every word of every sentence by its integer index.
sequences = tokenizer.texts_to_sequences(sentences)

In [9]:
# One list of word indices per sentence. Indices start at 1; the value 0 is
# left free so it can be used as the padding value.
print(sequences)

[[1, 3, 4, 5, 6, 7], [8, 2, 9], [1, 10, 2, 11]]


In [10]:
# Word -> integer index mapping learned by fit_on_texts. Words are stored
# lower-cased, and the two words that occur twice in the corpus ("the", "are")
# receive the smallest indices.
tokenizer.word_index

{'the': 1,
 'are': 2,
 'sun': 3,
 'is': 4,
 'shining': 5,
 'very': 6,
 'brightly': 7,
 'birds': 8,
 'chirping': 9,
 'trees': 10,
 'majestic': 11}

In [12]:
# Default behaviour, spelled out: without maxlen every row is padded with
# leading zeros ("pre") up to the length of the longest sequence (6 here).
data = pad_sequences(sequences, padding='pre')
print(data)

[[ 1  3  4  5  6  7]
 [ 0  0  0  8  2  9]
 [ 0  0  1 10  2 11]]


In [16]:
# Request rows that are longer than the longest sentence: with maxlen=7 even
# the 6-token sentence receives one leading zero.
MAX_SEQUENCE_LENGTH = 7
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='pre',
)
print(data)

[[ 0  1  3  4  5  6  7]
 [ 0  0  0  0  8  2  9]
 [ 0  0  0  1 10  2 11]]


In [18]:
# Same target length as the longest sentence, but pad at the END of each row
# ("post") instead of the beginning.
MAX_SEQUENCE_LENGTH = 6
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
)
print(data)

[[ 1  3  4  5  6  7]
 [ 8  2  9  0  0  0]
 [ 1 10  2 11  0  0]]


In [20]:
# maxlen shorter than the longest sentence: rows that are too long are cut.
# The default truncating mode is "pre", i.e. tokens are dropped from the
# FRONT of the sequence and the last 4 tokens are kept.
data = pad_sequences(sequences, maxlen=4, truncating='pre')
print(data)

[[ 4  5  6  7]
 [ 0  8  2  9]
 [ 1 10  2 11]]


In [21]:
# Same maxlen, but truncate at the END ("post"): the first 4 tokens of an
# over-long row are kept. Padding is still applied at the front ("pre").
data = pad_sequences(
    sequences,
    maxlen=4,
    padding='pre',
    truncating='post',
)
print(data)

[[ 1  3  4  5]
 [ 0  8  2  9]
 [ 1 10  2 11]]


In [None]:
# TODO: delete this leftover scratch cell. It used to contain only a bare,
# undefined name, which raises NameError under Restart Kernel -> Run All.