In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Small toy corpus used to fit the tokenizer; mixed casing and trailing
# punctuation are kept on purpose so their handling shows up downstream.
train_sentences = [
    "It will rain",
    "The weather is cloudy!",
    "Will it be raining today?",
    "It is a super hot day!",
]

In [3]:
# Configure the tokenizer: '<oov>' is the placeholder token for words that
# are outside the vocabulary, and num_words=50 sets the vocabulary size limit
# applied when texts are converted to sequences (see the Keras Tokenizer docs).
tokenizer = Tokenizer(
    oov_token='<oov>',
    num_words=50,
)

In [4]:
# Fit the tokenizer on the training sentences. The call updates `tokenizer`
# itself (its return value is not used); the vocabulary it builds is read
# back through `tokenizer.word_index` in a later cell.
tokenizer.fit_on_texts(train_sentences)

In [5]:
# Keep a reference to the fitted word -> integer index mapping.
# In the recorded output the keys are lowercased words without punctuation,
# and the '<oov>' placeholder holds index 1.
word_index = tokenizer.word_index

In [6]:
# Encode every training sentence as a list of integer word indices
# (one inner list per sentence, so lengths differ between sentences).
sequences = tokenizer.texts_to_sequences(train_sentences)

In [7]:
# Show the vocabulary, the raw sentences and their encoded form,
# one labelled line each.
print(
    f"Word Indexes: {word_index}",
    f"Sentences: {train_sentences}",
    f"Sequences: {sequences}",
    sep="\n",
)

Word Indexes: {'<oov>': 1, 'it': 2, 'will': 3, 'is': 4, 'rain': 5, 'the': 6, 'weather': 7, 'cloudy': 8, 'be': 9, 'raining': 10, 'today': 11, 'a': 12, 'super': 13, 'hot': 14, 'day': 15}
Sentences: ['It will rain', 'The weather is cloudy!', 'Will it be raining today?', 'It is a super hot day!']
Sequences: [[2, 3, 5], [6, 7, 4, 8], [3, 2, 9, 10, 11], [2, 4, 12, 13, 14, 15]]


In [8]:
# Pad the variable-length sequences into a rectangular array using the
# default settings. In the recorded output, zeros are added at the front and
# every row is as long as the longest sequence (6 tokens).
padded_sequences = pad_sequences(sequences)

In [11]:
# Show the raw sequences next to their padded form for comparison.
# In the recorded output the shorter sequences gain leading zeros so that
# every row has the same length as the longest sequence.
print(f"Sequences: {sequences}")
print(f"Padded Sequences: {padded_sequences}")

Sequences: [[2, 3, 5], [6, 7, 4, 8], [3, 2, 9, 10, 11], [2, 4, 12, 13, 14, 15]]
Padded Sequences: [[ 0  0  0  2  3  5]
 [ 0  0  6  7  4  8]
 [ 0  3  2  9 10 11]
 [ 2  4 12 13 14 15]]


In [12]:
# Pad again with explicit options: every row is forced to length 5, zeros are
# appended at the end (padding='post'), and sequences longer than 5 lose
# their trailing tokens (truncating='post').
padded_sequences_mod = pad_sequences(
    sequences,
    maxlen=5,
    padding='post',
    truncating='post',
)

In [13]:
# Compare default padding with the post-padded / post-truncated variant.
print(
    f"Padded Sequences: {padded_sequences}",
    f"Padded Sequences Mod: {padded_sequences_mod}",
    sep="\n",
)

Padded Sequences: [[ 0  0  0  2  3  5]
 [ 0  0  6  7  4  8]
 [ 0  3  2  9 10 11]
 [ 2  4 12 13 14 15]]
Padded Sequences Mod: [[ 2  3  5  0  0]
 [ 6  7  4  8  0]
 [ 3  2  9 10 11]
 [ 2  4 12 13 14]]
