In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Toy corpus used to fit the tokenizer.
# NOTE: 'progarmming' is misspelled, but the recorded outputs below contain it
# verbatim, so the literal is intentionally left unchanged.
texts = [
    'I really love progarmming',
    'It is often seen as something difficult but beautiful',
    'This is another text!',
    'Do you like something else?'
]

In [3]:
# Build a word -> integer-id vocabulary from `texts`.
# num_words=50 caps how many of the most frequent words are used when encoding;
# this corpus has only 20 distinct words, so the cap has no effect here.
tokenizer = Tokenizer(num_words=50)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
# Encode every sentence as a list of word ids.
sequences = tokenizer.texts_to_sequences(texts)

In [4]:
# Show the learned mapping; words appear lower-cased with punctuation stripped.
print(word_index)

{'is': 1, 'something': 2, 'i': 3, 'really': 4, 'love': 5, 'progarmming': 6, 'it': 7, 'often': 8, 'seen': 9, 'as': 10, 'difficult': 11, 'but': 12, 'beautiful': 13, 'this': 14, 'another': 15, 'text': 16, 'do': 17, 'you': 18, 'like': 19, 'else': 20}


In [5]:
# One list of word ids per sentence, in the same order as `texts`.
print(sequences)

[[3, 4, 5, 6], [7, 1, 8, 9, 10, 2, 11, 12, 13], [14, 1, 15, 16], [17, 18, 19, 2, 20]]


If a word is not in the `word_index` that was built when the tokenizer was fit, it is silently dropped from the encoded sequences.
E.g. "I would like another cup of coffee":
since the words "would", "cup", "of" and "coffee" are not in the `word_index`, the output will be:

In [6]:
# Test sentences; the first contains words the tokenizer never saw
# ('would', 'cup', 'of', 'coffee').
tests = [
    'I would like another cup of coffee',
    'I really like something beautiful'
]
# Encode with the already-fitted tokenizer (it is not re-fit on `tests`).
sequences_test = tokenizer.texts_to_sequences(tests)

In [7]:
# The unseen words are missing from the first encoded sentence.
print(sequences_test)

[[3, 19, 15], [3, 4, 19, 2, 13]]


We don't want to lose words like that, since it can change the meaning of a sentence: above, the first test sentence is reduced to "I like another".
Instead, we would like to replace every unseen word with a special token such as \<UNK\> or \<OOV\>, meaning "unknown" or "out of vocabulary".

In [8]:
# Re-create the tokenizer with an out-of-vocabulary token so that unseen words
# are encoded as '<OOV>' instead of being dropped.
# NOTE: this rebinds `tokenizer` and `word_index`; `sequences` from the earlier
# cell still holds ids from the previous vocabulary.
tokenizer = Tokenizer(num_words=50, oov_token='<OOV>')
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

In [9]:
# Same test sentences as before, now encoded with the OOV-aware tokenizer.
tests = [
    'I would like another cup of coffee',
    'I really like something beautiful'
]
sequences_test = tokenizer.texts_to_sequences(tests)
# Each unseen word now shows up as id 1 ('<OOV>') instead of disappearing.
print(sequences_test)

[[4, 1, 20, 16, 1, 1, 1], [4, 5, 20, 3, 14]]


In [10]:
# '<OOV>' takes id 1, so every other word id is shifted up by one.
print(word_index)

{'<OOV>': 1, 'is': 2, 'something': 3, 'i': 4, 'really': 5, 'love': 6, 'progarmming': 7, 'it': 8, 'often': 9, 'seen': 10, 'as': 11, 'difficult': 12, 'but': 13, 'beautiful': 14, 'this': 15, 'another': 16, 'text': 17, 'do': 18, 'you': 19, 'like': 20, 'else': 21}


In [11]:
padded = pad_sequences(sequences, padding='post', maxlen=10, truncating='post')

In [12]:
padded

array([[ 3,  4,  5,  6,  0,  0,  0,  0,  0,  0],
       [ 7,  1,  8,  9, 10,  2, 11, 12, 13,  0],
       [14,  1, 15, 16,  0,  0,  0,  0,  0,  0],
       [17, 18, 19,  2, 20,  0,  0,  0,  0,  0]], dtype=int32)