In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
# Small toy corpus with mixed casing and punctuation.
sentences = [
    "i love my dog",
    "I, love my cat",
    "You love my dog!",
]

In [4]:
# num_words caps the vocabulary by word frequency: per the Keras docs, only the
# most common (num_words - 1) words are kept when texts are turned into sequences.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

In [5]:
# word_index maps each word seen while fitting to an integer index
print(tokenizer.word_index)

{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}


In [10]:
# Next step: represent whole sentences through their word indexes.
# Punctuation is dropped during tokenization, so "dog" and "dog!" end up as the same word.
sentences = [
    "I love my dog",
    "I love my cat",
    "You love my dog!",
    "Do you think my dog is amazing?",
]

In [11]:
# Build a fresh tokenizer and fit it on the current sentences.
# No oov_token is set here, so out-of-vocabulary words get no index of their own.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)
print(tokenizer.word_index)

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}


In [15]:
# convert every sentence into the list of its word indexes
tokenized_sentences = tokenizer.texts_to_sequences(sentences)
print(tokenized_sentences)

[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]


In [18]:
# Try a sentence that contains a word the tokenizer was not fitted on.
# texts_to_sequences expects a list of texts, hence the one-element list.
test_sentence = ["I love my car"]
# "car" is missing from the vocabulary, so the resulting sequence has only 3 indexes.
print(tokenizer.texts_to_sequences(test_sentence))

[[4, 2, 1]]


In [22]:
# OOV (out-of-vocabulary) token:
# with oov_token set, the tokenizer adds an "<OOV>" entry to its vocabulary and
# uses that single index in place of every word it does not know.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")

In [25]:
sentences = [
    "I love my dog",
    "I love my cat",
    "You love my dog!",
    "Do you think my dog is amazing?",
]
# Fit the tokenizer on the corpus, then show the index -> word mapping.
tokenizer.fit_on_texts(sentences)
print(tokenizer.index_word)

{1: '<OOV>', 2: 'my', 3: 'love', 4: 'dog', 5: 'i', 6: 'you', 7: 'cat', 8: 'do', 9: 'think', 10: 'is', 11: 'amazing'}


In [26]:
# "car" is unknown, so it is encoded as 1, the index of the <OOV> token
print(tokenizer.texts_to_sequences(test_sentence))

[[5, 3, 2, 1]]


In [27]:
# every unknown word is encoded with the same <OOV> index
print(tokenizer.texts_to_sequences(["I love my Mitsubishi"]))

[[5, 3, 2, 1]]


In [29]:
# padding
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [31]:
sequences = tokenizer.texts_to_sequences(sentences)
# by default every sequence is padded to the length of the longest one,
# with the zeros added at the front
padded_sequences = pad_sequences(sequences)
print(padded_sequences)

[[ 0  0  0  5  3  2  4]
 [ 0  0  0  5  3  2  7]
 [ 0  0  0  6  3  2  4]
 [ 8  6  9  2  4 10 11]]


In [None]:
# padding parameters
# padding="post" adds the padding at the end of each sequence instead of the start
# maxlen=10 sets the length of every padded sequence to 10
# truncating="pre" or "post" chooses which end is cut off when a sequence is longer than maxlen