In [31]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## 1- Normalization

In [28]:
# Raw input: four short sentences with mixed casing and punctuation.
text = (
    "I like my brother. I like my sister. you like my brother!. "
    "Do you like my brother?"
)

# Normalization step: lowercase everything so "I" and "i" are the same word.
text_normalized = text.lower()
print(text_normalized)

i like my brother. i like my sister. you like my brother!. do you like my brother?


## 2- Tokenization

In [29]:
# Break the normalized text into sentences; ". " (period + space) is the
# delimiter, so a trailing "!" or "?" stays attached to its sentence.
sentences = text_normalized.split(sep='. ')
print(sentences)

# Build the Keras tokenizer; num_words caps how many of the most frequent
# words are used when texts are later converted to sequences.
tokenizer = Tokenizer(num_words=100)

# Learn the vocabulary (word -> integer index) from the sentences.
tokenizer.fit_on_texts(sentences)

# Show the learned mapping; indices start at 1 and the most frequent
# word gets the lowest index.
print(tokenizer.word_index)


['i like my brother', 'i like my sister', 'you like my brother!', 'do you like my brother?']
{'like': 1, 'my': 2, 'brother': 3, 'i': 4, 'you': 5, 'sister': 6, 'do': 7}


## 3- Sequencing

In [30]:
# Encode each sentence as a list of integer word indices, using the
# vocabulary learned by the tokenizer.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[4, 1, 2, 3], [4, 1, 2, 6], [5, 1, 2, 3], [7, 5, 1, 2, 3]]


In [39]:
# Zero-padding: bring every sequence to the same length (maxlen).
# Both `padding` and `truncating` accept 'pre' or 'post'; with 'post', zeros
# are appended at the end and over-long sequences are cut at the end.
padded = pad_sequences(
    sequences,
    maxlen=10,
    padding='post',
    truncating='post',
)

print(padded)

[[4 1 2 3 0 0 0 0 0 0]
 [4 1 2 6 0 0 0 0 0 0]
 [5 1 2 3 0 0 0 0 0 0]
 [7 5 1 2 3 0 0 0 0 0]]


## 4- Test: handling out-of-vocabulary words

In [48]:
# Sentences containing words that never appeared in the training sentences.
test_sentence = ["I like my mother",
                 "The weather is good"]

# Build a second tokenizer with an out-of-vocabulary (OOV) token, so unseen
# words map to a reserved index instead of being silently dropped.
# It deliberately gets its own name: the OOV token takes index 1 and shifts
# every other word index by one, so rebinding `tokenizer` here would leave
# `sequences` / `padded` from the earlier cells inconsistent with
# `tokenizer.word_index` and change their values on any re-run.
tokenizer_oov = Tokenizer(num_words=100, oov_token='<oov>')

# Fit on the same training sentences so only the OOV index is new.
tokenizer_oov.fit_on_texts(sentences)

print(tokenizer_oov.word_index)

# The tokenizer lowercases its input by default, so the raw (mixed-case)
# test sentences can be encoded directly; unknown words become the OOV index.
test_sequences = tokenizer_oov.texts_to_sequences(test_sentence)
print(test_sequences)

{'<oov>': 1, 'like': 2, 'my': 3, 'brother': 4, 'i': 5, 'you': 6, 'sister': 7, 'do': 8}
[[5, 2, 3, 1], [1, 1, 1, 1]]
