In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Sample corpus: two short sentences that share the word 'is'
corpus = [
    'coffee is hot',
    'water is cold',
]

# Create a tokenizer with default settings and learn the vocabulary
tok = Tokenizer()
tok.fit_on_texts(corpus)

# Display the learned word -> integer mapping.
# Indices start at 1 and follow descending frequency: 'is' occurs twice and
# gets index 1; the remaining words occur once each and are numbered in
# order of first appearance, so 'cold' ends up with index 5.
print(tok.word_index)

{'is': 1, 'coffee': 2, 'hot': 3, 'water': 4, 'cold': 5}


In [3]:
# Encode new sentences using the vocabulary learned from `corpus`
sequences = tok.texts_to_sequences(['water is hot', 'black coffee is cold'])
print(sequences)  # 'black' is dropped: no oov_token was set and it is not in the vocabulary

[[4, 1, 3], [2, 1, 5]]


**2. Handling Out-of-Vocabulary (OOV) Words**

Words not seen during training (e.g., black in the example above) are ignored by default. To handle such cases, use the oov_token parameter:

In [4]:
# Re-create the tokenizer, reserving '<OOV>' as the placeholder for unknown words
tok = Tokenizer(oov_token='<OOV>')

# Learn the vocabulary from the same corpus as before
tok.fit_on_texts(corpus)

# Show the mapping: the OOV token takes index 1, shifting every real word up by one
print(tok.word_index)

{'<OOV>': 1, 'is': 2, 'coffee': 3, 'hot': 4, 'water': 5, 'cold': 6}


Now, unseen words like black are replaced with the OOV token:

In [6]:
# Encode the same two sentences again, this time with the OOV-aware tokenizer
sequences = tok.texts_to_sequences(['water is hot', 'black coffee is cold'])
print(sequences)  # 'black' → 1 (the '<OOV>' index) instead of being dropped

[[5, 2, 4], [1, 3, 2, 6]]


**3. Limiting Vocabulary Size**

Large corpora can lead to massive vocabularies. Use num_words to restrict the number of tokens:

In [7]:
# Limit encoding to the top 5 most frequent words: index 0 is reserved, so
# num_words=6 keeps only words whose index is below 6 (i.e. indices 1-5).
tok = Tokenizer(num_words=6)
tok.fit_on_texts(corpus)

# word_index is not truncated by num_words; the cap is applied only when
# texts are converted to sequences. This corpus has just 5 distinct words,
# so the limit does not remove anything here.
print(tok.word_index)

{'is': 1, 'coffee': 2, 'hot': 3, 'water': 4, 'cold': 5}


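When converting text, only the `num_words - 1` most frequent words are retained (index 0 is reserved, so `num_words=6` keeps the top 5). Note that `word_index` itself always lists every word seen during fitting; the limit is applied only at conversion time: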

In [9]:
# Encode with the size-limited tokenizer; all 5 corpus words fall within the limit
sequences = tok.texts_to_sequences(['water is hot', 'black coffee is cold'])
print(sequences)  # 'black' is dropped because it was never in the corpus (no oov_token), not because of num_words

[[4, 1, 3], [2, 1, 5]]
