Using Keras for word-level one-hot encoding

Source: https://freecontent.manning.com/deep-learning-for-text/

In [10]:
from keras.preprocessing.text import Tokenizer
  
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Vocabulary cap handed to the tokenizer. Keras reserves index 0, so only
# word indices 1 .. MAX_WORDS - 1 (i.e. the MAX_WORDS - 1 most frequent
# words) are kept when texts are converted to sequences or matrices.
MAX_WORDS = 1000
tokenizer = Tokenizer(num_words=MAX_WORDS)

# Build the word index from the sample texts
# (lower index = more frequent word).
tokenizer.fit_on_texts(samples)

# Turn each string into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(samples)

# Format each message into a single string before printing: passing several
# arguments to print() displays a tuple under Python 2, so the output would
# otherwise differ between Python 2 and Python 3.
print('samples: {}'.format(samples))
print('sequences: {}'.format(sequences))

# Get the one-hot binary representation directly: one row per sample, with
# a 1 in every column whose index is a word present in that sample.
# Vectorization modes other than 'binary' are also supported
# ('count', 'freq', 'tfidf').
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
print('one_hot_results: {}'.format(one_hot_results))

# Recover the word index (word -> integer index) that was computed by
# fit_on_texts.
word_index = tokenizer.word_index
print('Found {} unique tokens.'.format(len(word_index)))

('samples: ', ['The cat sat on the mat.', 'The dog ate my homework.'])
('sequences: ', [[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]])
('one_hot_results: ', array([[0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]]))
Found 9 unique tokens.
