In [23]:
from keras.preprocessing.text import hashing_trick, text_to_word_sequence, one_hot, Tokenizer

In [7]:
# Sample sentence reused by the later encoding cells.
text = 'The quick brown fox jumped over the lazy dog.'
# Split the sentence into word tokens; as the displayed result shows, the
# tokens come back lower-cased and without the trailing full stop.
result = text_to_word_sequence(text)
result

['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']

In [16]:
# De-duplicate the tokens: 'the' occurs twice in `result`, so the vocabulary
# is smaller than the token count.
words = set(result)
# Number of distinct words; used below to size the hashing space.
vocab_size = len(words)
words, vocab_size

({'brown', 'dog', 'fox', 'jumped', 'lazy', 'over', 'quick', 'the'}, 8)

In [20]:
# The hashing space is made about 30% larger than the vocabulary
# (vocab_size * 1.3, rounded) to reduce collisions when hashing words.
# This only lowers the chance of a collision: distinct words can still be
# mapped to the same index.
# NOTE(review): one_hot presumably uses Python's built-in hash(), which is
# salted per process for strings, so the indices may change after a kernel
# restart - confirm against the Keras documentation.
one_hot(text, round(vocab_size * 1.3))

[1, 1, 3, 2, 8, 9, 1, 6, 6]

In [24]:
# Encode the same sentence with hashing_trick, using the same space size as
# the one_hot cell but selecting md5 as the hash function explicitly.
# NOTE(review): md5 is expected to give the same indices on every run,
# unlike the default hash - confirm against the Keras documentation.
hashing_trick(text, round(vocab_size * 1.3), hash_function = 'md5')

[6, 4, 1, 2, 7, 5, 6, 2, 6]

In [33]:
# Small corpus of short phrases used to demonstrate the Tokenizer API.
docs = ['Well done!', 'Good work', 'Great effort', 'nice work', 'Excellent!']

# Build the vocabulary and per-word statistics from the corpus.
t = Tokenizer()
t.fit_on_texts(docs)

# Each fitted statistic is paired with the template used to print it, so the
# four reports share a single print statement.
fitted_stats = (
    ('Word counts: \n{}', t.word_counts),
    ('Document count: \n{}', t.document_count),
    ('Word index: \n{}', t.word_index),
    ('Word docs: \n{}', t.word_docs),
)
for template, stat in fitted_stats:
    print(template.format(stat), '\n')

# One row per document and one column per word index; with mode='count' each
# entry holds how many times that word appears in the document.
encoded_docs = t.texts_to_matrix(docs, mode='count')
print('Encoded docs:')
encoded_docs

Word counts: 
OrderedDict([('well', 1), ('done', 1), ('good', 1), ('work', 2), ('great', 1), ('effort', 1), ('nice', 1), ('excellent', 1)]) 

Document count: 
5 

Word index: 
{'work': 1, 'well': 2, 'done': 3, 'good': 4, 'great': 5, 'effort': 6, 'nice': 7, 'excellent': 8} 

Word docs: 
{'well': 1, 'done': 1, 'work': 2, 'good': 1, 'great': 1, 'effort': 1, 'nice': 1, 'excellent': 1} 

Encoded docs:


array([[ 0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]])