In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# List whatever is available in the Kaggle input directory.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

[]


In [2]:
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import hashing_trick
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [3]:
# Sample sentence reused by the encoding demos in the following cells.
text = "The quick brown fox jumped over the lazy dog."

# Split the sentence into word tokens; the recorded output shows the tokens
# lower-cased and with the trailing period removed.
result = text_to_word_sequence(text)
print(result)

['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']


In [4]:
# Collect the distinct tokens: 'the' appears twice in the sentence, so the
# 9 tokens collapse to 8 unique words.
words = {token for token in text_to_word_sequence(text)}
vocab_size = len(words)
print(vocab_size)

8


## One Hot Encoding

In [5]:
# Recompute the vocabulary size from the sample sentence.
words = {token for token in text_to_word_sequence(text)}
vocab_size = len(words)
print(vocab_size)

# Hash into a space about 30% larger than the vocabulary (factor 1.3) to
# reduce collisions. Collisions can still happen: in the recorded run two
# different words received the same id.
# NOTE: despite its name, one_hot returns a list of integer ids, not one-hot
# vectors. Per the Keras docs it hashes with Python's built-in `hash`, which
# is not stable across interpreter runs, so the ids may differ on re-run.
hash_space = round(vocab_size * 1.3)
result = one_hot(text, hash_space)
print(result)

8
[2, 7, 5, 7, 4, 6, 2, 3, 9]


## Hashing Trick (better than one-hot encoding: it does not need a stored word-to-index mapping, so it uses less memory and is also faster)

In [6]:
# Recompute the vocabulary size from the sample sentence.
words = {token for token in text_to_word_sequence(text)}
vocab_size = len(words)
print(vocab_size)

# Hash into a space about 30% larger than the vocabulary (factor 1.3) to
# reduce collisions; the recorded output still shows distinct words sharing
# an id. Per the Keras docs, 'md5' is a stable hash function, so the ids are
# consistent across runs.
hash_space = round(vocab_size * 1.3)
result = hashing_trick(text, hash_space, hash_function="md5")
print(result)

8
[6, 4, 1, 2, 7, 5, 6, 2, 6]


## Tokenizer API (Preferred for large dataset)

In [7]:
# Five short documents used to fit the tokenizer.
docs = [
    "Well done!",
    "Good work",
    "Great effort",
    "nice work",
    "Excellent!",
]

# Create a Tokenizer with default settings and fit it on the documents so
# its word statistics can be inspected afterwards.
t = Tokenizer()
t.fit_on_texts(docs)

Once fit, the Tokenizer provides 4 attributes that you can use to query what has been learned about your documents:

- word_counts: A dictionary of words and their counts.
- word_docs: A dictionary of words and how many documents each appeared in.
- word_index: A dictionary of words and their uniquely assigned integers.
- document_count: An integer count of the total number of documents that were used to fit the Tokenizer.

In [8]:
# Print each statistic exposed by the fitted tokenizer, one per line.
# The labels already end in a space and print() inserts its own separator,
# which is why the output shows two spaces after each colon.
for label, learned in (
    ("Word Counts: ", t.word_counts),
    ("Document Count: ", t.document_count),
    ("Word Index: ", t.word_index),
    ("Word Docs: ", t.word_docs),
):
    print(label, learned)

Word Counts:  OrderedDict([('well', 1), ('done', 1), ('good', 1), ('work', 2), ('great', 1), ('effort', 1), ('nice', 1), ('excellent', 1)])
Document Count:  5
Word Index:  {'work': 1, 'well': 2, 'done': 3, 'good': 4, 'great': 5, 'effort': 6, 'nice': 7, 'excellent': 8}
Word Docs:  defaultdict(<class 'int'>, {'done': 1, 'well': 1, 'good': 1, 'work': 2, 'effort': 1, 'great': 1, 'nice': 1, 'excellent': 1})


## The texts_to_matrix() function
This function provides a suite of standard bag-of-words model text encoding schemes that can be provided via a mode argument to the function.
The modes available include:
- `binary`: Whether or not each word is present in the document. This is the default.
- `count`: The count of each word in the document.
- `tfidf`: The Term Frequency-Inverse Document Frequency (TF-IDF) scoring for each word in the document.
- `freq`: The frequency of each word as a ratio of words within each document.

In [9]:
# Encode the documents as a bag-of-words count matrix: one row per document
# and one column per word index. Column 0 stays all zeros in the output
# because the tokenizer's word indices start at 1.
encoded_docs = t.texts_to_matrix(docs, mode="count")
print(encoded_docs)

[[0. 0. 1. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]]
