# One-hot encoding with Keras

In [13]:
import tensorflow
from keras.preprocessing.text import Tokenizer

In [3]:
# Two toy sentences used as the corpus for the Keras tokenizer demo.
samples = [
    'The cat sat on the mat.',
    'The dog ate my homework.',
]

In [4]:
# Tokenizer limited via num_words=1000 (per the Keras docs this caps the
# vocabulary by word frequency; it also fixes the width of the matrices
# produced by texts_to_matrix).
tokenizer = Tokenizer(num_words=1000)

In [5]:
# Build the tokenizer's word index from the sample sentences.
tokenizer.fit_on_texts(samples)

In [11]:
# Binary matrix with one row per sample: column k is 1 when the word whose
# index is k occurs in that sample (a per-sample vector, not one vector per word).
sequences = tokenizer.texts_to_matrix(samples, mode='binary')
sequences

array([[0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [10]:
# Mapping from each word seen during fitting to its integer index.
word_index = tokenizer.word_index
word_index

{'the': 1,
 'cat': 2,
 'sat': 3,
 'on': 4,
 'mat': 5,
 'dog': 6,
 'ate': 7,
 'my': 8,
 'homework': 9}

In [8]:
# Report how many distinct words the tokenizer recorded.
n_unique_tokens = len(word_index)
print('Found %s unique tokens.' % n_unique_tokens)

Found 9 unique tokens.


# Word-level one-hot encoding with the hashing trick

Instead of explicitly assigning an index to each word and keeping a reference to these indices in a dictionary, you can hash the words into vectors of a fixed size.
The advantage is that it saves memory, because no word index has to be stored; the disadvantage is that it is susceptible to hash collisions, i.e. two different words may end up with the same hash.

In [18]:
import numpy as np

In [19]:
# Same two toy sentences, redefined so the hashing section runs on its own.
samples = [
    'The cat sat on the mat.',
    'The dog ate my homework.',
]

In [20]:
# Number of hash buckets, i.e. the length of each word vector.
dimensionality = 1000

In [21]:
# Maximum number of words encoded per sample; extra words are dropped.
max_length = 10

In [25]:
# Zero-filled tensor: one slice per sample, one row per word position,
# one column per hash bucket.
results_shape = (len(samples), max_length, dimensionality)
results = np.zeros(results_shape)

In [26]:
import zlib  # stdlib; imported in this cell so the hashing section runs on its own

# Hash every word into one of `dimensionality` buckets and set that bucket to 1
# at the word's position within its sample.
#
# The built-in hash() is salted per process for strings (see PYTHONHASHSEED),
# so it yields different bucket indices after every kernel restart. CRC32 of
# the UTF-8 bytes is deterministic, which keeps the encoding reproducible.
for i, sample in enumerate(samples):
    # Only the first `max_length` words of each sample are encoded.
    for j, word in enumerate(sample.split()[:max_length]):
        index = zlib.crc32(word.encode('utf-8')) % dimensionality
        results[i, j, index] = 1

The words are stored as vectors of size 1000. Note that if you have close to 1000 distinct words (or more), you will see many hash collisions, which will decrease the accuracy of this encoding method.

In [27]:
# Display the encoded tensor (NumPy abbreviates the large array in the output).
results

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])