# working with text data

### word_level one-hot encoding

In [26]:
import numpy as np

In [16]:
example = ['The cat sat on the mat.', 'The dog ate my homework.']

In [5]:
token_index={}
for sample in example:
    for word in sample.split():
        token_index[word] = len(token_index) + 1

In [10]:
token_index.keys()

dict_keys(['The', 'cat', 'sat', 'on', 'the', 'mat.', 'dog', 'ate', 'my', 'homework.'])

In [11]:
token_index.get('The') # this returns the word index for a specific word

7

In [19]:
token_index.values() # return all the indices of the words, each word has an unique index 

dict_values([7, 2, 3, 4, 5, 6, 7, 8, 9, 10])

In [21]:
max_length = 10
results = np.zeros(shape = (len(example), max_length, max(token_index.values())+1))
for i, sample in enumerate(example):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = token_index.get(word)
        results[i, j, index] = 1

In [23]:
results.shape

(2, 10, 11)

In [18]:
for i, sample in enumerate(example):
    print(list(enumerate(sample.split()))[:10])

[(0, 'The'), (1, 'cat'), (2, 'sat'), (3, 'on'), (4, 'the'), (5, 'mat.')]
[(0, 'The'), (1, 'dog'), (2, 'ate'), (3, 'my'), (4, 'homework.')]


### character_level one-hot encoding

In [1]:
import string

In [2]:
samples = ['The cat sat on the mat', 'The dog at my homework']

In [3]:
characters = string.printable

In [4]:
characters

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [5]:
token_index = dict(zip(range(1, len(characters)+1), characters))

In [22]:
token_index.get(16)

'f'

In [24]:
token_index.keys()

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100])

In [23]:
max_length = 50

In [56]:
result2 = np.zeros(shape=(len(samples), max_length, max(token_index.keys())+1))
for i, sample in enumerate(samples):
    for j, characters in enumerate(sample):
        index = token_index.get(characters)
        result2[i, j, index] = 1

In [58]:
result2.shape

(2, 50, 101)

### using keras for word-level one-hot encoding

In [65]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [66]:
samples = ['The cat sat on the mat', 'The dog at my homework']

In [67]:
tokenizer = Tokenizer(num_words=1000)

In [68]:
tokenizer.fit_on_texts(samples)

In [70]:
sequences = tokenizer.texts_to_sequences(samples)

In [71]:
sequences

[[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]

In [78]:
tokenizer.index_word

{1: 'the',
 2: 'cat',
 3: 'sat',
 4: 'on',
 5: 'mat',
 6: 'dog',
 7: 'at',
 8: 'my',
 9: 'homework'}

In [79]:
result = tokenizer.texts_to_matrix(samples, mode='binary')

In [81]:
result.shape

(2, 1000)

In [88]:
word_index = tokenizer.word_index

In [90]:
print('Found %s unique tokens.' % len(word_index))

Found 9 unique tokens.


In [89]:
print(word_index)

{'the': 1, 'cat': 2, 'sat': 3, 'on': 4, 'mat': 5, 'dog': 6, 'at': 7, 'my': 8, 'homework': 9}


### Word-level one-hot encoding with hashing trick (toy example)

In [91]:
samples = ['The cat sat on the mat', 'The dog at my homework']

In [92]:
dimensionality = 1000

In [94]:
max_length = 10

In [97]:
results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = abs(hash(word))% dimensionality
        results[i, j, index] = 1

In [99]:
results.shape

(2, 10, 1000)