In [1]:
# Artificial neural networks can only process tensors (multidimensional arrays),
# so in order to process text we first need to transform it into tensors.
# We can do so by:
#   - One-hot encoding the text (sparse vectors, mostly full of 0s, high-dimensional: 1 unique word = 1 additional dimension, hardcoded)
#   - Word embedding (dense vectors, fewer dimensions - often 256/512/1024, learned from data)

In [2]:
import numpy as np

In [3]:
# Example from scratch: word-level one-hot encoding
samples = ["The cat sat on the mat.", "The dog ate my homework."]

# Give every distinct whitespace-separated token its own integer index.
# Indices start at 1, so index 0 is never assigned to a word.
token_index = {}
for word in (token for text in samples for token in text.split()):
  token_index.setdefault(word, len(token_index) + 1)

# Only the first `max_length` words of each sample are encoded.
max_length = 10

# Shape: (number of samples, max_length, highest word index + 1).
results = np.zeros((len(samples), max_length, 1 + max(token_index.values())))

# Set a single 1 per (sample, word position) at the word's index.
for i, sample in enumerate(samples):
  for j, word in enumerate(sample.split()[:max_length]):
    index = token_index[word]
    results[i, j, index] = 1

In [4]:
# Example using the Keras API
from keras.preprocessing.text import Tokenizer

In [5]:
# Create a tokenizer configured to take only the 1000 most common words into account
tokens = Tokenizer(num_words=1000)
# Build the word index from the example sentences
tokens.fit_on_texts(samples)

In [6]:
# Turn each string into a list of integer word indices
sequences = tokens.texts_to_sequences(samples)

In [7]:
# One-hot encode every text in `samples` in binary mode.
# The full `samples` list must be passed here: a single string (such as a
# leftover `sample` loop variable) would be iterated character by character
# and encoded as a series of one-character texts.
one_hot_results = tokens.texts_to_matrix(samples, mode='binary')

In [8]:
# Retrieve the word index computed by the tokenizer and report how many entries it holds
word_index = tokens.word_index
print("found", len(word_index), "unique tokens")

found 9 unique tokens


In [9]:
# If there are too many unique words and memory is an issue, we can "hash" the words instead (but hash collisions may occur)

In [10]:
# Using the Embedding transformation
from keras.layers import Embedding

In [11]:
# Maps integers to vectors:
# word index => Embedding layer => word vector

embeding_layer = Embedding(input_dim=1000, output_dim=64)

In [12]:
# Bare expression: the notebook displays the layer object's repr as the cell output
embeding_layer

<keras.layers.embeddings.Embedding at 0x7fa7ef733150>