In [57]:
# To start with this topic, is very important understand the n-grams method, so, we will see how it works in simple exercise.

import numpy as np # To tokenization of sentences.

samples = ['No he hecho tareas del colegio porque me suda el rabo', 'Arepa rellena']

token_index = {} # Dict to all index tokens.


for sample in samples:
  for word in sample.split():
    if word not in token_index:
      token_index[word] = len(token_index) + 1 # len(token_index) + 1 represents the ID 


print(token_index) # One-Hot encoding (n-bag tokenization, where n is the number of words in samples list).

{'No': 1, 'he': 2, 'hecho': 3, 'tareas': 4, 'del': 5, 'colegio': 6, 'porque': 7, 'me': 8, 'suda': 9, 'el': 10, 'rabo': 11, 'Arepa': 12, 'rellena': 13}


In [58]:
max_length = 10 # idk.


results = np.zeros(shape = (len(samples), 
                   max_length, max(token_index.values()) + 1 )) # We crate an array of zeros.

# print(results.shape) # (2, 10, 14).



for i, sample in  enumerate(samples):
  for j, word in list(enumerate(sample.split()))[:max_length:]: # In each sentence, We get only the first 10 words (from index 0)
    index = token_index.get(word) # Get index.
    results[i, j, index] = 1
    


# print(results) # Return numpy array.

In [80]:
# We can this do this to tokenize characters. We will try.

import string # Standard library from python.

samples = ['The cat sat on the mat.', 'The dog ate my homework.']
characters = string.printable


token_index = dict(zip(range(1, len(characters) + 1), characters))
print(token_index) # Set index to dict of characters.

max_length = 50


results = np.zeros((len(samples), max_length, max(token_index.keys()) + 1)) # We create a numpy array.
print(results.shape)

for i, sample in enumerate(samples):
  for j, character in enumerate(sample):
    index = token_index.get(character)
    results[i, j, index] = 1


{1: '0', 2: '1', 3: '2', 4: '3', 5: '4', 6: '5', 7: '6', 8: '7', 9: '8', 10: '9', 11: 'a', 12: 'b', 13: 'c', 14: 'd', 15: 'e', 16: 'f', 17: 'g', 18: 'h', 19: 'i', 20: 'j', 21: 'k', 22: 'l', 23: 'm', 24: 'n', 25: 'o', 26: 'p', 27: 'q', 28: 'r', 29: 's', 30: 't', 31: 'u', 32: 'v', 33: 'w', 34: 'x', 35: 'y', 36: 'z', 37: 'A', 38: 'B', 39: 'C', 40: 'D', 41: 'E', 42: 'F', 43: 'G', 44: 'H', 45: 'I', 46: 'J', 47: 'K', 48: 'L', 49: 'M', 50: 'N', 51: 'O', 52: 'P', 53: 'Q', 54: 'R', 55: 'S', 56: 'T', 57: 'U', 58: 'V', 59: 'W', 60: 'X', 61: 'Y', 62: 'Z', 63: '!', 64: '"', 65: '#', 66: '$', 67: '%', 68: '&', 69: "'", 70: '(', 71: ')', 72: '*', 73: '+', 74: ',', 75: '-', 76: '.', 77: '/', 78: ':', 79: ';', 80: '<', 81: '=', 82: '>', 83: '?', 84: '@', 85: '[', 86: '\\', 87: ']', 88: '^', 89: '_', 90: '`', 91: '{', 92: '|', 93: '}', 94: '~', 95: ' ', 96: '\t', 97: '\n', 98: '\r', 99: '\x0b', 100: '\x0c'}
(2, 50, 101)


In [None]:
# Also we will do this with help from keras. Everything is most easier with this.

In [87]:
from keras.preprocessing.text import Tokenizer # The tokenizator from Keras.


samples = ['No he hecho tareas del colegio porque me suda el rabo', 'Arepa rellena']


tokenizer = Tokenizer(num_words = 1000) # The tokenizator take the 1000 words most commons into dataset.
tokenizer.fit_on_texts(samples) # Building the word index.


sequences = tokenizer.texts_to_sequences(samples) # Change each word in list to integer indices.
print(sequences) 


one_hot_results = tokenizer.texts_to_matrix(samples, mode = 'binary') # Convert the text to matrix. With dis we will make a model train.
print(one_hot_results)


word_index = tokenizer.word_index
print(word_index) # We built a word index (dictionary) most easy with Keras. The same.



[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [12, 13]]
[[0. 1. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
{'no': 1, 'he': 2, 'hecho': 3, 'tareas': 4, 'del': 5, 'colegio': 6, 'porque': 7, 'me': 8, 'suda': 9, 'el': 10, 'rabo': 11, 'arepa': 12, 'rellena': 13}


In [95]:
# One hot encoding (to word level) with hashing trick.
samples = ['Hoy iré a comprarme una empanada ranchera.', 'Mi papá se fué a comprar cigarros.']

dimensionality = 1000 # Variable to set array dimension. If we have lower dimensionality (or our words index closer to 1000),
# it's very likely that the accuracy of this method will decrease considerably.

max_length = 10

results = np.zeros((len(samples), max_length, dimensionality))

for i, sample in enumerate(samples):
  for j, word in list(enumerate(sample.split()))[:max_length:]:
    index = abs(hash(word)) % dimensionality # We hash word to radom integer index (from 0 to 1000, in this case).


821
993
266
557
621
936
146
205
321
741
188
266
597
249
