<a href="https://colab.research.google.com/github/GaborVxxx/ml_notes/blob/main/TextProcessingWithTensorFlowKeras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
# Toy corpus for the tokenizer demo: four short sentences that share most of
# their words but differ in casing ('You' / 'you') and punctuation ('!', '?').
sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?'
]

In [7]:
# Build the vocabulary from the corpus, then encode each sentence as a list of
# integer word indices.
# NOTE(review): recent TensorFlow docs appear to mark Tokenizer as deprecated in
# favour of the TextVectorization layer - confirm against the installed version.
MAX_VOCAB_SIZE = 20000  # passed as num_words; far more than the 10 distinct words in this corpus
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)  # learns the word -> index mapping from the corpus
sequences = tokenizer.texts_to_sequences(sentences)  # one list of ints per sentence

In [8]:
# One inner list per sentence; the lists are ragged (lengths 4, 4, 4 and 7),
# which is why padding is needed before they can form a single array.
print(sequences)

[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]


In [9]:
# Word -> index mapping learned by fit_on_texts. As the output shows, the keys
# are lower-cased and carry no punctuation, and indices start at 1 (so 0 stays
# free for padding). 'my', which occurs in every sentence, gets index 1;
# the ordering looks frequency-based.
tokenizer.word_index

{'my': 1,
 'love': 2,
 'dog': 3,
 'i': 4,
 'you': 5,
 'cat': 6,
 'do': 7,
 'think': 8,
 'is': 9,
 'amazing': 10}

In [10]:
# Pad every sequence to the length of the longest one (7 here) using the
# defaults: the padding value is 0 and it is added at the front of each row.
data = pad_sequences(sequences)
print(data)  # an equal-length 2-D array, one row per sentence (a dense array, not a sparse matrix)

[[ 0  0  0  4  2  1  3]
 [ 0  0  0  4  2  1  6]
 [ 0  0  0  5  2  1  3]
 [ 7  5  8  1  3  9 10]]


In [12]:
# Cap the row length with maxlen: shorter sequences are still zero-padded at
# the front, longer ones are truncated.
MAX_SEQUENCE_LENGTH = 5
data_c = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print(data_c)  # the 7-token sentence lost its first two tokens (7, 5): by default the start is cut off

[[ 0  4  2  1  3]
 [ 0  4  2  1  6]
 [ 0  5  2  1  3]
 [ 8  1  3  9 10]]


In [13]:
# padding='post' only moves the zero padding to the end of each row. It does
# not change truncation: the long sentence is still cut at the start
# (last row is [8 1 3 9 10], tokens 7 and 5 are gone).
data_d = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
print(data_d)  # zeros are now appended at the end; truncation still removes the start of the list

[[ 4  2  1  3  0]
 [ 4  2  1  6  0]
 [ 5  2  1  3  0]
 [ 8  1  3  9 10]]


In [15]:
# maxlen (16) far larger than the longest sentence (7 tokens): every row ends
# up mostly leading zeros.
# Note: this rebinds `data`, replacing the array from the default-padding cell.
data = pad_sequences(sequences, maxlen=16)
print(data)

[[ 0  0  0  0  0  0  0  0  0  0  0  0  4  2  1  3]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  4  2  1  6]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  5  2  1  3]
 [ 0  0  0  0  0  0  0  0  0  7  5  8  1  3  9 10]]


In [16]:
# maxlen (4) shorter than the longest sentence: with the default truncation
# the tokens are dropped from the start, so the last row keeps only its final
# four tokens (1, 3, 9, 10).
data = pad_sequences(sequences, maxlen=4)
print(data)

[[ 4  2  1  3]
 [ 4  2  1  6]
 [ 5  2  1  3]
 [ 1  3  9 10]]


In [17]:
# truncating='post' drops tokens from the end instead of the start, so the
# last row keeps its first four tokens (7, 5, 8, 1).
data = pad_sequences(sequences, maxlen=4, truncating='post')
print(data)

[[4 2 1 3]
 [4 2 1 6]
 [5 2 1 3]
 [7 5 8 1]]
