In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

# Tiny training set: three short sentences that share most of their words.
sentences = [
    "I love my dog",
    "I love my cat",
    "Is it sunny today?",
]

# num_words caps how many of the most frequent words are used when texts are
# converted to sequences; fit_on_texts builds the vocabulary from `sentences`.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# Vocabulary (not the corpus itself): maps each word seen while fitting to an
# integer index. The printed keys are lower-cased and carry no punctuation.
word_index = tokenizer.word_index

# Encode every sentence as the list of indices of its words.
sequences = tokenizer.texts_to_sequences(sentences)

print(word_index, sequences, sep="\n")

{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5, 'is': 6, 'it': 7, 'sunny': 8, 'today': 9}
[[1, 2, 3, 4], [1, 2, 3, 5], [6, 7, 8, 9]]


---

### OOV (Out-of-vocabulary Token)

In [4]:
# Sentences that contain words absent from the sentences the tokenizer was fitted on.
test_data = ["Today is the snowy day", "Will it be rainy tomorrow?"]

# `tokenizer` comes from an earlier cell and has no OOV token configured, so
# words it has never seen are silently dropped from the encoded output.
test_sequences = tokenizer.texts_to_sequences(test_data)
print(test_sequences)

[[9, 6], [7]]


จะเห็นว่าคำที่ไม่อยู่ในคลังคำศัพท์ (เช่น the, snowy, day) หายไปจากผลลัพธ์ เพราะ Tokenizer ไม่รู้จักคำเหล่านั้น เราจึงกำหนด OOV token เพื่อแทนตำแหน่งของคำที่หายไป ซึ่งช่วยคงความยาวและบริบทของประโยคไว้

In [6]:
# Rebuild the tokenizer with a placeholder token for out-of-vocabulary words,
# then refit it on the same training sentences.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=100)
tokenizer.fit_on_texts(sentences)

# Unknown words are now encoded as the '<OOV>' index instead of being dropped,
# so each encoded sentence keeps one entry per word.
test_sequences = tokenizer.texts_to_sequences(test_data)

# The refitted vocabulary also contains the '<OOV>' entry.
word_index = tokenizer.word_index

print(test_sequences, word_index, sep="\n")

[[10, 7, 1, 1, 1], [1, 8, 1, 1, 1]]
{'<OOV>': 1, 'i': 2, 'love': 3, 'my': 4, 'dog': 5, 'cat': 6, 'is': 7, 'it': 8, 'sunny': 9, 'today': 10}


### Padding: เวลาเทรนโมเดล เรามักต้องการให้ทุก sequence มีความยาว (จำนวน token) เท่ากัน

In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sentences of different lengths, so the encoded sequences need padding.
sentences = [
    "Today is a sunny day",
    "Today is a rainy day",
    "Is it sunny today?",
    "I really enjoyed walking in the snow today",
]

# Fresh tokenizer (with an OOV placeholder) fitted on the new sentences.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=100)
tokenizer.fit_on_texts(sentences)

sequences = tokenizer.texts_to_sequences(sentences)
print("Before padded : ", sequences, sep="\n")

# Default padding: zeros are added at the front of the shorter sequences so
# that every row is as long as the longest sequence.
padded = pad_sequences(sequences)
print("After padded : ", padded, sep="\n")

# padding='post': zeros are added at the end instead.
padded = pad_sequences(sequences, padding="post")
print("After padded (post) : ", padded, sep="\n")

Before padded : 
[[2, 3, 4, 5, 6], [2, 3, 4, 7, 6], [3, 8, 5, 2], [9, 10, 11, 12, 13, 14, 15, 2]]
After padded : 
[[ 0  0  0  2  3  4  5  6]
 [ 0  0  0  2  3  4  7  6]
 [ 0  0  0  0  3  8  5  2]
 [ 9 10 11 12 13 14 15  2]]
After padded (post) : 
[[ 2  3  4  5  6  0  0  0]
 [ 2  3  4  7  6  0  0  0]
 [ 3  8  5  2  0  0  0  0]
 [ 9 10 11 12 13 14 15  2]]
After padded (post, maxlen=5) : 
[[ 2  3  4  5  6]
 [ 2  3  4  7  6]
 [ 3  8  5  2  0]
 [12 13 14 15  2]]


In [14]:
# Zeros at the end, and every row limited to 5 entries.
padded = pad_sequences(sequences, maxlen=5, padding="post")
print("After padded (post, maxlen=5) : ", padded, sep="\n")

# Problem: by default, sequences longer than maxlen lose entries from the
# front, but here we want to cut from the end -> truncating='post'.
padded = pad_sequences(sequences, maxlen=5, padding="post", truncating="post")
print("After padded (post, maxlen=5, truncating=post) : ", padded, sep="\n")

After padded (post, maxlen=5) : 
[[ 2  3  4  5  6]
 [ 2  3  4  7  6]
 [ 3  8  5  2  0]
 [12 13 14 15  2]]
After padded (post, maxlen=5, truncating=post) : 
[[ 2  3  4  5  6]
 [ 2  3  4  7  6]
 [ 3  8  5  2  0]
 [ 9 10 11 12 13]]
