In [2]:
import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras.preprocessing.text import Tokenizer

In [12]:
# converting text to sequence

sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?'
]

tokenizer = Tokenizer(num_words = 100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)

print(word_index)
print(sequences)

# 미리 토큰화되어 있지 않은 'really', 'loves'와 같은 단어들은 숫자들의 시퀀스에 포함되지 않는다.

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}
[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]


In [14]:
# processing untokenized words

sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?'
]

tokenizer = Tokenizer(num_words = 100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)

test_sentences = [
    'i really love my dog'
    'my dog loves my friend'
]

test_sequences = tokenizer.texts_to_sequences(test_sentences)

print(test_sequences)
print(word_index)

# Tokenzier의 oov_token인자를 사용하면 미리 인덱싱하지 않은 단어 <OOV>로 인덱싱된다.

[[5, 1, 3, 2, 1, 4, 1, 2, 1]]
{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}


In [19]:
# setting padding

from tensorflow.keras.preprocessing.sequence import pad_sequences

padded = pad_sequences(sequences, padding='post')
print(padded)

# default = padding = 'pre'

[[ 5  3  2  4  0  0  0]
 [ 5  3  2  7  0  0  0]
 [ 6  3  2  4  0  0  0]
 [ 8  6  9  2  4 10 11]]


In [18]:
# maxlen parameter

padded = pad_sequences(sequences, padding='pre', maxlen = 6)
print(padded)

# maxlen parameter는 시퀀스의 최대 길이를 반환. 

[[ 5  3  2  4  0  0  0]
 [ 5  3  2  7  0  0  0]
 [ 6  3  2  4  0  0  0]
 [ 8  6  9  2  4 10 11]]


In [21]:
# truncating parameter

padded = pad_sequences(sequences, padding = 'pre',maxlen = 6, truncating='post')
print(padded)

# truncating parameter 는 최대 길이를 넘는 sequences를 잘라낼 위치를 지정한다.

[[ 0  0  5  3  2  4]
 [ 0  0  5  3  2  7]
 [ 0  0  6  3  2  4]
 [ 8  6  9  2  4 10]]
