## tensorflow.keras.preprocessing.text 모듈의 Tokenizer 클래스 사용 
### => 텍스트를 단어기반으로 토큰화

## fit_on_sequences

In [8]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

sentences = [
  'I love my dog',
  'I love my cat'
]

tokenizer = Tokenizer(num_words = 100)
tokenizer.fit_on_sequences(sentences)
index_docs = tokenizer.index_docs
print(index_docs)


defaultdict(<class 'int'>, {'y': 2, 'g': 1, 'o': 2, ' ': 2, 'd': 1, 'l': 2, 'e': 2, 'm': 2, 'v': 2, 'I': 2, 'c': 1, 'a': 1, 't': 1})


In [7]:
seq=set('I love my dog')
print(seq)

{'y', 'g', 'o', ' ', 'd', 'l', 'e', 'm', 'v', 'I'}


## fit_on_texts

In [25]:
sentences = [
  'I love my dog',
  'I love my cat',
  'You love my dog!'
]

tokenizer = Tokenizer(num_words = 100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)

{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}


## text_to_sequences

In [29]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer



# define 4 documents
docs =['Machine Learning Knowledge',
       'Machine Learning and Deep Learning',
       'Deep Learning',
       'Artificial Intelligence']

# create the tokenizer
t = Tokenizer()
t.fit_on_texts(test_text)

sequences = t.texts_to_sequences(test_text)

print("The sequences generated from text are : ",sequences)

The sequences generated from text are :  [[2, 1, 3], [2, 1], [4, 1], [5, 6]]


## texts_to_matrix

In [31]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer


# define 5 documents
docs = ['Marvellous Machine Learning Marvellous Machine Learning',
        'Amazing Artificial Intelligence',
        'Dazzling Deep Learning',
        'Champion Computer Vision',
        'Notorious Natural Language Processing Notorious Natural Language Processing']
# create the tokenizer
t = Tokenizer()

t.fit_on_texts(docs)
print(t.word_index)

encoded_docs = t.texts_to_matrix(docs, mode='binary')
print(encoded_docs)

{'learning': 1, 'marvellous': 2, 'machine': 3, 'notorious': 4, 'natural': 5, 'language': 6, 'processing': 7, 'amazing': 8, 'artificial': 9, 'intelligence': 10, 'dazzling': 11, 'deep': 12, 'champion': 13, 'computer': 14, 'vision': 15}
[[0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1.]
 [0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]


## sequences_to_matrix

In [34]:
import tensorflow as t
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer



# define 4 documents
docs =['Machine Learning Knowledge',
       'Machine Learning and Deep Learning',
       'Deep Learning',
       'Artificial Intelligence']

# create the tokenizer
t = Tokenizer()

t.fit_on_texts(docs)  # 텍스트들에 등장한 단어들에 정수를 분여

sequences = t.texts_to_sequences(docs) # 문서에 등장한 단어들을 정수들로 표현
print(sequences)

encoded_docs = t.sequences_to_matrix(sequences, mode='binary') 
# 8개의 단어가 등장했다. 등장한 단어는1 아닌 단어는 0
# mode = one of "binary", "count", "tfidf", "freq"
print(encoded_docs)

[[2, 1, 4], [2, 1, 5, 3, 1], [3, 1], [6, 7]]
[[0. 1. 1. 0. 1. 0. 0. 0.]
 [0. 1. 1. 1. 0. 1. 0. 0.]
 [0. 1. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 1.]]


## sequences_to_texts

In [38]:
import tensorflow as t
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer



# define 4 documents
docs =['Machine Learning Knowledge',
       'Machine Learning and Deep Learning',
       'Deep Learning',
       'Artificial Intelligence']

# create the tokenizer
t = Tokenizer()

t.fit_on_texts(docs)  # 텍스트들에 등장한 단어들에 정수를 분여

sequences = t.texts_to_sequences(docs) # 문서에 등장한 단어들을 정수들로 표현
print(sequences)



seq_to_txt = t.sequences_to_texts(sequences) # 정수로 된 sequence를 text로 표현
print(seq_to_txt)

[[2, 1, 4], [2, 1, 5, 3, 1], [3, 1], [6, 7]]
['machine learning knowledge', 'machine learning and deep learning', 'deep learning', 'artificial intelligence']
