## 6. 정수 인코딩

### 1) Counter 사용하기

In [1]:
from collections import Counter

In [15]:
# Toy corpus: each inner list is one sentence, already split into word tokens.
sentences = [
    ['barber', 'person'],
    ['barber', 'good', 'person'],
    ['barber', 'huge', 'person'],
    ['knew', 'secret'],
    ['secret', 'kept', 'huge', 'secret'],
    ['huge', 'secret'],
    ['barber', 'kept', 'word'],
    ['barber', 'kept', 'word'],
    ['barber', 'kept', 'secret'],
    ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'],
    ['barber', 'went', 'huge', 'mountain'],
]

In [17]:
# Flatten the list of sentences into one flat list of words, keeping order.
# A single-pass comprehension is used instead of sum(sentences, []), which
# builds a new list on every addition and is therefore quadratic in the
# total number of words.
words = [word for sentence in sentences for word in sentence]
print(words)

['barber', 'person', 'barber', 'good', 'person', 'barber', 'huge', 'person', 'knew', 'secret', 'secret', 'kept', 'huge', 'secret', 'huge', 'secret', 'barber', 'kept', 'word', 'barber', 'kept', 'word', 'barber', 'kept', 'secret', 'keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy', 'barber', 'went', 'huge', 'mountain']


In [18]:
# Count how many times each word occurs in the flattened word list.
vocab = Counter(words)
# Bare expression: the notebook displays the resulting Counter.
vocab

Counter({'barber': 8,
         'crazy': 1,
         'driving': 1,
         'good': 1,
         'huge': 5,
         'keeping': 2,
         'kept': 4,
         'knew': 1,
         'mountain': 1,
         'person': 3,
         'secret': 6,
         'went': 1,
         'word': 2})

In [20]:
# Show the 5 most frequent words as (word, count) pairs, most frequent first.
vocab.most_common(5)

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3)]

In [21]:
# Keep the 5 most frequent (word, count) pairs for building the index below.
top5 = vocab.most_common(5)
top5

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3)]

In [22]:
# Map each of the top-5 words to its frequency rank, starting at 1.
# enumerate(start=1) replaces the hand-maintained counter; the count in each
# (word, count) pair is not needed here, hence the underscore-prefixed name.
word_to_index = {}
for rank, (word, _count) in enumerate(top5, start=1):
    word_to_index[word] = rank
word_to_index

{'barber': 1, 'huge': 3, 'kept': 4, 'person': 5, 'secret': 2}

### 2) NLTK의 FreqDist 사용하기

In [7]:
from nltk import FreqDist
import numpy as np

In [23]:
# Build an NLTK frequency distribution over the flattened word list.
vocab = FreqDist(words)
# Indexing by a word yields its occurrence count.
vocab['barber']

8

In [24]:
# FreqDist also offers most_common(); take the 5 most frequent (word, count) pairs.
top5 = vocab.most_common(5)
top5

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3)]

In [25]:
# Assign indices 1..5 to the top-5 words in rank order.
word_to_index = {}

for position, entry in enumerate(top5, start=1):
    token = entry[0]  # entry is a (word, count) pair; only the word is used
    word_to_index[token] = position
word_to_index

{'barber': 1, 'huge': 3, 'kept': 4, 'person': 5, 'secret': 2}

In [32]:
# Dict-comprehension form: unpack each (word, count) pair and number the
# words from 1 in rank order.
word_to_index = {
    word: rank
    for rank, (word, _count) in enumerate(top5, start=1)
}
word_to_index

{'barber': 1, 'huge': 3, 'kept': 4, 'person': 5, 'secret': 2}

### 3) Keras 텍스트 전처리

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Create a Keras Tokenizer with no arguments (no vocabulary limit or OOV token given).
tokenizer = Tokenizer()

In [27]:
# Build the vocabulary from the tokenized sentences.
tokenizer.fit_on_texts(sentences)
# Word -> integer index mapping; in the output shown, indices start at 1 and
# more frequent words receive smaller indices.
tokenizer.word_index

{'barber': 1,
 'crazy': 11,
 'driving': 10,
 'good': 8,
 'huge': 3,
 'keeping': 7,
 'kept': 4,
 'knew': 9,
 'mountain': 13,
 'person': 5,
 'secret': 2,
 'went': 12,
 'word': 6}

In [28]:
# Occurrence count of every word collected while fitting.
tokenizer.word_counts

OrderedDict([('barber', 8),
             ('person', 3),
             ('good', 1),
             ('huge', 5),
             ('knew', 1),
             ('secret', 6),
             ('kept', 4),
             ('word', 2),
             ('keeping', 2),
             ('driving', 1),
             ('crazy', 1),
             ('went', 1),
             ('mountain', 1)])

In [29]:
# Replace every word in each sentence with its integer index from word_index.
tokenizer.texts_to_sequences(sentences)

[[1, 5],
 [1, 8, 5],
 [1, 3, 5],
 [9, 2],
 [2, 4, 3, 2],
 [3, 2],
 [1, 4, 6],
 [1, 4, 6],
 [1, 4, 2],
 [7, 7, 3, 2, 10, 1, 11],
 [1, 12, 3, 13]]

- 단어 빈도수 Top5만 제대로 표시하고, 나머지는 OOV값(1)으로 표시

In [33]:
# Use only the 5 most frequent words. The vocabulary size passed to the
# Tokenizer is vocab_size + 2 to account for index 0 and the OOV token.
vocab_size = 5
tokenizer = Tokenizer(num_words=vocab_size + 2, oov_token='OOV')
tokenizer.fit_on_texts(sentences)

In [34]:
# Encode the sentences again; in the output shown, words outside the top 5
# are mapped to the OOV index 1.
tokenizer.texts_to_sequences(sentences)

[[2, 6],
 [2, 1, 6],
 [2, 4, 6],
 [1, 3],
 [3, 5, 4, 3],
 [4, 3],
 [2, 5, 1],
 [2, 5, 1],
 [2, 5, 3],
 [1, 1, 4, 3, 1, 2, 1],
 [2, 1, 4, 1]]

In [36]:
# word_index still lists every word seen during fitting (plus 'OOV' at 1);
# the num_words limit only shows up in the texts_to_sequences output.
tokenizer.word_index

{'OOV': 1,
 'barber': 2,
 'crazy': 12,
 'driving': 11,
 'good': 9,
 'huge': 4,
 'keeping': 8,
 'kept': 5,
 'knew': 10,
 'mountain': 14,
 'person': 6,
 'secret': 3,
 'went': 13,
 'word': 7}