# 정수 인코딩(Integer Encoding)

#### 1) dictionary 사용하기

In [3]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
# The corpus reader is named `stopwords` (plural); the cleaning cell calls
# stopwords.words('english'), so the singular name would fail at import time.
from nltk.corpus import stopwords

In [14]:
# Sample corpus used throughout this walkthrough. Adjacent string literals
# inside the parentheses are joined at compile time, so this is one single
# string; every fragment except the last keeps its trailing space so the
# sentences stay separated by exactly one blank.
text = (
    "A barber is a person. "
    "a barber is good person. "
    "a barber is huge person. "
    "he Knew A Secret! The Secret He Kept is huge secret. "
    "Huge secret. His barber kept his word. a barber kept his word. "
    "His barber kept his secret. "
    "But keeping and keeping such a huge secret to himself was driving the barber crazy. "
    "the barber went up a huge mountain."
)

In [15]:
# Split the raw corpus string into sentences. Note that `text` is rebound
# here: before this line it is the single input string, afterwards it is the
# value returned by sent_tokenize (printed below as a list of sentence strings).
# NOTE(review): because the name is reused, re-running this cell alone would
# feed the already-split result back into sent_tokenize; re-run the cell that
# defines `text` first.
text=sent_tokenize(text)
print(text)

['A barber is a person.', 'a barber is good person.', 'a barber is huge person.', 'he Knew A Secret!', 'The Secret He Kept is huge secret.', 'Huge secret.', 'His barber kept his word.', 'a barber kept his word.', 'His barber kept his secret.', 'But keeping and keeping such a huge secret to himself was driving the barber crazy.', 'the barber went up a huge mountain.']


In [19]:
# Cleaning and word tokenization.
# Builds two results from the sentence list in `text`:
#   sentences - one list of kept tokens per input sentence
#   vocab     - dict mapping each kept token to its number of occurrences
vocab = {}
sentences = []
stop_words = set(stopwords.words('english'))

for raw_sentence in text:
    kept_tokens = []

    for token in word_tokenize(raw_sentence):
        # Lowercase everything so that differently-cased forms of a word
        # share one vocabulary entry.
        token = token.lower()

        # Discard stop words, and additionally discard any token that is
        # two characters long or shorter.
        if token in stop_words or len(token) <= 2:
            continue

        kept_tokens.append(token)
        # Count the occurrence; an unseen token starts from zero.
        vocab[token] = vocab.get(token, 0) + 1

    sentences.append(kept_tokens)

print(sentences)

[['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'], ['barber', 'went', 'huge', 'mountain']]


In [20]:
# Show the full word -> occurrence-count dictionary.
print(vocab)

{'barber': 8, 'person': 3, 'good': 1, 'huge': 5, 'knew': 1, 'secret': 6, 'kept': 4, 'word': 2, 'keeping': 2, 'driving': 1, 'crazy': 1, 'went': 1, 'mountain': 1}


In [21]:
# Look up the occurrence count stored for a single word.
print(vocab["barber"])

8


In [22]:
# Order the (word, count) pairs from the highest count to the lowest.
vocab_sorted = list(vocab.items())
# The sort is stable even with reverse=True, so words with equal counts stay
# in the order in which they were inserted into `vocab`.
vocab_sorted.sort(key=lambda pair: pair[1], reverse=True)
print(vocab_sorted)

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3), ('word', 2), ('keeping', 2), ('good', 1), ('knew', 1), ('driving', 1), ('crazy', 1), ('went', 1), ('mountain', 1)]


In [24]:
# Give each word an integer index starting at 1, following the order of
# `vocab_sorted`. Low-frequency words are excluded: only words whose
# count is greater than 1 receive an index.
frequent_words = [word for word, frequency in vocab_sorted if frequency > 1]
word_to_index = {
    word: rank for rank, word in enumerate(frequent_words, start=1)
}
print(word_to_index)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'word': 6, 'keeping': 7}


In [29]:
vocab_size = 5
# Remove every word whose index is greater than vocab_size.
# The words to drop are collected into a list first, because a dict must not
# change size while it is being iterated.
overflow_words = [
    word for word, index in word_to_index.items() if index > vocab_size
]
for word in overflow_words:
    # Delete the index entry for this word (in place, same dict object).
    word_to_index.pop(word)
print(word_to_index)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}
