## 참조 문서

- https://wikidocs.net/21698

### 토큰화

In [None]:
from pprint import pprint

import nltk
from nltk.tokenize import word_tokenize, WordPunctTokenizer

# Fetch the NLTK data packages this notebook depends on.
# NOTE(review): presumably 'punkt' backs word_tokenize / sent_tokenize and
# 'averaged_perceptron_tagger' backs pos_tag, both used further down — confirm
# against the NLTK data documentation for the installed NLTK version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [39]:
# Sample inputs shared by the tokenization cells below.
# Each text starts and ends with a newline, and the first sentence line of
# each ends with a space before its line break; the pieces are spelled out
# explicitly so that whitespace stays visible.
text_sample = (
    "\n"
    "Don't be fooled by the dark sounding name, \n"
    "Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop.\n"
)

text_sample2 = (
    "\n"
    "Starting a home-based restaurant may be an ideal. \n"
    "it doesn't have a food chain or restaurant of their own.\n"
)

# Run the same sample through two NLTK tokenizers, one printed list each,
# to compare how they split contractions and possessives such as
# "Don't" and "Jone's".
for tokenize in (word_tokenize, WordPunctTokenizer().tokenize):
    print(tokenize(text_sample))

['Do', "n't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr.', 'Jone', "'s", 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']
['Don', "'", 't', 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr', '.', 'Jone', "'", 's', 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


In [20]:
## Using Keras
from tensorflow.keras.preprocessing.text import text_to_word_sequence

# Tokenize both samples with the Keras helper. In the recorded output the
# words are lowercased, punctuation tokens are gone, apostrophes are kept
# inside words ("don't"), and "home-based" is split into two words.
for sample in (text_sample, text_sample2):
    print(text_to_word_sequence(sample))

["don't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', 'mr', "jone's", 'orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop']
['starting', 'a', 'home', 'based', 'restaurant', 'may', 'be', 'an', 'ideal', 'it', "doesn't", 'have', 'a', 'food', 'chain', 'or', 'restaurant', 'of', 'their', 'own']


In [40]:
# Standard tokenization: Penn Treebank tokenization
#     1. A hyphenated word is kept as a single token.
#     2. A word with an apostrophe clitic, such as doesn't, is split apart.
from nltk.tokenize import TreebankWordTokenizer

# Print word_tokenize and TreebankWordTokenizer results for the same text.
# In the recorded output they differ only at the mid-text "ideal.":
# word_tokenize separates the period, TreebankWordTokenizer leaves it attached.
for tokenize in (word_tokenize, TreebankWordTokenizer().tokenize):
    print(tokenize(text_sample2))

['Starting', 'a', 'home-based', 'restaurant', 'may', 'be', 'an', 'ideal', '.', 'it', 'does', "n't", 'have', 'a', 'food', 'chain', 'or', 'restaurant', 'of', 'their', 'own', '.']
['Starting', 'a', 'home-based', 'restaurant', 'may', 'be', 'an', 'ideal.', 'it', 'does', "n't", 'have', 'a', 'food', 'chain', 'or', 'restaurant', 'of', 'their', 'own', '.']


### 문장 토큰화

In [27]:
from nltk.tokenize import sent_tokenize

# Sample paragraphs for sentence tokenization, written one sentence per
# literal. The second paragraph contains the abbreviation "Ph.D." so that
# not every period marks a sentence boundary.
# NOTE(review): "mae" looks like a typo for "make"; it is left unchanged
# because the text is runtime data and the recorded output reflects it.
sample_sentence = (
    "His barber kept his word. "
    "But keeping such a huge secret to himself was driving him crazy. "
    "Finally, the barber went up a mountain and almost to the edge of a cliff. "
    "He dug a hole in the midst of some reeds. "
    "He looked about, to mae sure no one was near."
)
sample_sentence2 = (
    "I am actively looking for Ph.D. students. "
    "and you are a Ph.D student."
)

In [28]:
# Split each sample paragraph into sentences and print the resulting list,
# one list per paragraph.
for paragraph in (sample_sentence, sample_sentence2):
    print(sent_tokenize(paragraph))

['His barber kept his word.', 'But keeping such a huge secret to himself was driving him crazy.', 'Finally, the barber went up a mountain and almost to the edge of a cliff.', 'He dug a hole in the midst of some reeds.', 'He looked about, to mae sure no one was near.']
['I am actively looking for Ph.D. students.', 'and you are a Ph.D student.']


### 문장 토큰화 기타 참고 자료

- https://tech.grammarly.com/blog/posts/How-to-Split-Sentences.html

### 품사 태깅 (POS)

In [43]:
from nltk.tag import pos_tag

In [48]:
# Part-of-speech tagging: split the sample into word tokens, then tag them.
tokens = word_tokenize(text_sample)
# pos_tag receives the token list; the recorded output shows it yields a list
# of (token, tag) pairs such as ('Do', 'VBP').
tagged_tokens = pos_tag(tokens)

# pprint prints one pair per line in the recorded output.
pprint(tagged_tokens)

[('Do', 'VBP'),
 ("n't", 'RB'),
 ('be', 'VB'),
 ('fooled', 'VBN'),
 ('by', 'IN'),
 ('the', 'DT'),
 ('dark', 'NN'),
 ('sounding', 'VBG'),
 ('name', 'NN'),
 (',', ','),
 ('Mr.', 'NNP'),
 ('Jone', 'NNP'),
 ("'s", 'POS'),
 ('Orphanage', 'NN'),
 ('is', 'VBZ'),
 ('as', 'RB'),
 ('cheery', 'JJ'),
 ('as', 'IN'),
 ('cheery', 'NN'),
 ('goes', 'VBZ'),
 ('for', 'IN'),
 ('a', 'DT'),
 ('pastry', 'NN'),
 ('shop', 'NN'),
 ('.', '.')]
