In [23]:
import nltk
# nltk.download()
from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import TreebankWordTokenizer

from keras.preprocessing.text import text_to_word_sequence

## 단어 단위 토큰화

In [17]:
# NLTK word_tokenize: per the output below, the contraction "don't" is split
# into 'do' + "n't", and ',' and '!' are emitted as separate tokens.
print(word_tokenize("I don't think so, Lunchtime double so!"))

['I', 'do', "n't", 'think', 'so', ',', 'Lunchtime', 'double', 'so', '!']


In [18]:
# WordPunctTokenizer: per the output below, the apostrophe is split out as its
# own token ('don', "'", 't'); ',' and '!' are also separate tokens.
print(WordPunctTokenizer().tokenize("I don't think so, Lunchtime double so!"))

['I', 'don', "'", 't', 'think', 'so', ',', 'Lunchtime', 'double', 'so', '!']


In [19]:
# Keras text_to_word_sequence: per the output below, tokens are lowercased,
# ',' and '!' are dropped, and "don't" is kept as a single token.
print(text_to_word_sequence("I don't think so, Lunchtime double so!"))

['i', "don't", 'think', 'so', 'lunchtime', 'double', 'so']


In [22]:
# TreebankWordTokenizer: for this sentence the output matches the word_tokenize
# result ('do' + "n't", punctuation kept as separate tokens).
print(TreebankWordTokenizer().tokenize("I don't think so, Lunchtime double so!"))

['I', 'do', "n't", 'think', 'so', ',', 'Lunchtime', 'double', 'so', '!']


## 문장 단위 토큰화

In [27]:
from nltk.tokenize import sent_tokenize

In [28]:
# Korean sample text shared by the sentence-tokenization cell and the Korean
# tagging / morphological-analysis cells that follow.
text = "열심히 코딩한 당신, 이제 떠나라. 보라카이 발리 괌 하와이 몰디브 가고 싶다. 슈방"

In [29]:
# Split the sample into sentences; the output shows three items, with the
# trailing fragment that has no closing period kept as its own item.
print(sent_tokenize(text))

['열심히 코딩한 당신, 이제 떠나라.', '보라카이 발리 괌 하와이 몰디브 가고 싶다.', '슈방']


## 한국어 토큰화

In [30]:
# NOTE(review): this import lives mid-notebook; consider moving it to the
# import cell at the top so dependencies are visible in one place.
from nltk.tag import pos_tag
# Tokenize the Korean sample with NLTK's word tokenizer, then tag it.
x = word_tokenize(text)
# pos_tag is called without a language argument; NLTK's default tagger is
# documented as English (lang='eng') -- TODO confirm for the installed version.
# The tags in the output below (e.g. '보라카이' -> 'VB', '열심히' -> 'JJ') are
# therefore not meaningful for Korean, which motivates the KoNLPy cells below.
pos_tag(x)

[('열심히', 'JJ'),
 ('코딩한', 'NNP'),
 ('당신', 'NNP'),
 (',', ','),
 ('이제', 'NNP'),
 ('떠나라', 'NNP'),
 ('.', '.'),
 ('보라카이', 'VB'),
 ('발리', 'JJ'),
 ('괌', 'NNP'),
 ('하와이', 'NNP'),
 ('몰디브', 'NNP'),
 ('가고', 'NNP'),
 ('싶다', 'NNP'),
 ('.', '.'),
 ('슈방', 'NN')]

In [31]:
from konlpy.tag import Okt

In [35]:
# Create one KoNLPy Okt analyzer instance, reused by the cells that follow.
okt = Okt()

In [37]:
# Morpheme-level split of the Korean sample; per the output below, '코딩한'
# becomes '코딩' + '한' and punctuation is kept as separate items.
okt.morphs(text)

['열심히',
 '코딩',
 '한',
 '당신',
 ',',
 '이제',
 '떠나라',
 '.',
 '보라카이',
 '발리',
 '괌',
 '하와이',
 '몰디브',
 '가고',
 '싶다',
 '.',
 '슈방']

In [38]:
# Same segmentation as okt.morphs, returned as (token, tag) pairs; the tags
# seen in the output below are Adverb, Noun, Josa, Verb and Punctuation.
okt.pos(text)

[('열심히', 'Adverb'),
 ('코딩', 'Noun'),
 ('한', 'Josa'),
 ('당신', 'Noun'),
 (',', 'Punctuation'),
 ('이제', 'Noun'),
 ('떠나라', 'Verb'),
 ('.', 'Punctuation'),
 ('보라카이', 'Noun'),
 ('발리', 'Noun'),
 ('괌', 'Noun'),
 ('하와이', 'Noun'),
 ('몰디브', 'Noun'),
 ('가고', 'Verb'),
 ('싶다', 'Verb'),
 ('.', 'Punctuation'),
 ('슈방', 'Noun')]

In [39]:
# Second KoNLPy analyzer (Kkma), for comparison with Okt on the same text.
from konlpy.tag import Kkma
kkma = Kkma()
# Per the output below, Kkma segments more finely than Okt did:
# '코딩한' -> '코딩' + '하' + 'ㄴ', '떠나라' -> '떠나' + '라',
# and the place name '보라카이' is split into '보라' + '카이'.
kkma.morphs(text)

['열심히',
 '코딩',
 '하',
 'ㄴ',
 '당신',
 ',',
 '이제',
 '떠나',
 '라',
 '.',
 '보라',
 '카이',
 '발리',
 '괌',
 '하와이',
 '몰디브',
 '가',
 '고',
 '싶',
 '다',
 '.',
 '슈',
 '방']

In [44]:
# Noun extraction with Kkma; note the output below contains overlapping
# candidates for the same span (e.g. '보라', '보라카이', '카이').
kkma.nouns(text)

['코딩', '당신', '보라', '보라카이', '카이', '발리', '괌', '하와이', '몰디브', '슈', '슈방', '방']

## 정제, 정규화

In [40]:
import re

In [41]:
# English sample sentence for the cleaning / normalization section.
# NOTE(review): this rebinds `text`, replacing the Korean sample assigned
# earlier; re-running any of the Korean cells after this one would operate on
# the English sentence instead. Run the notebook top to bottom.
text = "I was wondering anyone out there could enlighten me on this car"

'I was wondering anyone out there could enlighten me on this car'

In [42]:
from nltk.stem import WordNetLemmatizer

In [43]:
# Instantiate NLTK's WordNetLemmatizer (imported from nltk.stem in the
# preceding cell) and bind it to `n`.
n = WordNetLemmatizer()