# 품사 태깅 (Part-of-Speech Tagging)

<h3>NLTK를 이용한 품사 태깅</h3>

In [None]:
import nltk

# Fetch the NLTK data packages used by the tokenizer, the POS tagger and the
# tag-set help in the cells below.
# NLTK 3.9+ looks up 'punkt_tab', 'averaged_perceptron_tagger_eng' and
# 'tagsets_json' instead of the older package names, so both generations are
# requested to keep the notebook runnable on old and new NLTK releases.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'tagsets',
    'tagsets_json',
):
    nltk.download(resource)

In [13]:
# Part-of-speech tagging: the result is a list of (word, tag) tuples.

from nltk.tokenize import word_tokenize

sentence = "Let's study text mining. How about a cup of coffee while studying?"
tokens = word_tokenize(sentence)
tagged_tokens = nltk.pos_tag(tokens)
print(tagged_tokens)

[('Let', 'VB'), ("'s", 'POS'), ('study', 'VB'), ('text', 'NN'), ('mining', 'NN'), ('.', '.'), ('How', 'WRB'), ('about', 'IN'), ('a', 'DT'), ('cup', 'NN'), ('of', 'IN'), ('coffee', 'NN'), ('while', 'IN'), ('studying', 'VBG'), ('?', '.')]


In [14]:
# Look up what a tag abbreviation stands for: prints the tag's definition
# followed by example words.

tag_abbreviation = 'VB'
nltk.help.upenn_tagset(tag_abbreviation)

VB: verb, base form
    ask assemble assess assign assume atone attention avoid bake balkanize
    bank begin behold believe bend benefit bevel beware bless boil bomb
    boost brace break bring broil brush build ...


In [15]:
# Extract only the words whose tag is in a chosen list of tags.
# Matching is exact, so a token tagged 'VBG' (e.g. 'studying') is not
# selected by 'VB'.

my_tag_set = ['NN', 'VB', 'JJ']
tagged_pairs = nltk.pos_tag(tokens)
my_words = [pair[0] for pair in tagged_pairs if pair[1] in my_tag_set]
print(my_words)

['Let', 'study', 'text', 'mining', 'cup', 'coffee']


In [16]:
# Build tokens that carry their part of speech, formatted as "word/TAG".

words_with_tag = [f"{word}/{tag}" for word, tag in nltk.pos_tag(tokens)]
print(words_with_tag)

['Let/VB', "'s/POS", 'study/VB', 'text/NN', 'mining/NN', './.', 'How/WRB', 'about/IN', 'a/DT', 'cup/NN', 'of/IN', 'coffee/NN', 'while/IN', 'studying/VBG', '?/.']
