# 텍스트 토큰화

In [3]:
import spacy
# Load spaCy's small English pipeline; calling it on a string tokenizes the text.
nlp = spacy.load("en_core_web_sm")
text = "Mary, don't slap the gree witch."
# Lowercase the sentence, run it through the pipeline, and print every
# token converted to a plain string.
print(list(map(str, nlp(text.lower()))))

['mary', ',', 'do', "n't", 'slap', 'the', 'gree', 'witch', '.']


In [4]:
from nltk.tokenize import TweetTokenizer

# Tokenizer from NLTK, created with its default settings.
tokenizer = TweetTokenizer()
# Sample tweet containing a hashtag, a handle and an emoticon.
tweet = u"Snow White and the Seven Degrees #MakeAMovieCold@midnight:-)"
# Tokenize the lowercased tweet and print the resulting token list.
print(tokenizer.tokenize(tweet.lower()))

['snow', 'white', 'and', 'the', 'seven', 'degrees', '#makeamoviecold', '@midnight', ':-)']


# n-그램 (유니그램, 바이그램, 트라이그램...)

In [6]:
def n_grams(text, n):
    """Return every contiguous run of ``n`` items taken from ``text``.

    Args:
        text: A sliceable sequence, e.g. a list of tokens or a string.
        n: Window size; must be a positive integer.

    Returns:
        A list of length-``n`` slices of ``text``, ordered by start index.
        The list is empty when ``text`` has fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    if n < 1:
        # Without this guard n == 0 yields len(text) + 1 empty slices and a
        # negative n yields misaligned slices instead of an error.
        raise ValueError("n must be a positive integer, got {!r}".format(n))
    return [text[i:i + n] for i in range(len(text) - n + 1)]

# Example token list; print every trigram (n = 3) that can be built from it.
cleaned = [
    'mary', ',', "n't", 'slap', 'gree', 'witch', '.',
]
print(n_grams(text=cleaned, n=3))

[['mary', ',', "n't"], [',', "n't", 'slap'], ["n't", 'slap', 'gree'], ['slap', 'gree', 'witch'], ['gree', 'witch', '.']]


# 표제어와 어간
## 표제어 추출(lemmatization)

In [7]:
# Lemmatization demo: print each token next to the lemma spaCy assigns to it.
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"he was running late")
for word in doc:
    # One "<token> --> <lemma>" line per token.
    print("{} --> {}".format(word, word.lemma_))

he --> he
was --> be
running --> run
late --> late


# 단어 분류하기: 품사 태깅 (part-of-speech (POS) tagging)

In [8]:
# POS tagging demo: print each token next to its coarse part-of-speech tag.
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Mary slapped the green witch.")
for word in doc:
    # One "<token> --> <POS tag>" line per token.
    print("{} --> {}".format(word, word.pos_))

Mary --> PROPN
slapped --> VERB
the --> DET
green --> ADJ
witch --> NOUN
. --> PUNCT


# 청크 나누기(chunking)와 부분 구문 분석(shallow parsing)
## chunk: 하나의 의미가 있는 말 덩어리
# 개체명(named entity) 인식

In [9]:
# Chunking demo: walk the noun chunks spaCy finds in the sentence and print
# each chunk next to its label.
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Mary slapped the green witch.")
for phrase in doc.noun_chunks:
    # One "<chunk text> --> <label>" line per noun chunk.
    print("{} --> {}".format(phrase, phrase.label_))

Mary --> NP
the green witch --> NP
