## 코드 2-1. 텍스트 토큰화

In [1]:
import spacy

# Load the "en_core_web_sm" pipeline; later cells rely on `spacy` being imported here.
nlp = spacy.load("en_core_web_sm")
text = "Mary, don't slap the gree witch."

# Lowercase the sentence, run it through the pipeline, and show each token as a
# plain string (note in the output that "don't" is split into "do" + "n't").
spacy_tokens = list(map(str, nlp(text.lower())))
print(spacy_tokens)

['mary', ',', 'do', "n't", 'slap', 'the', 'gree', 'witch', '.']


In [2]:
from nltk.tokenize import TweetTokenizer

tweet = u"Snow White and the Seven Degrees #MakeAMovieCold@midnight:-)"
tokenizer = TweetTokenizer()

# Lowercase first, then tokenize; the output shows the hashtag, the @-mention
# and the emoticon each surviving as a single token.
tweet_tokens = tokenizer.tokenize(tweet.lower())
print(tweet_tokens)

['snow', 'white', 'and', 'the', 'seven', 'degrees', '#makeamoviecold', '@midnight', ':-)']


## 코드 2-2. 텍스트에서 n-그램 만들기
### n-그램 (유니그램, 바이그램, 트라이그램...)

In [3]:
def n_grams(text, n):
    """Return every contiguous run of ``n`` items from ``text``.

    Args:
        text: A sliceable sequence with a length, e.g. a list of tokens or a
            string. Each n-gram is a slice of it, so it has the same type.
        n: Size of each n-gram; must be at least 1.

    Returns:
        A list of the ``len(text) - n + 1`` slices in left-to-right order,
        or an empty list when ``text`` has fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # Without this guard n == 0 yields len(text) + 1 empty slices and a
        # negative n yields arbitrary truncated slices; neither is an n-gram.
        raise ValueError("n must be a positive integer, got {!r}".format(n))
    return [text[i:i + n] for i in range(len(text) - n + 1)]

In [4]:
# Pre-tokenized sample sentence used as input for the trigram demo.
cleaned = ['mary', ',', "n't", 'slap', 'gree', 'witch', '.']

# Build all 3-grams (trigrams) over the token list and display them.
trigrams = n_grams(cleaned, 3)
print(trigrams)

[['mary', ',', "n't"], [',', "n't", 'slap'], ["n't", 'slap', 'gree'], ['slap', 'gree', 'witch'], ['gree', 'witch', '.']]


## 코드 2-3. 표제어 추출(lemmatization): 단어를 표제어로 바꿉니다.

In [5]:
# Lemmatization demo: print each token next to its lemma.
# Reuses the `nlp` pipeline already loaded in the tokenization cell at the top
# of the notebook instead of reading the same model from disk again.
doc = nlp("he was running late")
for token in doc:
    print("{} --> {}".format(token, token.lemma_))

he --> he
was --> be
running --> run
late --> late


## 어간 추출(stemming)

In [10]:
# Stemming with the Porter stemmer

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Download the "punkt" package into nltk_data before tokenizing
# (presumably the resource word_tokenize relies on -- confirm for your NLTK version).
nltk.download('punkt')

sentence = "This was not the map we found in Billy Bones's chest, but an accurate copy, complete in all things--names and heights and soundings--with the single exception of the red crosses and the written notes."
tokenized_sentence = word_tokenize(sentence)

# Stem every token, keeping the results aligned with the token list.
stemmer = PorterStemmer()
stemmed_sentence = list(map(stemmer.stem, tokenized_sentence))

print('어간 추출 전 :', tokenized_sentence)
print('어간 추출 후 :', stemmed_sentence)

어간 추출 전 : ['This', 'was', 'not', 'the', 'map', 'we', 'found', 'in', 'Billy', 'Bones', "'s", 'chest', ',', 'but', 'an', 'accurate', 'copy', ',', 'complete', 'in', 'all', 'things', '--', 'names', 'and', 'heights', 'and', 'soundings', '--', 'with', 'the', 'single', 'exception', 'of', 'the', 'red', 'crosses', 'and', 'the', 'written', 'notes', '.']
어간 추출 후 : ['thi', 'wa', 'not', 'the', 'map', 'we', 'found', 'in', 'billi', 'bone', "'s", 'chest', ',', 'but', 'an', 'accur', 'copi', ',', 'complet', 'in', 'all', 'thing', '--', 'name', 'and', 'height', 'and', 'sound', '--', 'with', 'the', 'singl', 'except', 'of', 'the', 'red', 'cross', 'and', 'the', 'written', 'note', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## 코드 2-4. 품사 태깅 (part-of-speech (POS) tagging)

In [6]:
# Part-of-speech tagging demo: print each token next to its coarse POS tag.
# Reuses the `nlp` pipeline already loaded in the tokenization cell at the top
# of the notebook instead of reading the same model from disk again.
doc = nlp("Mary slapped the green witch.")
for token in doc:
    print("{} --> {}".format(token, token.pos_))

Mary --> PROPN
slapped --> VERB
the --> DET
green --> ADJ
witch --> NOUN
. --> PUNCT


## 코드 2-5. 명사구(NP) 부분 구문 분석

In [7]:
# Noun-phrase chunking demo: print each noun chunk next to its label.
# Reuses the `nlp` pipeline already loaded in the tokenization cell at the top
# of the notebook instead of reading the same model from disk again.
doc = nlp("Mary slapped the green witch.")
for chunk in doc.noun_chunks:
    print("{} --> {}".format(chunk, chunk.label_))

Mary --> NP
the green witch --> NP
