In [4]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [5]:
doc= nlp('Ailcia and me went to school by bus')

In [6]:
for token in doc:
    print(token.text, token.pos, spacy.explain(token.pos_))

Ailcia 96 proper noun
and 89 coordinating conjunction
me 95 pronoun
went 100 verb
to 85 adposition
school 92 noun
by 85 adposition
bus 92 noun


In [7]:
# POS : part of speech : 각 단어가 위치나 문장 구조 안에 형태에 따라 어떤 형식을 가지는지 알아보았는데
# NLU(Natural Language Understanding) 측면에서 정확도를 높이기 위해 필요함.
# 구문 정보 pos에 영향을 가져왔다고 할 수 있다.

In [9]:
sentence1 = 'I flew to Rome'
sentence2 = "I'm flying to Rome"
sentence3 = 'I will fly to Rome'

doc1 = nlp(sentence1)
doc2 = nlp(sentence2)
doc3 = nlp(sentence3)

for doc in [doc1, doc2, doc3]:
    print( [(w.text, w.lemma_) for w in doc if w.tag_ == 'VBG' or w.tag_ == 'VB'])

[]
[('flying', 'fly')]
[('fly', 'fly')]


In [10]:
# WSD : 단어 의미 중의성 해소
# 단어들이 가진 중의성으로 인해 사람들도 다른 의미로 해석을 한다
# WSD problem : 자연어 처리에서 open problem, 복잡하고 통계적인 모델을 계속 연구하고 발표하고 있音

# Bass - fish (Noun)
# Bass = lowest male voice (Noun)
# Bass - guitar(Noun)

for token in doc3:
    print(token.text, token.tag_, spacy.explain(token.tag_))

I PRP pronoun, personal
will MD verb, modal auxiliary
fly VB verb, base form
to IN conjunction, subordinating or preposition
Rome NNP noun, proper singular


In [11]:
doc = nlp('He earned $5. 5 million in 2020 and paid %3$ max.')

for token in doc:
    print(token.text, token.tag_, spacy.explain(token.tag_))

He PRP pronoun, personal
earned VBD verb, past tense
$ $ symbol, currency
5 CD cardinal number
. . punctuation mark, sentence closer
5 CD cardinal number
million CD cardinal number
in IN conjunction, subordinating or preposition
2020 CD cardinal number
and CC conjunction, coordinating
paid VBD verb, past tense
% NN noun, singular or mass
3 CD cardinal number
$ $ symbol, currency
max NN noun, singular or mass
. . punctuation mark, sentence closer


In [12]:
doc = nlp('blue flower')
for token in doc:
    print(token.text, token.dep_)

blue amod
flower ROOT


In [13]:
# NER : name entity
doc = nlp('The president Donald Trump visited France')
doc.ents

(Donald Trump, France)

In [14]:
spacy.explain('france')

In [16]:
doc = nlp('He worked for NASA')
doc[3].ent_type_, spacy.explain(doc[3].ent_type_)

('ORG', 'Companies, agencies, institutions, etc.')

In [17]:
# 많이 사용되는 entity type
# PER : Named person or family
# LOC
# OBG
# MISC : 기타등등

In [18]:
doc = nlp('Albert Einstain was born in Ulm on 19=897. he studied electronical engineering at ETH 2urich.')
doc.ents

(Albert Einstain, Ulm, 19=897, ETH 2urich)

In [19]:
for token in doc:
    print(token.text, token.ent_type_, spacy.explain(token.ent_type_))

Albert PERSON People, including fictional
Einstain PERSON People, including fictional
was  None
born  None
in  None
Ulm PERSON People, including fictional
on  None
19=897 CARDINAL Numerals that do not fall under another type
.  None
he  None
studied  None
electronical  None
engineering  None
at  None
ETH ORG Companies, agencies, institutions, etc.
2urich ORG Companies, agencies, institutions, etc.
.  None


In [20]:
doc = nlp('Jean-Michel Basquiat was an American artist of Haitian and Puerto Rican descent who gained fame with his graffiti and street art work')
doc.ents

(Jean-Michel Basquiat, American, Haitian, Puerto Rican)

In [21]:
type(doc.ents)

tuple

In [23]:
for ent in doc.ents:
    print(ent, ent.label_, spacy.explain(ent.label))

Jean-Michel Basquiat PERSON None
American NORP None
Haitian GPE None
Puerto Rican NORP None
