In [1]:
import spacy
# Ask spaCy to run on the GPU if it can; the cell shows the boolean the call returns.
spacy.prefer_gpu()

True

In [2]:
# Load the large English pipeline and analyse a single example sentence.
nlp = spacy.load("en_core_web_lg")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# One output row per token: text, lemma, coarse POS, fine-grained tag,
# dependency label, word shape, and the is_alpha / is_stop flags.
for tok in doc:
    row = (
        tok.text, tok.lemma_, tok.pos_, tok.tag_,
        tok.dep_, tok.shape_, tok.is_alpha, tok.is_stop,
    )
    print(*row)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


#### 关系标签

#### 标签表示从属的语法功能，名词性的标签是：

* root：中心词，通常是动词
* nsubj：名词性主语（nominal subject）
* dobj：直接宾语（direct object）
* prep：介词
* pobj：介词宾语
* cc：连词

#### 其他常用的标签：
* compound：复合词
* advmod：状语
* det：限定词
* amod：形容词修饰语

In [3]:
from spacy import displacy

# Render every sentence of the current `doc` with displaCy (no style given, so its default is used).
displacy.render(doc.sents)

## NOUN chunks

#### 遍历文档中的基本名词短语（base noun phrases）。如果文档已经过句法分析，则依次产生基本名词短语的Span对象。基本名词短语（又称“NP块”）是内部不再嵌套其他NP的名词短语——因此不包含NP级别的并列结构、介词短语和关系从句。

In [4]:
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")

# For each base noun phrase print: the phrase, its root token, the root's
# dependency label, and the text of the head the root attaches to.
for np_span in doc.noun_chunks:
    np_root = np_span.root
    print(np_span.text, np_root.text, np_root.dep_, np_root.head.text)

Autonomous cars cars nsubj shift
insurance liability liability dobj shift
manufacturers manufacturers pobj toward


## Navigating the parse tree



#### spaCy使用术语“head”和“child”来描述依存树中由一条弧连接的两个词。术语dep指弧的标签，它描述了将child连接到head的句法关系类型。与其他属性一样，.dep的值是哈希值（整数ID），可以使用.dep_获取对应的字符串值。

In [5]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")

# Per token: text, dependency label (string form and integer id), the head's
# text and POS, and the token's immediate children.
for tok in doc:
    head = tok.head
    kids = list(tok.children)
    print(tok.text, tok.dep_, tok.dep, head.text, head.pos_, kids)

Autonomous amod 402 cars NOUN []
cars nsubj 429 shift VERB [Autonomous]
shift ROOT 8206900633647566924 shift VERB [cars, liability]
insurance compound 7037928807040764755 liability NOUN []
liability dobj 416 shift VERB [insurance, toward]
toward prep 443 liability NOUN [manufacturers]
manufacturers pobj 439 toward ADP []


In [6]:
# Render the sentences of the `doc` parsed in the previous cell (relies on `displacy` imported earlier).
displacy.render(doc.sents)

In [7]:
import spacy
from spacy.symbols import nsubj, VERB

nlp = spacy.load("en_core_web_lg")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")

# Gather the heads of nominal subjects, keeping only heads whose POS is VERB.
# The integer attributes (`dep`, `pos`) are compared against the
# spacy.symbols constants rather than against strings.
verbs = {
    tok.head
    for tok in doc
    if tok.dep == nsubj and tok.head.pos == VERB
}

print(verbs)

{shift}


## Iterating around the local tree

In [8]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("bright red apples on the tree")

for tok in doc:
    print(tok.text)

# Inspect the syntactic children on either side of the third token.
apples = doc[2]
left_texts = [child.text for child in apples.lefts]
right_texts = [child.text for child in apples.rights]
print(left_texts)
print(right_texts)
print(apples.n_lefts)
print(apples.n_rights)

bright
red
apples
on
the
tree
['bright', 'red']
['on']
2
1


In [9]:
# !python -m spacy download de_core_news_sm

In [10]:
# !python -m spacy download zh_core_web_lg

In [11]:
import spacy

nlp = spacy.load("de_core_news_sm")
doc = nlp("schöne rote Äpfel auf dem Baum")

# Same left/right child lookup as the English example, on the third token.
noun = doc[2]
left_texts = [child.text for child in noun.lefts]
right_texts = [child.text for child in noun.rights]
print(left_texts)  # ['schöne', 'rote']
print(right_texts)  # ['auf']

['schöne', 'rote']
['auf']


In [14]:
import spacy


nlp = spacy.load("en_core_web_sm")
doc = nlp("Credit and mortgage account holders must submit their requests")

# Tokens that are their own head; the first one is taken as the root.
self_headed = [tok for tok in doc if tok.head == tok]
root = self_headed[0]
# First child to the left of the root.
subject = list(root.lefts)[0]
for node in subject.subtree:
    # Every member of the subtree is the subject itself or one of its descendants.
    assert subject is node or subject.is_ancestor(node)
    ancestor_texts = [anc.text for anc in node.ancestors]
    print(node.text, node.dep_, node.n_lefts, node.n_rights, ancestor_texts)

Credit nmod 0 2 ['holders', 'submit']
and cc 0 0 ['Credit', 'holders', 'submit']
mortgage compound 0 0 ['account', 'Credit', 'holders', 'submit']
account conj 1 0 ['Credit', 'holders', 'submit']
holders nsubj 1 0 ['submit']


In [17]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Credit and mortgage account holders must submit their requests")

head_noun = doc[4]
print(head_noun.text)

# Slice from the left edge to the right edge of the token's subtree
# (the end index is exclusive, hence the +1).
first_idx = head_noun.left_edge.i
last_idx = head_noun.right_edge.i
span = doc[first_idx : last_idx + 1]
print(span.text)

# Collapse the whole span into one token, then show the re-tokenized doc.
with doc.retokenize() as retokenizer:
    retokenizer.merge(span)
for tok in doc:
    print(tok.text, tok.pos_, tok.dep_, tok.head.text)

holders
Credit and mortgage account holders
Credit and mortgage account holders NOUN nsubj submit
must VERB aux submit
submit VERB ROOT submit
their DET poss requests
requests NOUN dobj submit


## Visualizing dependencies

In [18]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
sentence = "Autonomous cars shift insurance liability toward manufacturers"
doc = nlp(sentence)
# In an interactive Jupyter session displacy.render draws the parse inline.
displacy.render(doc, style="dep")

In [36]:
import spacy
from spacy import displacy

# Load the pipeline with the "tagger" component switched off.
nlp = spacy.load("en_core_web_sm", disable=["tagger"])
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")

# Per token: text, dependency label, lemma, fine-grained tag, containing sentence.
# With the tagger disabled the tag_ column prints as an empty string (see the output).
for tok in doc:
    print(tok.text, tok.dep_, tok.lemma_, tok.tag_, tok.sent)
# The dependency drawing is intentionally skipped here:
# displacy.render(doc, style='dep')

Autonomous amod Autonomous  Autonomous cars shift insurance liability toward manufacturers
cars nsubj car  Autonomous cars shift insurance liability toward manufacturers
shift ROOT shift  Autonomous cars shift insurance liability toward manufacturers
insurance compound insurance  Autonomous cars shift insurance liability toward manufacturers
liability dobj liability  Autonomous cars shift insurance liability toward manufacturers
toward prep toward  Autonomous cars shift insurance liability toward manufacturers
manufacturers pobj manufacturer  Autonomous cars shift insurance liability toward manufacturers


## Named Entity Recognition 101

In [37]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Each recognised entity: text, start/end character offsets, and label.
for entity in doc.ents:
    start, end = entity.start_char, entity.end_char
    print(entity.text, start, end, entity.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [39]:
# Highlight the named entities of the `doc` from the previous cell using displaCy's "ent" style.
displacy.render(doc, style="ent")

## Accessing entity annotations

* ent_iob	IOB code of named entity tag. 3 means the token begins an entity, 2 means it is outside an entity, 1 means it is inside an entity, and 0 means no entity tag is set.

* ent_type Named entity type.

In [40]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("San Francisco considers banning sidewalk delivery robots")

# Document level: one (text, start_char, end_char, label) tuple per entity.
ents = [
    (span.text, span.start_char, span.end_char, span.label_)
    for span in doc.ents
]
print(ents)

# Token level: IOB code and entity type of the first two tokens.
san, francisco = doc[0], doc[1]
ent_san = [san.text, san.ent_iob_, san.ent_type_]
ent_francisco = [francisco.text, francisco.ent_iob_, francisco.ent_type_]
print(ent_san)  # ['San', 'B', 'GPE']
print(ent_francisco)  # ['Francisco', 'I', 'GPE']

[('San Francisco', 0, 13, 'GPE')]
['San', 'B', 'GPE']
['Francisco', 'I', 'GPE']


## Setting entity annotations

#### To ensure that the sequence of token annotations remains consistent, you have to set entity annotations at the document level. However, you can’t write directly to the token.ent_iob or token.ent_type attributes, so the easiest way to set entities is to assign to the doc.ents attribute and create the new entity as a Span.

In [42]:
import spacy
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")
doc = nlp("fb is hiring a new vice president of global policy")


def _describe(entity):
    """Return (text, start_char, end_char, label) for one entity span."""
    return (entity.text, entity.start_char, entity.end_char, entity.label_)


ents = [_describe(e) for e in doc.ents]
print("Before", ents)
# The model did not tag "fb" as an entity.

# Build a one-token Span over "fb" labelled ORG and append it to the entities.
fb_ent = Span(doc, 0, 1, label="ORG")
doc.ents = [*doc.ents, fb_ent]

ents = [_describe(e) for e in doc.ents]
print("After", ents)
# [('fb', 0, 2, 'ORG')]

Before []
After [('fb', 0, 2, 'ORG')]


## Setting entity annotations from array

In [54]:
import numpy
import spacy
from spacy.attrs import ENT_IOB, ENT_TYPE

nlp = spacy.load("en_core_web_sm")
# make_doc only tokenizes the text, so the doc starts without entities.
doc = nlp.make_doc("London is a big city in the United Kingdom.")
print("Before", doc.ents)  # () -- doc.ents is an empty tuple

# Columns of the attribute array, in order: IOB code, entity type.
header = [ENT_IOB, ENT_TYPE]
# One row per token, one column per attribute: shape (10, 2) for this sentence.
attr_array = numpy.zeros((len(doc), len(header)), dtype="uint64")
attr_array[0, 0] = 3  # IOB code 3 = B (token begins an entity)
attr_array[0, 1] = doc.vocab.strings["GPE"]  # entity type is stored as the string's id
doc.from_array(header, attr_array)
print("After", doc.ents)  # (London,)

Before ()
After (London,)


## Accessing entity identifiers

In [57]:
## 自定义实体标识器

In [58]:
# import spacy

# nlp = spacy.load("my_custom_el_model")
# doc = nlp("Ada Lovelace was born in London")

# # document level
# ents = [(e.text, e.label_, e.kb_id_) for e in doc.ents]
# print(ents)  # [('Ada Lovelace', 'PERSON', 'Q7259'), ('London', 'GPE', 'Q84')]

# # token level
# ent_ada_0 = [doc[0].text, doc[0].ent_type_, doc[0].ent_kb_id_]
# ent_ada_1 = [doc[1].text, doc[1].ent_type_, doc[1].ent_kb_id_]
# ent_london_5 = [doc[5].text, doc[5].ent_type_, doc[5].ent_kb_id_]
# print(ent_ada_0)  # ['Ada', 'PERSON', 'Q7259']
# print(ent_ada_1)  # ['Lovelace', 'PERSON', 'Q7259']
# print(ent_london_5)  # ['London', 'GPE', 'Q84']

## Adding special case tokenization rules

In [59]:
import spacy
from spacy.symbols import ORTH
nlp = spacy.load("en_core_web_sm")
doc = nlp("gimme that")  # phrase to tokenize
tokens_before = [w.text for w in doc]
print(tokens_before)  # ['gimme', 'that']

# Special-case rule: the string "gimme" is split into the pieces "gim" + "me".
special_case = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", special_case)

# Tokenize the same phrase again to see the rule applied.
tokens_after = [w.text for w in nlp("gimme that")]
print(tokens_after)  # ['gim', 'me', 'that']

['gimme', 'that']
['gim', 'me', 'that']


## Debugging the tokenizer

In [61]:
from spacy.lang.en import English

nlp = English()
text = '''"Let's go!"'''
doc = nlp(text)
# explain() yields (rule, token text) pairs describing how the text was split.
tok_exp = nlp.tokenizer.explain(text)
# The explanation must match the real tokenization once whitespace tokens are ignored.
assert [t.text for t in doc if not t.is_space] == [t[1] for t in tok_exp]
for rule, piece in tok_exp:
    print(piece, "\t", rule)

" 	 PREFIX
Let 	 SPECIAL-1
's 	 SPECIAL-2
go 	 TOKEN
! 	 SUFFIX
" 	 SUFFIX


In [62]:
# Show the raw list of (rule, token text) pairs produced by tokenizer.explain in the previous cell.
tok_exp

[('PREFIX', '"'),
 ('SPECIAL-1', 'Let'),
 ('SPECIAL-2', "'s"),
 ('TOKEN', 'go'),
 ('SUFFIX', '!'),
 ('SUFFIX', '"')]

In [63]:
import re
import spacy
from spacy.tokenizer import Tokenizer

# Special-case rule: the emoticon ":)" is kept as one token.
special_cases = {":)": [{"ORTH": ":)"}]}
# Prefix: a single opening bracket, parenthesis or quote at the start of the string.
# The "[" inside the character class is escaped on purpose: an unescaped "[["
# makes Python's re module emit "FutureWarning: Possible nested set".
prefix_re = re.compile(r'''^[\[("']''')
# Suffix: a single closing bracket, parenthesis or quote at the end of the string.
suffix_re = re.compile(r'''[\])"']$''')
# Infix: a hyphen or tilde anywhere inside a token.
infix_re = re.compile(r'''[-~]''')
# URL matcher: strings beginning with http:// or https://.
simple_url_re = re.compile(r'''^https?://''')

def custom_tokenizer(nlp):
    """Build a Tokenizer on ``nlp.vocab`` from the module-level rules and regexes.

    Uses ``special_cases`` as the rule table and the bound ``search`` /
    ``finditer`` / ``match`` methods of the compiled prefix, suffix, infix
    and URL patterns.
    """
    tokenizer = Tokenizer(
        nlp.vocab,
        rules=special_cases,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        url_match=simple_url_re.match,
    )
    return tokenizer

nlp = spacy.load("en_core_web_sm")
# Replace the pipeline's tokenizer with the custom one before processing text.
nlp.tokenizer = custom_tokenizer(nlp)
doc = nlp("hello-world. :)")
token_texts = [t.text for t in doc]
print(token_texts)  # ['hello', '-', 'world.', ':)']

['hello', '-', 'world.', ':)']


## Merging and splitting 

In [68]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("I live in New York")
print("Before:", [tok.text for tok in doc])

# Merge tokens 3-4 into a single token and give the result an explicit lemma.
new_york = doc[3:5]
with doc.retokenize() as retokenizer:
    retokenizer.merge(new_york, attrs={"LEMMA": "new york"})
print("After:", [tok.text for tok in doc])

Before: ['I', 'live', 'in', 'New', 'York']
After: ['I', 'live', 'in', 'New York']


## Splitting tokens

In [69]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("I live in NewYork")
print("Before:", [tok.text for tok in doc])
displacy.render(doc)  # displacy.serve if you're not in a Jupyter environment

with doc.retokenize() as retokenizer:
    # One head per new piece, in order: "New" gets (doc[3], 1), "York" gets doc[2].
    # NOTE(review): the tuple form presumably means "subtoken 1 of the token
    # being split" -- confirm against the Retokenizer.split documentation.
    heads = [(doc[3], 1), doc[2]]
    attrs = {
        "POS": ["PROPN", "PROPN"],
        "DEP": ["pobj", "compound"],
    }
    retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
print("After:", [tok.text for tok in doc])
displacy.render(doc)  # displacy.serve if you're not in a Jupyter environment

Before: ['I', 'live', 'in', 'NewYork']


After: ['I', 'live', 'in', 'New', 'York']


## Overwriting custom extension attributes

In [71]:
import spacy
from spacy.tokens import Token

# Register a custom token attribute, token._.is_musician.
# force=True keeps this cell re-runnable: without it, registering the same
# extension a second time in one kernel session raises a ValueError.
Token.set_extension("is_musician", default=False, force=True)

nlp = spacy.load("en_core_web_sm")
doc = nlp("I like David Bowie")
print("Before:", [(token.text, token._.is_musician) for token in doc])

# Merge "David Bowie" into one token and set the custom attribute on the
# merged token through the "_" key of attrs.
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[2:4], attrs={"_": {"is_musician": True}})
print("After:", [(token.text, token._.is_musician) for token in doc])

Before: [('I', False), ('like', False), ('David', False), ('Bowie', False)]
After: [('I', False), ('like', False), ('David Bowie', True)]


## Default: Using the dependency parse

In [75]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence. This is another sentence.")
# Print each sentence produced by the loaded pipeline on its own line.
for sentence in doc.sents:
    print(sentence.text)

This is a sentence.
This is another sentence.


## Rule-based pipeline component

In [74]:
import spacy
from spacy.lang.en import English

nlp = English()  # just the language with no model
# NOTE(review): create_pipe(...) followed by add_pipe(component) looks like the
# spaCy v2 calling convention -- confirm the installed spaCy version before upgrading.
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)
doc = nlp("This is a sentence. This is another sentence.")
for sentence in doc.sents:
    print(sentence.text)

This is a sentence.
This is another sentence.


In [76]:
import spacy

text = "this is a sentence...hello...and another sentence."

# Baseline: sentence split from the unmodified pipeline.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
sentences_before = [sent.text for sent in doc.sents]
print("Before:", sentences_before)

def set_custom_boundaries(doc):
    """Mark the token after every "..." token as a sentence start.

    The last token is never inspected, because nothing follows it.
    Modifies ``doc`` in place and returns the same object.
    """
    for idx in range(len(doc) - 1):
        if doc[idx].text != "...":
            continue
        doc[idx + 1].is_sent_start = True
    return doc

# Run the custom boundary component before the parser, then re-process the text.
# NOTE(review): passing the function object to add_pipe looks like the spaCy v2
# calling convention -- confirm the installed spaCy version before upgrading.
nlp.add_pipe(set_custom_boundaries, before="parser")
doc = nlp(text)
sentences_after = [sent.text for sent in doc.sents]
print("After:", sentences_after)

Before: ['this is a sentence...hello...and another sentence.']
After: ['this is a sentence...', 'hello...', 'and another sentence.']
