### created by hcy 20200729 16:04

###  Expanding named entities

In [1]:
import spacy 
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [2]:
nlp = spacy.load("en_core_web_lg")

In [3]:
doc = nlp("Dr. Alex Smith chaired first board meeting at Google")

In [4]:
doc

Dr. Alex Smith chaired first board meeting at Google

In [7]:
print([(ent.text, ent.label_, ent.start) for ent in doc.ents])

[('Alex Smith', 'PERSON', 1), ('Google', 'ORG', 8)]


In [8]:
def add_title(doc):
    new_ents = []
    for ent in doc.ents:
        if ent.label_ == "PERSON" and ent.start != 0:
            prev_token = doc[ent.start-1]
            if prev_token.text in ("Dr", "Dr.", "Mr", "Mr."):
                new_ent = Span(doc, ent.start-1, ent.end, label=ent.label)
                new_ents.append(new_ent)
            else:
                new_ents.append(ent)
    doc.ents = new_ents
    return doc

In [9]:
# nlp = spacy.load("en_core_web_lg")
nlp.add_pipe(add_title, after="ner")

In [10]:
doc = nlp("Dr. Alex Smith chaired first board meeting at Google")

In [11]:
print([(ent.text, ent.label_) for ent in doc.ents])

[('Dr. Alex Smith', 'PERSON')]


## Use of POS and Dep Parsing

In [12]:
nlp = spacy.load("en_core_web_lg")

In [13]:
doc = nlp("Alex Smith was working at Google")

In [14]:
print([token.text for token in doc])

['Alex', 'Smith', 'was', 'working', 'at', 'Google']


#### 关系标签

#### 标签表示从属的语法功能，名词性的标签是：

* root：中心词，通常是动词
* nsubj：名词性主语（nominal subject）
* dobj：直接宾语（direct object）
* prep：介词
* pobj：介词宾语
* cc：连词

#### 其他常用的标签：
* compound：复合词
* advmod：状语
* det：限定词
* amod：形容词修饰语

In [15]:
displacy.render(doc, style="dep", options={"compact": True, "distance":100})

In [24]:
person_entities = [ent for ent in doc.ents if ent.label_ == "PERSON"]
for ent in person_entities:
    print(ent.text)
    print(ent.root.text)
    head = ent.root.head
    print(head.text)
    print("\n")
    for token in ent.root.head.children:
        print(token.text)

Alex Smith
Smith
working


Smith
was
at


In [25]:
def get_person_orgs(doc):
    person_entities = [ent for ent in doc.ents if ent.label_ == "PERSON"]
    for ent in person_entities:
        head = ent.root.head
        if head.lemma_ == 'work':
            preps = [token for token in head.children if token.dep_ == 'prep']
            for prep in preps:
                orgs = [token for token in prep.children if token.ent_type_ == "ORG"]
                print({'person': ent, 'orgs': orgs, 'past': head.tag == "VBD"})
    return doc

In [26]:
from spacy.pipeline import merge_entities

In [27]:
nlp = spacy.load("en_core_web_lg")

In [97]:
nlp.add_pipe(merge_entities)

In [98]:
nlp.add_pipe(get_person_orgs)

In [109]:
doc = nlp("Alex Smith is working at Google")

{'person': Alex Smith, 'orgs': [Google], 'past': False}
