In [147]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [148]:
nlp=spacy.load('en_core_web_sm')

In [149]:
doc=nlp('Dr. Alex Smith chaired firat board meeting at Google')

In [150]:
doc

Dr. Alex Smith chaired firat board meeting at Google

In [151]:
print([(ent.text,ent.label_) for ent in doc.ents])

[('Alex Smith', 'PERSON')]


In [152]:
from spacy.language import Language

@Language.component("add_title")
def add_title(doc):
    new_ents=[]
    for ent in doc.ents:
        if ent.label_=='PERSON' and ent.start!=0:
            prev_token=doc[ent.start-1]
            if prev_token.text in ('Dr','Dr.','Mr','Mr.'):
                new_ent=Span(doc,ent.start-1,ent.end,label=ent.label_)
                new_ents.append(new_ent)
            else:
                new_ents.append(ent)
    doc.ents=new_ents
    return doc

In [153]:
nlp=spacy.load('en_core_web_sm')
nlp.add_pipe("add_title",after='ner')

<function __main__.add_title(doc)>

In [154]:
doc=nlp('Dr. Alex Smith chaired first board meeting at Google')

In [155]:
print([(ent.text,ent.label_) for ent in doc.ents])

[('Dr. Alex Smith', 'PERSON')]


### Use of Pos and Dep Parsing

In [156]:
nlp=spacy.load('en_core_web_sm')

In [157]:
doc=nlp('Alex Smith was working at Google')

In [158]:
displacy.render(doc,style='dep',options={'compact':True,'distance':100})

In [200]:
"""
from spacy.language import Language
@Language.component("get_person_orgs")
def get_person_orgs(doc):
    person_entities=[ent for ent in doc.ents if ent.label_=="PERSON"]
    for ent in person_entities:
        head=ent.root.head
        if head.lemma_=='work':
            preps=[token for token in head.children if token.dep_=='prep']
            for prep in preps:
                orgs=[token for token in prep.children if token.ent_type_=="ORG"]
                print({'person':ent,'orgs':orgs,'past':head.tag_=="VBD"})
    return doc
"""
import spacy
from spacy.language import Language
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

@Language.component("extract_person_orgs")
def extract_person_orgs(doc):
    person_entities = [ent for ent in doc.ents if ent.label_ == "PERSON"]
    for ent in person_entities:
        head = ent.root.head
        if head.lemma_ == "work":
            preps = [token for token in head.children if token.dep_ == "prep"]
            for prep in preps:
                orgs = [token for token in prep.children if token.ent_type_ == "ORG"]
                print({'person': ent, 'orgs': orgs, 'past': head.tag_ == "VBD"})
    return doc

# To make the entities easier to work with, we'll merge them into single tokens
nlp.add_pipe("merge_entities")
nlp.add_pipe("extract_person_orgs")

doc = nlp("Alex Smith worked at Acme Corp Inc.")
doc23=nlp("Alex Smith worked at Google.")
# If you're not in a Jupyter / IPython environment, use displacy.serve
displacy.render(doc, options={"fine_grained": True})

{'person': Alex Smith, 'orgs': [Acme Corp Inc.], 'past': True}
{'person': Alex Smith, 'orgs': [Google], 'past': True}


In [201]:
from spacy.language import Language
from spacy.pipeline import merge_entities


In [202]:
nlp=spacy.load("en_core_web_sm")


In [203]:
nlp.add_pipe("merge_entities")

<function spacy.pipeline.functions.merge_entities(doc: spacy.tokens.doc.Doc)>

In [204]:
nlp.add_pipe("extract_person_orgs")

<function __main__.extract_person_orgs(doc)>

In [206]:
doc23=nlp("Alex Smith worked at Google.")

{'person': Alex Smith, 'orgs': [Google], 'past': True}


### Modify Model


In [236]:
"""
from spacy.language import Language
@Language.component("get_person_orgs")
def get_person_orgs(doc):
    person_entities=[ent for ent in doc.ents if ent.label_=="PERSON"]
    for ent in person_entities:
        head=ent.root.head
        if head.lemma_=='work':
            preps=[token for token in head.children if token.dep_=='prep']
            for prep in preps:
                orgs=[token for token in prep.children if token.ent_type_=="ORG"]
                aux=[token for token in head.children if token.dep_=='aux']
                past_aux=any(t.tag_=="VBD" for t in aux)
                past=head.tag=="VBD" or head.tag_=="VBG" and past_aux
            print({'person':ent,'orgs':orgs,'past':past})   
                
    return doc
"""
import spacy
from spacy.language import Language
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

@Language.component("extract_person_orgs")
def extract_person_orgs(doc):
    person_entities = [ent for ent in doc.ents if ent.label_ == "PERSON"]
    for ent in person_entities:
        head = ent.root.head
        if head.lemma_ == "work":
            preps = [token for token in head.children if token.dep_ == "prep"]
            for prep in preps:
                orgs = [token for token in prep.children if token.ent_type_ == "ORG"]
                aux=[token for token in head.children if token.dep_=='aux']
                past_aux=any(t.tag_=="VBD" for t in aux)
                past=head.tag=="VBD" or head.tag_=="VBG" and past_aux
                print({'person': ent, 'orgs': orgs, 'past': head.tag_ == "VBD"})
    return doc

# To make the entities easier to work with, we'll merge them into single tokens
nlp.add_pipe("merge_entities")
nlp.add_pipe("extract_person_orgs")

doc23=nlp("Aryamaan Pandey worked at Google.")

# If you're not in a Jupyter / IPython environment, use displacy.serve
displacy.render(doc23, options={"fine_grained": True})

{'person': Aryamaan Pandey, 'orgs': [Google], 'past': True}


In [237]:
nlp=spacy.load("en_core_web_sm")


In [238]:
nlp.add_pipe("merge_entities")

<function spacy.pipeline.functions.merge_entities(doc: spacy.tokens.doc.Doc)>

In [239]:
nlp.add_pipe("get_person_orgs")

<function __main__.get_person_orgs(doc)>

In [232]:
doc123=nlp("Alex Smith was working at Apple.")

{'person': Alex Smith, 'orgs': [Apple], 'past': True}


### Processing Text

In [169]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [170]:
nl=spacy.load("en_core_web_sm")

In [171]:
doc2=nl("This is a raw text")

In [172]:
texts=["This is raw text","There is lots of text"]


In [173]:
doc2=list(nl.pipe(texts))

In [174]:
import spacy
texts=[
    "Net income was $9.4 million compared to the prior year of $2.7 billion",
    "Revenue exceeded twelve billion dollars,with a loss of $1b.",
]
nlp=spacy.load("en_core_web_sm")
for doc in nlp.pipe(texts,disable=["tagger","parser"]):
    #Do something with doc here
    print([(ent.text,ent.label_) for ent in doc.ents])
    print()



[('$9.4 million', 'MONEY'), ('the prior year', 'DATE'), ('$2.7 billion', 'MONEY')]

[('twelve billion dollars', 'MONEY'), ('1b', 'MONEY')]



### Disabling and Modifying Pipeline Components

In [175]:
nl2=spacy.load("en_core_web_sm",disable=["tagger","parser"])


In [176]:
nl2

<spacy.lang.en.English at 0x180cc896e08>

In [177]:
doc2311=nl2("Apple is buying a startup!!!!!")



In [178]:
for ent in doc2311.ents:
    print(ent.text,ent.label_)

Apple ORG


In [179]:
nl2=spacy.load("en_core_web_sm")

In [180]:
#1. Use as a context manager
with nlp.disable_pipes("tagger","parser"):
    doc=nl2("I won't be tagged and parsed")
    doc=nl2("I will be tagged and parsed")

In [181]:
#2. Restore manually
disabled=nl2.disable_pipes("ner")
doc=nl2("I won't have named entities")
disabled.restore()