# EXPANDING NAME ENTITIES

In [3]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [4]:
nlp= spacy.load("en_core_web_sm")

In [5]:
doc= nlp('Dr. Alex Smith chaired first board meeting at Google')

In [6]:
doc

Dr. Alex Smith chaired first board meeting at Google

In [7]:
print([(ent.text, ent.label_) for ent in doc.ents])

[('Alex Smith', 'PERSON'), ('first', 'ORDINAL'), ('Google', 'ORG')]


In [2]:
# Installing spacy transformer
!pip install spacy[transformers]

Collecting spacy-transformers<1.2.0,>=1.1.2
  Downloading spacy_transformers-1.1.4-py2.py3-none-any.whl (51 kB)
Collecting torch>=1.6.0
  Downloading torch-1.11.0-cp39-cp39-win_amd64.whl (157.9 MB)
Collecting transformers<4.16.0,>=3.4.0
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
Collecting spacy-alignments<1.0.0,>=0.7.2
  Downloading spacy_alignments-0.8.4-cp39-cp39-win_amd64.whl (183 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp39-cp39-win_amd64.whl (2.0 MB)
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers, torch, spacy-alignments, spacy-transformers
Successfully installed huggingface-hub-0.4.0 sacremoses-0.0.47 spacy-alignments-0.8.4 spacy-transformers-1.1.4 tokenizers-0.10.3 torch-1.11.0 transformers-4.15.0


In [3]:
import spacy
from spacy.language import Language
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")

In [6]:
# Writing function to extract DR Mr Ms etc
@Language.component("add_title")
def add_title(doc):
    new_ents = []
    for ent in doc.ents:
        if ent.label_ == "PERSON" and ent.start != 0:
            prev_token = doc[ent.start - 1]
            if prev_token.text in ("Dr", "Dr.", "Mr", "Mr.", "Ms", "Ms."):
                new_ent = Span(doc, ent.start - 1, ent.end, label=ent.label)
                new_ents.append(new_ent)
        else:
            new_ents.append(ent)
    doc.ents = new_ents
    return doc

In [7]:
# Add the component after the named entity recognizer
nlp.add_pipe("add_title", after="ner")

doc = nlp("Dr. Alex Smith chaired first board meeting of Acme Corp Inc.")
print([(ent.text, ent.label_) for ent in doc.ents])

[('Dr. Alex Smith', 'PERSON'), ('first', 'ORDINAL'), ('Acme Corp Inc.', 'ORG')]


In [10]:
# Another approach to do ectract Dr Mr etc
import spacy
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")


def get_person_title(span):
    if span.label_ == "PERSON" and span.start != 0:
        prev_token = span.doc[span.start - 1]
        if prev_token.text in ("Dr", "Dr.", "Mr", "Mr.", "Ms", "Ms."):
            return prev_token.text

# Register the Span extension as 'person_title'
Span.set_extension("person_title", getter=get_person_title)

doc = nlp("Dr Alex Smith chaired first board meeting of Acme Corp Inc.")
print([(ent.text, ent.label_, ent._.person_title) for ent in doc.ents])

ValueError: [E090] Extension 'person_title' already exists on Span. To overwrite the existing extension, set `force=True` on `Span.set_extension`.

# USE OF POS AND DEEP PARSING

In [11]:
nlp= spacy.load('en_core_web_sm')

In [12]:
doc= nlp("Alex Smith was working at Google")

In [16]:
from spacy import displacy
displacy.render(doc,style='dep', options={'compact':True, 'distance':100})

In [24]:
@Language.component("get_person_orgs")
def get_person_orgs(doc):
    person_entities = [ent for ent in doc.ents if ent.label_ == "PERSON"]
    for ent in person_entities:
        # Because the entity is a span, we need to use its root token. The head
        # is the syntactic governor of the person, e.g. the verb
        head = ent.root.head
        if head.lemma_ == "work":
            # Check if the children contain a preposition
            preps = [token for token in head.children if token.dep_ == "prep"]
            for prep in preps:
            # Check if tokens part of ORG entities are in the preposition's
            # children, e.g. at -> Acme Corp Inc.
                orgs = [token for token in prep.children if token.ent_type_ == "ORG"]
            # If the verb is in past tense, the company was a previous company
                print({"person": ent, "orgs": orgs, "past": head.tag_ == "VBD"})
    return doc

In [19]:
# We need to extract whether it is a past or oresent?

In [25]:
from spacy.pipeline import merge_entities

In [26]:
nlp= spacy.load('en_core_web_sm')

In [32]:
# To make the entities easier to work with, we'll merge them into single tokens
nlp.add_pipe("merge_entities")
#nlp.add_pipe("get_person_orgs")

ValueError: [E007] 'merge_entities' already exists in pipeline. Existing names: ['tok2vec', 'tagger', 'parser', 'senter', 'attribute_ruler', 'lemmatizer', 'ner', 'merge_entities']

In [33]:
nlp.add_pipe("get_person_orgs")

<function __main__.get_person_orgs(doc)>

In [34]:
doc=nlp("Alex Smith was working at Google")

{'person': Alex Smith, 'orgs': [Google], 'past': False}


In [35]:
doc=nlp("Alex Smith worked at Google")

{'person': Alex Smith, 'orgs': [Google], 'past': True}


In [39]:
# Modify model so it will be working for "was" as well, we will change the function name as well

In [40]:
@Language.component("extract_person_orgs")
def extract_person_orgs(doc):
    person_entities = [ent for ent in doc.ents if ent.label_ == "PERSON"]
    for ent in person_entities:
        head = ent.root.head
        if head.lemma_ == "work":
            preps = [token for token in head.children if token.dep_ == "prep"]
            for prep in preps:
                orgs = [t for t in prep.children if t.ent_type_ == "ORG"]
                aux = [token for token in head.children if token.dep_ == "aux"]
                past_aux = any(t.tag_ == "VBD" for t in aux)
                past = head.tag_ == "VBD" or head.tag_ == "VBG" and past_aux
                print({'person': ent, 'orgs': orgs, 'past': past})
    return doc

In [41]:
nlp.add_pipe("extract_person_orgs")

<function __main__.extract_person_orgs(doc)>

In [42]:
doc=nlp("Alex Smith was working at Google")

{'person': Alex Smith, 'orgs': [Google], 'past': False}
{'person': Alex Smith, 'orgs': [Google], 'past': True}


In [43]:
doc=nlp("Alex Smith worked at Google")

{'person': Alex Smith, 'orgs': [Google], 'past': True}
{'person': Alex Smith, 'orgs': [Google], 'past': True}
