In [1]:
import re

In [2]:
# Sample sentence shared by the regex and spaCy examples in this notebook.
text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."

In [3]:
# Literal "Paul ", then one uppercase ASCII letter followed by one or more
# word characters -- i.e. "Paul" plus a capitalized following word.
pattern=r"Paul [A-Z]\w+"

In [4]:
# re.finditer returns a lazy iterator of match objects; an iterator can be
# consumed only once.
matches=re.finditer(pattern,text)

In [5]:
# Build a fresh iterator here rather than reusing `matches`: an iterator is
# exhausted after one pass, so re-running this cell would otherwise print
# nothing.
for match in re.finditer(pattern, text):
    print(match)

<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


In [6]:
import spacy
from spacy.tokens import Span

In [14]:
# Blank English pipeline: the doc it produces carries no entities yet (the
# first print shows an empty tuple).
nlp = spacy.blank("en")
doc = nlp(text)
print(doc.ents)
original_ents = list(doc.ents)

# (token start, token end, text) for every regex hit that char_span could map
# onto tokens; hits for which char_span returns None are dropped.
char_spans = (doc.char_span(*hit.span()) for hit in re.finditer(pattern, doc.text))
mwt_ents = [(sp.start, sp.end, sp.text) for sp in char_spans if sp is not None]

# Wrap each triple in a PERSON span and install the combined list on the doc.
original_ents.extend(
    Span(doc, tok_start, tok_end, label="PERSON")
    for tok_start, tok_end, _ in mwt_ents
)
doc.ents = original_ents

for entity in doc.ents:
    print(entity.text, entity.label_)


()
Paul Newman PERSON
Paul Hollywood PERSON


In [15]:
# Each tuple is (span.start, span.end, span.text): token offsets, not
# character offsets, plus the matched text.
print(mwt_ents)

[(0, 2, 'Paul Newman'), (8, 10, 'Paul Hollywood')]


In [16]:
from spacy.language import Language

In [18]:
@Language.component("paul_ner")
def paul_ner(doc):
    """Label every "Paul <Capitalized word>" mention in ``doc`` as PERSON.

    The regex is run over ``doc.text``; each hit that ``doc.char_span`` can
    map onto tokens becomes a PERSON span. Those spans are merged with the
    entities already on the doc, and overlaps are resolved with
    ``filter_spans`` before assignment, so the component can also run after
    another entity recognizer that tagged the same tokens.

    Args:
        doc: The spaCy ``Doc`` handed over by the pipeline.

    Returns:
        The same ``doc`` with ``doc.ents`` updated.
    """
    # Local import: the notebook only imports filter_spans in a later cell.
    from spacy.util import filter_spans

    pattern = r"Paul [A-Z]\w+"
    person_ents = []
    for match in re.finditer(pattern, doc.text):
        # char_span returns None when the character offsets cannot be mapped
        # onto tokens; such hits are skipped.
        span = doc.char_span(*match.span(), label="PERSON")
        if span is not None:
            person_ents.append(span)

    # Assigning overlapping spans to doc.ents raises an error. Existing
    # entities are listed first so they win ties against equally long new ones.
    doc.ents = filter_spans(list(doc.ents) + person_ents)
    return doc

In [19]:
# Blank English pipeline whose only added component is the regex-based
# "paul_ner" registered above.
nlp2=spacy.blank("en")
nlp2.add_pipe("paul_ner")

<function __main__.paul_ner(doc)>

In [20]:
# Run the sample text through nlp2 and show the entities set by "paul_ner".
doc2=nlp2(text)
print(doc2.ents)

(Paul Newman, Paul Hollywood)


In [62]:
from spacy.util import filter_spans

@Language.component("cinema_ner")
def cinema_ner(doc):
    """Propose token-aligned "Hollywood" mentions in ``doc`` as CINEMA entities.

    The candidates are appended to the entities already on the doc and the
    combined list is passed through ``filter_spans`` before being assigned
    back, so overlapping spans are resolved rather than raising.

    Args:
        doc: The spaCy ``Doc`` handed over by the pipeline.

    Returns:
        The same ``doc`` with ``doc.ents`` replaced by the filtered spans.
    """
    candidates = list(doc.ents)

    for hit in re.finditer(r"Hollywood", doc.text):
        aligned = doc.char_span(*hit.span())
        if aligned is None:
            # Character offsets could not be mapped onto tokens; skip the hit.
            continue
        candidates.append(Span(doc, aligned.start, aligned.end, label="CINEMA"))

    doc.ents = filter_spans(candidates)
    return doc


In [63]:
# Load the pretrained "en_core_web_sm" pipeline and add the custom
# "cinema_ner" component to it (add_pipe without a position argument appends
# it at the end).
nlp3=spacy.load("en_core_web_sm")
nlp3.add_pipe("cinema_ner")

<function __main__.cinema_ner(doc)>

In [64]:
from spacy.util import filter_spans
# Show the final entities for the sample text. No CINEMA entity is printed:
# "Hollywood" falls inside the "Paul Hollywood" PERSON span, and cinema_ner's
# filter_spans call keeps only one of two overlapping spans.
doc3=nlp3(text)
for ent in doc3.ents:
    print(ent.text,ent.label_)

Paul Newman PERSON
American NORP
Paul Hollywood PERSON
British NORP
