RegEx to Extract Multi-Word Tokens

In [2]:
import re

In [21]:
text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."

In [4]:
# "Paul", one space, an ASCII capital letter, then one or more word characters.
pattern = r"Paul [A-Z]\w+"

# Lazily iterate over every hit of the pattern in the sample text.
matches = re.finditer(pattern, text)

for found in matches:
    print(found)

<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


In [5]:
import re
import spacy
from spacy.tokens import Span

In [None]:
nlp = spacy.blank("en")
doc = nlp(text)

# Start from the entities the doc already carries so they are preserved.
original_ents = list(doc.ents)

# Convert each regex hit from character offsets to token offsets.
# Hits for which char_span yields None are skipped.
mwt_ents = []
for match in re.finditer(pattern, doc.text):
    span = doc.char_span(*match.span())
    if span is None:
        continue
    mwt_ents.append((span.start, span.end, span.text))

# Wrap every token range in a PERSON span and attach them to the doc.
original_ents.extend(
    Span(doc, tok_start, tok_end, label="PERSON")
    for tok_start, tok_end, _ in mwt_ents
)
doc.ents = original_ents

for ent in doc.ents:
    print(ent.text, ent.label_)
print(mwt_ents)

Paul Newman PERSON
Paul Hollywood PERSON
[(0, 2, 'Paul Newman'), (8, 10, 'Paul Hollywood')]


Create a custom component with this

In [30]:
from spacy.language import Language
pattern = r"Paul [A-Z]\w+"


@Language.component("paul_ner")
def paul_ner(doc):
    """Tag every regex hit for "Paul <Capitalised word>" as a PERSON entity.

    The regex is run over ``doc.text`` and each hit is converted from
    character offsets to a token span with ``doc.char_span``. Hits for which
    ``char_span`` gives back ``None`` are skipped. The new PERSON spans are
    appended to the entities already present on ``doc`` and the combined list
    is written back to ``doc.ents``.

    Args:
        doc: The spaCy ``Doc`` handed to this pipeline component.

    Returns:
        The same ``doc`` with ``doc.ents`` updated.
    """
    pattern = r"Paul [A-Z]\w+"
    entities = list(doc.ents)

    # First pass: collect the token boundaries of every usable regex hit.
    token_bounds = []
    for hit in re.finditer(pattern, doc.text):
        char_start, char_end = hit.span()
        matched = doc.char_span(char_start, char_end)
        if matched is None:
            continue
        token_bounds.append((matched.start, matched.end))

    # Second pass: turn the boundaries into labelled spans.
    entities.extend(
        Span(doc, tok_start, tok_end, label="PERSON")
        for tok_start, tok_end in token_bounds
    )
    doc.ents = entities
    return doc

In [31]:
# Create a blank English pipeline and add the regex-based "paul_ner" component.
nlp2 = spacy.blank("en")
nlp2.add_pipe("paul_ner")

<function __main__.paul_ner(doc)>

In [32]:
# Run the pipeline on the sample text and show the resulting entity spans.
doc2 = nlp2(text)
print(doc2.ents)

(Paul Newman, Paul Hollywood)


In [33]:
# Inspect the entities produced by the "paul_ner" pipeline (doc2) together
# with their labels, rather than the manually annotated `doc` from the
# earlier cell.
for ent in doc2.ents:
    print(ent.text, ent.label_)

Paul Newman PERSON
Paul Hollywood PERSON


Create a new entity/component: solving a real-life problem

In [37]:
from spacy.language import Language

@Language.component("cinema_ner")
def cinema_ner(doc):
    """Tag every occurrence of "Hollywood" in the text as a CINEMA entity.

    Each regex hit in ``doc.text`` is converted to a token span with
    ``doc.char_span`` (hits that yield ``None`` are skipped), wrapped in a
    CINEMA span, and appended to the entities already present on ``doc``.

    No overlap handling is done here: the combined list is assigned to
    ``doc.ents`` as is. When a CINEMA span shares a token with an existing
    entity, spaCy rejects the assignment with ``ValueError`` [E1010], as the
    run of this component on top of ``en_core_web_sm`` demonstrates.

    Args:
        doc: The spaCy ``Doc`` handed to this pipeline component.

    Returns:
        The same ``doc`` with ``doc.ents`` updated.
    """
    pattern = r"Hollywood"
    entities = list(doc.ents)

    # Collect the token boundaries of every usable regex hit.
    token_bounds = []
    for hit in re.finditer(pattern, doc.text):
        char_start, char_end = hit.span()
        matched = doc.char_span(char_start, char_end)
        if matched is None:
            continue
        token_bounds.append((matched.start, matched.end))

    entities.extend(
        Span(doc, tok_start, tok_end, label="CINEMA")
        for tok_start, tok_end in token_bounds
    )
    doc.ents = entities
    return doc

In [38]:
# Load the en_core_web_sm pipeline and add the custom "cinema_ner" component to it.
nlp3 = spacy.load("en_core_web_sm")
nlp3.add_pipe("cinema_ner")

<function __main__.cinema_ner(doc)>

In [39]:
# Running the pipeline on the sample text raises ValueError [E1010] (see the output below).
doc3 = nlp3(text)

ValueError: [E1010] Unable to set entity information for token 9 which is included in more than one span in entities, blocked, missing or outside.

This is a common error: it occurs because some spans overlap, i.e. we are trying to assign the same token to more than one entity span. One way to solve it is to use `filter_spans` from `spacy.util`.

In [None]:
from spacy.language import Language
from spacy.util import filter_spans
@Language.component("cinema_ner")
def cinema_ner(doc):
    """Tag "Hollywood" as a CINEMA entity, resolving overlaps with other entities.

    Each regex hit in ``doc.text`` is converted to a token span with
    ``doc.char_span`` (hits that yield ``None`` are skipped), wrapped in a
    CINEMA span, and added to the entities already present on ``doc``. The
    combined list is passed through ``filter_spans`` before being assigned to
    ``doc.ents``.

    Args:
        doc: The spaCy ``Doc`` handed to this pipeline component.

    Returns:
        The same ``doc`` with ``doc.ents`` updated.
    """
    pattern = r"Hollywood"
    candidates = list(doc.ents)

    # Collect the token boundaries of every usable regex hit.
    token_bounds = []
    for hit in re.finditer(pattern, doc.text):
        matched = doc.char_span(*hit.span())
        if matched is not None:
            token_bounds.append((matched.start, matched.end))

    for tok_start, tok_end in token_bounds:
        candidates.append(Span(doc, tok_start, tok_end, label="CINEMA"))

    # filter_spans compares the start and end of every candidate span; where
    # spans share tokens, the longer span is kept and the shorter one dropped.
    doc.ents = filter_spans(candidates)
    return doc

In [44]:
# Build a fresh en_core_web_sm pipeline with the filter_spans version of
# "cinema_ner", run it on the sample text, and list each entity with its label.
nlp4 = spacy.load("en_core_web_sm")
nlp4.add_pipe("cinema_ner")
doc4 = nlp4(text)
for ent in doc4.ents:
    print(ent.text, ent.label_)


Paul Newman PERSON
American NORP
Paul Hollywood PERSON
British NORP
Paul PERSON


We don't see the CINEMA label because "Paul Hollywood" is a longer span than just "Hollywood", so `filter_spans` keeps the PERSON entity and drops the overlapping CINEMA span.