<h3>Regular expressions, or RegEx for short, are a way of performing complex string matching using simple or complex patterns.</h3>

In [26]:
import re

import spacy
from spacy.tokens import Span
from spacy.tokens import span

In [3]:
# Match a day of the month followed by a month name, e.g. "2 February".
#   (\d){1,2}          any digit (0-9), occurring once or twice
#   (January|...)      one of the twelve full month names
# re.findall returns one tuple per match with one entry per capture group.
# Because the {1,2} quantifier sits outside (\d), that group keeps only the
# last digit it matched; the outermost group holds the complete date.
pattern = r"((\d){1,2} (January|February|March|April|May|June|July|August|September|October|November|December))"
text = "This is a date 2 February. Another date would be 14 August"
matches = re.compile(pattern).findall(text)
print(matches)

[('2 February', '2', 'February'), ('14 August', '4', 'August')]


In [9]:
# Accept both orders: "<day> <month>" (14 August) and "<month> <day>" (February 2).
# The expression is assembled from named pieces so the month list is written
# only once; the resulting string is the same as spelling it out in full.
month_group = "(January|February|March|April|May|June|July|August|September|October|November|December)"
# Quantifier outside the group: the group itself keeps only the last digit matched.
day_group = r"(\d){1,2}"
day_then_month = f"({day_group}( {month_group}))"
month_then_day = f"(({month_group} ){day_group})"
pattern = f"({day_then_month}|{month_then_day})"

text = "This is a date February 2. Another date would be 14 August."
matches = re.compile(pattern).findall(text)
print(matches)

[('February 2', '', '', '', '', 'February 2', 'February ', 'February', '2'), ('14 August', '14 August', '4', ' August', 'August', '', '', '', '')]


In [10]:
# re.finditer returns an iterator of match objects rather than a list, so
# printing it shows only the iterator object itself, not the matches.
# `pattern` is the two-order date expression defined in the previous cell.
text = "This is a date February 2. Another date would be 14 August."
iter_matches = re.compile(pattern).finditer(text)
print(iter_matches)

<callable_iterator object at 0x1102f9750>


In [11]:
text = "This is a date February 2. Another date would be 14 August."
iter_matches = re.compile(pattern).finditer(text)
print(iter_matches)

# Looping over the iterator is what yields the individual match objects;
# each one prints with its character span and the matched text.
for hit in iter_matches:
    print(hit)

<callable_iterator object at 0x1102f9840>
<re.Match object; span=(15, 25), match='February 2'>
<re.Match object; span=(49, 58), match='14 August'>


In [12]:
text = "This is a date February 2. Another date would be 14 August."
iter_matches = re.finditer(pattern, text)
for hit in iter_matches:
    # span() gives the (start, end) character offsets of the match, which
    # can be used to slice the matched substring back out of the text.
    start, end = hit.span()
    print(text[start:end])

February 2
14 August


<h3>RegEx in spaCy</h3>

In [15]:
import spacy

text = "This is a sample number 555-5555"

# Blank English pipeline with a rule-based entity ruler added to it.
nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")

# Token-level pattern: a three-digit token, an optional "-" token,
# then a four-digit token.
patterns = [
    {
        "label": "PHONE_NUMBER",
        "pattern": [
            {"SHAPE": "ddd"},
            {"ORTH": "-", "OP": "?"},
            {"SHAPE": "dddd"},
        ],
    },
]
ruler.add_patterns(patterns)

doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

555-5555 PHONE_NUMBER


In [16]:
# Phone number written as three digits, a hyphen, then four digits.
# findall returns one tuple per match with an entry for each capture group;
# the quantifiers sit outside the (\d) groups, so each of those groups keeps
# only the last digit it matched while the outer group holds the full number.
pattern = r"((\d){3}-(\d){4})"
text = "This is a sample number 555-5555."
matches = re.compile(pattern).findall(text)
print(matches)

[('555-5555', '5', '5')]


In [18]:
text = "This is a sample number (555) 555-5555."
nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")

# The regex is written as a raw string: in a normal string literal "\d" is an
# invalid escape sequence that Python warns about. The characters handed to
# the matcher are unchanged.
patterns = [
    {
        "label": "PHONE_NUMBER",
        "pattern": [{"TEXT": {"REGEX": r"((\d){3}-(\d){4})"}}],
    }
]
ruler.add_patterns(patterns)
doc = nlp(text)

# NOTE(review): this loop prints nothing for the text above. Presumably the
# REGEX is tested against one token at a time and "555-5555" is split into
# several tokens, so no single token matches - confirm against the spaCy docs.
for ent in doc.ents:
    print(ent.text, ent.label_)

In [21]:
text = "This is a sample number 5555555."
nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")

# The regex is written as a raw string: in a normal string literal "\d" is an
# invalid escape sequence that Python warns about. The characters handed to
# the matcher are unchanged.
# NOTE(review): the pattern asks for five digits, yet a seven-digit number is
# labelled - presumably the REGEX is searched for anywhere inside a token
# rather than matched against the whole token; confirm against the spaCy docs.
patterns = [
    {
        "label": "PHONE_NUMBER",
        "pattern": [{"TEXT": {"REGEX": r"((\d){5})"}}],
    }
]
ruler.add_patterns(patterns)
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

5555555 PHONE_NUMBER


<h3>Multi-word Tokens</h3>

In [22]:
# Sample text containing two "Paul <Surname>" names and one bare "Paul".
text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common"

In [24]:
# "Paul", a space, then a capitalised word: one uppercase A-Z letter followed
# by one or more word characters.
pattern = r"Paul [A-Z]\w+"

In [25]:
# Print each match object; its repr shows the character span and matched text.
# `pattern` and `text` come from the two cells above.
matches = re.compile(pattern).finditer(text)
for match in matches:
    print(match)

<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


In [29]:
# Turn every regex hit for `pattern` in `text` into a PERSON entity on the doc.
nlp = spacy.blank("en")
doc = nlp(text)
original_ents = list(doc.ents)

# Collect (token start, token end, matched text) for each regex hit.
mwt_ents = []
for match in re.finditer(pattern, doc.text):
    start, end = match.span()
    # Named `matched_span` so it does not shadow the `span` module imported
    # from spacy.tokens at the top of the notebook.
    matched_span = doc.char_span(start, end)
    # char_span can return None instead of a span; skip those hits.
    if matched_span is not None:
        mwt_ents.append((matched_span.start, matched_span.end, matched_span.text))

# Build a labelled Span (capital S - the class, not the module) per hit.
for start, end, name in mwt_ents:
    per_ent = Span(doc, start, end, label="PERSON")
    original_ents.append(per_ent)

# Write the combined entity list back onto the doc.
doc.ents = original_ents

In [30]:
print(mwt_ents) # one (span.start, span.end, span.text) tuple per regex match

[(0, 2, 'Paul Newman'), (8, 10, 'Paul Hollywood')]
