In [3]:
#Import the requisite library
import spacy

# Text to run through the pipeline
text = "This is a sample number 555-5555."

# Start from a blank English pipeline (spacy.blank, not a pretrained model)
nlp = spacy.blank("en")

# Append an EntityRuler component; add_pipe hands back the component itself
ruler = nlp.add_pipe("entity_ruler")

# Token-level pattern for a phone number
# (source: https://spacy.io/usage/rule-based-matching):
#   a three-digit token, an optional "-" token, then a four-digit token
patterns = [
    {
        "label": "PHONE_NUMBER",
        "pattern": [
            {"SHAPE": "ddd"},
            {"ORTH": "-", "OP": "?"},
            {"SHAPE": "dddd"},
        ],
    },
]
ruler.add_patterns(patterns)

# Run the pipeline over the sample text
doc = nlp(text)

# Show every entity the ruler found, followed by its label
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

555-5555 PHONE_NUMBER


In [2]:
# Summarise the pipeline: for each component, what it assigns, requires and
# scores, whether it retokenizes, and any problems found between components.
nlp.analyze_pipes()

{'summary': {'entity_ruler': {'assigns': ['doc.ents',
    'token.ent_type',
    'token.ent_iob'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False}},
 'problems': {'entity_ruler': []},
 'attrs': {'token.ent_iob': {'assigns': ['entity_ruler'], 'requires': []},
  'doc.ents': {'assigns': ['entity_ruler'], 'requires': []},
  'token.ent_type': {'assigns': ['entity_ruler'], 'requires': []}}}

{
    'summary': 
        {
        'entity_ruler': 
            {
                'assigns': ['doc.ents','token.ent_type','token.ent_iob'],
                'requires': [],
                'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
                'retokenizes': False
            }
        },
    'problems': 
        {
            'entity_ruler': []
        },
    'attrs': 
        {
            'token.ent_iob': 
                {
                    'assigns': ['entity_ruler'], 'requires': []
                },
            'doc.ents': 
                {
                    'assigns': ['entity_ruler'], 'requires': []
                },
            'token.ent_type': 
                {
                    'assigns': ['entity_ruler'], 'requires': []
                }
        }
}

In [4]:
import re
# Three digits, a hyphen, then four digits. The pattern contains three
# capturing groups, so findall yields one tuple per hit:
# (whole number, last digit of the first part, last digit of the second part).
pattern = r"((\d){3}-(\d){4})"
text = "This is a sample number 555-5555."

# Compile the expression, then collect every non-overlapping match
matches = re.compile(pattern).findall(text)
print(matches)

[('555-5555', '5', '5')]


## **Using RegEx with spaCy**

In [5]:
#Import the requisite library
import spacy

# Sample text: a phone number written with an area code and a hyphen
text = "This is a sample number (555) 555-5555."

# Start from a blank English pipeline (spacy.blank, not a pretrained model)
nlp = spacy.blank("en")

# Create the EntityRuler and add it to the pipeline
ruler = nlp.add_pipe("entity_ruler")

# Single-token pattern whose TEXT must match a regular expression
# (source: https://spacy.io/usage/rule-based-matching).
# The regex is a raw string so that "\d" reaches the regex engine untouched
# instead of being parsed as an invalid Python string escape.
patterns = [
    {
        "label": "PHONE_NUMBER",
        "pattern": [{"TEXT": {"REGEX": r"((\d){3}-(\d){4})"}}],
    }
]
# Add patterns to the ruler
ruler.add_patterns(patterns)

# Create the doc
doc = nlp(text)

# Extract entities. Nothing is printed for this text: REGEX is tested against
# one token at a time, and no single token contains the whole "555-5555".
for ent in doc.ents:
    print(ent.text, ent.label_)

### The cell above prints nothing, and for one very important reason: spaCy’s EntityRuler cannot use RegEx to match a pattern across tokens. The tokenizer splits the phone number at the dash, so no single token contains the full number for the RegEx to match. So, what are we to do in this scenario? Well, we have a few different options that we will explore in the next notebook. But before we get to that, let’s try using RegEx to capture a phone number with no hyphen.

In [6]:
#Import the requisite library
import spacy

# Sample text: a seven-digit phone number written without a hyphen
text = "This is a sample number 5555555."

# Start from a blank English pipeline (spacy.blank, not a pretrained model)
nlp = spacy.blank("en")

# Create the EntityRuler and add it to the pipeline
ruler = nlp.add_pipe("entity_ruler")

# Single-token pattern whose TEXT must match a regular expression
# (source: https://spacy.io/usage/rule-based-matching).
# - Raw string, so "\d" is not parsed as an invalid Python string escape.
# - Anchored with ^...$ and set to exactly seven digits, so only a token that
#   is a complete 7-digit number is labelled, not any token that merely
#   contains a run of digits.
patterns = [
    {
        "label": "PHONE_NUMBER",
        "pattern": [{"TEXT": {"REGEX": r"^\d{7}$"}}],
    }
]
# Add patterns to the ruler
ruler.add_patterns(patterns)

# Create the doc
doc = nlp(text)

# Extract entities
for ent in doc.ents:
    print(ent.text, ent.label_)

5555555 PHONE_NUMBER
