In [49]:
import spacy

In [50]:
# !python -m spacy download en_core_web_sm

In [51]:
nlp = spacy.load("en_core_web_sm")
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [52]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")
for ent in doc.ents:
    print(ent.text, " | ", ent.label_, " | ", spacy.explain(ent.label_))

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
$45 billion  |  MONEY  |  Monetary values, including unit


In [53]:
from spacy import displacy

displacy.render(doc, style="ent")

In [54]:
# List down all the entities

In [55]:
nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [56]:

doc = nlp("Michael Bloomberg founded Bloomberg in 1982")
for ent in doc.ents:
    print(ent.text, "|", ent.label_, "|", spacy.explain(ent.label_))

Michael Bloomberg | PERSON | People, including fictional
Bloomberg | PERSON | People, including fictional
1982 | DATE | Absolute or relative dates or periods


In [57]:
doc = nlp("Tesla Inc is going to acquire Twitter Inc for $45 billion")
for ent in doc.ents:
    print(ent.text, " | ", ent.label_, " | ", ent.start_char, "|", ent.end_char)

Tesla Inc  |  ORG  |  0 | 9
Twitter Inc  |  ORG  |  30 | 41
$45 billion  |  MONEY  |  46 | 57


In [58]:
# Setting custom entities

In [59]:
doc = nlp("Tesla is going to acquire Twitter for $45 billion")
for ent in doc.ents:
    print(ent.text, " | ", ent.label_)

Tesla  |  ORG
Twitter  |  PRODUCT
$45 billion  |  MONEY


In [60]:
s = doc[2:5]
s

going to acquire

In [61]:
from spacy.tokens import Span

s1 = Span(doc, 0, 1, label="ORG")
s2 = Span(doc, 5, 6, label="ORG")

doc.set_ents([s1, s2], default="unmodified")

In [62]:
for ent in doc.ents:
    print(ent.text, " | ", ent.label_)

Tesla  |  ORG
Twitter  |  ORG
$45 billion  |  MONEY


In [63]:
doc = nlp("Bhaiya Singh works in nagarro, mobile nos is 5689745841")
for ent in doc.ents:
    print(ent.text, " | ", ent.label_)

Bhaiya Singh  |  PERSON
5689745841  |  DATE


In [64]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [65]:
from spacy.pipeline import EntityRuler

In [66]:
# create a new entity ruler
ruler = EntityRuler(nlp)

In [67]:
import re

In [68]:
# define the regular expression pattern for a mobile phone number
phone_pattern = re.compile(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b')

In [69]:
import json

In [70]:

# define the pattern to match using the regular expression for phone numbers
patterns = [{"label": "PHONE", "pattern": [{"TEXT": {"REGEX": phone_pattern}}]}]

# add the patterns to the ruler
ruler.add_patterns(patterns)

# add the ruler to the pipeline
nlp.add_pipe(ruler)

TypeError: Object of type Pattern is not JSON serializable

In [71]:
#Import the requisite library
import spacy

#Sample text
text = "This is a sample number (555) 555-5555."

#Build upon the spaCy Small Model
nlp = spacy.blank("en")

#Create the Ruler and Add it
ruler = nlp.add_pipe("entity_ruler")

#List of Entities and Patterns (source: https://spacy.io/usage/rule-based-matching)
patterns = [
                {
                    "label": "PHONE_NUMBER", "pattern": [{"TEXT": {"REGEX": "((\d){3}-(\d){4})"}}
                                                        ]
                }
            ]
#add patterns to ruler
ruler.add_patterns(patterns)


#create the doc
doc = nlp(text)

#extract entities
for ent in doc.ents:
    print (ent.text, ent.label_)

In [72]:
import spacy
import re
import json

# define the custom encoder
class PatternEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, re.Pattern):
            return obj.pattern
        return json.JSONEncoder.default(self, obj)

# load the Spacy model
nlp = spacy.load('en_core_web_sm')

# define the regular expression pattern for phone numbers
phone_pattern = re.compile(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b')

# add the new entity to the model using the entity ruler
from spacy.pipeline import EntityRuler

# create a new entity ruler
ruler = EntityRuler(nlp)

# create the pattern for the new entity
phone_entity = {"label": "PHONE", "pattern": [{"TEXT": {"REGEX": phone_pattern}}]}

# add the pattern to the entity ruler
ruler.add_patterns([phone_entity])

# add the entity ruler to the pipeline
nlp.add_pipe(ruler)

# test the model with a sample text
doc = nlp("Call me at 555-123-4567.")
for ent in doc.ents:
    print(ent.text, ent.label_)


TypeError: Object of type Pattern is not JSON serializable