### REGEX basics

In [18]:
import spacy

In [19]:
# Sample text containing a phone number written with a hyphen.
text = "This is a sample number (555) 555-5555."

# Start from a blank English pipeline (this is NOT the trained small model:
# spacy.blank("en") is used here, so only the components added below run).
nlp = spacy.blank("en")

# Create the entity ruler and add it to the pipeline.
ruler = nlp.add_pipe("entity_ruler")

# One label with one single-token pattern.
patterns = [
    {
        "label": "PHONE_NUMBER",
        "pattern": [
            # Raw string: "\d" must reach the regex engine unchanged. In a normal
            # string literal "\d" is an invalid escape sequence and newer Python
            # versions warn about it.
            # The regex looks for 3 digits, a hyphen, then 4 digits.
            # A {"TEXT": {"REGEX": ...}} entry is tested against ONE token only.
            {"TEXT": {"REGEX": r"((\d){3})-(\d){4}"}}
        ],
    }
]

# Register the patterns with the ruler.
ruler.add_patterns(patterns)

# Run the pipeline on the sample text.
doc = nlp(text)

# Print every entity that was found.
for ent in doc.ents:
    print(ent.text, ent.label_)

# Nothing is printed: a REGEX inside a token pattern is applied to a single token,
# and "555-5555" is evidently not kept as one token by the tokenizer
# (check with [t.text for t in doc]). A regex in a token pattern cannot span
# several tokens.

In [20]:
# Sample text where the number to find is a single run of five digits.
text = "This is a sample number (555) 55555."

# Start from a blank English pipeline (this is NOT the trained small model:
# spacy.blank("en") is used here, so only the components added below run).
nlp = spacy.blank("en")

# Create the entity ruler and add it to the pipeline.
ruler = nlp.add_pipe("entity_ruler")

# One label with one single-token pattern.
patterns = [
    {
        "label": "PHONE_NUMBER",
        "pattern": [
            # Raw string: "\d" must reach the regex engine unchanged. In a normal
            # string literal "\d" is an invalid escape sequence and newer Python
            # versions warn about it.
            # The regex looks for a run of 5 digits inside one token.
            {"TEXT": {"REGEX": r"((\d){5})"}}
        ],
    }
]

# Register the patterns with the ruler.
ruler.add_patterns(patterns)

# Run the pipeline on the sample text.
doc = nlp(text)

# Print every entity that was found.
for ent in doc.ents:
    print(ent.text, ent.label_)

# This time there is output: the five digits form a single token,
# so the single-token regex can match it.

55555 PHONE_NUMBER


### REGEX for Multi-Word Tokens

In [21]:
import re

In [22]:
# Sample sentences for the multi-word name examples.
# NOTE: the spelling "ator" is kept on purpose; the character offsets in the
# recorded regex output (e.g. span=(38, 52)) depend on this exact string.
text = (
    "Paul Newman was an American ator, but Paul Hollywood is a British TV Host. "
    "The name Paul is quite common."
)

In [23]:
# Regex for a two-word name whose first word is "Paul":
#   "Paul "  the literal word "Paul" followed by one space
#   [A-Z]    exactly one uppercase letter A-Z (e.g. the "N" of "Newman")
#   \w+      one or more word characters (letters, digits, underscore)
# A bare "Paul" with no capitalised word after it does not match.
pattern = r"Paul [A-Z]\w+"

In [24]:
# Search the raw string with the regex. re.finditer yields one match object
# per non-overlapping hit.
matches = re.finditer(pattern, text)
for hit in matches:
    print(hit)          # the match object itself (span and matched text)
    print(hit.group())  # only the matched text
    


<re.Match object; span=(0, 11), match='Paul Newman'>
Paul Newman
<re.Match object; span=(38, 52), match='Paul Hollywood'>
Paul Hollywood


In [25]:
from spacy.tokens import Span

In [26]:
# Goal of this cell: find regex matches in the raw text (character offsets),
# map each match onto spaCy tokens, and record
# (start token index, end token index, matched text) for every aligned match.

# Blank English pipeline: nothing has been added to it, so it is not expected
# to produce any entities on its own.
nlp = spacy.blank("en")

# Tokenise the sample text.
doc = nlp(text)

# Snapshot of the entities already on the doc (expected to be empty here,
# since no NER component was added).
original_ents = list(doc.ents)

# Collected matches, as (token_start, token_end, text) tuples.
mwt_ents = []

for hit in re.finditer(pattern, doc.text):
    # Character offsets of the match inside doc.text.
    char_start, char_end = hit.span()

    # Translate character offsets into a token Span. The result is None when
    # the offsets do not line up with token boundaries, so it must be checked.
    token_span = doc.char_span(char_start, char_end)
    if token_span is None:
        continue

    mwt_ents.append((token_span.start, token_span.end, token_span.text))

In [27]:
# Show the collected (token_start, token_end, text) tuples.
print(mwt_ents)

[(0, 2, 'Paul Newman'), (8, 10, 'Paul Hollywood')]


In [28]:
# Turn the regex-based matches into real named entities on the spaCy Doc.

# Each item of mwt_ents is (start_token_index, end_token_index, matched_text);
# only the two token indices are needed to build the Span.
person_ents = [Span(doc, start, end, label="PERSON") for start, end, _ in mwt_ents]

# Build a NEW list instead of appending to original_ents. Appending mutated a
# list created in an earlier cell, so running this cell a second time added
# the same spans again and the doc.ents assignment then failed on the
# duplicated (overlapping) spans. With a fresh list the cell can be re-run.
doc.ents = original_ents + person_ents

for ent in doc.ents:
    print(ent.text, ent.label_)

Paul Newman PERSON
Paul Hollywood PERSON


In [29]:
from spacy.language import Language
from spacy.util import filter_spans

@Language.component("paul_ner")
def paul_ner(doc):
    """Label every "Paul <Capitalised word>" match in the doc as a PERSON entity.

    The regex is run over ``doc.text``; each match is mapped from character
    offsets to a token span, and matches that do not line up with token
    boundaries are skipped.

    Args:
        doc: The spaCy Doc being processed by the pipeline.

    Returns:
        The same Doc, with the new PERSON spans merged into ``doc.ents``.
    """
    pattern = r"Paul [A-Z]\w+"

    person_ents = []
    for match in re.finditer(pattern, doc.text):
        # char_span returns None when the character offsets do not align
        # with token boundaries; such matches are ignored.
        span = doc.char_span(*match.span(), label="PERSON")
        if span is not None:
            person_ents.append(span)

    # doc.ents does not accept overlapping spans. If an earlier component has
    # already produced entities that overlap a regex match, filter_spans
    # resolves the conflict (longest span preferred) instead of the
    # assignment raising an error.
    doc.ents = filter_spans(list(doc.ents) + person_ents)
    return doc

In [30]:
# Second blank English pipeline; the custom "paul_ner" component is the only
# thing added to it.
nlp2 = spacy.blank("en")
# Last expression of the cell: its return value (the component function) is
# what the notebook displays as the cell output.
nlp2.add_pipe("paul_ner")

<function __main__.paul_ner(doc)>

In [31]:
# Run the sample text through the pipeline that contains "paul_ner"
# and show the entities it set on the doc.
doc2 = nlp2(text)
print(doc2.ents)

(Paul Newman, Paul Hollywood)


In [None]:
from spacy.language import Language
from spacy.util import filter_spans

@Language.component("cinema_ner")
def cinema_ner(doc):
    """Propose a CINEMA entity for every token-aligned "Hollywood" match.

    The candidates are added to the entities already on the doc and the
    combined list is passed through ``filter_spans`` before being assigned,
    so overlapping spans are reduced to a non-overlapping set. A "Hollywood"
    that is already covered by an existing entity may therefore not end up
    labelled CINEMA.

    Args:
        doc: The spaCy Doc being processed by the pipeline.

    Returns:
        The same Doc with ``doc.ents`` replaced by the filtered span list.
    """
    pattern = r"Hollywood"

    # Start from whatever entities earlier components have set.
    candidate_ents = list(doc.ents)

    for hit in re.finditer(pattern, doc.text):
        char_start, char_end = hit.span()
        aligned = doc.char_span(char_start, char_end)
        if aligned is None:
            # Match does not line up with token boundaries; skip it.
            continue
        candidate_ents.append(Span(doc, aligned.start, aligned.end, label="CINEMA"))

    # Remove overlapping or duplicate spans so the assignment below is valid.
    doc.ents = filter_spans(candidate_ents)
    return doc

In [33]:
# Load the "en_core_web_sm" pipeline and add the custom "cinema_ner" component
# to it. The model package must be installed for spacy.load to succeed.
nlp3 = spacy.load("en_core_web_sm")
# Last expression of the cell: its return value (the component function) is
# what the notebook displays as the cell output.
nlp3.add_pipe("cinema_ner")

<function __main__.cinema_ner(doc)>

In [34]:
# Run the sample text through the loaded pipeline plus "cinema_ner"
# and list each resulting entity with its label.
doc3 = nlp3(text)
for entity in doc3.ents:
    print(entity.text, entity.label_)

Paul Newman PERSON
American NORP
Paul Hollywood PERSON
British NORP
Paul PERSON
