### 1. Setting up the spaCy Environment

In [29]:
# Import spaCy
import spacy

# Load the `en_core_web_sm` language model (tokenization, entities, etc.)
nlp = spacy.load('en_core_web_sm')

# Read the employee paragraphs from the text file
with open("employees.txt", mode="r", encoding="utf-8") as employees_file:
    text = employees_file.read()
  

In [30]:
# Display the raw contents that were read from the file
print(text)

**Paragraph 1:**

John Smith works as a software engineer at Google in Mountain View, California. He has a base salary of $100,000 per year, plus a bonus of up to $20,000. He is responsible for developing new software features for Google's search engine.

**Paragraph 2:**

Mary Jones is a marketing manager at Acme Corporation in Anytown, USA. She has a base salary of $75,000 per year, plus a bonus of up to $15,000. She is responsible for developing and executing marketing campaigns for Acme's products.

**Paragraph 3:**

Susan Brown is a doctor at Mercy Hospital in Chicago, Illinois. She has a base salary of $150,000 per year, plus bonuses and benefits. She is a general practitioner and sees patients of all ages.

**Paragraph 4:**

Peter Green is a lawyer at Jones & Smith Law Firm in New York City. He has a base salary of $200,000 per year, plus bonuses and benefits. He specializes in corporate law and has represented some of the biggest companies in the world.




In [31]:
# Run the loaded pipeline over the text; the result is stored in `doc`
doc = nlp(text=text)

In [32]:
# Printing the processed doc shows its underlying text
print(doc)

**Paragraph 1:**

John Smith works as a software engineer at Google in Mountain View, California. He has a base salary of $100,000 per year, plus a bonus of up to $20,000. He is responsible for developing new software features for Google's search engine.

**Paragraph 2:**

Mary Jones is a marketing manager at Acme Corporation in Anytown, USA. She has a base salary of $75,000 per year, plus a bonus of up to $15,000. She is responsible for developing and executing marketing campaigns for Acme's products.

**Paragraph 3:**

Susan Brown is a doctor at Mercy Hospital in Chicago, Illinois. She has a base salary of $150,000 per year, plus bonuses and benefits. She is a general practitioner and sees patients of all ages.

**Paragraph 4:**

Peter Green is a lawyer at Jones & Smith Law Firm in New York City. He has a base salary of $200,000 per year, plus bonuses and benefits. He specializes in corporate law and has represented some of the biggest companies in the world.




### 2. Identifying only Names and Locations in the Paragraphs

In [33]:
# Show only the person-name (PERSON) and location (GPE) entities
wanted_labels = ("PERSON", "GPE")
for entity in doc.ents:
    if entity.label_ in wanted_labels:
        print(entity.text, ":", entity.label_)
    
    


John Smith : PERSON
Mountain View : GPE
California : GPE
Mary Jones : PERSON
Anytown : GPE
USA : GPE
Susan Brown : PERSON
Chicago : GPE
Illinois : GPE
Peter Green : PERSON
New York City : GPE


In [34]:
from spacy import displacy
# Render the doc with displaCy using the entity ('ent') style
displacy.render(doc,style='ent')

### 3. Custom Component 

In [35]:
# List the PERSON and GPE entities currently present on the doc
for entity in doc.ents:
    if entity.label_ in ("PERSON", "GPE"):
        print(entity.text, ":", entity.label_)
    
    

John Smith : PERSON
Mountain View : GPE
California : GPE
Mary Jones : PERSON
Anytown : GPE
USA : GPE
Susan Brown : PERSON
Chicago : GPE
Illinois : GPE
Peter Green : PERSON
New York City : GPE


In [36]:
from spacy.language import Language

### 4. Custom Entities

##### Removing an Entity

In [37]:
# Pipeline component that removes the GPE entities
@Language.component("remove_gpe")
def remove_gpe(doc):
    """Drop every entity labelled "GPE" from ``doc.ents``.

    Entities with any other label are kept in their existing order.
    Returns the same ``doc`` after its ``ents`` have been reassigned.
    """
    doc.ents = [entity for entity in doc.ents if entity.label_ != "GPE"]
    return doc

In [38]:
# Add the component only if it is not already in the pipeline, so that
# re-running this cell does not fail on a duplicate component name.
if "remove_gpe" not in nlp.pipe_names:
    nlp.add_pipe("remove_gpe")

# Display the registered component (same output as a bare add_pipe call)
nlp.get_pipe("remove_gpe")

<function __main__.remove_gpe(doc)>

In [41]:
# Re-process the text with the current pipeline and list PERSON/GPE entities
doc = nlp(text=text)
for entity in doc.ents:
    if entity.label_ in ("PERSON", "GPE"):
        print(entity.text, ":", entity.label_)
    

John Smith : PERSON
Mary Jones : PERSON
Susan Brown : PERSON
Peter Green : PERSON


The output above shows that the `remove_gpe` component removed all GPE entities from the paragraphs; only the PERSON entities remain.

##### Adding an entity by finding patterns with the help of regular expressions (regex)

In [4]:
import re

In [11]:
# Read the telephone sample file.
# NOTE: this rebinds `text`, replacing whatever it held before this cell.
with open("telephone.txt", mode="r", encoding="utf-8") as telephone_file:
    text = telephone_file.read()
print(text)

**Paragraph 1**

Shri Ram is a traditional Indian name that means "lord of the universe".
It is a common name for Hindu men, and it is often shortened to "Shri" or "Ram".
Shri Ram is a very popular name in India, and it is often associated with strength, power, and prosperity.
His telephone number is 2345678901 

**Paragraph 2**

Shri Lakshmi is a traditional Indian name that means "goddess of wealth".
It is a common name for Hindu women, and it is often shortened to "Lakshmi" or "Lali".
Shri Lakshmi is a very popular name in India, and it is often associated with beauty, prosperity, and good fortune.
Her telephone number is 9876543210



In [12]:
# Alternative (currently unused) pattern for dash-separated phone numbers:
# pattern = r"^[0-9]{3}-[0-9]{3}-[0-9]{4}$"
# Active pattern: the literal "Shri " followed by an uppercase ASCII letter
# and one or more word characters (e.g. "Shri Ram").
pattern = r"Shri [A-Z]\w+"

In [24]:
# Print each regex match object found in the text
for match in re.finditer(pattern, text):
    print(match)

<re.Match object; span=(17, 25), match='Shri Ram'>
<re.Match object; span=(171, 179), match='Shri Ram'>
<re.Match object; span=(332, 344), match='Shri Lakshmi'>
<re.Match object; span=(493, 505), match='Shri Lakshmi'>


In [38]:
# Build a blank English pipeline and process the text with it
nlp = spacy.blank("en")
doc = nlp(text)

# Snapshot of the entities currently set on the doc
original_ents = list(doc.ents)

# Regex for the custom entity: "Shri " followed by a capitalised word
pattern = r"Shri [A-Z]\w+"

# Collect (start, end, text) for each regex hit, using the span's own
# start/end rather than the character offsets of the match.
hindu_ents = []
for match in re.finditer(pattern=pattern, string=doc.text):
    span = doc.char_span(match.start(), match.end())
    # char_span may return None; such hits are skipped
    if span is None:
        continue
    hindu_ents.append((span.start, span.end, span.text))



In [41]:
# Print the collected (span start, span end, span text) tuples
print(hindu_ents)

[(7, 9, 'Shri Ram'), (48, 50, 'Shri Ram'), (85, 87, 'Shri Lakshmi'), (125, 127, 'Shri Lakshmi')]


In [40]:
# Create the custom "Hindu" entities from the collected offsets using Span
from spacy.tokens import Span

# Build a fresh list instead of appending to `original_ents` in place, so
# re-running this cell does not add the same spans a second time.
custom_ents = [
    Span(doc, start, end, label="Hindu")
    for start, end, _text in hindu_ents
]

# Replace the doc's entities with the original ones plus the custom spans
doc.ents = original_ents + custom_ents

In [29]:
# Print each entity's text followed by its label
for entity in doc.ents:
    print(entity.text, entity.label_)

Shri Ram Hindu
Shri Ram Hindu
Shri Lakshmi Hindu
Shri Lakshmi Hindu


#### 5. Wrapping the code above in a `Language` component that adds the custom entities and labels, and adding that component to a blank model

In [42]:
from spacy.language import Language

@Language.component("hindu_ner")
def hindu_ner(doc):
    """Label every "Shri <Name>" mention in ``doc`` as a "Hindu" entity.

    The regex is run over ``doc.text`` and each hit is converted into a span
    carrying the "Hindu" label. Hits for which ``doc.char_span`` returns
    ``None`` are skipped. The new spans are merged with the entities already
    set on the doc.

    Args:
        doc: The document being processed by the pipeline.

    Returns:
        The same ``doc`` with its ``ents`` updated.
    """
    # Local imports keep the component self-contained, so it does not rely
    # on imports made in other notebook cells.
    import re
    from spacy.util import filter_spans

    pattern = r"Shri [A-Z]\w+"

    hindu_ents = []
    for match in re.finditer(pattern, doc.text):
        # char_span attaches the label directly, so no separate Span
        # construction from intermediate tuples is needed.
        span = doc.char_span(match.start(), match.end(), label="Hindu")
        if span is not None:
            hindu_ents.append(span)

    # filter_spans removes overlapping spans (longer spans are preferred), so
    # assigning doc.ents also works when the doc already carries entities
    # that overlap a "Shri ..." match. Custom spans are listed first so they
    # are considered before equally long existing ones.
    doc.ents = filter_spans(hindu_ents + list(doc.ents))

    return doc


In [33]:
# Create another blank English pipeline and add the "hindu_ner" component to it
nlp2 = spacy.blank("en")
nlp2.add_pipe("hindu_ner")

<function __main__.hindu_ner(doc)>

In [43]:
# Process the text with the custom pipeline and list the entities it set
doc2 = nlp2(text)
for entity in doc2.ents:
    print(entity.text, entity.label_)

Shri Ram Hindu
Shri Ram Hindu
Shri Lakshmi Hindu
Shri Lakshmi Hindu


# End