In [1]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [2]:
# Load the small English pipeline used by every example below.
nlp = spacy.load("en_core_web_sm")

# USING LINGUISTIC ANNOTATIONS

Let's say you are analyzing user comments and you want to find out what people are saying about Facebook. You want to start off by finding adjectives following "Facebook is" or "Facebook was".

In [3]:
# Token-based matcher sharing the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [12]:
# Accumulator filled by the on_match callback: one dict per matched sentence.
matched_sents = list()

In [13]:
# "facebook" + any form of "be" + optional adverbs + one adjective.
pattern = [
    {"LOWER": "facebook"},       # case-insensitive "Facebook"
    {"LEMMA": "be"},             # is / was / ...
    {"POS": "ADV", "OP": "*"},   # zero or more adverbs
    {"POS": "ADJ"},              # the adjective we are after
]

In [14]:
def callback_method_fb(matcher, doc, i, matches):
    """Store the sentence around match ``i`` as a manual displaCy 'ent' record.

    Appends one dict to the module-level ``matched_sents`` list. The dict
    holds the sentence text and a single 'MATCH' entity whose character
    offsets are relative to the start of that sentence.

    Args:
        matcher: Not used by this function.
        doc: Object that is sliced by token indices to obtain the matched span.
        i: Index of the match to handle within ``matches``.
        matches: Sequence of ``(match_id, start, end)`` tuples.
    """
    _, tok_start, tok_end = matches[i]
    hit = doc[tok_start:tok_end]
    sentence = hit.sent
    # Shift document-level character offsets so they index into the sentence.
    shift = sentence.start_char
    highlight = {
        'start': hit.start_char - shift,
        'end': hit.end_char - shift,
        'label': 'MATCH',
    }
    matched_sents.append({'text': sentence.text, 'ents': [highlight]})

In [15]:
# Register the pattern under the key "fb"; the callback runs for each match.
matcher.add("fb", [pattern], on_match=callback_method_fb)

In [16]:
# Two short sentences that each contain a "Facebook is <adjective>" phrase.
doc = nlp("I'd say that Facebook is evil. - Facebook is pretty coll, right?")

In [17]:
# Run the matcher over the document.
matches = matcher(doc)

In [18]:
# Show the (match_id, start, end) tuples returned by the matcher.
matches

[(8017838677478259815, 4, 7), (8017838677478259815, 9, 12)]

In [19]:
# Show the sentence records collected by the on_match callback.
matched_sents

[{'text': "I'd say that Facebook is evil.",
  'ents': [{'start': 13, 'end': 29, 'label': 'MATCH'}]},
 {'text': '- Facebook is pretty coll, right?',
  'ents': [{'start': 2, 'end': 20, 'label': 'MATCH'}]}]

In [20]:
# manual=True: the input is a list of plain dicts rather than Doc objects.
displacy.render(matched_sents, style='ent', manual=True)

# PHONE NUMBERS

In [21]:
# Here we are matching phone numbers written in either of these two forms:
#   (123) 4567 8901   or   (123) 4567-8901

In [22]:
# "(ddd) dddd dddd" with an optional hyphen between the two 4-digit groups.
pattern = [
    {"ORTH": "("},
    {"SHAPE": "ddd"},            # three digits
    {"ORTH": ")"},
    {"SHAPE": "dddd"},           # four digits
    {"ORTH": "-", "OP": "?"},    # optional separator
    {"SHAPE": "dddd"},           # four digits
]

In [23]:
# Fresh matcher that only knows the phone-number rule.
matcher = Matcher(nlp.vocab)
matcher.add("PhoneNumber", [pattern])

In [25]:
# Sample sentence containing a phone number in the space-separated form.
doc = nlp("Call me at (123) 4560 7890")

In [26]:
# Inspect the tokenization: the pattern is written against these tokens.
print([token.text for token in doc])

['Call', 'me', 'at', '(', '123', ')', '4560', '7890']


In [27]:
# Run the phone-number matcher and display the raw match tuples.
matches = matcher(doc)
matches

[(7978097794922043545, 3, 8)]

In [28]:
# Print the text covered by each match.
for match_id, start, end in matches:
    span = doc[start:end]  # token slice for this match
    print(span.text)

(123) 4560 7890


# EMAIL ADDRESS MATCHING

In [29]:
# One token whose text matches "<name>@<domain>", where both sides are made of
# letters, digits, '-', '_' and '.'.
pattern = [
    {
        "TEXT": {
            "REGEX": "[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+",
        },
    },
]

In [30]:
# Fresh matcher that only knows the e-mail rule.
matcher = Matcher(nlp.vocab)
matcher.add("Email", [pattern])

In [31]:
# Sample sentence with two e-mail addresses.
text = "Email me at abdulbasitnedian@gmail.com and highmoralstatus@yahoo.com"

In [32]:
# Process the sample sentence with the pipeline.
doc = nlp(text)

In [33]:
# Run the e-mail matcher over the document.
matches = matcher(doc)

In [34]:
# Show the raw match tuples, one per e-mail token found.
matches

[(11010771136823990775, 3, 4), (11010771136823990775, 5, 6)]

In [35]:
# Print the matched e-mail addresses.
for match_id, start, end in matches:
    span = doc[start:end]  # token slice for this match
    print(span.text)

abdulbasitnedian@gmail.com
highmoralstatus@yahoo.com


# EMOJI EXTRACTION

In [36]:
# Emoji treated as positive sentiment signals.
pos_emoji = [
    "😀", "😃", "😂",
    "🤣", "😊", "😍",
]
# Emoji treated as negative sentiment signals.
neg_emoji = [
    "😞", "😠", "😩",
    "😢", "😭", "😒",
]

In [37]:
# Show the positive emoji list.
pos_emoji

['😀', '😃', '😂', '🤣', '😊', '😍']

In [38]:
# Build one single-token pattern per emoji: each pattern matches exactly one
# token whose verbatim text (ORTH) is that emoji.
pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji]
neg_patterns = [[{"ORTH": emoji}] for emoji in neg_emoji]

In [39]:
# Show the patterns generated for the positive emoji.
pos_patterns

[[{'ORTH': '😀'}],
 [{'ORTH': '😃'}],
 [{'ORTH': '😂'}],
 [{'ORTH': '🤣'}],
 [{'ORTH': '😊'}],
 [{'ORTH': '😍'}]]

In [40]:
# Show the patterns generated for the negative emoji.
neg_patterns

[[{'ORTH': '😞'}],
 [{'ORTH': '😠'}],
 [{'ORTH': '😩'}],
 [{'ORTH': '😢'}],
 [{'ORTH': '😭'}],
 [{'ORTH': '😒'}]]

In [41]:
def label_sentiment(matcher, doc, i, matches):
    """Adjust ``doc.sentiment`` according to which rule produced match ``i``.

    A "HAPPY" match adds 0.1 and a "SAD" match subtracts 0.1; matches from any
    other rule leave ``doc.sentiment`` untouched.

    Args:
        matcher: Not used by this function.
        doc: Object exposing ``vocab.strings`` and a numeric ``sentiment``.
        i: Index of the match to handle within ``matches``.
        matches: Sequence of ``(match_id, start, end)`` tuples.
    """
    rule_hash, _start, _end = matches[i]
    # match_id is a hash; resolve it to the rule name via the string store.
    rule_name = doc.vocab.strings[rule_hash]
    step = {"HAPPY": 0.1, "SAD": -0.1}.get(rule_name)
    if step is not None:
        doc.sentiment += step

In [42]:
# Start from a fresh Matcher: the previous `matcher` still carries the "Email"
# rule, which does not belong with the emoji/hashtag rules. A fresh instance
# also keeps this cell safe to re-run.
matcher = Matcher(nlp.vocab)
matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)  # Add positive pattern
matcher.add("SAD", neg_patterns, on_match=label_sentiment)  # Add negative pattern


In [43]:
# Add a two-token hashtag pattern: a literal '#' token followed by one token
# consisting only of ASCII characters. No callback is attached to this rule.
matcher.add("HASHTAG", [[{"ORTH": "#"}, {"IS_ASCII": True}]])

In [44]:
# Sample text with one positive emoji and one hashtag, then run the matcher
# (which also triggers the sentiment callback for emoji matches).
doc = nlp(
    "Hello world 😀 #MondayMotivation"
)
matches = matcher(doc)

In [45]:
# Print each match as "<rule name> <matched text>".
for match_id, start, end in matches:
    span = doc[start:end]
    string_id = doc.vocab.strings[match_id]  # hash -> rule name
    print(string_id, span.text)

HAPPY 😀
HASHTAG #MondayMotivation


With a library like Emojipedia, we can also retrieve a short description for each emoji – for example, 😍's official title is “Smiling Face With Heart-Eyes”. Assigning it to a custom attribute on the emoji span makes it available as `span._.emoji_desc`.

In [None]:
# NOTE(review): everything below is a single triple-quoted string, so none of
# it is executed. It sketches a variant of label_sentiment that also stores an
# emoji description on the span, and needs the third-party `emojipedia`
# package (see the install hint inside the string).
'''Code is written here
from emojipedia import Emojipedia  # Installation: pip install emojipedia
from spacy.tokens import Span  # Get the global Span object

Span.set_extension("emoji_desc", default=None)  # Register the custom attribute

def label_sentiment(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    if doc.vocab.strings[match_id] == "HAPPY":  # Don't forget to get string!
        doc.sentiment += 0.1  # Add 0.1 for positive sentiment
    elif doc.vocab.strings[match_id] == "SAD":
        doc.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
    span = doc[start:end]
    emoji = Emojipedia.search(span[0].text)  # Get data for emoji
    span._.emoji_desc = emoji.title  # Assign emoji description
    '''

# EFFICIENT PHRASE MATCHER

In [46]:
import spacy
from spacy.matcher import PhraseMatcher

In [47]:
# Reload the English pipeline and build a PhraseMatcher on its vocabulary.
# From here on `matcher` refers to this PhraseMatcher, not the token Matcher.
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)

In [48]:
# Exact phrases to look for.
terms = [
    "Barack Obama",
    "Angela Merkel",
    "Washington, D.C.",
]

In [49]:
# Tokenize each term with nlp.make_doc only (no full pipeline run) to keep
# pattern creation fast.
patterns = list(map(nlp.make_doc, terms))

In [50]:
# Show the Doc objects that serve as phrase patterns.
patterns

[Barack Obama, Angela Merkel, Washington, D.C.]

In [51]:
# Register all phrase patterns under a single key.
matcher.add("TerminologyList", patterns)

In [52]:
# Sample sentence that mentions all three terms.
doc = nlp(
    "German Chancellor Angela Merkel and US President Barack Obama "
    "converse in the Oval Office inside the White House in Washington, D.C."
)

In [53]:
# Run the phrase matcher over the document.
matches = matcher(doc)

In [54]:
# Print every phrase that was found, in document order.
for match_id, start, end in matches:
    span = doc[start:end]; print(span.text)

Angela Merkel
Barack Obama
Washington, D.C.


# CUSTOM RULE BASED ENTITY RECOGNITION

In [55]:
from spacy.pipeline import EntityRuler

In [56]:
# Reload a clean pipeline so the entity-ruler example starts from a known state.
nlp = spacy.load('en_core_web_sm')

In [57]:
# NOTE(review): constructing EntityRuler directly yields a standalone object;
# nothing in this cell adds it to nlp's pipeline — confirm that is intended.
ruler = EntityRuler(nlp)

In [60]:
# Entity rules: a phrase pattern for "Apple" and a token pattern for
# "san francisco" in any casing.
pattern = [
    {"label": "ORG", "pattern": "Apple"},
    {
        "label": "GPE",
        "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}],
    },
]

In [61]:
# Show the entity rules defined above.
pattern

[{'label': 'ORG', 'pattern': 'Apple'},
 {'label': 'GPE', 'pattern': [{'LOWER': 'san'}, {'LOWER': 'francisco'}]}]

In [63]:
# Load the entity rules into whichever EntityRuler `ruler` currently refers to.
# NOTE(review): these rules only influence nlp(...) if that same instance is a
# component of nlp's pipeline — verify.
ruler.add_patterns(pattern)

In [65]:
# Add an entity ruler to the pipeline and load the rules into THAT instance.
# nlp.add_pipe creates a new, empty ruler, so rules given to any other
# EntityRuler object would not be applied by nlp(...).
# (Pass before="ner" to add_pipe if the rules should take precedence over the
# statistical entity recognizer.)
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(pattern)

In [66]:
# Sample sentence mentioning a location covered by the GPE rule.
doc = nlp("I am opening a new restaurant in San Francisco")



In [67]:
# Show the processed document.
doc

I am opening a new restaurant in San Francisco

In [68]:
# Print every entity found in the document with its label.
for ent in doc.ents:
    print(ent.text, ent.label_)

San Francisco GPE
