### Using Linguistic Annotations

In [27]:
import spacy
from spacy import displacy
from spacy.matcher import Matcher
nlp=spacy.load("en_core_web_sm")
matcher=Matcher(nlp.vocab)

In [28]:
matched_sents=[]

In [29]:
pattern=[{"LOWER":"facebook"},{"LEMMA":"be"},{"POS":"ADV","OP":"*"},{"POS":"ADJ"}]

In [30]:
def callback_method_fb(matcher,doc,i,matches):
    matched_id,start,end=matches[i]
    span=doc[start:end]
    sent=span.sent
    match_ents=[{
        'start':span.start_char-sent.start_char,
        'end':span.end_char-sent.end_char,
        'label':'MATCH'
    }]
    
    matched_sents.append({'text':sent.text,'ents':match_ents})

In [31]:
matcher.add("fb", [pattern], on_match=callback_method_fb)

In [32]:
doc=nlp("I'd say that Facebook is evil. - Facebook is pretty cool, right?")

In [33]:
matches=matcher(doc)

In [34]:
matches

[(8017838677478259815, 4, 7), (8017838677478259815, 9, 13)]

In [35]:
matched_sents

[{'text': "I'd say that Facebook is evil.",
  'ents': [{'start': 13, 'end': -1, 'label': 'MATCH'}]},
 {'text': '- Facebook is pretty cool, right?',
  'ents': [{'start': 2, 'end': -8, 'label': 'MATCH'}]}]

In [36]:
displacy.render(matched_sents,style='ent',manual=True)

### Phone Numbers

In [37]:
pattern = [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "dddd"}, {"ORTH": "-", "OP": "?"}, {"SHAPE": "dddd"}]

In [38]:
matcher=Matcher(nlp.vocab)
matcher.add("PhoneNumber",[pattern])

In [46]:
doc=nlp("Call me at (123) 4560-7890")

In [47]:
print([t.text for t in doc])

['Call', 'me', 'at', '(', '123', ')', '4560', '-', '7890']


In [48]:
matches=matcher(doc)
matches

[(7978097794922043545, 3, 9)]

In [49]:
for match_id,start,end in matches:
    span=doc[start:end]
    print(span.text)

(123) 4560-7890


### Email Address Matching

In [50]:
pattern = [{"TEXT": {"REGEX": "[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+"}}]

In [51]:
matcher=Matcher(nlp.vocab)
matcher.add("Email",[pattern])


In [52]:
text="Hey everyone!My email is pandeyaryamaan@gmail.com"

In [53]:
doc=nlp(text)

In [54]:
matches=matcher(doc)

In [55]:
matches

[(11010771136823990775, 4, 5)]

In [56]:
for match_id,start,end in matches:
    span=doc[start:end]
    print(span.text)

pandeyaryamaan@gmail.com


### Hashtags and Emoji detection on social media

In [63]:
pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"]
neg_emoji = ["😥", "😫", "😔", "😠", "😑"]

In [64]:
pos_emoji

['😀', '😃', '😂', '🤣', '😊', '😍']

In [65]:
neg_emoji

['😥', '😫', '😔', '😠', '😑']

In [66]:
pos_patterns=[[{"ORTH":emoji}] for emoji in pos_emoji]
neg_patterns=[[{"ORTH":emoji}] for emoji in neg_emoji]

In [67]:
pos_patterns

[[{'ORTH': '😀'}],
 [{'ORTH': '😃'}],
 [{'ORTH': '😂'}],
 [{'ORTH': '🤣'}],
 [{'ORTH': '😊'}],
 [{'ORTH': '😍'}]]

In [68]:
neg_patterns

[[{'ORTH': '😥'}],
 [{'ORTH': '😫'}],
 [{'ORTH': '😔'}],
 [{'ORTH': '😠'}],
 [{'ORTH': '😑'}]]

In [69]:
def label_sentiment(matcher,doc,i,matches):
    match_id,start,end=matches[i]
    if doc.vocab.strings[match_id]=='HAPPY':
        doc.sentiment+=0.1
    elif doc.vocab.strings[match_id]=='SAD':
        doc.sentiment-=0.1

In [70]:
matcher=Matcher(nlp.vocab)


In [72]:
matcher.add("HAPPY",[*pos_patterns],on_match=label_sentiment)
matcher.add("SAD",[*neg_patterns],on_match=label_sentiment)

In [74]:
matcher.add("HASHTAG", [[{'TEXT': '#'}, {'IS_ASCII': True}]])

In [75]:
doc=nlp("Hello World 😑 #pandeyaryamaan23")

In [76]:
matches=matcher(doc)

In [78]:
for match_id,start,end in matches:
    string_id=doc.vocab.strings[match_id]
    span=doc[start:end]
    print(string_id,span.text)

SAD 😑
HASHTAG #pandeyaryamaan23


### Efficient Phrase Matching

In [79]:
from spacy.matcher import PhraseMatcher


In [80]:
matcher=PhraseMatcher(nlp.vocab)


In [81]:
terms=['BARACK OBAMA','ANGELA MERKEL','WASHINGTON D.C.']


In [82]:
pattern=[nlp.make_doc(text) for text in terms]

In [83]:
pattern

[BARACK OBAMA, ANGELA MERKEL, WASHINGTON D.C.]

In [84]:
matcher.add('term',[*pattern])

In [85]:
doc=nlp("German chancellor ANGELA MERKEL and US President BARACK OBAMA "
         "converse in the oval Office inside the White House in WASHINGTON D.C.")

In [86]:
doc

German chancellor ANGELA MERKEL and US President BARACK OBAMA converse in the oval Office inside the White House in WASHINGTON D.C.

In [87]:
matches=matcher(doc)

In [88]:
for match_id,start,end in matches:
    span=doc[start:end]
    print(span.text)

ANGELA MERKEL
BARACK OBAMA
WASHINGTON D.C.


In [89]:
matches

[(4519742297340331040, 2, 4),
 (4519742297340331040, 7, 9),
 (4519742297340331040, 19, 21)]

### Custom Rule Based Entity Recognition

In [90]:
from spacy.pipeline import EntityRuler

In [91]:
nlp=spacy.load("en_core_web_sm")

In [92]:
ruler=EntityRuler(nlp)

In [97]:
patterns = [{"label": "ORG", "pattern": "KGP Talkie"},
           {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]}]

In [98]:
patterns

[{'label': 'ORG', 'pattern': 'KGP Talkie'},
 {'label': 'GPE', 'pattern': [{'LOWER': 'san'}, {'LOWER': 'francisco'}]}]

In [99]:
ruler.add_patterns(patterns)

In [102]:
nlp.add_pipe('entity_ruler')

<spacy.pipeline.entityruler.EntityRuler at 0x23151222088>

In [103]:
doc=nlp("KGP Talkie is opening its first big office in San Francisco.")

In [104]:
for ent in doc.ents:
    print(ent.text,ent.label_)

KGP Talkie PERSON
first ORDINAL
San Francisco GPE
