    Token Based matching

In [5]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_md')


In [8]:
doc = nlp('Good morning, I want to reserve a ticket.')

matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "good"}, {"LOWER": "morning"}, {"IS_PUNCT": True}]
matcher.add("morningGreeting", [pattern])
matches = matcher(doc)
for match_id, start, end in matches:
    m_span = doc[start:end]
    print(start, end, m_span.text)

0 3 Good morning,


In [35]:
import spacy
from spacy.matcher import Matcher
nlp = spacy.load('en_core_web_md')
doc = nlp("Good morning, I want to resverse a ticket. I will then say good evening!")

matcher = Matcher(nlp.vocab)
pattern1 = [{"LOWER": "good"}, {"LOWER": "morning"}, {"IS_PUNCT": True}]
matcher.add("morningGreeting", [pattern1])
pattern2 = [{"LOWER": "good"}, {"LOWER": "evening"}, {"IS_PUNCT": True}]
matcher.add("eveningGreeting", [pattern2])
matches = matcher(doc)
for match_id, start, end in matches:
    pattern_name = nlp.vocab[match_id]
    m_span = doc[start:end]
    print(pattern_name, start, end, m_span.text)
             
            

<spacy.lexeme.Lexeme object at 0x000001BB2A4EF3C0> 0 3 Good morning,
<spacy.lexeme.Lexeme object at 0x000001BBA94F88C0> 14 17 good evening!


In [53]:
#length
doc = nlp("I bought a pineapple.")
matcher = Matcher(nlp.vocab)
pattern = [{"LENGTH": 1}]
matcher.add("onlyShort", [pattern])
matches = matcher(doc)
for mid, start, end in matches:
    print(start, end, doc[start:end])

0 1 I
2 3 a
4 5 .


In [54]:
#IS_ALPHA, IS_ASCII, IS_DIGIT
doc1 = nlp("I met him at 2 o'clock.")
doc2 = nlp("He brought me 2 apples.")
matcher = Matcher(nlp.vocab)
pattern = [{"IS_DIGIT": True},{"IS_ALPHA": True}]
matcher.add("numberAndPlainWord", [pattern])

matches = matcher(doc1)
doc1

I met him at 2 o'clock.

In [55]:
matches = matcher(doc2)
len(matches)

1

In [56]:
mid, start, end = matches[0]
print(start, end, doc2[start:end])

3 5 2 apples


In [58]:
#IS_UPPER, IS_LOWER, IS_TITLE

doc = nlp("Take me out of your SPAM list. We never asked you to conatact me. If you write again we'll SUE!!!")
matcher = Matcher(nlp.vocab)
pattern = [{"IS_UPPER": True}]
matcher.add("capitals", [pattern])
matches = matcher(doc)
for mid, start, end in matches:
    print(start, end, doc[start:end])

5 6 SPAM
22 23 SUE


In [59]:
#IS_PUNCT, IS_SPACE, IS_STOP
doc1 = nlp("Can you swim?")
doc2 = nlp("Can Sally swim?")
matcher = Matcher(nlp.vocab)
pattern = [{"IS_SENT_START": True, "LOWER": "can"}, {"IS_TITLE": True}]

matcher.add("canThenCapitalized", [pattern])


In [60]:
matches = matcher(doc2)
len(matches)

1

In [61]:
mid, start, end = matches[0]
print(start, end, doc2[start:end])

0 2 Can Sally


In [63]:
doc = nlp("Will you go there?")
matcher = Matcher(nlp.vocab)
pattern = [{"IS_SENT_START": True, "TAG": "MD"}]
matcher.add("tagM", [pattern])
matches = matcher(doc)
len(matches)

1

In [64]:
mid, start, end = matches[0]
print(start, end, doc[start:end])

0 1 Will


In [65]:
doc2 = nlp("I might go there.")
matcher(doc2)

[]

    Extended Syntax Support

In [66]:
doc = nlp("Good morning, I'm here. I'll say good evening!!")
matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "good"}, {"LOWER": {"IN": ["morning", "evening"]}},
          {"IS_PUNCT": True}]
matcher.add("greeting", [pattern])
matches = matcher(doc)
for mid, start, end in matches:
    print(start, end, doc[start:end])

0 3 Good morning,
10 13 good evening!


In [67]:
#comparison operator with length
doc = nlp("I suffered from Trichotillomania when I was in college. The doctor precribed me Psychosomatic medicine.")
matcher = Matcher(nlp.vocab)
pattern = [{"LENGTH": {">=" : 10}}]
matcher.add("longWords", [pattern])
matches = matcher(doc)
for mid, start, end in matches:
    print(start, end, doc[start:end])

3 4 Trichotillomania
14 15 Psychosomatic


    Regex-like operators

In [69]:
doc1 = nlp("Barack Obama visited France.")
doc2 = nlp("Barack Hussein Obama visited France.")
pattern = [{"LOWER": "barack"},
          {"LOWER": "hussein", "OP": "?"},
          {"LOWER": "obama"}]
matcher = Matcher(nlp.vocab)
matcher.add("obamaNames", [pattern])
matcher(doc1)

[(9957319642918298529, 0, 2)]

In [70]:
matcher(doc2)

[(9957319642918298529, 0, 3)]

In [90]:
doc1 = nlp("Hello hello hello, how are you?")
doc2 = nlp("Hello, how are you?")
doc3 = nlp("How are you?")
pattern = [{"LOWER": {"IN": ["hello", "hi", "hallo"]},"OP" : "*"}, {"IS_PUNCT": True}]
matcher = Matcher(nlp.vocab)
matcher.add("greetings", [pattern])
for mid, start, end in matcher(doc1):
    print(start, end, doc1[start:end])

0 4 Hello hello hello,
1 4 hello hello,
2 4 hello,
3 4 ,
7 8 ?


In [91]:
for mid, start, end in matcher(doc2):
    print(start, end, doc2[start:end])

0 2 Hello,
1 2 ,
5 6 ?


In [92]:
for mid, start, end in matcher(doc3):
    print(start, end, doc3[start:end])

3 4 ?


In [93]:
doc1 = nlp("Hello hello hello, how are you?")
doc2 = nlp("Hello, how are you?")
doc3 = nlp("How are you?")
pattern = [{"LOWER": {"IN": ["hello", "hi", "hallo"]},"OP" : "+"}, {"IS_PUNCT": True}]
matcher = Matcher(nlp.vocab)
matcher.add("greetings", [pattern])
for mid, start, end in matcher(doc1):
    print(start, end, doc1[start:end])

2 4 hello,
1 4 hello hello,
0 4 Hello hello hello,


In [94]:
for mid, start, end in matcher(doc2):
    print(start, end, doc2[start:end])

0 2 Hello,


In [95]:
for mid, start, end in matcher(doc3):
    print(start, end, doc3[start:end])

In [96]:
doc = nlp("My name is Alice and his name was Elliot.")
pattern = [{"LOWER": "name"},{"LEMMA": "be"}, {}]
matcher = Matcher(nlp.vocab)
matcher.add("pickName", [pattern])
for mid, start, end in matcher(doc):
    print(start, end, doc[start:end])

1 4 name is Alice
6 9 name was Elliot


In [97]:
doc1 = nlp("I forwarded his email to you.")
doc2 = nlp("I forwarded an email to you.")
doc3 = nlp("I forwarded the email to you.")

pattern = [{"LEMMA": "forward"}, {}, {"LOWER": "email"}]
matcher.add("forwardMail", [pattern])
for mid, start, end in matcher(doc1):
    print(start, end, doc1[start:end])

1 4 forwarded his email


In [98]:
for mid, start, end in matcher(doc2):
    print(start, end, doc2[start:end])

1 4 forwarded an email


In [99]:
for mid, start, end in matcher(doc3):
    print(start, end, doc3[start:end])

1 4 forwarded the email


    Regex Support

In [101]:
doc1 = nlp("I travelled by bus.")
doc2 = nlp("She travelled by bike.")
pattern = [{"POS": "PRON"}, 
          {"TEXT": {"REGEX": "[Tt]ravell?ed"}}]
matcher = Matcher(nlp.vocab)
matcher.add("regexSupport", [pattern])
for mid, start, end in matcher(doc1):
    print(start, end, doc1[start:end])

0 2 I travelled


In [102]:
for mid, start, end in matcher(doc2):
    print(start, end, doc2[start:end])

0 2 She travelled


In [105]:
doc = nlp("I went to Italy; he has been there too. His mother also has told me she wants to visit Rome.")
pattern = [{"TAG": {"REGEX": "^V"}}]
matcher.add("verbs", [pattern])
for mid, start, end in matcher(doc):
    print(start, end, doc[start:end])

1 2 went
6 7 has
7 8 been
14 15 has
15 16 told
18 19 wants
20 21 visit


In [107]:
filename = "data/sherlock_holmes_1.txt"
file = open(filename, "r", encoding="utf-8")
text = file.read()

In [108]:
doc = nlp(text)
pattern = [{"TAG": {"REGEX": "^V"}}]
matcher.add("verbs", [pattern])
for mid, start, end in matcher(doc):
    print(start, end, doc[start:end])

4 5 is
12 13 have
14 15 heard
17 18 mention
28 29 eclipses
31 32 predominates
39 40 was
43 44 felt
63 64 were
77 78 was
80 81 take
88 89 observing
94 95 has
95 96 seen
103 104 have
104 105 placed
114 115 spoke
120 121 save
123 124 gibe
130 131 were
140 141 drawing
153 154 trained
157 158 admit
167 168 adjusted
169 170 was
171 172 introduce
173 174 distracting
178 179 throw
210 211 be
228 229 was
238 239 was


In [None]:
    PhraseMatcher

In [112]:
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_md")
matcher = PhraseMatcher(nlp.vocab)

terms = ['Angela Merkel', 'Donald Trump', 'Alexis Tsipras']
patterns = [nlp.make_doc(term) for term in terms]
matcher.add("politicianList", None, *patterns)

doc = nlp("3 EU leaders met in Berlin. German chancellor Angela Merkel first welcomed the US president Donald Trump. The following day Alexis Tsipras joined them in Brandenburg.")

matches = matcher(doc)
for mid, start, end in matches:
    print(start, end, doc[start:end])

9 11 Angela Merkel
16 18 Donald Trump
22 24 Alexis Tsipras


In [115]:
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
terms = ["Asset", "Investment", "Derivatives", "Demand", "Market"]
patterns = [nlp.make_doc(term) for term in terms]
matcher.add("financeTerms", None, *patterns)
doc = nlp("During the last decade, derivatives market became an asset class of their own and influenced the financial landscape strongly.")
matches = matcher(doc)
for mid, start, end in matches:
    print(start, end, doc[start:end])

5 6 derivatives
6 7 market
9 10 asset


In [118]:
matcher = PhraseMatcher(nlp.vocab, attr="SHAPE")
ip_nums = ["127.0.0.0", "127.256.0.0"]
patterns = [nlp.make_doc(ip) for ip in ip_nums]
matcher.add("IPNums", None, *patterns)

doc = nlp("This log contains the following IP addresses: 192.1.1.1 and 192.12.1.1 and 192.160.1.1 .")
for mid, start, end in matcher(doc):
    print(start, end, doc[start:end])

8 9 192.1.1.1
12 13 192.160.1.1


    Entity Ruler

In [120]:
import spacy
from spacy.matcher import Matcher
nlp = spacy.load('en_core_web_md')

matcher = Matcher(nlp.vocab)
pattern = [{"ENT_TYPE": "PERSON"}]
matcher.add('personEnt', [pattern])
doc = nlp("Bill Gates visited Berlin.")
matches = matcher(doc)
for mid, start, end in matches:
    print(start, end, doc[start:end])

0 1 Bill
1 2 Gates


In [121]:
pattern = [{"ENT_TYPE": "PERSON", "OP": "+"}]
matcher.add('personEnt', [pattern])
doc = nlp("Bill Gates visited Berlin.")
matches = matcher(doc)
for mid, start, end in matches:
    print(start, end, doc[start:end])

0 1 Bill
1 2 Gates
0 2 Bill Gates


In [50]:
pattern = [{"ENT_TYPE": "PERSON", "OP": "+"},
          {"POS": "VERB"}]
matcher.add("personEntAction", [pattern])
doc = nlp("Today German chancellor Angela Merkel met with the US president.")
matches = matcher(doc)
for mid, start, end in matches:
    print(start, end, doc[start:end])

4 6 Merkel met
3 6 Angela Merkel met


In [125]:
doc = nlp("I have an account with chime since 2017.")
doc.ents

(2017,)

In [33]:
import spacy
from spacy import pipeline
nlp = spacy.load('en_core_web_md')

patterns = [{"label": "ORG",
            "pattern": [{"LOWER": "chime"}]}]
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)
doc = nlp("I have an account with chime since 2017.")
doc.ents

(chime, 2017)

    Combining spacy models and matchers

In [2]:
#Extracting IBAN and account numbers
import spacy
from spacy.matcher import Matcher
nlp = spacy.load('en_core_web_md')

matcher = Matcher(nlp.vocab)

doc = nlp("My IBAN number is BE71 0961 2345 6769, please send the money there.")
doc1 = nlp("My IBAN number is FR76 3000 6000 0112 3456 7890 189, please send the money there.")

pattern = [{"SHAPE": "XXdd"},
           {"TEXT": {"REGEX": "\d{1,4}"},
           "OP": "+"}]
matcher.add("ibanNum", [pattern])
for mid, start, end in matcher(doc):
    print(start, end, doc[start:end])

4 6 BE71 0961
4 7 BE71 0961 2345
4 8 BE71 0961 2345 6769


In [3]:
for mid, start, end in matcher(doc1):
    print(start, end, doc1[start:end])

4 6 FR76 3000
4 7 FR76 3000 6000
4 8 FR76 3000 6000 0112
4 9 FR76 3000 6000 0112 3456
4 10 FR76 3000 6000 0112 3456 7890
4 11 FR76 3000 6000 0112 3456 7890 189


In [4]:
pattern = [{"LOWER": "account"}, 
           {"LOWER": {"IN": ["num", "number", "no"]}}, 
           {}, 
           {"IS_DIGIT": True}]
doc1 = nlp("My account number is 8891273")
doc2 = nlp("My account num is 8891273")
doc3 = nlp("My account no is 8891273")

matcher.add("accountNum", [pattern])
for mid, start, end in matcher(doc1):
    print(start, end, doc1[start:end])
for mid, start, end in matcher(doc2):
    print(start, end, doc2[start:end])
for mid, start, end in matcher(doc3):
    print(start, end, doc3[start:end])

1 5 account number is 8891273
1 5 account num is 8891273
1 5 account no is 8891273


    Extracting Phone Number

In [6]:
from spacy.matcher import Matcher
nlp = spacy.load('en_core_web_md')

pattern = [{"TEXT": "+1", "OP": "?"},
          {"TEXT": "("}, {"SHAPE": "ddd"}, {"TEXT": ")"},
          {"SHAPE": "ddd"}, 
          {"TEXT": "-", "OP": "?"}, {"SHAPE": "dddd"}]

In [8]:
doc1 = nlp("You can call my office on +1 (221) 103-2423 or email me directly.")
doc2 = nlp("You can call me on (221) 102 2423 or text me.")
matcher = Matcher(nlp.vocab)
matcher.add("usPhoneNum", [pattern])
for mid, start, end in matcher(doc1):
    print(start, end, doc1[start:end])

6 13 +1 (221) 103-2423
7 13 (221) 103-2423


In [9]:
for mid, start, end in matcher(doc2):
    print(start, end, doc2[start:end])

5 10 (221) 102 2423


    Extracting Mentions

In [25]:
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_md")

pattern = [{"ENT_TYPE": "ORG"}, {"LEMMA": "be"},
          {"POS": "ADV", "OP": "*"},
          {"POS": "ADJ"}]
matcher = Matcher(nlp.vocab)
matcher.add("extractMentions", [pattern])
doc1 = nlp("ACME is so expensive, stay away!")
doc2 = nlp("ACME is good though price are expensive but they are worth buying.")
for mid, start, end in matcher(doc1):
    print(start, end, doc1[start:end])
for mid, start, end in matcher(doc2):
    print(start, end, doc2[start:end])

    Hashtag and emoji extraction

In [26]:
doc = nlp("#MySpace")
[token.text for token in doc]

['#', 'MySpace']

In [27]:
#hashtag extract
doc = nlp("Starting working out now #WeekendShred")

pattern = [{"TEXT": "#"}, {"IS_ASCII": True}]
matcher = Matcher(nlp.vocab)
matcher.add("hashTag", [pattern])
for mid, start, end in matcher(doc):
    print(start, end, doc[start:end])

4 6 #WeekendShred


In [35]:
#emoji extract
pos_emoji = ["😊", "😁", "😂", "😄", "😉", "😘"]
neg_emoji = ["😒", "😏", "🙃", "😤", "😬", "😖"]

pos_patterns = [{"ORTH": emoji} for emoji in pos_emoji]
neg_patterns = [{"ORTH": emoji} for emoji in neg_emoji]

matcher = Matcher(nlp.vocab)
matcher.add("posEmoji", [pos_patterns])
matcher.add("negEmoji", [neg_patterns])

doc = nlp(" I love Zara 😍.")
for mid, start, end in matcher(doc):
    print(start, end, doc[start:end])

 Expanding named entities

In [32]:
doc = nlp("Ms.Smith left her house 2 hours ago.")
doc.ents

(Smith, 2 hours ago)

In [45]:
import spacy
from spacy import pipeline
nlp = spacy.load('en_core_web_md')

patterns = [{"label": "TITLE", "pattern":
            [{"LOWER": {"IN": ["ms.", "mr.", "mrs.", "prof.", "dr."]}}]}]
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)
doc = nlp("Ms.Smith have an account with chime since 2017.")
print([(ent.text, ent.label_) for ent in doc.ents])

[('Ms.', 'TITLE'), ('Smith', 'PERSON'), ('2017', 'DATE')]


 Combining linguistic features and named entities

In [49]:
import spacy
from spacy import displacy
doc = nlp("Einstein lived in Zurich.")
[(ent.text, ent.label_) for ent in doc.ents]

displacy.render(doc, style="dep", jupyter= True, options={'distance': 150}) 

In [53]:
doc = nlp("Einstein lived in Zurich.")
person_ents = [ent for ent in doc.ents if ent.label_ == "PERSON"]

for person_ent in person_ents:
    #use head of the entity's last token
    head = person_ent[-1].head
    if head.lemma_ == "live":
        #check if the children of live contain prepositional attachment
        preps = [token for token in head.children
                if token.dep_ == "prep" ]
        for prep in preps:
            places = [token for token in prep.children 
                      if token.ent_type_ == "GPE"]
            #verb is in past or present tense
            print({"person": person_ent, "city": places,
                  "past": head.tag_ == "VBD"})

{'person': Einstein, 'city': [Zurich], 'past': True}
