# spaCy Matcher

In [1]:
import spacy
from spacy.matcher import Matcher

In [2]:
# Load the "en_core_web_sm" spaCy pipeline; `nlp` is used below both to build
# Docs and to supply the vocab handed to the Matcher.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Create a Matcher bound to the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [4]:
# One-token pattern: a single token whose LIKE_EMAIL attribute is true.
pattern = [{"LIKE_EMAIL":True}]

In [5]:
# Register the pattern under the key "EMAIL_ADDRESS"; the second argument is a
# list of patterns, hence the extra brackets.
matcher.add("EMAIL_ADDRESS",[pattern])

In [6]:
# Sample sentence containing one e-mail address to run the matcher on.
doc = nlp("This is my email address: chirag.juneja@gmail.com")

In [7]:
# Run the matcher over the Doc; the result is inspected in the following cells.
matches = matcher(doc)

In [8]:
# Each match is a 3-tuple: an integer id for the pattern key, then the start and
# end token indices that are used to slice the Doc further down.
print(matches)

[(16571425990740197027, 6, 7)]


In [9]:
# The first element of a match is an integer id; looking it up in the vocab
# recovers the key the pattern was registered under.
print(nlp.vocab[matches[0][0]].text)

EMAIL_ADDRESS


In [10]:
# Slice the Doc with the first match's start/end token indices to get the matched span.
doc[matches[0][1]:matches[0][2]]

chirag.juneja@gmail.com

## Grabbing all proper nouns

In [11]:
# Read the whole source text into memory. The encoding is passed explicitly so
# that decoding does not depend on the platform's locale default.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open("data/wiki_mlk.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [12]:
# Run the pipeline over the loaded text; the POS-based patterns in the cells
# below are matched against this Doc.
doc = nlp(text)

In [13]:
# The pipeline is deliberately NOT reloaded here. `doc` was built with the
# pipeline loaded at the top of the notebook, and a Matcher has to be created
# from the same Vocab as the Docs it is run on; a second spacy.load() would
# hand the matchers below a fresh, unrelated Vocab (and repeat a slow load).
assert doc.vocab is nlp.vocab



In [14]:
# Single-token pattern: every token tagged PROPN is reported as its own match.
pattern = [{'POS':'PROPN'}]
matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUN",[pattern])
matches = matcher(doc)
# Show the first ten hits: the raw match tuple followed by the matched text.
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

(451313080118390996, 0, 1) Martin
(451313080118390996, 1, 2) Luther
(451313080118390996, 2, 3) King
(451313080118390996, 3, 4) Jr.
(451313080118390996, 6, 7) Michael
(451313080118390996, 7, 8) King
(451313080118390996, 8, 9) Jr.
(451313080118390996, 10, 11) January
(451313080118390996, 16, 17) April
(451313080118390996, 23, 24) American


### Multi-Word Tokens

In [15]:
# Same proper-noun pattern, but 'OP':'+' lets it cover one or more consecutive
# PROPN tokens. With no greedy setting every sub-span is reported separately
# ("Martin", "Martin Luther", "Luther", ...), as the output shows.
pattern = [{'POS':'PROPN','OP':'+'}]
matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUN",[pattern])
matches = matcher(doc)
first_ten = matches[:10]
for match_id, start, end in first_ten:
    print((match_id, start, end), doc[start:end])

(451313080118390996, 0, 1) Martin
(451313080118390996, 0, 2) Martin Luther
(451313080118390996, 1, 2) Luther
(451313080118390996, 0, 3) Martin Luther King
(451313080118390996, 1, 3) Luther King
(451313080118390996, 2, 3) King
(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 1, 4) Luther King Jr.
(451313080118390996, 2, 4) King Jr.
(451313080118390996, 3, 4) Jr.


### Greedy Matcher

In [16]:
# One-or-more PROPN tokens again, now registered with greedy="LONGEST" so that
# overlapping candidates collapse to the longest span.
pattern = [{'POS':'PROPN','OP':'+'}]
matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUN",[pattern],greedy="LONGEST")
# Order the hits by start token index so they read in document order.
matches = sorted(matcher(doc), key=lambda hit: hit[1])
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 6, 9) Michael King Jr.
(451313080118390996, 10, 11) January
(451313080118390996, 16, 17) April
(451313080118390996, 23, 25) American Baptist
(451313080118390996, 50, 51) King
(451313080118390996, 70, 72) Mahatma Gandhi
(451313080118390996, 84, 88) Martin Luther King Sr
(451313080118390996, 90, 91) King
(451313080118390996, 114, 115) King


### Adding in Sequences

In [17]:
# Two-step sequence: one or more PROPN tokens immediately followed by a VERB.
pattern = [
    {'POS':'PROPN','OP':'+'},
    {'POS':'VERB'},
]
matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUN",[pattern],greedy="LONGEST")
# Order the hits by start token index so they read in document order.
matches = sorted(matcher(doc), key=lambda hit: hit[1])
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

(451313080118390996, 50, 52) King advanced
(451313080118390996, 90, 92) King participated
(451313080118390996, 114, 116) King led
(451313080118390996, 248, 253) Director J. Edgar Hoover considered
(451313080118390996, 323, 325) King won
(451313080118390996, 486, 489) United States beginning


In [18]:
import json

In [19]:
# Parse the JSON data file. JSON text is UTF-8, so the encoding is passed
# explicitly instead of relying on the platform's locale default.
with open("data/alice.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [20]:
# Pull a single passage out of the nested JSON structure.
# NOTE(review): what each index level stands for is not visible here — confirm
# against the layout of data/alice.json.
text = data[0][2][0]

In [21]:
# Display the raw passage; note the backticks (`) used as opening quote marks.
text

"Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, `and what is the use of a book,' thought Alice `without pictures or conversation?'"

In [22]:
# Turn the backtick opening quotes into apostrophes so that both ends of a
# quotation use the same character, which the "'" pattern below relies on.
text = text.replace('`',"'")

In [23]:
# Parse the cleaned passage; from here on `doc` refers to this text, replacing
# the Doc built earlier in the notebook.
doc = nlp(text)

In [24]:
# Find a single-quoted utterance that is followed by a speech verb and a
# proper noun, e.g. "'...,' thought Alice".
speak_lemmas = ["think","say"]
pattern = [
    {"ORTH":"'", "OP":"+"},                       # opening quote mark(s)
    {"IS_ALPHA":True, "OP":"+"},                  # the quoted words
    {"IS_PUNCT":True, "OP":"+"},                  # punctuation closing the utterance
    {"ORTH":"'", "OP":"+"},                       # closing quote mark(s)
    {"POS":"VERB","LEMMA":{"IN":speak_lemmas}},   # verb whose lemma is in speak_lemmas
    {"POS":"PROPN"},                              # proper noun right after the verb
]
matcher = Matcher(nlp.vocab)
matcher.add("QUOTES",[pattern],greedy="LONGEST")
# Order the hits by start token index so they read in document order.
matches = sorted(matcher(doc), key=lambda hit: hit[1])
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

(16432004385153140588, 47, 60) 'and what is the use of a book,' thought Alice
