In [66]:
import spacy

nlp = spacy.load("en_core_web_sm")
print(nlp)

<spacy.lang.en.English object at 0x7fc7f6b677f0>


In [67]:
from spacy.matcher import Matcher 
print(nlp.vocab)
matcher = Matcher(nlp.vocab)

<spacy.vocab.Vocab object at 0x7fc8132fb4c0>


In [68]:
print(matcher, type(matcher))
print(matcher)
print(type(matcher))

<spacy.matcher.matcher.Matcher object at 0x7fc7f6c2af40> <class 'spacy.matcher.matcher.Matcher'>
<spacy.matcher.matcher.Matcher object at 0x7fc7f6c2af40>
<class 'spacy.matcher.matcher.Matcher'>


In [69]:
# Patterns to be found:
# SolarPower
# Solar-power
# Solar power

## Token patterns to perform rule-based matching 

#SolarPower
# The first pattern checks when we transform the token to its lowercase version, if it will be same as 'solarpower'.
first_pattern = [{'LOWER': 'solarpower'}]

#Solar-power
second_pattern = [{'LOWER': 'solar'}, {'IS_PUNCT':True}, {'LOWER': 'power'}]

#Solar power
third_pattern = [{'LOWER': 'solar'}, {'LOWER': 'power'}]

In [70]:
matcher.add("SolarPower", [first_pattern])
matcher.add("Solar-power", [second_pattern])
matcher.add("Solar power", [third_pattern])

In [71]:
document = nlp(u'The Solar Power industry continues to grow as solarpower increases. Solar-power is a crucial resource.')
found_matches = matcher(document)
print(found_matches)

# In the output of this cell, each tuple will contain the match id, starting index of the match and the ending 
# index of the match in sequence. The starting and ending indexes are at the token level. In other words; the 
# index 0 belongs to the first word, the index 1 belongs to the second word, and so forth.

[(3273117015852423453, 1, 3), (8656102463236116519, 8, 9), (17734808517635807839, 11, 14)]


In [72]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id] # get string representation
    matched_span = document[start:end] # get the matched span
    print(match_id, string_id, start, end, matched_span.text)

3273117015852423453 Solar power 1 3 Solar Power
8656102463236116519 SolarPower 8 9 solarpower
17734808517635807839 Solar-power 11 14 Solar-power


In [73]:
matcher.remove('SolarPower') # Remove 'SolarPower' from the matcher object

In [74]:
overlapping_part = matcher(document)
print(overlapping_part)

[(3273117015852423453, 1, 3), (17734808517635807839, 11, 14)]


In [75]:
# 'OP': '*' allows the pattern to match zero or more times. For the pattern2, it allows us to put 
# any amount of punctuation in the middle.l
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP': '*'}, {'LOWER': 'power'}]

In [76]:
my_matcher = Matcher(nlp.vocab)
my_matcher.add('solarpower', [pattern1]) # adding pattern1 to the matcher object
my_matcher.add('Solar-Power', [pattern2]) # adding pattern2 to the matcher object

In [77]:
new_document = nlp(u'Solar--power can be solarpower, Solar Power, or solar-power.')
matches = matcher(new_document)
print(matches)

[(17734808517635807839, 0, 3), (3273117015852423453, 7, 9), (17734808517635807839, 11, 14)]


In [121]:
# Phrase Matching 
from spacy.matcher import PhraseMatcher 

phraseMatcher = PhraseMatcher(nlp.vocab) # creating a phrase matcher instance

with open('reaganomics.txt') as file:
    content = file.read()
    third_doc = nlp(content)
    
print(third_doc)

Reaganomics (/reɪɡəˈnɒmɪks/; a portmanteau of Reagan and economics attributed to Paul Harvey),[1] or Reaganism, were the neoliberal[2][3][4] economic policies promoted by U.S. President Ronald Reagan during the 1980s. These policies are characterized as supply-side economics, trickle-down economics, or "voodoo economics" by opponents,[5] while Reagan and his advocates preferred to call it free-market economics.

The pillars of Reagan's economic policy included increasing defense spending, balancing the federal budget and slowing the growth of government spending, reducing the federal income tax and capital gains tax, reducing government regulation, and tightening the money supply in order to reduce inflation.[6]

The results of Reaganomics are still debated. Supporters point to the end of stagflation, stronger GDP growth, and an entrepreneurial revolution in the decades that followed.[7][8] Critics point to the widening income gap, what they described as an atmosphere of greed, reduced

In [122]:
phrases = ["supply-side economics", "voodo economics", "free-market economics", "trickle-down economics"]

In [123]:
phrase_patterns = []
for phrase in phrases:
    doc = nlp(phrase)
    phrase_patterns.append(doc)

In [124]:
phrase_patterns

[supply-side economics,
 voodo economics,
 free-market economics,
 trickle-down economics]

In [125]:
print(phrase_patterns)

[supply-side economics, voodo economics, free-market economics, trickle-down economics]


In [126]:
print(type(phrase_patterns))
print(type(phrase_patterns[0])) # An element of the list of phrase patterns is of type 'spacy Doc'.

<class 'list'>
<class 'spacy.tokens.doc.Doc'>


In [134]:
# Instead of a single pattern as in the case of Matcher, it can take a list of patterns as an argument.
phraseMatcher.add('EconMatcher', phrase_patterns) 
matches_lst = phraseMatcher(third_doc)
print(matches_lst)

[(3680293220734633682, 39, 43), (3680293220734633682, 44, 48), (3680293220734633682, 66, 70), (3680293220734633682, 667, 671)]


In [137]:
for match_id, initial_index, end_index in matches_lst:
    string_id = nlp.vocab.strings[match_id] # get the string representation of the match id
    matched_span = third_doc[initial_index:end_index] # get the matched span
    print(match_id, string_id, initial_index, end_index, matched_span, matched_span.text)

3680293220734633682 EconMatcher 39 43 supply-side economics supply-side economics
3680293220734633682 EconMatcher 44 48 trickle-down economics trickle-down economics
3680293220734633682 EconMatcher 66 70 free-market economics free-market economics
3680293220734633682 EconMatcher 667 671 supply-side economics supply-side economics
