In [1]:
import spacy

In [2]:
# Load the 'en_core_web_sm' pipeline; `nlp` supplies the shared vocab and builds the Doc objects used below
nlp = spacy.load('en_core_web_sm')

In [3]:
from spacy.matcher import Matcher

In [4]:
# Token-pattern matcher bound to the pipeline's vocabulary
matcher = Matcher(nlp.vocab)

In [5]:
# One token, compared in lowercase: "SolarPower", "solarpower"
pattern1 = [{"LOWER": "solarpower"}]

# Three tokens with a punctuation token in the middle: "Solar-power"
pattern2 = [
    {"LOWER": "solar"},
    {"IS_PUNCT": True},
    {"LOWER": "power"},
]

# Two adjacent tokens: "Solar power"
pattern3 = [
    {"LOWER": "solar"},
    {"LOWER": "power"},
]

## Other token attributes
Besides `LOWER` and `IS_PUNCT` (used in the patterns above), there are a variety of other token attributes, lemmas among them, that we can use to define matching rules:
<table><tr><th>Attribute</th><th>Description</th></tr>

<tr ><td><span >`ORTH`</span></td><td>The exact verbatim text of a token</td></tr>
<tr ><td><span >`LOWER`</span></td><td>The lowercase form of the token text</td></tr>
<tr ><td><span >`LENGTH`</span></td><td>The length of the token text</td></tr>
<tr ><td><span >`IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`</span></td><td>Token text consists of alphabetic characters, ASCII characters, digits</td></tr>
<tr ><td><span >`IS_LOWER`, `IS_UPPER`, `IS_TITLE`</span></td><td>Token text is in lowercase, uppercase, titlecase</td></tr>
<tr ><td><span >`IS_PUNCT`, `IS_SPACE`, `IS_STOP`</span></td><td>Token is punctuation, whitespace, stop word</td></tr>
<tr ><td><span >`LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`</span></td><td>Token text resembles a number, URL, email</td></tr>
<tr ><td><span >`POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE`</span></td><td>The token's simple and extended part-of-speech tag, dependency label, lemma, shape</td></tr>
<tr ><td><span >`ENT_TYPE`</span></td><td>The token's entity label</td></tr>

</table>

In [6]:
# Register all three patterns under the single rule name 'SolarPower'.
# The patterns are passed as one list, which is the signature spaCy v3 requires
# (the list form is also accepted from spaCy v2.2.2 onward). The legacy
# positional form add(key, None, *patterns) raises an error on spaCy v3.
matcher.add('SolarPower', [pattern1, pattern2, pattern3])

In [7]:
# Sample text containing all three surface forms: "Solar Power", "solarpower" and "Solar-power"
doc = nlp(u"The Solar Power industry continues to grow a solarpower increases. Solar-power is amazing")

In [8]:
# Run the matcher over the Doc; the result is a list of (match_id, start, end) tuples
found_matches = matcher(doc)

In [9]:
# Raw result: match_id is the hash of the rule name; start/end are token indices into the Doc (end exclusive)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


In [10]:
# Print one line per hit: hash ID, rule name, start token, end token, matched text
for match_id, start, end in found_matches:
    print(
        match_id,
        nlp.vocab.strings[match_id],  # resolve the hash back to the rule name
        start,
        end,
        doc[start:end].text,          # text of the matched token span
    )

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 8 9 solarpower
8656102463236116519 SolarPower 11 14 Solar-power


In [11]:
matcher.remove('SolarPower')  # drop the 'SolarPower' rule (all patterns registered under that name) so it can be re-added below

In [12]:
# One token, compared in lowercase: "solarpower", "SolarPower"
pattern1 = [{"LOWER": "solarpower"}]

# "solar" and "power" separated by any number of punctuation tokens;
# 'OP': '*' lets the middle token match zero or more times
pattern2 = [
    {"LOWER": "solar"},
    {"IS_PUNCT": True, "OP": "*"},
    {"LOWER": "power"},
]

In [13]:
# Re-register the rule with the two new patterns.
# The patterns are passed as one list, which is the signature spaCy v3 requires
# (the list form is also accepted from spaCy v2.2.2 onward). The legacy
# positional form add(key, None, *patterns) raises an error on spaCy v3.
matcher.add('SolarPower', [pattern1, pattern2])

In [14]:
# Second sample: a double-hyphen variant ("Solar--power") plus two single-token spellings
doc2 = nlp(u"Solar--power is solarpower yay! SolarPower units")

In [15]:
# Apply the re-registered rule to the second Doc (overwrites the earlier found_matches)
found_matches = matcher(doc2)

In [16]:
# Raw (match_id, start, end) tuples for doc2
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5), (8656102463236116519, 7, 8)]


In [17]:
from spacy.matcher import PhraseMatcher

In [18]:
# NOTE: rebinds `matcher` -- from here on it is a PhraseMatcher, not the token Matcher created earlier
matcher = PhraseMatcher(nlp.vocab)

In [24]:
# Read the article (decoded as Latin-1), then parse it once the file is closed
with open("../TextFiles/reaganomics.txt", encoding="latin1") as f:
    reaganomics_text = f.read()
doc3 = nlp(reaganomics_text)

In [26]:
# Terms to look up verbatim in the article
phrase_list = [
    "voodoo economics",
    "supply-side economics",
    "trickle-down economics",
    "free-market economics",
]

In [27]:
# Turn each phrase into a Doc to use as a PhraseMatcher pattern.
# The PhraseMatcher is created without an `attr` argument, so it compares token
# text only; nlp.make_doc() tokenizes without running the rest of the pipeline,
# which is all these patterns need.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [28]:
# Register every phrase Doc under the rule name 'EconMatcher' (None = no on_match callback).
# NOTE(review): current spaCy docs show add(key, docs) taking a list of Docs -- confirm the
# installed version before switching to that form.
matcher.add('EconMatcher',None,*phrase_patterns)

In [29]:
# Search the article for the registered phrases; yields (match_id, start, end) tuples
found_matches = matcher(doc3)

In [31]:
# Print one line per phrase hit: hash ID, rule name, start token, end token, matched text
for match_id, start, end in found_matches:
    print(
        match_id,
        nlp.vocab.strings[match_id],  # resolve the hash back to the rule name
        start,
        end,
        doc3[start:end].text,         # text of the matched token span
    )

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 49 53 trickle-down economics
3680293220734633682 EconMatcher 54 56 voodoo economics
3680293220734633682 EconMatcher 61 65 free-market economics
3680293220734633682 EconMatcher 673 677 supply-side economics
3680293220734633682 EconMatcher 2984 2988 trickle-down economics


For additional information visit https://spacy.io/usage/rule-based-matching