In [1]:
import spacy
from spacy.matcher import Matcher

## Load model and create a document

In [2]:
# Sample text analysed throughout the notebook. Adjacent string literals are
# joined into a single one-line string (no newline characters inside).
string = (
    'My new iPhone 6 is white. Xiaiomi smartphones are pretty. '
    'I like to spend afternoons in Old Yafo and '
    'take pictures with my iPhone 7.'
)

In [3]:
# Load the pretrained spaCy pipeline named "en_core_web_sm". The resulting
# `nlp` object is called on text below and its vocabulary is shared with the
# matchers (`nlp.vocab`).
nlp = spacy.load("en_core_web_sm")

In [4]:
# Run the pipeline over the raw text to build a spaCy document. The cells
# below read its tokens (`token.pos_`) and its entities (`doc.ents`).
doc = nlp(string)

In [5]:
# Display the document's text to check it against the input string
doc.text

'My new iPhone 6 is white. Xiaiomi smartphones are pretty. I like to spend afternoons in Old Yafo and take pictures with my iPhone 7.'

## Part-of-speech (POS) tagging

In [6]:
# Collect the tokens tagged as proper nouns, then print each one together
# with its coarse-grained part-of-speech tag.
proper_nouns = [tok for tok in doc if tok.pos_ == 'PROPN']
for tok in proper_nouns:
    print(tok, tok.pos_)

iPhone PROPN
Xiaiomi PROPN
Old PROPN
Yafo PROPN
iPhone PROPN


## Named entity recognition (NER)

In [7]:
# Pair every named entity in the document with its label, then print the pairs
entity_labels = [(entity, entity.label_) for entity in doc.ents]
for entity, label in entity_labels:
    print(entity, label)

iPhone ORG
Xiaiomi PERSON
Old Yafo ORG
iPhone ORG


## Rule-based matching

In [8]:
# Initialise a rule-based matcher; it is constructed from the pipeline's
# vocabulary (`nlp.vocab`).
matcher = Matcher(nlp.vocab)

In [9]:
# Token pattern with one dict per token, in order:
#   1. a token whose lowercase form is "iphone"
#   2. a token for which the IS_DIGIT flag is true
iphone_token = {'LOWER': 'iphone'}
digit_token = {'IS_DIGIT': True}
pattern = [iphone_token, digit_token]

In [10]:
# Register the pattern with the matcher under the key 'IPHONE_PATTERN'.
# The patterns are passed as a list, matching the spaCy v3 signature
# `Matcher.add(key, patterns, on_match=None)`. The older positional form
# `add(key, None, pattern)` is rejected by spaCy v3. No `on_match` callback
# is supplied because none is needed here.
# NOTE: the list form requires spaCy >= 2.2.2.
matcher.add('IPHONE_PATTERN', [pattern])

In [11]:
# Apply the matcher to the doc. The result is a list with one
# (match_id, start, end) tuple per match; `start` and `end` are token
# indices that the next cell uses to slice the doc (`doc[start:end]`).
matches = matcher(doc)
matches

[(9528407286733565721, 2, 4), (9528407286733565721, 25, 27)]

In [25]:
# Turn each (match_id, start, end) tuple into the corresponding slice of the
# doc, then print the text of every matched span.
matched_spans = [doc[start:end] for _, start, end in matches]
for matched_span in matched_spans:
    print(f'Your match: {matched_span.text}')

Your match: iPhone 6
Your match: iPhone 7


## More complex matching

In [30]:
# A second, independent matcher for the "install <proper noun>" example
matcher2 = Matcher(nlp.vocab)

# Adjacent literals are concatenated into one single-line text
doc2 = nlp(
    "I installed WinZip in 8th grade. Nevertheless, it didn't work. "
    "I tried to install GZip, I tried to install XZip and others. "
    "Nothing worked. I am gonna install Ubuntu now."
)

# Two consecutive tokens: one whose lemma is "install", followed by one
# tagged as a proper noun.
pattern2 = [{'LEMMA': 'install'}, {'POS': 'PROPN'}]

# Add the pattern to the matcher and apply the matcher to the doc.
# Patterns are passed as a list, matching the spaCy v3 signature
# `Matcher.add(key, patterns, on_match=None)`; the older
# `add(key, None, pattern)` form is rejected by spaCy v3.
# NOTE: the list form requires spaCy >= 2.2.2.
matcher2.add('DOWNLOAD_THINGS_PATTERN', [pattern2])
matches2 = matcher2(doc2)
print('Total matches found:', len(matches2))

# Iterate over the matches and print the span text
for match_id, start, end in matches2:
    print('Your match:', doc2[start:end].text)

Total matches found: 3
Your match: install GZip
Your match: install XZip
Your match: install Ubuntu


In [28]:
# "installed WinZip" was not among the matches above, even though WinZip is
# tagged as a proper noun (it is printed here). Presumably the first token's
# lemma did not come out as "install" - TODO confirm by printing token.lemma_.
for proper_noun in (tok for tok in doc2 if tok.pos_ == 'PROPN'):
    print(proper_noun)

WinZip
GZip
XZip
Ubuntu
