In [1]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [3]:
# Load the `en_core_web_sm` pipeline once; `nlp` is reused below to process
# text and to supply the shared vocab for the Matcher.
nlp = spacy.load('en_core_web_sm')

In [5]:
# Run the pipeline on a short string; the result can be iterated token by token.
doc = nlp('Hello World!')

In [7]:
# Bare last expression: the notebook echoes the processed text.
doc

Hello World!

In [9]:
# Iterating the processed text yields one token at a time;
# note that the trailing "!" comes out as its own token.
for token in doc:
    print(token)

Hello
World
!


In [12]:
# One dict per token, in order: an optional "hello", an optional punctuation
# token, then a required "world". `'OP': '?'` marks a token spec as optional,
# which is why the matcher later also reports the shorter spans.
pattern = [
    {"LOWER": "hello", "OP": "?"},
    {"IS_PUNCT": True, "OP": "?"},
    {"LOWER": "world"},
]

In [13]:
# The matcher must share the vocab of the pipeline that produces the docs.
matcher = Matcher(nlp.vocab)
# Register the pattern under the key 'HelloWorld'. Patterns are passed as a
# list of patterns: spaCy v3 dropped the old `add(key, on_match, *patterns)`
# call form, so `add('HelloWorld', None, pattern)` raises there. The list form
# is accepted by spaCy v2.2.2+ as well, so this works on both major versions.
matcher.add('HelloWorld', [pattern])

In [14]:
# Rebind `doc` to a sentence with a comma between the two words so the
# optional-punctuation part of the pattern has something to match.
doc = nlp("Hello, world!")

In [16]:
# Calling the matcher on a doc returns (match_id, start, end) tuples;
# start/end are token indices used below to slice `doc`.
matches = matcher(doc)
matches

[(15578876784678163569, 0, 3),
 (15578876784678163569, 1, 3),
 (15578876784678163569, 2, 3)]

In [18]:
# Show the tokens the matcher worked on: the comma and "!" are separate
# tokens, which is why the match indices above run from 0 to 3.
for token in doc:
    print(token)

Hello
,
world
!


In [20]:
# Report every match: the numeric id, the key it was registered under
# (looked up in the vocab's string store), the token range and the text.
for match_id, start, end in matches:
    span = doc[start:end]
    string_id = nlp.vocab.strings[match_id]
    print(f"{match_id} {string_id} {start} {end} {span.text}")

15578876784678163569 HelloWorld 0 3 Hello, world
15578876784678163569 HelloWorld 1 3 , world
15578876784678163569 HelloWorld 2 3 world


### Regex

In [21]:
# Sample text with two digit runs: a stray 4-digit number and a 10-digit one.
text = (
    "my phone number is 1256. "
    "Ohh its wrong! "
    "Correct one is 1256348790. "
    "call me!"
)

In [22]:
import re

In [24]:
# \d{10}: first run of ten consecutive digits; the 4-digit "1256" is too short.
re.search(r'\d{10}', text)

<re.Match object; span=(55, 65), match='1256348790'>

In [26]:
# \d{4}: `search` stops at the first match, which here is the stray "1256".
re.search(r'\d{4}', text)

<re.Match object; span=(19, 23), match='1256'>

In [31]:
# \d{3,10}: `findall` returns every non-overlapping run of 3 to 10 digits.
re.findall(r'\d{3,10}', text)

['1256', '1256348790']

In [33]:
# \w{4,}: runs of four or more word characters; \w covers digits too,
# so both numbers show up alongside the words.
re.findall(r'\w{4,}', text)

['phone', 'number', '1256', 'wrong', 'Correct', '1256348790', 'call']

### Wildcard text

In [34]:
# "." matches any single character (except a newline), so c.. is a lowercase
# "c" plus the next two characters. Matching is case-sensitive, so the
# capital "C" of "Correct" does not start a match.
re.findall(r'c..', text)

['ct ', 'cal']

In [36]:
# New sample text with several three-letter "?a?" words for the wildcard demo.
text = 'this is cat but not that. i want hat and cat both'

In [39]:
# .a.: an "a" with any one character on each side. Matches can start inside a
# word ("hat" from "that") or include a space (" an"), and never overlap.
re.findall(r'.a.', text)

['cat', 'hat', 'wan', 'hat', ' an', 'cat']

In [40]:
# Sample text whose last character is a digit.
text = "hi thanks for watching <3"

In [42]:
# $ anchors the match at the end of the string: a digit only if it is last.
re.findall(r'\d$', text)

['3']

In [43]:
# Same sentence, now also starting with a digit.
text = "3 hi thanks for watching <3"

In [45]:
# ^ anchors the match at the start of the string: a digit only if it is first.
re.findall(r'^\d', text)

['3']

### Exclusion

In [46]:
# Echo the current sample string before applying the exclusion patterns.
text

'3 hi thanks for watching <3'

In [48]:
# [^\d]+: a leading ^ inside brackets negates the class, so this collects
# runs of non-digit characters (everything between the two 3s).
re.findall(r'[^\d]+', text)

[' hi thanks for watching <']

In [49]:
# Sample text with a two-digit run in the middle and a single digit at the end.
text = "hi 33 thanks for watching <3"

In [51]:
# [^\D]+: \D is "non-digit", so negating it leaves digits only;
# this double negative is equivalent to \d+.
re.findall(r'[^\D]+', text)

['33', '3']

In [52]:
# Sample text containing one hyphenated word.
text = 'you can get free-videos on youtube'

In [54]:
# One or more word characters on each side of a literal hyphen,
# i.e. hyphenated words such as "free-videos".
re.findall(r'[\w]+-[\w]+', text)

['free-videos']

### Regular Expressions in spaCy