## CREATED BY HCY 20200728 23:28

In [1]:
import spacy
from spacy.vocab import Vocab
from spacy.matcher import Matcher

In [2]:
nlp = spacy.load('en_core_web_lg')
matcher = Matcher(nlp.vocab)

In [3]:
matched_sets = []

In [4]:
pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"}, {"POS":"ADJ"}]

In [5]:
def callback_method_fb(matcher, doc, i, matches):
    matched_id, start, end = matches[i]
    span = doc[start:end]
    sent = span.sent
    match_ents = [{'start': span.start_char - sent.start_char,
                   'end': span.end_char - sent.start_char,
                   'label': 'MATCH'}]
    matched_sets.append({'text': sent.text, 'ents': match_ents})

In [6]:
doc = nlp("I'd say that Facebook is evil. - Facebook is pretty cool, right?")

In [7]:
matcher.add('fb', callback_method_fb, pattern)

In [8]:
matches = matcher(doc)

In [9]:
matches

[(8017838677478259815, 4, 7), (8017838677478259815, 9, 13)]

In [10]:
matched_sets

[{'text': "I'd say that Facebook is evil.",
  'ents': [{'start': 13, 'end': 29, 'label': 'MATCH'}]},
 {'text': '- Facebook is pretty cool, right?',
  'ents': [{'start': 2, 'end': 25, 'label': 'MATCH'}]}]

In [11]:
from spacy import displacy

In [12]:
displacy.render(matched_sets, style='ent', manual=True)

### Phone number

In [59]:
pattern = [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "dddd"}, {"ORTH":"-", "OP": "?"}, {"SHAPE": "dddd"},{"ORTH":"-", "OP": "?"}, {"SHAPE": "dddd"}]

In [60]:
doc = nlp("My phone number is (123) 1386-7945-5832, Please call me")

In [61]:
matcher = Matcher(nlp.vocab)
matcher.add('phone_num', None, pattern)

In [62]:
matches = matcher(doc)

In [63]:
matches

[(534639841163226624, 4, 12)]

In [64]:
print([t.text for t in doc])

['My', 'phone', 'number', 'is', '(', '123', ')', '1386', '-', '7945', '-', '5832', ',', 'Please', 'call', 'me']


In [65]:
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

(123) 1386-7945-5832


#### Email Address Matching

In [66]:
pattern = [{"TEXT": {"REGEX": "[a-zA-Z0-9-_.]+@[a-zA-Z0-9_.]+"}}]

In [67]:
matcher = Matcher(nlp.vocab)
matcher.add("email", None, pattern)

In [68]:
doc = nlp("my email is 536480973@qq.com and 13867945832@163.com")

In [69]:
matches = matcher(doc)

In [70]:
matches

[(7320900731437023467, 3, 4), (7320900731437023467, 5, 6)]

In [71]:
for matcher_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

536480973@qq.com
13867945832@163.com


In [72]:
a = "536480973@qq.com"

In [73]:
import re

In [82]:
re.findall(r"[\w]+@[\w_.]+", a)

['536480973@qq.com']

## Emoji on social media

#### By defaultm Spacy's tokenizer will split emoji into separate tokens. This means that you can create a pattern for one or more emoji tokens
#### Valid hashtags usually consists of a #, plus a sequence of ASCII characters with no whitespace, making them easy to match as well

In [116]:
pos_emoji = ["🥺", "🥺", "🥺 "]
neg_emoji = ["🐻", "🍔"]

In [117]:
pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji]
neg_patterns = [[{"ORTH": emoji}] for emoji in neg_emoji]

In [118]:
pos_patterns

[[{'ORTH': '\U0001f97a'}], [{'ORTH': '\U0001f97a'}], [{'ORTH': '\U0001f97a '}]]

In [119]:
neg_patterns

[[{'ORTH': '🐻'}], [{'ORTH': '🍔'}]]

In [120]:
def label_sentiment(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    if doc.vocab.strings[match_id] == "HAPPY":
        doc.sentiment += 0.1
    elif doc.vocab.strings[match_id] == "SAD":
        doc.sentiment -= 0.1

In [110]:
import spacy
nlp = spacy.load("en_core_web_lg")
matcher = Matcher(nlp.vocab)

In [121]:
matcher.add("HAPPY", label_sentiment, *pos_patterns)
matcher.add("SAD", label_sentiment, *neg_patterns)

In [122]:
matcher.add("HASHTAG", None, [{"TEXT": "#"}, {"IS_ASCII": True}])

In [123]:
doc = nlp("Hello World 🥺 #KGPTalkie")

In [124]:
matches = matcher(doc)

In [126]:
matches

[(2686646543460454932, 2, 3), (16536914698459818706, 3, 5)]

In [129]:
for matcher_id, start, end in matches:
    string_id = doc.vocab.strings[match_id]
    span = doc[start:end]
    print(string_id, span.text)

phone_num 🥺
phone_num #KGPTalkie


In [131]:
for token in doc:
    print(token.text)

Hello
World
🥺
#
KGPTalkie


## Efficient phrase matching 有效短语匹配

In [141]:
from spacy.matcher import PhraseMatcher

In [142]:
matcher = PhraseMatcher(nlp.vocab)

In [143]:
terms = ['BARAC OBAMA', 'ANGELA MERKEL', 'WASHINGTON D.C.']

In [144]:
pattern = [nlp.make_doc(text) for text in terms]

In [145]:
pattern

[BARAC OBAMA, ANGELA MERKEL, WASHINGTON D.C.]

In [148]:
matcher.add("term", None, *pattern)

In [156]:
doc = nlp("German Chancellor ANGELA MERKEL and US President BARAC OBAMA in WASHINGTON D.C.!")

In [157]:
doc

German Chancellor ANGELA MERKEL and US President BARAC OBAMA in WASHINGTON D.C.!

In [158]:
matches = matcher(doc)

In [159]:
matches

[(4519742297340331040, 2, 4),
 (4519742297340331040, 7, 9),
 (4519742297340331040, 10, 12)]

In [160]:
for matcher_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

ANGELA MERKEL
BARAC OBAMA
WASHINGTON D.C.


## Custom Ruled Based Entity Recognition 基于自定义规则的实体识别

### Entity patterns are dictionaries with two keys:"labels"， specifying the label to assign to the entity if the pattern is matched, and "pattern", the match pattern. The Entity ruler accepts two types of patterns

In [161]:
from spacy.pipeline import EntityRuler

In [162]:
nlp = spacy.load("en_core_web_lg")

In [163]:
ruler = EntityRuler(nlp)

In [165]:
patterns = [{"label": "ORG", "pattern": "KGP Talkie"},
            {"label":"GPE", "pattern": [{"LOWER": "san"}, {"LOWER":"francisco"}]}]

In [166]:
patterns

[{'label': 'ORG', 'pattern': 'KGP Talkie'},
 {'label': 'GPE', 'pattern': [{'LOWER': 'san'}, {'LOWER': 'francisco'}]}]

In [167]:
ruler.add_patterns(patterns)

In [168]:
nlp.add_pipe(ruler)

In [169]:
doc = nlp("KGP Talkie is opening its first big office in San Francisco.")

In [170]:
for ent in doc.ents:
    print(ent.text, ent.label_)

KGP Talkie ORG
first ORDINAL
San Francisco GPE
