## CREATED BY HCY 20200728 23:28

In [1]:
import spacy
from spacy.vocab import Vocab
from spacy.matcher import Matcher

In [2]:
nlp = spacy.load('en_core_web_lg')
matcher = Matcher(nlp.vocab)

In [3]:
matched_sets = []

In [4]:
# Token pattern, one dict per token:
#   "facebook" (compared on the lower-cased text), a form of "be",
#   zero or more adverbs, then an adjective.
pattern = [
    {"LOWER": "facebook"},
    {"LEMMA": "be"},
    {"POS": "ADV", "OP": "*"},
    {"POS": "ADJ"},
]

In [5]:
def callback_method_fb(matcher, doc, i, matches):
    """Matcher callback: record the sentence of match ``i`` in displaCy's manual format.

    Args:
        matcher: The matcher that fired (unused).
        doc: The document being matched; sliced with the match's token range.
        i: Index of the current match inside ``matches``.
        matches: Sequence of ``(match_id, start, end)`` triples.

    Side effects:
        Appends ``{'text': ..., 'ents': [...]}`` to the module-level
        ``matched_sets`` list.
    """
    _, tok_start, tok_end = matches[i]
    hit = doc[tok_start:tok_end]
    sentence = hit.sent
    # The entry's text is the sentence alone, so the highlight offsets are
    # re-based from document characters to sentence characters.
    base = sentence.start_char
    highlight = {'start': hit.start_char - base,
                 'end': hit.end_char - base,
                 'label': 'MATCH'}
    matched_sets.append({'text': sentence.text, 'ents': [highlight]})

In [6]:
doc = nlp("I'd say that Facebook is evil. - Facebook is pretty cool, right?")

In [7]:
matcher.add('fb', callback_method_fb, pattern)

In [8]:
matches = matcher(doc)

In [9]:
matches

[(8017838677478259815, 4, 7), (8017838677478259815, 9, 13)]

In [10]:
matched_sets

[{'text': "I'd say that Facebook is evil.",
  'ents': [{'start': 13, 'end': 29, 'label': 'MATCH'}]},
 {'text': '- Facebook is pretty cool, right?',
  'ents': [{'start': 2, 'end': 25, 'label': 'MATCH'}]}]

In [11]:
from spacy import displacy

In [12]:
displacy.render(matched_sets, style='ent', manual=True)

### Phone number

In [13]:
# Phone number shaped like "(ddd) dddd-dddd-dddd"; each dash token is optional.
pattern = [
    {"ORTH": "("},
    {"SHAPE": "ddd"},
    {"ORTH": ")"},
    {"SHAPE": "dddd"},
    {"ORTH": "-", "OP": "?"},
    {"SHAPE": "dddd"},
    {"ORTH": "-", "OP": "?"},
    {"SHAPE": "dddd"},
]

In [14]:
doc = nlp("My phone number is (123) 1386-7945-5832, Please call me")

In [15]:
matcher = Matcher(nlp.vocab)
matcher.add('phone_num', None, pattern)

In [16]:
matches = matcher(doc)

In [17]:
matches

[(534639841163226624, 4, 12)]

In [18]:
print([t.text for t in doc])

['My', 'phone', 'number', 'is', '(', '123', ')', '1386', '-', '7945', '-', '5832', ',', 'Please', 'call', 'me']


In [19]:
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

(123) 1386-7945-5832


#### Email Address Matching

In [20]:
# Single-token pattern: the token's text is tested against an
# e-mail-shaped regular expression (local part, "@", domain part).
pattern = [
    {
        "TEXT": {
            "REGEX": "[a-zA-Z0-9-_.]+@[a-zA-Z0-9_.]+",
        },
    },
]

In [21]:
matcher = Matcher(nlp.vocab)
matcher.add("email", None, pattern)

In [22]:
doc = nlp("my email is 536480973@qq.com and 13867945832@163.com")

In [23]:
matches = matcher(doc)

In [24]:
matches

[(7320900731437023467, 3, 4), (7320900731437023467, 5, 6)]

In [25]:
for matcher_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

536480973@qq.com
13867945832@163.com


In [26]:
a = "536480973@qq.com"

In [27]:
import re

In [28]:
re.findall(r"[\w]+@[\w_.]+", a)

['536480973@qq.com']

## Emoji on social media

#### By default, spaCy's tokenizer will split emoji into separate tokens. This means that you can create a pattern for one or more emoji tokens.
#### Valid hashtags usually consist of a #, plus a sequence of ASCII characters with no whitespace, making them easy to match as well.

In [29]:
# Emoji used to build the "HAPPY" (pos) and "SAD" (neg) matcher rules.
# NOTE(review): the first two pos_emoji entries are identical and the third
# carries a trailing space. ORTH presumably compares against the exact token
# text, so the third entry is unlikely to ever match — confirm which emoji
# were intended and dedupe.
pos_emoji = ["🥺", "🥺", "🥺 "]
neg_emoji = ["🐻", "🍔"]

In [30]:
pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji]
neg_patterns = [[{"ORTH": emoji}] for emoji in neg_emoji]

In [31]:
pos_patterns

[[{'ORTH': '\U0001f97a'}], [{'ORTH': '\U0001f97a'}], [{'ORTH': '\U0001f97a '}]]

In [32]:
neg_patterns

[[{'ORTH': '🐻'}], [{'ORTH': '🍔'}]]

In [33]:
def label_sentiment(matcher, doc, i, matches):
    """Matcher callback: nudge ``doc.sentiment`` according to which rule matched.

    The match id is resolved to its rule name through ``doc.vocab.strings``.
    A "HAPPY" match adds 0.1 to ``doc.sentiment``, a "SAD" match subtracts
    0.1, and any other rule name leaves the score untouched.

    Args:
        matcher: The matcher that fired (unused).
        doc: Document whose ``sentiment`` attribute is adjusted in place.
        i: Index of the current match inside ``matches``.
        matches: Sequence of ``(match_id, start, end)`` triples.
    """
    rule_key, _start, _end = matches[i]
    rule_name = doc.vocab.strings[rule_key]
    if rule_name == "HAPPY":
        doc.sentiment += 0.1
        return
    if rule_name == "SAD":
        doc.sentiment -= 0.1

In [34]:
import spacy
nlp = spacy.load("en_core_web_lg")
matcher = Matcher(nlp.vocab)

In [35]:
matcher.add("HAPPY", label_sentiment, *pos_patterns)
matcher.add("SAD", label_sentiment, *neg_patterns)

In [36]:
matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}])

In [37]:
doc = nlp("Hello World 🥺 #KGPTalkie")

In [38]:
matches = matcher(doc)

In [39]:
matches

[(2686646543460454932, 2, 3), (16536914698459818706, 3, 5)]

In [45]:
for matcher_id, start, end in matches:
    string_id = doc.vocab.strings[matcher_id]
    span = doc[start:end]
    print(string_id, span.text)

HAPPY 🥺
HASHTAG #KGPTalkie


In [46]:
for token in doc:
    print(token.text)

Hello
World
🥺
#
KGPTalkie


## Efficient phrase matching 有效短语匹配

In [141]:
from spacy.matcher import PhraseMatcher

In [142]:
matcher = PhraseMatcher(nlp.vocab)

In [143]:
terms = ['BARAC OBAMA', 'ANGELA MERKEL', 'WASHINGTON D.C.']

In [144]:
pattern = [nlp.make_doc(text) for text in terms]

In [145]:
pattern

[BARAC OBAMA, ANGELA MERKEL, WASHINGTON D.C.]

In [148]:
matcher.add("term", None, *pattern)

In [156]:
doc = nlp("German Chancellor ANGELA MERKEL and US President BARAC OBAMA in WASHINGTON D.C.!")

In [157]:
doc

German Chancellor ANGELA MERKEL and US President BARAC OBAMA in WASHINGTON D.C.!

In [158]:
matches = matcher(doc)

In [159]:
matches

[(4519742297340331040, 2, 4),
 (4519742297340331040, 7, 9),
 (4519742297340331040, 10, 12)]

In [160]:
for matcher_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

ANGELA MERKEL
BARAC OBAMA
WASHINGTON D.C.


## Custom Rule-Based Entity Recognition 基于自定义规则的实体识别

### Entity patterns are dictionaries with two keys: "label", specifying the label to assign to the entity if the pattern is matched, and "pattern", the match pattern. The EntityRuler accepts two types of patterns: phrase patterns (a plain string) and token patterns (a list of dicts, one per token).

In [94]:
from spacy.pipeline import EntityRuler

# Load the en_core_web_lg pipeline.
# NOTE(review): this rebinds the `nlp` name used by the cells before it.
nlp = spacy.load("en_core_web_lg")

ruler = EntityRuler(nlp)

# One phrase pattern (plain string) and one token pattern (list of
# per-token dicts, here matched on the lower-cased token text).
patterns = [{"label": "ORG", "pattern": "KGP Talkie"},
            {"label":"GPE", "pattern": [{"LOWER": "san"}, {"LOWER":"francisco"}]}]

ruler.add_patterns(patterns)

# NOTE(review): no position is given, so the ruler presumably runs after the
# pipeline's existing components — confirm whether it should precede "ner".
nlp.add_pipe(ruler)

doc = nlp("KGP Talkie is opening its first big office in San Francisco.")

# List every entity found on the processed doc with its label.
for ent in doc.ents:
    print(ent.text, ent.label_)

KGP Talkie ORG
first ORDINAL
San Francisco GPE


In [88]:
# ruler.to_disk("./patterns.json")

## MATCHER

In [63]:
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.tokens import Span

nlp = English()
matcher = Matcher(nlp.vocab)

def add_event_ent(matcher, doc, i, matches):
    """Matcher callback: add match ``i`` to ``doc.ents`` as an "EVENT" entity.

    Args:
        matcher: The matcher that fired (unused).
        doc: Document whose entity tuple is extended.
        i: Index of the current match inside ``matches``.
        matches: Sequence of ``(match_id, start, end)`` triples.

    Side effects:
        Reassigns ``doc.ents`` with the new span appended and prints the
        span's text.
    """
    _, first_tok, stop_tok = matches[i]
    event_span = Span(doc, first_tok, stop_tok, label="EVENT")
    # Extend the existing entities instead of replacing them.
    doc.ents = doc.ents + (event_span,)
    print(event_span.text)

# Exact-text pattern for the four tokens "Google", "I", "/", "O".
pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
# Register the rule with add_event_ent as its callback.
matcher.add("GoogleIO", add_event_ent, pattern)
doc = nlp("This is a text about Google I/O")
# Running the matcher triggers the callback, which prints the matched text.
matches = matcher(doc)

Google I/O


In [73]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Token

# We're using a class because the component needs to be initialised with
# the shared vocab via the nlp object
class BadHTMLMerger(object):
    """Pipeline component that merges ``<br>`` / ``<br/>`` token runs and flags them.

    A class is used because the component has to be initialised with the
    shared vocab via the ``nlp`` object.  Calling the instance on a doc
    merges every matched three-token run into a single token and sets the
    custom ``token._.bad_html`` attribute to ``True`` on the matched tokens.
    """

    def __init__(self, nlp):
        """Register the ``bad_html`` token extension and build the matcher.

        Args:
            nlp: Pipeline object whose ``vocab`` the internal matcher shares.
        """
        # Register a new token extension to flag bad HTML.  The registration
        # is made on the Token class itself, so it is guarded: re-running
        # this cell or creating a second instance would otherwise fail
        # because "bad_html" is already registered.
        if not Token.has_extension("bad_html"):
            Token.set_extension("bad_html", default=False)
        self.matcher = Matcher(nlp.vocab)
        self.matcher.add(
            "BAD_HTML",
            None,
            [{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
            [{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
        )

    def __call__(self, doc):
        """Merge and flag every bad-HTML match in ``doc``; return the same doc.

        Args:
            doc: The document passed through the pipeline.

        Returns:
            The ``doc`` that was passed in, modified in place.
        """
        # Collect all matched spans first, then merge them in one
        # retokenize context.
        spans = [doc[start:end] for _match_id, start, end in self.matcher(doc)]
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
                for token in span:
                    token._.bad_html = True  # Mark token as bad HTML
        return doc

nlp = spacy.load("en_core_web_sm")
# Build the component against this pipeline and append it as the last step.
html_merger = BadHTMLMerger(nlp)
nlp.add_pipe(html_merger, last=True)  # Add component to the pipeline
doc = nlp("Hello<br>world! <br/> This is a test.")
# Show each token next to its bad_html flag.
for token in doc:
    print(token.text, token._.bad_html)

Hello False
<br> True
world False
! False
<br/> True
This False
is False
a False
test False
. False


In [76]:
import spacy
from spacy import displacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
matched_sents = []  # Collect data of matched sentences to be visualized

def collect_sents(matcher, doc, i, matches):
    """Matcher callback: store the sentence of match ``i`` for displaCy.

    Args:
        matcher: The matcher that fired (unused).
        doc: The document being matched; sliced with the match's token range.
        i: Index of the current match inside ``matches``.
        matches: Sequence of ``(match_id, start, end)`` triples.

    Side effects:
        Appends ``{"text": ..., "ents": [...]}`` to the module-level
        ``matched_sents`` list.
    """
    _, tok_start, tok_end = matches[i]
    hit = doc[tok_start:tok_end]  # Matched span
    sentence = hit.sent  # Sentence containing matched span
    # Build a mock entity in displaCy style: the span's character offsets are
    # shifted by the sentence start so they index into the sentence text.
    shift = sentence.start_char
    highlight = {
        "start": hit.start_char - shift,
        "end": hit.end_char - shift,
        "label": "MATCH",
    }
    matched_sents.append({"text": sentence.text, "ents": [highlight]})

# Token pattern: "facebook" (lower-cased text), a form of "be", any number
# of adverbs, then an adjective.
pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"},
           {"POS": "ADJ"}]
matcher.add("FacebookIs", collect_sents, pattern)  # add pattern
doc = nlp("I'd say that Facebook is evil. – Facebook is pretty cool, right?")
# Running the matcher fires collect_sents once per match, filling matched_sents.
matches = matcher(doc)
# manual=True: render from the collected dicts rather than from Doc objects.
displacy.render(matched_sents, style="ent", manual=True)

In [79]:
import spacy
from spacy import displacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_lg")
matcher = Matcher(nlp.vocab)
matched_sents = []


def collect_sents(matcher, doc, i, matches):
    """Matcher callback: store the sentence of match ``i`` for displaCy.

    Args:
        matcher: The matcher that fired (unused).
        doc: The document being matched; sliced with the match's token range.
        i: Index of the current match inside ``matches``.
        matches: Sequence of ``(match_id, start, end)`` triples.

    Side effects:
        Appends ``{"text": ..., "ents": [...]}`` to the module-level
        ``matched_sents`` list, where the single entry in ``ents`` carries
        the match's character offsets relative to the sentence text.
    """
    match_id, start, end = matches[i]
    span = doc[start:end]
    sent = span.sent

    # Both offsets must be measured from the START of the sentence, because
    # they index into sent.text.  Subtracting the sentence end would give a
    # zero or negative "end" and no usable highlight.
    sent_offset = sent.start_char
    match_ents = [{"start": span.start_char - sent_offset,
                   "end": span.end_char - sent_offset,
                   "label": "MATCH"}]

    matched_sents.append({"text": sent.text, "ents": match_ents})
    
# Token pattern: "facebook" (lower-cased text), a form of "be", any number
# of adverbs, then an adjective.
pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"}, {"POS": "ADJ"}]

matcher.add("FacebookIs", collect_sents, pattern)
doc = nlp("I'd say that Facebook is evil. – Facebook is pretty cool, right?")
# Running the matcher fires collect_sents once per match, filling matched_sents.
matches = matcher(doc)

# manual=True: render from the collected dicts rather than from Doc objects.
displacy.render(matched_sents, style="ent", manual=True)