In [1]:
import spacy
from spacy.matcher import DependencyMatcher
from pathlib import Path

nlp = spacy.load("en_core_web_sm")


def doc_dep_graph(doc):
    ''' Convert a parsed doc into displaCy's manual "dep" input format.

    Each token becomes one entry in "words". Its "tag" is the coarse POS
    for tokens without an entity type (``ent_type == 0``); otherwise the
    entity label and POS are combined as ``_::LABEL (POS)::_``.

    Each non-punctuation token that is not its own head contributes one
    arc linking it to its head. "start" is always the smaller token index
    and "end" the larger; "dir" is "left" when the token precedes its head
    and "right" when it follows it.

    Returns a dict ``{"words": [...], "arcs": [...]}``.
    '''
    nodes, edges = [], []
    for token in doc:
        if token.ent_type != 0:
            # Entity token: surface the entity label next to the POS tag.
            node_tag = "".join(("_::", token.ent_type_, " (", token.pos_, ")::_"))
        else:
            node_tag = token.pos_
        nodes.append({"text": token.text, "tag": node_tag})

        # Punctuation attachments are left out of the drawing.
        if token.dep_ == 'punct':
            continue

        child, parent = token.i, token.head.i
        if child == parent:
            # A token that is its own head has no incoming arc.
            continue
        edges.append({
            "start": min(child, parent),
            "end": max(child, parent),
            "label": token.dep_,
            "dir": "left" if child < parent else "right"
        })
    return {"words": nodes, "arcs": edges}

def output_to_svg(filename, dep):
    '''Render a manual-format dependency graph and save it as an SVG file.

    filename: output path WITHOUT extension; ".svg" is appended.
    dep: dict in displaCy's manual "dep" format (``{"words": ..., "arcs": ...}``),
         e.g. the value returned by doc_dep_graph / predicate_matching.

    The file is written as UTF-8. Returns None.
    '''
    svg = spacy.displacy.render(dep, style="dep",
                                jupyter=False, manual=True)
    # write_text opens, writes and closes the file in one call, so the
    # handle is not left open the way a bare .open(...).write(...) would.
    Path(filename+".svg").write_text(svg, encoding="utf-8")


def get_dep_matcher(nlp, patterns, pattern_names=None):
    ''' Build a DependencyMatcher on nlp.vocab and register every pattern.

    patterns: sized iterable of dependency patterns.
    pattern_names: optional iterable of names, one per pattern, in the same
        order. When omitted, names "pattern0", "pattern1", ... are generated.

    Each (name, pattern) pair is printed as it is registered.
    Returns the populated matcher.
    '''
    if pattern_names is not None:
        # Materialise so views such as dict.keys() can be indexed.
        names = list(pattern_names)
    else:
        names = ["pattern"+str(idx) for idx in range(len(patterns))]

    dep_matcher = DependencyMatcher(nlp.vocab)
    for idx, pattern in enumerate(patterns):
        name = names[idx]
        print(name, pattern)
        # NOTE(review): the None is presumably the on_match callback slot of
        # the spaCy v2 add(key, on_match, *patterns) signature - confirm
        # against the installed spaCy version.
        dep_matcher.add(name, None, pattern)
    return dep_matcher

def predicate_matching(doc, matcher, source_target_at_pattern_end=True):
    ''' Match the patterns to a doc, returns dep graph with edges that match.

    doc: the parsed document; indexable by token position.
    matcher: callable applied to doc. Each result is expected to be a pair
        whose second item is a list of token-index lists, one list per
        matched sub-tree, ordered like the nodes of the pattern.
    source_target_at_pattern_end:
        True  -> the edge joins the FIRST (root) and LAST nodes of the match.
        False -> the edge joins the LAST TWO nodes of the match.

    Returns ``{"words": [...], "arcs": [...]}`` in displaCy's manual "dep"
    format. "words" has one entry per token; its "tag" is the integer
    ``ent_type`` id (0 for tokens with no entity type). Every arc's
    "start"/"end" are token positions, so they index straight into "words".
    Edges whose two end tokens share the same text are dropped.
    '''
    words = [{"text": tok.text, "tag": tok.ent_type} for tok in doc]
    arcs = []
    for match in matcher(doc):
        print(match)
        for match_inds in match[1]:
            print([doc[mi] for mi in match_inds])
            if source_target_at_pattern_end:
                print("Getting SOURCE+TARGET from root and final node in pattern")
                start, end = match_inds[0], match_inds[-1]
            else:
                print("Getting SOURCE+TARGET from final two nodes in pattern")
                start, end = match_inds[-2], match_inds[-1]

            source_text, target_text = doc[start].text, doc[end].text
            if source_text == target_text:
                # Same surface form on both ends: treated as a self-loop.
                continue

            # Arc endpoints are the token positions themselves. Looking them
            # up through a text -> index map breaks as soon as a token text
            # repeats in the sentence (indices drift away from "words").
            if end > start:
                arcs.append({
                    "start": start,
                    "end": end,
                    "link": source_text+" -> "+target_text,
                    "label": '',
                    "dir": "right"
                })
            else:
                arcs.append({
                    "start": end,
                    "end": start,
                    "link": target_text+" -> "+source_text,
                    "label": '',
                    "dir": "left"
                })
    return {"words": words, "arcs": arcs}



# Example sentence: parse it, build the manual-format dependency graph and
# render it inline. `doc` is reused by the pattern-matching cell below.
text = "The evidence we have all points to a loosely affiliated terrorist organisation known as al Qaeda"
doc = nlp(text)
dep_graph = doc_dep_graph(doc)
spacy.displacy.render(dep_graph, style="dep", jupyter=True, manual=True)
# Uncomment to also save the rendering to disk:
# output_to_svg('./graphs/sentA_dep', dep_graph)


In [2]:
# Maps pattern name -> dependency pattern (a list of node specs).
patterns = {}

# Disabled draft pattern, kept for reference:
# patterns.update({"X->Y": [
#     {"PATTERN": {
#         "ENT_TYPE": {"NOT_IN": [""]}
#     }, "SPEC": {
#         "NODE_NAME": "START_ENTITY"
#     },
#     }, {"PATTERN": {
#         "POS": {"IN": ["VERB"]},
#     }, "SPEC": {
#         "NBOR_NAME": "START_ENTITY", "NBOR_RELOP": ">", "NODE_NAME": "known"}
#     },
# ]})


# "KnownAs": a four-node chain
#   START_ENTITY (NOUN/PROPN) > known (VERB) > as (SCONJ) > END_ENTITY (NOUN/PROPN)
# Each node after the first is tied to the previously named node via ">".
# NOTE(review): ">" presumably means "the named neighbour is the immediate
# head of this node" (spaCy v2 DependencyMatcher) - confirm against the docs.
patterns.update({"KnownAs": [
    {"PATTERN": {
        "POS": {"IN": ["NOUN","PROPN"]}
    }, "SPEC": {"NODE_NAME": "START_ENTITY"}
    }, {"PATTERN": {
        "POS": {"IN": ["VERB"]},
    }, "SPEC": {"NBOR_NAME": "START_ENTITY", "NBOR_RELOP": ">", "NODE_NAME": "known"}
    }, {"PATTERN": {
        "POS": {"IN": ["SCONJ"]},
    }, "SPEC": {"NBOR_NAME": "known", "NBOR_RELOP": ">", "NODE_NAME": "as"}
    }, {"PATTERN": {
        "POS": {"IN": ["NOUN","PROPN"]},
    }, "SPEC": {"NBOR_NAME": "as", "NBOR_RELOP": ">", "NODE_NAME": "END_ENTITY"}
    }
]})



matcher = get_dep_matcher(nlp, patterns.values(), patterns.keys())

### One of the following two lines should find the pattern.
# If source_target_at_pattern_end=True, the edge is drawn between the
# first (root) and final nodes of the pattern (pattern[0]->pattern[-1]).
# If source_target_at_pattern_end=False, the edge is drawn between the
# two nodes at the end of the pattern (pattern[-2]->pattern[-1]).
### This is so the matcher knows which nodes in the pattern to draw the edge between
### and to overcome the requirement that the first node in the defined pattern must
### be the root in the sub-tree.

matched_edges = predicate_matching(doc, matcher, source_target_at_pattern_end=True)
# matched_edges = predicate_matching(doc, matcher, source_target_at_pattern_end=False)

spacy.displacy.render(matched_edges, style="dep", jupyter=True, manual=True)
# output_to_svg('./graphs/sentA_ascope', matched_edges)
matched_edges




KnownAs [{'PATTERN': {'POS': {'IN': ['NOUN', 'PROPN']}}, 'SPEC': {'NODE_NAME': 'START_ENTITY'}}, {'PATTERN': {'POS': {'IN': ['VERB']}}, 'SPEC': {'NBOR_NAME': 'START_ENTITY', 'NBOR_RELOP': '>', 'NODE_NAME': 'known'}}, {'PATTERN': {'POS': {'IN': ['SCONJ']}}, 'SPEC': {'NBOR_NAME': 'known', 'NBOR_RELOP': '>', 'NODE_NAME': 'as'}}, {'PATTERN': {'POS': {'IN': ['NOUN', 'PROPN']}}, 'SPEC': {'NBOR_NAME': 'as', 'NBOR_RELOP': '>', 'NODE_NAME': 'END_ENTITY'}}]
(10646777160426594090, [[11, 12, 13, 15]])
[organisation, known, as, Qaeda]
Getting SOURCE+TARGET from root and final node in pattern


{'words': [{'text': 'The', 'tag': 0},
  {'text': 'evidence', 'tag': 0},
  {'text': 'we', 'tag': 0},
  {'text': 'have', 'tag': 0},
  {'text': 'all', 'tag': 0},
  {'text': 'points', 'tag': 0},
  {'text': 'to', 'tag': 0},
  {'text': 'a', 'tag': 0},
  {'text': 'loosely', 'tag': 0},
  {'text': 'affiliated', 'tag': 0},
  {'text': 'terrorist', 'tag': 0},
  {'text': 'organisation', 'tag': 0},
  {'text': 'known', 'tag': 0},
  {'text': 'as', 'tag': 0},
  {'text': 'al', 'tag': 383},
  {'text': 'Qaeda', 'tag': 383}],
 'arcs': [{'start': 11,
   'end': 15,
   'link': 'organisation -> Qaeda',
   'label': '',
   'dir': 'right'}]}