# Dependency Matcher
---

The purpose of this experiment is to test spaCy's Dependency Matcher.

Code taken from: http://markneumann.xyz/blog/dependency_matcher/

The ingroup and outgroup sentences will be tested to determine whether links between named entities and concepts can be detected.

For example, a link between a named entity and an outgroup concept from the schema would imply an othering statement.

## Creating the Dataset

## Setup pipeline

In [1]:
%%time
import datetime
import os
import spacy

# Load the spaCy pipeline that the later cells use via `nlp`,
# then report when loading finished.
model = 'en_core_web_md'
print('loading: ', model)
nlp = spacy.load(model)

print('completed at:', datetime.datetime.now().strftime("%b %d %Y %H:%M:%S"))

loading:  en_core_web_md
completed at: May 13 2020 19:02:11
Wall time: 53.3 s


## Create a Dictionary Object of Sentences

In [2]:
%%time

import datetime
import os

def _read_concatenated(folder, filenames):
    """Return the contents of ``filenames`` (inside ``folder``) joined in list order."""
    parts = []
    for name in filenames:
        # NOTE(review): no encoding is given, so the platform default is used -- confirm
        # this matches how the speech files were saved.
        with open(os.path.join(folder, name), 'r') as handle:
            parts.append(handle.read())
    return ''.join(parts)


def _print_preview(sentences, limit=11):
    """Print the first ``limit`` sentences of a sentence dictionary as ``index > text``."""
    for i, s in enumerate(sentences.values()):
        if i >= limit:
            break
        print(i, '>', s)


filepath = 'C:/Users/Steve/OneDrive - University of Southampton/CulturalViolence/KnowledgeBases/Speeches/'

binladenpath = os.path.join(filepath, 'Osama bin Laden/')
bushpath = os.path.join(filepath, 'George Bush/')

# Speeches by George Bush, concatenated in this order.
FileList = ['20010114-Remarks at the National Day of Prayer & Remembrance Service.txt',
            '20010115-First Radio Address following 911.txt',
            '20010117-Address at Islamic Center of Washington, D.C..txt',
            '20010120-Address to Joint Session of Congress Following 911 Attacks.txt',
            '20010911-Address to the Nation.txt',
            '20011007-Operation Enduring Freedom in Afghanistan Address to the Nation.txt',
            '20011011-911 Pentagon Remembrance Address.txt',
            '20011011-Prime Time News Conference on War on Terror.txt',
            '20011026-Address on Signing the USA Patriot Act of 2001.txt',
            '20011110-First Address to the United Nations General Assembly.txt',
            '20011211-Address to Citadel Cadets.txt',
            '20011211-The World Will Always Remember 911.txt',
            '20020129-First (Official) Presidential State of the Union Address.txt',
            ]

bushraw = _read_concatenated(bushpath, FileList)

# Keep a single-file copy of the combined corpus next to the source speeches.
with open(bushpath + "bush_complete.txt", "w") as file:
    file.write(bushraw)

# Speeches by Osama bin Laden, concatenated in this order.
FileList = ['19960823-OBL Declaration.txt',
            '20011007-OBL Full Warning.txt',
            '20011109-OBL.txt',
            '20021124-OBL Letter to America.txt',
            '20041101-Al Jazeera Speech.txt'
            ]

binladenraw = _read_concatenated(binladenpath, FileList)

with open(binladenpath + "binladen_complete.txt", "w") as file:
    file.write(binladenraw)

print('length of doc: ', len(bushraw))
print(f'completed at: {datetime.datetime.now().strftime("%b %d %Y %H:%M:%S")}')

# Only the sentence boundaries are used below, so the tagger and NER components
# are switched off for this pass.
with nlp.disable_pipes('tagger', 'ner'):
    bush_doc = nlp(bushraw)
    binladen_doc = nlp(binladenraw)

# Dictionary of sentences per orator. The key is the sentence's position in the
# document; the value is the sentence text with surrounding whitespace (including
# line breaks) stripped.
bush_sentences = {
    index: sentence.text.strip() for index, sentence in enumerate(bush_doc.sents)
}
binladen_sentences = {
    index: sentence.text.strip() for index, sentence in enumerate(binladen_doc.sents)
}

_print_preview(bush_sentences)

print("-----")

_print_preview(binladen_sentences)

print('number of Bush sentences: ', len(bush_sentences))
# Fixed: this line previously reported len(bush_sentences) under the Bin Laden label.
print('number of Bin Laden sentences: ', len(binladen_sentences))
print(f'completed at: {datetime.datetime.now().strftime("%b %d %Y %H:%M:%S")}')

length of doc:  111934
completed at: May 13 2020 19:02:11
0 > We are here in the middle hour of our grief.
1 > So many have suffered so great a loss, and today we express our nation's sorrow.
2 > We come before God to pray for the missing and the dead, and for those who loved them.
3 > On Tuesday, our country was attacked with deliberate and massive cruelty.
4 > We have seen the images of fire and ashes and bent steel.
5 > Now come the names, the list of casualties we are only beginning to read:
6 > They are the names of men and women who began their day at a desk or in an airport, busy with life.
7 > They are the names of people who faced death and in their last moments called home to say, be brave and I love you.
8 > They are the names of passengers who defied their murderers and prevented the murder of others on the ground.
9 > They are the names of men and women who wore the uniform of the United States and died at their posts.
10 > They are the names of rescuers -- the ones whom d

## Setup Dependency Matcher

In [3]:
%%time

## Modify the pipe

from typing import List
from collections import defaultdict
from spacy.pipeline import merge_entities

# Add the merge_entities component only when it is missing, so that re-running
# this cell does not try to register it a second time.
if 'merge_entities' not in nlp.pipe_names:
    nlp.add_pipe(merge_entities)

print(f'completed at {datetime.datetime.now()!s}')

completed at 2020-05-13 19:02:18.358877
Wall time: 850 µs


In [4]:
%%time

from spacy import displacy
from spacy.tokens import Doc


def visualise_doc(doc: Doc):
    """
    Render a spaCy Doc twice in the notebook: first with the "dep" style
    (dependency tree), then with the "ent" style (entities).
    """
    for style in ("dep", "ent"):
        # A fresh options dict is built for every call, as two separate
        # literal calls would do.
        displacy.render(doc, style=style, options={"distance": 120}, jupyter=True)


def visualise_subtrees(doc: Doc, subtrees: List[int]):
    """
    Render matched dependency subtrees of a spaCy Doc with displacy.

    For every subtree, only the arcs whose token AND head both belong to the
    subtree are drawn; all words of the document are still shown.

    # Parameters
    doc: Doc
        The parsed document that the token indices refer to.
    subtrees: List[int] or List[List[int]]
        Either a single subtree (a flat list of token indices) or a list of
        such subtrees. An empty list renders nothing.
    """
    # Nothing to draw; also avoids indexing into an empty list below.
    if not subtrees:
        return

    words = [{"text": t.text, "tag": t.pos_} for t in doc]

    # Accept a single flat subtree by wrapping it into a one-element list.
    if not isinstance(subtrees[0], list):
        subtrees = [subtrees]

    for subtree in subtrees:
        tree_indices = set(subtree)
        arcs = []

        for index in subtree:
            token = doc[index]
            head = token.head
            # Skip tokens that are their own head and arcs that leave the subtree.
            if head.i == token.i or head.i not in tree_indices:
                continue

            # The manual arc format is given start <= end, with "dir" recording
            # on which side of the head the token sits.
            arcs.append(
                {
                    "start": min(token.i, head.i),
                    "end": max(token.i, head.i),
                    "label": token.dep_,
                    "dir": "left" if token.i < head.i else "right",
                }
            )

        print("Subtree: ", subtree)
        # Print every node of the subtree, whatever its size (previously this
        # assumed exactly three nodes).
        print(*(doc[index] for index in subtree))
        displacy.render(
            {"words": words, "arcs": arcs},
            style="dep",
            options={"distance": 120},
            manual=True,
            jupyter=True
        )
        
# Report when the helper definitions above were (re)loaded.
print(f'completed at {datetime.datetime.now()!s}')

completed at 2020-05-13 19:02:18.402616
Wall time: 2.99 ms


In [5]:
%%time

def check_for_non_trees(dependency_triples: List[List[str]]):
    """
    Validate a list of dependency triples and index them by parent.

    The input is rejected, i.e. ``(None, None)`` is returned, when:

    1. a triple is self referential (parent == child);
    2. no node is free of incoming edges, so there is no root to start from
       (this includes an empty list and a pure cycle);
    3. walking down from the root reaches some node more than once
       (a loop, or a node with two parents).

    NOTE: inputs with MORE than one root are not rejected. The first root in
    input order is used, and triples that cannot be reached from it are not
    validated.

    # Parameters
    dependency_triples: List[List[str]]
        A list of [parent, relation, child] triples, which together
        form a tree that we would like to match on.

    # Returns
    root: str
        The root of the subtree, or None if the input was rejected.
    parent_to_children: Dict[str, List[Tuple[str, str]]]
        A dictionary mapping parents to a list of their children,
        where the child is represented as a (relation, child) tuple,
        or None if the input was rejected.
    """
    parent_to_children = defaultdict(list)
    has_incoming_edges = set()
    for (parent, rel, child) in dependency_triples:
        if parent == child:
            return None, None
        has_incoming_edges.add(child)
        parent_to_children[parent].append((rel, child))

    # A root is a node without incoming edges. Every child has one by
    # construction, so only parents can qualify. Taking the first parent in
    # insertion order makes the choice deterministic when several roots exist.
    root = next(
        (node for node in parent_to_children if node not in has_incoming_edges),
        None,
    )
    if root is None:
        # Empty input, or every node sits on a cycle: there is nothing to walk.
        return None, None

    # Walk the edges reachable from the root. In a tree every node is reached
    # exactly once, so meeting a node again means a loop or a second parent.
    visited = {root}
    pending = [root]
    while pending:
        node = pending.pop()
        # .get() avoids creating empty entries for leaf nodes in the defaultdict.
        for (_, child) in parent_to_children.get(node, ()):
            if child in visited:
                return None, None
            visited.add(child)
            pending.append(child)

    return root, parent_to_children


def construct_pattern(dependency_triples: List[List[str]]):
    """
    Turn a tree of dependency triples into a pattern for the spacy
    DependencyMatcher.

    Every rule has the form "Parent --rel--> Child". Nodes are emitted in
    depth-first (pre-order) order starting at the root, so a parent is always
    listed before any of its children.

    # Parameters
    dependency_triples: List[List[str]]
        A list of [parent, relation, child] triples, which together
        form a tree that we would like to match on.

    # Returns
    pattern:
        A json structure defining the match for the given tree, which
        can be passed to the spacy DependencyMatcher, or None when
        check_for_non_trees rejects the triples.

    """
    root, parent_to_children = check_for_non_trees(dependency_triples)
    if root is None:
        return None

    # Dummy names used in the triples to mean "a token that is part of an
    # entity" (or the whole entity, if the merge_entities pipe has been added
    # to the pipeline before running the matcher).
    entity_placeholders = {"START_ENTITY", "END_ENTITY"}

    def build_node(parent: str, rel: str, child: str):
        # The relation is always matched exactly.
        token_pattern = {"DEP": rel}
        if child in entity_placeholders:
            # Any non-empty entity type.
            token_pattern["ENT_TYPE"] = {"NOT_IN": [""]}
        else:
            # A concrete word: match its exact text.
            token_pattern["ORTH"] = child
        return {
            # The edge that connects this child to its parent.
            "SPEC": {"NODE_NAME": child, "NBOR_RELOP": ">", "NBOR_NAME": parent},
            "PATTERN": token_pattern,
        }

    def descend(parent: str):
        # Pre-order walk: a child's node first, then that child's own subtree.
        for (rel, child) in parent_to_children[parent]:
            yield build_node(parent, rel, child)
            yield from descend(child)

    root_node = {"SPEC": {"NODE_NAME": root}, "PATTERN": {"ORTH": root}}
    return [root_node, *descend(root)]

# Report when the pattern-building helpers above were (re)loaded.
print(f'completed at {datetime.datetime.now()!s}')

completed at 2020-05-13 19:02:18.439518
Wall time: 11 ms


In [6]:
%%time

from spacy.matcher import DependencyMatcher

# Smoke test of the matcher: "<entity> founded <entity>".
example = [["founded", "nsubj", "START_ENTITY"], ["founded", "dobj", "END_ENTITY"]]

pattern = construct_pattern(example)
matcher = DependencyMatcher(nlp.vocab)
matcher.add("pattern1", None, pattern)

docs = [
    "Bill Gates founded Microsoft.",
    "Bill Gates, the Seattle Seahawks owner, founded Microsoft.",
    "The evidence we have gathered all points to a collection of loosely affiliated terrorist organizations known as Al Qaeda",
    "North Korea is a regime arming with missiles and weapons of mass destruction, while starving its citizens",
    "Terrorist groups like al Qaeda depend upon the aid or indifference of governments."
]

for d in docs:
    doc = nlp(d)
    displacy.render(doc, style="dep", options={"distance": 120}, jupyter=True)

    # Each matcher result is indexed as (pattern key, list of matched subtrees).
    # Show the first subtree when there is one. The explicit emptiness check
    # replaces a bare `except: pass`, which also hid genuine errors and
    # keyboard interrupts.
    for match_id, subtrees in matcher(doc):
        if subtrees:
            visualise_subtrees(doc, subtrees[0])

print(f'completed at {str(datetime.datetime.now())}')

Subtree:  [1, 0, 2]
founded Bill Gates Microsoft


Subtree:  [5, 0, 6]
founded Bill Gates Microsoft


completed at 2020-05-13 19:02:42.967465
Wall time: 1.27 s


In [13]:
%%time

from spacy.matcher import DependencyMatcher
import json

filepath = 'C:/Users/Steve/OneDrive - University of Southampton/CulturalViolence/KnowledgeBases/Experiment 7 - Dependency Matcher/'

ingroupfile = "bush_ingroup_sents.json"
outgroupfile = "bush_outgroup_sents.json"

# NOTE(review): `sentences` is loaded here, but the matching loop below runs over
# `bush_sentences` (every Bush sentence) -- confirm which set is intended.
with open(os.path.join(filepath, outgroupfile), 'r') as fp:
    sentences = json.load(fp)

# Combinations that have produced matches so far:
# - term = "regime", link = "compound"

term = input("term:").strip()
link = "compound"

# One rule per tree to search for: an entity attached to `term` through `link`.
# Further rules (e.g. with the "amod" relation) can be appended to this list.
rules = [
    [[term, link, "START_ENTITY"], [term, link, "END_ENTITY"]]
]

matcher = DependencyMatcher(nlp.vocab)

for i, rule in enumerate(rules):
    pattern = construct_pattern(rule)
    if pattern is None:
        # construct_pattern returns None when the triples are rejected, e.g.
        # when the term equals one of the entity placeholders.
        print(f'skipping rule {i}: triples do not form a valid tree: {rule}')
        continue
    for p in pattern:
        print(p)
    matcher.add(f'pattern{i}', None, pattern)

for sent in bush_sentences.values():
    doc = nlp(sent)
    # Each matcher result is indexed as (pattern key, list of matched subtrees).
    # Show the first subtree of every pattern that matched. The explicit
    # emptiness check replaces a bare `except: pass`, which also hid genuine
    # errors and made the long loop impossible to interrupt.
    for match_id, subtrees in matcher(doc):
        if subtrees:
            visualise_subtrees(doc, subtrees[0])


print(f'completed at {str(datetime.datetime.now())}')

term: terrorist


{'SPEC': {'NODE_NAME': 'terrorist'}, 'PATTERN': {'ORTH': 'terrorist'}}
{'SPEC': {'NODE_NAME': 'START_ENTITY', 'NBOR_RELOP': '>', 'NBOR_NAME': 'terrorist'}, 'PATTERN': {'DEP': 'compound', 'ENT_TYPE': {'NOT_IN': ['']}}}
{'SPEC': {'NODE_NAME': 'END_ENTITY', 'NBOR_RELOP': '>', 'NBOR_NAME': 'terrorist'}, 'PATTERN': {'DEP': 'compound', 'ENT_TYPE': {'NOT_IN': ['']}}}
completed at 2020-05-13 18:49:53.514645
Wall time: 31.7 s


In [47]:
%%time

import json

# Render the dependency tree and entities of every stored outgroup sentence.
filepath = 'C:/Users/Steve/OneDrive - University of Southampton/CulturalViolence/KnowledgeBases/Experiment 7 - Dependency Matcher/'

ingroupfile = "bush_ingroup_sents.json"
outgroupfile = "bush_outgroup_sents.json"

with open(os.path.join(filepath, outgroupfile), 'r') as fp:
    sentences = json.load(fp)

# Only the values (the sentence texts) are used.
for sent in sentences.values():
    parsed = nlp(sent)
    visualise_doc(parsed)

Wall time: 1.2 s
