In [1]:
%time
# Sample sentences covering the different Hearst-style cue phrases.
test_data = [
    # "such as" / "such ... as"
    "Forty-four percent of patients with uveitis had one or more identifiable signs or symptoms, such as red eye, ocular pain, visual acuity, or photophobia, in order of decreasing frequency.",
    "There are works by such authors as Herrick, Goldsmith, and Shakespeare.",
    # "or other"
    "There were bruises, lacerations, or other injuries were not prevalent.",
    # "including" / "especially"
    "common law countries, including Canada, Australia, and England enjoy toast.",
    "Many countries, especially France, England and Spain also enjoy toast.",
    "There are such benefits as postharvest losses reduction, food increase and soil fertility improvement.",
    # abbreviations, with and without parentheses
    "Fruits, i.e. , apples, bananas, oranges and peaches.",
    "Fruits, e.g. apples, bananas, oranges and peaches.",
    "Fruits (e.g. apples, bananas, oranges and peaches.)",
    "Fruits (i.e. apples, bananas, oranges and peaches.)",
    # "for example" / "which may include"
    "Fruits, for example apples, bananas, oranges and peaches.",
    "Fruits, which may include apples, bananas, oranges and peaches.",
]

Wall time: 0 ns


In [2]:
%time

import spacy
from spacy.matcher import Matcher

# Load the pretrained `en_core_web_md` pipeline.
SPACY_MODEL = "en_core_web_md"
nlp = spacy.load(SPACY_MODEL)

# Append the built-in `merge_noun_chunks` component so that each noun phrase
# reaches the matcher as a single token (spaCy v2-style create_pipe/add_pipe).
merge_nps = nlp.create_pipe("merge_noun_chunks")
nlp.add_pipe(merge_nps)

In [42]:
%time


# Token-pattern matcher bound to the pipeline's vocabulary; `validate=True`
# asks spaCy to validate every pattern that gets added.
matcher = Matcher(nlp.vocab, validate=True)


# Hearst patterns take the form of (NP <predicate> (NP (and | or)?)+)
#
# Every pattern shares one token layout and differs only in the predicate
# lemmas, roughly the regex  NP (, )?<predicate> (, )?NP .
# Only the first hyponym is part of the pattern; coordinated hyponyms
# ("..., X, Y and Z") are not matched here.

def _hearst_pattern(label, predicate_lemmas):
    """Build one entry: NOUN, optional punct, predicate lemmas, optional punct, optional NOUN/PROPN."""
    token_specs = [
        {"POS": "NOUN"},                # hypernym
        {"IS_PUNCT": True, "OP": "?"},  # optional punctuation before the predicate
    ]
    token_specs += [{"LEMMA": lemma} for lemma in predicate_lemmas]
    token_specs += [
        {"IS_PUNCT": True, "OP": "?"},                  # optional punctuation after the predicate
        {"POS": {"IN": ["NOUN", "PROPN"]}, "OP": "?"},  # optional first hyponym
    ]
    return {"label": label, "pattern": token_specs}


hearst_patterns = [
    _hearst_pattern(label, lemmas)
    for label, lemmas in (
        ("such_as", ("such", "as")),
        ("known_as", ("know", "as")),
        ("including", ("include",)),
        ("like", ("like",)),
    )
]


# Register every Hearst pattern under its label (no on_match callback).
for pattern in hearst_patterns:
    label, token_specs = pattern["label"], pattern["pattern"]
    matcher.add(label, None, token_specs)
    
# Raw sentences to run through the pipeline, one per Hearst cue phrase.
docs = [
    # such as
    "Forty-four percent of patients with uveitis had one or more identifiable signs or symptoms, such as red eye, ocular pain, visual acuity, or photophobia, in order of decreasing frequency.",
    # including
    "Other close friends, including Canada, Australia, Germany and France, have pledged forces as the operation unfolds.",
    # known as
    "The evidence we have gathered all points to a collection of loosely affiliated terrorist organizations known as al Qaeda.",
    # like
    "Terrorist groups like al Qaeda depend upon the aid or indifference of governments.",
]

# Extract (hypernym, hyponym) pairs from each sentence and print them.

def extract_hearst_pairs(doc, matches):
    """Return ``(hypernym, hyponym)`` token pairs for the first match in ``doc``.

    Parameters:
        doc: the parsed document that the matcher was run on.
        matches: the ``(match_id, start, end)`` tuples returned by ``matcher(doc)``.

    Returns:
        A list of ``(hypernym, hyponym)`` tuples in this order: the token
        right after the first match, that token's conjuncts, then any other
        NOUN/PROPN token with a ``pobj``/``conj`` dependency inside the span
        running from the first match's start to the last match's end.
        Each hyponym appears once. The list is empty when ``matches`` is empty.
    """
    if not matches:
        return []

    _, start, end = matches[0]
    hypernym = doc[start]  # every pattern begins with the hypernym noun

    pairs = []
    seen = set()  # indices of tokens already paired, so none is reported twice

    def add(token):
        if token.i not in seen:
            seen.add(token.i)
            pairs.append((hypernym, token))

    # `end` is exclusive, so doc[end] is the token right after the match.
    # NOTE(review): this assumes matches[0] is the variant that stops before
    # the optional trailing noun -- confirm against the matcher's ordering.
    # The bounds check avoids an IndexError when the match ends the document.
    if end < len(doc):
        first_hyponym = doc[end]
        add(first_hyponym)
        for conjunct in first_hyponym.conjuncts:
            add(conjunct)

    # Second pass over the matched span; the first token is the hypernym itself.
    span = doc[start:matches[-1][2]]
    for token in span[1:]:
        if token.pos_ in ["NOUN", "PROPN"] and token.dep_ in ["pobj", "conj"]:
            add(token)

    return pairs


for text in docs:
    doc = nlp(text)         # parse the sentence
    matches = matcher(doc)  # find Hearst patterns in it
    pairs = extract_hearst_pairs(doc, matches)

    if matches:
        print('-----')
        print(doc.text)
        print(pairs)
    else:
        print("no matches found")

Wall time: 0 ns
-----
Forty-four percent of patients with uveitis had one or more identifiable signs or symptoms, such as red eye, ocular pain, visual acuity, or photophobia, in order of decreasing frequency.
[(symptoms, red eye), (symptoms, ocular pain), (symptoms, visual acuity), (symptoms, photophobia)]
-----
Other close friends, including Canada, Australia, Germany and France, have pledged forces as the operation unfolds.
[(Other close friends, Canada), (Other close friends, Australia), (Other close friends, Germany), (Other close friends, France)]
-----
The evidence we have gathered all points to a collection of loosely affiliated terrorist organizations known as al Qaeda.
[(loosely affiliated terrorist organizations, al Qaeda)]
-----
Terrorist groups like al Qaeda depend upon the aid or indifference of governments.
[(Terrorist groups, al Qaeda)]
