In [None]:
import spacy
from spacy import displacy
import json
!python3 -m spacy download en
import en_core_web_sm

nlp = en_core_web_sm.load()

***
***TEST PROPOSITIONS***
***

In [None]:
# First batch of test sentences, each built around "is", "are" or "were".
# Not referenced by any other cell visible in this notebook.
# NOTE(review): "Cuiver" in the Gastropods sentence looks like a typo for
# "Cuvier" — confirm whether it is intentional before changing the input.
sentence_group_1 = ["The whale is an animal.",
"The blue whale is an animal.",
"The whale is a quadrupedal mammal.",
"The chameleon is a polychromatic arboreal insectivore.",
"Carnivorous spiders are common.",
"All cephalopods are nocturnal.",
"The majority of cephalopods are nocturnal.",
"Some cephalopods are nocturnal.",
"No cephalopods are nocturnal.",
"Most terrestrial gastropods are hermaphrodites.",
"The simplest gastropods are the limpets and abalone.",
"A cephalopod is any member of the molluscan class Cephalopoda such as a squid, octopus, cuttlefish, or nautilus.",
"Gastropods were described as gastropodes by Georges Cuiver.",
"Cephalopods are widely regarded as the most intelligent of the invertebrates."]

# Second batch of longer test sentences; the work-in-progress cell iterates
# over this list and prints the result of `mine` for each one.
# NOTE(review): "inaudble" in the sound-waves sentence looks like a typo for
# "inaudible" — confirm whether it is intentional before changing the input.
sentence_group_2 = ["The fastest biped is the ostrich.",
"Not all tetrapods are quadrupeds and not all quadrupeds are tetrapods.",
"A tripod stance is a behaviour in which quadruped animals rear up on their hind legs and use their tail to support their weight.",
"Brontosaurus (meaning \"thunder lizard\" from Greek βροντή, \"thunder\" and σαῦρος, \"lizard\") is a genus of gigantic quadruped sauropod dinosaurs.",
"Conrad Gessner (1516–1565) was a Swiss naturalist and bibliographer and one of founding figures of modern zoology.",
"Sound waves with frequency above 20 kHz are called ultrasonic waves and are inaudble to humans, but some animals, like bats and dolphins, use them.",
"A timer is a kind of clock for measuring time intervals, often with an audible alarm that the time is up.",
"There is a debate among audiophiles on whether there is an audible difference between tubes and transistors in sound equipment.",
"A Geiger counter is an instrument that measures ionizing radiation such as alpha particles, beta particles, or gamma rays. The radiation sensor produces a series of audible clicks when radiation is present.",
"Autophagy is the process that cells use to break down and recycle cellular components."]

***
***HELPERS***
***

In [None]:
def visualize(doc):
    """Draw the dependency parse of ``doc`` in the notebook via displaCy.

    Option reference: https://spacy.io/api/top-level#displacy_options
    """
    render_options = dict(compact=True, distance=100)
    displacy.render(doc, style="dep", jupyter=True, options=render_options)
    
def describe(sentence):
    """Parse ``sentence`` with ``nlp``, draw its dependency tree, then print the parse as JSON."""
    parsed = nlp(sentence)
    visualize(parsed)
    as_json = parsed.to_json()
    print(json.dumps(as_json, indent=1))
    
def first_or_none(lst):
    """Return the first element of ``lst``, or ``None`` when ``lst`` is empty."""
    if len(lst) > 0:
        return lst[0]
    return None

def flatten(l):
    """Concatenate the items of every sub-iterable in ``l`` into one list (one level deep)."""
    flat = []
    for group in l:
        flat.extend(group)
    return flat

***
***DEPENDENCY TREE PARSING METHODS***
***

In [None]:
def children_for(json_doc, head_id, recursive=False):
    """Collect the tokens whose ``head`` is ``head_id``.

    A token that is its own head is never reported as its own child. With
    ``recursive=True`` the direct children come first, followed by the
    recursively collected descendants of each child in turn.
    """
    direct = [
        token
        for token in json_doc["tokens"]
        if token["head"] == head_id and token["id"] != head_id
    ]

    if not (recursive and direct):
        return direct

    collected = list(direct)
    for child in direct:
        collected.extend(children_for(json_doc, child["id"], True))
    return collected

def find_token_by_attr(tokens, attr, value):
    """Return the first token whose ``attr`` equals ``value``, or ``None`` if none matches."""
    matching = []
    for token in tokens:
        if token[attr] == value:
            matching.append(token)
    return first_or_none(matching)

def filter_token_by_attr(tokens, attr, value):
    """Return a new list without the tokens whose ``attr`` equals ``value``.

    Despite the name this is an exclusion filter: matching tokens are dropped
    and the rest are kept in their original order.
    """
    kept = []
    for token in tokens:
        if token[attr] != value:
            kept.append(token)
    return kept

def filter_token_by_attrs(tokens, k_v_pairs):
    """Apply ``filter_token_by_attr`` once per ``(attr, value)`` pair, in order.

    The result contains only tokens that match none of the pairs. When
    ``k_v_pairs`` is empty, ``tokens`` itself is returned unchanged.
    """
    remaining = tokens
    for pair in k_v_pairs:
        attr, unwanted = pair
        remaining = filter_token_by_attr(remaining, attr, unwanted)
    return remaining

def get_verbs(json_doc, only=None, pos_tags=("VERB", "AUX")):
    """Return the verb tokens of a parsed document.

    Args:
        json_doc: dict with a "text" string and a "tokens" list of token
            dicts (the shape this notebook obtains from ``doc.to_json()``).
        only: optional collection of lower-cased surface forms. When given,
            a token is returned only if its text (as produced by
            ``value_for_token``) is in this collection.
        pos_tags: coarse part-of-speech values accepted as verbs. "AUX" is
            accepted by default because Universal Dependencies tags the
            copula/auxiliary "be" as AUX, so matching "VERB" alone can miss
            "is" / "are" / "were". Pass ``("VERB",)`` for the strict check.

    Returns:
        The matching token dicts, in the order they appear in
        ``json_doc["tokens"]``.
    """
    verbs = []
    for token in json_doc["tokens"]:
        # Tokens without a "pos" entry are simply not verbs.
        if token.get("pos") not in pos_tags:
            continue
        # Slice the surface form only when a filter was actually requested.
        if only is None or value_for_token(token, json_doc) in only:
            verbs.append(token)
    return verbs

def sorted_values(tokens, json_doc):
    """Return each token's lower-cased text, ordered by ascending token ``id``."""
    def by_id(token):
        return token["id"]

    ordered = list(tokens)
    ordered.sort(key=by_id)
    return [value_for_token(token, json_doc) for token in ordered]

def value_for_token(token, json_doc):
    """Return the token's text, lower-cased, sliced from ``json_doc["text"]`` by its start/end offsets."""
    text = json_doc["text"]
    begin, finish = token["start"], token["end"]
    return text[begin:finish].lower()

In [None]:
# primary entry point
def mine(json_doc, verbs=("is", "are", "were")):
    """Extract subject/property word lists around selected verbs of a parse.

    For every token returned by ``get_verbs(json_doc, verbs)`` that has a
    direct child with ``dep == "nsubj"``, one result entry is produced.
    Verbs without such a child are skipped.

    Args:
        json_doc: dict with "text" and "tokens" (the shape this notebook
            obtains from ``doc.to_json()``).
        verbs: lower-cased verb forms to mine around. The default keeps the
            original hard-coded list; pass e.g.
            ``("is", "are", "was", "were")`` to also cover "was".

    Returns:
        A list of dicts with two keys, both holding lower-cased token texts
        sorted by token id:
          * "word": the subject plus its direct children, excluding
            determiners and punctuation.
          * "properties": every other non-punctuation child of the verb,
            plus all of that child's descendants except determiners.
    """
    results = []

    for verb in get_verbs(json_doc, verbs):
        children = children_for(json_doc, verb["id"])
        subject = find_token_by_attr(children, "dep", "nsubj")

        # Nothing to describe without a nominal subject; skip before doing
        # any further filtering work for this verb.
        if not subject:
            continue

        # Everything hanging off the verb that is neither the subject nor
        # punctuation is treated as the start of a property phrase.
        other = filter_token_by_attrs(children, [("dep", "nsubj"), ("pos", "PUNCT")])

        # Subject phrase: only direct dependents are considered here.
        subject_deps = children_for(json_doc, subject["id"])
        subject_deps = filter_token_by_attrs(subject_deps, [("pos", "DET"), ("pos", "PUNCT")])
        word = [subject] + subject_deps

        properties = []
        for o in other:
            properties.append(o)
            # Property phrases take the full subtree; only determiners are
            # removed (punctuation inside the subtree is kept).
            other_deps = children_for(json_doc, o["id"], True)
            properties += filter_token_by_attrs(other_deps, [("pos", "DET")])

        results.append({
            "word": sorted_values(word, json_doc),
            "properties": sorted_values(properties, json_doc)
        })

    return results

***
***WORK IN PROGRESS***
***

In [None]:
# Run the miner over every sentence in sentence_group_2, printing the text
# of each sentence followed by whatever `mine` returns for it.
for sentence in sentence_group_2:
    doc = nlp((sentence))
    json_doc = doc.to_json()
    
    print()
    print(json_doc["text"])
    print()
    print(mine(json_doc))
    # Alternative pretty-printed output, currently disabled:
    # print(json.dumps(mine(json_doc), indent=1))

In [None]:
# A single longer sentence kept separately for closer inspection.
sentence_group_3 = [
    "An otolith, also called statoconium or otoconium or statolith, is a calcium carbonate structure in the saccule or utricle of the inner ear, specifically in the vestibular system of vertebrates."    
]

# Draw the dependency tree and print the JSON parse for that sentence.
describe(sentence_group_3[0])