In [1]:
#!pip install matplotlib

In [2]:
import json
import pprint
import matplotlib.pyplot as plt

from anytree import PreOrderIter, RenderTree
from anytree.search import findall
from comorbid_graphs import ComorbidGraph, ComorbidGraphNode

### Creating graph 

In [3]:
# Load the symptom-tree fixture and build the graph from it.
# The encoding is pinned so the JSON is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open('../tests/fixtures/symp_tree.json', encoding='utf-8') as f:
    data = json.load(f)
cg = ComorbidGraph(data, ComorbidGraphNode, assign_ids=True)

In [4]:
# Show only the first 500 characters of the rendered tree to keep the output short.
tree_preview = cg.pretty_print_tree()[:500]
print(tree_preview)
print('...')

Source
├── symptom
│   ├── urinary system symptom
│   ├── general symptom
│   ├── respiratory system and chest symptom
│   ├── neurological and physiological symptom
│   ├── musculoskeletal system symptom
│   ├── nervous system symptom
│   ├── abdominal symptom
│   ├── head and neck symptom
│   ├── skin and integumentary tissue symptom
│   ├── hemic and immune system symptom
│   ├── digestive system symptom
│   ├── cardiovascular system symptom
│   ├── nutrition, metabolism, and development symp
...


## Searching
Searching should be extensible to allow SQL search over data that is too large for knowledge graphs (KGs).
For this we use a multi-step search:
- search for ancestors, parents first
  - get the names of these, include-exclude
  - merge all of the nodes, merging with include-exclude
- for docs in the resulting KnowledgeGraph
  - filter by type
  - filter by body length
- search in the body

### Notes
**1. Networkx Graph**    
First I tried to turn the ComorbidGraph into a networkx graph and then filter out what I needed - [docs](https://networkx.org/documentation/stable/reference/generated/networkx.classes.function.subgraph_view.html#networkx.classes.function.subgraph_view)

**2. Anytree Rocks!**   
Then I realized I could keep it simple with the right kind of iteration, which is much better in terms of code complexity. <3

### 1. Subgraph
Filter the `ancestors` and `parents`, and use `inc-exc` to zoom in.

In [17]:
class SearchableMixin(object):
    """Adds name-based include/exclude subtree search to a graph class.

    The host class must expose the root node of its anytree tree as
    ``self.tree``.
    """

    def get_node_list(self, list_words):
        """Return every node of ``self.tree`` whose name contains any given word.

        Args:
            list_words: iterable of substrings looked up in ``node.name`` with
                the ``in`` operator. A single string is treated as one word
                and ``None`` as "no words".

        Returns:
            list of the matching nodes, in the order ``findall`` yields them.
        """
        if list_words is None:
            words = []
        elif isinstance(list_words, str):
            # A bare string would otherwise be matched character by character.
            words = [list_words]
        else:
            # Materialise once: the filter below runs for every node, so a
            # one-shot iterator would be exhausted after the first node.
            words = list(list_words)
        return list(findall(
            self.tree,
            filter_=lambda node: any(word in node.name for word in words)
        ))

    def get_results(self, inc_list, exc_list, src_name="result"):
        """Build a result tree from the nodes selected by include/exclude words.

        For each node, the deepest node on its path (the node itself or one of
        its ancestors) that matches ``inc_list`` is compared with the deepest
        one that matches ``exc_list``. The node is kept only when the include
        match is strictly deeper; otherwise it is detached from its parent.

        Warning:
            Nodes of ``self.tree`` are re-parented and detached in place, so
            the searched tree is modified by this call.

        Args:
            inc_list: words selecting the nodes (and their subtrees) to keep.
            exc_list: words selecting the nodes (and their subtrees) to drop.
            src_name: name given to the root node of the returned tree.

        Returns:
            A new ``ComorbidGraphNode`` named ``src_name`` with the kept
            nodes attached beneath it.
        """
        result_node = ComorbidGraphNode(name=src_name)

        exclude = self.get_node_list(list_words=exc_list)
        include = self.get_node_list(list_words=inc_list)

        def add_node(node):
            """Attach a kept node to the result tree, preserving hierarchy."""
            if not node.parent:
                # Root of its tree: hang it directly under the result root.
                node.old_parent = None
                node.parent = result_node
            elif node.parent.name not in [i.name for i in include]:
                # The parent is not a kept node, so this node becomes a
                # top-level entry of the result; remember where it came from.
                node.old_parent = node.parent
                node.parent = result_node
                # node.parent is now result_node: drop same-named entries
                # already there and put this node last.
                node.parent.children = list(
                    [i for i in node.parent.children if i.name != node.name]
                ) + [node]
            else:
                # The parent is already kept, so the node stays where it is.
                # Registering it as an include makes its own children take
                # this branch too, instead of being hoisted to the result
                # root, which keeps the original nesting intact.
                include.append(node)

        def remove_node(node):
            """Detach a dropped node (matched by name) from its parent."""
            if node.parent:
                node.parent.children = list(
                    [i for i in node.parent.children if i.name != node.name]
                )

        def deepest_level(matches):
            """Depth of the deepest node in ``matches``, or -1 when empty."""
            if matches:
                return max([len(i.ancestors) for i in matches])
            return -1

        for node in PreOrderIter(self.tree):
            # The node together with all of its current ancestors.
            lineage = set(list(node.ancestors) + [node])

            # `include` grows while iterating, so its set is rebuilt each time.
            inc_max_level = deepest_level(set(include) & lineage)
            exc_max_level = deepest_level(set(exclude) & lineage)

            if inc_max_level > exc_max_level:
                add_node(node)
            else:
                remove_node(node)
        return result_node

In [21]:
class ComorbidGraphSearchable(ComorbidGraph, SearchableMixin):
    """A ``ComorbidGraph`` extended with the search methods of ``SearchableMixin``."""

In [28]:
# Search terms: keep nodes matching "nervous system", drop those matching "pain".
include_terms = ['nervous system']
exclude_terms = ['pain']

# A fresh graph is built from `data` because get_results re-parents the
# nodes of the tree it searches.
search_cg = ComorbidGraphSearchable(data, ComorbidGraphNode, assign_ids=True)
res = search_cg.get_results(include_terms, exclude_terms)

# Render the filtered tree, one node per line, with its branch prefix.
for pre, fill, node in RenderTree(res):
    print("{}{}".format(pre, node.name))

result
└── nervous system symptom
    ├── coordination symptom
    │   ├── lack of coordination
    │   │   └── incoordination
    │   └── impaired coordination
    ├── sensation perception
    │   ├── hypoalgesia
    │   ├── hyperalgesia
    │   ├── hypoesthesia
    │   └── hyperesthesia
    ├── aphasia
    │   ├── alexia
    │   ├── expressive aphasia
    │   ├── inability to comprehend speech
    │   │   └── receptive aphasia
    │   ├── inability to form words
    │   ├── poor enunciation
    │   ├── inability to speak
    │   ├── agraphia
    │   └── anomia
    ├── paralysis
    │   ├── transient paralysis of limb
    │   ├── motor paralysis
    │   ├── spastic paralysis
    │   ├── paraplegia
    │   ├── lip paralysis
    │   ├── throat muscle paralysis
    │   ├── ophthalmoplegia
    │   ├── proximal paralysis of arm and leg
    │   ├── respiratory paralysis
    │   ├── hind limb paralysis
    │   │   └── partial hind limb paralysis
    │   ├── pareses
    │   ├── extraocular mu

### 2. Filtering
Include or exclude nodes based on their properties - `name`, `type`, `body-length`.


### 3. Content
Filter if a certain word or combination of words is found in the document.


## Ordering Results
There should be two options - first, the graph properties; second, our simple algorithm based on a combination of scores - as found in `very-comorbid`.

## Merging all
Create the search language by allowing all entries.   
Control for input irregularities and more.

In [8]:
# Prefixes that start a query clause; clean_query splits the query on these.
DIRECTION = [
    'inc_',
    'exc_',
    'include_',
    'exclude_',
]

# Presumably the filter names allowed after a direction prefix (not referenced
# by the code in this notebook so far) - TODO confirm.
FILTERS = [
    'name',
    'phrase',
    'type',
    'text_longer',
    'ancestor',
    'parent',
]

In [9]:
# Sample query: one `<direction><filter>:<value>` clause per line.
# NOTE(review): `inc_title` uses a `title` filter, but FILTERS lists `name`
# and no `title` - confirm which spelling is intended.
query_str = """
inc_phrase:anxiety
inc_title:Disorder
inc_type:document
inc_text_longer:300
exc_ancestor:Neurodevelopmental Disorders
"""

In [10]:
def clean_query(query_str, directions=None):
    """Split a raw query string into its individual clauses.

    A clause starts at every occurrence of a direction prefix (e.g. ``inc_``)
    and runs up to the next prefix. Whitespace, including newlines, is
    collapsed to single spaces before splitting.

    Note:
        Prefixes are found by plain substring replacement, so a prefix that
        appears inside a value (e.g. ``zinc_``) also starts a new clause.

    Args:
        query_str: the raw query text, typically one clause per line.
        directions: iterable of clause prefixes to split on. Defaults to the
            module-level ``DIRECTION`` list.

    Returns:
        list of non-empty clause strings with surrounding whitespace removed,
        in the order they appear in ``query_str``.
    """
    if directions is None:
        directions = DIRECTION

    # Strings are immutable: the normalised text must be assigned back,
    # otherwise the whitespace clean-up is silently discarded.
    query_str = ' '.join(query_str.split())

    # Mark the start of every clause with a sentinel, then split on it.
    for prefix in directions:
        query_str = query_str.replace(prefix, '_BREAK_' + prefix)

    clauses = (part.strip() for part in query_str.split('_BREAK_'))
    return [clause for clause in clauses if clause]
# Show the clauses parsed from the sample query.
clean_query(query_str)

['inc_phrase:anxiety',
 'inc_parent:DSM-V',
 'inc_title:Disorder',
 'inc_type:document',
 'inc_text_longer:300',
 'exc_ancestor:Neurodevelopmental Disorders']