In [1]:
#!pip install matplotlib
#!pip install .

In [2]:
import json
import pprint
import matplotlib.pyplot as plt

from anytree import PreOrderIter, RenderTree
from anytree.search import findall
from comorbid_graphs import ComorbidGraph, ComorbidGraphNode

### Creating graph 

In [3]:
# Read the symptom-tree fixture; the relative path is resolved against the
# current working directory.
with open('../tests/fixtures/symp_tree.json') as f:
    data = json.load(f)
# Build the graph from the parsed JSON, passing ComorbidGraphNode as the
# second argument (presumably the node class used for tree nodes -- confirm).
cg = ComorbidGraph(data, ComorbidGraphNode, assign_ids=True)

In [26]:
# Preview only the first 500 characters of the rendered tree; the '...' line
# signals that the output was cut.
print(cg.pretty_print_tree()[:500])
print('...')

Source
├── symptom
│   ├── urinary system symptom
│   ├── general symptom
│   ├── respiratory system and chest symptom
│   ├── neurological and physiological symptom
│   ├── musculoskeletal system symptom
│   ├── nervous system symptom
│   ├── abdominal symptom
│   ├── head and neck symptom
│   ├── skin and integumentary tissue symptom
│   ├── hemic and immune system symptom
│   ├── digestive system symptom
│   ├── cardiovascular system symptom
│   ├── nutrition, metabolism, and development symp
...


## Searching
Searching should be extensible so that SQL search can be used for data larger than a knowledge graph (KG) can handle.
To allow this, the search is split into several steps:
- search by ancestors and parents first
  - get the names of the matching nodes, for both the include and the exclude lists
  - merge all of these nodes, applying the include/exclude rules
- for the documents in the resulting knowledge graph
  - filter by type
  - filter by body length
- search in the body

In [27]:
# Prefixes that mark the start of a query clause (include / exclude forms).
DIRECTION = ['inc_', 'exc_', 'include_', 'exclude_']
# Filter keys accepted after a prefix; get_query_dict skips any other key.
FILTERS = ['name', 'content', 'type', 'text_longer', 'ancestor', 'parent']

In [70]:
def get_query_dict(query_str, directions=None, filters=None):
    """Parse a search string into ``{filter: {direction: [values]}}``.

    A query is a sequence of clauses of the form ``<prefix><filter>:<values>``
    separated by whitespace or newlines, e.g.
    ``inc_name:Disorder exc_ancestor:Neurodevelopmental Disorders``.
    ``<values>`` is a comma-separated list. Text that precedes the first
    clause (or a query without any clause) is treated as ``inc_content``.

    Args:
        query_str: the raw query text.
        directions: clause prefixes to recognise; defaults to the
            module-level ``DIRECTION`` list.
        filters: accepted filter keys; defaults to the module-level
            ``FILTERS`` list. Clauses with any other key are skipped.

    Returns:
        A dict mapping each filter key to a dict of direction -> list of
        values. Directions are always stored in short form (``'inc'`` /
        ``'exc'``), also when the query used ``include_`` / ``exclude_``.
        An empty or whitespace-only query yields ``{}``.
    """
    import re  # imported locally: the notebook's import cell does not provide it

    directions = DIRECTION if directions is None else directions
    filters = FILTERS if filters is None else filters
    # The filters only read 'inc' / 'exc', so long prefixes are normalised.
    aliases = {'include': 'inc', 'exclude': 'exc'}

    def split_values(raw):
        # Comma-separated values, each trimmed; empty entries are dropped.
        return [value.strip() for value in raw.split(',') if value.strip()]

    # Collapse every run of whitespace (including newlines) to a single space.
    text = ' '.join(query_str.split())
    if not text:
        return {}

    if directions:
        # Longest prefixes first so a long form is never shadowed by a short one.
        alternatives = '|'.join(
            re.escape(prefix)
            for prefix in sorted(directions, key=len, reverse=True)
        )
        # A prefix only counts at the start of the query or right after
        # whitespace / a comma, so a value such as 'zinc_level' stays intact.
        # With one capturing group, re.split returns
        # [leading_text, prefix1, clause1, prefix2, clause2, ...].
        pieces = re.split(r'(?<![^\s,])(' + alternatives + r')', text)
    else:
        pieces = [text]

    query = {}

    # Bare text in front of the first clause is a content search.
    leading = split_values(pieces[0])
    if leading and 'content' in filters:
        query.setdefault('content', {}).setdefault('inc', []).extend(leading)

    for prefix, clause in zip(pieces[1::2], pieces[2::2]):
        key, colon, raw = clause.partition(':')
        key = key.strip()
        if not colon or key not in filters:
            continue  # malformed clause or unknown filter
        direction = prefix.rstrip('_')
        direction = aliases.get(direction, direction)
        # Repeated clauses for the same filter/direction accumulate values.
        query.setdefault(key, {}).setdefault(direction, []).extend(
            split_values(raw)
        )
    return query

In [73]:
# Example query with one `<prefix><filter>:<value>` clause per line.
query_str = """
inc_name:Disorder
inc_type:document
inc_text_longer:300
inc_ancestor:Phobic
exc_ancestor:Neurodevelopmental Disorders
"""
query_dict = get_query_dict(query_str)

In [74]:
# Display the parsed query.
query_dict

{'name': {'inc': ['Disorder']},
 'type': {'inc': ['document']},
 'text_longer': {'inc': ['300']},
 'ancestor': {'inc': ['Phobic'], 'exc': ['Neurodevelopmental Disorders']}}

### 1. Filtering
Filter nodes by their own properties (`name`, `type`, `body-length`), according to the include and exclude lists.


In [95]:
class FilterableMixin(object):
    """Mixin that lets a node test itself against a parsed search query.

    The host object is expected to expose ``name`` and ``parent`` (an object
    with its own ``name``, or a falsy value) and, optionally, ``type`` and
    ``body``.

    Every ``filter_*`` method takes an include list and an exclude list. An
    empty include list places no restriction, so an exclude-only filter keeps
    every node that is not excluded; an empty exclude list excludes nothing.
    """

    def filter_query(self, query_dict):
        """Return True when the node passes every filter present in ``query_dict``.

        ``query_dict`` maps a filter key (``name``, ``parent``, ``type``,
        ``text_longer``, ``content``) to a dict with optional ``inc`` and
        ``exc`` lists; a missing list is treated as empty. Keys that have no
        filter method here (e.g. ``ancestor``) are ignored.
        """
        # An explicit loop is used instead of chaining conditional expressions
        # with `and`: `a if c else True and b ...` binds as
        # `a if c else (True and b ...)` and would stop at the first filter.
        checks = (
            ('name', self.filter_name),
            ('parent', self.filter_parent),
            ('type', self.filter_type),
            ('text_longer', self.filter_text_longer),
            ('content', self.filter_content),
        )
        for key, check in checks:
            if key not in query_dict:
                continue
            spec = query_dict[key]
            if not check(spec.get('inc', []), spec.get('exc', [])):
                return False
        return True

    @staticmethod
    def _passes_inc_exc(value, inc_list, exc_list, strict=False):
        """Shared include/exclude test.

        With ``strict`` the value must equal a list entry; otherwise a list
        entry only has to occur inside the value (substring match).
        """
        if strict:
            def hit(terms):
                return value in terms
        else:
            def hit(terms):
                return any(term in value for term in terms)

        if inc_list and not hit(inc_list):
            return False
        return not hit(exc_list)

    def filter_name(self, inc_list, exc_list, strict=False):
        """Check the node name against the include and exclude lists."""
        return self._passes_inc_exc(self.name, inc_list, exc_list, strict)

    def filter_content(self, inc_list, exc_list):
        """Check the node body for included and excluded phrases.

        Nodes without a body never pass.
        """
        body = getattr(self, 'body', None)
        if body is None:
            return False
        return self._passes_inc_exc(body, inc_list, exc_list)

    def filter_parent(self, inc_list, exc_list, strict=False):
        """Check the parent's name against the include and exclude lists.

        Nodes without a parent, or whose parent has no name, never pass.
        """
        parent = getattr(self, 'parent', None)
        if not parent or not parent.name:
            return False
        # The parent's name is compared in both modes (strict mode used to
        # compare the node's own name).
        return self._passes_inc_exc(parent.name, inc_list, exc_list, strict)

    def filter_type(self, inc_list, exc_list):
        """Check the node type for exact membership in the lists.

        Nodes without a ``type`` attribute never pass.
        """
        if not hasattr(self, 'type'):
            return False
        return self._passes_inc_exc(self.type, inc_list, exc_list, strict=True)

    def filter_text_longer(self, inc_list, exc_list):
        """Check the body length against thresholds.

        Only the first entry of each list is used. The body must be strictly
        longer than ``inc_list[0]`` and not longer than ``exc_list[0]``.
        Nodes without a body never pass.
        """
        body = getattr(self, 'body', None)
        if body is None:
            return False
        if inc_list and len(body) <= int(inc_list[0]):
            return False
        if exc_list and len(body) > int(exc_list[0]):
            return False
        return True

In [96]:
class FilterableNode(ComorbidGraphNode, FilterableMixin):
    """ComorbidGraphNode that also carries the FilterableMixin filter methods."""

class FilterableGraph(ComorbidGraph):
    """ComorbidGraph subclass offering a helper to normalise query dicts."""

    @staticmethod
    def fix_query(query_dict):
        """Ensure every filter entry has both an 'inc' and an 'exc' list.

        Missing lists are added as empty lists. The dictionary is modified in
        place and also returned.
        """
        for spec in query_dict.values():
            spec.setdefault('inc', [])
            spec.setdefault('exc', [])
        return query_dict

In [97]:
# Rebuild the graph with FilterableNode so tree nodes expose filter_query.
# NOTE(review): this instantiates ComorbidGraph, not the FilterableGraph
# defined above -- confirm which one was intended.
filter_cg = ComorbidGraph(data, FilterableNode, assign_ids=True)

In [98]:
# Print the rendered tree of the filterable graph.
print(filter_cg.pretty_print_tree())

Source
├── symptom
│   ├── urinary system symptom
│   ├── general symptom
│   ├── respiratory system and chest symptom
│   ├── neurological and physiological symptom
│   ├── musculoskeletal system symptom
│   ├── nervous system symptom
│   ├── abdominal symptom
│   ├── head and neck symptom
│   ├── skin and integumentary tissue symptom
│   ├── hemic and immune system symptom
│   ├── digestive system symptom
│   ├── cardiovascular system symptom
│   ├── nutrition, metabolism, and development symptom
│   └── reproductive system symptom
├── obsolete appendicitis
├── ankle swelling
├── respiratory distress
├── ulcerations
├── osteomyelitis
├── portal hepatitis
├── photosensitivity
├── persistent lymphocytosis
├── petechial hemorrhages
├── partial paralysis
├── reddening
├── pustular skin lesions
├── prolonged thrombocytopenia
├── proliferative tracheitis
├── proliferative mouth papules
├── proliferative skin lesions
├── self-limiting liver inflammation
├── sclerodermoid plaques
├── salivar

In [99]:
# Single-line query with two include clauses (name and parent).
query_str = """inc_name:system inc_parent:symptom"""
query_dict = get_query_dict(query_str)
query_dict

{'name': {'inc': ['system']}, 'parent': {'inc': ['symptom']}}

In [94]:
def fix_query(query_dict):
    """Give every filter entry both an 'inc' and an 'exc' list.

    A missing list is added as an empty list. The dictionary is changed in
    place and returned as well.
    """
    for sides in query_dict.values():
        for side in ('inc', 'exc'):
            if side not in sides:
                sides[side] = []
    return query_dict
# Make sure every filter entry has both an 'inc' and an 'exc' list.
query_dict = fix_query(query_dict)

# Collect the nodes of the tree for which filter_query(query_dict) is truthy.
results = findall(
    filter_cg.tree,
    filter_=lambda node: 
    node.filter_query(query_dict)
)

### 2. Subgraph
Select branches by `ancestors` and `parents`, and use the include/exclude (`inc-exc`) lists to zoom in on a subgraph.

In [90]:
class SearchableMixin(object):
    """Graph mixin that extracts a sub-tree by including/excluding named branches.

    TODO: Needs proper testing.
    """

    @staticmethod
    def add_node(node, include, source):
        """Keep ``node`` in the result tree rooted at ``source``.

        Args:
            node: the node to keep.
            include: list of nodes already accepted for the result.
            source: root node of the result tree.

        Returns:
            The ``include`` list, extended with ``node`` when its parent is
            already part of the result.
        """
        if not node.parent:
            # No parent at all: hang the node directly under the result root.
            # (This used to assign an undefined name instead of `source`.)
            node.old_parent = None
            node.parent = source
        elif node.parent.name not in [i.name for i in include]:
            # The parent is not part of the result: remember it, then
            # re-attach the node under the result root.
            node.old_parent = node.parent
            node.parent = source
            # Drop any other child of the result root that carries the same
            # name and keep this node as the last child.
            node.parent.children = [
                i for i in node.parent.children if i.name != node.name
            ] + [node]
        else:
            # The parent is already in the result, so the node stays where it
            # is. Registering it in `include` lets its own children pass the
            # parent-name check above and remain attached to it.
            include.append(node)
        return include

    @staticmethod
    def remove_node(node):
        """Detach ``node`` from its parent; a node without a parent is left alone.

        Children are filtered by name, so every child of the parent that
        shares the node's name is dropped with it.
        """
        if node.parent:
            node.parent.children = [
                i for i in node.parent.children if i.name != node.name
            ]

    @staticmethod
    def get_node_list(base_node, list_words):
        """Return the nodes found from ``base_node`` whose name contains any of ``list_words``."""
        return list(findall(
            base_node,
            filter_=lambda node: any(x in node.name for x in list_words)
        ))

    def subgraph_search(self, inc_list, exc_list, src_name="search results", node_type=ComorbidGraphNode):
        """Build a result tree from the branches matching ``inc_list`` but not ``exc_list``.

        The search runs on the object returned by ``self.tree.deep_copy()``.
        For every node, the deepest matching include node and the deepest
        matching exclude node among the node and its ancestors are compared;
        the node is kept only when the include match is deeper.

        Args:
            inc_list: name fragments selecting branches to include.
            exc_list: name fragments selecting branches to exclude.
            src_name: name given to the root of the result tree.
            node_type: class used to create the result root; called as
                ``node_type(name=src_name)``.

        Returns:
            The root node of the result tree.
        """
        f = self.tree.deep_copy()
        result_node = node_type(name=src_name)

        exclude = self.get_node_list(f, exc_list)
        include = self.get_node_list(f, inc_list)

        for node in PreOrderIter(f):
            # the node together with all of its current ancestors
            lineage = set(list(node.ancestors) + [node])

            # deepest include match on the lineage (-1 when there is none)
            inc_ancestors = set(include) & lineage
            inc_max_level = max((i.depth for i in inc_ancestors), default=-1)

            # deepest exclude match on the lineage (-1 when there is none)
            exc_ancestors = set(exclude) & lineage
            exc_max_level = max((i.depth for i in exc_ancestors), default=-1)

            # keep the node only if the include match is deeper than the exclude match
            if inc_max_level > exc_max_level:
                include = self.add_node(
                    node,
                    include,
                    result_node
                )
            else:
                self.remove_node(node)
        return result_node

In [91]:
class ComorbidGraphSearchable(ComorbidGraph, SearchableMixin):
    """ComorbidGraph that also carries the SearchableMixin sub-graph search."""

In [92]:
# Build a searchable graph from the same data, using FilterableNode for nodes.
search_cg = ComorbidGraphSearchable(data, FilterableNode, assign_ids=True)

In [93]:
# Extract the branches whose names contain 'epileptic' or 'seizure';
# nothing is excluded.
res = search_cg.subgraph_search(
    inc_list=['epileptic', 'seizure'],
    exc_list=[],
    node_type=ComorbidGraphNode
)
# Render the result tree: `pre` is the drawing prefix supplied by RenderTree.
for pre, fill, node in RenderTree(res):
    print("%s%s" % (pre, node.name))

search results
└── seizure
    ├── generalized seizure
    │   ├── tonic-clonic seizure
    │   ├── absence seizure
    │   ├── myoclonic seizure
    │   ├── atonic seizure
    │   ├── clonic seizure
    │   └── tonic seizure
    ├── epileptic seizure
    └── focal seizure
        ├── complex partial seizure
        └── simple partial seizure


## Ordering Results
There should be two ordering options: the first based on graph properties,  
the second on our simple algorithm that combines several scores, as found in `comorbid-lab`.

## Merging all
Create the search language by supporting all of the query entries together.   
Handle irregularities in the input, among other things.