In [367]:
from operator import methodcaller
from functional import seq
from parse import Result, parse
from typing import Iterable
from operator import itemgetter
from Bio import Entrez

## Parse MeSH Tree

In [426]:
def parse_mesh(path: str) -> seq:
    """Read a MeSH tree file and return its lines as a normalised sequence.

    Every line of the file is decoded as UTF-8, stripped of its trailing
    ``'\\n'`` and lower-cased.

    The file is read completely before this function returns, so the
    underlying handle is always closed; decoding problems therefore surface
    at call time rather than when the sequence is first consumed.

    Args:
        path: Location of the tree file (the notebook uses
            ``'mtrees2017.bin'``).

    Returns:
        A ``seq`` with one normalised string per line of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If a line is not valid UTF-8.
    """
    # Handing the open file straight to ``seq`` never closed it; reading
    # inside ``with`` releases the handle deterministically.
    with open(path, 'rb') as handle:
        lines = [raw.decode('utf-8').rstrip('\n').lower() for raw in handle]
    return seq(lines)

def search(tree: Iterable, term: str) -> seq:
    """Select the entries of ``tree`` that contain ``term``.

    ``term`` is lower-cased before the substring test; the entries are
    compared exactly as they are given.

    Args:
        tree: Iterable of entries supporting the ``in`` operator
            (strings in this notebook).
        term: Text to look for.

    Returns:
        A ``seq`` over the matching entries.
    """
    def contains_term(entry) -> bool:
        return term.lower() in entry

    return seq(tree).filter(contains_term)

def parse_entry(entry: str) -> Result:
    """Break a tree entry into its ``term``, ``tag`` and ``children`` fields.

    The entry is matched against the pattern ``{term};{tag}.{children}`` and
    whatever ``parse`` produces is returned unchanged.

    NOTE(review): per the ``parse`` library documentation the call yields
    ``None`` instead of a ``Result`` when the text does not fit the pattern,
    so callers should check the return value before indexing it.

    Args:
        entry: One line of the tree file.

    Returns:
        The outcome of ``parse`` for ``entry``.
    """
    entry_format = '{term};{tag}.{children}'
    return parse(entry_format, entry)

def get_children(tree: Iterable, tag: str, at_level: int = 1):
    """Return the split entries matching ``tag`` that have ``at_level`` parts.

    Entries are first narrowed down with ``search`` (substring match on
    ``tag``), then each one is split on ``'.'``; only the splits whose number
    of pieces equals ``at_level`` are kept.

    Args:
        tree: Iterable of tree entries.
        tag: Text the entry has to contain.
        at_level: Required number of ``'.'``-separated pieces.

    Returns:
        A ``seq`` of lists of strings, one list per retained entry.
    """
    matches = seq(search(tree, tag))
    pieces = matches.map(methodcaller('split', '.'))

    def has_requested_depth(parts) -> bool:
        return len(parts) == at_level

    return pieces.filter(has_requested_depth)

In [244]:
# @hidden_cell
# Contact address assigned to Bio.Entrez for the requests made later on.
# NOTE(review): this is a personal e-mail hard-coded in the notebook; consider
# reading it from an environment variable before sharing the file.
Entrez.email = "talaataboudakika@std.sehir.edu.tr"

In [226]:
%ls *.bin

mtrees2017.bin


In [227]:
# Parse the MeSH tree file once and keep the entries as a plain list so the
# searches that follow can reuse it.
tree = parse_mesh('mtrees2017.bin').to_list()

In [424]:
# Entries containing 'neoplasms' whose '.'-split has exactly two pieces.
result = get_children(tree, 'neoplasms', 2)

In [254]:
# Number of entries selected by the previous cell.
result.size()

17

In [350]:
# Entrez search-term template; ``{disease}`` is replaced with a MeSH term when
# the query is issued against db="gds".
# NOTE(review): the template closes two ")" at the end but opens only one "(" —
# this looks like a stray parenthesis; confirm how Entrez treats it before changing.
base_query = """{disease}[MeSH Terms] AND normal[Subset Description] AND Homo sapiens [Organism] AND "Expression profiling by array"[Filter] AND "attribute name tissue"[Filter] AND ("20"[n_samples] : "10000"[n_samples]))"""

In [429]:
# Run one Entrez search per selected entry and collect the parsed replies.
records = []

for entry in result:
    # Each entry is the '.'-split of a "term;tag..." line, so the term is the
    # text before the first ';' of the first piece. Indexing (instead of
    # unpacking into two names) keeps this working for any split length.
    disease = entry[0].split(';')[0]
    handle = Entrez.esearch(db="gds", field='title',
                            term=base_query.format(disease=disease))
    try:
        records.append(Entrez.read(handle))
    finally:
        # Release the connection even when parsing the reply fails.
        handle.close()

In [430]:
# Keep only the searches with a non-zero hit count and reduce each record to
# its (Count, QueryTranslation) pair.
found = (
    seq(records)
    .filter(lambda rec: int(rec['Count']) != 0)
    .map(itemgetter('Count', 'QueryTranslation'))
)

### Siblings of neoplasms by site and their dataset counts

Datasets with fewer than 20 samples or with no healthy samples are discarded.

In [431]:
# Show (keyword, count) pairs; the keyword is the quoted text at the start of
# each query translation.
found.map(
    lambda hit: (parse('"{keyword}"{}', hit[1])['keyword'], hit[0])
)

0,1
neoplasms by histologic type,21
neoplasms by site,24
digestive system neoplasms,11
respiratory tract neoplasms,4
nervous system neoplasms,1
urogenital neoplasms,6
endocrine gland neoplasms,3
