In [2]:
from operator import methodcaller
from functional import seq, pseq
from functools import partial, reduce
from parse import Result, parse
from typing import Iterable
from operator import itemgetter, concat
from Bio import Entrez

import pandas as pd
import time
import re

In [3]:
# GEO DataSets (db="gds") search template; `{disease}` is filled in with a MeSH term via str.format.
# The sample-count range clause is wrapped in exactly one pair of parentheses (the stray trailing
# ")" that left the expression unbalanced has been removed).
base_query = """{disease}[MeSH Terms] AND normal[Subset Description] AND Homo sapiens [Organism] AND "Expression profiling by array"[Filter] AND "attribute name tissue"[Filter] AND ("20"[n_samples] : "10000"[n_samples])"""

## Parse MeSH Tree

In [4]:
def parse_mesh(path: str) -> seq:
    """Read a MeSH tree file and return its lines as a sequence of normalised strings.

    Each line is decoded as UTF-8, stripped of its trailing newline and lower-cased.

    The file is read eagerly inside a ``with`` block so the handle is always closed,
    and the returned sequence is backed by a list, which means it can be iterated
    more than once (a sequence built directly on the open file object could only be
    consumed a single time and left the handle open).

    :param path: path of the tree file to read.
    :return: a ``seq`` over the normalised lines, in file order.
    """
    with open(path, 'rb') as handle:
        lines = [raw.decode('utf-8').rstrip('\n').lower() for raw in handle]

    return seq(lines)

def search(tree: Iterable, term: str) -> seq:
    """Return the entries of ``tree`` that contain ``term`` as a substring.

    ``term`` is lower-cased before the containment check; the entries themselves
    are compared as they are.

    :param tree: iterable of entry strings.
    :param term: text to look for inside each entry.
    :return: a lazily filtered ``seq`` of the matching entries.
    """
    def contains_term(entry):
        return term.lower() in entry

    return seq(tree).filter(contains_term)

def parse_entry(entry: str) -> Result:
    """Match ``entry`` against the ``term;tag.children`` layout using ``parse``.

    :param entry: a single tree entry string.
    :return: whatever ``parse`` yields for the pattern (named fields ``term``,
        ``tag`` and ``children`` are requested by the pattern).
    """
    entry_pattern = '{term};{tag}.{children}'
    return parse(entry_pattern, entry)

def get_children(tree: Iterable, tag: str, level=None):
    """Find the entries mentioning ``tag`` and return them as reversed field pairs.

    Entries are first selected with ``search(tree, tag)``. When ``level`` is truthy,
    only entries whose text splits into exactly ``level`` dot-separated pieces are
    kept; otherwise every entry longer than one character is kept. Each surviving
    entry is split on ``';'`` and its first two fields are returned in reverse order
    (callers in this notebook treat element 0 as the tag and element 1 as the term).

    :param tree: iterable of entry strings.
    :param tag: text passed to ``search`` to select entries.
    :param level: number of dot-separated pieces an entry must have, or a falsy
        value to skip the depth check.
    :return: a lazy ``seq`` of two-element lists.
    """
    def at_requested_depth(entry):
        if level:
            return len(entry.split('.')) == level
        return len(entry) > 1

    def reversed_fields(entry):
        leading_two = entry.split(';')[0:2]
        return leading_two[::-1]

    matches = seq(search(tree, tag))
    return matches.filter(at_requested_depth).map(reversed_fields)
        
def until(tag: str, level: int) -> str:
    """Truncate a dot-separated ``tag`` to its first ``level`` components.

    :param tag: dot-separated tag, e.g. ``'c04.557.337'``.
    :param level: number of leading components to keep (used as a slice bound).
    :return: the kept components re-joined with ``'.'``.
    """
    components = tag.split('.')
    kept = components[:level]
    return '.'.join(kept)

def query(q: str, retry_delay: float = 5.0, max_retries=None):
    """Run an Entrez ``esearch`` against the ``gds`` database and return the parsed result.

    If parsing the response fails, the error is printed and the whole request is
    retried after ``retry_delay`` seconds. Retrying is done in a loop rather than by
    recursion, so a long outage cannot exhaust the call stack.

    Unlike a ``return`` placed inside ``finally``, this version does not swallow
    exceptions that are not handled here (e.g. ``KeyboardInterrupt``): they
    propagate to the caller after the handle has been closed.

    :param q: the search expression sent as ``term``.
    :param retry_delay: seconds to wait between attempts (default 5.0).
    :param max_retries: maximum number of retries before the last error is
        re-raised; ``None`` (the default) retries indefinitely.
    :return: the object produced by ``Entrez.read`` for the search response.
    :raises Exception: the last parsing error, once ``max_retries`` is exhausted.
    """
    retries_done = 0

    while True:
        # Errors raised while opening the request itself are not retried; they propagate.
        handle = Entrez.esearch(db="gds", field='title', term=q)

        try:
            return Entrez.read(handle)

        except Exception as e:
            print(e)
            if max_retries is not None and retries_done >= max_retries:
                raise
            retries_done += 1
            time.sleep(retry_delay)

        finally:
            # Runs on success, on retry and on propagated errors alike.
            handle.close()

In [5]:
# @hidden_cell
# Contact address set on Biopython's Entrez module before any query() call is made below.
# NOTE(review): this is a personal e-mail address stored in the notebook itself — consider
# reading it from an environment variable instead of committing it.
Entrez.email = "talaataboudakika@std.sehir.edu.tr"

In [6]:
# Load the MeSH tree file once and materialise it as a plain list, since the
# cells below scan it several times (once per depth level).
tree = parse_mesh('mtrees2017.bin').to_list()

In [7]:
# Collect the entries matching 'c04' at depths 2 through 9, key each one by its tag
# truncated to two components, and group by that key. Within each group the first
# term is taken as the group's own term and the remaining terms as its children.
# The per-depth sequences are held in a list rather than a one-shot `map` iterator:
# this pipeline is lazy and is consumed by more than one later cell, and an exhausted
# iterator would yield nothing on a second evaluation.
result = seq([get_children(tree, 'c04', depth) for depth in range(2, 10)]).flatten()\
            .map(lambda item: (until(item[0], 2), item[1]))\
            .group_by_key().map(lambda item: (item[0], item[1][0], item[1][1:]))

In [14]:
# Tabulate the grouped (tag, term, children) tuples; this frame is merged with the
# per-tag counts in the final cell.
df = result.to_pandas(columns=['tag', 'term', 'children'])

In [9]:
# For every group, pair its tag with the search results of each of its terms
# (the children followed by the group's own term). The inner sequence is lazy,
# so the searches only run when it is evaluated.
records = result.map(
    lambda group: (
        group[0],
        seq(group[2] + [group[1]]).map(
            lambda disease: query(base_query.format(disease=disease))
        ),
    )
)

In [10]:
# For every tag, concatenate the 'IdList' of each search result and count the
# distinct ids across all of that tag's terms.
counts = records.map(
    lambda record: (
        record[0],
        record[1].map(itemgetter('IdList')).reduce(lambda merged, ids: merged + ids),
    )
).map(lambda tagged_ids: (tagged_ids[0], len(set(tagged_ids[1]))))

In [21]:
# Join the per-tag counts with the term table (the only shared column is 'tag'),
# keep the term name and its count, and list the terms from most to fewest hits.
counts.to_pandas(columns=['tag', 'count']).merge(df)[['term', 'count']].sort_values('count', ascending=False)

Unnamed: 0,term,count
3,neoplasms by site,25
2,neoplasms by histologic type,21
10,neoplastic processes,7
11,"neoplastic syndromes, hereditary",1
13,precancerous conditions,1
0,cysts,0
1,hamartoma,0
4,"neoplasms, experimental",0
5,"neoplasms, hormone-dependent",0
6,"neoplasms, multiple primary",0
