In [1]:
from operator import methodcaller

# pip install pyfunctional
from functional import seq, pseq
from functools import partial, reduce
from itertools import product

# pip install parse
from parse import Result, parse
from typing import Iterable
from operator import itemgetter, concat

# pip install biopython
from Bio import Entrez

import pandas as pd
import time
import re

In [2]:
# Search-term template sent to Entrez (db="gds") through `query`.
# `{mesh_term}` is the only placeholder and is filled in with str.format.
# The clauses require: a control-like keyword (normal/control/adjacent/healthy)
# in any field, the given MeSH term, "Homo sapiens" as organism, one of the
# listed expression-profiling filters, and between 20 and 1,000,000 samples.
# Newlines are removed at the end; the leading space on each continuation line
# is what keeps adjacent clauses separated in the resulting one-line string.
base_query =\
"""
(normal[All Fields] OR control[All Fields] OR adjacent[All Fields] OR healthy[All Fields]) AND
 "{mesh_term}"[MeSH Terms] AND
 ("Homo sapiens"[Organism] AND
 ("Expression profiling by array"[Filter] OR
 "Expression profiling by SNP array"[Filter] OR
 "Expression profiling by SAGE"[Filter] OR
 "Expression profiling by RT-PCR"[Filter] OR
 "Expression profiling by MPSS"[Filter] OR
 "Expression profiling by genome tiling array"[Filter] OR
 "Expression profiling by high throughput sequencing"[Filter]) AND
 ("20"[n_samples] : "1000000"[n_samples]))
""".replace('\n', '')

## Parse MeSH Tree

In [3]:
def parse_mesh(path: str) -> seq:
    """Load a MeSH tree file as a sequence of normalised text lines.

    The file is read as bytes; every line is decoded as UTF-8, stripped of
    its trailing line terminator and lower-cased.

    Args:
        path: Location of the MeSH tree file.

    Returns:
        A ``seq`` over the normalised lines, in file order.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If a line is not valid UTF-8.
    """
    # Read everything inside a context manager so the handle is closed
    # deterministically; wrapping the open file object directly in a lazy
    # sequence left it open for the lifetime of the kernel.
    with open(path, 'rb') as handle:
        lines = [
            # Strip '\r' as well as '\n' so files saved with CRLF endings do
            # not leave a stray carriage return at the end of each entry.
            raw.decode('utf-8').rstrip('\r\n').lower()
            for raw in handle
        ]
    return seq(lines)

def search(tree: Iterable, term: str) -> seq:
    """Select the entries of ``tree`` whose second field contains ``term``.

    Each entry is split on ``;`` and the piece at index 1 is tested with a
    plain substring check against the lower-cased ``term``.

    Args:
        tree: Iterable of entry strings containing at least one ``;``.
        term: Text to look for; it is lower-cased before the comparison.

    Returns:
        A ``seq`` of the matching entries, unchanged.
    """
    def second_field_contains(entry):
        needle = term.lower()
        return needle in entry.split(';')[1]

    return seq(tree).filter(second_field_contains)

def parse_entry(entry: str) -> Result:
    """Match ``entry`` against the ``{term};{tag}.{children}`` layout.

    Args:
        entry: A single entry string.

    Returns:
        Whatever ``parse`` yields for the pattern.
        NOTE(review): ``parse`` presumably returns None when the entry does
        not match (e.g. a tag without any ``.``) - confirm before relying on
        the ``Result`` annotation.
    """
    entry_format = '{term};{tag}.{children}'
    return parse(entry_format, entry)

def get_children(tree: Iterable, tag: str, level=None):
    """List ``[tag, term]`` pairs for the entries matched by ``search``.

    Args:
        tree: Iterable of ``term;tag`` entry strings.
        tag: Text passed to ``search`` to select entries by their tag field.
        level: When truthy, keep only entries whose tag has exactly this many
            ``.``-separated components. When falsy (``None`` or ``0``), keep
            every entry longer than one character.

    Returns:
        A sequence of two-element lists ``[tag, term]`` (the first two
        ``;``-separated fields of each entry, reversed).
    """
    def at_requested_depth(item):
        if level:
            # Measure depth on the tag field only. Counting '.' over the whole
            # entry mis-levels terms that contain a period themselves,
            # e.g. "encephalitis, st. louis;c02.081".
            return len(item.split(';')[1].split('.')) == level
        return len(item) > 1

    return seq(search(tree, tag))\
            .filter(at_requested_depth)\
            .map(lambda item: item.split(';')[0:2][::-1])
        
def until(tag: str, level: int) -> str:
    """Truncate a dotted tag to its first ``level`` components.

    Args:
        tag: Dot-separated tag such as ``c01.252.400``.
        level: Number of leading components to keep (used as a slice bound,
            so values beyond the tag's depth return the tag unchanged).

    Returns:
        The kept components joined with ``.`` again.
    """
    components = tag.split('.')
    kept = components[:level]
    return '.'.join(kept)

def query(q: str, max_retries=None, retry_delay: float = 1.0):
    """Run an Entrez ``esearch`` against the ``gds`` database, retrying on failure.

    Args:
        q: Search term passed to ``Entrez.esearch`` as ``term``.
        max_retries: Number of failed attempts tolerated before the last
            error is re-raised. ``None`` (the default) retries indefinitely.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        The value produced by ``Entrez.read`` for the search response.

    Raises:
        Exception: The last error encountered, once more than ``max_retries``
            attempts have failed (only when ``max_retries`` is not ``None``).

    NOTE(review): no ``retmax`` is passed, so the returned ``IdList`` is
    presumably capped at the service's default page size - confirm that this
    truncation is intended by the callers that count IDs.
    """
    failures = 0
    while True:
        try:
            handle = Entrez.esearch(db="gds", field='title', term=q)
            try:
                return Entrez.read(handle)
            finally:
                # Close the handle even when parsing fails; otherwise every
                # failed attempt in this retry loop leaks an open connection.
                handle.close()

        except Exception as e:
            print(e)
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            time.sleep(retry_delay)

In [4]:
# @hidden_cell
# Presumably the contact address Biopython attaches to Entrez requests.
# NOTE(review): this is a personal e-mail address stored in the notebook -
# consider reading it from the environment before sharing.
Entrez.email = "talaataboudakika@std.sehir.edu.tr"
# Parse the MeSH tree file once and materialise it as a plain list so later
# cells can iterate over it repeatedly.
tree = parse_mesh('mtrees2017.bin').to_list()

In [5]:
# For every depth 1..10, collect the [tag, term] pairs found under the 'c'
# tags, key each term by the first component of its tag, and group by that key.
# Within each group the first term listed is kept separate from the rest.
result = (
    seq(map(partial(get_children, tree, 'c'), range(1, 11)))
    .flatten()
    .map(lambda item: (until(item[0], 1), item[1]))
    .group_by_key()
    .map(lambda item: (item[0], item[1][0], item[1][1:]))
)

### Terms and their descendants

In [6]:
# Tabulate the grouped result: one row per tuple, with its three elements
# exposed as the columns 'tag', 'term' and 'descendants'.
df = result.to_pandas(columns=['tag', 'term', 'descendants'])

df.head()

Unnamed: 0,tag,term,descendants
0,c01,bacterial infections and mycoses,"[bacterial infections, central nervous system ..."
1,c02,virus diseases,"[arbovirus infections, bronchiolitis, viral, c..."
2,c03,parasitic diseases,"[central nervous system parasitic infections, ..."
3,c04,neoplasms,"[cysts, hamartoma, neoplasms by histologic typ..."
4,c05,musculoskeletal diseases,"[bone diseases, cartilage diseases, fasciitis,..."


### GEO dataset IDs for terms and their descendants

In [7]:
# For each tag, build a parallel sequence over its descendants plus its own
# term, and map every one of them to the 'IdList' returned by `query` for the
# base query filled in with that term.
records = result.map(
    lambda x: (
        x[0],
        pseq(x[2] + [x[1]]).map(
            lambda child: query(base_query.format(mesh_term=child))['IdList']
        ),
    )
)

In [8]:
# Per tag: join the ID lists of the term and all of its descendants, drop
# duplicate GEO dataset IDs by converting to a set, and keep only the number
# of distinct IDs. The counts are then merged with `df` and ordered from the
# largest count to the smallest.
counts = (
    records
    .map(lambda x: (x[0], len(set(reduce(concat, x[1])))))
    .to_pandas(columns=['tag', 'count'])
    .merge(df)
    .sort_values('count', ascending=False)
)

<urlopen error [Errno -3] Temporary failure in name resolution>
<urlopen error [Errno -3] Temporary failure in name resolution>
<urlopen error [Errno -3] Temporary failure in name resolution>
<urlopen error [Errno -3] Temporary failure in name resolution>
<urlopen error [Errno -3] Temporary failure in name resolution>
<urlopen error [Errno -3] Temporary failure in name resolution>
<urlopen error [Errno -3] Temporary failure in name resolution>
<urlopen error [Errno -3] Temporary failure in name resolution>
<urlopen error [Errno -3] Temporary failure in name resolution>
<urlopen error [Errno -3] Temporary failure in name resolution>
<urlopen error [Errno -3] Temporary failure in name resolution>
<urlopen error [Errno -3] Temporary failure in name resolution>
<urlopen error [Errno -3] Temporary failure in name resolution>
<urlopen error [Errno -3] Temporary failure in name resolution>
<urlopen error [Errno -3] Temporary failure in name resolution>
<urlopen error [Errno -3] Temporary fail

In [11]:
# Display the per-tag dataset counts, largest first.
counts.sort_values('count', ascending=False)

Unnamed: 0,tag,count,term,descendants
3,c04,1209,neoplasms,"[cysts, hamartoma, neoplasms by histologic typ..."
22,c23,1082,"pathological conditions, signs and symptoms","[morphological and microscopic findings, patho..."
9,c10,607,nervous system diseases,"[autoimmune diseases of the nervous system, au..."
19,c20,417,immune system diseases,"[autoimmune diseases, erythroblastosis, fetal,..."
16,c17,409,skin and connective tissue diseases,"[connective tissue diseases, skin diseases, an..."
5,c06,403,digestive system diseases,"[biliary tract diseases, digestive system abno..."
12,c13,363,female urogenital diseases and pregnancy compl...,"[female urogenital diseases, pregnancy complic..."
15,c16,342,"congenital, hereditary, and neonatal diseases ...","[congenital abnormalities, fetal diseases, gen..."
14,c15,335,hemic and lymphatic diseases,"[hematologic diseases, lymphatic diseases, ane..."
7,c08,305,respiratory tract diseases,"[bronchial diseases, ciliary motility disorder..."
