In [1]:
from operator import methodcaller

# pip install pyfunctional
from functional import seq, pseq
from functools import partial, reduce

# pip install parse
from parse import Result, parse
from typing import Iterable
from operator import itemgetter, concat

# pip install biopython
from Bio import Entrez

import pandas as pd
import time
import re

In [2]:
# Search-term template for the GEO DataSets ("gds") queries issued further down.
# `{disease}` is filled in with str.format, once per MeSH term.
# The original literal ended in "))" although only one "(" is ever opened, which
# made every generated query unbalanced; the stray ")" is removed here.
base_query = (
    '{disease}[MeSH Terms]'
    ' AND normal[Subset Description]'
    ' AND Homo sapiens [Organism]'
    ' AND "Expression profiling by array"[Filter]'
    ' AND "attribute name tissue"[Filter]'
    ' AND ("20"[n_samples] : "10000"[n_samples])'
)

## Parse MeSH Tree

In [3]:
def parse_mesh(path: str) -> seq:
    """Load a MeSH tree file as a sequence of lower-cased text lines.

    The file is read in binary mode; each line is decoded as UTF-8, stripped
    of trailing newline characters and lower-cased.

    Args:
        path: Location of the tree file on disk.

    Returns:
        A ``seq`` over the normalised lines, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Read all lines inside a context manager so the handle is closed right
    # away. Previously the open file was handed straight to `seq` and never
    # closed.
    with open(path, 'rb') as handle:
        raw_lines = handle.readlines()

    return seq(raw_lines)\
            .map(methodcaller('decode', 'utf-8'))\
            .map(methodcaller('rstrip', '\n'))\
            .map(methodcaller('lower'))

def search(tree: Iterable, term: str) -> seq:
    """Keep the entries of ``tree`` that contain ``term`` as a substring.

    ``term`` is lower-cased before the comparison; the entries themselves are
    compared as given.

    Args:
        tree: Iterable of text entries to scan.
        term: Text to look for.

    Returns:
        A ``seq`` of the matching entries, in their original order.
    """
    def mentions_term(entry):
        return term.lower() in entry

    matches = seq(tree).filter(mentions_term)
    return matches

def parse_entry(entry: str) -> Result:
    """Match one tree entry against the ``{term};{tag}.{children}`` template.

    Args:
        entry: A single line of the tree file.

    Returns:
        Whatever ``parse`` returns for the template and ``entry``.
        NOTE(review): ``parse`` presumably yields ``None`` when the text does
        not fit the template (e.g. a tag without any dot) - confirm before
        relying on the ``Result`` annotation.
    """
    entry_template = '{term};{tag}.{children}'
    return parse(entry_template, entry)

def get_children(tree: Iterable, tag: str, level=None):
    """List the entries matching ``tag``, optionally restricted to one depth.

    Entries are selected with ``search`` (substring match), filtered by
    depth, and turned into the first two ';'-separated fields in reversed
    order. Assuming lines look like ``term;tag`` (as the template in
    ``parse_entry`` suggests), each result is ``[tag, term]``.

    Args:
        tree: Iterable of tree entries.
        tag: Text to search for, typically a tree-number prefix.
        level: Number of dot-separated components the entry's tag must have.
            When falsy (``None`` or ``0``), every match longer than one
            character is kept instead.

    Returns:
        A ``seq`` of two-element lists (one-element if the entry has no ';').
    """
    def tag_depth(item):
        # Measure depth on the tag field only. Counting dots over the whole
        # line miscounts entries whose term contains a "." (e.g. "st. ...").
        fields = item.split(';')
        tree_number = fields[1] if len(fields) > 1 else fields[0]
        return len(tree_number.split('.'))

    def keep(item):
        if level:
            return tag_depth(item) == level
        return len(item) > 1

    return seq(search(tree, tag))\
            .filter(keep)\
            .map(lambda item: item.split(';')[0:2][::-1])
        
def until(tag: str, level: int) -> str:
    """Truncate a dotted tag to its first ``level`` components.

    Args:
        tag: Dot-separated identifier, e.g. ``'c04.588.149'``.
        level: Slice bound applied to the list of components.

    Returns:
        The leading components re-joined with ``'.'``; the whole tag when it
        has fewer than ``level`` components.
    """
    components = tag.split('.')
    leading = components[:level]
    return '.'.join(leading)

def query(q: str, max_retries=None, delay: float = 5.0):
    """Run an Entrez ``esearch`` against the "gds" database, retrying on errors.

    Args:
        q: Search term passed to ``Entrez.esearch`` (with ``field='title'``).
        max_retries: How many times to retry after a failure. ``None`` (the
            default) keeps the original behaviour of retrying forever, which
            means a query that can never succeed will loop until interrupted.
        delay: Seconds to sleep between attempts.

    Returns:
        The value produced by ``Entrez.read`` for the search handle.

    Raises:
        Exception: The last error, once ``max_retries`` retries are used up.

    NOTE(review): no ``retmax`` is passed, so the service's default page size
    applies - confirm that the returned ``IdList`` is not being truncated.
    """
    failures = 0

    while True:
        try:
            handle = Entrez.esearch(db="gds", field='title', term=q)
            try:
                return Entrez.read(handle)
            finally:
                # Close the handle even when parsing raises; previously a
                # failed read left it open on every retry.
                handle.close()

        except Exception as e:
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            print(e)
            time.sleep(delay)

In [4]:
# @hidden_cell
# Contact address set on Biopython's Entrez module before any search is issued.
# NOTE(review): a personal address is hard-coded here; consider reading it from
# an environment variable before sharing the notebook.
Entrez.email = "talaataboudakika@std.sehir.edu.tr"
# Materialise the parsed, lower-cased tree lines as a plain list; the next cell
# hands `tree` to get_children once per depth level.
tree = parse_mesh('mtrees2017.bin').to_list()

In [5]:
# Collect the [tag, term] pairs found under 'c04' at depths 2..9, key each pair
# by the first two components of its tag, and group the terms under that key.
# The first term of every group is kept as the group's own term, the remainder
# as its children. This relies on depth 2 being emitted first, so the parent
# entry leads each group.
result = (
    seq(get_children(tree, 'c04', depth) for depth in range(2, 10))
    .flatten()
    .map(lambda pair: (until(pair[0], 2), pair[1]))
    .group_by_key()
    .map(lambda group: (group[0], group[1][0], group[1][1:]))
)

### terms and their children

In [6]:
# Tabulate the grouped (tag, term, children) triples; this frame is merged with
# the per-tag query results in the cells below.
df = result.to_pandas(columns=['tag', 'term', 'children'])

df

Unnamed: 0,tag,term,children
0,c04.182,cysts,"[arachnoid cysts, bone cysts, branchioma, brea..."
1,c04.445,hamartoma,"[hamartoma syndrome, multiple, pallister-hall ..."
2,c04.557,neoplasms by histologic type,"[histiocytic disorders, malignant, leukemia, l..."
3,c04.588,neoplasms by site,"[abdominal neoplasms, anal gland neoplasms, bo..."
4,c04.619,"neoplasms, experimental","[carcinoma 256, walker, carcinoma, brown-pearc..."
5,c04.626,"neoplasms, hormone-dependent",[]
6,c04.651,"neoplasms, multiple primary","[hamartoma syndrome, multiple, multiple endocr..."
7,c04.666,"neoplasms, post-traumatic",[]
8,c04.682,"neoplasms, radiation-induced","[leukemia, radiation-induced]"
9,c04.692,"neoplasms, second primary",[]


### GEO dataset IDs for terms and their children

In [None]:
# For every (tag, term, children) row, run one GEO search per disease name -
# the children first, then the parent term itself - and keep the 'IdList' of
# each search result. The inner searches go through `pseq`.
records = result.map(
    lambda row: (
        row[0],
        pseq(row[2] + [row[1]]).map(
            lambda disease: query(base_query.format(disease=disease))['IdList']
        ),
    )
)

records

In [17]:
# Attach term and children to each tag's ID lists; with no explicit key the
# merge joins on the column both frames share ('tag').
pd.merge(records.to_pandas(columns=['tag', 'Dataset IDs']), df)

Unnamed: 0,tag,Dataset IDs,term,children
0,c04.182,"[[], [], [], [], [], [], [], [], [], [], [], [...",cysts,"[arachnoid cysts, bone cysts, branchioma, brea..."
1,c04.445,"[[], [], [], [], []]",hamartoma,"[hamartoma syndrome, multiple, pallister-hall ..."
2,c04.557,"[[], ['843', '841', '829'], [], ['1960', '829'...",neoplasms by histologic type,"[histiocytic disorders, malignant, leukemia, l..."
3,c04.588,"[[], [], [], ['4766', '4114', '3324', '3268'],...",neoplasms by site,"[abdominal neoplasms, anal gland neoplasms, bo..."
4,c04.619,"[[], [], [], [], [], [], [], [], [], [], [], [...","neoplasms, experimental","[carcinoma 256, walker, carcinoma, brown-pearc..."
5,c04.626,[[]],"neoplasms, hormone-dependent",[]
6,c04.651,"[[], [], [], [], [], [], [], []]","neoplasms, multiple primary","[hamartoma syndrome, multiple, multiple endocr..."
7,c04.666,[[]],"neoplasms, post-traumatic",[]
8,c04.682,"[[], []]","neoplasms, radiation-induced","[leukemia, radiation-induced]"
9,c04.692,[[]],"neoplasms, second primary",[]


In [11]:
# Pool the dataset IDs returned for a term and all of its children, drop
# duplicates (the same GEO dataset can match several children), and keep only
# the number of distinct IDs.
# set().union(*lists) replaces reduce(lambda y, z: y + z, ...): it avoids
# repeatedly copying ever-growing lists and yields 0 instead of raising when
# there are no lists to combine.

counts = records.map(lambda x: (x[0], len(set().union(*x[1]))))

In [14]:
# Rank terms by their number of distinct matching datasets, largest first.
(
    pd.merge(counts.to_pandas(columns=['tag', 'count']), df)[['term', 'count']]
    .sort_values(by='count', ascending=False)
)

Unnamed: 0,term,count
3,neoplasms by site,25
2,neoplasms by histologic type,21
10,neoplastic processes,7
11,"neoplastic syndromes, hereditary",1
13,precancerous conditions,1
0,cysts,0
1,hamartoma,0
4,"neoplasms, experimental",0
5,"neoplasms, hormone-dependent",0
6,"neoplasms, multiple primary",0
