# MeSH tree setup

In [1]:
import json
import pandas as pd
from collections import Counter

In [2]:
# Path of the MeSH tree file opened by the parsing cell below, which splits
# each line on ";" into a term name and a tree number.
meshtree_file = "mtrees2021.bin"

In [3]:
# Lookup tables between lower-cased term names and tree numbers.
# If the same name (or tree number) occurs on several lines, only the value
# read last is kept, because each assignment overwrites the previous one.
name2id = {}
id2name = {}

# Every entry whose tree number starts with "C14", as {"name", "ID"} records.
CVDtree = []

# Number of lines read from the tree file (skipped lines are counted too).
c = 0
with open(meshtree_file, "r") as ftree:
    for line in ftree:
        c += 1
        # A usable line has the form "<name>;<tree number>".
        term = line.strip().split(";")
        if len(term) < 2:
            # Blank or separator-less line: term[1] would raise IndexError and
            # abort the whole load, so skip the line instead.
            continue
        name = term[0].lower()
        ID = term[1]

        if ID.startswith("C14"):
            CVDtree.append({"name": name, "ID": ID})

        name2id[name] = ID
        id2name[ID] = name

In [4]:
# Number of lines read from the MeSH tree file by the parsing loop.
c

61314

In [5]:
# Persist the name -> tree-number lookup as JSON.
with open("name2id.json", "w") as name2id_out:
    json.dump(name2id, name2id_out)

In [6]:
# Persist the tree-number -> name lookup as JSON.
with open("id2name.json", "w") as id2name_out:
    json.dump(id2name, id2name_out)

In [7]:
# Spot check: the term name stored for tree number "C14".
id2name['C14']

'cardiovascular diseases'

In [8]:
# Spot check: the tree number stored for the (lower-cased) name "pathology".
name2id['pathology']

'H02.403.650'

In [9]:
# Number of collected "C14" records; CVDtree is already a list, so no copy is needed.
len(CVDtree)

672

In [10]:
# Tabulate the "C14" records: term name as the index, rows ordered by the
# "ID" (tree number) column in ascending order.
CVD = (
    pd.DataFrame(CVDtree)
    .set_index("name")
    .sort_values("ID", ascending=True)
)

In [11]:
# Preview the first 10 rows of the ID-sorted frame.
CVD.head(10)

Unnamed: 0_level_0,ID
name,Unnamed: 1_level_1
cardiovascular diseases,C14
cardiovascular abnormalities,C14.240
"heart defects, congenital",C14.240.400
22q11 deletion syndrome,C14.240.400.021
digeorge syndrome,C14.240.400.021.500
alagille syndrome,C14.240.400.044
aortic coarctation,C14.240.400.090
aortico-ventricular tunnel,C14.240.400.118
arrhythmogenic right ventricular dysplasia,C14.240.400.145
barth syndrome,C14.240.400.172


In [12]:
# Write the name-indexed frame (with its "ID" column) to cvd.csv.
CVD.to_csv("cvd.csv")

In [13]:
# Row count of the CVD frame.
len(CVD)

672

### CVD Categories

data: https://docs.google.com/spreadsheets/d/1H1sO0Lgpc623sUNEZDD5h8tIx7-znLo6QQMwszCrfQY/edit#gid=0

In [14]:
# CVD categories used for grouping terms. Each entry carries a display name,
# a short abbreviation, and the root tree numbers whose subtrees belong to it.
cvd_categories = [
    {
        "name": "Cardiomyopathies",
        "abbr": "CM",
        "roots": ["C14.280.238", "C14.280.434"],  # heart failure is included
    },
    {
        "name": "Arrhythmias, Cardiac",
        "abbr": "ARR",
        "roots": ["C14.280.067"],
    },
    {
        "name": "Heart Defects, Congenital",
        "abbr": "CHD",
        "roots": ["C14.280.400"],
    },
    {
        "name": "Heart Valve Diseases",
        "abbr": "VD",
        "roots": ["C14.280.484"],
    },
    {
        "name": "Myocardial Ischemia",
        "abbr": "IHD",
        "roots": ["C14.280.647"],
    },
    {
        "name": "Cardiac Conduction System Disease",
        "abbr": "CCS",
        "roots": ["C14.280.123"],
    },
    {
        "name": "Ventricular Outflow Obstruction",
        "abbr": "VOO",
        "roots": ["C14.280.955"],
    },
    {
        "name": "Other Heart Disease",
        "abbr": "OHD",
        "roots": [
            "C14.280.195",
            "C14.280.282",
            "C14.280.383",
            "C14.280.470",
            "C14.280.945",
            "C14.280.459",
            "C14.280.720",
        ],
    },
]

In [15]:
# Persist the category definitions as JSON.
with open("cvd_categories.json", "w") as categories_out:
    json.dump(cvd_categories, categories_out)

In [16]:
# One empty term list per category abbreviation; the lists are filled by the
# collection loop in a later cell.
cvdDict = {}
for item in cvd_categories:
    cvdDict[item["abbr"]] = []
cvdDict

{'ARR': [],
 'CCS': [],
 'CHD': [],
 'CM': [],
 'IHD': [],
 'OHD': [],
 'VD': [],
 'VOO': []}

In [17]:
# Collect, for each CVD category, the names of all terms located at or below
# one of the category's root tree numbers.

# Rebuild the lists from scratch so that re-running this cell does not append
# the same names a second time.
cvdDict = {item['abbr']: [] for item in cvd_categories}

for item in cvd_categories:
    abbr = item['abbr']
    roots = item['roots']
    for meshid in roots:
        # Scan the CVD frame (index = term name, column "ID" = tree number).
        for name, ID in zip(CVD.index, CVD['ID']):
            # Match the root itself or any descendant ("<root>.<...>"). Unlike
            # a fixed 11-character slice, this also works for roots of any depth.
            # NOTE(review): a name found at several matching tree positions is
            # appended once per position; confirm duplicates are acceptable.
            if ID == meshid or ID.startswith(meshid + "."):
                cvdDict[abbr].append(name)

In [18]:
# Persist the abbreviation -> term-name lists as JSON.
with open("cvd_abr2mesh.json", "w") as abbr2mesh_out:
    json.dump(cvdDict, abbr2mesh_out)

#### OS MeSH