# Download 
https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2024.gz   
or   
https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2024.xml

In [1]:
#!wget --timestamping --directory-prefix download/ ftp://nlmpubs.nlm.nih.gov/online/mesh/.xmlmesh/desc2015.gz

In [2]:
import os
import gzip
import io
import json

import xml.etree.ElementTree as ET

import networkx
import pandas

In [3]:
download_dir ='../data/MeSH'

In [4]:
# Read MeSH xml release
#xml_path = '../data/desc2024.gz' #os.path.join('download', 'desc2015.gz')
#with gzip.open(xml_path) as xml_file:
#    tree = ET.parse(xml_file)
#root = tree.getroot()

xml_path = os.path.join(download_dir, 'desc2024.xml') 
tree = ET.parse(xml_path)
root = tree.getroot()

In [5]:
# Extract mesh terms
term_dicts = list()
for descriptor in root:
    for concept in descriptor.findall('ConceptList/Concept'):
        for term in concept.findall('TermList/Term'):
            term_dict = {
                'DescriptorUI': descriptor.findtext('DescriptorUI'),
                'ConceptUI': concept.findtext('ConceptUI'),
                'TermUI': term.findtext('TermUI'),
                'TermName': term.findtext('String')
            }
            term_dict.update(concept.attrib)
            term_dict.update(term.attrib)
            term_dicts.append(term_dict)

columns = ['DescriptorUI', 'ConceptUI', 'PreferredConceptYN', 'TermUI', 'TermName',
           'ConceptPreferredTermYN', 'IsPermutedTermYN', 'LexicalTag', 'RecordPreferredTermYN'] #, 'PrintFlagYN'
term_df = pandas.DataFrame(term_dicts)[columns]
term_df.to_csv(os.path.join(download_dir, 'descriptor-terms.tsv'), index=False, sep='\t')
term_df

Unnamed: 0,DescriptorUI,ConceptUI,PreferredConceptYN,TermUI,TermName,ConceptPreferredTermYN,IsPermutedTermYN,LexicalTag,RecordPreferredTermYN
0,D000001,M0000001,Y,T000002,Calcimycin,Y,N,NON,Y
1,D000001,M0000001,Y,T001124965,"4-Benzoxazolecarboxylic acid, 5-(methylamino)-...",N,N,NON,N
2,D000001,M0353609,N,T000001,A-23187,Y,N,LAB,N
3,D000001,M0353609,N,T000001,A 23187,N,Y,LAB,N
4,D000001,M0353609,N,T000004,A23187,N,N,LAB,N
...,...,...,...,...,...,...,...,...,...
263931,D066331,M0594254,Y,T859704,Laser-Evoked Potentials,Y,N,NON,Y
263932,D066331,M0594254,Y,T859704,Laser Evoked Potentials,N,Y,NON,N
263933,D066331,M0594254,Y,T859704,Laser-Evoked Potential,N,Y,NON,N
263934,D066331,M0594254,Y,T859704,"Potential, Laser-Evoked",N,Y,NON,N


In [6]:
# Test whether MeSH term names are unique
len(term_df) == len(set(term_df.TermName))

False

In [7]:
dup_termname = term_df[term_df.duplicated('TermName', keep=False) == True]
dup_termname #https://www.ncbi.nlm.nih.gov/mesh/?term=D000081207 it do have duplicated entry term

Unnamed: 0,DescriptorUI,ConceptUI,PreferredConceptYN,TermUI,TermName,ConceptPreferredTermYN,IsPermutedTermYN,LexicalTag,RecordPreferredTermYN
19706,D000081207,M000650100,N,T000999067,Inherited Immunodeficiency Syndrome,N,N,NON,N
19715,D000081207,M000650100,N,T000958692,Inherited Immunodeficiency Syndrome,N,Y,NON,N
50398,D002522,M0003848,Y,T007361,African Green Monkey,N,N,EPO,N
50410,D002522,M0003850,N,T007362,African Green Monkey,N,Y,EPO,N
71491,D005175,M0008159,Y,T751631,Factor 12 Deficiency,N,N,NON,N
71498,D005175,M0008159,Y,T000913569,Factor 12 Deficiency,N,Y,NON,N
80191,D006319,M0009934,Y,T001102790,Sensoryneural Deafness,N,N,NON,N
80201,D006319,M0009934,Y,T001102791,Sensoryneural Deafness,N,Y,NON,N
99886,D008872,M0013839,Y,T001078150,Micro Waves,N,Y,NON,N
99887,D008872,M0013839,Y,T001078148,Micro Waves,N,N,NON,N


In [8]:
# Parse MeSH xml release
terms = list()

for elem in root:
    term = dict()
    term['mesh_id'] = elem.findtext('DescriptorUI')
    term['mesh_name'] = elem.findtext('DescriptorName/String')
    term['semantic_types'] = list({x.text for x in elem.findall(
        'ConceptList/Concept/SemanticTypeList/SemanticType/SemanticTypeUI')})
    term['tree_numbers'] = [x.text for x in elem.findall('TreeNumberList/TreeNumber')]
    terms.append(term)

In [9]:
terms

[{'mesh_id': 'D000001',
  'mesh_name': 'Calcimycin',
  'semantic_types': [],
  'tree_numbers': ['D02.355.291.933.125',
   'D02.540.576.625.125',
   'D03.633.100.221.173',
   'D04.345.241.654.125',
   'D04.345.674.625.125']},
 {'mesh_id': 'D000002',
  'mesh_name': 'Temefos',
  'semantic_types': [],
  'tree_numbers': ['D02.705.400.625.800',
   'D02.705.539.345.800',
   'D02.886.300.692.800']},
 {'mesh_id': 'D000003',
  'mesh_name': 'Abattoirs',
  'semantic_types': [],
  'tree_numbers': ['J01.576.423.200.700.100', 'J03.540.020']},
 {'mesh_id': 'D000004',
  'mesh_name': 'Abbreviations as Topic',
  'semantic_types': [],
  'tree_numbers': ['L01.559.598.400.556.131']},
 {'mesh_id': 'D000005',
  'mesh_name': 'Abdomen',
  'semantic_types': [],
  'tree_numbers': ['A01.923.047']},
 {'mesh_id': 'D000006',
  'mesh_name': 'Abdomen, Acute',
  'semantic_types': [],
  'tree_numbers': ['C23.888.592.612.054.200', 'C23.888.821.030.249']},
 {'mesh_id': 'D000007',
  'mesh_name': 'Abdominal Injuries',
  'sem

In [11]:
# Determine ontology parents
tree_number_to_id = {tn: term['mesh_id'] for term in terms for tn in term['tree_numbers']}

for term in terms:
    parents = set()
    for tree_number in term['tree_numbers']:
        try:
            parent_tn, self_tn = tree_number.rsplit('.', 1)
            parents.add(tree_number_to_id[parent_tn])
        except ValueError:
            pass
    term['parents'] = list(parents)

In [12]:
term

{'mesh_id': 'D066331',
 'mesh_name': 'Laser-Evoked Potentials',
 'semantic_types': [],
 'tree_numbers': ['G07.265.216.500.400.500', 'G11.561.200.500.400.500'],
 'parents': ['D005073']}

In [13]:
path = os.path.join(download_dir, 'mesh.json')
with open(path, 'w') as write_file:
    json.dump(terms, write_file, indent=2)

In [14]:
# Create a newtorkx directed graph represented mesh
network = networkx.DiGraph()

# add nodes
for term in terms:
    network.add_node(term['mesh_id'], name=term['mesh_name'])

# add edges
for term in terms:
    for parent in term['parents']:
        network.add_edge(parent, term['mesh_id'])

#assert networkx.is_directed_acyclic_graph(network)

networkx.write_gexf(network, os.path.join(download_dir, 'ontology.gexf.gz'))

In [15]:
# Read mesh
path = os.path.join(download_dir, 'mesh.json')
with open(path) as read_file:
    mesh = json.load(read_file)

mesh_df = pandas.DataFrame.from_dict(mesh)[['mesh_id', 'mesh_name']]
mesh_df.to_csv(os.path.join(download_dir, 'terms.tsv'), sep='\t', index=False)
mesh_df

Unnamed: 0,mesh_id,mesh_name
0,D000001,Calcimycin
1,D000002,Temefos
2,D000003,Abattoirs
3,D000004,Abbreviations as Topic
4,D000005,Abdomen
...,...,...
30759,D066310,Digital Divide
30760,D066328,Ventral Striatum
30761,D066329,Protein Aggregates
30762,D066330,"Printing, Three-Dimensional"


In [16]:
# Extract (mesh_id, mesh_tree_number) pairs
rows = []
for term in mesh:
    mesh_id = term['mesh_id']
    mesh_name = term['mesh_name']
    for tree_number in term['tree_numbers']:
        rows.append([mesh_id, mesh_name, tree_number])

tn_df = pandas.DataFrame(rows, columns=['mesh_id', 'mesh_name', 'mesh_tree_number'])
tn_df.to_csv(os.path.join(download_dir, 'tree-numbers.tsv'), sep='\t', index=False)
tn_df

Unnamed: 0,mesh_id,mesh_name,mesh_tree_number
0,D000001,Calcimycin,D02.355.291.933.125
1,D000001,Calcimycin,D02.540.576.625.125
2,D000001,Calcimycin,D03.633.100.221.173
3,D000001,Calcimycin,D04.345.241.654.125
4,D000001,Calcimycin,D04.345.674.625.125
...,...,...,...
64452,D066330,"Printing, Three-Dimensional",J01.897.564
64453,D066330,"Printing, Three-Dimensional",L01.224.108.150.500
64454,D066330,"Printing, Three-Dimensional",L01.296.110.150.500
64455,D066331,Laser-Evoked Potentials,G07.265.216.500.400.500


# Diseases

In [17]:
def is_human_disease(tn):
    """Given a tree number, return whether the heirarchical path suggests a human disease."""
    # F03 (mental disorders)
    if tn.startswith('F03'):
        return True
    # C01 though C21 and C24 -- C26
    for i in list(range(1, 22)) + ['C24', 'C25', 'C26']:
        if tn.startswith('C' + str(i).zfill(2)):
            return True
    # C23 exlcuding C23.888 (Symptoms and Signs)
    if tn.startswith('C23') and not tn.startswith('C23.888'):
        return True
    return False

In [18]:
diseases = {term['mesh_id'] for term in terms if any(map(is_human_disease, term['tree_numbers']))}
len(diseases)

4851

# Symptoms

In [19]:
# Read HSDN symptoms
url = 'https://raw.githubusercontent.com/LABrueggs/HSDN/master/Symptom-Occurence-Output.tsv' #https://github.com/LeoBman/HSDN/tree/master?tab=readme-ov-file
hsdn_symptom_df = pandas.read_table(url, index_col=0)
hsdn_symptoms = hsdn_symptom_df['MeSH Symptom ID']

In [20]:
# find MeSH symptoms
symptoms = networkx.descendants(network, 'D012816') # signs and symptoms https://www.ncbi.nlm.nih.gov/mesh/68012816 
symptom_df = mesh_df[mesh_df.mesh_id.isin(symptoms)]
pandas.options.mode.chained_assignment = None
symptom_df['in_hsdn'] = symptom_df.mesh_id.isin(hsdn_symptoms).astype(int)
symptom_df.to_csv(os.path.join(download_dir, 'symptoms.tsv'), index=False, sep='\t')
symptom_df

Unnamed: 0,mesh_id,mesh_name,in_hsdn
5,D000006,"Abdomen, Acute",1
131,D000067329,"Obesity, Metabolically Benign",0
141,D000067404,Social Communication Disorder,0
147,D000067454,Childhood-Onset Fluency Disorder,0
168,D000067559,Specific Learning Disorder,0
...,...,...,...
30548,D065634,Cerebrospinal Fluid Leak,0
30549,D065635,Benign Paroxysmal Positional Vertigo,0
30645,D065906,Hyperlactatemia,0
30685,D066190,Allesthesia,0


In [21]:
sum(symptom_df.in_hsdn)

317

In [27]:
#compare with hetionet version  438 mesh terms 
symptom_hetionet = pandas.read_table('https://raw.githubusercontent.com/dhimmel/mesh/gh-pages/data/symptoms.tsv') #, index_col=0
symptom_hetionet

Unnamed: 0_level_0,mesh_name,in_hsdn
mesh_id,Unnamed: 1_level_1,Unnamed: 2_level_1
D000006,"Abdomen, Acute",1
D000270,Adie Syndrome,0
D000326,Adrenoleukodystrophy,0
D000334,Aerophagy,1
D000370,Ageusia,1
...,...,...
D064250,Hypertriglyceridemic Waist,0
D065634,Cerebrospinal Fluid Leak,0
D065635,Benign Paroxysmal Positional Vertigo,0
D065906,Hyperlactatemia,0


# Side Effects

In [None]:
side_effects = networkx.descendants(network, 'D064420') # Drug-Related Side Effects and Adverse Reactions
side_effect_df = mesh_df[mesh_df.mesh_id.isin(side_effects)]
len(side_effect_df)