In [1]:
import os
import csv
import re

import networkx
import pandas

#import do_tools

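Download `HumanDO.obo` from https://disease-ontology.org/downloads/ (the file is also available in the ontology repository at
https://github.com/DiseaseOntology/HumanDiseaseOntology/tree/main/src/ontology/HumanDO.obo) and save it in `../data/DiseaseOntology/`, where the cells below expect it.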

In [2]:
import networkx

import IGS_scripts.oboparser as oboparser 

def load_do(path):
    """Parse the OBO file located at ``path`` and return the parser's result.

    ``['is_a']`` is passed as the second argument to ``oboparser.parse``
    (presumably the relationship types to load -- confirm against oboparser).
    """
    parse_arg = ['is_a']
    ontology = oboparser.parse(path, parse_arg)
    return ontology

def do_to_networkx(do):
    """Return a networkx representation of do.

    Every non-obsolete term returned by ``do.get_terms()`` becomes a node of a
    ``networkx.MultiDiGraph``. For each ``(typedef, id_, name)`` entry in a
    term's ``relationships`` an edge is added from the term to
    ``do.get_term(id_)``, keyed by ``typedef``.

    Raises AssertionError if the resulting graph is not a directed acyclic graph.
    """
    dox = networkx.MultiDiGraph()
    dox.add_nodes_from(term for term in do.get_terms() if not term.obsolete)
    # Iterate over a snapshot of the nodes: add_edge inserts its target as a new
    # node when that target is not in the graph yet (for instance a term that
    # was filtered out above), and growing the graph while iterating over it
    # directly raises "RuntimeError: dictionary changed size during iteration".
    for term in list(dox):
        for typedef, id_, name in term.relationships:
            dox.add_edge(term, do.get_term(id_), key=typedef)

    assert networkx.is_directed_acyclic_graph(dox)
    return dox

In [3]:
# Parse the downloaded OBO file and build the networkx graph used by the cells below
download_dir = '../data/DiseaseOntology'
path = os.path.join(download_dir, 'HumanDO.obo')
do = load_do(path)
dox = do_to_networkx(do)

In [4]:
# Build a table of term id, name and description, where the description is the
# leading double-quoted part of the term's definition ('' when there is none)
pattern = re.compile(r'^"(.*?)"')
rows = []
for term in dox:
    quoted = pattern.search(term.definition)
    if quoted:
        description = quoted.group(1)
    else:
        description = ''
    rows.append((term.id, term.name, description))
description_df = pandas.DataFrame(rows, columns=['disease_id', 'name', 'description'])
description_df = description_df.sort_values('disease_id')
#description_df.to_csv(os.path.join(download_dir, 'description.tsv'), sep='\t', index=False)
description_df.head(2)

Unnamed: 0,disease_id,name,description
0,DOID:0001816,angiosarcoma,A vascular cancer that derives_from the cells ...
1,DOID:0002116,pterygium,A corneal disease that is characterized by a t...


In [5]:
# (number of rows, number of columns) of the description table
description_df.shape

(11537, 3)

In [6]:
# Maps resource prefixes as they appear in the xrefs to the shorter names used in
# the output tables; the dated SNOMEDCT releases all collapse to 'SNOMEDCT'.
# Prefixes that are not listed here are kept unchanged by write_xref_row.
xref_rename = {
    'ICD10CM': 'ICD10',
    'ICD9CM': 'ICD9',
    'NCI2009_04D': 'NCI',
    'SNOMEDCT_2010_1_31': 'SNOMEDCT',
    'SNOMEDCT_2013_01_31': 'SNOMEDCT',
    'SNOMEDCT_US_2020_03_01': 'SNOMEDCT',
    'SNOMEDCT_US_2021_07_31': 'SNOMEDCT',
    'SNOMEDCT_US_2021_09_01': 'SNOMEDCT',
    'SNOMEDCT_US_2022_07_31': 'SNOMEDCT',
    'SNOMEDCT_US_2023_03_01': 'SNOMEDCT',
    'SNOMEDCT_US_2023_09_01': 'SNOMEDCT',
    'SNOMEDCT_US_2023_10_01': 'SNOMEDCT',
    'SNOMEDCT_US_2023_11_01': 'SNOMEDCT',
    'UMLS_CUI': 'UMLS',
}

In [7]:
def write_xref_row(writer, doid_code, doid_name, xrefs, rename_dict):
    """Write one row per cross-reference of a single term.

    Each entry of ``xrefs`` is split at its first ':' into a resource and a
    resource id; the resource is replaced by ``rename_dict[resource]`` when it
    is a key of ``rename_dict``. The resulting
    ``[doid_code, doid_name, resource, resource_id]`` rows are sorted and
    handed to ``writer.writerows``.

    Raises ValueError if an xref contains no ':'.
    """
    def build_row(xref):
        prefix, identifier = xref.split(':', 1)
        return [doid_code, doid_name, rename_dict.get(prefix, prefix), identifier]

    writer.writerows(sorted(build_row(xref) for xref in xrefs))

# Write two tab-separated tables: xrefs.tsv holds each term's own xrefs and
# xrefs-prop.tsv additionally holds the xrefs of every networkx ancestor of the
# term in dox.
# The files are opened in a `with` block so they are closed even if a term fails
# to process, and with newline='' as the csv module requires for its writers
# (otherwise extra '\r' characters are emitted on platforms that translate '\n').
with open(os.path.join(download_dir, 'xrefs.tsv'), 'w', newline='') as file_unprop, \
        open(os.path.join(download_dir, 'xrefs-prop.tsv'), 'w', newline='') as file_prop:
    writer_unprop = csv.writer(file_unprop, delimiter='\t')
    writer_prop = csv.writer(file_prop, delimiter='\t')

    for writer in writer_unprop, writer_prop:
        writer.writerow(['doid_code', 'doid_name', 'resource', 'resource_id'])

    # Terms are visited in reverse topological order of dox
    for term in reversed(list(networkx.topological_sort(dox))):
        xrefs = set(term.xrefs)
        xrefs_prop = set(xrefs)
        # networkx.ancestors yields the nodes that have a path to `term`; edges
        # in dox point from a term to the target of its relationship, so these
        # are the terms whose relationship chain leads to `term`.
        for ancestor in networkx.ancestors(dox, term):
            xrefs_prop |= set(ancestor.xrefs)

        write_xref_row(writer_unprop, term.id, term.name, xrefs, xref_rename)
        write_xref_row(writer_prop, term.id, term.name, xrefs_prop, xref_rename)


In [8]:
# Distinct resource names present in the unpropagated xref table written above
import pandas
path = os.path.join(download_dir, 'xrefs.tsv')
xref_df = pandas.read_table(path)
set(xref_df.resource)

{'EFO',
 'GARD',
 'ICD10',
 'ICD11',
 'ICD9',
 'ICDO',
 'KEGG',
 'MEDDRA',
 'MESH',
 'NCI',
 'OMIM',
 'ORDO',
 'SNOMEDCT',
 'UMLS'}

In [9]:
# Create a name-to-term mapping: one row for each term's primary name
# (type 'name') and one row per synonym (type '<lowercased synonym[1]>-synonym')
rows = list()
for term in dox:
    rows.append({'doid': term.id, 'name': term.name, 'type': 'name'})
    for synonym in term.synonyms:
        rows.append({'doid': term.id, 'name': synonym[0], 'type': '{}-synonym'.format(synonym[1].lower())})
path = os.path.join(download_dir, 'term-names.tsv')
# newline='' is required by the csv module so that it alone controls the line
# endings (avoids stray '\r' characters on platforms that translate '\n')
with open(path, 'w', newline='') as write_file:
    writer = csv.DictWriter(write_file, delimiter='\t', fieldnames=['doid', 'name', 'type'])
    writer.writeheader()
    writer.writerows(rows)

## Create useful datasets for DO Slim -- a non-redundant set of diseases

In [10]:
# Load the DO slim term table and preview its first three rows
path = os.path.join(download_dir, 'slim-terms.tsv')
slim_df = pandas.read_table(path)
slim_df.head(3)

Unnamed: 0,doid,name,source,pathophysiology
0,DOID:2531,hematologic cancer,DOcancerslim,neoplastic
1,DOID:1319,brain cancer,DOcancerslim,neoplastic
2,DOID:1324,lung cancer,DOcancerslim,neoplastic


In [11]:
# Check for terms that are in the slim table but not in the DO; slim_doids is
# restricted to the ids the DO knows, and the unmatched rows are displayed
all_doids = set(do.get_term_ids())
slim_doids = set(slim_df['doid'])
unmatched = slim_doids.difference(all_doids)
slim_doids = slim_doids.intersection(all_doids)
slim_df[slim_df['doid'].isin(unmatched)]

Unnamed: 0,doid,name,source,pathophysiology
55,DOID:9917,pleural cancer,DOcancerslim,neoplastic


In [12]:
# Check for node redundancy: print every slim term that has another slim term
# among its networkx descendants in dox
slim_terms = {do.get_term(doid) for doid in slim_doids}
for term in slim_terms:
    reachable = networkx.descendants(dox, term)
    conflicts = {other.name for other in reachable.intersection(slim_terms)}
    if conflicts:
        print(term.name, conflicts)

In [13]:
# Reload the xref tables written earlier. All four columns hold identifiers or
# names, so they are read as strings: this keeps pandas from inferring numeric
# types for resource ids that happen to look like numbers (which would drop
# leading zeros or reformat decimal codes).
path = os.path.join(download_dir, 'xrefs.tsv')
map_unprop_df = pandas.read_table(path, dtype=str)

path = os.path.join(download_dir, 'xrefs-prop.tsv')
map_prop_df = pandas.read_table(path, dtype=str)

In [14]:
# Restrict both xref tables to the slim terms by merging on the shared
# doid_code column
slim_df = slim_df.rename(columns={'doid': 'doid_code'})
slim_codes = slim_df[['doid_code']]
slim_map_unprop_df = slim_codes.merge(map_unprop_df)
slim_map_prop_df = slim_codes.merge(map_prop_df)
slim_map_prop_df.head(3)

Unnamed: 0,doid_code,doid_name,resource,resource_id
0,DOID:2531,hematologic cancer,EFO,95
1,DOID:2531,hematologic cancer,EFO,96
2,DOID:2531,hematologic cancer,EFO,183


In [15]:
# Save the slim-restricted xref tables as tab-separated files without the index
for filename, frame in (
    ('xrefs-slim.tsv', slim_map_unprop_df),
    ('xrefs-prop-slim.tsv', slim_map_prop_df),
):
    path = os.path.join(download_dir, filename)
    frame.to_csv(path, sep='\t', index=False)

In [16]:
# For every slim term, list the term itself (distance 0) and every term that can
# reach it in dox, together with the length of the shortest path to the slim term.
rows = list()
for term in slim_terms:
    # A single search per slim term instead of one per (subterm, term) pair:
    # with only `target` given, shortest_path_length returns a dict that maps
    # every source node able to reach `term` -- including `term` itself at
    # distance 0 -- to its shortest path length.
    distance_to_term = networkx.shortest_path_length(dox, target=term)
    for subterm, distance in distance_to_term.items():
        rows.append([term.id, term.name, subterm.id, subterm.name, distance])
rows.sort()

slim_prop_df = pandas.DataFrame(rows, columns=['slim_id', 'slim_name', 'subsumed_id', 'subsumed_name', 'min_distance'])
path = os.path.join(download_dir, 'slim-terms-prop.tsv')
slim_prop_df.to_csv(path, sep='\t', index=False)

## Compare with Hetionet

In [7]:
import pandas
import os

download_dir = '../data/DiseaseOntology'
path = os.path.join(download_dir, 'xrefs-slim.tsv')
# Keep only the MESH cross-references of the slim terms and expose the id as
# mesh_id (the original note records 1244 rows in the table as read)
disease_df = (
    pandas.read_table(path)
    .query('resource == "MESH"')
    .drop(columns=['resource'])
    .rename(columns={'resource_id': 'mesh_id'})
)
disease_df

Unnamed: 0,doid_code,doid_name,mesh_id
0,DOID:2531,hematologic cancer,D019337
7,DOID:1319,brain cancer,D001932
44,DOID:263,kidney cancer,D007680
62,DOID:1793,pancreatic cancer,D010190
77,DOID:4159,skin cancer,D012878
...,...,...,...
1207,DOID:1312,focal segmental glomerulosclerosis,D005923
1217,DOID:216,dental caries,D003731
1225,DOID:2355,anemia,D000740
1231,DOID:594,panic disorder,D016584


In [8]:
# Number of distinct DO terms that have at least one MESH cross-reference
disease_df['doid_code'].nunique()

121

In [9]:
# Rows whose doid_code occurs more than once, i.e. terms with several MESH ids
disease_df[disease_df.duplicated('doid_code', keep=False)]

Unnamed: 0,doid_code,doid_name,mesh_id
236,DOID:1192,peripheral nervous system neoplasm,D010524
237,DOID:1192,peripheral nervous system neoplasm,D018317
333,DOID:4045,muscle cancer,D009217
334,DOID:4045,muscle cancer,D019042
509,DOID:5612,spinal cancer,D013120
510,DOID:5612,spinal cancer,D013125
596,DOID:3393,coronary artery disease,D003324
597,DOID:3393,coronary artery disease,D003327
598,DOID:3393,coronary artery disease,D017202
1047,DOID:13189,gout,D006073


In [2]:
import pandas
url = 'https://raw.githubusercontent.com/dhimmel/disease-ontology/72614ade9f1cc5a5317b8f6836e1e464b31d5587/data/xrefs-slim.tsv'
# Same table from the pinned dhimmel/disease-ontology commit; there the rows
# kept are those whose resource is "MSH"
disease_df_hetionet = (
    pandas.read_table(url)
    .query('resource == "MSH"')
    .drop(columns=['resource'])
    .rename(columns={'resource_id': 'mesh_id'})
)
disease_df_hetionet

Unnamed: 0,doid_code,doid_name,mesh_id
0,DOID:2531,hematologic cancer,D019337
7,DOID:1319,brain cancer,D001932
27,DOID:1324,lung cancer,D008175
44,DOID:263,kidney cancer,D007680
57,DOID:1793,pancreatic cancer,D010190
...,...,...,...
1761,DOID:1312,focal segmental glomerulosclerosis,D005923
1776,DOID:216,dental caries,D003731
1787,DOID:2355,anemia,D000740
1797,DOID:594,panic disorder,D016584


In [3]:
# Number of distinct DO terms with at least one "MSH" cross-reference in the Hetionet table
disease_df_hetionet['doid_code'].nunique()

133

In [5]:
# Rows of the Hetionet table whose doid_code occurs more than once
disease_df_hetionet[disease_df_hetionet.duplicated('doid_code', keep=False)]

Unnamed: 0,doid_code,doid_name,mesh_id
198,DOID:10283,prostate cancer,C537243
199,DOID:10283,prostate cancer,D011471
607,DOID:8778,Crohn's disease,C536215
608,DOID:8778,Crohn's disease,D003424
705,DOID:3393,coronary artery disease,D003324
706,DOID:3393,coronary artery disease,D003327
707,DOID:3393,coronary artery disease,D017202
966,DOID:10652,Alzheimer's disease,C536597
967,DOID:10652,Alzheimer's disease,D000544
1152,DOID:5408,Paget's disease of bone,C538098


In [20]:
# Outer-join the Hetionet table (left) with the current table (right) on doid_code;
# the `_merge` indicator column records which side(s) each row came from
check = disease_df_hetionet.merge(disease_df, on='doid_code', how='outer', indicator=True)
check['_merge'].unique()

['both', 'left_only']
Categories (3, object): ['left_only', 'right_only', 'both']

In [21]:
# Terms that have a MeSH id in the Hetionet table but none in the current one
check.loc[check['_merge'] == 'left_only']

Unnamed: 0,doid_code,doid_name_x,mesh_id_x,doid_name_y,mesh_id_y,_merge
2,DOID:1324,lung cancer,D008175,,,left_only
14,DOID:8850,salivary gland cancer,D012468,,,left_only
35,DOID:10153,ileum cancer,D007078,,,left_only
36,DOID:1115,sarcoma,D012509,,,left_only
40,DOID:11920,tracheal cancer,D014134,,,left_only
43,DOID:1725,peritoneum cancer,D010534,,,left_only
110,DOID:0050741,alcohol dependence,D000437,,,left_only
118,DOID:8398,osteoarthritis,D010003,,,left_only
133,DOID:90,degenerative disc disease,D055959,,,left_only
140,DOID:4481,allergic rhinitis,D065631,,,left_only


In [22]:
# Non-empty result means DOID:1324 is among the slim ids that matched the DO
slim_doids&{'DOID:1324'}

{'DOID:1324'}

https://disease-ontology.org/do

In [23]:
# MESH cross-references of DOID:1324 in the slim, unpropagated table
slim_map_unprop_df.query('doid_code == "DOID:1324" and resource == "MESH"')

Unnamed: 0,doid_code,doid_name,resource,resource_id


In [24]:
# MESH cross-references of DOID:1324 in the full, unpropagated table
map_unprop_df.query('doid_code == "DOID:1324" and resource == "MESH"')

Unnamed: 0,doid_code,doid_name,resource,resource_id


In [25]:
# Rebuild the unpropagated xref rows in memory (same split-and-rename logic as
# write_xref_row), visiting terms in reverse topological order of dox
rows = [
    [term.id, term.name, xref_rename.get(prefix, prefix), identifier]
    for term in reversed(list(networkx.topological_sort(dox)))
    for prefix, identifier in (xref.split(':', 1) for xref in set(term.xrefs))
]

In [26]:
# The Hetionet table's row(s) for DOID:1324, for comparison with the lookups above
disease_df_hetionet[disease_df_hetionet['doid_code']=='DOID:1324']

Unnamed: 0,doid_code,doid_name,mesh_id
27,DOID:1324,lung cancer,D008175
