## human-constraint

In [11]:
import pandas
import obonet
import os

In [12]:
# Directory holding the Uberon input files and the tables this notebook writes
download_dir = '../data/Uberon'

In [13]:
# Load the basic Uberon release into a networkx MultiDiGraph and report its size
path = os.path.join(download_dir, 'uberon-basic.obo')
with open(path) as handle:
    ontology = obonet.read_obo(handle)

# Node count first, then edge count
for size in (ontology.number_of_nodes(), ontology.number_of_edges()):
    print(size)

14769
35165


In [14]:
# Extract the xref resources that indicate human terms.
# Each header entry is split into "<resource> <relationship> <genus>"; a resource
# is kept when its genus is NCBITaxon:9606.
declarations = (
    entry.split(' ')
    for entry in ontology.graph['treat-xrefs-as-reverse-genus-differentia']
)
human_xrefs = {
    resource
    for resource, _relationship, genus in declarations
    if genus == 'NCBITaxon:9606'
}
human_xrefs

{'DHBA', 'EHDAA2', 'FMA', 'HBA', 'HsapDv', 'KUPO'}

In [15]:
# Split each raw xref at its first ':' and store the pieces on the node as 'xrefs'
for _, attributes in ontology.nodes(data=True):
    raw_xrefs = attributes.get('xref', [])
    attributes['xrefs'] = [raw.split(':', 1) for raw in raw_xrefs]

In [16]:
def get_children(graph, node, in_keys = {'is_a', 'part_of'}, out_keys = set()):
    """
    Return the set of nodes reachable from `node` through the selected edges.

    An incoming edge (u -> current) is followed to `u` when its key is in
    `in_keys`; an outgoing edge (current -> v) is followed to `v` when its key
    is in `out_keys`. The same key sets are applied at every level of the
    traversal, so a caller-supplied key such as 'develops_from' is honoured
    for indirect children as well as direct ones.

    Parameters
    ----------
    graph : object exposing `in_edges(node, keys=True)` and
        `out_edges(node, keys=True)`, each yielding `(u, v, key)` triples.
    node : the starting node.
    in_keys : edge keys to follow against edge direction.
    out_keys : edge keys to follow along edge direction.

    Returns
    -------
    set
        All reachable nodes. The starting node itself is never included,
        even if the selected edges form a cycle through it.
    """
    # The default sets are only read, never mutated, so sharing them is safe.
    # Iterative traversal with a visited set: terminates on cycles and walks
    # each node once instead of re-exploring shared subgraphs.
    seen = {node}
    pending = [node]
    while pending:
        current = pending.pop()
        neighbors = [u for u, v, key in graph.in_edges(current, keys=True) if key in in_keys]
        neighbors += [v for u, v, key in graph.out_edges(current, keys=True) if key in out_keys]
        for neighbor in neighbors:
            if neighbor not in seen:
                seen.add(neighbor)
                pending.append(neighbor)
    seen.discard(node)
    return seen

In [24]:
# Build the positive-evidence table: one row per UBERON term with a 0/1 flag
rows = []
for node, data in ontology.nodes(data=True):
    if not node.startswith('UBERON:'):
        continue

    # Find whether term is in humans by seeing if the node, or any node
    # returned by get_children, xrefs one of the human-specific terminologies
    children = get_children(ontology, node, in_keys = {'is_a', 'part_of', 'develops_from'})
    child_xrefs = set()
    for child in children | {node}:
        # xref[0] is the resource part of each parsed xref
        child_xrefs |= {xref[0] for xref in ontology.nodes[child]['xrefs']}
    human = int(bool(child_xrefs & human_xrefs))

    rows.append([node, data['name'], human])

pos_df = pandas.DataFrame(rows, columns=['uberon_id', 'uberon_name', 'positive_evidence'])

In [26]:
# Inspect the positive-evidence row of a single term
pos_df.loc[pos_df['uberon_id'] == 'UBERON:0000022']

Unnamed: 0,uberon_id,uberon_name,positive_evidence
20,UBERON:0000022,feather,0


In [10]:
# Create the negative-evidence table: every term of uberon.obo, flagged by
# whether it also appears in human-view.obo
def _read_obo_terms(obo_path):
    """Read an OBO file and tabulate its named terms.

    Returns a tuple `(graph, frame, unnamed)`: the graph produced by
    `obonet.read_obo`, a DataFrame with columns 'Uberon ID' and 'Uberon name'
    holding one row per node that has a 'name', and the list of nodes that
    have none.
    """
    with open(obo_path) as read_file:
        graph = obonet.read_obo(read_file)
    named = []
    unnamed = []
    for node, data in graph.nodes(data=True):
        # Explicit membership test instead of a bare `except:`, so unrelated
        # errors are not silently swallowed
        if 'name' in data:
            named.append((node, data['name']))
        else:
            unnamed.append(node)
    frame = pandas.DataFrame(named, columns=['Uberon ID', 'Uberon name'])
    return graph, frame, unnamed

ext_obo, ext_df, _ = _read_obo_terms(os.path.join(download_dir, 'uberon.obo'))
human_obo, human_df, err_nodes = _read_obo_terms(os.path.join(download_dir, 'human-view.obo'))

# Left merge keeps every row of ext_df; the human name column is filled only
# for IDs that also occur in human_df
ext_human_df = pandas.merge(ext_df, human_df, on='Uberon ID', how='left', suffixes=('', ' human'))

# '9606' is 'T' when the ID was found in the human view and 'F' otherwise
in_human_view = ext_human_df['Uberon name human'].notnull()
ext_human_df['9606'] = in_human_view.map(lambda present: 'T' if present else 'F')
ext_human_df = ext_human_df.drop(columns=['Uberon name human'])

ext_human_df.to_csv(os.path.join(download_dir, 'ext_human_constraints.csv'), index=False)
# NOTE(review): the original cell ended with the note "#24945", presumably the row count at the time - confirm
ext_human_df

In [34]:
# Read the human view into a graph
with open(os.path.join(download_dir, 'human-view.obo')) as read_file:
    human_obo = obonet.read_obo(read_file)

# Tabulate the named nodes; nodes without a 'name' are collected in err_nodes
term_rows = []
err_nodes = []
for node, data in human_obo.nodes(data=True):
    # Explicit membership test instead of a bare `except:`, so unrelated
    # errors are not silently swallowed
    if 'name' in data:
        term_rows.append((node, data['name']))
    else:
        err_nodes.append(node)
human_df = pandas.DataFrame(term_rows, columns=['Uberon ID', 'Uberon name'])
human_df

Unnamed: 0,Uberon ID,Uberon name
0,CL:0000000,cell
1,CL:0000008,migratory cranial neural crest cell
2,CL:0000011,migratory trunk neural crest cell
3,CL:0000019,sperm
4,CL:0000020,spermatogonium
...,...,...
14501,UBERON:8600045,right lateral wall of urinary bladder
14502,UBERON:8600046,anterior wall of urinary bladder
14503,UBERON:8600047,posterior wall of urinary bladder
14504,UBERON:8600048,apex of urinary bladder


In [35]:
# Check whether a single term is present in the human view table
human_df.loc[human_df['Uberon ID'] == 'UBERON:0000022']

Unnamed: 0,Uberon ID,Uberon name
260,UBERON:0000022,feather


In [31]:
# Load the negative-evidence table and recode its 'T'/'F' column as 1/0.
# A hosted copy was previously read from
# https://raw.githubusercontent.com/dhimmel/uberon/gh-pages/download/ext_human_constraints.tsv
# (the original note beside that line reads "17066").
neg_df = (
    pandas.read_table('../data/Uberon/ext_human_constraints.csv', sep=',')
    .rename(columns={'Uberon ID': 'uberon_id', '9606': 'no_negative_evidence'})
    .loc[:, ['uberon_id', 'no_negative_evidence']]
    .assign(no_negative_evidence=lambda frame: (frame['no_negative_evidence'] == 'T').astype(int))
)

In [33]:
# Inspect the negative-evidence row of a single term
neg_df.loc[neg_df['uberon_id'] == 'UBERON:0000022']

Unnamed: 0,uberon_id,no_negative_evidence
10485,UBERON:0000022,1


In [36]:
# Join positive and negative evidence (default merge: inner join on the
# columns the two frames share) and save the result as a TSV.
# Original author note: "result does not change after merge".
human_df = pos_df.merge(neg_df)
constraint_path = os.path.join(download_dir, 'human-constraint.tsv')
human_df.to_csv(constraint_path, index=False, sep='\t')
human_df

Unnamed: 0,uberon_id,uberon_name,positive_evidence,no_negative_evidence
0,UBERON:0000000,processual entity,1,1
1,UBERON:0000002,uterine cervix,1,1
2,UBERON:0000003,naris,1,1
3,UBERON:0000004,nose,1,1
4,UBERON:0000005,chemosensory organ,1,1
...,...,...,...,...
14475,UBERON:8600045,right lateral wall of urinary bladder,1,1
14476,UBERON:8600046,anterior wall of urinary bladder,1,1
14477,UBERON:8600047,posterior wall of urinary bladder,1,1
14478,UBERON:8600048,apex of urinary bladder,1,1


## Process

In [41]:
import collections
import re

import pandas

import obonet

Download the most recent Uberon release:

•	http://purl.obolibrary.org/obo/uberon/uberon-basic.obo

•	http://purl.obolibrary.org/obo/uberon/uberon-ext.obo

! wget --no-verbose --timestamping --directory-prefix download/ http://purl.obolibrary.org/obo/uberon/ext.obo
! wget --no-verbose --timestamping --directory-prefix download/ http://purl.obolibrary.org/obo/uberon/basic.obo

In [42]:
# Parse the OBO file at `path` and tally its edges by key (relationship type)
with open(path) as handle:
    basic = obonet.read_obo(handle)
edge_key_counts = collections.Counter(key for _, _, key in basic.edges(keys=True))
dict(edge_key_counts)

{'present_in_taxon': 2092,
 'is_a': 20506,
 'part_of': 9058,
 'develops_from': 1330,
 'fma_set_term': 43,
 'immediate_transformation_of': 71,
 'never_in_taxon': 301,
 'mutually_spatially_disjoint_with': 794,
 'dc-contributor': 837,
 'dubious_for_taxon': 43,
 'seeAlso': 21,
 'transformation_of': 47,
 'has_no_connections_with': 2,
 'ambiguous_for_taxon': 4,
 'implements_design_pattern': 5,
 'source_atlas': 11}

In [43]:
# Extract terms, cross-references and subset memberships from the graph
term_rows = []
xref_rows = []
subset_rows = []

for node, data in basic.nodes(data=True):
    # Skip a node without a name and keep going. The previous `break` ended
    # the loop at the first such node and silently dropped every later node.
    if 'name' not in data:
        continue
    term_rows.append((node, data['name']))

    # One row per (term, xref) and per (term, subset) pair
    xref_rows.extend((node, xref) for xref in data.get('xref', []))
    subset_rows.extend((node, subset) for subset in data.get('subset', []))

term_df = pandas.DataFrame(term_rows, columns=['uberon_id', 'uberon_name']).sort_values(['uberon_id', 'uberon_name'])
xref_df = pandas.DataFrame(xref_rows, columns=['uberon_id', 'xref']).sort_values(['uberon_id', 'xref'])
subset_df = pandas.DataFrame(subset_rows, columns=['uberon_id', 'subset']).sort_values(['uberon_id', 'subset'])

In [28]:
# Save the table of Uberon terms and preview its first rows
terms_path = os.path.join(download_dir, 'terms.tsv')
term_df.to_csv(terms_path, index=False, sep='\t')
term_df.head()

Unnamed: 0,uberon_id,uberon_name
0,UBERON:0000000,processual entity
1,UBERON:0000002,uterine cervix
2,UBERON:0000003,naris
3,UBERON:0000004,nose
4,UBERON:0000005,chemosensory organ


In [29]:
# Build a lookup from MeSH tree number to MeSH ID, used to update xrefs
# whose identifier is a tree number
url = '../data/MeSH/tree-numbers.tsv'
tree_number_df = pandas.read_table(url)
tn_to_id = {
    tree_number: mesh_id
    for tree_number, mesh_id in zip(tree_number_df['mesh_tree_number'], tree_number_df['mesh_id'])
}

def update_xref(x):
    """
    Replace a MESH xref whose identifier is a tree number with its mapped ID.

    `x` is an xref string of the form '<vocab>:<identifier>'. The value is
    returned unchanged when it has no ':' separator, when the vocabulary is
    not 'MESH', when the identifier contains 'D' followed by six digits, or
    when the identifier has no (truthy) entry in `tn_to_id`. Otherwise the
    value stored in `tn_to_id` for that identifier is returned.
    """
    # partition never raises on a value without ':' (split + unpack did)
    vocab, separator, identifier = x.partition(':')
    if not separator or vocab != 'MESH':
        return x
    if re.search('D[0-9]{6}', identifier):
        return x
    # NOTE(review): the mapped value is returned exactly as stored in tn_to_id;
    # if those IDs carry no 'MESH:' prefix the result will not start with
    # 'MESH:' like other MeSH xrefs - confirm this is intended
    return tn_to_id.get(identifier) or x

# Rewrite every xref through update_xref
xref_df['xref'] = xref_df['xref'].map(update_xref)

# Save the table of cross-references and preview its first rows
xref_path = os.path.join(download_dir, 'xref.tsv')
xref_df.to_csv(xref_path, index=False, sep='\t')
xref_df.head()

Unnamed: 0,uberon_id,xref
0,UBERON:0000000,BFO:0000003
1,UBERON:0000002,BTO:0001421
2,UBERON:0000002,BTO:0002249
3,UBERON:0000002,CALOHA:TS-0134
4,UBERON:0000002,EFO:0000979


In [59]:
# Save the table of term subsets, then index term IDs by subset name
subset_df.to_csv(os.path.join(download_dir, 'subset.tsv'), index=False, sep='\t')
subset_dict = {}
for subset_name, members in subset_df.groupby('subset'):
    subset_dict[subset_name] = set(members.uberon_id)
subset_df.head()

Unnamed: 0,uberon_id,subset
0,UBERON:0000000,common_anatomy
1,UBERON:0000000,upper_level
2,UBERON:0000002,efo_slim
3,UBERON:0000002,human_reference_atlas
4,UBERON:0000002,pheno_slim


## Create `hetio-slim`

`hetio-slim` is a subset of terms created for our [specific project](https://dx.doi.org/10.15363/thinklab.4):

+ potentially human-relevant (definitively non-human terms are removed)
+ in `uberon_slim` -- We chose a restrictive subset of Uberon terms because the vast extent of tissue-specific gene expression edges can become computationally troubling. We did not include cell types from the Cell Ontology because this ontology lags behind Uberon in terms of subset assignments, cross-references, and documentation.
+ not in `non_informative`, `upper_level`, `grouping_class`
+ have a MeSH cross-reference

In [45]:
# IDs of the terms whose no_negative_evidence flag is 1
human_df = pandas.read_table(os.path.join(download_dir, 'human-constraint.tsv'))
human_ids = set(human_df.loc[human_df['no_negative_evidence'] == 1, 'uberon_id'])

In [65]:
# Keep terms that are in human_ids, have a MeSH xref, are not in
# [non_informative, upper_level, grouping_class], and are in uberon_slim
merged_df = term_df[term_df.uberon_id.isin(human_ids)].merge(xref_df)
# mesh_id is the part after 'MESH:'; rows with any other xref get '' and are dropped
merged_df['mesh_id'] = merged_df.xref.map(lambda x: x.split(':', 1)[1] if x and x.startswith('MESH:') else '')
merged_df = merged_df[merged_df.mesh_id != ''].drop(columns=['xref'])
exclude = subset_dict['non_informative'] | subset_dict['upper_level'] | subset_dict['grouping_class']
# `~` is the operator for inverting a boolean mask (unary minus is not meant for booleans)
merged_df = merged_df[~merged_df.uberon_id.isin(exclude)]
merged_df = merged_df[merged_df.uberon_id.isin(subset_dict['uberon_slim'])]
merged_df.head()

Unnamed: 0,uberon_id,uberon_name,mesh_id
11,UBERON:0000002,uterine cervix,D002584
37,UBERON:0000004,nose,D009666
57,UBERON:0000006,islet of Langerhans,D007515
83,UBERON:0000007,pituitary gland,D010902
117,UBERON:0000010,peripheral nervous system,D017933


In [33]:
# Number of rows in the filtered table
merged_df.shape[0]

652

In [34]:
# Add the mesh_name column by merging with the local MeSH term table
url = '../data/MeSH/terms.tsv'
mesh_df = pandas.read_table(url)
merged_df = merged_df.merge(mesh_df)
# Every Uberon term must occur at most once after the merge
assert not merged_df.uberon_id.duplicated().any()

In [35]:
# Add BTO cross-references. Assumes that uberon-to-bto relationships are
# one-to-one, which is occasionally not true.
is_bto = xref_df.xref.str.startswith('BTO:').fillna(False)
bto_df = xref_df[is_bto].rename(columns={'xref': 'bto_id'})
bto_df = bto_df[bto_df.uberon_id.isin(merged_df.uberon_id)]
# Left merge keeps terms without a BTO xref; when a term has several, only
# its first row survives drop_duplicates
merged_df = merged_df.merge(bto_df, how='left').drop_duplicates('uberon_id')

In [36]:
# Write hetio-slim to a tab-separated file
hetio_slim_path = os.path.join(download_dir, 'hetio-slim.tsv')
merged_df.to_csv(hetio_slim_path, sep='\t', index=False)

In [37]:
# Number of distinct Uberon terms in hetio-slim
merged_df.uberon_id.nunique()

652

In [1]:
# Display the value of every top-level expression in a cell rather than only
# the last one (the comparison cell that follows relies on this to show two counts)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Compare the regenerated hetio-slim with the hetionet version (402 anatomy terms)
import pandas
merged_df = pandas.read_table('../data/Uberon/hetio-slim.tsv')
merged_df.uberon_id.nunique()

# Published hetionet copy of the same table
anatomy_hetionet = pandas.read_table('https://raw.githubusercontent.com/dhimmel/uberon/gh-pages/data/hetio-slim.tsv')
anatomy_hetionet.uberon_id.nunique()

652

402

In [9]:
import os
download_dir ='../data/Uberon'
# Reload the human-constraint table. The lookup of UBERON:0000022 on human_df
# further down reports positive_evidence / no_negative_evidence columns, which
# exist in 'human-constraint.tsv' but not in 'hetio-slim.tsv'.
human_df = pandas.read_table(os.path.join(download_dir, 'human-constraint.tsv'))

In [69]:
# Outer-join the regenerated table with the hetionet one; '_merge' records
# which side each uberon_id came from
outer = merged_df.merge(anatomy_hetionet, on=['uberon_id'], how='outer', indicator=True)
# Rows found only in the regenerated table: display them, then save them
new_only = outer[outer['_merge'] == 'left_only']
new_only
new_only.to_csv(os.path.join(download_dir, 'hetio-slim_new_added.tsv'), index=False, sep='\t')

Unnamed: 0,uberon_id,uberon_name_x,mesh_id_x,uberon_name_y,mesh_id_y,mesh_name,bto_id,_merge
7,UBERON:0000016,endocrine pancreas,D007515,,,,,left_only
8,UBERON:0000017,exocrine pancreas,D046790,,,,,left_only
10,UBERON:0000022,feather,D005241,,,,,left_only
17,UBERON:0000044,dorsal root ganglion,D005727,,,,,left_only
23,UBERON:0000159,anal canal,D001003,,,,,left_only
...,...,...,...,...,...,...,...,...
636,UBERON:0003124,chorion membrane,D002823,,,,,left_only
638,UBERON:0003128,cranium,D012886,,,,,left_only
645,UBERON:0004454,tarsal region,D000842,,,,,left_only
649,UBERON:0005290,myelencephalon,D054024,,,,,left_only


In [67]:
# Rows found only in the hetionet table
outer.loc[outer['_merge'] == 'right_only']

Unnamed: 0,uberon_id,uberon_name_x,mesh_id_x,uberon_name_y,mesh_id_y,mesh_name,bto_id,_merge


In [10]:
# Look up a single term in human_df
human_df.loc[human_df['uberon_id'] == 'UBERON:0000022']

Unnamed: 0,uberon_id,uberon_name,positive_evidence,no_negative_evidence
20,UBERON:0000022,feather,0,1
