In [1]:
import pandas as pd

## Load in the CTD file parsed by monarch-ingest. Ensure all edges are "biolink:treats" edges, ensure there are no duplicates.

In [2]:
monarch_treats = pd.read_csv("PARSED_FILES/monarch-ingest_ctd_chemical_to_disease_edges.tsv", delimiter='\t')
#Ensure all edges have biolink:treats in them.
assert(monarch_treats[monarch_treats.predicate.str.contains("biolink:treats")].size==monarch_treats.size)
#Ensure there are no duplicate subject -> object rows.
assert(monarch_treats.get(["subject","object"]).drop_duplicates().size==monarch_treats.get(["subject","object"]).size)

In [10]:
monarch_treats.groupby('predicate')['id'].nunique()

predicate
biolink:treats_or_applied_or_studied_to_treat    38527
Name: id, dtype: int64

## Load in the CTD file as parsed by ORION. Drop all edges which aren't "biolink:treats". Ensure there are no duplicates.

In [54]:
orion = pd.read_json("PARSED_FILES/ORION_parsed_sourced_edges.jsonl", lines=True).reset_index()
orion_treats = orion[orion.predicate.str.contains("biolink:treats")]

In [55]:
assert(orion_treats.get(["subject","object"]).size==orion_treats.get(["subject","object"]).drop_duplicates().size)

In [56]:
orion_treats.groupby('predicate')["index"].nunique()

predicate
biolink:treats_or_applied_or_studied_to_treat    34766
Name: index, dtype: int64

In [57]:
orion_treats

Unnamed: 0,index,subject,predicate,object,primary_knowledge_source,publications,knowledge_level,agent_type
8,8,MESH:D000077206,biolink:treats_or_applied_or_studied_to_treat,MESH:C531617,infores:ctd,[PMID:8967745],knowledge_assertion,manual_agent
9,9,MESH:D019782,biolink:treats_or_applied_or_studied_to_treat,MESH:C531617,infores:ctd,[PMID:8967745],knowledge_assertion,manual_agent
10,10,MESH:D014810,biolink:treats_or_applied_or_studied_to_treat,MESH:C531617,infores:ctd,[PMID:8967745],knowledge_assertion,manual_agent
12,12,MESH:D000086,biolink:treats_or_applied_or_studied_to_treat,MESH:C531795,infores:ctd,[PMID:18070060],knowledge_assertion,manual_agent
25,25,MESH:D014227,biolink:treats_or_applied_or_studied_to_treat,MESH:C535306,infores:ctd,[PMID:27469509],knowledge_assertion,manual_agent
...,...,...,...,...,...,...,...,...
99103,99103,MESH:D016642,biolink:treats_or_applied_or_studied_to_treat,OMIM:188890,infores:ctd,[PMID:16785264],knowledge_assertion,manual_agent
99104,99104,MESH:D000655,biolink:treats_or_applied_or_studied_to_treat,OMIM:248310,infores:ctd,[PMID:2663213],knowledge_assertion,manual_agent
99118,99118,MESH:D020849,biolink:treats_or_applied_or_studied_to_treat,OMIM:614160,infores:ctd,[PMID:29571322],knowledge_assertion,manual_agent
99119,99119,MESH:D020845,biolink:treats_or_applied_or_studied_to_treat,OMIM:614160,infores:ctd,[PMID:29571322],knowledge_assertion,manual_agent


Make sure the 34766 rows parsed by ORION are a subset of the 38527 rows parsed by Monarch-Ingest 

In [62]:
monarch_treats_set = set([tuple(x) for x in monarch_treats.get(["subject","predicate","object"]).values])
orion_treats_set = set([tuple(x) for x in orion_treats.get(["subject","predicate","object"]).values])
print(len(monarch_treats_set.difference(orion_treats_set)))
assert(len(orion_treats_set.intersection(monarch_treats_set))==len(orion_treats_set))

3761


## Grab 10 rows which were included by Monarch-Ingest but not ORION

In [59]:
import itertools
for row in itertools.islice(monarch_treats_set.difference(orion_treats_set), 10): print(row)

('MESH:D000077152', 'biolink:treats_or_applied_or_studied_to_treat', 'MESH:D004421')
('MESH:D013974', 'biolink:treats_or_applied_or_studied_to_treat', 'MESH:D006973')
('MESH:D009271', 'biolink:treats_or_applied_or_studied_to_treat', 'MESH:D013375')
('MESH:D010672', 'biolink:treats_or_applied_or_studied_to_treat', 'MESH:D064420')
('MESH:D000077205', 'biolink:treats_or_applied_or_studied_to_treat', 'MESH:D015430')
('MESH:D013390', 'biolink:treats_or_applied_or_studied_to_treat', 'MESH:D009127')
('MESH:D005680', 'biolink:treats_or_applied_or_studied_to_treat', 'MESH:D013226')
('MESH:D005557', 'biolink:treats_or_applied_or_studied_to_treat', 'MESH:D006470')
('MESH:D002857', 'biolink:treats_or_applied_or_studied_to_treat', 'MESH:D003924')
('MESH:D000806', 'biolink:treats_or_applied_or_studied_to_treat', 'MESH:D006973')


## Try again with file from modified ORION

In [74]:
orion_v2 = pd.read_json("PARSED_FILES/ORION_modified_CTD_source_edges.jsonl", lines=True).reset_index()
orion_v2_treats = orion_v2[orion_v2.predicate.str.contains("biolink:treats")]
assert(orion_v2_treats.get(["subject","object"]).size==orion_v2_treats.get(["subject","object"]).drop_duplicates().size)
orion_v2_treats.groupby('predicate')["index"].nunique()
monarch_treats_set = set([tuple(x) for x in monarch_treats.get(["subject","predicate","object"]).values])
orion_v2_treats_set = set([tuple(x) for x in orion_v2_treats.get(["subject","predicate","object"]).values])
print(len(monarch_treats_set.difference(orion_v2_treats_set)))
assert(len(orion_v2_treats_set.intersection(monarch_treats_set))==len(orion_v2_treats_set))

0


In [67]:
orion_v2_treats

Unnamed: 0,index,subject,predicate,object,primary_knowledge_source,publications,knowledge_level,agent_type
8,8,MESH:D000077206,biolink:treats_or_applied_or_studied_to_treat,MESH:C531617,infores:ctd,[PMID:8967745],knowledge_assertion,manual_agent
9,9,MESH:D019782,biolink:treats_or_applied_or_studied_to_treat,MESH:C531617,infores:ctd,[PMID:8967745],knowledge_assertion,manual_agent
10,10,MESH:D014810,biolink:treats_or_applied_or_studied_to_treat,MESH:C531617,infores:ctd,[PMID:8967745],knowledge_assertion,manual_agent
12,12,MESH:D000086,biolink:treats_or_applied_or_studied_to_treat,MESH:C531795,infores:ctd,[PMID:18070060],knowledge_assertion,manual_agent
25,25,MESH:D014227,biolink:treats_or_applied_or_studied_to_treat,MESH:C535306,infores:ctd,[PMID:27469509],knowledge_assertion,manual_agent
...,...,...,...,...,...,...,...,...
99103,99103,MESH:D016642,biolink:treats_or_applied_or_studied_to_treat,OMIM:188890,infores:ctd,[PMID:16785264],knowledge_assertion,manual_agent
99104,99104,MESH:D000655,biolink:treats_or_applied_or_studied_to_treat,OMIM:248310,infores:ctd,[PMID:2663213],knowledge_assertion,manual_agent
99118,99118,MESH:D020849,biolink:treats_or_applied_or_studied_to_treat,OMIM:614160,infores:ctd,[PMID:29571322],knowledge_assertion,manual_agent
99119,99119,MESH:D020845,biolink:treats_or_applied_or_studied_to_treat,OMIM:614160,infores:ctd,[PMID:29571322],knowledge_assertion,manual_agent


In [71]:
orion_v2 = pd.read_json("PARSED_FILES/ORION_modified_CTD_source_edges.jsonl", lines=True).reset_index()
orion_v2_treats = orion_v2[orion_v2.predicate.str.contains("biolink:treats")]

In [72]:
orion_v2_treats

Unnamed: 0,index,subject,predicate,object,primary_knowledge_source,publications,knowledge_level,agent_type
8,8,MESH:D000077206,biolink:treats_or_applied_or_studied_to_treat,MESH:C531617,infores:ctd,[PMID:8967745],knowledge_assertion,manual_agent
9,9,MESH:D019782,biolink:treats_or_applied_or_studied_to_treat,MESH:C531617,infores:ctd,[PMID:8967745],knowledge_assertion,manual_agent
10,10,MESH:D014810,biolink:treats_or_applied_or_studied_to_treat,MESH:C531617,infores:ctd,[PMID:8967745],knowledge_assertion,manual_agent
12,12,MESH:D000086,biolink:treats_or_applied_or_studied_to_treat,MESH:C531795,infores:ctd,[PMID:18070060],knowledge_assertion,manual_agent
25,25,MESH:D014227,biolink:treats_or_applied_or_studied_to_treat,MESH:C535306,infores:ctd,[PMID:27469509],knowledge_assertion,manual_agent
...,...,...,...,...,...,...,...,...
102864,102864,MESH:D009538,biolink:treats_or_applied_or_studied_to_treat,OMIM:188890,infores:ctd,[PMID:16785264],knowledge_assertion,manual_agent
102865,102865,MESH:D000655,biolink:treats_or_applied_or_studied_to_treat,OMIM:248310,infores:ctd,[PMID:2663213],knowledge_assertion,manual_agent
102879,102879,MESH:D020849,biolink:treats_or_applied_or_studied_to_treat,OMIM:614160,infores:ctd,[PMID:29571322],knowledge_assertion,manual_agent
102880,102880,MESH:D020845,biolink:treats_or_applied_or_studied_to_treat,OMIM:614160,infores:ctd,[PMID:29571322],knowledge_assertion,manual_agent
