DRKG

Adapted from: https://github.com/gnn4dr/DRKG/blob/master/drkg_with_dgl/loading_drkg_in_dgl.ipynb

In [1]:
import pandas as pd
import numpy as np
import os 
import torch_geometric.transforms as T
import sys
sys.path.append('..')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from SIMP_LLM.DRKG_loading   import  get_triplets, read_tsv,filter_drkg,map_drkg_relationships,filter_interaction_subset,print_head
from SIMP_LLM.DRKG_translate import  load_lookups
from SIMP_LLM.DRKG_entity_processing import get_unique_entities, get_entity_lookup, convert_entitynames, flip_headtail
from SIMP_LLM.raredisease_loading import get_orphan_data # SP 05/24/23 added

# 1) Load Data

In [3]:
### 1) Read: This section reads DRKG and a glossary (used to map entities from codes to words)
DATA_DIR           = os.path.join("../data")
verbose            =  True 
triplets,drkg_df   =  get_triplets(drkg_file = os.path.join(DATA_DIR  ,'drkg.tsv'),             verbose=verbose)  # Read triplets (head,relationship,tail)
relation_glossary  =  read_tsv(relation_file = os.path.join(DATA_DIR  ,'relation_glossary.tsv'),verbose=verbose)  # Read relationship mapping  


### 2) Filter & Map Interactions: This section returns a list of interactions e.g. DRUGBANK::treats::Compound:Disease )
# 2.1: First  we filter the interactions to only Compound-Disease
# 2.2: Then   we map the codes -> text  (this will be use to further filter interactions based on text) e.g.  Hetionet::CpD::Compound:Disease -> palliation
# 2.3: We use natural text to fitler  interactions based on terms such as "treat" (but we return the orignal interaction name )



# modularize this in create_dataframe
drkg_rx_dx_relations        = filter_drkg(data_frame = drkg_df ,  filter_column = 1 ,  filter_term = r'.*?Compound:Disease', verbose = verbose) # 2.1 Filter only Compound-Disease Interactions
drkg_rx_dx_relations_mapped = map_drkg_relationships(drkg_rx_dx_relations,relation_glossary,verbose=verbose)                                    # 2.2 Map codes to text 

### 2.3 Filter Drug interactions Interaction types to only include: treat inhibit or alleviate interactions  ###
drkg_rx_dx_relation_subset =  filter_interaction_subset(df                  = drkg_rx_dx_relations_mapped,
                                                        filter_colunm_name = 'Interaction-type' ,
                                                        regex_string       =  'treat|inhibit|alleviate',
                                                        return_colunm_name =  'Relation-name')

# 3) Use Filter Interactions to get Gilter DRKG 
drkg_df_filtered = drkg_df[drkg_df[1].isin(drkg_rx_dx_relation_subset)] # 3.1 Filter DRKG  to only  Compund-Disease 
print_head(df=drkg_df_filtered)



###

rx_dx_triplets   = drkg_df_filtered.values.tolist()                     # 3.2 Convert filtered DRKG to list


 Triplets:

[['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::2157'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::5264'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::2158'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::3309'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::28912'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::811'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::2159'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::821'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::5627'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::5624']]

 ../data/drkg.tsv  Dataframe:

+----+------------+--------------------------------+-------------+
|    | 0          | 1                              | 2           |
|----+------------+--------------------------------+-------------|
|  0 | Gene::2157 | bioarx::HumGenHumGen:Gene:Gene | Gene::2157  |
|  1 | Gene::2157 | bioarx::HumGenHumGen:Gene:G

In [4]:
# 4) Load Data frames for translation
hetionet_df, gene_df, drugbank_df, omim_df, mesh_dict, chebi_df, chembl_df = load_lookups(data_path=DATA_DIR,verbose=verbose)


 ../data/hetionet-v1.0-nodes.tsv  Dataframe:

+----+-------------------------+---------------------------+---------+
|    | id                      | name                      | kind    |
|----+-------------------------+---------------------------+---------|
|  0 | Anatomy::UBERON:0000002 | uterine cervix            | Anatomy |
|  1 | Anatomy::UBERON:0000004 | nose                      | Anatomy |
|  2 | Anatomy::UBERON:0000006 | islet of Langerhans       | Anatomy |
|  3 | Anatomy::UBERON:0000007 | pituitary gland           | Anatomy |
|  4 | Anatomy::UBERON:0000010 | peripheral nervous system | Anatomy |
+----+-------------------------+---------------------------+---------+

 Sample of Hetionet Data Types (Before processing):

+-------+----------------------------------+-------------------------------------------+---------------------+
|       | id                               | name                                      | kind                |
|-------+-----------------------------

In [5]:
# Load orphan disease names and codes (28 Nov 2022 version) # SP 05/24/23
orphan_names, orphan_codes = get_orphan_data(os.path.join(DATA_DIR, 'en_product1-Orphadata.xml'), verbose=verbose)

# Get orphan disease MeSH codes
orphan_codes_mesh = orphan_codes[orphan_codes['code_source']=='MeSH'].copy()
orphan_codes_mesh['id'] = 'MESH::'+orphan_codes_mesh['code']


 Long-form orphan disease data (before processing):

+----+-----------+------------------------------------------------------------------------+
|    | cols      | data                                                                   |
|----+-----------+------------------------------------------------------------------------|
|  0 | Orphacode | 166024                                                                 |
|  1 | Name      | Multiple epiphyseal dysplasia, Al-Gazali type                          |
|  2 | Synonym   | Multiple epiphyseal dysplasia-macrocephaly-distinctive facies syndrome |
|  3 | Source    | ICD-10                                                                 |
|  4 | Reference | Q77.3                                                                  |
+----+-----------+------------------------------------------------------------------------+

 Long-form orphan disease data (after processing):

+----+-------------------------+---------------------------------

In [6]:
# Make dictionaries for codes
code_df   = pd.concat([hetionet_df[['name', 'id']], 
                       gene_df.rename(columns = {"description":"name", "GeneID":"id"}),
                       drugbank_df.rename(columns = {"Common name":"name", "DrugBank ID":"id"}),
                       omim_df.rename(columns = {"MIM Number":"id"}),
                       chebi_df.rename(columns = {"NAME":"name", "CHEBI_ACCESSION":"id"}),
                       chembl_df.rename(columns = {"pref_name":"name", "chembl_id":"id"}),
                       orphan_codes_mesh.rename(columns = {"Name":"name"}) # SP 05/24/23 added orphan disease MeSH terms
                       ], ignore_index=True, axis=0).drop_duplicates() 
code_dict = pd.Series(code_df['name'].values, index=code_df['id']).to_dict() | mesh_dict # Convert node df to dict and merge with MeSH dictionary

# Get unique DRKG entities
drkg_entities = get_unique_entities(drkg_df, [0,2])

# Create and use convert_entitynames function
drkg_entity_df, drkg_unmatched = get_entity_lookup(drkg_entities, code_dict)

# Create final node dictionary
node_dict = pd.Series(drkg_entity_df['name'].values, index=drkg_entity_df['drkg_id']).to_dict() 

# Initialize translated DRKG and manually clean heads/tails for one case where they were flipped
drkg_translated    = drkg_df.copy()
drkg_translated = flip_headtail(drkg_translated, 'Gene:Compound')

# Map DRKG to translated entity names
drkg_translated = convert_entitynames(drkg_translated, 0, node_dict)
drkg_translated = convert_entitynames(drkg_translated, 2, node_dict)
drkg_translated = drkg_translated.dropna()
print_head(drkg_translated) 

# Summarize percentage translated
print("Number of unique DRKG entities: ", len(drkg_entities)) # should be 97238
print("Number of translated entities: ", drkg_entity_df.shape[0])
print("Number of untranslated entities: ", drkg_unmatched.shape[0])
pct_entity_translated = drkg_entity_df.shape[0]/len(drkg_entities)
print('Percentage of entities translated: ', round(pct_entity_translated*100,1), '%')

print('Total DRKG relationships: ', drkg_df.shape[0])
print('Translated DRKG relationships: ', drkg_translated.shape[0])
pct_translated = drkg_translated.shape[0]/drkg_df.shape[0]
print('Percentage of relationships fully translated: ', round(pct_translated*100,1), '%')

+----+------------------------------+--------------------------------+------------------------------------------------------+
|    | 0                            | 1                              | 2                                                    |
|----+------------------------------+--------------------------------+------------------------------------------------------|
|  0 | coagulation factor VIII (F8) | bioarx::HumGenHumGen:Gene:Gene | coagulation factor VIII (F8)                         |
|  1 | coagulation factor VIII (F8) | bioarx::HumGenHumGen:Gene:Gene | phytanoyl-CoA 2-hydroxylase (PHYH)                   |
|  2 | coagulation factor VIII (F8) | bioarx::HumGenHumGen:Gene:Gene | coagulation factor IX (F9)                           |
|  3 | coagulation factor VIII (F8) | bioarx::HumGenHumGen:Gene:Gene | heat shock protein family A (Hsp70) member 5 (HSPA5) |
|  4 | coagulation factor VIII (F8) | bioarx::HumGenHumGen:Gene:Gene | immunoglobulin kappa variable 3-20 (IGKV3-20)  

In [7]:



# Update relation glossary 
relation_df = relation_glossary.copy().rename(columns={'Relation-name':'drkg_id'})
relation_df[['head_entity','tail_entity']] = relation_df['drkg_id'].str.split('::', expand=True)[2].str.split(':', expand=True) # Set head and tail nodes

# Manually fix head and tail nodes for DGIDB relations, which reverse compound-gene interactions
relation_df.loc[relation_df['drkg_id'].str.contains('Gene:Compound'),'head_entity'] = 'Compound'
relation_df.loc[relation_df['drkg_id'].str.contains('Gene:Compound'),'tail_entity'] = 'Gene'

# Fix bioarx entries without the second "::" delimiter
bioarx_ht = relation_df['drkg_id'].str.split(':', expand=True)[[3,4]]
relation_df['head_entity'] = np.where(relation_df['head_entity'].isna(), bioarx_ht[3], relation_df['head_entity'])
relation_df['tail_entity'] = np.where(relation_df['tail_entity'].isna(), bioarx_ht[4], relation_df['tail_entity'])

# Add mapped relation group labels
relation_groups = [['activation', 'agonism', 'agonism, activation', 'activates, stimulates'],
    ['antagonism', 'blocking', 'antagonism, blocking'],
    ['binding', 'binding, ligand (esp. receptors)'],
    ['blocking', 'channel blocking'],
    ['inhibition', 'inhibits cell growth (esp. cancers)', 'inhibits'],
    ['enzyme', 'enzyme activity'],
    ['upregulation', 'increases expression/production'],
    ['downregulation', 'decreases expression/production'],
    ['Compound treats the disease', 'treatment/therapy (including investigatory)', 'treatment']]

relation_df['relation_name'] = relation_df['Interaction-type']

for grp in relation_groups:
    relation_df_subset = relation_df[relation_df['Interaction-type'].isin(grp)].copy()
    for entities in relation_df_subset['Connected entity-types'].unique():
        subgrp = relation_df_subset[relation_df_subset['Connected entity-types'] == entities]['Interaction-type'].unique()
        relation_df.loc[(relation_df_subset['Connected entity-types'] == entities) & (relation_df['Interaction-type'].isin(subgrp)), 'relation_name'] = subgrp[0]

# Remove special characters from relation names
relation_df['relation_name'] = relation_df['relation_name'].str.replace(',|/', ' or ', regex=True)
relation_df['relation_name'] = relation_df['relation_name'].str.replace('esp.','especially')
relation_df['relation_name'] = relation_df['relation_name'].str.replace('\(|\)|-|\.', '', regex=True)

# Check if any relationshp names still have non alpha numeric values except space
error_relation_names = relation_df['relation_name'][relation_df['relation_name'].str.replace(' ', '').str.contains(r"[^a-zA-Z0-9]+", regex=True)].drop_duplicates()
if len(error_relation_names):
    print('Warning: The following relation names contain special characters, which can interfere with PyG/GraphSage')
    print(error_relation_names)

relation_df


Unnamed: 0,drkg_id,Data-source,Connected entity-types,Interaction-type,Description,Reference for the description,head_entity,tail_entity,relation_name
0,DGIDB::ACTIVATOR::Gene:Compound,DGIDB,Compound:Gene,activation,An activator interaction is when a drug activa...,http://www.dgidb.org/getting_started,Compound,Gene,activation
1,DGIDB::AGONIST::Gene:Compound,DGIDB,Compound:Gene,agonism,An agonist interaction occurs when a drug bind...,http://www.dgidb.org/getting_started,Compound,Gene,activation
2,DGIDB::ALLOSTERIC MODULATOR::Gene:Compound,DGIDB,Compound:Gene,allosteric modulation,An allosteric modulator interaction occurs whe...,http://www.dgidb.org/getting_started,Compound,Gene,allosteric modulation
3,DGIDB::ANTAGONIST::Gene:Compound,DGIDB,Compound:Gene,antagonism,An antagonist interaction occurs when a drug b...,http://www.dgidb.org/getting_started,Compound,Gene,antagonism
4,DGIDB::ANTIBODY::Gene:Compound,DGIDB,Compound:Gene,antibody,An antibody interaction occurs when an antibod...,http://www.dgidb.org/getting_started,Compound,Gene,antibody
...,...,...,...,...,...,...,...,...,...
102,bioarx::Covid2_acc_host_gene::Disease:Gene,BIBLIOGRAPHY,Disease:Gene,interaction,"Interactions between 27 viral proteins, and ...",,Disease,Gene,interaction
103,bioarx::DrugHumGen:Compound:Gene,BIBLIOGRAPHY,Compound:Gene,interaction,,,Compound,Gene,interaction
104,bioarx::DrugVirGen:Compound:Gene,BIBLIOGRAPHY,Compound:Gene,interaction,,,Compound,Gene,interaction
105,bioarx::HumGenHumGen:Gene:Gene,BIBLIOGRAPHY,Gene:Gene,interaction,Protein-protein interaction,,Gene,Gene,interaction


## Identify rare diseases in DRKG

In [8]:
from SIMP_LLM.raredisease_loading import get_drkg_entity_ontologies, read_and_process_doid, create_orphanet_regex, merge_regex, find_drkg_rarediseases, check_raredisease_multiple_codes

### Download and process data

In [9]:
# All drkg entities 
drkg_all_entities = pd.concat([drkg_entity_df, drkg_unmatched], ignore_index=True, axis=0) 

 # Check disease code ontologies in DRKG - extra code in DKRG called DOID - and Orphanet
if verbose:
    condition_list = ['Disease', 'Symptom', 'Side Effect']
    drkg_ontology_counts = get_drkg_entity_ontologies(drkg_all_entities, condition_list)
    print('DRKG code counts:')
    print(drkg_ontology_counts)
    print('\nOrphanet code types: ', orphan_codes['code_source'].unique())

# Download and process DOID (disease ontology) codes
doid_df  =  read_and_process_doid(relation_file = os.path.join(DATA_DIR,'DOID.csv'), verbose=verbose) 

# Create regex for match Orphanet mapped codes to cross reference codes in DOID
orphan_codes_match = create_orphanet_regex(orphan_codes, verbose=verbose)

DRKG code counts:
   matched ontology_name  count
0        0          MESH   1002
1        1          DOID    127
2        1          MESH   4284
3        1          OMIM     78
4        1      UMLS CUI   5701

Orphanet code types:  ['ICD-10' 'OMIM' 'UMLS' 'MeSH' 'ICD-11' 'GARD' 'MedDRA']

 DOID Dataframe (After processing):

+-----+--------------+--------------------------+------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+------------------------------------------------------------------------------------------------------------------+----------------------+------------------------------------------------+---------------------------------------------+
|     | id           | Preferred Label          | Synonyms                                       | Definitions                                            

  df_raw = pd.read_csv(relation_file)


### Code and regex matching between Orphacodes and DRKG

In [10]:
# Match Orphacodes and DOIDs - Takes ~3min to run
orphacode_doid_regex = merge_regex(orphan_codes_match, 'regex', doid_df, 'database_cross_reference')

# Match rare diseases in DRKG
matched_rarediseases = find_drkg_rarediseases(drkg_all_entities, orphan_codes, orphacode_doid_regex, verbose=verbose)

# Check for rare diseases with multiple codes
if verbose:
    multiple_orphacode, multiple_drkg = check_raredisease_multiple_codes(matched_rarediseases)
    print_head(multiple_orphacode)
    print_head(multiple_drkg)

# Get non-matched rare diseases (could still be in DRKG but under different name or code - need to check that embeddings are separate)
unmatched_rarediseases = orphan_codes[~orphan_codes['Orphacode'].isin(matched_rarediseases['Orphacode'])]
if verbose:
    print('Number of unmatched Orphacodes: ', len(unmatched_rarediseases['Orphacode'].unique()))
    print_head(unmatched_rarediseases)

# Get all DRKG relationships involving matched rare diseases
raredisease_heads = drkg_df[drkg_df[0].isin(matched_rarediseases['drkg_id'])]
raredisease_tails = drkg_df[drkg_df[2].isin(matched_rarediseases['drkg_id'])]

if verbose:
    print('Rare disease heads in untranslated DRKG: ', raredisease_heads.shape[0])
    print_head(raredisease_heads)
    print('\n Rare disease tails in untranslated DRKG: ',raredisease_tails.shape[0])
    print_head(raredisease_tails)

# Get DRKG rare disease entries
raredisease_index = raredisease_heads.index.tolist() + raredisease_tails.index.tolist()
drkg_raredisease = drkg_translated.loc[drkg_translated.index.isin(raredisease_index)]
if verbose:
    print_head(drkg_raredisease)


 DRKG-Orphacode matches:

+----+-----------------------+----------------+------------------------------------------------------------------------+---------------+-----------------+-----------------+---------+-----------+-------------+------------------------------------------------------------------------+---------------+----------------------------------------------------+------------------------------+-----------------------------------+---------------------+--------------+----------+------+----------+---------+
|    | drkg_id               | drkg_dict_id   | name                                                                   | entity_type   | ontology_code   | ontology_name   | code    |   matched |   Orphacode | Name                                                                   | code_source   | Disordermappingrelation                            |   Disordermappingicdrelation | Disordermappingvalidationstatus   | code_source_upper   | match_type   |   code_x |   id |   code

In [11]:
drkg_raredisease

Unnamed: 0,0,1,2
84697,Middle East Respiratory Syndrome Coronavirus,bioarx::Coronavirus_ass_host_gene::Disease:Gene,S-phase kinase associated protein 2 (SKP2)
84698,Middle East Respiratory Syndrome Coronavirus,bioarx::Coronavirus_ass_host_gene::Disease:Gene,karyopherin subunit alpha 4 (KPNA4)
84699,Middle East Respiratory Syndrome Coronavirus,bioarx::Coronavirus_ass_host_gene::Disease:Gene,protein activator of interferon induced protei...
84700,Middle East Respiratory Syndrome Coronavirus,bioarx::Coronavirus_ass_host_gene::Disease:Gene,CD9 molecule (CD9)
84701,Middle East Respiratory Syndrome Coronavirus,bioarx::Coronavirus_ass_host_gene::Disease:Gene,transmembrane serine protease 2 (TMPRSS2)
...,...,...,...
3594805,Azithromycin,Hetionet::CcSE::Compound:Side Effect,Stevens-Johnson syndrome
3594863,Amifostine,Hetionet::CcSE::Compound:Side Effect,Toxic epidermal necrolysis
3594923,Quinine,Hetionet::CcSE::Compound:Side Effect,Fixed drug eruption
3594932,Olmesartan,Hetionet::CcSE::Compound:Side Effect,Systemic lupus erythematosus


#### Rare disease tests

In [85]:
# checks for rare disease matches in drkg
matched_rarediseases[matched_rarediseases['drkg_id']=='Disease::MESH:D065207']

Unnamed: 0,drkg_id,drkg_dict_id,name,entity_type,ontology_code,ontology_name,code,matched,Orphacode,Name,code_source,Disordermappingrelation,Disordermappingicdrelation,Disordermappingvalidationstatus,code_source_upper,match_type,code_x,id,code_y,key_0
1067,Disease::MESH:D065207,MESH::D065207,Middle East Respiratory Syndrome Coronavirus,Disease,MESH:D065207,MESH,D065207,1,576074,Middle East respiratory syndrome,MeSH,E (Exact mapping: the two concepts are equival...,,Validated,MESH,MeSH/OMIM,,,,


In [246]:
# All drkg entities (and remove 'CUI' form 'UMLS CUI' in dataframe for merging later)
# drkg_all_entities['ontology_name'] = np.where(drkg_all_entities['ontology_name'].str.contains('UMLS'), 'UMLS', drkg_all_entities['ontology_name'])

# doid_df_raw[doid_df_raw['id'] =='CHEBI:102166'].iloc[:, 100:120]
# doid_df[doid_df['label'] != doid_df['Preferred Label']]
doid_df[doid_df['database_cross_reference'].isna()==False] # different code types
# doid_df[doid_df['has_alternative_id'].isna()==False] # different values of same ID type

Unnamed: 0,id,Preferred Label,Synonyms,Definitions,CUI,database_cross_reference,has_alternative_id,has_exact_synonym,Parents
572,DOID:0001816,angiosarcoma,hemangiosarcoma,A vascular cancer that derives_from the cells ...,,SNOMEDCT_US_2022_09_01:39000009|NCI:C3088|UMLS...,DOID:4508|DOID:267,hemangiosarcoma,http://purl.obolibrary.org/obo/DOID_175
573,DOID:0002116,pterygium,surfer's eye,A corneal disease that is characterized by a t...,,UMLS_CUI:C0033999,,surfer's eye,http://purl.obolibrary.org/obo/DOID_10124
574,DOID:0014667,disease of metabolism,metabolic disease,A disease that involving errors in metabolic p...,,ICD10CM:E88.9|ICD9CM:277.9|NCI:C3235|SNOMEDCT_...,,metabolic disease,http://purl.obolibrary.org/obo/DOID_4
576,DOID:0040002,aspirin allergy,ASA allergy|acetylsalicylic acid allergy,A drug allergy that has_allergic_trigger acety...,,UMLS_CUI:C0004058|SNOMEDCT_US_2022_09_01:29358...,,ASA allergy|acetylsalicylic acid allergy,http://purl.obolibrary.org/obo/DOID_0060500
577,DOID:0040003,benzylpenicillin allergy,penicillin G allergy|benzyl penicillin allergy,A beta-lactam allergy that has_allergic_trigge...,,SNOMEDCT_US_2022_09_01:294499007|UMLS_CUI:C057...,,penicillin G allergy|benzyl penicillin allergy,http://purl.obolibrary.org/obo/DOID_0060519
...,...,...,...,...,...,...,...,...,...
17114,SYMP:0000818,localized superficial mass,,,,ICD9CM_2005:782.2,,,http://purl.obolibrary.org/obo/SYMP_0000488
17115,SYMP:0000819,localized superficial swelling,,,,ICD9CM_2005:782.2,,,http://purl.obolibrary.org/obo/SYMP_0000488
17116,SYMP:0000820,mass in chest,,,,ICD9CM_2005:786.6,,,http://purl.obolibrary.org/obo/SYMP_0000514
17148,SYMP:0000852,anuria,,A urinary system symptom that is characterized...,,UMLS_ICD9CM_2005_AUI:A0243854|UMLS_CUI:C002896...,,,http://purl.obolibrary.org/obo/SYMP_0000486


In [30]:
# CHECK filter DOID to relevant codes
# doid_df_raw  =  pd.read_csv(os.path.join(DATA_DIR,'DOID.csv'))  # Read relationship mapping  

# doid_vars = ['id', 'Preferred Label', 'Synonyms', 'Definitions', 'CUI', 'database_cross_reference', 'has_alternative_id', 'has_exact_synonym', 'Parents']

# doid_df = doid_df_raw[doid_vars]
# # doid_df

# doid_othercodes_df = doid_df[doid_df['database_cross_reference'].str.upper().str.contains(r'UMLS|ICD|OMIM|MESH', na=False)]
# doid_othercodes_df
# doid_othercodes_df[doid_othercodes_df['database_cross_reference'].str.contains('MESH')]

  doid_df_raw  =  pd.read_csv(os.path.join(DATA_DIR,'DOID.csv'))  # Read relationship mapping


Unnamed: 0,id,Preferred Label,Synonyms,Definitions,CUI,database_cross_reference,has_alternative_id,has_exact_synonym,Parents
572,DOID:0001816,angiosarcoma,hemangiosarcoma,A vascular cancer that derives_from the cells ...,,SNOMEDCT_US_2022_09_01:39000009|NCI:C3088|UMLS...,DOID:4508|DOID:267,hemangiosarcoma,http://purl.obolibrary.org/obo/DOID_175
573,DOID:0002116,pterygium,surfer's eye,A corneal disease that is characterized by a t...,,UMLS_CUI:C0033999,,surfer's eye,http://purl.obolibrary.org/obo/DOID_10124
574,DOID:0014667,disease of metabolism,metabolic disease,A disease that involving errors in metabolic p...,,ICD10CM:E88.9|ICD9CM:277.9|NCI:C3235|SNOMEDCT_...,,metabolic disease,http://purl.obolibrary.org/obo/DOID_4
576,DOID:0040002,aspirin allergy,ASA allergy|acetylsalicylic acid allergy,A drug allergy that has_allergic_trigger acety...,,UMLS_CUI:C0004058|SNOMEDCT_US_2022_09_01:29358...,,ASA allergy|acetylsalicylic acid allergy,http://purl.obolibrary.org/obo/DOID_0060500
577,DOID:0040003,benzylpenicillin allergy,penicillin G allergy|benzyl penicillin allergy,A beta-lactam allergy that has_allergic_trigge...,,SNOMEDCT_US_2022_09_01:294499007|UMLS_CUI:C057...,,penicillin G allergy|benzyl penicillin allergy,http://purl.obolibrary.org/obo/DOID_0060519
...,...,...,...,...,...,...,...,...,...
17114,SYMP:0000818,localized superficial mass,,,,ICD9CM_2005:782.2,,,http://purl.obolibrary.org/obo/SYMP_0000488
17115,SYMP:0000819,localized superficial swelling,,,,ICD9CM_2005:782.2,,,http://purl.obolibrary.org/obo/SYMP_0000488
17116,SYMP:0000820,mass in chest,,,,ICD9CM_2005:786.6,,,http://purl.obolibrary.org/obo/SYMP_0000514
17148,SYMP:0000852,anuria,,A urinary system symptom that is characterized...,,UMLS_ICD9CM_2005_AUI:A0243854|UMLS_CUI:C002896...,,,http://purl.obolibrary.org/obo/SYMP_0000486


In [332]:
# Test regex strings
test_regex = pd.DataFrame(["ICD10CM:E88.9|ICD9CM:277.9|NCI:C3235|UMLS_CUI:C0033", 
                           "abc|ICD10CM:E88.9|ICD9CM:277.9|UMLS_CUI:C0033|NCI:C3235", 
                           "ICD10CM:E8889|ICD9CM:277.9|NCI:C3235|UMLS_CUI:C0033asdf", 
                           "ICD10CM:xE88.9|ICD9CM:277.9|NCI:C3235|UMLS_CUI:xyczC0033",
                           "bICD10CM:E88.9|ICD9CM:277.9|NCI:C3235|asdfUMLS_CUI:C0033",
                           "ICD10CM:xyz|UMLS:E88.9|ICD9CM:277.9|NCI:C3235|UMLS_CUI:zdfsj|xya:C0033"])
test_regex[0].str.contains('(?:^|\|)ICD10(?:(?!\|).)*:E88\.9.*', regex=True) # For ICD codes, need to make sure to escape middle period

# Expected results for rows (starting index = 1)
# 1 and 2 true, matches and ontology is at start (see 5)
# 3 false, For ICD codes, need to make sure to escape middle period, 
# 4 false, have : in front of all codes
# 5 false, ontology can't start with another value
# 6 prevent | between code and value

# Below: For non-ICD, prevent additional characters after end (ICD ok)



0     True
1     True
2    False
3    False
4    False
5    False
Name: 0, dtype: bool

In [333]:
test_regex[0].str.contains('(?:^|\|)UMLS(?:(?!\|).)*:C0033(?:$|\|)', regex=True) # For non-ICD, prevent additional characters after end (ICD ok)

# Expected results for rows (starting index = 1)
# 1 and 2 true, matches and ontology is at start (see 5)
# 3 false, for non-ICD, prevent additional characters after end (ICD ok)
# 4 false, have : in front of all codes
# 5 false, ontology can't start with another value
# 6 prevent | between code and value

0     True
1     True
2    False
3    False
4    False
5    False
Name: 0, dtype: bool

In [335]:
# Create regex strings for orphan disease codes

# regex format: 
# ICD:   '(?:^|\|)ICD10(?:(?!\|).)*:E88\.9.*'
# Other: '(?:^|\|)UMLS(?:(?!\|).)*:C0033(?:$|\|)'
orphan_codes_match = orphan_codes.copy()
orphan_codes_match['code'] = np.where(orphan_codes_match['code_source'].str.startswith('ICD'), 
                                       orphan_codes_match['code'].str.replace('.','\.'),
                                       orphan_codes_match['code'])
orphan_codes_match['regex'] = '(?:^|\|)'+orphan_codes['code_source'].str.upper()+'(?:(?!\|).)*:'+orphan_codes['code']
orphan_codes_match['regex'] = orphan_codes_match['regex'].str.replace('-', '', regex=True)
orphan_codes_match['regex'] = np.where(orphan_codes_match['code_source'].str.startswith('ICD'), 
                                       orphan_codes_match['regex']+'.*',
                                       orphan_codes_match['regex']+'(?:$|\|)')

# .isin(doid_df['database_cross_reference'].str.upper())
orphan_codes_match

Unnamed: 0,Orphacode,Name,code_source,code,Disordermappingrelation,Disordermappingicdrelation,Disordermappingvalidationstatus,code_source_upper,regex
0,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",ICD-10,Q77\.3,NTBT (ORPHA code's Narrower Term maps to a Bro...,Attributed (The ICD code is attributed by Orph...,Validated,ICD-10,(?:^|\|)ICD10(?:(?!\|).)*:Q77.3.*
1,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",OMIM,607131,E (Exact mapping: the two concepts are equival...,,Validated,OMIM,(?:^|\|)OMIM(?:(?!\|).)*:607131(?:$|\|)
2,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",UMLS,C1846722,E (Exact mapping: the two concepts are equival...,,Validated,UMLS,(?:^|\|)UMLS(?:(?!\|).)*:C1846722(?:$|\|)
3,58,Alexander disease,OMIM,203450,E (Exact mapping: the two concepts are equival...,,Validated,OMIM,(?:^|\|)OMIM(?:(?!\|).)*:203450(?:$|\|)
4,58,Alexander disease,MeSH,D038261,E (Exact mapping: the two concepts are equival...,,Validated,MESH,(?:^|\|)MESH(?:(?!\|).)*:D038261(?:$|\|)
...,...,...,...,...,...,...,...,...,...
30794,620368,EGF-related primary hypomagnesemia with intell...,UMLS,C5681825,E (Exact mapping: the two concepts are equival...,,Validated,UMLS,(?:^|\|)UMLS(?:(?!\|).)*:C5681825(?:$|\|)
30795,617910,Conjunctival malignant melanoma,UMLS,C0346360,E (Exact mapping: the two concepts are equival...,,Validated,UMLS,(?:^|\|)UMLS(?:(?!\|).)*:C0346360(?:$|\|)
30796,619948,Early-onset autoimmunity-autoinflammation-immu...,UMLS,C5680416,E (Exact mapping: the two concepts are equival...,,Validated,UMLS,(?:^|\|)UMLS(?:(?!\|).)*:C5680416(?:$|\|)
30797,619360,NON RARE IN EUROPE: Isolated hereditary persis...,ICD-10,D56\.4,E (Exact mapping: the two concepts are equival...,Specific code (The ORPHA code has its own code...,Validated,ICD-10,(?:^|\|)ICD10(?:(?!\|).)*:D56.4.*


In [336]:
# merge by regex
# source: https://stackoverflow.com/questions/62521616/can-i-perform-a-left-join-merge-between-two-dataframes-using-regular-expressions
import re
def merge_regex(regex_df, regex_col, search_df, search_col):
    idx = [(i,j) for i,r in enumerate(regex_df[regex_col]) for j,v in enumerate(search_df[search_col].astype(str)) if re.match(r,v)]
    regex_df_idx, search_df_idx = zip(*idx)
    t = regex_df.iloc[list(regex_df_idx),0].reset_index(drop=True)
    t1 = search_df.iloc[list(search_df_idx),0].reset_index(drop=True)
    return pd.concat([t,t1],axis=1)

test = merge_regex(orphan_codes_match, 'regex', doid_df, 'database_cross_reference')

In [341]:
test

Unnamed: 0,Orphacode,id
0,58,DOID:4252
1,58,DOID:1926
2,93,DOID:0050461
3,166035,DOID:0110137
4,166035,DOID:0110139
...,...,...
21508,99792,DOID:0110065
21509,99792,DOID:701
21510,99772,DOID:0110214
21511,99777,DOID:0050602


In [280]:
drkg_all_entities[drkg_all_entities['ontology_name'].str.contains('UMLS', na=False)]
# drkg_all_entities

Unnamed: 0,drkg_id,drkg_dict_id,name,entity_type,ontology_code,ontology_name,code,matched
58268,Side Effect::C0000727,Side Effect::C0000727,Acute abdomen,Side Effect,C0000727,UMLS CUI,C0000727,1
58269,Side Effect::C0000729,Side Effect::C0000729,Abdominal cramps,Side Effect,C0000729,UMLS CUI,C0000729,1
58270,Side Effect::C0000731,Side Effect::C0000731,Abdominal distension,Side Effect,C0000731,UMLS CUI,C0000731,1
58271,Side Effect::C0000735,Side Effect::C0000735,Abdominal neoplasm,Side Effect,C0000735,UMLS CUI,C0000735,1
58272,Side Effect::C0000737,Side Effect::C0000737,Abdominal pain,Side Effect,C0000737,UMLS CUI,C0000737,1
...,...,...,...,...,...,...,...,...
63964,Side Effect::C3665624,Side Effect::C3665624,Blood calcium decreased,Side Effect,C3665624,UMLS CUI,C3665624,1
63965,Side Effect::C3665770,Side Effect::C3665770,Acquired lipoatrophic diabetes,Side Effect,C3665770,UMLS CUI,C3665770,1
63966,Side Effect::C3665818,Side Effect::C3665818,Medication residue present,Side Effect,C3665818,UMLS CUI,C3665818,1
63967,Side Effect::C3665888,Side Effect::C3665888,Reproductive toxicity,Side Effect,C3665888,UMLS CUI,C3665888,1


In [342]:
# NEW
doid_orphan_codes = test.merge(orphan_codes, how='left', on='Orphacode').drop_duplicates()

drkg_all_entities['ontology_name'] = np.where(drkg_all_entities['ontology_name'].str.contains('UMLS'), 'UMLS', drkg_all_entities['ontology_name'])

# Get matches on MeSH and OMIM codes
orphan_codes['code_source_upper'] = orphan_codes['code_source'].str.upper()
match_try1 = drkg_all_entities.merge(orphan_codes, how='inner', left_on=['ontology_name', 'code'], right_on=['code_source_upper', 'code'])
match_try1['match_type'] = 1

match_try2 = drkg_all_entities.merge(doid_orphan_codes, how='inner', left_on='ontology_code', right_on='id')
match_try2['match_type'] = 2

# Match by name
match_try3 = drkg_all_entities.merge(orphan_codes, how='inner', left_on=drkg_all_entities['name'].str.upper(), right_on=orphan_codes['Name'].str.upper())
match_try3['match_type'] = 3

# Stack matched entities
matched_rarediseases = pd.concat([match_try1, match_try2, match_try3], ignore_index=True, axis=0).drop_duplicates(subset=['drkg_id', 'Orphacode']) # keeps first entry of duplicates
matched_rarediseases.drop_duplicates(subset='drkg_id').drop(columns=['Disordermappingicdrelation', 'Disordermappingvalidationstatus']) # Show just unique entities in DRKG
matched_rarediseases.groupby(by=['match_type', 'entity_type', 'ontology_name']).agg(
    ct = ('drkg_id', 'count')
)
# matched_rarediseases

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ct
match_type,entity_type,ontology_name,Unnamed: 3_level_1
1,Disease,MESH,1033
1,Disease,OMIM,67
1,Side Effect,UMLS,407
1,Symptom,MESH,41
2,Disease,DOID,21
3,Disease,DOID,7
3,Disease,MESH,54
3,Side Effect,UMLS,6
3,Symptom,MESH,3


In [348]:
# Check those matched by name only
matched_rarediseases[matched_rarediseases['match_type']==3]

Unnamed: 0,drkg_id,drkg_dict_id,name,entity_type,ontology_code,ontology_name,code,matched,Orphacode,Name,code_source,Disordermappingrelation,Disordermappingicdrelation,Disordermappingvalidationstatus,code_source_upper,match_type,code_x,id,code_y,key_0
1711,Disease::DOID:0050156,Disease::DOID:0050156,idiopathic pulmonary fibrosis,Disease,DOID:0050156,DOID,,1,2032,Idiopathic pulmonary fibrosis,MedDRA,E (Exact mapping: the two concepts are equival...,,Validated,MEDDRA,3,0050156,,10021240,IDIOPATHIC PULMONARY FIBROSIS
1865,Disease::DOID:12930,Disease::DOID:12930,dilated cardiomyopathy,Disease,DOID:12930,DOID,,1,217604,Dilated cardiomyopathy,ICD-10,E (Exact mapping: the two concepts are equival...,Specific code (The ORPHA code has its own code...,Validated,ICD-10,3,12930,,I42.0,DILATED CARDIOMYOPATHY
1870,Disease::DOID:13378,Disease::DOID:13378,Kawasaki disease,Disease,DOID:13378,DOID,,1,2331,Kawasaki disease,ICD-11,E (Exact mapping: the two concepts are equival...,Specific code (The ORPHA code has its own code...,Validated,ICD-11,3,13378,,4A44.5,KAWASAKI DISEASE
1876,Disease::DOID:14268,Disease::DOID:14268,sclerosing cholangitis,Disease,DOID:14268,DOID,,1,447771,Sclerosing cholangitis,UMLS,E (Exact mapping: the two concepts are equival...,,Validated,UMLS,3,14268,,C0008313,SCLEROSING CHOLANGITIS
1877,Disease::DOID:2394,Disease::DOID:2394,ovarian cancer,Disease,DOID:2394,DOID,,1,213500,Ovarian cancer,UMLS,E (Exact mapping: the two concepts are equival...,,Validated,UMLS,3,2394,,C1140680,OVARIAN CANCER
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8061,Side Effect::C0221242,Side Effect::C0221242,Fixed drug eruption,Side Effect,C0221242,UMLS,,1,293812,Fixed drug eruption,UMLS,E (Exact mapping: the two concepts are equival...,,Validated,UMLS,3,C0221242,,C0877391,FIXED DRUG ERUPTION
8104,Side Effect::C0334634,Side Effect::C0334634,Mantle cell lymphoma,Side Effect,C0334634,UMLS,,1,52416,Mantle cell lymphoma,GARD,E (Exact mapping: the two concepts are equival...,,Validated,GARD,3,C0334634,,6969,MANTLE CELL LYMPHOMA
8110,Side Effect::C0334660,Side Effect::C0334660,Intravascular large B-cell lymphoma,Side Effect,C0334660,UMLS,,1,98839,Intravascular large B-cell lymphoma,ICD-11,E (Exact mapping: the two concepts are equival...,Specific code (The ORPHA code has its own code...,Validated,ICD-11,3,C0334660,,2A81.1,INTRAVASCULAR LARGE B-CELL LYMPHOMA
8187,Side Effect::C2242577,Side Effect::C2242577,Oromandibular dystonia,Side Effect,C2242577,UMLS,,1,93958,Oromandibular dystonia,UMLS,E (Exact mapping: the two concepts are equival...,,Validated,UMLS,3,C2242577,,C0393607,OROMANDIBULAR DYSTONIA


In [346]:
# DRKG entities with multiple orphanet codes?
ct_orphacode = matched_rarediseases.groupby(by=['drkg_id','name', 'match_type']).agg(
    ct_orphacode = ('Orphacode', 'count')
).reset_index()
ct_orphacode[ct_orphacode['ct_orphacode']>1]

Unnamed: 0,drkg_id,name,match_type,ct_orphacode
1,Disease::DOID:10021,duodenum cancer,2,2
2,Disease::DOID:10153,ileum cancer,2,3
10,Disease::DOID:14268,sclerosing cholangitis,2,4
153,Disease::MESH:C536385,Focal facial dermal dysplasia type I,1,2
184,Disease::MESH:C536611,Brooke-Spiegler syndrome,1,2
301,Disease::MESH:C537771,Anorectal malformation,1,2
375,Disease::MESH:D000168,Acrocephalosyndactylia,1,2
390,Disease::MESH:D000699,"Pain Insensitivity, Congenital",1,2
429,Disease::MESH:D002292,"Carcinoma, Renal Cell",1,2
499,Disease::MESH:D005359,"Fibrous Dysplasia, Polyostotic",1,2


In [344]:
matched_rarediseases[matched_rarediseases['drkg_id']=='Disease::DOID:14268']

Unnamed: 0,drkg_id,drkg_dict_id,name,entity_type,ontology_code,ontology_name,code,matched,Orphacode,Name,code_source,Disordermappingrelation,Disordermappingicdrelation,Disordermappingvalidationstatus,code_source_upper,match_type,code_x,id,code_y,key_0
1654,Disease::DOID:14268,Disease::DOID:14268,sclerosing cholangitis,Disease,DOID:14268,DOID,,1,171,Primary sclerosing cholangitis,ICD-11,E (Exact mapping: the two concepts are equival...,Specific code (The ORPHA code has its own code...,Validated,ICD-11,2,14268,DOID:14268,DB96.2,
1662,Disease::DOID:14268,Disease::DOID:14268,sclerosing cholangitis,Disease,DOID:14268,DOID,,1,447774,Secondary sclerosing cholangitis,ICD-11,NTBT (ORPHA code's Narrower Term maps to a Bro...,Index term (The ORPHA code is listed in the IC...,Validated,ICD-11,2,14268,DOID:14268,DC13,
1664,Disease::DOID:14268,Disease::DOID:14268,sclerosing cholangitis,Disease,DOID:14268,DOID,,1,447764,IgG4-related sclerosing cholangitis,ICD-10,NTBT (ORPHA code's Narrower Term maps to a Bro...,Attributed (The ICD code is attributed by Orph...,Validated,ICD-10,2,14268,DOID:14268,K83.0,
1665,Disease::DOID:14268,Disease::DOID:14268,sclerosing cholangitis,Disease,DOID:14268,DOID,,1,480556,Isolated neonatal sclerosing cholangitis,ICD-10,NTBT (ORPHA code's Narrower Term maps to a Bro...,Attributed (The ICD code is attributed by Orph...,Not yet validated,ICD-10,2,14268,DOID:14268,K83.0,
1876,Disease::DOID:14268,Disease::DOID:14268,sclerosing cholangitis,Disease,DOID:14268,DOID,,1,447771,Sclerosing cholangitis,UMLS,E (Exact mapping: the two concepts are equival...,,Validated,UMLS,3,14268,,C0008313,SCLEROSING CHOLANGITIS


In [347]:
# Orphacodes with multiple DRKG entities?
ct_drkg_id = matched_rarediseases.groupby(by=['Orphacode','name', 'match_type']).agg(
    ct_drkg_id = ('drkg_id', 'count')
).reset_index()
ct_drkg_id[ct_drkg_id['ct_drkg_id']>1]

Unnamed: 0,Orphacode,name,match_type,ct_drkg_id
1,100,Ataxia Telangiectasia,1,2
4,100067,Waterhouse-Friderichsen Syndrome,1,2
36,1041,Hydrops Fetalis,1,2
56,1163,Aspergillosis,1,2
61,1183,Opsoclonus-Myoclonus Syndrome,1,2
...,...,...,...,...
1431,963,Acromegaly,1,2
1437,97275,Encephalitis,1,2
1438,97279,Insulinoma,1,2
1459,98292,Mastocytosis,1,2


In [232]:
matched_rarediseases[matched_rarediseases['Orphacode']=='88642']

Unnamed: 0,drkg_id,drkg_dict_id,name,entity_type,ontology_code,ontology_name,code,matched,Orphacode,Name,code_source,Disordermappingrelation,Disordermappingicdrelation,Disordermappingvalidationstatus,code_source_upper,key_0,code_x,code_y
371,Disease::MESH:D000699,MESH::D000699,"Pain Insensitivity, Congenital",Disease,MESH:D000699,MESH,D000699,1.0,88642,Congenital insensitivity to pain-anosmia-neuro...,MeSH,E (Exact mapping: the two concepts are equival...,,Validated,MESH,,,
636,Disease::MESH:D009477,MESH::D009477,Hereditary Sensory and Autonomic Neuropathies,Disease,MESH:D009477,MESH,D009477,1.0,88642,Congenital insensitivity to pain-anosmia-neuro...,MeSH,E (Exact mapping: the two concepts are equival...,,Validated,MESH,,,


In [233]:
matched_rarediseases[matched_rarediseases['Orphacode']=='1183']

Unnamed: 0,drkg_id,drkg_dict_id,name,entity_type,ontology_code,ontology_name,code,matched,Orphacode,Name,code_source,Disordermappingrelation,Disordermappingicdrelation,Disordermappingvalidationstatus,code_source_upper,key_0,code_x,code_y
988,Disease::MESH:D053578,MESH::D053578,Opsoclonus-Myoclonus Syndrome,Disease,MESH:D053578,MESH,D053578,1.0,1183,Opsoclonus-myoclonus syndrome,MeSH,E (Exact mapping: the two concepts are equival...,,Validated,MESH,,,
989,Symptom::D053578,Symptom::D053578,Opsoclonus-Myoclonus Syndrome,Symptom,D053578,MESH,D053578,1.0,1183,Opsoclonus-myoclonus syndrome,MeSH,E (Exact mapping: the two concepts are equival...,,Validated,MESH,,,


In [244]:
# Non-matched rare diseases? (could still be in DRKG but under different name or code - need to check that embeddings are separate)

unmatched_rarediseases = orphan_codes[~orphan_codes['Orphacode'].isin(matched_rarediseases['Orphacode'])]
unmatched_rarediseases

Unnamed: 0,Orphacode,Name,code_source,code,Disordermappingrelation,Disordermappingicdrelation,Disordermappingvalidationstatus,code_source_upper
0,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",ICD-10,Q77.3,NTBT (ORPHA code's Narrower Term maps to a Bro...,Attributed (The ICD code is attributed by Orph...,Validated,ICD-10
1,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",OMIM,607131,E (Exact mapping: the two concepts are equival...,,Validated,OMIM
2,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",UMLS,C1846722,E (Exact mapping: the two concepts are equival...,,Validated,UMLS
9,166032,"Multiple epiphyseal dysplasia, with miniepiphyses",ICD-10,Q77.3,NTBT (ORPHA code's Narrower Term maps to a Bro...,Attributed (The ICD code is attributed by Orph...,Validated,ICD-10
10,166032,"Multiple epiphyseal dysplasia, with miniepiphyses",UMLS,C1836307,E (Exact mapping: the two concepts are equival...,,Validated,UMLS
...,...,...,...,...,...,...,...,...
30794,620368,EGF-related primary hypomagnesemia with intell...,UMLS,C5681825,E (Exact mapping: the two concepts are equival...,,Validated,UMLS
30795,617910,Conjunctival malignant melanoma,UMLS,C0346360,E (Exact mapping: the two concepts are equival...,,Validated,UMLS
30796,619948,Early-onset autoimmunity-autoinflammation-immu...,UMLS,C5680416,E (Exact mapping: the two concepts are equival...,,Validated,UMLS
30797,619360,NON RARE IN EUROPE: Isolated hereditary persis...,ICD-10,D56.4,E (Exact mapping: the two concepts are equival...,Specific code (The ORPHA code has its own code...,Validated,ICD-10


# 3) BioLinkBERT embedding

In [7]:
from torch_geometric.data import HeteroData
from SIMP_LLM.llm_encode import EntityEncoder
# from SIMP_LLM.dataloader_mappings import create_mapping, create_edges, embed_entities, embed_edges
from DEBUG_dataloader_mappings import create_mapping, create_edges, embed_entities, embed_edges

### Set variables and load data

In [127]:
## Set variables
device   = "cpu"
Encoder  = EntityEncoder(device = device )
run_full_sample = 0

if run_full_sample:
    # Run full DRKG
    entity_df = drkg_entity_df.copy()
    hrt_data = drkg_translated.copy()
    relation_lookup = relation_df.copy()
else:
    # Create relationship subset for testing
    test_list = ['Compound treats the disease', 'increases expression or production', 'biomarkers diagnostic']

    # Create test sample of DRKG relationships filtering to these relations (for full sample: delete and use drkg_entity_df)
    test_relation_df = relation_df[relation_df['relation_name'].isin(test_list)].copy()
    print_head(test_relation_df)

    # Create test sample of DRKG relationships filtering to these relations (for full sample: delete and use drkg_entity_df)
    test_hrt_df = drkg_translated[drkg_translated[1].isin(test_relation_df['drkg_id'])]
    test_hrt_df = test_hrt_df.groupby(1).head(3).reset_index(drop=True)
    test_unique_entities = get_unique_entities(test_hrt_df, columns=[0,2])
    test_entity_df = drkg_entity_df[drkg_entity_df['name'].isin(test_unique_entities)]
    print_head(test_hrt_df)
    print_head(test_entity_df)

    entity_df = test_entity_df.copy()
    hrt_data = test_hrt_df.copy()
    relation_lookup = relation_df.copy()

+----+------------------------------------+---------------+--------------------------+---------------------------------------------+------------------------------------------------------------------------------------------+---------------------------------+---------------+---------------+------------------------------------+
|    | drkg_id                            | Data-source   | Connected entity-types   | Interaction-type                            | Description                                                                              | Reference for the description   | head_entity   | tail_entity   | relation_name                      |
|----+------------------------------------+---------------+--------------------------+---------------------------------------------+------------------------------------------------------------------------------------------+---------------------------------+---------------+---------------+------------------------------------|
| 18 | DRUGBANK::tr

In [120]:
# Check for special characters in relation name
relation_lookup[relation_lookup['relation_name'].str.contains('/')]
relation_df['relation_name'][relation_df['relation_name'].str.replace(' ', '').str.contains(r"[^a-zA-Z0-9]+", regex=True)].drop_duplicates()

# Replace and check
relation_df_test = relation_df.copy()
relation_df_test['relation_name'] = relation_df_test['relation_name'].str.replace(',|/', ' or ', regex=True)
relation_df_test['relation_name'] = relation_df_test['relation_name'].str.replace('esp.','especially')
relation_df_test['relation_name'] = relation_df_test['relation_name'].str.replace('\(|\)|-|\.', '', regex=True)
relation_df_test['relation_name'][relation_df_test['relation_name'].str.replace(' ', '').str.contains(r"[^a-zA-Z0-9]+", regex=True)].drop_duplicates()

# Check differences
relation_df_test = relation_df.copy()
relation_df_test['new_relation_name'] = relation_df_test['relation_name']
relation_df_test['new_relation_name'] = relation_df_test['new_relation_name'].str.replace(',|/', ' or ', regex=True)
relation_df_test['new_relation_name'] = relation_df_test['new_relation_name'].str.replace('esp.','especially')
relation_df_test['new_relation_name'] = relation_df_test['new_relation_name'].str.replace('\(|\)|-|\.', '', regex=True)
relation_df_test[['relation_name', 'new_relation_name']][relation_df_test['relation_name'] != relation_df_test['new_relation_name']].drop_duplicates() #.loc[17, 'new_relation_name']


Unnamed: 0,relation_name,new_relation_name
14,drug-drug interaction,drugdrug interaction
17,Compound belongs to Anatomical Therapeutic Che...,Compound belongs to Anatomical Therapeutic Che...
22,"binding, ligand (esp. receptors)",binding or ligand especially receptors
23,inhibits cell growth (esp. cancers),inhibits cell growth especially cancers
25,increases expression/production,increases expression or production
27,decreases expression/production,decreases expression or production
28,affects expression/production (neutral),affects expression or production neutral
35,"metabolism, pharmacokinetics",metabolism or pharmacokinetics
37,biomarkers (diagnostic),biomarkers diagnostic
38,biomarkers (of disease progression),biomarkers of disease progression


In [134]:
relation_df.loc[17,'relation_name']

'Compound belongs to Anatomical Therapeutic Chemical ATC code'

### Build HeteroData Object

In [128]:
# Initialize heterograph object
data = HeteroData()

# Embed entities, add to graph, and save embedding mapping dictionary of dictionaries
mapping_dict = embed_entities(entity_df, data, Encoder, device) 

# Embed relationships, add to graph, and save relation embeddings/mapping dictionary
relation_X, relation_mapping = embed_edges(hrt_data, relation_lookup, data, mapping_dict, Encoder, device, data_path=DATA_DIR)

# Print summary
#data = T.ToUndirected()(data)

print(data)
for ent_type in entity_df['entity_type'].unique():
    print(f"Unique {ent_type}s: {len(mapping_dict[ent_type])} \t Matrix shape: {data[ent_type].x.shape }")
    # print(mapping_dict[ent_type]) # Prints whole dictionary so delete/uncomment if using all entities

HeteroData(
  [1mCompound[0m={ x=[12, 768] },
  [1mDisease[0m={ x=[11, 768] },
  [1mGene[0m={ x=[12, 768] },
  [1m(Compound, Compound treats the disease, Disease)[0m={
    edge_index=[2, 9],
    edge_label=[9, 768]
  },
  [1m(Compound, increases expression or production, Gene)[0m={
    edge_index=[2, 6],
    edge_label=[6, 768]
  },
  [1m(Gene, increases expression or production, Gene)[0m={
    edge_index=[2, 3],
    edge_label=[3, 768]
  },
  [1m(Gene, biomarkers diagnostic, Disease)[0m={
    edge_index=[2, 3],
    edge_label=[3, 768]
  }
)
Unique Compounds: 12 	 Matrix shape: torch.Size([12, 768])
Unique Diseases: 11 	 Matrix shape: torch.Size([11, 768])
Unique Genes: 12 	 Matrix shape: torch.Size([12, 768])


In [129]:

data1 = T.ToUndirected()(data)
data1

HeteroData(
  [1mCompound[0m={ x=[12, 768] },
  [1mDisease[0m={ x=[11, 768] },
  [1mGene[0m={ x=[12, 768] },
  [1m(Compound, Compound treats the disease, Disease)[0m={
    edge_index=[2, 9],
    edge_label=[9, 768]
  },
  [1m(Compound, increases expression or production, Gene)[0m={
    edge_index=[2, 6],
    edge_label=[6, 768]
  },
  [1m(Gene, increases expression or production, Gene)[0m={
    edge_index=[2, 6],
    edge_label=[6, 768]
  },
  [1m(Gene, biomarkers diagnostic, Disease)[0m={
    edge_index=[2, 3],
    edge_label=[3, 768]
  },
  [1m(Disease, rev_Compound treats the disease, Compound)[0m={
    edge_index=[2, 9],
    edge_label=[9, 768]
  },
  [1m(Gene, rev_increases expression or production, Compound)[0m={
    edge_index=[2, 6],
    edge_label=[6, 768]
  },
  [1m(Disease, rev_biomarkers diagnostic, Gene)[0m={
    edge_index=[2, 3],
    edge_label=[3, 768]
  }
)

In [73]:
data[('Compound', 'increases expression/production', 'Gene')].edge_index

tensor([[ 7,  6,  3,  8,  1,  4],
        [ 3,  9,  2, 10,  7,  6]])

In [14]:
# Experimental: Ignore ##
data2 = HeteroData()
data2['Compound'].x  = data["Compound"]["x"]
data2['Disease'].x   = data["Disease"]["x"]

ctd                  = data[("Compound", "Compound treats the disease", "Disease")]["edge_index"] #.to_sparse()
data2['Compound', 'treats', 'Disease'].edge_index = ctd
target_label                                     = data2['Compound', 'treats', 'Disease'].edge_index
#data['Compound', 'treats', 'Disease'].edge_label =  torch.ones(target_label.shape[1],)
data2 = T.ToUndirected()(data2)




print(target_label.shape)

print(data2)

torch.Size([2, 3])
HeteroData(
  [1mCompound[0m={ x=[13, 768] },
  [1mDisease[0m={ x=[3, 768] },
  [1m(Compound, treats, Disease)[0m={ edge_index=[2, 3] },
  [1m(Disease, rev_treats, Compound)[0m={ edge_index=[2, 3] }
)


In [31]:
# SP create new test sample
run_test_sample3 = 1
if  run_test_sample3:  
    test_relation_df = relation_df[relation_df['relation_name']=='Compound treats the disease']

    # Create test sample of DRKG relationships filtering to these relations (for full sample: delete and use drkg_entity_df)
    test_hrt_df = drkg_translated[drkg_translated[1].isin(test_relation_df['drkg_id'])]
    test_hrt_df = test_hrt_df.groupby(1).head(3).reset_index(drop=True)
    test_hrt_df[1] = 'Compound treats the disease'
    print_head(test_hrt_df)


+----+--------------+-----------------------------+-------------------------+
|    | 0            | 1                           | 2                       |
|----+--------------+-----------------------------+-------------------------|
|  0 | Dornase alfa | Compound treats the disease | Cystic Fibrosis         |
|  1 | Etanercept   | Compound treats the disease | Spondylitis, Ankylosing |
|  2 | Etanercept   | Compound treats the disease | Graft vs Host Disease   |
|  3 | Tetrandrine  | Compound treats the disease | Dermatitis, Atopic      |
|  4 | Tetrandrine  | Compound treats the disease | Q Fever                 |
+----+--------------+-----------------------------+-------------------------+


In [32]:
#### Graph that works
rx_dx_subset = test_hrt_df[:10].copy()


### DX RX Relationship ###
rx_X,rx_mapping = create_mapping(rx_dx_subset[0].to_list(),encoder= Encoder ,device=device) # Maps drugs to indices
dx_X,dx_mapping = create_mapping(rx_dx_subset[2].to_list(),encoder= Encoder ,device=device) # Maps drugs to indices
## As of now this only encodes 'Compound treats the disease', but  the idea is that this is used to encode every entity 
relationship_X,relationship_mapping = create_mapping(rx_dx_subset[1].to_list(),encoder= Encoder ,device=device)  

print(f"Unique Drugs:   {len(rx_mapping)} Matrix shape: {rx_X.shape}")
print(f"Unique Disases: {len(dx_mapping)} Matrix shape: {dx_X.shape }")
relationship_feature = relationship_X[relationship_mapping['Compound treats the disease'],:].reshape(1,-1)

data3 = HeteroData()
data3['compounds'].x = rx_X
data3['disease'].x   = dx_X

#data['compounds2'].x = rx_X
#data['disease2'].x   = dx_X
#print(data)

Edge_index,edge_attribute = create_edges(df             =  rx_dx_subset,
                                          src_index_col  = 0, 
                                          src_mapping    = rx_mapping , 
                                          dst_index_col  = 2, 
                                          dst_mapping    = dx_mapping ,
                                          edge_attr      = relationship_feature)

data3['compounds', 'treats', 'disease'].edge_index = Edge_index
data3 = T.ToUndirected()(data3)


print(data3)

Unique Drugs:   6 Matrix shape: torch.Size([6, 768])
Unique Disases: 8 Matrix shape: torch.Size([8, 768])
HeteroData(
  [1mcompounds[0m={ x=[6, 768] },
  [1mdisease[0m={ x=[8, 768] },
  [1m(compounds, treats, disease)[0m={ edge_index=[2, 9] },
  [1m(disease, rev_treats, compounds)[0m={ edge_index=[2, 9] }
)


In [59]:

data3['compounds', 'treats', 'disease'].edge_index

tensor([[0, 2, 2, 5, 5, 5, 3, 1, 4],
        [0, 7, 1, 2, 6, 4, 3, 5, 5]])

In [62]:
data['Compound', 'Compound treats the disease', 'Disease'].edge_index
data2['Compound', 'treats', 'Disease'].edge_index

tensor([[ 1, 10, 10],
        [ 0,  1,  2]])

In [60]:
relation_df['relation_name'].drop_duplicates()[70:90]

99     post-translational modification
100                           reaction
Name: relation_name, dtype: object

## GRAPH SAGE

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric as pyg
import torch_geometric
from torch_geometric.nn import SAGEConv, to_hetero
from   torch.utils.data      import Dataset, DataLoader
from   torch_geometric.data  import Data
from   torch_geometric.utils import negative_sampling

from torch_geometric.nn import SAGEConv, to_hetero




class GNNStack(torch.nn.Module):
    def __init__(self, input_dim:int, hidden_dim:int, output_dim:int, layers:int, dropout:float=0.3, return_embedding=False):
        """
            A stack of GraphSAGE Module 
            input_dim        <int>:   Input dimension
            hidden_dim       <int>:   Hidden dimension
            output_dim       <int>:   Output dimension
            layers           <int>:   Number of layers
            dropout          <float>: Dropout rate
            return_embedding <bool>:  Whether to return the return_embeddingedding of the input graph
        """
        
        super(GNNStack, self).__init__()
        graphSage_conv               = pyg.nn.SAGEConv
        self.dropout                 = dropout
        self.layers                  = layers
        self.return_embedding        = return_embedding

        ### Initalize the layers ###
        self.convs                   = nn.ModuleList()                      # ModuleList to hold the layers
        for l in range(self.layers):
            if l == 0:
                ### First layer  maps from input_dim to hidden_dim ###
                self.convs.append(graphSage_conv(input_dim, hidden_dim))
            else:
                ### All other layers map from hidden_dim to hidden_dim ###
                self.convs.append(graphSage_conv(hidden_dim, hidden_dim))

        # post-message-passing processing MLP
        self.post_mp = nn.Sequential(
                                     nn.Linear(hidden_dim, hidden_dim), 
                                     nn.Dropout(self.dropout),
                                     nn.Linear(hidden_dim, output_dim))

    def forward(self, x, edge_index):
        for i in range(self.layers):
            x = self.convs[i](x, edge_index)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.post_mp(x)

        # Return final layer of return_embeddingeddings if specified
        if self.return_embedding:
            return x

        # Else return class probabilities
        return F.log_softmax(x, dim=1)

    def loss(self, pred, label):
        return F.nll_loss(pred, label)
    


class LinkPredictorMLP(nn.Module):
    def __init__(self, in_channels:int, hidden_channels:int, out_channels:int, n_layers:int,dropout_probabilty:float=0.3):
        """
        Args:
            in_channels (int):     Number of input features.
            hidden_channels (int): Number of hidden features.
            out_channels (int):    Number of output features.
            n_layers (int):        Number of MLP layers.
            dropout (float):       Dropout probability.
            """
        super(LinkPredictorMLP, self).__init__()
        self.dropout_probabilty    = dropout_probabilty  # dropout probability
        self.mlp_layers            = nn.ModuleList()     # ModuleList: is a list of modules
        self.non_linearity         = F.relu              # non-linearity
        
        for i in range(n_layers - 1):                                 
            if i == 0:
                self.mlp_layers.append(nn.Linear(in_channels, hidden_channels))          # input layer (in_channels, hidden_channels)
            else:
                self.mlp_layers.append(nn.Linear(hidden_channels, hidden_channels))      # hidden layers (hidden_channels, hidden_channels)

        self.mlp_layers.append(nn.Linear(hidden_channels, out_channels))                 # output layer (hidden_channels, out_channels)


    def reset_parameters(self):
        for mlp_layer in self.mlp_layers:
            mlp_layer.reset_parameters()

    def forward(self, x_i, x_j):
        x = x_i * x_j                                                     # element-wise multiplication
        for mlp_layer in self.mlp_layers[:-1]:                            # iterate over all layers except the last one
            x = mlp_layer(x)                                              # apply linear transformation
            x = self.non_linearity(x)                                     # Apply non linear activation function
            x = F.dropout(x, p=self.dropout_probabilty,training=self.training)      # Apply dropout
        x = self.mlp_layers[-1](x)                                        # apply linear transformation to the last layer
        x = torch.sigmoid(x)                                              # apply sigmoid activation function to get the probability
        return x
    
### We will use This function to save our best model during trainnig ###
def save_torch_model(model,epoch,PATH:str,optimizer):
    print(f"Saving Model in Path {PATH}")
    torch.save({'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer':optimizer,      
                }, PATH)

In [130]:
epochs        = 500
hidden_dim    = 524      # 256 
dropout       = 0.7
num_layers    = 3
learning_rate = 1e-4
node_emb_dim  = 768

test_data = data1

HomoGNN         = GNNStack(node_emb_dim, hidden_dim, hidden_dim, num_layers, dropout, return_embedding=True).to(device) # the graph neural network that takes all the node embeddings as inputs to message pass and agregate
HeteroGNN       = to_hetero(HomoGNN   , test_data.metadata(), aggr='sum')
link_predictor  = LinkPredictorMLP(hidden_dim, hidden_dim, 1, num_layers , dropout).to(device) # the MLP that takes embeddings of a pair of nodes and predicts the existence of an edge between them
#optimizer      = torch.optim.AdamW(list(model.parameters()) + list(link_predictor.parameters() ), lr=learning_rate, weight_decay=1e-4)
optimizer       = torch.optim.Adam(list(HeteroGNN.parameters()) + list(link_predictor.parameters() ), lr=learning_rate)

print(HeteroGNN )
print(link_predictor)
print(f"Models Loaded to {device}")

GraphModule(
  (convs): ModuleList(
    (0): ModuleDict(
      (Compound__Compound_treats_the_disease__Disease): SAGEConv(768, 524, aggr=mean)
      (Compound__increases_expression_or_production__Gene): SAGEConv(768, 524, aggr=mean)
      (Gene__increases_expression_or_production__Gene): SAGEConv(768, 524, aggr=mean)
      (Gene__biomarkers_diagnostic__Disease): SAGEConv(768, 524, aggr=mean)
      (Disease__rev_Compound_treats_the_disease__Compound): SAGEConv(768, 524, aggr=mean)
      (Gene__rev_increases_expression_or_production__Compound): SAGEConv(768, 524, aggr=mean)
      (Disease__rev_biomarkers_diagnostic__Gene): SAGEConv(768, 524, aggr=mean)
    )
    (1-2): 2 x ModuleDict(
      (Compound__Compound_treats_the_disease__Disease): SAGEConv(524, 524, aggr=mean)
      (Compound__increases_expression_or_production__Gene): SAGEConv(524, 524, aggr=mean)
      (Gene__increases_expression_or_production__Gene): SAGEConv(524, 524, aggr=mean)
      (Gene__biomarkers_diagnostic__Disease): 



In [66]:
print(test_data)

HeteroData(
  [1mCompound[0m={ x=[12, 768] },
  [1mDisease[0m={ x=[11, 768] },
  [1mGene[0m={ x=[12, 768] },
  [1m(Compound, Compound treats the disease, Disease)[0m={
    edge_index=[2, 9],
    edge_label=[9, 768]
  },
  [1m(Compound, increases expression/production, Gene)[0m={
    edge_index=[2, 6],
    edge_label=[6, 768]
  },
  [1m(Gene, increases expression/production, Gene)[0m={
    edge_index=[2, 6],
    edge_label=[6, 768]
  },
  [1m(Gene, biomarkers (diagnostic), Disease)[0m={
    edge_index=[2, 3],
    edge_label=[3, 768]
  },
  [1m(Disease, rev_Compound treats the disease, Compound)[0m={
    edge_index=[2, 9],
    edge_label=[9, 768]
  },
  [1m(Gene, rev_increases expression/production, Compound)[0m={
    edge_index=[2, 6],
    edge_label=[6, 768]
  },
  [1m(Disease, rev_biomarkers (diagnostic), Gene)[0m={
    edge_index=[2, 3],
    edge_label=[3, 768]
  }
)


In [28]:
len(test_data.x_dict.values())

3

In [37]:
test_data.edge_index_dict

{('Compound',
  'activation',
  'Gene'): tensor([[11,  0,  4,  6,  7,  5,  2,  2,  9],
         [16, 12, 14,  3, 23,  5,  1, 22,  6]]),
 ('Compound',
  'inhibition',
  'Gene'): tensor([[ 1, 10,  3],
         [ 7, 21, 19]]),
 ('Compound',
  'Compound treats the disease',
  'Disease'): tensor([[12,  8,  8],
         [ 1,  0,  2]]),
 ('Gene',
  'activates, stimulates',
  'Gene'): tensor([[ 2,  4,  8, 11, 15, 17],
         [11, 17, 15,  2,  8,  4]]),
 ('Gene',
  'inhibition',
  'Gene'): tensor([[ 0,  9, 10, 13, 18, 20],
         [20, 18, 13, 10,  9,  0]]),
 ('Gene',
  'rev_activation',
  'Compound'): tensor([[16, 12, 14,  3, 23,  5,  1, 22,  6],
         [11,  0,  4,  6,  7,  5,  2,  2,  9]]),
 ('Gene',
  'rev_inhibition',
  'Compound'): tensor([[ 7, 21, 19],
         [ 1, 10,  3]]),
 ('Disease',
  'rev_Compound treats the disease',
  'Compound'): tensor([[ 1,  0,  2],
         [12,  8,  8]])}

In [35]:
data3.edge_index_dict

{('compounds',
  'treats',
  'disease'): tensor([[5, 2, 2, 3, 3, 3, 0, 4, 1],
         [5, 0, 2, 4, 6, 7, 3, 1, 1]]),
 ('disease',
  'rev_treats',
  'compounds'): tensor([[5, 0, 2, 4, 6, 7, 3, 1, 1],
         [5, 2, 2, 3, 3, 3, 0, 4, 1]])}

In [252]:
node_emb   = HeteroGNN(test_data.x_dict, test_data.edge_index_dict)
# edge_index = test_data['compounds', 'treats', 'disease'].edge_index 
# pos_pred    = link_predictor(node_emb["compounds"][edge_index[0]], node_emb["disease"][edge_index[0]])   # (B, )
edge_index = test_data['Compound', 'Compound treats the disease', 'Disease'].edge_index 
pos_pred    = link_predictor(node_emb["Compound"][edge_index[0]], node_emb["Disease"][edge_index[1]])   # (B, )



In [253]:
pos_pred

tensor([[0.4970],
        [0.4745],
        [0.4823],
        [0.4923],
        [0.4851],
        [0.4945],
        [0.4230],
        [0.4087],
        [0.4884]], grad_fn=<SigmoidBackward0>)

In [254]:
test_data

HeteroData(
  [1mCompound[0m={ x=[12, 768] },
  [1mDisease[0m={ x=[11, 768] },
  [1mGene[0m={ x=[12, 768] },
  [1m(Compound, Compound treats the disease, Disease)[0m={
    edge_index=[2, 9],
    edge_label=[9, 768]
  },
  [1m(Compound, increases expression or production, Gene)[0m={
    edge_index=[2, 6],
    edge_label=[6, 768]
  },
  [1m(Gene, increases expression or production, Gene)[0m={
    edge_index=[2, 6],
    edge_label=[6, 768]
  },
  [1m(Gene, biomarkers diagnostic, Disease)[0m={
    edge_index=[2, 3],
    edge_label=[3, 768]
  },
  [1m(Disease, rev_Compound treats the disease, Compound)[0m={
    edge_index=[2, 9],
    edge_label=[9, 768]
  },
  [1m(Gene, rev_increases expression or production, Compound)[0m={
    edge_index=[2, 6],
    edge_label=[6, 768]
  },
  [1m(Disease, rev_biomarkers diagnostic, Gene)[0m={
    edge_index=[2, 3],
    edge_label=[3, 768]
  }
)