DRKG

Adapted from: https://github.com/gnn4dr/DRKG/blob/master/drkg_with_dgl/loading_drkg_in_dgl.ipynb

In [3]:
import pandas as pd
import numpy as np
import os 

In [4]:
from SIMP_LLM.DRKG_loading   import  get_triplets, read_tsv,filter_drkg,map_drkg_relationships,filter_interaction_subset,print_head
from SIMP_LLM.DRKG_translate import  load_lookups
from SIMP_LLM.DRKG_entity_processing import get_unique_entities, get_entity_lookup, convert_entitynames, flip_headtail

# 1) Load Data

In [5]:
### 1) Read: This section reads DRKG and a glossary (used to map entities from codes to words)
DATA_DIR           = os.path.join("data")
verbose            =  True 
triplets,drkg_df   =  get_triplets(drkg_file = os.path.join(DATA_DIR  ,'drkg.tsv'),             verbose=verbose)  # Read triplets (head,relationship,tail)
relation_glossary  =  read_tsv(relation_file = os.path.join(DATA_DIR  ,'relation_glossary.tsv'),verbose=verbose)  # Read relationship mapping  


### 2) Filter & Map Interactions: This section returns a list of interactions e.g. DRUGBANK::treats::Compound:Disease )
# 2.1: First  we filter the interactions to only Compound-Disease
# 2.2: Then   we map the codes -> text  (this will be use to further filter interactions based on text) e.g.  Hetionet::CpD::Compound:Disease -> palliation
# 2.3: We use natural text to fitler  interactions based on terms such as "treat" (but we return the orignal interaction name )



# modularize this in create_dataframe
drkg_rx_dx_relations        = filter_drkg(data_frame = drkg_df ,  filter_column = 1 ,  filter_term = r'.*?Compound:Disease', verbose = verbose) # 2.1 Filter only Compound-Disease Interactions
drkg_rx_dx_relations_mapped = map_drkg_relationships(drkg_rx_dx_relations,relation_glossary,verbose=verbose)                                    # 2.2 Map codes to text 

### 2.3 Filter Drug interactions Interaction types to only include: treat inhibit or alleviate interactions  ###
drkg_rx_dx_relation_subset =  filter_interaction_subset(df                  = drkg_rx_dx_relations_mapped,
                                                        filter_colunm_name = 'Interaction-type' ,
                                                        regex_string       =  'treat|inhibit|alleviate',
                                                        return_colunm_name =  'Relation-name')

# 3) Use Filter Interactions to get Gilter DRKG 
drkg_df_filtered = drkg_df[drkg_df[1].isin(drkg_rx_dx_relation_subset)] # 3.1 Filter DRKG  to only  Compund-Disease 
print_head(df=drkg_df_filtered)



###

rx_dx_triplets   = drkg_df_filtered.values.tolist()                     # 3.2 Convert filtered DRKG to list


 Triplets:

[['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::2157'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::5264'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::2158'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::3309'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::28912'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::811'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::2159'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::821'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::5627'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::5624']]

 data/drkg.tsv  Dataframe:

+----+------------+--------------------------------+-------------+
|    | 0          | 1                              | 2           |
|----+------------+--------------------------------+-------------|
|  0 | Gene::2157 | bioarx::HumGenHumGen:Gene:Gene | Gene::2157  |
|  1 | Gene::2157 | bioarx::HumGenHumGen:Gene:Gene

In [6]:
# 4) Load Data frames for translation
hetionet_df, gene_df, drugbank_df, omim_df, mesh_dict, chebi_df, chembl_df = load_lookups(data_path=DATA_DIR,verbose=verbose)


 data/hetionet-v1.0-nodes.tsv  Dataframe:

+----+-------------------------+---------------------------+---------+
|    | id                      | name                      | kind    |
|----+-------------------------+---------------------------+---------|
|  0 | Anatomy::UBERON:0000002 | uterine cervix            | Anatomy |
|  1 | Anatomy::UBERON:0000004 | nose                      | Anatomy |
|  2 | Anatomy::UBERON:0000006 | islet of Langerhans       | Anatomy |
|  3 | Anatomy::UBERON:0000007 | pituitary gland           | Anatomy |
|  4 | Anatomy::UBERON:0000010 | peripheral nervous system | Anatomy |
+----+-------------------------+---------------------------+---------+

 Sample of Hetionet Data Types (Before processing):

+-------+----------------------------------+-------------------------------------------+---------------------+
|       | id                               | name                                      | kind                |
|-------+--------------------------------

In [7]:
# Make dictionaries for codes
code_df   = pd.concat([hetionet_df[['name', 'id']], 
                       gene_df.rename(columns = {"description":"name", "GeneID":"id"}),
                       drugbank_df.rename(columns = {"Common name":"name", "DrugBank ID":"id"}),
                       omim_df.rename(columns = {"MIM Number":"id"}),
                       chebi_df.rename(columns = {"NAME":"name", "CHEBI_ACCESSION":"id"}),
                       chembl_df.rename(columns = {"pref_name":"name", "chembl_id":"id"})
                       ], ignore_index=True, axis=0).drop_duplicates() 
code_dict = pd.Series(code_df['name'].values, index=code_df['id']).to_dict() | mesh_dict # Convert node df to dict and merge with MeSH dictionary

# Get unique DRKG entities
drkg_entities = get_unique_entities(drkg_df, [0,2])

# Create and use convert_entitynames function
drkg_entity_df, drkg_unmatched = get_entity_lookup(drkg_entities, code_dict)

# Create final node dictionary
node_dict = pd.Series(drkg_entity_df['name'].values, index=drkg_entity_df['drkg_id']).to_dict() 

# Initialize translated DRKG and manually clean heads/tails for one case where they were flipped
drkg_translated    = drkg_df.copy()
drkg_translated = flip_headtail(drkg_translated, 'Gene:Compound')

# Map DRKG to translated entity names
drkg_translated = convert_entitynames(drkg_translated, 0, node_dict)
drkg_translated = convert_entitynames(drkg_translated, 2, node_dict)
drkg_translated = drkg_translated.dropna()
print_head(drkg_translated) 

# Summarize percentage translated
print("Number of unique DRKG entities: ", len(drkg_entities)) # should be 97238
print("Number of translated entities: ", drkg_entity_df.shape[0])
print("Number of untranslated entities: ", drkg_unmatched.shape[0])
pct_entity_translated = drkg_entity_df.shape[0]/len(drkg_entities)
print('Percentage of entities translated: ', round(pct_entity_translated*100,1), '%')

print('Total DRKG relationships: ', drkg_df.shape[0])
print('Translated DRKG relationships: ', drkg_translated.shape[0])
pct_translated = drkg_translated.shape[0]/drkg_df.shape[0]
print('Percentage of relationships fully translated: ', round(pct_translated*100,1), '%')

+----+------------------------------+--------------------------------+------------------------------------------------------+
|    | 0                            | 1                              | 2                                                    |
|----+------------------------------+--------------------------------+------------------------------------------------------|
|  0 | coagulation factor VIII (F8) | bioarx::HumGenHumGen:Gene:Gene | coagulation factor VIII (F8)                         |
|  1 | coagulation factor VIII (F8) | bioarx::HumGenHumGen:Gene:Gene | phytanoyl-CoA 2-hydroxylase (PHYH)                   |
|  2 | coagulation factor VIII (F8) | bioarx::HumGenHumGen:Gene:Gene | coagulation factor IX (F9)                           |
|  3 | coagulation factor VIII (F8) | bioarx::HumGenHumGen:Gene:Gene | heat shock protein family A (Hsp70) member 5 (HSPA5) |
|  4 | coagulation factor VIII (F8) | bioarx::HumGenHumGen:Gene:Gene | immunoglobulin kappa variable 3-20 (IGKV3-20)  

In [27]:
# Update relation glossary 
relation_df = relation_glossary.copy().rename(columns={'Relation-name':'drkg_id'})
relation_df[['head_entity','tail_entity']] = relation_df['drkg_id'].str.split('::', expand=True)[2].str.split(':', expand=True) # Set head and tail nodes

# Manually fix head and tail nodes for DGIDB relations, which reverse compound-gene interactions
relation_df.loc[relation_df['drkg_id'].str.contains('Gene:Compound'),'head_entity'] = 'Compound'
relation_df.loc[relation_df['drkg_id'].str.contains('Gene:Compound'),'tail_entity'] = 'Gene'

# Add mapped relation group labels
relation_groups = [['activation', 'agonism', 'agonism, activation', 'activates, stimulates'],
    ['antagonism', 'blocking', 'antagonism, blocking'],
    ['binding', 'binding, ligand (esp. receptors)'],
    ['blocking', 'channel blocking'],
    ['inhibition', 'inhibits cell growth (esp. cancers)', 'inhibits'],
    ['enzyme', 'enzyme activity'],
    ['upregulation', 'increases expression/production'],
    ['downregulation', 'decreases expression/production'],
    ['Compound treats the disease', 'treatment/therapy (including investigatory)', 'treatment']]

relation_df['relation_name'] = relation_df['Interaction-type']

for grp in relation_groups:
    relation_df_subset = relation_df[relation_df['Interaction-type'].isin(grp)].copy()
    for entities in relation_df_subset['Connected entity-types'].unique():
        subgrp = relation_df_subset[relation_df_subset['Connected entity-types'] == entities]['Interaction-type'].unique()
        relation_df.loc[(relation_df_subset['Connected entity-types'] == entities) & (relation_df['Interaction-type'].isin(subgrp)), 'relation_name'] = subgrp[0]

relation_df


Unnamed: 0,drkg_id,Data-source,Connected entity-types,Interaction-type,Description,Reference for the description,head_entity,tail_entity,relation_name
0,DGIDB::ACTIVATOR::Gene:Compound,DGIDB,Compound:Gene,activation,An activator interaction is when a drug activa...,http://www.dgidb.org/getting_started,Compound,Gene,activation
1,DGIDB::AGONIST::Gene:Compound,DGIDB,Compound:Gene,agonism,An agonist interaction occurs when a drug bind...,http://www.dgidb.org/getting_started,Compound,Gene,activation
2,DGIDB::ALLOSTERIC MODULATOR::Gene:Compound,DGIDB,Compound:Gene,allosteric modulation,An allosteric modulator interaction occurs whe...,http://www.dgidb.org/getting_started,Compound,Gene,allosteric modulation
3,DGIDB::ANTAGONIST::Gene:Compound,DGIDB,Compound:Gene,antagonism,An antagonist interaction occurs when a drug b...,http://www.dgidb.org/getting_started,Compound,Gene,antagonism
4,DGIDB::ANTIBODY::Gene:Compound,DGIDB,Compound:Gene,antibody,An antibody interaction occurs when an antibod...,http://www.dgidb.org/getting_started,Compound,Gene,antibody
...,...,...,...,...,...,...,...,...,...
102,bioarx::Covid2_acc_host_gene::Disease:Gene,BIBLIOGRAPHY,Disease:Gene,interaction,"Interactions between 27 viral proteins, and ...",,Disease,Gene,interaction
103,bioarx::DrugHumGen:Compound:Gene,BIBLIOGRAPHY,Compound:Gene,interaction,,,,,interaction
104,bioarx::DrugVirGen:Compound:Gene,BIBLIOGRAPHY,Compound:Gene,interaction,,,,,interaction
105,bioarx::HumGenHumGen:Gene:Gene,BIBLIOGRAPHY,Gene:Gene,interaction,Protein-protein interaction,,,,interaction


In [31]:
# Test specific cases
test = 'inhibition'
relation_df[relation_df['relation_name']==test]
# relation_df[relation_df['Connected entity-types']=='Compound:Disease']

Unnamed: 0,drkg_id,Data-source,Connected entity-types,Interaction-type,Description,Reference for the description,head_entity,tail_entity,relation_name
8,DGIDB::INHIBITOR::Gene:Compound,DGIDB,Compound:Gene,inhibition,"In inhibitor interactions, the drug binds to a...",http://www.dgidb.org/getting_started,Compound,Gene,inhibition
39,GNBR::N::Compound:Gene,GNBR,Compound:Gene,inhibits,,,Compound,Gene,inhibition
97,STRING::INHIBITION::Gene:Gene,STRING,Gene:Gene,inhibition,,,Gene,Gene,inhibition


In [9]:
# Filter DRKG in natural language to drug-treats-disease relationships
# rx_dx        = df_med[df_med.iloc[:,1] ==   'Compound treats the disease']
rx_dx        =  drkg_translated[drkg_translated[1].isin(drkg_rx_dx_relation_subset)]
rx_dx_subset =  rx_dx[0:10]
rx_dx_subset

Unnamed: 0,0,1,2
1518268,Dornase alfa,DRUGBANK::treats::Compound:Disease,Cystic Fibrosis
1518270,Etanercept,DRUGBANK::treats::Compound:Disease,"Spondylitis, Ankylosing"
1518271,Etanercept,DRUGBANK::treats::Compound:Disease,Graft vs Host Disease
1518272,Etanercept,DRUGBANK::treats::Compound:Disease,Hidradenitis Suppurativa
1518273,Etanercept,DRUGBANK::treats::Compound:Disease,"Arthritis, Juvenile"
1518274,Etanercept,DRUGBANK::treats::Compound:Disease,Psoriasis
1518275,Etanercept,DRUGBANK::treats::Compound:Disease,"Arthritis, Psoriatic"
1518276,Etanercept,DRUGBANK::treats::Compound:Disease,Pyoderma Gangrenosum
1518277,Etanercept,DRUGBANK::treats::Compound:Disease,"Arthritis, Rheumatoid"
1518278,Etanercept,DRUGBANK::treats::Compound:Disease,Stevens-Johnson Syndrome


# 3) BioLinkBERT embedding

In [10]:
from torch_geometric.data import HeteroData
from SIMP_LLM.llm_encode import EntityEncoder
from SIMP_LLM.dataloader_mappings import create_mapping, create_edges, embed_entities, embed_edges

### Set variables and load data

In [37]:
## Example of loading data without anything to encode
device   = "cpu"
Encoder  = EntityEncoder(device = device )


# Create relationship subset for testing
test_relation_df = relation_df[relation_df['Connected entity-types'].isin(['Compound:Gene', 'Disease:Gene', 'Compound:Disease', 'Gene:Gene'])].copy()
test_relation_df['relation_name'] = None

activation_list = ['activation', 'agonism', 'agonism, activation'] 
treat_list = ['Compound treats the disease', 'treats']
gene_drug_list = ['inhibition']

test_relation_df['relation_name'][test_relation_df['Interaction-type'].isin(activation_list)] = 'Compound activates gene'
test_relation_df['relation_name'][test_relation_df['Interaction-type'].isin(treat_list)] = 'Compound treats disease'
test_relation_df['relation_name'][test_relation_df['Interaction-type'].isin(gene_drug_list)] = 'Inhibition'
test_relation_df = test_relation_df[~test_relation_df['relation_name'].isna()]
print_head(test_relation_df)

# Create test sample of DRKG relationships filtering to these relations (for full sample: delete and use drkg_entity_df)
test_hrt_df = drkg_translated[drkg_translated[1].isin(test_relation_df['drkg_id'])]
test_hrt_df = test_hrt_df.groupby(1).head(3).reset_index(drop=True)
test_unique_entities = get_unique_entities(test_hrt_df, columns=[0,2])
test_entity_df = drkg_entity_df[drkg_entity_df['name'].isin(test_unique_entities)]
print_head(test_hrt_df)
print_head(test_entity_df)


entity_df = test_entity_df.copy() # (for full sample: replace test_entity_df with drkg_entity_df)
hrt_data = test_hrt_df.copy()  # (for full sample: replace test_hrt_df with drkg_translated)
relation_lookup = relation_df.copy() # (for full sample: replace test_relation_df with the updated relation_glossary with relation_name)

+----+------------------------------------+---------------+--------------------------+-----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+---------------+---------------+-------------------------+
|    | drkg_id                            | Data-source   | Connected entity-types   | Interaction-type            | Description                                                                                                                                                                                            | Reference for the description        | head_entity   | tail_entity   | relation_name           |
|----+------------------------------------+---------------+--------------------------+-----------------------------+------------------------------------------------------

### Build HeteroData Object

In [38]:
# Initialize heterograph object
graph_obj = HeteroData()

# Embed entities, add to graph, and save embedding mapping dictionary of dictionaries
mapping_dict = embed_entities(entity_df, graph_obj, Encoder, device) 

# Embed relationships, add to graph, and save relation embeddings/mapping dictionary
relation_X, relation_mapping = embed_edges(hrt_data, relation_lookup, graph_obj, mapping_dict, Encoder, device)

# Print summary
print(graph_obj)
for ent_type in entity_df['entity_type'].unique():
    print(f"Unique {ent_type}s: {len(mapping_dict[ent_type])} \t Matrix shape: {graph_obj[ent_type].x.shape }")
    # print(mapping_dict[ent_type]) # Prints whole dictionary so delete/uncomment if using all entities

HeteroData(
  [1mCompound[0m={ x=[13, 768] },
  [1mDisease[0m={ x=[3, 768] },
  [1mGene[0m={ x=[24, 768] },
  [1m(Compound, activation, Gene)[0m={
    edge_index=[2, 9],
    edge_label=[9, 768]
  },
  [1m(Compound, inhibition, Gene)[0m={
    edge_index=[2, 3],
    edge_label=[3, 768]
  },
  [1m(Compound, Compound treats the disease, Disease)[0m={
    edge_index=[2, 3],
    edge_label=[3, 768]
  },
  [1m(Gene, activates, stimulates, Gene)[0m={
    edge_index=[2, 3],
    edge_label=[3, 768]
  },
  [1m(Gene, inhibition, Gene)[0m={
    edge_index=[2, 3],
    edge_label=[3, 768]
  }
)
Unique Compounds: 13 	 Matrix shape: torch.Size([13, 768])
Unique Diseases: 3 	 Matrix shape: torch.Size([3, 768])
Unique Genes: 24 	 Matrix shape: torch.Size([24, 768])



### Old example: DX RX Relationship ###

In [None]:

# Create embeddings
rx_X,rx_mapping = create_mapping(rx_dx_subset[0].to_list(),encoder= Encoder ,device=device) # Maps drugs to indices
dx_X,dx_mapping = create_mapping(rx_dx_subset[2].to_list(),encoder= Encoder ,device=device) # Maps drugs to indices
## As of now this only encodes 'Compound treats the disease', but  the idea is that this is used to encode every entity 
relationship_X,relationship_mapping = create_mapping(rx_dx_subset[1].to_list(),encoder= Encoder ,device=device)  

print(f"Unique Drugs:   {len(rx_mapping)} Matrix shape: {rx_X.shape}")
print(f"Unique Disases: {len(dx_mapping)} Matrix shape: {dx_X.shape }")
relationship_feature = relationship_X[relationship_mapping['Compound treats the disease'],:].reshape(1,-1)

# Build heterograph object
### EXAMPLE: Create PyG Hetero Graph:
data = HeteroData()
data['compounds'].x = rx_X
data['disease'].x   = dx_X
print(data)

Edge_index,edge_attribute = create_edges(df             =  rx_dx_subset,
                                          src_index_col  = 0, 
                                          src_mapping    = rx_mapping , 
                                          dst_index_col  = 2, 
                                          dst_mapping    = dx_mapping ,
                                          edge_attr      = relationship_feature)

data['compounds', 'Compound treats the disease', 'disease'].edge_index = Edge_index
data['compounds', 'Compound treats the disease', 'disease'].edge_label = edge_attribute 