DRKG

Adapted from: https://github.com/gnn4dr/DRKG/blob/master/drkg_with_dgl/loading_drkg_in_dgl.ipynb

In [1]:
import pandas as pd
import numpy as np
import os 

In [2]:
from SIMP_LLM.DRKG_loading   import  get_triplets, read_tsv,filter_drkg,map_drkg_relationships,filter_interaction_subset,print_head
from SIMP_LLM.DRKG_translate import  load_lookups

  from .autonotebook import tqdm as notebook_tqdm


# 1) Load Data

In [3]:
### 1) Read: load the DRKG triplets and the glossary used to map relation codes to words
DATA_DIR = os.path.join("data")
verbose  = True

# Triplets are (head, relationship, tail)
triplets, drkg_df = get_triplets(
    drkg_file=os.path.join(DATA_DIR, 'drkg.tsv'),
    verbose=verbose,
)

# Relationship mapping
relation_glossary = read_tsv(
    relation_file=os.path.join(DATA_DIR, 'relation_glossary.tsv'),
    verbose=verbose,
)


### 2) Filter & map interactions: yields the interactions to keep, e.g. DRUGBANK::treats::Compound:Disease
# 2.1: Restrict the interactions to Compound-Disease.
# 2.2: Map the codes to text (used to filter further on the text),
#      e.g. Hetionet::CpD::Compound:Disease -> palliation.
# 2.3: Filter on the natural text with terms such as "treat", but return the original interaction names.
# TODO: modularize this in create_dataframe

# 2.1 Compound-Disease interactions only (column 1 holds the relation code)
drkg_rx_dx_relations = filter_drkg(
    data_frame=drkg_df,
    filter_column=1,
    filter_term=r'.*?Compound:Disease',
    verbose=verbose,
)

# 2.2 Codes -> text
drkg_rx_dx_relations_mapped = map_drkg_relationships(
    drkg_rx_dx_relations,
    relation_glossary,
    verbose=verbose,
)

# 2.3 Keep only the treat / inhibit / alleviate interaction types
# ("colunm" below is the spelling used by the helper's keyword arguments)
drkg_rx_dx_relation_subset = filter_interaction_subset(
    df=drkg_rx_dx_relations_mapped,
    filter_colunm_name='Interaction-type',
    regex_string='treat|inhibit|alleviate',
    return_colunm_name='Relation-name',
)

### 3) Use the selected interactions to filter DRKG
# 3.1 Keep the rows whose relation is one of the selected Compound-Disease interactions
drkg_df_filtered = drkg_df.loc[drkg_df[1].isin(drkg_rx_dx_relation_subset)]
print_head(df=drkg_df_filtered)

# 3.2 Filtered DRKG as a list of rows
rx_dx_triplets = drkg_df_filtered.to_numpy().tolist()


 Triplets:

[['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::2157'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::5264'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::2158'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::3309'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::28912'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::811'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::2159'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::821'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::5627'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::5624']]

 data\drkg.tsv  Dataframe:

+----+------------+--------------------------------+-------------+
|    | 0          | 1                              | 2           |
|----+------------+--------------------------------+-------------|
|  0 | Gene::2157 | bioarx::HumGenHumGen:Gene:Gene | Gene::2157  |
|  1 | Gene::2157 | bioarx::HumGenHumGen:Gene:Gene

In [4]:
# 4) Load the lookup tables used for translation
(
    hetionet_df,
    gene_df,
    drugbank_df,
    omim_df,
    mesh_dict,
    chebi_df,
    chembl_df,
) = load_lookups(data_path=DATA_DIR, verbose=verbose)


 data\hetionet-v1.0-nodes.tsv  Dataframe:

+----+-------------------------+---------------------------+---------+
|    | id                      | name                      | kind    |
|----+-------------------------+---------------------------+---------|
|  0 | Anatomy::UBERON:0000002 | uterine cervix            | Anatomy |
|  1 | Anatomy::UBERON:0000004 | nose                      | Anatomy |
|  2 | Anatomy::UBERON:0000006 | islet of Langerhans       | Anatomy |
|  3 | Anatomy::UBERON:0000007 | pituitary gland           | Anatomy |
|  4 | Anatomy::UBERON:0000010 | peripheral nervous system | Anatomy |
+----+-------------------------+---------------------------+---------+

 Sample of Hetionet Data Types (Before processing):

+-------+----------------------------------+-------------------------------------------+---------------------+
|       | id                               | name                                      | kind                |
|-------+--------------------------------

In [5]:
# Lookup dictionaries

# Relation name -> interaction type, taken from the glossary
relation_glossary_relation_dict = pd.Series(
    data=relation_glossary['Interaction-type'].values,
    index=relation_glossary['Relation-name'],
).to_dict()

# Stack every lookup table after renaming its columns to the shared "name" / "id" labels
# NOTE(review): omim_df only has "MIM Number" renamed, so it presumably already has a "name" column - confirm
node_df = pd.concat(
    [
        hetionet_df[['name', 'id']],
        gene_df.rename(columns={"symbol": "name", "GeneID": "id"}),
        drugbank_df.rename(columns={"Common name": "name", "DrugBank ID": "id"}),
        omim_df.rename(columns={"MIM Number": "id"}),
        chebi_df.rename(columns={"NAME": "name", "CHEBI_ACCESSION": "id"}),
        chembl_df.rename(columns={"pref_name": "name", "chembl_id": "id"}),
    ],
    axis=0,
    ignore_index=True,
).drop_duplicates()

# id -> name dictionary, merged with the MeSH dictionary (the right-hand operand of `|` wins on shared keys)
node_dict = pd.Series(data=node_df['name'].values, index=node_df['id']).to_dict() | mesh_dict


# Create and use convert_entitynames function
def convert_entitynames(df, col, node_dict):
    """Return a copy of ``df`` with the entity codes in column ``col`` translated to names.

    The following steps are applied to ``df[col]`` in order:

    1. Every lazy regex match of ``.*?MESH:`` is replaced by ``MESH::``
       (e.g. ``Disease::MESH:D003550`` becomes ``MESH::D003550``).
    2. Values that are keys of ``node_dict`` are replaced by the mapped value;
       values with no mapping (or a null mapping) are kept unchanged.
    3. On the result, the literal ``Gene::`` is rewritten to ``Gene ID `` and
       the literal ``Disease::`` is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to translate. It is not modified.
    col : hashable
        Label of the column to translate; the column must support ``.str``.
    node_dict : dict
        Mapping from entity code to entity name.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which only column ``col`` has been rewritten.
    """
    df_update = df.copy()

    # Normalise MeSH codes to the "MESH::<id>" form before the dictionary lookup
    entities = df_update[col].str.replace(r'.*?MESH:', "MESH::", regex=True)

    # Translate through the dictionary, falling back to the (normalised) code
    entities = entities.map(node_dict).fillna(entities)

    # The two clean-ups below are literal replacements. `regex=False` is spelled
    # out because the default of `Series.str.replace` differs between pandas
    # 1.x (regex) and 2.x (literal).
    entities = entities.str.replace("Gene::", "Gene ID ", regex=False)  # remaining unconverted gene IDs
    entities = entities.str.replace("Disease::", "", regex=False)       # remaining disease labels

    df_update[col] = entities
    return df_update

# Start from the raw DRKG frame and translate it column by column
df_med = drkg_df.copy()

# Column 1: relation code -> interaction type; codes missing from the glossary are kept as they are
df_med[1] = df_med[1].map(relation_glossary_relation_dict).fillna(df_med[1])

# Columns 0 and 2: entity codes -> names
df_med = (
    df_med
    .pipe(convert_entitynames, 0, node_dict)
    .pipe(convert_entitynames, 2, node_dict)
)

print_head(df_med)


+----+---------+-------------+---------------+
|    | 0       | 1           | 2             |
|----+---------+-------------+---------------|
|  0 | F8 gene | interaction | F8 gene       |
|  1 | F8 gene | interaction | PHYH gene     |
|  2 | F8 gene | interaction | F9 gene       |
|  3 | F8 gene | interaction | HSPA5 gene    |
|  4 | F8 gene | interaction | IGKV3-20 gene |
+----+---------+-------------+---------------+


In [6]:
# Natural-language drug-treats-disease rows: reuse the row labels already selected in drkg_df_filtered.
# (Alternative kept for reference: df_med[df_med.iloc[:, 1] == 'Compound treats the disease'])
rx_dx        = df_med.loc[drkg_df_filtered.index, :]
rx_dx_subset = rx_dx.iloc[:10]   # first 10 rows, by position
rx_dx_subset

Unnamed: 0,0,1,2
1518268,Dornase alfa,Compound treats the disease,Cystic Fibrosis
1518269,Denileukin diftitox,Compound treats the disease,MESH::C063419
1518270,Etanercept,Compound treats the disease,"Spondylitis, Ankylosing"
1518271,Etanercept,Compound treats the disease,Graft vs Host Disease
1518272,Etanercept,Compound treats the disease,Hidradenitis Suppurativa
1518273,Etanercept,Compound treats the disease,"Arthritis, Juvenile"
1518274,Etanercept,Compound treats the disease,Psoriasis
1518275,Etanercept,Compound treats the disease,"Arthritis, Psoriatic"
1518276,Etanercept,Compound treats the disease,Pyoderma Gangrenosum
1518277,Etanercept,Compound treats the disease,"Arthritis, Rheumatoid"


In [7]:
# Drop rows whose codes are not relevant or could not be converted
drkg_translated = df_med.copy()

# Taxonomy: rows whose column 2 contains a 'Tax::' code
remove_tax = drkg_translated.loc[drkg_translated[2].str.contains('Tax::')]
drkg_translated = drkg_translated.drop(index=remove_tax.index)

# Untranslated: a '::' still present in column 0 or column 2
remove_untranslated = drkg_translated.loc[
    drkg_translated[0].str.contains('::') | drkg_translated[2].str.contains('::')
]
drkg_translated = drkg_translated.drop(index=remove_untranslated.index)

# Summary of what was removed
print(f'Total number of pairs {drkg_df.shape[0]}')
print(f'Dropped taxonomy pairs {len(remove_tax.index)}')
print(f'Dropped untranslated pairs {len(remove_untranslated.index)}')
drkg_translated

Total number of pairs 5874261
Dropped taxonomy pairs 14663
Dropped untranslated pairs 62779


Unnamed: 0,0,1,2
0,F8 gene,interaction,F8 gene
1,F8 gene,interaction,PHYH gene
2,F8 gene,interaction,F9 gene
3,F8 gene,interaction,HSPA5 gene
4,F8 gene,interaction,IGKV3-20 gene
...,...,...,...
5874256,COMMD9 gene,reaction,DDB2 gene
5874257,PPIL1 gene,reaction,HNRNPC gene
5874258,CBFB gene,catalysis,CDK1 gene
5874259,CES1 gene,binding,UGT2B10 gene


In [8]:
# Collect the distinct codes that were left untranslated.
# np.unique returns the sorted unique values that still contain '::' in column 0 and column 2.
drkg_test1 = np.unique(remove_untranslated[0][remove_untranslated[0].str.contains('::')])
drkg_test2 = np.unique(remove_untranslated[2][remove_untranslated[2].str.contains('::')])

# Despite the name, this list holds every untranslated code type, not only MeSH identifiers.
drkg_mesh_list = drkg_test1.tolist() + drkg_test2.tolist()

# Passing a plain list to pd.unique is deprecated (FutureWarning since pandas 2.1),
# so the list is wrapped in an object-dtype Series first. Order of first appearance is kept.
drkg_mesh_unique = pd.DataFrame(pd.unique(pd.Series(drkg_mesh_list, dtype=object)))
drkg_mesh_unique

Unnamed: 0,0
0,Compound::Bioarxivdrug:0
1,Compound::Bioarxivdrug:1
2,Compound::Bioarxivdrug:10
3,Compound::Bioarxivdrug:11
4,Compound::Bioarxivdrug:2
...,...
14416,MESH::C580539
14417,MESH::C585640
14418,MESH::D000071
14419,MESH::D018290


In [9]:
# Of the untranslated codes, keep only those starting with the 'MESH::' prefix
drkg_untranslated = drkg_mesh_unique.loc[
    drkg_mesh_unique[0].str.startswith('MESH::')
].copy()
print(drkg_untranslated.shape[0])   # number of untranslated MeSH codes
drkg_untranslated

7751


Unnamed: 0,0
1429,MESH::C000020
1430,MESH::C000050
1431,MESH::C000121
1432,MESH::C000154
1433,MESH::C000188
...,...
14416,MESH::C580539
14417,MESH::C585640
14418,MESH::D000071
14419,MESH::D018290


# 3) BioLinkBERT embedding

In [12]:
# Display the 10-row drug-disease subset that the encoding cell works on.
# NOTE(review): this cell's execution count (12) is out of sequence with its neighbours;
# re-run the notebook top to bottom before sharing.
rx_dx_subset

Unnamed: 0,0,1,2
1518268,Dornase alfa,Compound treats the disease,Cystic Fibrosis
1518269,Denileukin diftitox,Compound treats the disease,MESH::C063419
1518270,Etanercept,Compound treats the disease,"Spondylitis, Ankylosing"
1518271,Etanercept,Compound treats the disease,Graft vs Host Disease
1518272,Etanercept,Compound treats the disease,Hidradenitis Suppurativa
1518273,Etanercept,Compound treats the disease,"Arthritis, Juvenile"
1518274,Etanercept,Compound treats the disease,Psoriasis
1518275,Etanercept,Compound treats the disease,"Arthritis, Psoriatic"
1518276,Etanercept,Compound treats the disease,Pyoderma Gangrenosum
1518277,Etanercept,Compound treats the disease,"Arthritis, Rheumatoid"


In [10]:
from torch_geometric.data import HeteroData
from SIMP_LLM.llm_encode import EntityEncoder
from SIMP_LLM.dataloader_mappings import create_mapping,create_edges


## Example: build the entity encoder (on CPU) and encode the subset's entities
device  = "cpu"
Encoder = EntityEncoder(device=device)


### DX RX Relationship ###
# Drugs (column 0): feature matrix + mapping from drug to index
rx_X, rx_mapping = create_mapping(
    rx_dx_subset[0].tolist(),
    encoder=Encoder,
    device=device,
)
# Diseases (column 2): feature matrix + mapping from disease to index
dx_X, dx_mapping = create_mapping(
    rx_dx_subset[2].tolist(),
    encoder=Encoder,
    device=device,
)
# Relationships (column 1): for now this only encodes 'Compound treats the disease',
# but the idea is to use the same call to encode every relationship
relationship_X, relationship_mapping = create_mapping(
    rx_dx_subset[1].tolist(),
    encoder=Encoder,
    device=device,
)

print(f"Unique Drugs:   {len(rx_mapping)} Matrix shape: {rx_X.shape}")
print(f"Unique Disases: {len(dx_mapping)} Matrix shape: {dx_X.shape }")

# Feature row of the single relationship, reshaped to one row
relationship_feature = relationship_X[
    relationship_mapping['Compound treats the disease'], :
].reshape(1, -1)


## TODO: add the other relationships

Unique Drugs:   3 Matrix shape: torch.Size([3, 768])
Unique Disases: 10 Matrix shape: torch.Size([10, 768])


### Build HeteroData Object

In [11]:
### Assemble the PyG heterogeneous graph
data = HeteroData()

# Node features produced by the encoder for compounds and diseases
data['compounds'].x = rx_X
data['disease'].x   = dx_X
print(data)   # printed before any edges are attached

# Edges of the subset: source = column 0 (via rx_mapping), destination = column 2 (via dx_mapping)
Edge_index, edge_attribute = create_edges(
    df=rx_dx_subset,
    src_index_col=0,
    src_mapping=rx_mapping,
    dst_index_col=2,
    dst_mapping=dx_mapping,
    edge_attr=relationship_feature,
)

# NOTE(review): the edge attributes are stored under `edge_label` - confirm this is intended rather than `edge_attr`
data['compounds', 'Compound treats the disease', 'disease'].edge_index = Edge_index
data['compounds', 'Compound treats the disease', 'disease'].edge_label = edge_attribute

HeteroData(
  [1mcompounds[0m={ x=[3, 768] },
  [1mdisease[0m={ x=[10, 768] }
)
