DRKG

Adapted from: https://github.com/gnn4dr/DRKG/blob/master/drkg_with_dgl/loading_drkg_in_dgl.ipynb

In [3]:

import pandas as pd
import numpy as np
import os 

In [4]:
!pip install pandas numpy tabulate chembl_downloader pyarrow



In [83]:
from SIMP_LLM.DRKG_loading   import  get_triplets, read_tsv,filter_drkg,map_drkg_relationships,filter_interaction_subset,print_head
from SIMP_LLM.DRKG_translate import  load_lookups
from SIMP_LLM.DRKG_entity_processing import get_unique_entities, get_entity_lookup, convert_entitynames, flip_headtail

# 1) Load Data

In [6]:
### 1) Read: This section reads DRKG and a glossary (used to map entities from codes to words)
DATA_DIR           = os.path.join("data")
verbose            =  True 
triplets,drkg_df   =  get_triplets(drkg_file = os.path.join(DATA_DIR  ,'drkg.tsv'),             verbose=verbose)  # Read triplets (head,relationship,tail)
relation_glossary  =  read_tsv(relation_file = os.path.join(DATA_DIR  ,'relation_glossary.tsv'),verbose=verbose)  # Read relationship mapping  


### 2) Filter & Map Interactions: This section returns a list of interactions e.g. DRUGBANK::treats::Compound:Disease )
# 2.1: First  we filter the interactions to only Compound-Disease
# 2.2: Then   we map the codes -> text  (this will be use to further filter interactions based on text) e.g.  Hetionet::CpD::Compound:Disease -> palliation
# 2.3: We use natural text to fitler  interactions based on terms such as "treat" (but we return the orignal interaction name )



# modularize this in create_dataframe
drkg_rx_dx_relations        = filter_drkg(data_frame = drkg_df ,  filter_column = 1 ,  filter_term = r'.*?Compound:Disease', verbose = verbose) # 2.1 Filter only Compound-Disease Interactions
drkg_rx_dx_relations_mapped = map_drkg_relationships(drkg_rx_dx_relations,relation_glossary,verbose=verbose)                                    # 2.2 Map codes to text 

### 2.3 Filter Drug interactions Interaction types to only include: treat inhibit or alleviate interactions  ###
drkg_rx_dx_relation_subset =  filter_interaction_subset(df                  = drkg_rx_dx_relations_mapped,
                                                        filter_colunm_name = 'Interaction-type' ,
                                                        regex_string       =  'treat|inhibit|alleviate',
                                                        return_colunm_name =  'Relation-name')

# 3) Use Filter Interactions to get Gilter DRKG 
drkg_df_filtered = drkg_df[drkg_df[1].isin(drkg_rx_dx_relation_subset)] # 3.1 Filter DRKG  to only  Compund-Disease 
print_head(df=drkg_df_filtered)



###

rx_dx_triplets   = drkg_df_filtered.values.tolist()                     # 3.2 Convert filtered DRKG to list


 Triplets:

[['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::2157'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::5264'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::2158'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::3309'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::28912'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::811'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::2159'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::821'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::5627'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::5624']]

 data/drkg.tsv  Dataframe:

+----+------------+--------------------------------+-------------+
|    | 0          | 1                              | 2           |
|----+------------+--------------------------------+-------------|
|  0 | Gene::2157 | bioarx::HumGenHumGen:Gene:Gene | Gene::2157  |
|  1 | Gene::2157 | bioarx::HumGenHumGen:Gene:Gene

In [7]:
# 4) Load Data frames for translation
hetionet_df, gene_df, drugbank_df, omim_df, mesh_dict, chebi_df, chembl_df = load_lookups(data_path=DATA_DIR,verbose=verbose)


 data/hetionet-v1.0-nodes.tsv  Dataframe:

+----+-------------------------+---------------------------+---------+
|    | id                      | name                      | kind    |
|----+-------------------------+---------------------------+---------|
|  0 | Anatomy::UBERON:0000002 | uterine cervix            | Anatomy |
|  1 | Anatomy::UBERON:0000004 | nose                      | Anatomy |
|  2 | Anatomy::UBERON:0000006 | islet of Langerhans       | Anatomy |
|  3 | Anatomy::UBERON:0000007 | pituitary gland           | Anatomy |
|  4 | Anatomy::UBERON:0000010 | peripheral nervous system | Anatomy |
+----+-------------------------+---------------------------+---------+

 Sample of Hetionet Data Types (Before processing):

+-------+----------------------------------+-------------------------------------------+---------------------+
|       | id                               | name                                      | kind                |
|-------+--------------------------------

In [38]:
# Make dictionaries for relations and codes
relation_glossary_relation_dict = pd.Series(relation_glossary['Interaction-type'].values, index=relation_glossary['Relation-name']).to_dict()

code_df   = pd.concat([hetionet_df[['name', 'id']], 
                       gene_df.rename(columns = {"description":"name", "GeneID":"id"}),
                       drugbank_df.rename(columns = {"Common name":"name", "DrugBank ID":"id"}),
                       omim_df.rename(columns = {"MIM Number":"id"}),
                       chebi_df.rename(columns = {"NAME":"name", "CHEBI_ACCESSION":"id"}),
                       chembl_df.rename(columns = {"pref_name":"name", "chembl_id":"id"})
                       ], ignore_index=True, axis=0).drop_duplicates() 
code_dict = pd.Series(code_df['name'].values, index=code_df['id']).to_dict() | mesh_dict # Convert node df to dict and merge with MeSH dictionary

# Get unique DRKG entities
drkg_entities = get_unique_entities(drkg_df, [0,2])

# Create and use convert_entitynames function
drkg_entity_df, drkg_unmatched = get_entity_lookup(drkg_entities, code_dict)

# Create final node dictionary
node_dict = pd.Series(drkg_entity_df['name'].values, index=drkg_entity_df['drkg_id']).to_dict() 

# Map DRKG to translated entity names
drkg_translated    = drkg_df.copy()
drkg_translated = convert_entitynames(drkg_translated, 0, node_dict)
drkg_translated = convert_entitynames(drkg_translated, 2, node_dict)
drkg_translated = drkg_translated.dropna()
print_head(drkg_translated) 

# Summarize percentage translated
print("Number of unique DRKG entities: ", len(drkg_entities)) # should be 97238
print("Number of translated entities: ", drkg_entity_df.shape[0])
print("Number of untranslated entities: ", drkg_unmatched.shape[0])
pct_entity_translated = drkg_entity_df.shape[0]/len(drkg_entities)
print('Percentage of entities translated: ', round(pct_entity_translated*100,1), '%')

print('Total DRKG relationships: ', drkg_df.shape[0])
print('Translated DRKG relationships: ', drkg_translated.shape[0])
pct_translated = drkg_translated.shape[0]/drkg_df.shape[0]
print('Percentage of relationships fully translated: ', round(pct_translated*100,1), '%')

+----+------------------------------+--------------------------------+------------------------------------------------------+
|    | 0                            | 1                              | 2                                                    |
|----+------------------------------+--------------------------------+------------------------------------------------------|
|  0 | coagulation factor VIII (F8) | bioarx::HumGenHumGen:Gene:Gene | coagulation factor VIII (F8)                         |
|  1 | coagulation factor VIII (F8) | bioarx::HumGenHumGen:Gene:Gene | phytanoyl-CoA 2-hydroxylase (PHYH)                   |
|  2 | coagulation factor VIII (F8) | bioarx::HumGenHumGen:Gene:Gene | coagulation factor IX (F9)                           |
|  3 | coagulation factor VIII (F8) | bioarx::HumGenHumGen:Gene:Gene | heat shock protein family A (Hsp70) member 5 (HSPA5) |
|  4 | coagulation factor VIII (F8) | bioarx::HumGenHumGen:Gene:Gene | immunoglobulin kappa variable 3-20 (IGKV3-20)  

In [9]:
# Filter DRKG in natural language to drug-treats-disease relationships
rx_dx        =  drkg_translated[drkg_translated[1].isin(drkg_rx_dx_relation_subset)]
rx_dx_subset =  rx_dx[0:10]
rx_dx_subset

Unnamed: 0,0,1,2
1518268,Dornase alfa,Compound treats the disease,Cystic Fibrosis
1518269,Denileukin diftitox,Compound treats the disease,MESH::C063419
1518270,Etanercept,Compound treats the disease,"Spondylitis, Ankylosing"
1518271,Etanercept,Compound treats the disease,Graft vs Host Disease
1518272,Etanercept,Compound treats the disease,Hidradenitis Suppurativa
1518273,Etanercept,Compound treats the disease,"Arthritis, Juvenile"
1518274,Etanercept,Compound treats the disease,Psoriasis
1518275,Etanercept,Compound treats the disease,"Arthritis, Psoriatic"
1518276,Etanercept,Compound treats the disease,Pyoderma Gangrenosum
1518277,Etanercept,Compound treats the disease,"Arthritis, Rheumatoid"


In [11]:
# Check unique values of untranslated
# drkg_test1 = np.unique(remove_untranslated[0][remove_untranslated[0].str.contains('::')])
# drkg_test2 = np.unique(remove_untranslated[2][remove_untranslated[2].str.contains('::')])

# drkg_mesh_list = drkg_test1.tolist() +  drkg_test2.tolist()
# drkg_mesh_unique = pd.DataFrame(pd.unique(drkg_mesh_list))
# drkg_mesh_unique

Unnamed: 0,0
0,Compound::Bioarxivdrug:0
1,Compound::Bioarxivdrug:1
2,Compound::Bioarxivdrug:10
3,Compound::Bioarxivdrug:11
4,Compound::Bioarxivdrug:2
...,...
14416,MESH::C580539
14417,MESH::C585640
14418,MESH::D000071
14419,MESH::D018290


In [12]:
drkg_untranslated = drkg_unmatched.copy()
drkg_untranslated = drkg_untranslated[drkg_untranslated[0].str.startswith('MESH::')]
print(len(drkg_untranslated))
drkg_untranslated

7751


Unnamed: 0,0
1429,MESH::C000020
1430,MESH::C000050
1431,MESH::C000121
1432,MESH::C000154
1433,MESH::C000188
...,...
14416,MESH::C580539
14417,MESH::C585640
14418,MESH::D000071
14419,MESH::D018290


# 3) BioLinkBERT embedding

In [13]:
rx_dx_subset

Unnamed: 0,0,1,2
1518268,Dornase alfa,Compound treats the disease,Cystic Fibrosis
1518269,Denileukin diftitox,Compound treats the disease,MESH::C063419
1518270,Etanercept,Compound treats the disease,"Spondylitis, Ankylosing"
1518271,Etanercept,Compound treats the disease,Graft vs Host Disease
1518272,Etanercept,Compound treats the disease,Hidradenitis Suppurativa
1518273,Etanercept,Compound treats the disease,"Arthritis, Juvenile"
1518274,Etanercept,Compound treats the disease,Psoriasis
1518275,Etanercept,Compound treats the disease,"Arthritis, Psoriatic"
1518276,Etanercept,Compound treats the disease,Pyoderma Gangrenosum
1518277,Etanercept,Compound treats the disease,"Arthritis, Rheumatoid"


In [14]:
from torch_geometric.data import HeteroData
from SIMP_LLM.llm_encode import EntityEncoder
from SIMP_LLM.dataloader_mappings import create_mapping,create_edges


## Example of loading data without anything to encode
device    = "cpu"
Encoder  = EntityEncoder(device = device )


### DX RX Relationship ###
rx_X,rx_mapping = create_mapping(rx_dx_subset[0].to_list(),encoder= Encoder ,device=device) # Maps drugs to indices
dx_X,dx_mapping = create_mapping(rx_dx_subset[2].to_list(),encoder= Encoder ,device=device) # Maps drugs to indices
## As of now this only encodes 'Compound treats the disease', but  the idea is that this is used to encode every entity 
relationship_X,relationship_mapping = create_mapping(rx_dx_subset[1].to_list(),encoder= Encoder ,device=device)  

print(f"Unique Drugs:   {len(rx_mapping)} Matrix shape: {rx_X.shape}")
print(f"Unique Disases: {len(dx_mapping)} Matrix shape: {dx_X.shape }")
relationship_feature = relationship_X[relationship_mapping['Compound treats the disease'],:].reshape(1,-1)


## NEED To add other reationships

Unique Drugs:   3 Matrix shape: torch.Size([3, 768])
Unique Disases: 10 Matrix shape: torch.Size([10, 768])


### Build HeteroData Object

In [15]:
### Create PyG Hetero Graph:
data = HeteroData()
data['compounds'].x = rx_X
data['disease'].x   = dx_X
print(data)

Edge_index,edge_attribute = create_edges(df             =  rx_dx_subset,
                                          src_index_col  = 0, 
                                          src_mapping    = rx_mapping , 
                                          dst_index_col  = 2, 
                                          dst_mapping    = dx_mapping ,
                                          edge_attr      = relationship_feature)

data['compounds', 'Compound treats the disease', 'disease'].edge_index = Edge_index
data['compounds', 'Compound treats the disease', 'disease'].edge_label = edge_attribute 

HeteroData(
  [1mcompounds[0m={ x=[3, 768] },
  [1mdisease[0m={ x=[10, 768] }
)


## TEST/DEVELOPER - code to create entity lookup table

In [16]:
# Create table of unique DRKG entities
# def get_unique_entities(df:pd.core.frame.DataFrame, columns):
#   '''Append all unique entries in specified list of columns in dataframe and get unique entities
#   '''
#   entity_list = []
#   for col in columns:
#     entity_list = np.append(entity_list, df[col])
#   entity_list = np.unique(entity_list)
#   return entity_list

drkg_entities = get_unique_entities(drkg_df, [0,2])

print(len(drkg_entities)) # should be 97238


97238


In [17]:
# Create lookup table for unique DRKG entities
# def get_entity_lookup(drkg_entities, node_dict):
#     '''Converts list of unique DRKG entities to entity table with the following items, using the lookup table dictionary node_dict:
#         'drkg_id':       original entity code in DRKG
#         'drkg_dict_id':  original entity code, except with entity name in MeSH entity codes removed to match with MeSH lookup format
#         'name':          natural language entity name, translated using node_dict dictionary
#         'entity_type':   type of entity (gene, disease, compound, etc.), taken from drkg_id
#         'ontology_code': combined ontology name and code, taken by removing entity_type from drkg_id
#         'ontology_name': name of ontology from which code was sourced, if available
#         'code':          specific code or ID from the ontology

#     Also does the following cleaning:
#     * Manual processing of entity and ontology names where the name or source was inferred from the code/ID
#     * Remove irrelevant entries (taxonomy and entries with only an entity type but no associated code such as "Gene::")
#     * Remove entities with no name and return them as a separate dataframe
#     '''
#     drkg_entity_df = pd.DataFrame(drkg_entities, columns=['drkg_id'])

#     # Create copy of DRKG ID value that simplifies MeSH codes
#     drkg_entity_df['drkg_dict_id'] = drkg_entity_df['drkg_id'].str.replace(r'.*?MESH:', "MESH::", regex=True)

#     # Map entity natural language name
#     drkg_entity_df['name'] = drkg_entity_df['drkg_dict_id'].map(node_dict)

#     # Get ontology name and code if available
#     drkg_entity_df[['entity_type', 'ontology_code']] = drkg_entity_df['drkg_id'].str.split("::", expand=True)
#     drkg_entity_df['ontology_name'] = drkg_entity_df['ontology_code'].str.split(":", n=2, expand=True)[0]
#     drkg_entity_df['code'] = drkg_entity_df['ontology_code'].str.split(":", n=2, expand=True)[1]

#     ###### Cleaning ######
#     # Move codes without ontology names to correct column
#     drkg_entity_df.loc[drkg_entity_df['ontology_name'] == drkg_entity_df['ontology_code'], 'ontology_name'] = None
#     drkg_entity_df.loc[drkg_entity_df['code'].isna(), 'code'] = drkg_entity_df['ontology_code']

#     # Add name for entries with SARS-CoV code
#     drkg_entity_df.loc[drkg_entity_df['code'].str.startswith('SARS-CoV2'), 'name'] = drkg_entity_df['code']

#     # Manually correct specific ontology names without ':' as ontology-code divider
#     drkg_entity_df.loc[drkg_entity_df['ontology_code'].str.startswith('CHEMBL'), 'ontology_name'] = 'CHEMBL'
#     drkg_entity_df.loc[drkg_entity_df['entity_type'] == 'Atc', 'ontology_name'] = 'Atc'
#     drkg_entity_df.loc[(drkg_entity_df['entity_type'] == 'Compound') & (drkg_entity_df['ontology_code'].str.startswith('DB')), 'ontology_name'] = 'drugbank'
#     drkg_entity_df.loc[(drkg_entity_df['entity_type'] == 'Side Effect') & (drkg_entity_df['ontology_code'].str.len() == 8), 'ontology_name'] = 'UMLS CUI'
#     drkg_entity_df.loc[(drkg_entity_df['entity_type'] == 'Symptom') & (drkg_entity_df['ontology_code'].str.len() == 7), 'ontology_name'] = 'MESH'

#     # Remove entities that are irrelevant or without name (save for downstream analysis)
#     drkg_unmatched = drkg_entity_df[(drkg_entity_df['name'].isna()) | 
#                                     (drkg_entity_df['entity_type'] == 'Tax') |
#                                     (drkg_entity_df['ontology_code'].isna())] # ontology_code filter is redundant to name filter, but keeping in case we need this subset later
#     drkg_entity_df = drkg_entity_df[~drkg_entity_df.index.isin(drkg_unmatched.index)]

#     return drkg_entity_df, drkg_unmatched 

drkg_entity_df, drkg_unmatched = get_entity_lookup(drkg_entities, node_dict)
print(drkg_entity_df.shape[0], drkg_unmatched.shape[0])
# 64033 entities were translated, 33205 untranslated


64033 33205


In [34]:
# Translate entities
# def convert_entitynames(df, col, node_dict):
#   """Convert entity codes to names in specified column based on dictionary"""
#   df_update = df.copy()
#   df_update[col] = df_update[col].map(node_dict)    # Translate dictionary, dont replace NAs
#   return df_update

node_dict2 = pd.Series(drkg_entity_df['name'].values, index=drkg_entity_df['drkg_id']).to_dict() 

# Create and use convert_entitynames function
# def convert_entitynames2(df, col, node_dict):
#   df_update = df.copy()
#   df_update[col] = df_update[col].map(node_dict)    # Translate dictionary, dont replace NAs
#   return df_update


df_med2    = drkg_df.copy()
df_med2 = convert_entitynames(df_med2, 0, node_dict2)
df_med2 = convert_entitynames(df_med2, 2, node_dict2)
df_med_translate = df_med2.dropna()

In [33]:
drkg_entity_df[drkg_entity_df['drkg_id'] == 'Compound::molport:MolPort-046-762-962']


Unnamed: 0,drkg_id,drkg_dict_id,name,entity_type,ontology_code,ontology_name,code


In [35]:
# Check for NAs
df_med2[df_med2[0].isna() |df_med2[2].isna()]
# df_med_translate[df_med_translate[0].isna() |df_med_translate[2].isna()]

# df_med_translate[df_med_translate[0].str.contains("::")]
# node_dict2['Compound::molport:MolPort-046-762-962']



Unnamed: 0,0,1,2
724,calcium/calmodulin dependent serine protein ki...,bioarx::HumGenHumGen:Gene:Gene,
879,heat shock protein family A (Hsp70) member 8 (...,bioarx::HumGenHumGen:Gene:Gene,
1032,glucagon like peptide 1 receptor (GLP1R),bioarx::HumGenHumGen:Gene:Gene,
3183,casein kinase 2 beta (CSNK2B),bioarx::HumGenHumGen:Gene:Gene,
3582,growth factor receptor bound protein 2 (GRB2),bioarx::HumGenHumGen:Gene:Gene,
...,...,...,...
4312604,synaptotagmin 5 (SYT5),INTACT::ASSOCIATION::Gene:Gene,
4321477,ezrin (EZR),INTACT::ASSOCIATION::Gene:Gene,
4357923,IQ motif containing GTPase activating protein ...,INTACT::ASSOCIATION::Gene:Gene,
4365678,PDZ and LIM domain 7 (PDLIM7),INTACT::ASSOCIATION::Gene:Gene,


In [36]:
print(df_med_translate.shape[0])
print(drkg_df.shape[0])

5705613
5874261


In [37]:
# Summarize percentage translated
pct_translated = df_med_translate.shape[0]/drkg_df.shape[0]
print('Percentage of relationships fully translated: ', round(pct_translated*100,1), '%')

pct_translated_notax = df_med_translate.shape[0]/drkg_df[(drkg_df[2].str.startswith('Tax') == False) & (drkg_df[0] != 'Gene::')].shape[0]
print('Percentage of relationships fully translated (excluding taxonomy and missing gene code): ', round(pct_translated_notax*100,1), '%')

Percentage of relationships fully translated:  97.1 %
Percentage of relationships fully translated (excluding taxonomy and missing gene code):  97.4 %


In [52]:
# Look at entity types in dataframe
drkg_entity_df.drop_duplicates(subset=['entity_type', 'ontology_name'])


Unnamed: 0,drkg_id,drkg_dict_id,name,entity_type,ontology_code,ontology_name,code
0,Anatomy::UBERON:0000002,Anatomy::UBERON:0000002,uterine cervix,Anatomy,UBERON:0000002,UBERON,0000002
4448,Biological Process::GO:0000002,Biological Process::GO:0000002,mitochondrial genome maintenance,Biological Process,GO:0000002,GO,0000002
15829,Cellular Component::GO:0000015,Cellular Component::GO:0000015,phosphopyruvate hydratase complex,Cellular Component,GO:0000015,GO,0000015
17232,Compound::CHEBI:10057,Compound::CHEBI:10057,9H-xanthene,Compound,CHEBI:10057,CHEBI,10057
18295,Compound::CHEMBL10,Compound::CHEMBL10,SB-203580,Compound,CHEMBL10,CHEMBL,CHEMBL10
21960,Compound::DB00001,Compound::DB00001,Lepirudin,Compound,DB00001,drugbank,DB00001
38906,Compound::MESH:D000001,MESH::D000001,Calcimycin,Compound,MESH:D000001,MESH,D000001
41533,Disease::DOID:0050156,Disease::DOID:0050156,idiopathic pulmonary fibrosis,Disease,DOID:0050156,DOID,0050156
43010,Disease::MESH:D000007,MESH::D000007,Abdominal Injuries,Disease,MESH:D000007,MESH,D000007
46531,Disease::OMIM:102510,Disease::OMIM:102510,ACROPECTOROVERTEBRAL DYSPLASIA,Disease,OMIM:102510,OMIM,102510


In [72]:
# Counts for matched entity types
drkg_entity_df.groupby(by=['entity_type', 'ontology_name'], dropna=False).agg(
    count = ('code', 'count')#.sort_values(by='count')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
entity_type,ontology_name,Unnamed: 2_level_1
Anatomy,UBERON,400
Biological Process,GO,11381
Cellular Component,GO,1391
Compound,CHEBI,1060
Compound,CHEMBL,2463
Compound,MESH,1224
Compound,drugbank,10546
Disease,DOID,127
Disease,MESH,3518
Disease,OMIM,78


In [54]:
# Comparison of new and old gene IDs - old version had slightly more matches
gene_df_raw       =  read_tsv( os.path.join(DATA_DIR,'symbols-human.tsv'),verbose=verbose)     # Read and process Gene IDs
df_updated = gene_df_raw[gene_df_raw['type']=='symbol'].drop_duplicates(subset='GeneID').copy()
df_updated['symbol'] = df_updated['symbol'].astype(str) + ' gene'
df_updated['GeneID'] = "Gene::" + df_updated['GeneID'].astype(str)
df_updated = df_updated.drop(columns=['type'])

gene_df_updated = df_updated.merge(gene_df, how='outer', on='GeneID')
print(len(gene_df_updated[gene_df_updated['symbol'].isna()]))
print(len(gene_df_updated[gene_df_updated['description'].isna()]))


 data/symbols-human.tsv  Dataframe:

+----+----------+---------+----------+
|    |   GeneID | type    | symbol   |
|----+----------+---------+----------|
|  0 |        1 | symbol  | A1BG     |
|  1 |        1 | synonym | A1B      |
|  2 |        1 | synonym | ABG      |
|  3 |        1 | synonym | GAB      |
|  4 |        1 | synonym | HYST2477 |
+----+----------+---------+----------+
110993
7800


In [73]:
# Look for entity types without ontology
drkg_entity_df[drkg_entity_df['ontology_name'].isna()].drop_duplicates(subset=['entity_type'])

# counts for unmatched entity types 
drkg_unmatched.groupby(by=['entity_type', 'ontology_name'], dropna=False).agg(
    count = ('code', 'count')#.sort_values(by='count')
)
# drop_duplicates(subset=['entity_type', 'ontology_name'])

# Entity types where drkg id and drkg_dict_id were different (should only be MESH terms)
# drkg_entity_df[drkg_entity_df['drkg_id'] != drkg_entity_df['drkg_dict_id']] 

Unnamed: 0_level_0,Unnamed: 1_level_0,count
entity_type,ontology_name,Unnamed: 2_level_1
Atc,Atc,4048
Compound,Bioarxivdrug,12
Compound,CHEBI,3
Compound,CHEMBL,1202
Compound,MESH,6398
Compound,bindingdb,144
Compound,brenda,731
Compound,chebi,25
Compound,drugbank,5
Compound,drugcentral,18


In [115]:
# Look for entity types without ontology
drkg_entity_df[drkg_entity_df['ontology_code'].str.contains(":")==False].drop_duplicates(subset='entity_type')

# Look for unmatched entity types
drkg_unmatched.drop_duplicates(subset=['entity_type','ontology_name'])


# 

Unnamed: 0,drkg_id,drkg_dict_id,name,entity_type,ontology_code,ontology_name,code
400,Atc::A,Atc::A,,Atc,A,Atc,A
17220,Compound::Bioarxivdrug:0,Compound::Bioarxivdrug:0,,Compound,Bioarxivdrug:0,Bioarxivdrug,0
17414,Compound::CHEBI:24438,Compound::CHEBI:24438,,Compound,CHEBI:24438,CHEBI,24438
18296,Compound::CHEMBL10009,Compound::CHEMBL10009,,Compound,CHEMBL10009,CHEMBL,CHEMBL10009
24209,Compound::DB02507,Compound::DB02507,,Compound,DB02507,drugbank,DB02507
32511,Compound::MESH:C000020,MESH::C000020,,Compound,MESH:C000020,MESH,C000020
40133,Compound::bindingdb:11428,Compound::bindingdb:11428,,Compound,bindingdb:11428,bindingdb,11428
40277,Compound::brenda:10041,Compound::brenda:10041,,Compound,brenda:10041,brenda,10041
41008,Compound::chebi:131517,Compound::chebi:131517,,Compound,chebi:131517,chebi,131517
41033,Compound::drugcentral:1022,Compound::drugcentral:1022,,Compound,drugcentral:1022,drugcentral,1022


#### check for relationship types where entities flip 

In [53]:
# Create entity type dictionary and map translated words to
entitytype_dict = pd.Series(drkg_entity_df['entity_type'].values, index=drkg_entity_df['drkg_id']).to_dict() 
entityconnection_dict = pd.Series(relation_glossary['Connected entity-types'].values, index=relation_glossary['Relation-name']).to_dict() 

drkg_test    = drkg_df.copy()
drkg_test = convert_entitynames(drkg_test, 0, entitytype_dict)
drkg_test = convert_entitynames(drkg_test, 2, entitytype_dict)
drkg_test = convert_entitynames(drkg_test, 1, entityconnection_dict)
print(len(drkg_test))
print(len(drkg_test.dropna()))

drkg_test = drkg_test.dropna()
drkg_test.drop_duplicates().groupby(1).agg(
    ent_ct = (0, 'count')
)

# Only Compound:Gene and Disease:Gene have >1 pairing

5874261
5705613


Unnamed: 0_level_0,ent_ct
1,Unnamed: 1_level_1
Anatomy:Disease,1
Anatomy:Gene,1
Biological Process:Gene,1
Cellular Component:Gene,1
Compound:Compound,1
Compound:Disease,1
Compound:Gene,2
Compound:Pharmacologic Class,1
Compound:Side Effect,1
Diisease:Disease,1


In [57]:
# count number of head entity types (maybe the more prevalent order should be prioritized)
ent_check_list = ['Compound:Gene','Disease:Gene']
drkg_test[drkg_test[1].isin(ent_check_list)].groupby([1,0]).agg(
    ent_ct = (0, 'count')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,ent_ct
1,0,Unnamed: 2_level_1
Compound:Gene,Compound,126712
Compound:Gene,Gene,23870
Disease:Gene,Disease,27389
Disease:Gene,Gene,61341


In [61]:
drkg_entity_df

Unnamed: 0,drkg_id,drkg_dict_id,name,entity_type,ontology_code,ontology_name,code
0,Anatomy::UBERON:0000002,Anatomy::UBERON:0000002,uterine cervix,Anatomy,UBERON:0000002,UBERON,0000002
1,Anatomy::UBERON:0000004,Anatomy::UBERON:0000004,nose,Anatomy,UBERON:0000004,UBERON,0000004
2,Anatomy::UBERON:0000006,Anatomy::UBERON:0000006,islet of Langerhans,Anatomy,UBERON:0000006,UBERON,0000006
3,Anatomy::UBERON:0000007,Anatomy::UBERON:0000007,pituitary gland,Anatomy,UBERON:0000007,UBERON,0000007
4,Anatomy::UBERON:0000010,Anatomy::UBERON:0000010,peripheral nervous system,Anatomy,UBERON:0000010,UBERON,0000010
...,...,...,...,...,...,...,...
97018,Symptom::D063766,Symptom::D063766,Pediatric Obesity,Symptom,D063766,MESH,D063766
97019,Symptom::D063806,Symptom::D063806,Myalgia,Symptom,D063806,MESH,D063806
97020,Symptom::D064250,Symptom::D064250,Hypertriglyceridemic Waist,Symptom,D064250,MESH,D064250
97021,Symptom::D065634,Symptom::D065634,Cerebrospinal Fluid Leak,Symptom,D065634,MESH,D065634


In [71]:
# Look at these specific entries in DRKG
rel_check = relation_glossary[relation_glossary['Connected entity-types'].isin(ent_check_list)]

hrt_check = drkg_df[drkg_df[1].isin(rel_check['Relation-name'])].copy()
hrt_check = hrt_check.merge(rel_check.drop(columns=['Reference for the description']), how='left', left_on=1, right_on='Relation-name').drop(columns=['Relation-name'])
hrt_check = hrt_check.merge(drkg_entity_df[['drkg_id', 'entity_type']], how='inner', left_on=0, right_on='drkg_id').rename(columns={'drkg_id':'head_drkg_id', 'entity_type':'head_entity_type'})
hrt_check = hrt_check.merge(drkg_entity_df[['drkg_id', 'entity_type']], how='inner', left_on=2, right_on='drkg_id').rename(columns={'drkg_id':'tail_drkg_id', 'entity_type':'tail_entity_type'})
hrt_check = convert_entitynames(hrt_check, 0, node_dict)
hrt_check = convert_entitynames(hrt_check, 2, node_dict)

hrt_check.drop(columns=['head_drkg_id','tail_drkg_id']).drop_duplicates(subset=['Data-source', 1, 'Connected entity-types', 'head_entity_type', 'tail_entity_type']).sort_values(by=['Connected entity-types','Data-source'])
# Conclusion: switch the DGIDB gene:compound head/tails so it becomes compound -> gene; but keep the disease:gene relations the same, since they are original from hetionet (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5640425/)


Unnamed: 0,0,1,2,Data-source,Connected entity-types,Interaction-type,Description,head_entity_type,tail_entity_type
0,"2'-deoxycytidine-2'-deoxyadenosine-3',5'-monop...",bioarx::DrugHumGen:Compound:Gene,"ribonuclease A family member 1, pancreatic (RN...",BIBLIOGRAPHY,Compound:Gene,interaction,,Compound,Gene
141080,cyclin dependent kinase 7 (CDK7),DGIDB::INHIBITOR::Gene:Compound,SNS-032,DGIDB,Compound:Gene,inhibition,"In inhibitor interactions, the drug binds to a...",Gene,Compound
141081,cyclin dependent kinase 7 (CDK7),DGIDB::OTHER::Gene:Compound,SNS-032,DGIDB,Compound:Gene,other,This is a label given by the reporting source ...,Gene,Compound
141159,gamma-aminobutyric acid type A receptor subuni...,DGIDB::POSITIVE ALLOSTERIC MODULATOR::Gene:Com...,Pentobarbital,DGIDB,Compound:Gene,positive allosteric modulation,In a positive allosteric modulator interaction...,Gene,Compound
141165,gamma-aminobutyric acid type A receptor subuni...,DGIDB::ANTAGONIST::Gene:Compound,Pentobarbital,DGIDB,Compound:Gene,antagonism,An antagonist interaction occurs when a drug b...,Gene,Compound
149736,adenosine A2a receptor (ADORA2A),DGIDB::AGONIST::Gene:Compound,2-Naphthalenesulfonic acid,DGIDB,Compound:Gene,agonism,An agonist interaction occurs when a drug bind...,Gene,Compound
149807,ryanodine receptor 3 (RYR3),DGIDB::ACTIVATOR::Gene:Compound,Caffeine,DGIDB,Compound:Gene,activation,An activator interaction is when a drug activa...,Gene,Compound
149880,transient receptor potential cation channel su...,DGIDB::CHANNEL BLOCKER::Gene:Compound,Adenosine,DGIDB,Compound:Gene,channel blocking,,Gene,Compound
149929,tumor necrosis factor (TNF),DGIDB::ANTIBODY::Gene:Compound,Pentoxifylline,DGIDB,Compound:Gene,antibody,An antibody interaction occurs when an antibod...,Gene,Compound
150689,G protein-coupled receptor 68 (GPR68),DGIDB::ALLOSTERIC MODULATOR::Gene:Compound,Lorazepam,DGIDB,Compound:Gene,allosteric modulation,An allosteric modulator interaction occurs whe...,Gene,Compound


In [78]:
# Flip entries

drkg_test   = drkg_df.copy()

def flip_headtail(df, search_string):
    df_update = df.copy()
    heads = df_update[0].copy()
    df_update.loc[df_update[1].str.contains(search_string), 0] = df_update[2]
    df_update.loc[df_update[1].str.contains(search_string), 2] = heads
    return df_update

drkg_test2 = flip_headtail(drkg_test, 'Gene:Compound')
drkg_test2[drkg_test[0] != drkg_test2[0]]

Unnamed: 0,0,1,2
84756,Compound::DB05969,DGIDB::INHIBITOR::Gene:Compound,Gene::1022
84757,Compound::CHEMBL72862,DGIDB::ANTAGONIST::Gene:Compound,Gene::135
84758,Compound::DB00635,DGIDB::OTHER::Gene:Compound,Gene::348
84759,Compound::DB12007,DGIDB::OTHER::Gene:Compound,Gene::1571
84760,Compound::DB12874,DGIDB::INHIBITOR::Gene:Compound,Gene::5159
...,...,...,...
111041,Compound::CHEMBL1770297,DGIDB::ANTAGONIST::Gene:Compound,Gene::2357
111042,Compound::DB00396,DGIDB::OTHER::Gene:Compound,Gene::1645
111043,Compound::DB06649,DGIDB::BLOCKER::Gene:Compound,Gene::11280
111044,Compound::DB11886,DGIDB::INHIBITOR::Gene:Compound,Gene::5289


## Code for graph

In [62]:
# repeat import statements from above
# !pip install torch
# !pip install torch_geometric
# from torch_geometric.data import HeteroData
!pip install transformers
from torch_geometric.data import HeteroData
from SIMP_LLM.llm_encode import EntityEncoder
from SIMP_LLM.dataloader_mappings import create_mapping,create_edges

Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Using cached huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
Collecting pyyaml>=5.1 (from transformers)
  Using cached PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl (188 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2023.5.5-cp311-cp311-macosx_10_9_x86_64.whl (294 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Using cached tokenizers-0.13.3-cp311-cp311-macosx_10_11_universal2.whl (4.0 MB)
Collecting fsspec (from huggingface-hub<1.0,>=0.14.1->transformers)
  Using cached fsspec-2023.5.0-py3-none-any.whl (160 kB)
Installing collected packages: tokenizers, regex, pyyaml, fsspec, huggingface-hub, transformers
Successfully installed fsspec-2023.5.0 huggingface-h

In [67]:
# TEST entity lookup
## Example of loading data without anything to encode
device    = "cpu"
Encoder  = EntityEncoder(device = device )

data_test = HeteroData()

def TEST_embed_entities(entity_df, graph_obj, Encoder, device):
    '''Embeds entities, inputs embeddings into Heterograph object, and returns mapping dictionary by entity type'''
    '''TEST: Only first 3 entities of each type'''
    entity_lookup = entity_df.copy()
    mapping_dict = {}
    for entity in entity_lookup['entity_type'].unique():
        entity_names = entity_lookup.loc[entity_lookup['entity_type'] == entity, 'name'][:3]
        entity_X, entity_mapping = create_mapping(entity_names, encoder=Encoder, device=device) # Maps entities to indices
        graph_obj[entity].x = entity_X
        mapping_dict[entity] = entity_mapping
    return mapping_dict

mapping_dict = TEST_embed_entities(drkg_entity_df, data_test, Encoder, device)
print(data_test)
print(mapping_dict['Gene'])
print(mapping_dict['Disease'])

HeteroData(
  [1mAnatomy[0m={ x=[3, 768] },
  [1mBiological Process[0m={ x=[3, 768] },
  [1mCellular Component[0m={ x=[3, 768] },
  [1mCompound[0m={ x=[3, 768] },
  [1mDisease[0m={ x=[3, 768] },
  [1mGene[0m={ x=[3, 768] },
  [1mMolecular Function[0m={ x=[3, 768] },
  [1mPathway[0m={ x=[3, 768] },
  [1mPharmacologic Class[0m={ x=[3, 768] },
  [1mSide Effect[0m={ x=[3, 768] },
  [1mSymptom[0m={ x=[3, 768] }
)
{'adenosine deaminase (ADA)': 0, 'alpha-1-B glycoprotein (A1BG)': 1, 'N-acetyltransferase 2 (NAT2)': 2}
{'idiopathic pulmonary fibrosis': 0, 'alcohol dependence': 1, 'restless legs syndrome': 2}


In [None]:
# SP 5/15/23 Functions to embed entities and append new edge types to graph

# For DRKG translation: have dictionary 

# Entity lookup table with DRKG code, English language translation, and entity type
# def embed_entities(entity_df, graph_obj, Encoder, device):
#     '''Embeds entities, inputs embeddings directly into Heterograph object, and returns mapping dictionary (which is a dictionary of dictionaries) by entity type'''
#     entity_lookup = entity_df.copy()
#     mapping_dict = {}
#     for entity in entity_lookup['entity_type'].unique():                                        # For each entity type
#         entity_names = entity_lookup.loc[entity_lookup['entity_type'] == entity, 'name']        # Get entity names associated with entity type
#         entity_X, entity_mapping = create_mapping(entity_names, encoder=Encoder, device=device) # Maps entities to indices
#         graph_obj[entity].x = entity_X                                                          # Assign entity type embeddings to graph object
#         mapping_dict[entity] = entity_mapping                                                   # Add entity type mapping to overall mapping dictionary
#     return mapping_dict


# def embed_edges(hrt_data, head_col, relation_col, tail_col, relation_lookup, graph_obj):
#     for relation_name in relation_lookup['relation_name'].unique():                                         # For each relation type
#         # Get relation codes associated with relation type and filter knowledge graph to associated relation codes
#         relation_subset = relation_lookup[relation_lookup['relation_name'] == relation_name]  
#         hrt_subset = hrt_data[hrt_data[relation_col].isin(relation_subset['drkg_id'])]   

#         # Get head and tail entity types from data, check that there is only one type for each
#         head_entity = relation_subset['head_entity']
#         tail_entity = relation_subset['tail_entity']

#         # Create edge attributes for graph
#         Edge_index,edge_attribute = create_edges(df            = hrt_subset,
#                                                 src_index_col  = head_col, 
#                                                 src_mapping    = mapping_dict[head_entity] , 
#                                                 dst_index_col  = tail_col, 
#                                                 dst_mapping    = mapping_dict[tail_entity] ,
#                                                 edge_attr      = relationship_feature)

#         graph_obj[head_entity, relation_name, tail_entity].edge_index = Edge_index
#         graph_obj[head_entity, relation_name, tail_entity].edge_label = edge_attribute 
#     return


hrt_data = df_med_translate.copy()
relation_lookup = relation_glossary.copy()

def embed_edges(hrt_data, relation_lookup, graph_obj, mapping_dict, encoder, device):
    '''
    Given dataframe with columns for head-relationship-tail (h,r,t) in that order, create edges in Heterograph object by relationship type.
    Assumes entity types are already embedded in graph.
    MAY NEED MORE WORK
    '''
    # Create mapping for relations
    relation_name_list = relation_lookup['relation_name'].unique()
    relation_X, relation_mapping = create_mapping(relation_name_list,encoder=encoder,device=device)  

    for relation_name in relation_name_list:
        # Get relation codes associated with relation type and filter knowledge graph to associated relation codes
        relation_subset = relation_lookup[relation_lookup['relation_name'] == relation_name]  
        hrt_subset = hrt_data[hrt_data[1].isin(relation_subset['drkg_id'])]   

        relation_feature = relation_X[relation_mapping[relation_name],:].reshape(1,-1)

        # Get head and tail entity types from data, check that there is only one type for each
        head_entity = relation_subset['head_entity']
        tail_entity = relation_subset['tail_entity']

        # Create edge attributes for graph
        Edge_index,edge_attribute = create_edges(df            = hrt_subset,
                                                src_index_col  = 0, 
                                                src_mapping    = mapping_dict[head_entity] , 
                                                dst_index_col  = 2, 
                                                dst_mapping    = mapping_dict[tail_entity] ,
                                                edge_attr      = relation_feature)

        graph_obj[head_entity, relation_name, tail_entity].edge_index = Edge_index
        graph_obj[head_entity, relation_name, tail_entity].edge_label = edge_attribute 
    return relation_X, relation_mapping