DRKG

Adapted from: https://github.com/gnn4dr/DRKG/blob/master/drkg_with_dgl/loading_drkg_in_dgl.ipynb

In [1]:

import pandas as pd
import numpy as np
import os 

In [2]:
!pip install pandas numpy tabulate chembl_downloader pyarrow



In [3]:
from SIMP_LLM.DRKG_loading   import  get_triplets, read_tsv,filter_drkg,map_drkg_relationships,filter_interaction_subset,print_head
from SIMP_LLM.DRKG_translate import  load_lookups

  from .autonotebook import tqdm as notebook_tqdm


# 1) Load Data

In [4]:
### 1) Read: This section reads DRKG and a glossary (used to map entities from codes to words)
DATA_DIR           = os.path.join("data")
verbose            =  True 
triplets,drkg_df   =  get_triplets(drkg_file = os.path.join(DATA_DIR  ,'drkg.tsv'),             verbose=verbose)  # Read triplets (head,relationship,tail)
relation_glossary  =  read_tsv(relation_file = os.path.join(DATA_DIR  ,'relation_glossary.tsv'),verbose=verbose)  # Read relationship mapping  


### 2) Filter & Map Interactions: This section returns a list of interactions e.g. DRUGBANK::treats::Compound:Disease )
# 2.1: First  we filter the interactions to only Compound-Disease
# 2.2: Then   we map the codes -> text  (this will be use to further filter interactions based on text) e.g.  Hetionet::CpD::Compound:Disease -> palliation
# 2.3: We use natural text to fitler  interactions based on terms such as "treat" (but we return the orignal interaction name )



# modularize this in create_dataframe
drkg_rx_dx_relations        = filter_drkg(data_frame = drkg_df ,  filter_column = 1 ,  filter_term = r'.*?Compound:Disease', verbose = verbose) # 2.1 Filter only Compound-Disease Interactions
drkg_rx_dx_relations_mapped = map_drkg_relationships(drkg_rx_dx_relations,relation_glossary,verbose=verbose)                                    # 2.2 Map codes to text 

### 2.3 Filter Drug interactions Interaction types to only include: treat inhibit or alleviate interactions  ###
drkg_rx_dx_relation_subset =  filter_interaction_subset(df                  = drkg_rx_dx_relations_mapped,
                                                        filter_colunm_name = 'Interaction-type' ,
                                                        regex_string       =  'treat|inhibit|alleviate',
                                                        return_colunm_name =  'Relation-name')

# 3) Use Filter Interactions to get Gilter DRKG 
drkg_df_filtered = drkg_df[drkg_df[1].isin(drkg_rx_dx_relation_subset)] # 3.1 Filter DRKG  to only  Compund-Disease 
print_head(df=drkg_df_filtered)



###

rx_dx_triplets   = drkg_df_filtered.values.tolist()                     # 3.2 Convert filtered DRKG to list


 Triplets:

[['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::2157'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::5264'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::2158'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::3309'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::28912'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::811'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::2159'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::821'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::5627'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::5624']]

 data/drkg.tsv  Dataframe:

+----+------------+--------------------------------+-------------+
|    | 0          | 1                              | 2           |
|----+------------+--------------------------------+-------------|
|  0 | Gene::2157 | bioarx::HumGenHumGen:Gene:Gene | Gene::2157  |
|  1 | Gene::2157 | bioarx::HumGenHumGen:Gene:Gene

In [5]:
# 4) Load Data frames for translation
hetionet_df, gene_df, drugbank_df, omim_df, mesh_dict, chebi_df, chembl_df = load_lookups(data_path=DATA_DIR,verbose=verbose)


 data/hetionet-v1.0-nodes.tsv  Dataframe:

+----+-------------------------+---------------------------+---------+
|    | id                      | name                      | kind    |
|----+-------------------------+---------------------------+---------|
|  0 | Anatomy::UBERON:0000002 | uterine cervix            | Anatomy |
|  1 | Anatomy::UBERON:0000004 | nose                      | Anatomy |
|  2 | Anatomy::UBERON:0000006 | islet of Langerhans       | Anatomy |
|  3 | Anatomy::UBERON:0000007 | pituitary gland           | Anatomy |
|  4 | Anatomy::UBERON:0000010 | peripheral nervous system | Anatomy |
+----+-------------------------+---------------------------+---------+

 Sample of Hetionet Data Types (Before processing):

+-------+----------------------------------+-------------------------------------------+---------------------+
|       | id                               | name                                      | kind                |
|-------+--------------------------------

In [21]:
# Make dictionaries
relation_glossary_relation_dict = pd.Series(relation_glossary['Interaction-type'].values, index=relation_glossary['Relation-name']).to_dict()

node_df   = pd.concat([hetionet_df[['name', 'id']], 
                       gene_df.rename(columns = {"description":"name", "GeneID":"id"}),
                       drugbank_df.rename(columns = {"Common name":"name", "DrugBank ID":"id"}),
                       omim_df.rename(columns = {"MIM Number":"id"}),
                       chebi_df.rename(columns = {"NAME":"name", "CHEBI_ACCESSION":"id"}),
                       chembl_df.rename(columns = {"pref_name":"name", "chembl_id":"id"})
                       ], ignore_index=True, axis=0).drop_duplicates() 
node_dict = pd.Series(node_df['name'].values, index=node_df['id']).to_dict() | mesh_dict # Convert node df to dict and merge with MeSH dictionary


# Create and use convert_entitynames function
def convert_entitynames(df, col, node_dict):
  df_update = df.copy()
  df_update[col] = df_update[col].str.replace(r'.*?MESH:', "MESH::", regex=True) # Remove MeSH labeling
  df_update[col] = df_update[col].map(node_dict).fillna(df_update[col])    # Translate dictionary
  df_update[col] = df_update[col].str.replace("Gene::", "Gene ID ") # For remaining uncoverted Gene IDs, remove "::"
  df_update[col] = df_update[col].str.replace("Disease::", "") # For remaining diseases (appears to be just SARS-COVID related names), remove label
  return df_update

df_med    = drkg_df.copy()
df_med[1] = df_med[1].map(relation_glossary_relation_dict).fillna(df_med[1])

df_med = convert_entitynames(df_med, 0, node_dict)
df_med = convert_entitynames(df_med, 2, node_dict)

print_head(df_med) 


+----+------------------------------+-------------+------------------------------------------------------+
|    | 0                            | 1           | 2                                                    |
|----+------------------------------+-------------+------------------------------------------------------|
|  0 | coagulation factor VIII (F8) | interaction | coagulation factor VIII (F8)                         |
|  1 | coagulation factor VIII (F8) | interaction | phytanoyl-CoA 2-hydroxylase (PHYH)                   |
|  2 | coagulation factor VIII (F8) | interaction | coagulation factor IX (F9)                           |
|  3 | coagulation factor VIII (F8) | interaction | heat shock protein family A (Hsp70) member 5 (HSPA5) |
|  4 | coagulation factor VIII (F8) | interaction | immunoglobulin kappa variable 3-20 (IGKV3-20)        |
+----+------------------------------+-------------+------------------------------------------------------+


In [6]:
# Filter DRKG in natural language to drug-treats-disease relationships
# rx_dx        = df_med[df_med.iloc[:,1] ==   'Compound treats the disease']
rx_dx        =  df_med.loc[drkg_df_filtered.index]
rx_dx_subset =  rx_dx[0:10]
rx_dx_subset

Unnamed: 0,0,1,2
1518268,Dornase alfa,Compound treats the disease,Cystic Fibrosis
1518269,Denileukin diftitox,Compound treats the disease,MESH::C063419
1518270,Etanercept,Compound treats the disease,"Spondylitis, Ankylosing"
1518271,Etanercept,Compound treats the disease,Graft vs Host Disease
1518272,Etanercept,Compound treats the disease,Hidradenitis Suppurativa
1518273,Etanercept,Compound treats the disease,"Arthritis, Juvenile"
1518274,Etanercept,Compound treats the disease,Psoriasis
1518275,Etanercept,Compound treats the disease,"Arthritis, Psoriatic"
1518276,Etanercept,Compound treats the disease,Pyoderma Gangrenosum
1518277,Etanercept,Compound treats the disease,"Arthritis, Rheumatoid"


In [7]:
# Remove codes that are not relevant or have no conversion
drkg_translated = df_med.copy()

# Remove taxonomy
remove_tax = drkg_translated[drkg_translated[2].str.contains('Tax::')]
drkg_translated = drkg_translated.drop(remove_tax.index)

# Check for any untranslated terms
remove_untranslated = drkg_translated[(drkg_translated[0].str.contains('::'))|(drkg_translated[2].str.contains('::'))]
drkg_translated = drkg_translated.drop(remove_untranslated.index)

# Summarize 
print('Total number of pairs ' + str(drkg_df.shape[0]))
print('Dropped taxonomy pairs ' + str(len(remove_tax.index)))
print('Dropped untranslated pairs ' + str(len(remove_untranslated.index)))
drkg_translated

Total number of pairs 5874261
Dropped taxonomy pairs 14663
Dropped untranslated pairs 62779


Unnamed: 0,0,1,2
0,F8 gene,interaction,F8 gene
1,F8 gene,interaction,PHYH gene
2,F8 gene,interaction,F9 gene
3,F8 gene,interaction,HSPA5 gene
4,F8 gene,interaction,IGKV3-20 gene
...,...,...,...
5874256,COMMD9 gene,reaction,DDB2 gene
5874257,PPIL1 gene,reaction,HNRNPC gene
5874258,CBFB gene,catalysis,CDK1 gene
5874259,CES1 gene,binding,UGT2B10 gene


In [8]:
# Check unique values of untranslated
drkg_test1 = np.unique(remove_untranslated[0][remove_untranslated[0].str.contains('::')])
drkg_test2 = np.unique(remove_untranslated[2][remove_untranslated[2].str.contains('::')])

drkg_mesh_list = drkg_test1.tolist() +  drkg_test2.tolist()
drkg_mesh_unique = pd.DataFrame(pd.unique(drkg_mesh_list))
drkg_mesh_unique

Unnamed: 0,0
0,Compound::Bioarxivdrug:0
1,Compound::Bioarxivdrug:1
2,Compound::Bioarxivdrug:10
3,Compound::Bioarxivdrug:11
4,Compound::Bioarxivdrug:2
...,...
14416,MESH::C580539
14417,MESH::C585640
14418,MESH::D000071
14419,MESH::D018290


In [9]:
drkg_untranslated = drkg_mesh_unique.copy()
drkg_untranslated = drkg_untranslated[drkg_untranslated[0].str.startswith('MESH::')]
print(len(drkg_untranslated))
drkg_untranslated

7751


Unnamed: 0,0
1429,MESH::C000020
1430,MESH::C000050
1431,MESH::C000121
1432,MESH::C000154
1433,MESH::C000188
...,...
14416,MESH::C580539
14417,MESH::C585640
14418,MESH::D000071
14419,MESH::D018290


# 3) BioLinkBERT embedding

In [12]:
rx_dx_subset

Unnamed: 0,0,1,2
1518268,Dornase alfa,Compound treats the disease,Cystic Fibrosis
1518269,Denileukin diftitox,Compound treats the disease,MESH::C063419
1518270,Etanercept,Compound treats the disease,"Spondylitis, Ankylosing"
1518271,Etanercept,Compound treats the disease,Graft vs Host Disease
1518272,Etanercept,Compound treats the disease,Hidradenitis Suppurativa
1518273,Etanercept,Compound treats the disease,"Arthritis, Juvenile"
1518274,Etanercept,Compound treats the disease,Psoriasis
1518275,Etanercept,Compound treats the disease,"Arthritis, Psoriatic"
1518276,Etanercept,Compound treats the disease,Pyoderma Gangrenosum
1518277,Etanercept,Compound treats the disease,"Arthritis, Rheumatoid"


In [10]:
from torch_geometric.data import HeteroData
from SIMP_LLM.llm_encode import EntityEncoder
from SIMP_LLM.dataloader_mappings import create_mapping,create_edges


## Example of loading data without anything to encode
device    = "cpu"
Encoder  = EntityEncoder(device = device )


### DX RX Relationship ###
rx_X,rx_mapping = create_mapping(rx_dx_subset[0].to_list(),encoder= Encoder ,device=device) # Maps drugs to indices
dx_X,dx_mapping = create_mapping(rx_dx_subset[2].to_list(),encoder= Encoder ,device=device) # Maps drugs to indices
## As of now this only encodes 'Compound treats the disease', but  the idea is that this is used to encode every entity 
relationship_X,relationship_mapping = create_mapping(rx_dx_subset[1].to_list(),encoder= Encoder ,device=device)  

print(f"Unique Drugs:   {len(rx_mapping)} Matrix shape: {rx_X.shape}")
print(f"Unique Disases: {len(dx_mapping)} Matrix shape: {dx_X.shape }")
relationship_feature = relationship_X[relationship_mapping['Compound treats the disease'],:].reshape(1,-1)


## NEED To add other reationships

Unique Drugs:   3 Matrix shape: torch.Size([3, 768])
Unique Disases: 10 Matrix shape: torch.Size([10, 768])


### Build HeteroData Object

In [11]:
### Create PyG Hetero Graph:
data = HeteroData()
data['compounds'].x = rx_X
data['disease'].x   = dx_X
print(data)

Edge_index,edge_attribute = create_edges(df             =  rx_dx_subset,
                                          src_index_col  = 0, 
                                          src_mapping    = rx_mapping , 
                                          dst_index_col  = 2, 
                                          dst_mapping    = dx_mapping ,
                                          edge_attr      = relationship_feature)

data['compounds', 'Compound treats the disease', 'disease'].edge_index = Edge_index
data['compounds', 'Compound treats the disease', 'disease'].edge_label = edge_attribute 

HeteroData(
  [1mcompounds[0m={ x=[3, 768] },
  [1mdisease[0m={ x=[10, 768] }
)


## Create entity lookup table

In [8]:
# Create table of unique DRKG entities
def get_unique_entities(df, columns):
  entity_list = []
  for col in columns:
    entity_list = np.append(entity_list, df[col])
  entity_list = np.unique(entity_list)
  return entity_list

drkg_entities = get_unique_entities(drkg_df, [0,2])

print(len(drkg_entities)) # should be 97238


97238


In [22]:
# Create lookup table for DRKG entities
drkg_entity_df = pd.DataFrame(drkg_entities, columns=['drkg_id'])
drkg_entity_df['drkg_dict_id'] = drkg_entity_df['drkg_id'].str.replace(r'.*?MESH:', "MESH::", regex=True)
drkg_entity_df['name'] = drkg_entity_df['drkg_dict_id'].map(node_dict)
drkg_entity_df[['entity_type', 'ontology_code']] = drkg_entity_df['drkg_id'].str.split("::", expand=True)


# temp = drkg_entity_df[drkg_entity_df['ontology_code'].str.contains(':')]
drkg_entity_df['ontology_name'] = drkg_entity_df['ontology_code'].str.split(":", n=2, expand=True)[0]
drkg_entity_df['code'] = drkg_entity_df['ontology_code'].str.split(":", n=2, expand=True)[1]

###### Cleaning
# Move codes without ontology names to correct column
drkg_entity_df.loc[drkg_entity_df['ontology_name'] == drkg_entity_df['ontology_code'], 'ontology_name'] = None
drkg_entity_df.loc[drkg_entity_df['code'].isna(), 'code'] = drkg_entity_df['ontology_code']

# Add name for entries with SARS-CoV code
drkg_entity_df.loc[drkg_entity_df['code'].str.startswith('SARS-CoV2'), 'name'] = drkg_entity_df['code']

# Manually correct specific ontology names without ':' as ontology-code divider
drkg_entity_df.loc[drkg_entity_df['ontology_code'].str.startswith('CHEMBL'), 'ontology_name'] = 'CHEMBL'
drkg_entity_df.loc[drkg_entity_df['entity_type'] == 'Atc', 'ontology_name'] = 'Atc'
drkg_entity_df.loc[(drkg_entity_df['entity_type'] == 'Compound') & (drkg_entity_df['ontology_code'].str.startswith('DB')), 'ontology_name'] = 'drugbank'
drkg_entity_df.loc[(drkg_entity_df['entity_type'] == 'Side Effect') & (drkg_entity_df['ontology_code'].str.len() == 8), 'ontology_name'] = 'UMLS CUI'
drkg_entity_df.loc[(drkg_entity_df['entity_type'] == 'Symptom') & (drkg_entity_df['ontology_code'].str.len() == 7), 'ontology_name'] = 'MESH'


# Remove irrelevant entries: taxonomy, missing ontology code (only entity type available)
drkg_entity_df = drkg_entity_df[drkg_entity_df['entity_type'] != 'Tax']
drkg_entity_df = drkg_entity_df[drkg_entity_df['ontology_code'].isna()==False]

# Remove entities without name (save for downstream analysis)
drkg_unmatched = drkg_entity_df[drkg_entity_df['name'].isna()]
drkg_entity_df = drkg_entity_df[drkg_entity_df['name'].isna() == False]

drkg_entity_df # 64033 entities that were translated


Unnamed: 0,drkg_id,drkg_dict_id,name,entity_type,ontology_code,ontology_name,code
0,Anatomy::UBERON:0000002,Anatomy::UBERON:0000002,uterine cervix,Anatomy,UBERON:0000002,UBERON,0000002
1,Anatomy::UBERON:0000004,Anatomy::UBERON:0000004,nose,Anatomy,UBERON:0000004,UBERON,0000004
2,Anatomy::UBERON:0000006,Anatomy::UBERON:0000006,islet of Langerhans,Anatomy,UBERON:0000006,UBERON,0000006
3,Anatomy::UBERON:0000007,Anatomy::UBERON:0000007,pituitary gland,Anatomy,UBERON:0000007,UBERON,0000007
4,Anatomy::UBERON:0000010,Anatomy::UBERON:0000010,peripheral nervous system,Anatomy,UBERON:0000010,UBERON,0000010
...,...,...,...,...,...,...,...
97018,Symptom::D063766,Symptom::D063766,Pediatric Obesity,Symptom,D063766,MESH,D063766
97019,Symptom::D063806,Symptom::D063806,Myalgia,Symptom,D063806,MESH,D063806
97020,Symptom::D064250,Symptom::D064250,Hypertriglyceridemic Waist,Symptom,D064250,MESH,D064250
97021,Symptom::D065634,Symptom::D065634,Cerebrospinal Fluid Leak,Symptom,D065634,MESH,D065634


In [31]:
# Translate entities

node_dict2 = pd.Series(drkg_entity_df['name'].values, index=drkg_entity_df['drkg_id']).to_dict() 

# Create and use convert_entitynames function
def convert_entitynames2(df, col, node_dict):
  df_update = df.copy()
  df_update[col] = df_update[col].str.replace(r'.*?MESH:', "MESH::", regex=True) # Remove MeSH labeling
  df_update[col] = df_update[col].map(node_dict)    # Translate dictionary, dont replace NAs
  df_update[col] = df_update[col].str.replace("Gene::", "Gene ID ") # For remaining uncoverted Gene IDs, remove "::"
  df_update[col] = df_update[col].str.replace("Disease::", "") # For remaining diseases (appears to be just SARS-COVID related names), remove label
  return df_update


df_med2    = drkg_df.copy()
df_med2 = convert_entitynames2(df_med2, 0, node_dict)
df_med2 = convert_entitynames2(df_med2, 2, node_dict)
df_med_translate = df_med2.dropna()

In [40]:
pct_translated = df_med_translate.shape[0]/drkg_df.shape[0]
print(f'Percentage of relationships fully translated: ', round(pct_translated*100,1), '%')

pct_translated_notax = df_med_translate.shape[0]/drkg_df[drkg_df[2].str.startswith('Tax') == False].shape[0]
print(f'Percentage of relationships fully translated (excluding taxonomy): ', round(pct_translated_notax*100,1), '%')

Percentage of relationships fully translated:  97.1 %
Percentage of relationships fully translated (excluding taxonomy):  97.4 %


In [23]:
drkg_entity_df.drop_duplicates(subset=['entity_type', 'ontology_name'])


Unnamed: 0,drkg_id,drkg_dict_id,name,entity_type,ontology_code,ontology_name,code
0,Anatomy::UBERON:0000002,Anatomy::UBERON:0000002,uterine cervix,Anatomy,UBERON:0000002,UBERON,0000002
4448,Biological Process::GO:0000002,Biological Process::GO:0000002,mitochondrial genome maintenance,Biological Process,GO:0000002,GO,0000002
15829,Cellular Component::GO:0000015,Cellular Component::GO:0000015,phosphopyruvate hydratase complex,Cellular Component,GO:0000015,GO,0000015
17232,Compound::CHEBI:10057,Compound::CHEBI:10057,9H-xanthene,Compound,CHEBI:10057,CHEBI,10057
18295,Compound::CHEMBL10,Compound::CHEMBL10,SB-203580,Compound,CHEMBL10,CHEMBL,CHEMBL10
21960,Compound::DB00001,Compound::DB00001,Lepirudin,Compound,DB00001,drugbank,DB00001
38906,Compound::MESH:D000001,MESH::D000001,Calcimycin,Compound,MESH:D000001,MESH,D000001
41533,Disease::DOID:0050156,Disease::DOID:0050156,idiopathic pulmonary fibrosis,Disease,DOID:0050156,DOID,0050156
43010,Disease::MESH:D000007,MESH::D000007,Abdominal Injuries,Disease,MESH:D000007,MESH,D000007
46531,Disease::OMIM:102510,Disease::OMIM:102510,ACROPECTOROVERTEBRAL DYSPLASIA,Disease,OMIM:102510,OMIM,102510


In [24]:
# Counts for matched entity types
drkg_entity_df.groupby(by=['entity_type', 'ontology_name'], dropna=False).agg(
    count = ('code', 'count')#.sort_values(by='count')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
entity_type,ontology_name,Unnamed: 2_level_1
Anatomy,UBERON,400
Biological Process,GO,11381
Cellular Component,GO,1391
Compound,CHEBI,1060
Compound,CHEMBL,2463
Compound,MESH,1224
Compound,drugbank,10546
Disease,DOID,127
Disease,MESH,3518
Disease,OMIM,78


In [13]:
# Why are there fewer genes
gene_df

Unnamed: 0,GeneID,description
0,Gene::1,alpha-1-B glycoprotein (A1BG)
1,Gene::2,alpha-2-macroglobulin (A2M)
2,Gene::3,alpha-2-macroglobulin pseudogene 1 (A2MP1)
3,Gene::9,N-acetyltransferase 1 (NAT1)
4,Gene::10,N-acetyltransferase 2 (NAT2)
...,...,...
159540,Gene::8923215,tRNA-Asp (trnD)
159541,Gene::8923216,tRNA-Pro (trnP)
159542,Gene::8923217,tRNA-Ala (trnA)
159543,Gene::8923218,cytochrome c oxidase subunit I (COX1)


In [20]:
# Comparison of new and old gene IDs - old version had slightly more matches
gene_df_raw       =  read_tsv( os.path.join(DATA_DIR,'symbols-human.tsv'),verbose=verbose)     # Read and process Gene IDs
df_updated = gene_df_raw[gene_df_raw['type']=='symbol'].drop_duplicates(subset='GeneID').copy()
df_updated['symbol'] = df_updated['symbol'].astype(str) + ' gene'
df_updated['GeneID'] = "Gene::" + df_updated['GeneID'].astype(str)
df_updated = df_updated.drop(columns=['type'])

gene_df_updated = df_updated.merge(gene_df, how='outer', on='GeneID')
print(len(gene_df_updated[gene_df_updated['symbol'].isna()]))
print(len(gene_df_updated[gene_df_updated['description'].isna()]))


 data/symbols-human.tsv  Dataframe:

+----+----------+---------+----------+
|    |   GeneID | type    | symbol   |
|----+----------+---------+----------|
|  0 |        1 | symbol  | A1BG     |
|  1 |        1 | synonym | A1B      |
|  2 |        1 | synonym | ABG      |
|  3 |        1 | synonym | GAB      |
|  4 |        1 | synonym | HYST2477 |
+----+----------+---------+----------+
110993
7800


In [25]:
# Look for entity types without ontology
drkg_entity_df[drkg_entity_df['ontology_name'].isna()].drop_duplicates(subset=['entity_type'])

# entity types without name
drkg_unmatched.groupby(by=['entity_type', 'ontology_name'], dropna=False).agg(
    count = ('code', 'count')#.sort_values(by='count')
)
# drop_duplicates(subset=['entity_type', 'ontology_name'])

# Entity types where drkg id and drkg_dict_id were different (should only be MESH terms)
# drkg_entity_df[drkg_entity_df['drkg_id'] != drkg_entity_df['drkg_dict_id']] 

Unnamed: 0_level_0,Unnamed: 1_level_0,count
entity_type,ontology_name,Unnamed: 2_level_1
Atc,Atc,4048
Compound,Bioarxivdrug,12
Compound,CHEBI,3
Compound,CHEMBL,1202
Compound,MESH,6398
Compound,bindingdb,144
Compound,brenda,731
Compound,chebi,25
Compound,drugbank,5
Compound,drugcentral,18


In [114]:

drugbank_df_raw = pd.read_csv(os.path.join(DATA_DIR,'drugbank vocabulary.csv'))

# df_updated = df[['DrugBank ID', 'Common name']].copy()
# df_updated['DrugBank ID'] = "Compound::" + df_updated['DrugBank ID'].astype(str)
drugbank_df_raw


Unnamed: 0,DrugBank ID,Accession Numbers,Common name,CAS,UNII,Synonyms,Standard InChI Key
0,DB00001,BIOD00024 | BTD00024,Lepirudin,138068-37-8,Y43GF64R34,"[Leu1, Thr2]-63-desulfohirudin | Desulfatohiru...",
1,DB00002,BIOD00071 | BTD00071,Cetuximab,205923-56-4,PQX0D8J21J,Cetuximab | Cétuximab | Cetuximabum,
2,DB00003,BIOD00001 | BTD00001,Dornase alfa,143831-71-4,953A26OA1Y,Deoxyribonuclease (human clone 18-1 protein mo...,
3,DB00004,BIOD00084 | BTD00084,Denileukin diftitox,173146-27-5,25E79B5CTM,Denileukin | Denileukin diftitox | Interleukin...,
4,DB00005,BIOD00052 | BTD00052,Etanercept,185243-69-0,OP401G7OJC,Etanercept | etanercept-szzs | etanercept-ykro...,
...,...,...,...,...,...,...,...
15230,DB17382,,AUM-601,,,"(R,E)-3-(5-(2-(2,5-difluorophenyl)pyrrolidin-1...",
15231,DB17383,,FN-1501,1429515-59-2,6MC966B505,"4-((7h-pyrrolo (2,3-d)pyrimidin-4-yl)amino)-n-...",VXLAKHWYGRKCGI-UHFFFAOYSA-N
15232,DB17384,,Tinengotinib,2230490-29-4,WZ9TJ0L9Y8,"4-(5-(2-Chlorophenyl)-3-Methyl-2,10-Dihydropyr...",DQFCVOOFMXEPOC-UHFFFAOYSA-N
15233,DB17385,,Lipotecan,1432468-79-5,D47234N30N,"Lipothecan free base | Propanoic acid, 2-(((2,...",JCCCLGDYMMTBPM-HXDHBHDHSA-N


In [115]:
# Look for entity types without ontology
drkg_entity_df[drkg_entity_df['ontology_code'].str.contains(":")==False].drop_duplicates(subset='entity_type')

# Look for unmatched entity types

drkg_unmatched.drop_duplicates(subset=['entity_type','ontology_name'])


# 

Unnamed: 0,drkg_id,drkg_dict_id,name,entity_type,ontology_code,ontology_name,code
400,Atc::A,Atc::A,,Atc,A,Atc,A
17220,Compound::Bioarxivdrug:0,Compound::Bioarxivdrug:0,,Compound,Bioarxivdrug:0,Bioarxivdrug,0
17414,Compound::CHEBI:24438,Compound::CHEBI:24438,,Compound,CHEBI:24438,CHEBI,24438
18296,Compound::CHEMBL10009,Compound::CHEMBL10009,,Compound,CHEMBL10009,CHEMBL,CHEMBL10009
24209,Compound::DB02507,Compound::DB02507,,Compound,DB02507,drugbank,DB02507
32511,Compound::MESH:C000020,MESH::C000020,,Compound,MESH:C000020,MESH,C000020
40133,Compound::bindingdb:11428,Compound::bindingdb:11428,,Compound,bindingdb:11428,bindingdb,11428
40277,Compound::brenda:10041,Compound::brenda:10041,,Compound,brenda:10041,brenda,10041
41008,Compound::chebi:131517,Compound::chebi:131517,,Compound,chebi:131517,chebi,131517
41033,Compound::drugcentral:1022,Compound::drugcentral:1022,,Compound,drugcentral:1022,drugcentral,1022


In [117]:
!wget https://data.bioontology.org/ontologies/MESH/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=csv

## Code for graph

In [None]:
# SP 5/15/23 pseudo-code to append new edge types to graph

# For DRKG translation: have dictionary 

# Entity lookup table with DRKG code, English language translation, and entity type
entity_lookup = drkg_entity_df.copy()

mapping_dict = {}
for entity in entity_lookup['entity_type'].unique():
    entity_names = entity_lookup.loc[entity_lookup['entity_type'] == entity, 'name']
    entity_X, entity_mapping = create_mapping(entity_names, encoder=Encoder, device=device) # Maps entities to indices
    data[entity].x = entity_X
    mapping_dict[entity] = entity_mapping

for relation_name in distinct_relation_types
    drkg_relation_codes = relation_glossary.loc[relation_glossary['relation_name'] == relation_name, 'DRKG code']
    drkg_relation_subset = df_med[df_med[1] in drkg_relation_codes]
    head_entity = relation_glossary['head'] for relation_name
    tail_entity = relation_glossary['tail'] for relation_name

    Edge_index,edge_attribute = create_edges(df            = drkg_relation_subset,
                                            src_index_col  = 0, 
                                            src_mapping    = mapping_dict[head_entity] , 
                                            dst_index_col  = 2, 
                                            dst_mapping    = mapping_dict[tail_entity] ,
                                            edge_attr      = relationship_feature)

    data[head_entity, relation_name, tail_entity].edge_index = Edge_index
    data[head_entity, relation_name, tail_entity].edge_label = edge_attribute 

In [1]:
# Test
entity_list = ['test1', 'test2', 'test3']
entity_list = list(set(entity_list))  # Convert to set to remove duplicates, then back to list
mapping = {index: i for i, index in enumerate(entity_list)}
print(mapping)

{'test2': 0, 'test1': 1, 'test3': 2}


In [2]:
entity_list.unique()



AttributeError: 'list' object has no attribute 'unique'