## Data Preprocessing

In [1]:
import pandas as pd
df = pd.read_csv('../data/raw/drkg.tsv', delimiter='\t', names=['node_a', 'relation', 'node_b'])
rel_df = pd.read_csv('../data/raw/relation_glossary.tsv', delimiter='\t')

df.head()

Unnamed: 0,node_a,relation,node_b
0,Gene::2157,bioarx::HumGenHumGen:Gene:Gene,Gene::2157
1,Gene::2157,bioarx::HumGenHumGen:Gene:Gene,Gene::5264
2,Gene::2157,bioarx::HumGenHumGen:Gene:Gene,Gene::2158
3,Gene::2157,bioarx::HumGenHumGen:Gene:Gene,Gene::3309
4,Gene::2157,bioarx::HumGenHumGen:Gene:Gene,Gene::28912


In [2]:
from tqdm import tqdm
from collections import Counter

entity_type_a = [rel.split(':')[-2] for rel in df.relation]
entity_type_b = [rel.split(':')[-1] for rel in df.relation]

entity_type_set_a = list(set(entity_type_a))
entity_type_set_b = list(set(entity_type_b))

relation_counts = {}
for en_a in entity_type_set_a:
    for en_b in entity_type_set_b:
        relation_counts[f"{en_a}:{en_b}"] = 0

sources = []
relation_types = []
relation_entities = []
for idx in tqdm(range(len(df.node_a))):
    relations = df.relation[idx].split(':')
    key = f"{relations[-2]}:{relations[-1]}"
    relation_counts[key] += 1
    sources.append(relations[0])
    relation_types.append(relations[2])
    relation_entities.append(key)

df["relation_entities"] = relation_entities
df["relation_type"] = relation_types
df["source"] = sources

df.drop("relation", inplace=True, axis=1)

100%|██████████| 5874261/5874261 [00:30<00:00, 191201.12it/s]


# Dataset Preparation

## Relationship Descriptions

In [3]:
# structured description of relations

with open("../data/structured/structured_relationship_descriptions.txt", "a") as f:
    for idx in range(rel_df.shape[0]):
        for col in list(rel_df.columns):
            if str(rel_df.iloc[idx][col]) != 'nan':
                f.write(f"{col}: {rel_df.iloc[idx][col]}\n")
        f.write("\n")

In [4]:
# unstructured description of relations
with open("../data/unstructured/unstructured_relationship_descriptions.txt", "a") as f:
    for idx in range(rel_df.shape[0]):
        entities = rel_df.iloc[idx]['Connected entity-types'].split(':')
        interaction_type = rel_df.iloc[idx]['Interaction-type']
        description = rel_df.iloc[idx]['Description']
        source = rel_df.iloc[idx]['Data-source']
        f.write(f"Relation \"{rel_df.iloc[idx]['Relation-name']}\" refers to \"{interaction_type}\" type interaction between \"{entities[0]}\" and \"{entities[1]}\". ")
        if str(description) != 'nan':
            f.write(f"This interaction type can be described as \"{description}\". ",)
        f.write(f"Source database of this relation is {source}.\n")

## Entity Relations

In [5]:
df_2 = pd.read_csv('../data/raw/drkg.tsv', delimiter='\t', names=['node_a', 'relation', 'node_b'])

In [6]:
for ent in set(df.relation_entities):
    if rel_df[rel_df['Connected entity-types']==ent].shape[0]:
        print(ent, rel_df[rel_df['Connected entity-types']==ent].shape[0], list(rel_df[rel_df['Connected entity-types']==ent]["Interaction-type"]))
    else:
        print(ent, "NONE")

Compound:Disease 10 ['Compound treats the disease', 'inhibits cell growth (esp. cancers)', 'role in disease pathogenesis', 'biomarkers (of disease progression)', 'alleviates, reduces', 'prevents, suppresses', 'side effect/adverse event', 'treatment/therapy (including investigatory)', 'palliation', 'treatment']
Disease:Gene 15 ['drug targets', 'promotes progression', 'role in pathogenesis', 'improper regulation linked to disease', 'biomarkers (diagnostic)', 'possible therapeutic effect', 'causal mutations', 'mutations affecting disease course', 'overexpression in disease', 'polymorphisms alter risk', 'association', 'downregulation', 'upregulation', 'interaction', 'interaction']
Disease:Disease NONE
Compound:Side Effect 1 ['causes']
Disease:Anatomy NONE
Compound:Gene 34 ['activation', 'agonism', 'allosteric modulation', 'antagonism', 'antibody', 'binding', 'blocking', 'channel blocking', 'inhibition', 'modulation', 'other', 'partial agonism', 'positive allosteric modulation', 'carrier', 

In [7]:
entities_set_new = []
for s in set(df_2.relation):
    entities_set_new.append(s.split('::')[-1])

sorted(list(set(entities_set_new)))

['Anatomy:Gene',
 'Compound:Atc',
 'Compound:Compound',
 'Compound:Disease',
 'Compound:Gene',
 'Compound:Side Effect',
 'Disease:Anatomy',
 'Disease:Disease',
 'Disease:Gene',
 'Disease:Symptom',
 'DrugHumGen:Compound:Gene',
 'DrugVirGen:Compound:Gene',
 'Gene:Biological Process',
 'Gene:Cellular Component',
 'Gene:Compound',
 'Gene:Disease',
 'Gene:Gene',
 'Gene:Molecular Function',
 'Gene:Pathway',
 'Gene:Tax',
 'HumGenHumGen:Gene:Gene',
 'Pharmacologic Class:Compound',
 'VirGenHumGen:Gene:Gene']

In [8]:
for idx in range(rel_df.shape[0]):
    description = rel_df.iloc[idx]['Description']
    if str(description) != 'nan':
        print(f"ID: {idx}, Description: {description}")

ID: 0, Description: An activator interaction is when a drug activates a biological response from a target, although the mechanism by which it does so may not be understood.
ID: 1, Description: An agonist interaction occurs when a drug binds to a target receptor and activates the receptor to produce a biological response.
ID: 2, Description: An allosteric modulator interaction occurs when drugs exert their effects on their protein targets via a different binding site than the natural (orthosteric) ligand site.
ID: 3, Description: An antagonist interaction occurs when a drug blocks or dampens agonist-mediated responses rather than provoking a biological response itself upon binding to a target receptor.
ID: 4, Description: An antibody interaction occurs when an antibody drug specifically binds the target molecule.
ID: 5, Description: A binder interaction has drugs physically binding to their target.
ID: 6, Description: Antagonist interactions are sometimes referred to as blocker interact

In [9]:
sorted(list(set(df_2.relation)))

['DGIDB::ACTIVATOR::Gene:Compound',
 'DGIDB::AGONIST::Gene:Compound',
 'DGIDB::ALLOSTERIC MODULATOR::Gene:Compound',
 'DGIDB::ANTAGONIST::Gene:Compound',
 'DGIDB::ANTIBODY::Gene:Compound',
 'DGIDB::BINDER::Gene:Compound',
 'DGIDB::BLOCKER::Gene:Compound',
 'DGIDB::CHANNEL BLOCKER::Gene:Compound',
 'DGIDB::INHIBITOR::Gene:Compound',
 'DGIDB::MODULATOR::Gene:Compound',
 'DGIDB::OTHER::Gene:Compound',
 'DGIDB::PARTIAL AGONIST::Gene:Compound',
 'DGIDB::POSITIVE ALLOSTERIC MODULATOR::Gene:Compound',
 'DRUGBANK::carrier::Compound:Gene',
 'DRUGBANK::ddi-interactor-in::Compound:Compound',
 'DRUGBANK::enzyme::Compound:Gene',
 'DRUGBANK::target::Compound:Gene',
 'DRUGBANK::treats::Compound:Disease',
 'DRUGBANK::x-atc::Compound:Atc',
 'GNBR::A+::Compound:Gene',
 'GNBR::A-::Compound:Gene',
 'GNBR::B::Compound:Gene',
 'GNBR::B::Gene:Gene',
 'GNBR::C::Compound:Disease',
 'GNBR::D::Gene:Disease',
 'GNBR::E+::Compound:Gene',
 'GNBR::E+::Gene:Gene',
 'GNBR::E-::Compound:Gene',
 'GNBR::E::Compound:Gene'

For the project, we are focusing on **gene to disease** relations only.

In [10]:
relationship_in_focus = 'Gene:Disease'
df[df.relation_entities==relationship_in_focus]

Unnamed: 0,node_a,node_b,relation_entities,relation_type,source
1694421,Gene::1,Disease::MESH:D005909,Gene:Disease,L,GNBR
1694422,Gene::10,Disease::MESH:C562839,Gene:Disease,U,GNBR
1694423,Gene::10,Disease::MESH:D001172,Gene:Disease,Y,GNBR
1694424,Gene::10,Disease::MESH:D001932,Gene:Disease,Y,GNBR
1694425,Gene::10,Disease::MESH:D003110,Gene:Disease,J,GNBR
...,...,...,...,...,...
1789815,Gene::99982,Disease::MESH:D004715,Gene:Disease,L,GNBR
1789816,Gene::99982,Disease::MESH:D006973,Gene:Disease,J,GNBR
1789817,Gene::99982,Disease::MESH:D009362,Gene:Disease,J,GNBR
1789818,Gene::99982,Disease::MESH:D013274,Gene:Disease,L,GNBR


In [11]:
from tqdm import tqdm

# getting relationship description
rel_desc = []
for each in tqdm([f"GNBR::{rel}::{relationship_in_focus}" for rel in list(df[df.relation_entities==relationship_in_focus]['relation_type'])]):
    rel_desc.append(rel_df[rel_df['Relation-name']==each]['Interaction-type'].values[0])

100%|██████████| 95399/95399 [00:14<00:00, 6394.76it/s]


### Structured Dataset Generation

In [12]:
structured_data = df[df.relation_entities==relationship_in_focus]
structured_data["relation_name"] = rel_desc
structured_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  structured_data["relation_name"] = rel_desc


Unnamed: 0,node_a,node_b,relation_entities,relation_type,source,relation_name
1694421,Gene::1,Disease::MESH:D005909,Gene:Disease,L,GNBR,improper regulation linked to disease
1694422,Gene::10,Disease::MESH:C562839,Gene:Disease,U,GNBR,causal mutations
1694423,Gene::10,Disease::MESH:D001172,Gene:Disease,Y,GNBR,polymorphisms alter risk
1694424,Gene::10,Disease::MESH:D001932,Gene:Disease,Y,GNBR,polymorphisms alter risk
1694425,Gene::10,Disease::MESH:D003110,Gene:Disease,J,GNBR,role in pathogenesis
...,...,...,...,...,...,...
1789815,Gene::99982,Disease::MESH:D004715,Gene:Disease,L,GNBR,improper regulation linked to disease
1789816,Gene::99982,Disease::MESH:D006973,Gene:Disease,J,GNBR,role in pathogenesis
1789817,Gene::99982,Disease::MESH:D009362,Gene:Disease,J,GNBR,role in pathogenesis
1789818,Gene::99982,Disease::MESH:D013274,Gene:Disease,L,GNBR,improper regulation linked to disease


In [13]:
structured_data.to_csv(f'../data/structured/structured_gene_disease.csv')

### Semi-structured Dataset Generation

In [14]:
for idx in tqdm(range(10)):
    relationship_description = rel_df[f"{structured_data.iloc[idx]['source']}::{structured_data.iloc[idx]['relation_type']}::{structured_data.iloc[idx]['relation_entities']}"==rel_df['Relation-name']]
    print(f"interaction-entities: {structured_data.iloc[idx].node_a} and {structured_data.iloc[idx].node_b}\ninteraction-type: {relationship_description['Interaction-type'].values[0]}\n")

100%|██████████| 10/10 [00:00<00:00, 2373.15it/s]

interaction-entities: Gene::1 and Disease::MESH:D005909
interaction-type: improper regulation linked to disease

interaction-entities: Gene::10 and Disease::MESH:C562839
interaction-type: causal mutations

interaction-entities: Gene::10 and Disease::MESH:D001172
interaction-type: polymorphisms alter risk

interaction-entities: Gene::10 and Disease::MESH:D001932
interaction-type: polymorphisms alter risk

interaction-entities: Gene::10 and Disease::MESH:D003110
interaction-type: role in pathogenesis

interaction-entities: Gene::10 and Disease::MESH:D004409
interaction-type: polymorphisms alter risk

interaction-entities: Gene::10 and Disease::MESH:D006331
interaction-type: polymorphisms alter risk

interaction-entities: Gene::10 and Disease::MESH:D010190
interaction-type: polymorphisms alter risk

interaction-entities: Gene::10 and Disease::MESH:D015179
interaction-type: role in pathogenesis

interaction-entities: Gene::10 and Disease::MESH:D015212
interaction-type: polymorphisms alter 




In [15]:
with open("../data/semi_structured/semi_structured_gene_disease.txt", "a") as f:
    for idx in tqdm(range(structured_data.shape[0])):
        relationship_description = rel_df[f"{structured_data.iloc[idx]['source']}::{structured_data.iloc[idx]['relation_type']}::{structured_data.iloc[idx]['relation_entities']}"==rel_df['Relation-name']]
        f.write(f"interaction-entities: {structured_data.iloc[idx].node_a} and {structured_data.iloc[idx].node_b}\ninteraction-type: {relationship_description['Interaction-type'].values[0]}\n")
        f.write("\n")

100%|██████████| 95399/95399 [00:29<00:00, 3181.55it/s]


### Unstructured (Language like) Dataset Generation

For each of 107 entity relations type, we have generated a template to convert the structured data to human language like unstructured dataset.

In [16]:
def generate_sentence(entity_a, entity_b, interaction_type):
    """
    Generate a context-rich sentence based on the interaction type, entities, and optionally a description.

    Args:
    - entities (str): The entities involved in the interaction.
    - interaction_type (str): The type of interaction.
    - description (str, optional): A brief description of the interaction. Default is None.

    Returns:
    - str: A context-rich sentence describing the interaction.
    """

    # source of the templates: https://chat.openai.com/share/dd3533f8-06b0-4837-9ec0-b425dbc944b8
    # Mapping of interaction types to sentence templates
    templates = {
        'activation [Compound:Gene]': "{entity_a} interacts with {entity_b} through activation, meaning the drug triggers a biological response from this gene, though the exact mechanism is not fully understood.",
        'activation [Gene:Gene]': "{entity_a} and {entity_b} interact through activation, suggesting one gene activates or enhances the function of the other.",

        'binding [Compound:Gene]': "A binding interaction is observed between {entity_a} and {entity_b}, indicating the compound physically binds to the gene or its product.",
        'binding [Gene:Gene]': "{entity_a} and {entity_b} are involved in a binding interaction, indicating a physical association between them.",

        'inhibition [Compound:Gene]': "{entity_a} and {entity_b} exhibit inhibition, where the drug binds to the gene and reduces its expression or activity, commonly seen in enzyme inhibitors.",
        'inhibition [Gene:Gene]': "{entity_a} and {entity_b} are involved in an inhibition interaction, suggesting one gene inhibits the activity of the other.",

        'binding, ligand (esp. receptors) [Compound:Gene]': "There is a binding to ligand (especially receptors) type interaction between {entity_a} and {entity_b}, highlighting the drug's role in receptor binding.",
        'binding, ligand (esp. receptors) [Gene:Gene]': "{entity_a} and {entity_b} interact through a binding to ligand (especially receptors) mechanism, suggesting a specific receptor-ligand dynamic.",

        'increases expression/production [Compound:Gene]': "{entity_a} and {entity_b} interact, leading to an increase in expression or production of the gene.",
        'increases expression/production [Gene:Gene]': "An interaction between {entity_a} and {entity_b} results in increased expression or production, indicating a regulatory relationship.",

        'affects expression/production (neutral) [Compound:Gene]': "{entity_a} interacts with {entity_b} in a way that affects expression or production neutrally.",
        'affects expression/production (neutral) [Gene:Gene]': "{entity_a} and {entity_b} interact, affecting each other's expression or production in a neutral manner.",

        'expression [Anatomy:Gene]': "{entity_a} and {entity_b} show an expression interaction, suggesting the gene is actively expressed in a specific anatomical region.",
        'expression [Gene:Gene]': "{entity_a} and {entity_b} show an expression interaction, indicating gene expression modulation.",

        'upregulation [Anatomy:Gene]': "{entity_a} and {entity_b} are involved in an upregulation interaction, indicating increased gene expression in a particular anatomical context.",
        'upregulation [Compound:Gene]': "{entity_a} and {entity_b} show an upregulation interaction, where the compound increases the gene's expression or activity.",
        'upregulation [Disease:Gene]': "{entity_a} and {entity_b} are involved in an upregulation interaction, where the disease increases the gene's expression or activity.",

        'downregulatation': "{entity_a} and {entity_b} are involved in a downregulation interaction, indicating a decrease in gene expression in a specific anatomical context.",
        'downregulation [Compound:Gene]': "{entity_a} and {entity_b} are involved in a downregulation interaction, where the compound decreases the gene's expression or activity.",
        'downregulation [Disease:Gene]': "{entity_a} and {entity_b} demonstrate a downregulation interaction, suggesting the disease decreases the gene's expression or activity.",

        'resemblence [Compound:Compound]': "{entity_a} and {entity_b} are linked by a resemblance interaction, suggesting similarity in compound structure or function.",
        'resemblence [Disease:Disease]': "{entity_a} and {entity_b} are related by resemblance, suggesting similarities in disease symptoms or pathology.",

        'association [Disease:Gene]': "{entity_a} and {entity_b} are associated, indicating a connection between the gene and the disease.",
        'association [Compound:Gene]': "{entity_a} and {entity_b} exhibit an association interaction, potentially forming physical complexes.",
        'association [Gene:Gene]': "{entity_a} associates with another gene {entity_b}, possibly participating in the formation of one or more physical complexes.",

        'interaction [Disease:Gene]': "{entity_a} and {entity_b} interact, showing a connection between the disease and the gene.",
        'interaction [Compound:Gene]': "{entity_a} interacts with {entity_b}, suggesting a direct or indirect influence of the compound on the gene.",
        'interaction [Gene:GeneSelf]': "{entity_a} shows a protein-protein interaction with itself, indicating a homodimer or similar self-association.",
        'interaction [Gene:Gene]': "{entity_a} and {entity_b} interact, suggesting a protein-protein interaction between them.",

        'participation': "{entity_a} participates in {entity_b}, suggesting its involvement in a specific biochemical pathway or molecular process.",
        'direct interation': "{entity_a} and {entity_b} show a direct interaction, indicating physical contact between the molecules.",
        'physical association': "{entity_a} and {entity_b} are part of a physical association, indicating their presence in the same physical complex.",

        'agonism': "{entity_a} and {entity_b} exhibit an agonism interaction, where the drug binds to the gene's receptor and activates it to produce a biological response.",
        'allosteric modulation': "An allosteric modulation occurs between {entity_a} and {entity_b}, where the drug affects the gene by binding to a different site than the natural ligand.",
        'antagonism': "{entity_a} and {entity_b} engage in antagonism, where the drug blocks or reduces the response mediated by the gene's agonist without initiating a biological response itself.",
        'antibody': "{entity_a} is targeted by {entity_b} in an antibody interaction, where the antibody drug specifically binds to the gene.",
        'blocking': "{entity_a} and {entity_b} demonstrate a blocking interaction, a type of antagonism such as alpha, beta, or calcium channel blocking.",
        'channel blocking': "{entity_a} and {entity_b} are involved in channel blocking, a specific type of interaction.",
        'modulation': "Modulation is the interaction type between {entity_a} and {entity_b}, where the drug alters the activity of the gene, potentially without direct binding.",
        'other': "{entity_a} interacts with {entity_b}. The exact type of interaction is unknown but it different from the other interactions.",
        'partial agonism': "There is a partial agonism interaction between {entity_a} and {entity_b}, where the drug partially reduces the functional response of the target receptor compared to a full antagonist.",
        'positive allosteric modulation': "{entity_a} and {entity_b} are involved in positive allosteric modulation, where the drug increases the activity of the target enzyme.",
        'carrier': "{entity_b} acts as a carrier for {entity_a}, altering the pharmacokinetics of the drug through binding.",
        'drug-drug interaction': "A drug-drug interaction occurs between {entity_a} and {entity_b}, indicating a change in the drug's effect when taken together.",
        'enzyme': "{entity_a} serves as an enzyme for {entity_b}, facilitating the metabolic transformation of the drug into specific metabolites.",
        'target': "{entity_a} targets {entity_b}, interacting with the gene to alter its normal function, leading to therapeutic or adverse effects.",
        'Compound treats the disease': "{entity_a} treats {entity_b}, illustrating a compound's therapeutic relationship with a specific disease.",
        'Compound belongs to Anatomical Therapeutic Chemical (ATC) code.': "{entity_a} belongs to the Anatomical Therapeutic Chemical (ATC) code {entity_b}, linking the drug to a specific classification in the hierarchical ATC system.",
        'agonism, activation': "{entity_a} exhibits both agonism and activation interactions with {entity_b}, indicating multiple modes of drug action.",
        'antagonism, blocking': "{entity_a} and {entity_b} are involved in an interaction characterized by both antagonism and blocking, indicating a multi-faceted drug action.",
        'inhibits cell growth (esp. cancers)': "{entity_a} and {entity_b} show an interaction where the compound inhibits cell growth, particularly relevant in cancers.",
        'drug targets': "{entity_a} and {entity_b} are linked in a drug target interaction, highlighting the gene's role in the disease's mechanism.",
        'decreases expression/production': "{entity_a} decreases expression or production of {entity_b}.",
        'promotes progression': "{entity_a} promotes the progression of {entity_b}.",
        'same protein or complex': "{entity_a} and {entity_b} demonstrate an interaction where they form the same protein or complex.",
        'signaling pathway': "{entity_a} and {entity_b} are part of a signaling pathway interaction, indicating a complex network of gene interactions.",
        'role in disease pathogenesis': "{entity_a} plays a role in disease pathogenesis of {entity_b}",
        'role in pathogenesis': "{entity_a} plays a role in disease pathogenesis of {entity_b}",
        'regulation': "{entity_a} and {entity_b} are involved in a regulatory interaction, indicating a mutual influence on each other's function.",
        'metabolism, pharmacokinetics': "{entity_a} and {entity_b} interact in the context of metabolism and pharmacokinetics, highlighting the drug's metabolic effects.",
        'improper regulation linked to disease': "{entity_a} and {entity_b} show an improper regulation interaction linked to the disease, suggesting a pathogenic mechanism.",
        'biomarkers (diagnostic)': "{entity_a} and {entity_b} are associated in a biomarker diagnostic interaction, indicating the gene's potential as a disease biomarker.",
        'biomarkers (of disease progression)': "{entity_a} and {entity_b} interact as biomarkers of disease progression, suggesting a role in monitoring disease evolution.",
        'inhibits': "{entity_a} and {entity_b} are involved in an inhibitory interaction, with the compound acting to inhibit the gene's function.",
        'transport, channels': "{entity_a} and {entity_b} interact in a way that involves transport and channel functions, suggesting a role in cellular transport mechanisms.",
        'alleviates, reduces': "{entity_a} and {entity_b} show an interaction where the compound alleviates or reduces the symptoms of the disease.",
        'prevents, suppresses': "{entity_a} and {entity_b} are involved in an interaction that prevents or suppresses the development of the disease.",
        'production by cell population': "{entity_a} and {entity_b} demonstrate a production by cell population interaction, indicating a joint role in cellular processes.",
        'side effect/adverse event': "{entity_a} and {entity_b} interact in a way that causes a side effect or adverse event, highlighting potential negative drug reactions.",
        'treatment/therapy (including investigatory)': "{entity_a} and {entity_b} are linked in a treatment or therapy interaction, suggesting the compound's role in addressing the disease.",
        'possible therapeutic effect': "{entity_a} and {entity_b} demonstrate a possible therapeutic effect interaction, indicating the gene's potential role in disease treatment.",
        'causal mutations': "{entity_a} and {entity_b} are involved in an interaction related to causal mutations, suggesting a genetic link to the disease.",
        'mutations affecting disease course': "{entity_a} and {entity_b} show an interaction where mutations affect the disease course, highlighting genetic influences on disease progression.",
        'activates, stimulates': "{entity_a} and {entity_b} interact through activation or stimulation, indicating a role in initiating or enhancing a biological response.",
        'enhances response': "{entity_a} and {entity_b} demonstrate an interaction that enhances response, suggesting a synergistic effect on biological processes.",
        'overexpression in disease': "{entity_a} and {entity_b} are involved in an overexpression interaction in the disease, indicating a gene's heightened activity in disease states.",
        'polymorphisms alter risk': "{entity_a} and {entity_b} are linked through an interaction where polymorphisms alter risk, suggesting genetic variations influence disease susceptibility.",
        'enzyme activity': "{entity_a} and {entity_b} interact in a way that involves enzyme activity, highlighting the compound's influence on metabolic processes.",
        'gene belongs to taxonomy': "{entity_a} and {entity_b} show an interaction where the gene belongs to a specific taxonomy, indicating its classification in the biological hierarchy.",
        'causes': "{entity_a} causes a specific side effect, as indicated by its interaction with {entity_b}.",
        'palliation': "{entity_a} and {entity_b} interact in a palliative manner, indicating the compound's role in alleviating the symptoms of the disease.",
        'treatment': "{entity_a} is used in the treatment of {entity_b}, indicating its therapeutic application.",
        'localization': "{entity_a} and {entity_b} are linked by a localization interaction, indicating the disease's manifestation in a specific anatomical region.",
        'presents': "{entity_a} presents {entity_b}, indicating the symptom as a characteristic of the disease.",
        'covariation': "{entity_a} and {entity_b} show a covariation interaction, indicating a correlation in their expression or function.",
        'inclusion': "{entity_a} includes {entity_b}, indicating the compound's classification in a specific pharmacological category.",
        'ADP ribosylation reaction': "{entity_a} and {entity_b} are involved in an ADP ribosylation reaction, where one or more ADP-ribose moieties are added to proteins.",
        'cleavage reaction': "{entity_a} and {entity_b} engage in a cleavage reaction, leading to the formation of smaller molecules through covalent bond breakage.",
        'colocalization': "{entity_a} and {entity_b} are involved in a colocalization interaction, indicating their coincident occurrence in a specific subcellular location.",
        'dephosphorylation reaction': "{entity_a} and {entity_b} participate in a dephosphorylation reaction, where phosphoresidues are cleaved from proteins.",
        'phosphorylation reaction': "{entity_a} and {entity_b} engage in a phosphorylation reaction, a reversible process affecting protein residues.",
        'protein cleavage': "{entity_a} and {entity_b} are involved in protein cleavage, a modification occurring during protein maturation or degradation.",
        'ubiquitination reaction': "{entity_a} and {entity_b} participate in a ubiquitination reaction, involving the addition of ubiquitin to target proteins.",
        'catalysis': "{entity_a} and {entity_b} engage in catalysis, suggesting they facilitate a chemical reaction.",
        'post-translational modification': "{entity_a} and {entity_b} are part of a post-translational modification interaction, involving enzymatic changes to proteins after biosynthesis.",
        'reaction': "{entity_a} and {entity_b} are involved in a reaction, suggesting a chemical process between them.",
        # Add other templates for different interaction types here
    }

    # Format the entities for readability
    # formatted_entities = ' and '.join([f"{entity.split('::')[0]} {entity.split('::')[1]}" for entity in entities.split(' and ')])

    # Format the description
    # formatted_description = f", where {description.lower()}" if description else ""
    description = ""
    formatted_description = ""

    # Select the appropriate template
    template = templates.get(interaction_type, "{entity_a} and {entity_b} have an interaction of type '{interaction_type}'{description}.")

    # Format the sentence
    sentence = template.format(entity_a=entity_a, entity_b=entity_b, interaction_type=interaction_type, description=formatted_description)

    return sentence

In [17]:
with open("../data/unstructured/unstructured_gene_disease.txt", "a") as f:
    for idx in tqdm(range(structured_data.shape[0])):
        row = structured_data.iloc[idx]
        entity_a = row.node_a
        entity_b = row.node_b
        rel_row = rel_df[rel_df['Relation-name']==f"{structured_data.iloc[idx]['source']}::{structured_data.iloc[idx]['relation_type']}::{structured_data.iloc[idx]['relation_entities']}"]
        interaction_type = rel_row['Interaction-type'].values[0]
        interaction_description = rel_row['Description'].values[0]

        if interaction_type == 'interaction':
            entity_type = rel_row['Connected entity-types'].values[0]
            if entity_type == "Gene:Gene" and row.node_a==row.node_b:
                interaction_type = "interaction [Gene:GeneSelf]"
            else:
                interaction_type = f"{interaction_type} [{entity_type}]"
        elif interaction_type in ["activation", "binding", "inhibition", "binding, ligand (esp. receptors)",
                                "increases expression/production", "affects expression/production (neutral)",
                                "expression", "upregulation", "downregulation", "resemblence", "association",
                                "interaction"]:
            interaction_type = f"{interaction_type} [{entity_type}]"

        f.write(f"{generate_sentence(entity_a, entity_b, interaction_type)}\n")

100%|██████████| 95399/95399 [00:30<00:00, 3140.17it/s]


## Metadata

As the genes, compound and diseases are describled in IDs from different well-known biological datasets, it would be easier for the LLMs. However, we will explore more on these metadata on our future study i.e., we did not used any metadata in our current study.

In [None]:
def get_entities(key):
    node_a = set([node for node in df.node_a])
    entities_a = [node for node in node_a if node.split('::')[0] == key]

    node_b = set([node for node in df.node_b])
    entities_b = [node for node in node_b if node.split('::')[0] == key]

    return list(set(entities_a + entities_b))

diseases = get_entities("Disease")
genes = get_entities("Gene")
compounds = get_entities("Compound")

len(diseases), len(genes), len(compounds)

(5103, 39220, 24313)

In [None]:
len(drugbank_id), len(other_id)

(2097, 306)

In [None]:
entrez_id = []
drugbank_id = []
other_id = []

for g in genes:
    try:
        x = int(g.split(":")[2])
        entrez_id.append(x)
    except:
        if g.split(":")[2] == "drugbank":
            drugbank_id.append(g.split(":")[3])
        elif ";" in g.split(":")[2]:
            entrez_id.extend([int(i) for i in g.split(":")[2].split(';')])
        elif len(g.split(":")[2]) != 0 and g.split(":")[2][0] == "N":
            other_id.append(g.split(":")[2])

In [None]:
from Bio import Entrez
Entrez.email = 'A.N.Other@example.com'

def get_gene_summary(gene_id):
    """Gets the summary for a given gene ID from NCBI's Entrez database.

    Args:
        gene_id: The ID of the gene to get the summary for.

    Returns:
        A string containing the summary for the gene.
    """
    handle = Entrez.esummary(db="gene", id=gene_id)
    result = Entrez.read(handle, validate=False)
    return result

chunk_rate = 9950
with open("../data/semi_structured/entrez_gene_description.txt", "a") as f:
    for i in tqdm(range(0, len(entrez_id), chunk_rate)):
        gene_ids = entrez_id[i:i+chunk_rate]
        data = get_gene_summary(gene_ids)
        summary = data['DocumentSummarySet']['DocumentSummary']
        for idx in range(len(summary)):
            f.write(f"Entrez ID: {gene_ids[idx]}\n")
            if summary[idx]['Summary']:
                f.write(f"Summary: {summary[idx]['Summary']}\n")
            if summary[idx]['Name']:
                f.write(f"Symbol: {summary[idx]['Name']}\n")
            if summary[idx]['NomenclatureName']:
                f.write(f"Name: {summary[idx]['NomenclatureName']}\n")
            if summary[idx]['OtherAliases']:
                f.write(f"Also known as: {summary[idx]['OtherAliases']}\n")
            f.write("\n")

100%|██████████| 4/4 [00:51<00:00, 12.78s/it]


### Disease

In [None]:
import requests
import xml.etree.ElementTree as ET

# MeSH term unique identifier
mesh_id = 'D014138'  # Example ID

# E-Utilities URL for fetching MeSH term data
url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=mesh&id={mesh_id}&retmode=xml'

# Making the request
response = requests.get(url)

# Check if the request was successful
print(response.content)
if response.status_code == 200:
    # Parsing the XML response
    root = ET.fromstring(response.content)

    # Process the XML to extract necessary information
    # This part depends on the structure of the MeSH XML data
    # Example: print the whole XML response
    print(ET.tostring(root, encoding='utf-8').decode())
else:
    print("Failed to retrieve data:", response.status_code)


b'1: cannot get document summary\n'


ParseError: ignored

In [None]:
# Making the request
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parsing the XML response
    root = ET.fromstring(response.content)

    # Process the XML to extract necessary information
    # This part depends on the structure of the MeSH XML data
    # Example: print the whole XML response
    print(ET.tostring(root, encoding='utf-8').decode())
else:
    print("Failed to retrieve data:", response.status_code)


In [None]:
response.content

b'1: cannot get document summary\n'

In [None]:

# Check if the request was successful
if response.status_code == 200:
    # Parsing the XML response
    root = ET.fromstring(response.content)
    print(root)

    # Process the XML to extract necessary information
    # This part depends on the structure of the MeSH XML data
    # Example: print the whole XML response
#     print(ET.tostring(root, encoding='utf-8').decode())
# else:
#     print("Failed to retrieve data:", response.status_code)


ParseError: ignored

In [None]:
import requests

# Disease Ontology ID
api_key = 'c9ac5621-48d4-4fbc-a4e1-0e143f272ee2'
doid = 'DOID:13223'

# BioPortal API endpoint for fetching disease information
# Note: You may need an API key for extensive use
url = f'http://data.bioontology.org/ontologies/DOID/classes/http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2F{doid}?apikey={api_key}'

# Making the request
response = requests.get(url, headers={'Accept': 'application/json'})

# Check if the request was successful
if response.status_code == 200:
    # Parsing the JSON response
    disease_data = response.json()
    # Process and print the data
    print(disease_data)
else:
    print("Failed to retrieve data:", response.status_code)


Failed to retrieve data: 404


In [None]:
set([d.split(":")[2] for d in diseases])

for d in diseases:
    if "MESH" not in d and "SARS" not in d:
        print(d)

Disease::DOID:1595
Disease::DOID:11476
Disease::DOID:8398
Disease::DOID:10534
Disease::DOID:11949
Disease::OMIM:155310
Disease::DOID:2377
Disease::DOID:8778
Disease::OMIM:181800
Disease::DOID:594
Disease::OMIM:143470
Disease::OMIM:188890
Disease::OMIM:141500
Disease::OMIM:132100
Disease::DOID:10976
Disease::OMIM:217000
Disease::OMIM:193235
Disease::OMIM:301050
Disease::DOID:3312
Disease::OMIM:142700
Disease::DOID:0060119
Disease::DOID:5559
Disease::DOID:8577
Disease::OMIM:167959
Disease::DOID:7147
Disease::DOID:263
Disease::DOID:13499
Disease::OMIM:612348
Disease::DOID:1312
Disease::OMIM:233300
Disease::OMIM:120970
Disease::DOID:0050742
Disease::DOID:4989
Disease::DOID:1781
Disease::OMIM:150800
Disease::DOID:14268
Disease::DOID:0060073
Disease::OMIM:191900
Disease::OMIM:258660
Disease::DOID:1826
Disease::DOID:11119
Disease::DOID:11612
Disease::DOID:0050156
Disease::DOID:1725
Disease::OMIM:168600
Disease::DOID:10608
Disease::DOID:5099
Disease::OMIM:609165
Disease::OMIM:252500
Disease::O