# In This Notebook We Create The Graph Database From The Data We Gathered

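#### Entities and Feature Tables (replace `/Path/To` with the absolute path of the folder that contains `data/`; some later cells use hard-coded `/home/emon/NeuThera-Drug-Discovery-Toolkit/data/` or `./data/` paths, which must be adjusted to your machine as well)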

In [None]:
# Install the packages listed in the requirements file first. If an import below still fails, the package is probably missing from that file: pip install it as well.

import os
import sys
import requests
import ast
import json
import hashlib
from datetime import datetime
from glob import glob
from io import StringIO
import boto3

import pandas as pd
import numpy as np

from arango import ArangoClient
from biomart import BiomartServer

from transformers import AutoTokenizer, AutoModel
import torch


from DeepPurpose import utils
from DeepPurpose import DTI as models

from rdkit import Chem, DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.Chem import Draw, AllChem

from Bio.PDB import MMCIFParser



In [None]:
# Disease DOID Synopses
# Print the working directory to help when checking the data path below.
print(os.getcwd())

# The placeholder now starts with "/Path/To", like the instruction in the
# section heading, so a single find-and-replace of "/Path/To" also fixes this
# path. header=0 combined with names= discards the header row found in the
# file and applies the column names listed here instead.
Doid = pd.read_csv(
    "/Path/To/data/D-DoMiner_miner-diseaseDOID.tsv.gz",
    compression="gzip",
    sep="\t",
    header=0,
    names=["doid", "name", "definition", "synonym"],
)

# Bare expression: displays the frame when run as a notebook cell.
Doid

In [None]:
# Disease MESH Miner

# The path follows the same "/Path/To/data/<file>" layout as the other data
# files; the previous placeholder repeated the "data" folder twice.
# NOTE(review): confirm the file really lives directly under data/.
# header=0 combined with names= replaces the file's header row with the
# column names listed here.
Mesh = pd.read_csv(
    "/Path/To/data/D-MeshMiner_miner-disease.tsv.gz",
    compression="gzip",
    sep="\t",
    header=0,
    names=["mesh", "name", "definition", "synonym"],
)

# Bare expression: displays the frame when run as a notebook cell.
Mesh

In [None]:
# OMIM Genetic Disorders

# The placeholder now starts with "/Path/To", like the instruction in the
# section heading, so a single find-and-replace of "/Path/To" also fixes this
# path. Six names are supplied; the last one ("_") is a throw-away column
# that is dropped right after loading.
Omim = pd.read_csv(
    "/Path/To/data/D-OmimMiner_miner-diseaseOMIM.tsv.gz",
    compression="gzip",
    sep="\t",
    header=0,
    names=["omim", "phenotypes", "gene_name", "gene", "location", "_"],
)

# Keep every column except the trailing "_" placeholder.
Omim = Omim.iloc[:, :-1]

# Bare expression: displays the frame when run as a notebook cell.
Omim

In [None]:
# Gene
# Load the gene table and rename its identifier columns to the short names
# used by the rest of the notebook (ensg / gene / gene_name).

Gene = pd.read_csv(
    "/home/emon/NeuThera-Drug-Discovery-Toolkit/data/G-SynMiner_miner-geneHUGO.tsv.gz",
    sep="\t",
    compression="gzip",
).rename(
    columns={
        "# ensembl_gene_id": "ensg",
        "symbol": "gene",
        "name": "gene_name",
    }
)

# Bare expression: displays the frame when run as a notebook cell.
Gene

### Adding Embeddings

In [None]:
# Tokenizer and encoder are loaded once and shared by every call below.
tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
model = AutoModel.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")


def get_chemberta_embedding(smiles):
    """Return the mean-pooled ChemBERTa hidden state for one SMILES string.

    Args:
        smiles: The molecule as a SMILES string.

    Returns:
        A list of floats (the last hidden state averaged over dimension 1),
        or ``None`` when ``smiles`` is not a string or is blank.
    """
    usable = isinstance(smiles, str) and bool(smiles.strip())
    if not usable:
        return None

    encoded = tokenizer(smiles, return_tensors="pt", padding=True, truncation=True)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    pooled = hidden_states.mean(dim=1)
    # A single input string gives a batch of one; return its only row.
    return pooled.tolist()[0]

In [None]:
# DrugBank
from tqdm.notebook import tqdm

# DrugBank vocabulary. header=0 plus names= replaces the file's header row
# with the short column names below; "key" is the column used for the join.
DrugBank = pd.read_csv(
    "/home/emon/NeuThera-Drug-Discovery-Toolkit/data/drugbank_all_drugbank_vocabulary.csv.zip",
    compression="zip",
    sep=",",
    header=0,
    names=["drug", "accession", "drug_name", "cas", "unii", "synonym", "key"],
)

# ChEMBL chemical representations. The "smiles" column lives in this table,
# so this is where the string dtype has to be requested (it used to be passed
# to the DrugBank read above, which has no such column).
ChemRepresentation = pd.read_csv(
    "/home/emon/NeuThera-Drug-Discovery-Toolkit/data/chembl_35_chemreps.txt.gz",
    compression="gzip",
    sep="\t",
    header=0,
    names=["chembl", "smiles", "inchi", "key"],
    dtype={"smiles": str},
)

# pandas matches null join keys against each other, so DrugBank rows without
# a key are removed first; they could only ever produce spurious pairs.
Drug = DrugBank.dropna(subset=["key"]).merge(ChemRepresentation, on="key", how="inner")
Drug = Drug.dropna(subset=["smiles"])
Drug["smiles"] = Drug["smiles"].astype(str).str.strip()
# Every row is flagged generated=False.
Drug["generated"] = False

# One embedding per row; get_chemberta_embedding returns None for blank SMILES.
embeddings = [
    get_chemberta_embedding(smiles)
    for smiles in tqdm(Drug["smiles"], total=len(Drug), desc="Generating embeddings")
]

Drug["embedding"] = embeddings

# Bare expression: displays the frame when run as a notebook cell.
Drug


In [None]:
# PDB chain <-> Ensembl mapping table.
# header=1: the column names are taken from the second line of the file, and
# the line before it is skipped.
PDB = pd.read_csv(
    "/home/emon/NeuThera-Drug-Discovery-Toolkit/data/pdb_chain_ensembl.csv.gz",
    header=1,
    sep=",",
    compression="gzip",
)

# Preview the first rows.
PDB.head(5)

### Now Adding Interactions

In [None]:
# Drug-Drug interaction
# No header= is passed, so with names= every line of the file is read as data.

ChCh = pd.read_csv(
    "/home/emon/NeuThera-Drug-Discovery-Toolkit/data/ChCh-Miner_durgbank-chem-chem.tsv.gz",
    names=["drug", "drug_target"],
    sep="\t",
    compression="gzip",
)

# Preview the first rows.
ChCh.head(5)

In [None]:
# Drug-Gene interaction
# Read the drug/UniProt pairs, translate each UniProt id to a gene symbol via
# the Gene table, then keep only the rows that have no missing values.

ChG = (
    pd.read_csv(
        "/home/emon/NeuThera-Drug-Discovery-Toolkit/data/ChG-Miner_miner-chem-gene.tsv.gz",
        names=["drug", "uniprot_ids"],
        header=0,
        sep="\t",
        compression="gzip",
    )
    .merge(Gene[["uniprot_ids", "gene"]], how="left", on="uniprot_ids")
    .drop(columns=["uniprot_ids"])  # only the gene symbol is needed from here on
    .dropna()  # rows whose UniProt id found no gene
)

# Preview the first rows.
ChG.head(5)



In [None]:
# Drug-Gene interaction (duplicate cell)
# NOTE(review): this cell was labelled "Disease-Drug interaction", but it loads
# the same chem-gene file and rebuilds the same ChG frame as the Drug-Gene
# cell. The real disease-drug table (DCh) is loaded in the next cell.
ChG = pd.read_csv(
    "/home/emon/NeuThera-Drug-Discovery-Toolkit/data/ChG-Miner_miner-chem-gene.tsv.gz",
    compression="gzip",
    sep="\t",
    header=0,
    names=["drug", "uniprot_ids"],
)

# Translate UniProt ids to gene symbols, then drop the id column and any row
# left with a missing value.
ChG = ChG.merge(Gene[['uniprot_ids', 'gene']], on='uniprot_ids', how='left')
ChG.drop(columns=['uniprot_ids'], inplace=True)
ChG.dropna(inplace=True)

ChG.head(5)

In [None]:
# Disease-Drug interaction
# header=0 plus names= replaces the file's header row with the names below.

DCh = pd.read_csv(
    "/home/emon/NeuThera-Drug-Discovery-Toolkit/data/DCh-Miner_miner-disease-chemical.tsv.gz",
    names=["mesh", "drug"],
    header=0,
    sep="\t",
    compression="gzip",
)

# Preview the first rows.
DCh.head(5)

In [None]:
# Disease-Disease interaction
# header=0 plus names= replaces the file's header row with the names below.

DD = pd.read_csv(
    "/home/emon/NeuThera-Drug-Discovery-Toolkit/data/DD-Miner_miner-disease-disease.tsv.gz",
    names=["doid", "doid_target"],
    header=0,
    sep="\t",
    compression="gzip",
)

# Preview the first rows.
DD.head(5)

In [None]:
# Disease-Function interaction
# header=0 plus names= replaces the file's header row with the names below.

DF = pd.read_csv(
    "/home/emon/NeuThera-Drug-Discovery-Toolkit/data/DF-Miner_miner-disease-function.tsv.gz",
    names=["mesh", "go"],
    header=0,
    sep="\t",
    compression="gzip",
)

# Preview the first rows.
DF.head(5)

In [None]:
This code combines disease-gene interaction with gene information  

In [None]:
# Disease-Gene interaction
# Read the disease/UniProt pairs, translate each UniProt id to a gene symbol
# via the Gene table, then keep only the rows that have no missing values.

DG = (
    pd.read_csv(
        "/home/emon/NeuThera-Drug-Discovery-Toolkit/data/DG-Miner_miner-disease-gene.tsv.gz",
        names=["mesh", "uniprot_ids"],
        header=0,
        sep="\t",
        compression="gzip",
    )
    .merge(Gene[["uniprot_ids", "gene"]], how="left", on="uniprot_ids")
    .drop(columns=["uniprot_ids"])  # only the gene symbol is needed from here on
    .dropna()  # rows whose UniProt id found no gene
)

# Preview the first rows.
DG.head(5)

In [None]:
# Function-Function interaction
# header=0 plus names= replaces the file's header row with the names below.
FF = pd.read_csv(
    "/home/emon/NeuThera-Drug-Discovery-Toolkit/data/FF-Miner_miner-func-func.tsv.gz",
    names=["go", "go_target"],
    header=0,
    sep="\t",
    compression="gzip",
)

# Preview the first rows.
FF.head(5)

In [None]:
# Human gene-function interaction
# The file's own header row is kept here (no names=), and selected columns
# are renamed to readable names afterwards.

GF = pd.read_csv(
    "./data/GF-Miner_miner-gene-function.tsv.gz",
    sep="\t",
    compression="gzip",
    skipinitialspace=True,
).rename(
    columns={
        "# GO_ID": "go",
        "Gene": "gene",
        "C8": "go_category",
        "C10": "protein",
        "C12": "organism",
        "C13": "date",
    }
)

# Keep only rows whose organism is "taxon:9606", and of those only the GO
# term / gene pair needed for the edges.
GF = GF.loc[GF["organism"] == "taxon:9606", ["go", "gene"]]

# Preview the first rows.
GF.head(5)


This cell processes gene-protein relationships by merging gene symbols and PDB structural data

In [None]:
# Gene-Protein interaction

GP = pd.read_csv(
    "./data/GP-Miner_miner-gene-protein.tsv.gz",
    names=["ensg", "ensp"],
    header=0,
    sep="\t",
    compression="gzip",
)

# Keep an independent copy of the ENSP ids; the Protein table is built from it.
ENSP = GP.loc[:, ["ensp"]].copy()

# ensg -> gene symbol (Gene table), ensp -> PDB id (PDB table). Only the
# resulting gene/pdb pairs are kept, without missing values or duplicates.
GP = (
    GP.merge(Gene[["ensg", "gene"]], how="left", on="ensg")
    .merge(PDB, how="left", left_on="ensp", right_on="TRANSLATION_ID")
    .loc[:, ["gene", "PDB"]]
    .rename(columns={"PDB": "pdb"})
    .dropna()
    .drop_duplicates()
)

# Preview the first rows.
GP.head(5)

This cell merges the ENSP IDs with PDB structural data, keeping only matching entries. It extracts and renames the PDB column to pdb, removes duplicate pdb values, and retains only the pdb column in the final DataFrame.

In [None]:
# Protein nodes: one row per distinct PDB id reachable from the ENSP ids.
Protein = (
    ENSP.merge(PDB, how="inner", left_on="ensp", right_on="TRANSLATION_ID")
    .loc[:, ["ensp", "PDB"]]
    .rename(columns={"PDB": "pdb"})
    .drop_duplicates(subset=["pdb"])  # first occurrence of each pdb id wins
    .loc[:, ["pdb"]]
)

# Preview the first rows.
Protein.head(5)

# Adding To Database

In [None]:
# Connection settings default to the values this notebook has always used.
# They can be overridden with the ARANGO_HOST / ARANGO_USER / ARANGO_PASSWORD
# environment variables, so real credentials need not be written in the notebook.
arango_host = os.environ.get("ARANGO_HOST", "http://localhost:8529")
arango_user = os.environ.get("ARANGO_USER", "root")
arango_password = os.environ.get("ARANGO_PASSWORD", "perplexity")

# Databases are created through the _system database.
sys_db = ArangoClient(hosts=arango_host).db(
    '_system', username=arango_user, password=arango_password
)

# Check if the database exists
if not sys_db.has_database('Remedi'):
    sys_db.create_database('Remedi')
    print("'Remedi' db created successfully!")
else:
    print("Database already there.")

In [None]:
# Handle on the 'Remedi' database used by all following cells. Host and
# credentials default to the notebook's original values and can be overridden
# with the ARANGO_HOST / ARANGO_USER / ARANGO_PASSWORD environment variables.
db = ArangoClient(hosts=os.environ.get("ARANGO_HOST", "http://localhost:8529")).db(
    'Remedi',
    username=os.environ.get("ARANGO_USER", "root"),
    password=os.environ.get("ARANGO_PASSWORD", "perplexity"),
)

This code ensures that the necessary ArangoDB collections exist and then inserts nodes from multiple DataFrames (Drug, Gene, Doid, Mesh, Omim, Protein). It processes the data in batches, converting each row into a node document with a unique `_key`, and reports any error raised during insertion. Note: if a cell fails with a NameError (for example `name 'Gene' is not defined`), re-run the cell that loads that DataFrame; it is most likely no longer in memory.

In [None]:
# Adding Nodes
from tqdm.notebook import tqdm

# Vertex collections that receive nodes in this notebook.
# NOTE(review): the edge cells further down also reference a "go" collection
# that is not listed here; confirm whether GO term nodes should be loaded too.
collections = ["drug", "gene", "disease", "protein"]

# Create only the collections that are not present yet.
for vertex_collection in collections:
    if db.has_collection(vertex_collection):
        continue
    db.create_collection(vertex_collection)

def add_nodes(df, label, key_column, batch_size=100000):
    """Insert every row of ``df`` as a document in the ``label`` collection.

    Each row becomes one document: the value in ``key_column`` (as a string)
    is the ``_key`` and all remaining columns are stored as attributes, with
    missing values replaced by the string ``"NaN"``. Rows whose key is missing
    all get the key ``"NotAvailable"``; because inserts use ``overwrite=True``
    they end up as a single document.

    Loading is best-effort: a batch that fails is reported with ``print`` and
    then discarded, and loading continues with the following rows.

    Args:
        df: DataFrame holding one node per row.
        label: Name of the target collection in the global ``db``.
        key_column: Column whose value becomes the document ``_key``.
        batch_size: Number of documents sent per ``insert_many`` call.
    """

    def flush(batch, error_prefix):
        """Send ``batch`` to the collection, report the outcome, empty it."""
        try:
            result = db[label].insert_many(batch, overwrite=True)
        except Exception as e:  # best-effort load: report and carry on
            print(f"{error_prefix}: {e}")
        else:
            # insert_many can report per-document failures as exception
            # objects inside the returned list instead of raising them.
            rejected = (
                sum(isinstance(item, Exception) for item in result)
                if isinstance(result, list)
                else 0
            )
            print(f"Inserted {len(batch) - rejected} nodes into {label}")
            if rejected:
                print(f"{rejected} nodes were rejected by {label}")
        finally:
            # Always empty the batch. Previously a failed batch was kept, so
            # every following row re-sent the same (growing) batch.
            batch.clear()

    batch = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc=f"Inserting {label} nodes"):
        key_value = row[key_column]
        node_key = str(key_value) if pd.notna(key_value) else "NotAvailable"
        attributes = row.fillna("NaN").drop(key_column).to_dict()

        batch.append({"_key": node_key, **attributes})

        if len(batch) >= batch_size:
            flush(batch, "Error inserting batch")

    # Send whatever is left after the last full batch.
    if batch:
        flush(batch, "Error inserting final batch")
    

# Load each entity table into its vertex collection. The third argument names
# the column whose value becomes the document _key.
add_nodes(Drug, "drug", "drug")
add_nodes(Gene, "gene", "gene")
# The three disease tables (Doid, Mesh, Omim) all go into the single
# "disease" collection, each keyed by its own identifier column.
add_nodes(Doid, "disease", "doid")
add_nodes(Mesh, "disease", "mesh")
add_nodes(Omim, "disease", "omim")
add_nodes(Protein, "protein", "pdb")

This code creates the edge collections in ArangoDB if they don't exist, then inserts edges from the various interaction DataFrames.

In [None]:
# One edge collection per relationship type, named "<source>-<target>".
edge_collections = [
    "drug-drug",
    "drug-gene",
    "drug-protein",
    "disease-drug",
    "disease-disease",
    "disease-function",
    "disease-gene",
    "function-function",
    "gene-function",
    "gene-protein",
]

# Create only the edge collections that are not present yet.
for edge_collection_name in edge_collections:
    if db.has_collection(edge_collection_name):
        continue
    db.create_collection(edge_collection_name, edge=True)

def add_edges(df, src_label, dst_label, src_col, dst_col, edge_collection, batch_size=100000):
    """Insert every row of ``df`` as an edge in ``edge_collection``.

    For each row the edge runs from ``"<src_label>/<row[src_col]>"`` to
    ``"<dst_label>/<row[dst_col]>"``. All other columns are stored as edge
    attributes, with missing values replaced by the string ``"NaN"``.

    Loading is best-effort: a batch that fails is reported with ``print`` and
    then discarded, and loading continues with the following rows.

    Args:
        df: DataFrame holding one edge per row.
        src_label: Collection name used as prefix of ``_from``.
        dst_label: Collection name used as prefix of ``_to``.
        src_col: Column holding the source node key.
        dst_col: Column holding the target node key.
        edge_collection: Name of the target edge collection in the global ``db``.
        batch_size: Number of edges sent per ``insert_many`` call.
    """

    def flush(batch, error_prefix):
        """Send ``batch`` to the edge collection, report the outcome, empty it."""
        try:
            result = db[edge_collection].insert_many(batch, overwrite=True)
        except Exception as e:  # best-effort load: report and carry on
            print(f"{error_prefix}: {e}")
        else:
            # insert_many can report per-document failures as exception
            # objects inside the returned list instead of raising them.
            rejected = (
                sum(isinstance(item, Exception) for item in result)
                if isinstance(result, list)
                else 0
            )
            print(f"Inserted {len(batch) - rejected} edges into {edge_collection}")
            if rejected:
                print(f"{rejected} edges were rejected by {edge_collection}")
        finally:
            # Always empty the batch. Previously a failed batch was kept, so
            # every following row re-sent the same (growing) batch.
            batch.clear()

    batch = []

    # NOTE(review): the edges carry no _key, so overwrite=True presumably has
    # nothing to match on and re-running this load adds the same edges again.
    # Confirm whether re-runs are expected to be idempotent.
    for _, row in df.iterrows():
        attributes = row.fillna("NaN").drop([src_col, dst_col]).to_dict()

        batch.append(
            {
                "_from": f"{src_label}/{row[src_col]!s}",
                "_to": f"{dst_label}/{row[dst_col]!s}",
                **attributes,
            }
        )

        if len(batch) >= batch_size:
            flush(batch, "Error inserting batch")

    # Send whatever is left after the last full batch.
    if batch:
        flush(batch, "Error inserting final batch")

# Load every relationship table into its edge collection.
# Argument order: frame, source collection, target collection,
# source column, target column, edge collection.
add_edges(ChCh, "drug", "drug", "drug", "drug_target", "drug-drug")
add_edges(ChG, "drug", "gene", "drug", "gene", "drug-gene")
add_edges(DCh, "disease", "drug", "mesh", "drug", "disease-drug")
add_edges(DD, "disease", "disease", "doid", "doid_target", "disease-disease")
add_edges(DF, "disease", "go", "mesh", "go", "disease-function")
add_edges(DG, "disease", "gene", "mesh", "gene", "disease-gene")
add_edges(FF, "go", "go", "go", "go_target", "function-function")
# gene-function edges run gene -> go, matching the collection's name and the
# ("gene-function", "gene", "go") edge definition registered on the graph.
# They were previously inserted in the opposite direction (go -> gene).
add_edges(GF, "gene", "go", "gene", "go", "gene-function")
add_edges(GP, "gene", "protein", "gene", "pdb", "gene-protein")

Inserted 48514 edges into drug-drug
Inserted 15180 edges into drug-gene
Inserted 100000 edges into disease-drug
Inserted 100000 edges into disease-drug
Inserted 100000 edges into disease-drug
Inserted 100000 edges into disease-drug
Inserted 66657 edges into disease-drug
Inserted 6877 edges into disease-disease
Inserted 100000 edges into disease-function
Inserted 100000 edges into disease-function
Inserted 100000 edges into disease-function
Inserted 100000 edges into disease-function
Inserted 100000 edges into disease-function
Inserted 100000 edges into disease-function
Inserted 100000 edges into disease-function
Inserted 100000 edges into disease-function
Inserted 2760 edges into disease-function
Inserted 100000 edges into disease-gene
Inserted 100000 edges into disease-gene
Inserted 100000 edges into disease-gene
Inserted 100000 edges into disease-gene
Inserted 100000 edges into disease-gene
Inserted 100000 edges into disease-gene
Inserted 100000 edges into disease-gene
Inserted 10000

In [None]:
from tqdm.notebook import tqdm
import time

# Create the named graph once, then fetch a handle to it.
if not db.has_graph("Remedi"):
    db.create_graph("Remedi")

neuthera_graph = db.graph("Remedi")

# (edge collection, source vertex collection, target vertex collection)
edge_definitions = [
    ("drug-drug", "drug", "drug"),
    ("drug-gene", "drug", "gene"),
    ("disease-drug", "disease", "drug"),
    ("disease-disease", "disease", "disease"),
    ("disease-function", "disease", "go"),
    ("disease-gene", "disease", "gene"),
    ("function-function", "go", "go"),
    ("gene-function", "gene", "go"),
    ("gene-protein", "gene", "protein"),
    ("drug-protein", "drug", "protein")
]

# start_time was never set before, which is what the NameError recorded
# under this cell refers to.
start_time = time.time()

for edge_col, from_col, to_col in tqdm(edge_definitions, desc="Creating edge definitions"):
    # edge_definitions() returns a list of dicts, so testing
    # "edge_col not in ..." with a plain string was always true and a re-run
    # tried to create definitions that already existed. Ask the graph directly.
    if not neuthera_graph.has_edge_definition(edge_col):
        neuthera_graph.create_edge_definition(
            edge_collection=edge_col,
            from_vertex_collections=[from_col],
            to_vertex_collections=[to_col]
        )

end_time = time.time()

print("Remedi db created and linked with node and edges")
print(f"Edge definitions processed in {end_time - start_time:.2f} seconds")

Creating edge definitions:   0%|          | 0/10 [00:00<?, ?it/s]

Remedi db created and linked with node and edges


NameError: name 'start_time' is not defined

# The database is now built. You can access it at localhost:8529 (username: root, password: perplexity). The next step is to run the Flask app, main.py.