# Search drugs for neglected diseases in ChEMBL

### Import modules

In [1]:
import pandas as pd
import sqlite3

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
# NOTE(review): this is a global switch, so the warning is hidden in every later
# cell as well — confirm that is intended.
pd.options.mode.chained_assignment = None

### Load ChEMBL Database (version 33)
The database can be downloaded from the ChEMBL FTP server [here](https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/). Note that the `latest` link may point to a newer release; this notebook uses version 33.

In [2]:
# Open the ChEMBL SQLite file read-only through a URI. With a plain path,
# sqlite3 silently creates a new, empty database when the file is missing;
# with mode=ro the connection fails immediately instead, and the notebook
# cannot modify the downloaded database by accident.
conn = sqlite3.connect('file:../data/ChEMBL/v33/chembl_33.db?mode=ro', uri=True)

In [3]:
# Ask SQLite's schema catalog (sqlite_master) for the name of every table.
cursor = conn.execute(
    "SELECT name from sqlite_master WHERE type='table';"
)

In [4]:
# Sanity check: the database must expose more than one table; an empty or
# wrong file would return no rows here.
# Note: fetchall() drains the cursor, so this cell only passes when it runs
# right after the query cell.
table_names = cursor.fetchall()
assert len(table_names) > 1, (
    f"Expected more than one table in the database, found {len(table_names)}"
)

In [5]:
# Query text joining MOLECULE_DICTIONARY with DRUG_INDICATION, INDICATION_REFS
# and COMPOUND_STRUCTURES. All joins are inner joins, so molecules lacking an
# indication, an indication reference or a structure record are not returned.
# Each indication reference contributes its own row, so the same
# (molecule, indication) pair can appear more than once in the result.
sql = """
SELECT 
    MOLECULE_DICTIONARY.CHEMBL_ID,
    MOLECULE_DICTIONARY.PREF_NAME,
    COMPOUND_STRUCTURES.CANONICAL_SMILES,
    COMPOUND_STRUCTURES.STANDARD_INCHI,
    COMPOUND_STRUCTURES.STANDARD_INCHI_KEY,
    MOLECULE_DICTIONARY.MAX_PHASE,
    MOLECULE_DICTIONARY.INDICATION_CLASS,
    MOLECULE_DICTIONARY.WITHDRAWN_FLAG,
    MOLECULE_DICTIONARY.BLACK_BOX_WARNING,
    DRUG_INDICATION.DRUGIND_ID,
    DRUG_INDICATION.MAX_PHASE_FOR_IND,
    DRUG_INDICATION.MESH_ID,
    DRUG_INDICATION.MESH_HEADING,
    INDICATION_REFS.REF_TYPE,
    INDICATION_REFS.REF_ID,
    INDICATION_REFS.REF_URL
FROM MOLECULE_DICTIONARY
JOIN DRUG_INDICATION ON MOLECULE_DICTIONARY.molregno == DRUG_INDICATION.molregno
JOIN INDICATION_REFS ON DRUG_INDICATION.DRUGIND_ID == INDICATION_REFS.DRUGIND_ID
JOIN COMPOUND_STRUCTURES ON MOLECULE_DICTIONARY.molregno == COMPOUND_STRUCTURES.molregno
"""

In [6]:
# Run the query on the open connection and load the full result into a DataFrame.
chembl_data = pd.read_sql(sql, conn)

### MESH IDs for diseases

In [7]:
# MeSH descriptor IDs used to select the diseases of interest. Each ID is
# paired with the label it was annotated with; only the IDs end up in the list,
# in the order given here.
who_neglected_disease = [
    mesh_id
    for mesh_id, _label in (
        ("D054312", "Buruli Ulcer"),
        ("D014355", "Chagas Disease"),
        ("D065632", "Chikungunya"),
        ("D003715", "Dengue"),
        ("D002862", "Chromoblastomycosis"),
        ("D004320", "Dracunculiasis"),
        ("D004443", "Echinococcosis"),
        ("D014353", "Human African trypanosomiasis"),
        ("D016833", "Trypanosoma brucei rhodesiense"),
        ("D014347", "Trypanosoma brucei gambiense"),
        ("D016773", "Leishmaniasis (cutaneous)"),
        ("D007898", "Leishmaniasis (visceral)"),
        ("D007918", "Leprosy"),
        ("D004605", "Lymphatic filariasis (LF) / Elephantiasis, Filarial"),
        ("D008271", "Mycetoma"),
        ("D009855", "Onchocerciasis"),
        ("D011818", "Rabies"),
        ("D012532", "Scabies"),
        ("D012552", "Schistosomiasis"),
        ("D012909", "Snakebite envenoming"),
        ("D013622", "Taeniasis"),
        ("D003551", "Cysticercosis"),
        ("D014141", "Trachoma"),
        ("D015001", "Yaws"),
        ("D013322", "Strongyloidiasis"),
    )
]

### Subset the database to diseases of interest with clinical drugs

In [8]:
# Row filters for the diseases of interest and the clinical-phase cut-offs.
# The thresholds are unchanged; the comments now describe what they select.

# Indication is one of the selected MeSH IDs.
is_ntd_indication = chembl_data['mesh_id'].isin(who_neglected_disease)

# Strictly greater than 1, so indications at phase 2 are kept as well as
# phases 3 and 4.
indication_past_phase_1 = chembl_data['max_phase_for_ind'] > 1

# Phase 3 or higher for the molecule overall, so this is not limited to
# approved drugs (assuming 4 is the approved phase in ChEMBL's coding — confirm).
molecule_phase_3_plus = chembl_data['max_phase'] >= 3.0

# NOTE(review): if the intent really was "phase 3 or 4" for the indication and
# "approved only" for the molecule, the thresholds need to change, which would
# also change the counts reported further down.

# .copy() makes `subset` an independent frame rather than a filtered view of
# chembl_data, so later modifications of `subset` are unambiguous.
subset = chembl_data[
    is_ntd_indication & indication_past_phase_1 & molecule_phase_3_plus
].copy()
subset.head(2)

Unnamed: 0,chembl_id,pref_name,canonical_smiles,standard_inchi,standard_inchi_key,max_phase,indication_class,withdrawn_flag,black_box_warning,drugind_id,max_phase_for_ind,mesh_id,mesh_heading,ref_type,ref_id,ref_url
4372,CHEMBL1200689,NITRIC OXIDE,[N]=O,InChI=1S/NO/c1-2,MWUXSHHQAYIFBG-UHFFFAOYSA-N,4.0,,0,0,23580,3.0,D016773,"Leishmaniasis, Cutaneous",ClinicalTrials,NCT00317629,https://clinicaltrials.gov/ct2/results?id=%22N...
7584,CHEMBL131,PREDNISOLONE,C[C@]12C=CC(=O)C=C1CC[C@@H]1[C@@H]2[C@@H](O)C[...,InChI=1S/C21H28O5/c1-19-7-5-13(23)9-12(19)3-4-...,OIGNJSKKLXVSLS-VWUMJDOOSA-N,4.0,Glucocorticoid,0,0,24801,2.0,D007918,Leprosy,ClinicalTrials,"NCT00919542,NCT00919776,NCT00919815",https://clinicaltrials.gov/ct2/results?id=%22N...


In [9]:
# Keep a single row per (compound, disease) pair; keep='first' retains the
# first row encountered for each pair and discards the rest, so the reference
# columns of the dropped rows are lost.
# Reassign rather than using inplace=True: the result is the same, but the
# cell no longer mutates a filtered frame in place.
subset = subset.drop_duplicates(subset=['chembl_id', "mesh_id"], keep='first')

In [10]:
# Number of rows in `subset` for each disease (MeSH heading).
subset["mesh_heading"].value_counts()

mesh_heading
Leprosy                     8
Leishmaniasis, Cutaneous    7
Chagas Disease              6
Leishmaniasis, Visceral     5
Onchocerciasis              5
Elephantiasis, Filarial     5
Cysticercosis               5
Schistosomiasis             5
Buruli Ulcer                5
Strongyloidiasis            3
Trypanosomiasis, African    3
Dengue                      2
Yaws                        2
Mycetoma                    1
Scabies                     1
Name: count, dtype: int64

In [11]:
# Number of distinct ChEMBL compound IDs in `subset`.
subset["chembl_id"].nunique()

46

In [12]:
# Write the final table to disk as a tab-separated file, without the DataFrame index.
subset.to_csv(
    "../data/ntd_drugs.tsv",
    sep='\t',
    index=False,
)