Step 1: Import Python Libraries into the Jupyter Notebook.

Note: Please install the required Python packages before executing this notebook.

In [1]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import html2text
import urllib.request
from user_agent import generate_user_agent
from sklearn.preprocessing import StandardScaler

Step 2: Define a function get_ADMET_Properties to web-scrape the relevant c-ADMET data for a given drug entry (represented by DB_ID) in the DrugBank database. If the c-ADMET properties for a given drug (represented by the DB_ID) are not published in the DrugBank database, then the data for that entry are zero-padded.

In [4]:
# Label -> (scale applied to the scraped probability, rounding digits or None).
# A positive class keeps the probability, a negative class flips its sign.
_ADMET_PLUS_MINUS = {"+": (1, None), "-": (-1, None)}
_ADMET_SUBSTRATE = {"Substrate": (1, None), "Non-substrate": (-1, None)}
_ADMET_INHIBITOR = {"Inhibitor": (1, None), "Non-inhibitor": (-1, None)}
# The two graded properties are not signed: the weaker class is scaled by 0.1.
_ADMET_PROMISCUITY = {"Low CYP Inhibitory Promiscuity": (0.1, 5),
                      "High CYP Inhibitory Promiscuity": (1, 4)}
_ADMET_HERG_STRENGTH = {"Weak inhibitor": (0.1, 5), "Strong inhibitor": (1, 4)}

# One entry per c-ADMET property, in output order:
# (row name on the page, label printed in verbose mode, label rules).
_ADMET_RULES = (
    ("Human Intestinal Absorption", "0) Human Intestinal Absorption Probability", _ADMET_PLUS_MINUS),
    ("Blood Brain Barrier", "1) Blood Brain Barrier Probability", _ADMET_PLUS_MINUS),
    ("Caco-2 permeable", "2) Caco-2 Permeability Probability", _ADMET_PLUS_MINUS),
    ("P-glycoprotein substrate", "3) P-glycoprotein Substrate Probability", _ADMET_SUBSTRATE),
    ("P-glycoprotein inhibitor I", "4) P-glycoprotein Inhibitor I Probability", _ADMET_INHIBITOR),
    ("P-glycoprotein inhibitor II", "5) P-glycoprotein Inhibitor II Probability", _ADMET_INHIBITOR),
    ("Renal organic cation transporter", "6) Renal organic cation transporter Probability", _ADMET_INHIBITOR),
    ("CYP450 2C9 substrate", "7) CYP450 2C9 Substrate Probability", _ADMET_SUBSTRATE),
    ("CYP450 2D6 substrate", "8) CYP450 2D6 Substrate Probability", _ADMET_SUBSTRATE),
    ("CYP450 3A4 substrate", "9) CYP450 3A4 Substrate Probability", _ADMET_SUBSTRATE),
    # This row is matched against Inhibitor/Non-inhibitor labels, exactly as the
    # original code did, even though the row is named "substrate".
    ("CYP450 1A2 substrate", "10) CYP450 1A2 Substrate Probability", _ADMET_INHIBITOR),
    ("CYP450 2C9 inhibitor", "11) CYP450 2C9 Inhibitor Probability", _ADMET_INHIBITOR),
    ("CYP450 2D6 inhibitor", "12) CYP450 2D6 Inhibitor Probability", _ADMET_INHIBITOR),
    ("CYP450 2C19 inhibitor", "13) CYP450 2C19 Inhibitor Probability", _ADMET_INHIBITOR),
    ("CYP450 3A4 inhibitor", "14) CYP450 3A4 Inhibitor Probability", _ADMET_INHIBITOR),
    ("CYP450 inhibitory promiscuity", "15) CYP450 Inhibitory Promiscuity Probability", _ADMET_PROMISCUITY),
    ("hERG inhibition (predictor I)", "16) hERG Inhibition (Predictor I) Probability", _ADMET_HERG_STRENGTH),
    ("hERG inhibition (predictor II)", "17) hERG Inhibition (Predictor II) Probability", _ADMET_INHIBITOR),
)


def _parse_admet_row(section, row_name):
    """Return (class label, probability text) for one row of the ADMET table."""
    # maxsplit=2 keeps piece [1] identical to an unlimited split: the text
    # between the first and second occurrence of the row name.
    pieces = section.split(row_name, 2)
    if len(pieces) < 2:
        raise ValueError("ADMET row %r not found" % row_name)
    cells = pieces[1].split("</td><td>")
    if len(cells) < 3:
        raise ValueError("ADMET row %r is malformed" % row_name)
    return cells[1], cells[2].split("</")[0]


def get_ADMET_Properties(content, verbose, drug_name=None):
    """Extract the 18 predicted c-ADMET values from a drug page's HTML.

    Each value is the scraped probability multiplied by a factor that depends on
    the class label shown next to it: +1/-1 for the two-class properties, and
    1 (rounded to 4 digits) / 0.1 (rounded to 5 digits) for "CYP450 inhibitory
    promiscuity" and "hERG inhibition (predictor I)".

    Parameters
    ----------
    content : str
        HTML text containing a ">Predicted ADMET features<" section.
    verbose : int
        When equal to 0 the parsed values are printed; any other value is silent.
    drug_name : str, optional
        Name printed as the heading in verbose mode. When omitted, the global
        ``Drug`` is used if it exists (the original behaviour inside the scrape
        loop); otherwise an empty heading is printed.

    Returns
    -------
    list of float
        The 18 values in the order of ``_ADMET_RULES``.

    Raises
    ------
    ValueError
        If the ADMET section or a row is missing or malformed, if a class label
        is not recognised, or if a probability is not numeric. (The original
        code raised IndexError / UnboundLocalError in these situations.)
    """
    sections = content.split(">Predicted ADMET features<")
    if len(sections) < 2:
        raise ValueError("no 'Predicted ADMET features' section in content")
    # Split once and reuse; the original re-split the whole page 18 times.
    section = sections[1]

    ADMET_Results = []
    for row_name, _display_label, label_rules in _ADMET_RULES:
        label, probability = _parse_admet_row(section, row_name)
        if label not in label_rules:
            raise ValueError("unrecognised label %r for ADMET row %r" % (label, row_name))
        scale, digits = label_rules[label]
        value = scale * float(probability)
        if digits is not None:
            value = round(value, digits)
        ADMET_Results.append(value)

    if verbose==0:
        if drug_name is None:
            # Fall back to the notebook-level variable set by the scrape loop.
            drug_name = globals().get("Drug", "")
        print(drug_name, "\n")
        for (_row_name, display_label, _rules), value in zip(_ADMET_RULES, ADMET_Results):
            print('{0:50}'.format(display_label), "=", value)

    return ADMET_Results

Step 3: Web-scrape the DrugBank database for all 1,710 drugs used to develop the DDI MLP (>192,000 DDIs).

In [5]:
# Scrape the page of every drug listed in the input CSV and collect its
# c-ADMET values in Dict_DBID_ADMET_Properties (DB_ID -> list of 18 values,
# or [] when the page could not be fetched or parsed).
prefix_target_url="https://www.drugbank.ca/drugs/"

# Browser-like User-Agent; it is attached to every request below (the original
# built this dict but never passed it to urlopen).
headers={}
headers["User-Agent"]=generate_user_agent()

# Seconds to wait for a response before giving up on one drug.
request_timeout=30

# Forward slashes work on Windows and POSIX; the original "\D" was an invalid
# escape sequence inside a non-raw string.
DB_ID_SMILES_df=pd.read_csv("1_1_1_Input/DB_ID_SMILES.csv")
x=DB_ID_SMILES_df.loc[:, "DB_ID"].values

Dict_DBID_ADMET_Properties={}

for i in range(len(x)):
    if i%500==0:
        print("Done ", i)

    Drug=x[i]
    url=prefix_target_url+Drug

    try:
        request = urllib.request.Request(url, headers=headers)
        # The context manager closes the connection even if decoding fails.
        with urllib.request.urlopen(request, timeout=request_timeout) as resource:
            # get_content_charset() returns None when the server sends no charset.
            charset = resource.headers.get_content_charset() or "utf-8"
            content = resource.read().decode(charset)

        results=get_ADMET_Properties(content, 1)
        Dict_DBID_ADMET_Properties[Drug]=results
    except Exception as exc:
        # Catch Exception rather than using a bare except so that Ctrl-C still
        # interrupts the run. Fetch failures are recorded like parse failures
        # instead of aborting the remaining drugs; the printed reason tells
        # them apart so that network errors can be re-run.
        Dict_DBID_ADMET_Properties[Drug]=[]
        print("Check Exceptions for Drug ", Drug, "-", type(exc).__name__, exc)
        

Done  0


Step 4: Zero padding the missing entries.

In [None]:
# Drugs whose scrape raised an exception were stored as an empty list in Step 3.
# Give each of them a vector of 18 zeros and report how many were padded.
count=0
for key in list(Dict_DBID_ADMET_Properties):
    if Dict_DBID_ADMET_Properties[key]==[]:
        Dict_DBID_ADMET_Properties[key]=[0]*18
        count+=1
print(count)

Step 5: Export the raw c-ADMET Properties as "Raw_DDI_MLP_c-ADMET.csv"

In [None]:
# Write one CSV row per drug: the DB_ID followed by its 18 c-ADMET values.
# NOTE(review): the file is written to the current working directory, whereas
# Step 6 reads it from the 1_1_2_Output folder - confirm where it should live.
ADMET_Properties=["DB_ID","Human Intestinal Absorption", "Blood Brain Barrier","Caco-2 permeable", "P-glycoprotein substrate", "P-glycoprotein inhibitor I",
                  "P-glycoprotein inhibitor II", "Renal organic cation transporter", "CYP450 2C9 substrate", "CYP450 2D6 substrate", "CYP450 3A4 substrate",
                  "CYP450 1A2 substrate", "CYP450 2C9 inhibitor", "CYP450 2D6 inhibitor", "CYP450 2C19 inhibitor", "CYP450 3A4 inhibitor", "CYP450 inhibitory promiscuity",
                  "hERG inhibition (predictor I)", "hERG inhibition (predictor II)"]
n_values=len(ADMET_Properties)-1

# "with" closes the file even if a row raises part-way through the export.
with open('Raw_DDI_MLP_c-ADMET.csv', 'w') as f:
    # The header is written even when there are no drugs, so the file is always
    # a readable CSV (the original left the file empty in that case).
    f.write(",".join(ADMET_Properties)+"\n")
    for key, val in Dict_DBID_ADMET_Properties.items():
        if len(val)<n_values:
            # Typically an un-padded [] entry: Step 4 has not been run.
            raise ValueError("Drug %s has %d c-ADMET values, expected %d" % (key, len(val), n_values))
        Line=",".join([str(key)]+[str(v) for v in val[:n_values]])+"\n"
        f.write(Line)

Step 6: Standardize the raw c-ADMET data to accelerate the machine learning process later on.

In [2]:
# Standardise every c-ADMET column to zero mean and unit variance and save the
# result next to the raw file.
# Forward slashes work on Windows and POSIX; the original "\R" was an invalid
# escape sequence inside a non-raw string.
ADMET_df=pd.read_csv("1_1_2_Output/Raw_DDI_MLP_c-ADMET.csv")
# Column 0 is the DB_ID; every remaining column is a numeric property.
ADMET_Prop_Columns=ADMET_df.columns[1:]

# NOTE(review): zero-padded drugs from Step 4 take part in the fit, so they
# influence each column's mean and variance - confirm this is intended.
ADMET_Prop_Values=ADMET_df.loc[:, ADMET_Prop_Columns].values
Standardized_ADMET_Prop_Values=StandardScaler().fit_transform(ADMET_Prop_Values)

# Re-attach the DB_ID column to the standardised values (both frames share the
# default 0..n-1 index, so the column-wise concat lines the rows up).
new_ADMET_df = ADMET_df.filter([ADMET_df.columns[0]], axis=1)
ADMET_Prop_df=pd.DataFrame(Standardized_ADMET_Prop_Values, columns=ADMET_Prop_Columns)
Standardized_ADMET_df=pd.concat([new_ADMET_df, ADMET_Prop_df], axis=1)
Standardized_ADMET_df.to_csv('1_1_2_Output/DDI_MLP_Standardized_c-ADMET.csv', index=False)