In [1]:
import os
import json
import requests
import pandas as pd
from sqlalchemy import create_engine, text
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen
import joblib

## PLAS 5K

In [6]:
# Load the PLAS-5k table: one row per PDB id with binding-affinity components
# (columns are labelled in kcal/mol in the CSV header).
plas5k = pd.read_csv('./bfe_data/plas_5k.csv')

plas5k.head()

Unnamed: 0,pdbid,binding_affinity (kcal/mol),binding_affinity_sd (kcal/mol),electrostatic (kcal/mol),electrostatic_sd (kcal/mol),polar_solvation (kcal/mol),polar_solvation_sd (kcal/mol),non_polar_solvation (kcal/mol),non_polar_solvation_sd (kcal/mol),vdW (kcal/mol)
0,6g3f,-0.2557,1.19548,-0.98408,1.12502,-0.34568,0.243,1.26248,1.1208,-0.18842
1,5fpd,-2.61456,2.6715,-1.01432,1.22772,-0.56942,0.40202,1.62432,1.48674,-2.6551
2,6hxe,-0.98558,2.57244,-8.41628,3.52346,-0.81082,0.32616,7.9022,3.00116,0.3393
3,5nqb,-1.0592,1.90178,-36.85828,13.2379,-0.46456,0.24828,34.0379,11.65862,2.22572
4,4q3f,-1.52816,1.87006,-5.36862,3.20528,-0.74246,0.32006,5.44818,2.73016,-0.8653


In [7]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd

def fetch_standardized_smiles(pdbid):
    """Download one PDB entry from RCSB and convert it to a SMILES string.

    The complete PDB file is parsed into a single RDKit molecule (without
    sanitization) and written out as a non-canonical SMILES string.

    Parameters
    ----------
    pdbid : str
        PDB identifier; it is interpolated into the RCSB download URL.

    Returns
    -------
    str or None
        The SMILES string, or ``None`` when the download fails, the server
        answers with an HTTP error status, RDKit cannot parse the file, or
        the conversion raises.
    """
    url = f"https://files.rcsb.org/download/{pdbid}.pdb"
    try:
        r = requests.get(url, timeout=5)
        # Without this check a 404/5xx error page would be handed to the PDB
        # parser as if it were a structure.
        r.raise_for_status()
        mol = Chem.MolFromPDBBlock(r.text, sanitize=False)
        if not mol:
            return None
        # NOTE(review): the whole entry is converted and canonical=False is
        # used, so the string is presumably not comparable to ligand SMILES
        # from other sources -- confirm this is the intended representation.
        return Chem.MolToSmiles(mol, canonical=False)
    except Exception:
        # Deliberate best-effort: one bad entry must not abort a batch fetch.
        # Unlike a bare ``except:``, this lets KeyboardInterrupt/SystemExit
        # propagate.
        return None


def parallel_fetch(pdb_ids, max_workers=16):
    """Fetch SMILES for many PDB ids concurrently.

    One ``fetch_standardized_smiles`` task is submitted per element of
    ``pdb_ids`` to a thread pool of ``max_workers`` threads.

    Returns a dict mapping each PDB id to whatever its task returned, filled
    in completion order. An exception raised by a task propagates to the
    caller.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {}
        for entry_id in pdb_ids:
            pending[pool.submit(fetch_standardized_smiles, entry_id)] = entry_id
        # as_completed yields each future as soon as it finishes.
        return {pending[done]: done.result() for done in as_completed(pending)}


from rdkit.Chem import MolToInchiKey, SmilesParserParams, MolFromSmiles
def inchikey(smi):
    """Return the InChIKey for a SMILES string, or ``None`` if it cannot be made.

    The SMILES is parsed with ``removeHs`` enabled and the key is generated
    with the InChI option string ``"-FixedH"``.

    Parameters
    ----------
    smi : str
        SMILES string. Non-string values (e.g. ``None`` or NaN from a
        DataFrame column) yield ``None``.

    Returns
    -------
    str or None
        The InChIKey, or ``None`` when parsing or key generation fails.
    """
    if not isinstance(smi, str):
        return None
    try:
        params = SmilesParserParams()
        params.removeHs = True
        mol = MolFromSmiles(smi, params)
        if mol is None:
            # Unparseable SMILES: stop here rather than relying on
            # MolToInchiKey raising on a None molecule.
            return None
        return MolToInchiKey(mol, options="-FixedH")
    except Exception:
        # Best-effort helper: any RDKit failure maps to a missing key.
        return None

# Usage
pdb_ids = plas5k["pdbid"].unique()
smiles_map = parallel_fetch(pdb_ids)
plas5k["smiles"] = plas5k["pdbid"].map(smiles_map)
plas5k.head()

[10:36:59] 

****
Post-condition Violation
Element 'X' not found
Violation occurred on line 93 in file /Users/runner/work/rdkit-pypi/rdkit-pypi/build/temp.macosx-10.9-x86_64-cpython-311/rdkit/Code/GraphMol/PeriodicTable.h
Failed Expression: anum > -1
****

[10:38:23] 

****
Post-condition Violation
Element 'X' not found
Violation occurred on line 93 in file /Users/runner/work/rdkit-pypi/rdkit-pypi/build/temp.macosx-10.9-x86_64-cpython-311/rdkit/Code/GraphMol/PeriodicTable.h
Failed Expression: anum > -1
****



Unnamed: 0,pdbid,binding_affinity (kcal/mol),binding_affinity_sd (kcal/mol),electrostatic (kcal/mol),electrostatic_sd (kcal/mol),polar_solvation (kcal/mol),polar_solvation_sd (kcal/mol),non_polar_solvation (kcal/mol),non_polar_solvation_sd (kcal/mol),vdW (kcal/mol),smiles
0,6g3f,-0.2557,1.19548,-0.98408,1.12502,-0.34568,0.243,1.26248,1.1208,-0.18842,N[C@H](C(=O)N1[C@H](C(=O)N[C@H](C(=O)N[C@H](C(...
1,5fpd,-2.61456,2.6715,-1.01432,1.22772,-0.56942,0.40202,1.62432,1.48674,-2.6551,NCC(=O)N1[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)NC...
2,6hxe,-0.98558,2.57244,-8.41628,3.52346,-0.81082,0.32616,7.9022,3.00116,0.3393,N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=...
3,5nqb,-1.0592,1.90178,-36.85828,13.2379,-0.46456,0.24828,34.0379,11.65862,2.22572,N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=...
4,4q3f,-1.52816,1.87006,-5.36862,3.20528,-0.74246,0.32006,5.44818,2.73016,-0.8653,N1[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(...


## Binding DB

In [8]:
from tdc.multi_pred import DTI
import numpy as np

# Load the BindingDB subset whose labels are Kd values
data = DTI(name='BindingDB_Kd')
binding_db = data.get_data()
print(binding_db.head())


R = 1.987e-3  # gas constant, kcal/mol·K
T = 298       # temperature, K

# Binding free energy dG = R*T*ln(Kd), with Kd converted from nM to M.
# The new column is appended last, then "Drug" is renamed to "smiles".
binding_db = (
    binding_db
    .assign(binding_db_bfe=lambda frame: R * T * np.log(frame['Y'] * 1e-9))
    .rename(columns={"Drug": "smiles"})
)

binding_db.head()

Found local copy...
Loading...
Done!


    Drug_ID                                            Drug Target_ID  \
0  444607.0       Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1    P00918   
1    4316.0      COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1    P00918   
2    4293.0           NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1    P00918   
3    1611.0    NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2O    P00918   
4    1612.0  COc1ccc(N2CC(O)c3cc(S(N)(=O)=O)sc3S2(=O)=O)cc1    P00918   

                                              Target     Y  
0  MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...  0.46  
1  MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...  0.49  
2  MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...  0.83  
3  MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...  0.20  
4  MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...  0.16  


Unnamed: 0,Drug_ID,smiles,Target_ID,Target,Y,binding_db_bfe
0,444607.0,Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.46,-12.730587
1,4316.0,COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.49,-12.693178
2,4293.0,NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.83,-12.381115
3,1611.0,NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2O,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.2,-13.223775
4,1612.0,COc1ccc(N2CC(O)c3cc(S(N)(=O)=O)sc3S2(=O)=O)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.16,-13.355904


In [18]:
# Row count of the BindingDB Kd table.
len(binding_db)

52274

## Merge Binding DB and PLAS 5K

In [10]:
# Exact-string overlap between the two "smiles" columns.
# NOTE(review): the PLAS-5k strings are written with canonical=False from
# whole PDB entries, so an empty intersection is presumably expected here;
# matching on a canonical identifier (e.g. InChIKey) may be what is wanted.
common_smiles = set(binding_db["smiles"]).intersection(plas5k["smiles"])
common_smiles

set()

## Merge Permeability Data

In [11]:
# One DataFrame per permeability CSV under ./perm_data.
caco2, mdck, others, pampa, rrck = (
    pd.read_csv(csv_path)
    for csv_path in (
        './perm_data/Caco-2.csv',
        './perm_data/MDCK.csv',
        './perm_data/Others.csv',
        './perm_data/PAMPA.csv',
        './perm_data/RRCK.csv',
    )
)

In [12]:
# Stack the five permeability tables vertically with a fresh 0..n-1 index,
# then keep only the first row seen for each Standardise_SMILES value.
# NOTE(review): the surviving rows can carry different Endpoint values
# (e.g. both "Papp AB" and "ER" appear in the preview) -- confirm that
# dropping the other measurements per molecule is intended.
combined_perm_df = (
    pd.concat([caco2, mdck, others, pampa, rrck], axis=0, ignore_index=True)
    .drop_duplicates(subset=["Standardise_SMILES"], keep="first")
)

combined_perm_df.head()

Unnamed: 0,ID,Original_ID,Common_name,SMILES,Standardise_SMILES,Value,Unit,Endpoint,Standardized_Value,Standardized_Endpoint,...,Num_H_Donors,Num_Heavy_Atoms,Num_Carbon_Atoms,Fraction_SP3_Carbons,TPSA,Num_Rotatable_Bonds,Num_Charged_Atoms,Net_Charge,Kier_index,InchiKey
0,MC-0003,502,,O=C1[C@@H](C(C)C)NC([C@@H](CC2=CC(=CC=C2)C(F)(...,CC(C)[C@H]1NC(=O)[C@@H](Cc2cccc(C(F)(F)F)c2)NC...,5.0,10^-6 cm/s,Papp AB,-5.301,Log Papp AB,...,4,43,31,0.516129,111.8,5,0,0,12.377831,XLGYUJSXIXSLHY-CMTIAEDTSA-N
3,MC-0004,508,,O=C1[C@@H](C(C)C)NC([C@@H](CC2C=CC=C(C=2)Cl)NC...,CC(C)[C@H]1NC(=O)[C@@H](Cc2cccc(Cl)c2)NCCOc2cc...,1.0,10^-6 cm/s,Papp AB,-6.0,Log Papp AB,...,4,41,31,0.516129,111.8,6,0,0,12.902564,KDLJRSICUQNEQC-ZRRKCSAHSA-N
6,MC-0005,511,,Clc1cccc(C[C@H]2NCCOc3ccccc3CCCNC(=O)[C@H](Cn3...,CC(C)[C@H]1NC(=O)[C@@H](Cc2cccc(Cl)c2)NCCOc2cc...,1.0,,ER,1.0,ER,...,4,42,32,0.40625,113.49,5,0,0,11.426567,NPKDYPCFVUJCKG-IUAQSZDVSA-N
9,MC-0006,512,,O=C1[C@@H](C(C)C)NC([C@@H](CC2C=CC=C(Cl)C=2)NC...,CNC[C@@H]1NC(=O)[C@@H](C(C)C)NC(=O)[C@@H](Cc2c...,0.3,10^-6 cm/s,Papp AB,-6.523,Log Papp AB,...,5,39,29,0.482759,120.59,5,0,0,12.044473,OAKBMRJASZUEMA-RMTZWNOUSA-N
12,MC-0007,515,,O=C1[C@@H](C(C)C)NC([C@@H](CC2=CC(=CC=C2)F)NCC...,CC(C)[C@H]1NC(=O)[C@@H](Cc2cccc(F)c2)NCCOc2ccc...,3.0,10^-6 cm/s,Papp AB,-5.523,Log Papp AB,...,4,41,31,0.516129,111.8,5,0,0,12.094712,XWBGQVPRXHOMIT-UAXWJAQVSA-N


In [17]:
# Count SMILES strings present verbatim in both BindingDB and the
# permeability table.
# NOTE(review): this compares against the raw "SMILES" column, while the
# de-duplication above keys on "Standardise_SMILES" -- confirm which column
# is meant to be matched.
common_bdb_perm_smiles = set(binding_db["smiles"]).intersection(combined_perm_df["SMILES"])
len(common_bdb_perm_smiles)

3

## Upload to AWS RDS