In [2]:
import os
import json
import requests
import pandas as pd
from sqlalchemy import create_engine, text
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen
import joblib


In [61]:
## Get Cacao-trained permeability model
permeability_model = joblib.load("../models/permeability_rf.joblib")

In [62]:
## Get Cacao permeability data
from tdc.single_pred import ADME
cacao_data = ADME(name='Caco2_Wang')

Found local copy...
Loading...
Done!


In [63]:
# --- CONFIG ---
import os

os.environ["DB_USER"] = "postgres"
os.environ["DB_PASS"] = "AzuleneLabs_2026"
os.environ["DB_HOST"] = "azulene-1.cizeysmsgxmm.us-east-1.rds.amazonaws.com"
os.environ["DB_NAME"] = "postgres"
os.environ["DB_PORT"] = "5432"

print(os.getenv("DB_USER"))  # confirm

DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")
DB_HOST = os.getenv("DB_HOST")
DB_NAME = os.getenv("DB_NAME")
DB_PORT = os.getenv("DB_PORT", 5432)

BATCH_SIZE = 1000

def get_cacao_permeability(df, cacao):
    """Fetch permeability from CACAO database (placeholder function)."""
    # ## add all permeability data to df 
    ## 'Drug' column in cacao == 'smiles' column in df, smiles might overlap, or might nowt be present at all

    cacao = cacao.rename(columns={"Drug": "smiles", "Y": "cacao_permeability"})

    # Merge on 'smiles' — keep all molecules from df
    merged = df.merge(cacao[["smiles", "cacao_permeability"]], on="smiles", how="left")


    return merged



# --- Connect to PostgreSQL ---
conn_str = f"postgresql+psycopg2://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(conn_str)


# --- RDKit-based calculations ---
def compute_rdkit_features(smiles):
    """Compute molecular weight, logP, etc. using RDKit."""
    try:
        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            return None
        return {
            "mol_weight": Descriptors.MolWt(mol),
            "logp_rdkit": Crippen.MolLogP(mol)
        }
    except Exception:
        return None


# --- PubChem fallback API ---
def fetch_pubchem_logp(smiles):
    """Fetch experimental logP from PubChem."""
    try:
        url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/{smiles}/property/LogP,IsomericSMILES/JSON"
        resp = requests.get(url, timeout=5)
        if resp.status_code == 200:
            props = resp.json().get("PropertyTable", {}).get("Properties", [{}])[0]
            return props.get("LogP")
    except Exception:
        return None


# --- Load data ---
def fetch_data(limit=None):
    """Fetch data from PostgreSQL table."""
    query = "SELECT * FROM drug_properties"
    if limit:
        query += f" LIMIT {limit}"
    return pd.read_sql(query, engine)


def fetch_data_enriched(limit=None):
    """Fetch data from PostgreSQL table."""
    query = "SELECT * FROM drug_properties_enriched"
    if limit:
        query += f" LIMIT {limit}"
    return pd.read_sql(query, engine)


# --- Enrichment Pipeline ---
def enrich_dataframe(df):

    df = df.dropna(subset=["smiles"]).copy()
    df = df[df["smiles"].apply(lambda s: isinstance(s, str) and len(s.strip()) > 0)]


    ## Get the caco permeability data from cacao and merge into df
    cacao = cacao_data.get_data()
    print(cacao.head())
    print(len(df))
    df = get_cacao_permeability(df, cacao)
    print(len(df))
    
    enriched = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Enriching molecules"):
        smiles = row.get("smiles")

        mol = Chem.MolFromSmiles(smiles)
        logp = Descriptors.MolLogP(mol)
        psa = Descriptors.TPSA(mol)
        mw = Descriptors.MolWt(mol)
        hbd = Descriptors.NumHDonors(mol)
        hba = Descriptors.NumHAcceptors(mol)

        desc = [
            mw,
            logp,
            psa,
            hbd,
            hba,
            Descriptors.NumRotatableBonds(mol)
        ]
        
        ## Include permeability model prediction AND direct predictions from CACAO when available (NA if othterwise)
        #permeability = get_cacao_permeability(df, cacao)
        permeability_predictions = float(permeability_model.predict([desc])[0])
        row["predicted_permeability"] = permeability_predictions

        row["hba"]  = hba
        row["hbd"]  = hbd
        row["psa"]  = psa
        row["molecular_weight"] = mw



        data_origin = {}

        if not smiles:
            enriched.append(row)
            continue

        # --- Compute with RDKit ---
        rdkit_features = compute_rdkit_features(smiles)
        if rdkit_features:
            if pd.isna(row.get("logp")) and rdkit_features["logp_rdkit"] is not None:
                row["logp"] = rdkit_features["logp_rdkit"]
                data_origin["logp"] = "rdkit"
            if pd.isna(row.get("binding_free_energy")):
                # (placeholder example)
                row["binding_free_energy"] = -0.1 * rdkit_features["logp_rdkit"]
                data_origin["binding_free_energy"] = "estimated_rdkit"

        # --- PubChem fallback ---
        if pd.isna(row.get("logp")):
            logp_pubchem = fetch_pubchem_logp(smiles)
            if logp_pubchem is not None:
                row["logp"] = logp_pubchem
                data_origin["logp"] = "pubchem"

        # --- Simple pKa estimation (toy model) ---
        if pd.isna(row.get("pka")) and rdkit_features:
            row["pka"] = 7.0 - 0.2 * rdkit_features["logp_rdkit"]
            data_origin["pka"] = "estimated_rdkit"

        # --- Solubility fallback (basic logS estimation) ---
        if pd.isna(row.get("solubility")) and rdkit_features:
            logp = rdkit_features["logp_rdkit"]
            molwt = rdkit_features["mol_weight"]
            row["solubility"] = -0.01 * molwt - 0.5 * logp
            data_origin["solubility"] = "estimated_rdkit"

        # Track origin of each field
        row["metadata"] = json.dumps({"data_origin": data_origin})
        enriched.append(row)

    return pd.DataFrame(enriched)


postgres


In [64]:
original_df = fetch_data()
print(f"Fetched {len(original_df)} records.")
print(original_df.head())

Fetched 29982 records.
      chembl_id                                            smiles  \
0    CHEMBL6329      Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl   
1    CHEMBL6328   Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1   
2  CHEMBL265667  Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1   
3    CHEMBL6362      Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1   
4  CHEMBL267864    Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1   

  binding_free_energy  solubility  logp permeability   pka  molecular_weight  \
0                None         NaN   NaN         None  None               NaN   
1                None         NaN   NaN         None  None               NaN   
2                None         NaN   NaN         None  None               NaN   
3                None         NaN   NaN         None  None               NaN   
4                None         NaN   NaN         None  None               NaN   

   hba  hbd  psa  rtb qed_weighted  source  \
0  NaN  NaN  NaN  NaN         None 

In [65]:
enriched_df = enrich_dataframe(original_df) ## This does NOT include binding_free_energy data
print(enriched_df.head())

                                             Drug_ID  \
0                                    (-)-epicatechin   
1  (2E,4Z,8Z)-N-isobutyldodeca-2,4,10-triene-8 -y...   
2                                            codeine   
3                                         creatinine   
4                                            danazol   

                                                Drug         Y  
0            Oc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2 -6.220000  
1                   C/C=C\C#CCC/C=C\C=C\C(=O)NCC(C)C -3.860000  
2  COc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2... -4.090000  
3                                     CN1CC(=O)NC1=N -5.935409  
4  C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=Cc5oncc5C[C@]4(... -4.840000  
29860
29861


Enriching molecules: 100%|██████████| 29861/29861 [09:15<00:00, 53.74it/s]


      chembl_id                                            smiles  \
0    CHEMBL6329      Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl   
1    CHEMBL6328   Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1   
2  CHEMBL265667  Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1   
3    CHEMBL6362      Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1   
4  CHEMBL267864    Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1   

   binding_free_energy  solubility     logp permeability       pka  \
0            -0.211362    -4.47435  2.11362         None  6.577276   
1            -0.133190    -3.98914  1.33190         None  6.733620   
2            -0.227274    -4.71434  2.27274         None  6.545452   
3            -0.146022    -3.80320  1.46022         None  6.707956   
4            -0.211362    -4.47435  2.11362         None  6.577276   

   molecular_weight  hba  hbd     psa  rtb qed_weighted  source  \
0           341.754    5    1   84.82  NaN         None  ChEMBL   
1           332.319    6    1 

In [67]:
import numpy as np
## Compare cacao perm with predicted perm
just_perm = enriched_df[['cacao_permeability', 'predicted_permeability']]
just_perm_no_nan = just_perm.dropna()
just_perm_no_nan["Error"] = np.abs(just_perm_no_nan["cacao_permeability"] - just_perm_no_nan["predicted_permeability"])

just_perm_no_nan.describe() ## Difference between cacao and predicted permeability is small

Unnamed: 0,cacao_permeability,predicted_permeability,Error
count,99.0,99.0,99.0
mean,-4.814975,-4.846693,0.152299
std,0.632247,0.54072,0.145915
min,-7.38,-7.052423,0.005595
25%,-4.9892,-5.013632,0.05409
50%,-4.7,-4.7416,0.111777
75%,-4.44,-4.502791,0.210597
max,-3.51,-3.966881,0.91042


## Adding Binding Free Energies (derive from BindingDB AND predict using SMILES)

In [68]:
## Try TDC

# Correct import for multi-instance prediction:
from tdc.multi_pred import DTI

# Then, access the specific BindingDB dataset by name
data = DTI(name='BindingDB_Kd')  # For datasets with Kd units
binding_db = data.get_data()
print(binding_db.head())


R = 1.987e-3  # kcal/mol·K
T = 298
binding_db['binding_db_bfe'] = R * T * np.log(binding_db['Y'] * 1e-9)  # Kd (nM → M)
binding_db.rename(columns={"Drug": "smiles"}, inplace=True)

binding_db.head()

Found local copy...
Loading...
Done!


    Drug_ID                                            Drug Target_ID  \
0  444607.0       Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1    P00918   
1    4316.0      COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1    P00918   
2    4293.0           NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1    P00918   
3    1611.0    NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2O    P00918   
4    1612.0  COc1ccc(N2CC(O)c3cc(S(N)(=O)=O)sc3S2(=O)=O)cc1    P00918   

                                              Target     Y  
0  MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...  0.46  
1  MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...  0.49  
2  MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...  0.83  
3  MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...  0.20  
4  MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...  0.16  


Unnamed: 0,Drug_ID,smiles,Target_ID,Target,Y,binding_db_bfe
0,444607.0,Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.46,-12.730587
1,4316.0,COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.49,-12.693178
2,4293.0,NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.83,-12.381115
3,1611.0,NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2O,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.2,-13.223775
4,1612.0,COc1ccc(N2CC(O)c3cc(S(N)(=O)=O)sc3S2(=O)=O)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.16,-13.355904


In [9]:
len(binding_db)

52274

In [69]:
## Merge binding_db to enriched_df on smiles / Drug columns
enriched_data = enriched_df.merge(binding_db[['smiles', 'binding_db_bfe']], left_on='smiles', right_on='smiles', how='left', suffixes=('', '_tdc'))
enriched_data.head()

Unnamed: 0,chembl_id,smiles,binding_free_energy,solubility,logp,permeability,pka,molecular_weight,hba,hbd,psa,rtb,qed_weighted,source,metadata,cacao_permeability,predicted_permeability,binding_db_bfe
0,CHEMBL6329,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl,-0.211362,-4.47435,2.11362,,6.577276,341.754,5,1,84.82,,,ChEMBL,"{""data_origin"": {""logp"": ""rdkit"", ""binding_fre...",,-4.397749,
1,CHEMBL6328,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1,-0.13319,-3.98914,1.3319,,6.73362,332.319,6,1,108.61,,,ChEMBL,"{""data_origin"": {""logp"": ""rdkit"", ""binding_fre...",,-4.659045,
2,CHEMBL265667,Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1,-0.227274,-4.71434,2.27274,,6.545452,357.797,5,2,87.98,,,ChEMBL,"{""data_origin"": {""logp"": ""rdkit"", ""binding_fre...",,-4.839266,
3,CHEMBL6362,Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1,-0.146022,-3.8032,1.46022,,6.707956,307.309,5,1,84.82,,,ChEMBL,"{""data_origin"": {""logp"": ""rdkit"", ""binding_fre...",,-4.536761,
4,CHEMBL267864,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1,-0.211362,-4.47435,2.11362,,6.577276,341.754,5,1,84.82,,,ChEMBL,"{""data_origin"": {""logp"": ""rdkit"", ""binding_fre...",,-4.397749,


In [11]:
len(enriched_data)

32833

In [12]:
bfes = enriched_data[['binding_free_energy', 'binding_db_bfe']]
bfes_no_nan = bfes.dropna()
bfes_no_nan["Error"] = np.abs(bfes_no_nan["binding_free_energy"] - bfes_no_nan["binding_db_bfe"])
bfes_no_nan.head()

Unnamed: 0,binding_free_energy,binding_db_bfe,Error
75,-0.17846,-13.320006,13.141546
76,-0.17846,-13.374703,13.196243
77,-0.17846,-12.148206,11.969746
78,-0.17846,-11.725861,11.547401
79,-0.17846,-12.259059,12.080599


In [13]:
bfes_no_nan.describe()

Unnamed: 0,binding_free_energy,binding_db_bfe,Error
count,3241.0,3241.0,3241.0
mean,-0.352681,-8.005819,7.653138
std,0.139869,1.944118,1.968566
min,-0.68032,-20.451308,2.519171
25%,-0.36333,-8.934279,6.476593
50%,-0.333494,-6.817103,6.496893
75%,-0.32021,-6.817103,8.622319
max,0.27778,-2.726841,20.330218


In [70]:
## Inspect enriched_data for predicted binding free energy values

bfe_enriched_data = fetch_data_enriched()
bfes = bfe_enriched_data[['binding_free_energy', 'binding_db_bfe']]
bfes = bfes.dropna()
bfes.head()

Unnamed: 0,binding_free_energy,binding_db_bfe
75,12.284173,-13.320006
76,12.284173,-13.374703
77,12.284173,-12.148206
78,12.284173,-11.725861
79,12.284173,-12.259059


In [71]:
## Ignore signs for now: find error between binding_free_energy and binding_db_bfe
bfes['Error'] = np.abs(bfes['binding_free_energy'] - np.abs(bfes['binding_db_bfe']))
bfes.head()

Unnamed: 0,binding_free_energy,binding_db_bfe,Error
75,12.284173,-13.320006,1.035833
76,12.284173,-13.374703,1.090529
77,12.284173,-12.148206,0.135967
78,12.284173,-11.725861,0.558312
79,12.284173,-12.259059,0.025115


In [72]:
bfes.describe() ## shows that predicted binding free energy values are reasonable, only sign-inversed

Unnamed: 0,binding_free_energy,binding_db_bfe,Error
count,3241.0,3241.0,3241.0
mean,7.96715,-8.005819,1.063772
std,1.158335,1.944118,1.103749
min,3.03943,-20.451308,0.000389
25%,7.324859,-8.934279,0.48342
50%,7.540588,-6.817103,0.723486
75%,8.435151,-6.817103,1.249155
max,12.889833,-2.726841,8.809638


In [73]:
bfes.iloc[0]

binding_free_energy    12.284173
binding_db_bfe        -13.320006
Error                   1.035833
Name: 75, dtype: float64

In [74]:
bfe_enriched_data.head()

Unnamed: 0,chembl_id,smiles,binding_free_energy,solubility,logp,permeability,pka,molecular_weight,hba,hbd,psa,rtb,qed_weighted,source,metadata,cacao_permeability,binding_db_bfe,predicted_permeability
0,CHEMBL6329,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl,8.543225,-4.47435,2.11362,,6.577276,341.754,5,1,84.82,,,ChEMBL,"{""data_origin"": {""logp"": ""rdkit"", ""binding_fre...",,,-4.397749
1,CHEMBL6328,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1,9.577582,-3.98914,1.3319,,6.73362,332.319,6,1,108.61,,,ChEMBL,"{""data_origin"": {""logp"": ""rdkit"", ""binding_fre...",,,-4.659045
2,CHEMBL265667,Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1,8.073905,-4.71434,2.27274,,6.545452,357.797,5,2,87.98,,,ChEMBL,"{""data_origin"": {""logp"": ""rdkit"", ""binding_fre...",,,-4.839266
3,CHEMBL6362,Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1,7.141158,-3.8032,1.46022,,6.707956,307.309,5,1,84.82,,,ChEMBL,"{""data_origin"": {""logp"": ""rdkit"", ""binding_fre...",,,-4.536761
4,CHEMBL267864,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1,8.543225,-4.47435,2.11362,,6.577276,341.754,5,1,84.82,,,ChEMBL,"{""data_origin"": {""logp"": ""rdkit"", ""binding_fre...",,,-4.397749


In [75]:
bfe_enriched_data = bfe_enriched_data.drop(columns=['permeability'])
bfe_enriched_data = bfe_enriched_data.rename(columns={"binding_free_energy": "bindingdb_bfe_predictions"})
bfe_enriched_data.head()

Unnamed: 0,chembl_id,smiles,bindingdb_bfe_predictions,solubility,logp,pka,molecular_weight,hba,hbd,psa,rtb,qed_weighted,source,metadata,cacao_permeability,binding_db_bfe,predicted_permeability
0,CHEMBL6329,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl,8.543225,-4.47435,2.11362,6.577276,341.754,5,1,84.82,,,ChEMBL,"{""data_origin"": {""logp"": ""rdkit"", ""binding_fre...",,,-4.397749
1,CHEMBL6328,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1,9.577582,-3.98914,1.3319,6.73362,332.319,6,1,108.61,,,ChEMBL,"{""data_origin"": {""logp"": ""rdkit"", ""binding_fre...",,,-4.659045
2,CHEMBL265667,Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1,8.073905,-4.71434,2.27274,6.545452,357.797,5,2,87.98,,,ChEMBL,"{""data_origin"": {""logp"": ""rdkit"", ""binding_fre...",,,-4.839266
3,CHEMBL6362,Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1,7.141158,-3.8032,1.46022,6.707956,307.309,5,1,84.82,,,ChEMBL,"{""data_origin"": {""logp"": ""rdkit"", ""binding_fre...",,,-4.536761
4,CHEMBL267864,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1,8.543225,-4.47435,2.11362,6.577276,341.754,5,1,84.82,,,ChEMBL,"{""data_origin"": {""logp"": ""rdkit"", ""binding_fre...",,,-4.397749


## PAMPA

In [76]:
import pandas as pd

## load the dataset_pampa_ncats and dataset_pampa_ncats_properties
dataset_pampa_ncats = pd.read_csv('./dataverse_files/dataset_pampa_ncats.csv')
drugs_pampa_ncats = pd.read_csv('./dataverse_files/drugs_pampa_ncats.csv')

dataset_pampa_ncats.head()

Unnamed: 0,ID,SMILES,Y
0,2466,CCCCOC1=CC=C(C=C1)CC(=O)NO,0
1,1259573,COC1=C(C=C(C=C1)CCN2C(=CC(=O)NC2=S)N)OC,1
2,1275864,COC1=C(C=C(C=C1)Cl)C(=O)NC2=CC=C(C=C2)NC(=O)C3...,0
3,4878,CC(C)(C)N1C2=NC=NC(=C2C(=N1)C3=CC=C(C=C3)Cl)N,0
4,2030130,CN1C2=CC=CC=C2C(=O)C3=C1N=C(N(C3=O)C4=CC=CC=C4...,0


In [77]:
len(dataset_pampa_ncats)

2035

In [78]:
drugs_pampa_ncats.head()

Unnamed: 0,ID,SMILES,Y
0,444,CC(C(=O)C1=CC(=CC=C1)Cl)NC(C)(C)C,0
1,1051,CC1=NC=C(C(=C1O)C=O)COP(=O)(O)O,1
2,1130,CC1=C(SC=[N+]1CC2=CN=C(N=C2N)C)CCO,1
3,1923,C1=CC2=C(C(=C1)O)N=CC=C2,0
4,2239,CC(C)(C)NCC(CSC1=NC(=CS1)C2=CC=C(S2)C(=O)N)O,1


In [79]:
len(drugs_pampa_ncats)

142

## All High-Quality Perm Data

In [3]:
## 
caco2 = pd.read_csv('./perm_data/Caco-2.csv')
mdck = pd.read_csv('./perm_data/MDCK.csv')
others = pd.read_csv('./perm_data/Others.csv')
pampa = pd.read_csv('./perm_data/PAMPA.csv')
rrck = pd.read_csv('./perm_data/RRCK.csv')

In [4]:
caco2.head()

Unnamed: 0,ID,Original_ID,Common_name,SMILES,Standardise_SMILES,Value,Unit,Endpoint,Standardized_Value,Standardized_Endpoint,...,Num_H_Donors,Num_Heavy_Atoms,Num_Carbon_Atoms,Fraction_SP3_Carbons,TPSA,Num_Rotatable_Bonds,Num_Charged_Atoms,Net_Charge,Kier_index,InchiKey
0,MC-0003,502,,O=C1[C@@H](C(C)C)NC([C@@H](CC2=CC(=CC=C2)C(F)(...,CC(C)[C@H]1NC(=O)[C@@H](Cc2cccc(C(F)(F)F)c2)NC...,5,10^-6 cm/s,Papp AB,-5.301,Log Papp AB,...,4,43,31,0.516129,111.8,5,0,0,12.377831,XLGYUJSXIXSLHY-CMTIAEDTSA-N
1,MC-0003,502,,O=C1[C@@H](C(C)C)NC([C@@H](CC2=CC(=CC=C2)C(F)(...,CC(C)[C@H]1NC(=O)[C@@H](Cc2cccc(C(F)(F)F)c2)NC...,11,10^-6 cm/s,Papp BA,-4.959,Log Papp BA,...,4,43,31,0.516129,111.8,5,0,0,12.377831,XLGYUJSXIXSLHY-CMTIAEDTSA-N
2,MC-0003,502,,O=C1[C@@H](C(C)C)NC([C@@H](CC2=CC(=CC=C2)C(F)(...,CC(C)[C@H]1NC(=O)[C@@H](Cc2cccc(C(F)(F)F)c2)NC...,2,,ER,2.0,ER,...,4,43,31,0.516129,111.8,5,0,0,12.377831,XLGYUJSXIXSLHY-CMTIAEDTSA-N
3,MC-0004,508,,O=C1[C@@H](C(C)C)NC([C@@H](CC2C=CC=C(C=2)Cl)NC...,CC(C)[C@H]1NC(=O)[C@@H](Cc2cccc(Cl)c2)NCCOc2cc...,1,10^-6 cm/s,Papp AB,-6.0,Log Papp AB,...,4,41,31,0.516129,111.8,6,0,0,12.902564,KDLJRSICUQNEQC-ZRRKCSAHSA-N
4,MC-0004,508,,O=C1[C@@H](C(C)C)NC([C@@H](CC2C=CC=C(C=2)Cl)NC...,CC(C)[C@H]1NC(=O)[C@@H](Cc2cccc(Cl)c2)NCCOc2cc...,5,10^-6 cm/s,Papp BA,-5.301,Log Papp BA,...,4,41,31,0.516129,111.8,6,0,0,12.902564,KDLJRSICUQNEQC-ZRRKCSAHSA-N


In [5]:
mdck.head()

Unnamed: 0,ID,Original_ID,Common_name,SMILES,Standardise_SMILES,Value,Unit,Endpoint,Standardized_Value,Standardized_Endpoint,...,Num_H_Donors,Num_Heavy_Atoms,Num_Carbon_Atoms,Fraction_SP3_Carbons,TPSA,Num_Rotatable_Bonds,Num_Charged_Atoms,Net_Charge,Kier_index,InchiKey
0,MC-0001,Ib,Trk-IN-8,N1=CC2C(N[C@H]3C[C@H](C3)OC3C(=CC(=CN=3)F)[C@@...,C[C@H]1Nc2nc3c(cnn3cc2F)C(=O)N[C@H]2C[C@H](C2)...,11.13,10^-6 cm/s,Papp AB,-4.954,Log Papp AB,...,2,28,18,0.333333,93.44,0,0,0,3.676688,JXLZJPYLDDSEQF-IEBDPFPHSA-N
1,MC-0001,Ib,Trk-IN-8,N1=CC2C(N[C@H]3C[C@H](C3)OC3C(=CC(=CN=3)F)[C@@...,C[C@H]1Nc2nc3c(cnn3cc2F)C(=O)N[C@H]2C[C@H](C2)...,51.91,10^-6 cm/s,Papp BA,-4.285,Log Papp BA,...,2,28,18,0.333333,93.44,0,0,0,3.676688,JXLZJPYLDDSEQF-IEBDPFPHSA-N
2,MC-0001,Ib,Trk-IN-8,N1=CC2C(N[C@H]3C[C@H](C3)OC3C(=CC(=CN=3)F)[C@@...,C[C@H]1Nc2nc3c(cnn3cc2F)C(=O)N[C@H]2C[C@H](C2)...,4.67,,ER,4.67,ER,...,2,28,18,0.333333,93.44,0,0,0,3.676688,JXLZJPYLDDSEQF-IEBDPFPHSA-N
3,MC-0002,Positive control 2,Trk-IN-7,N1=CC2C(N[C@H]3C[C@H](C3)OC3C(=CC(=CN=3)F)[C@@...,C[C@H]1Nc2ccn3ncc(c3n2)C(=O)N[C@H]2C[C@H](C2)O...,3.31,10^-6 cm/s,Papp AB,-5.48,Log Papp AB,...,2,27,18,0.333333,93.44,0,0,0,3.491734,MPTHHGRJSLVPSX-JLLWLGSASA-N
4,MC-0002,Positive control 2,Trk-IN-7,N1=CC2C(N[C@H]3C[C@H](C3)OC3C(=CC(=CN=3)F)[C@@...,C[C@H]1Nc2ccn3ncc(c3n2)C(=O)N[C@H]2C[C@H](C2)O...,64.41,10^-6 cm/s,Papp BA,-4.191,Log Papp BA,...,2,27,18,0.333333,93.44,0,0,0,3.491734,MPTHHGRJSLVPSX-JLLWLGSASA-N


In [6]:
others.head()

Unnamed: 0,ID,Original_ID,Common_name,SMILES,Standardise_SMILES,Value,Unit,Endpoint,Standardized_Value,Standardized_Endpoint,...,Num_H_Donors,Num_Heavy_Atoms,Num_Carbon_Atoms,Fraction_SP3_Carbons,TPSA,Num_Rotatable_Bonds,Num_Charged_Atoms,Net_Charge,Kier_index,InchiKey
0,MC-3389,19,,COC1=C2C=C(C=N1)N1CCOC3=C1C=C(CN(C)CCCNS2(=O)=...,COc1ncc2cc1S(=O)(=O)NCCCN(C)Cc1ccc3c(c1)N2CCO3,530,nm/s,Papp,-4.276,Log Papp,...,1,28,19,0.421053,92.38,1,0,0,5.1333,HFMJOZBBMINJJK-UHFFFAOYSA-N
1,MC-4163,CHEMBL3917415,,N#CC1(NC(=O)[C@@H]2Cc3ccc(c(Cl)c3)OCCCCOc3cc(C...,N#CC1(NC(=O)[C@@H]2Cc3ccc(c(Cl)c3)OCCCCOc3cc(C...,26,,ER,26.0,ER,...,2,36,25,0.4,100.45,2,0,0,7.312584,XAPRSGPLBWSEHV-IBGZPJMESA-N
2,MC-4163,CHEMBL3917415,,N#CC1(NC(=O)[C@@H]2Cc3ccc(c(Cl)c3)OCCCCOc3cc(C...,N#CC1(NC(=O)[C@@H]2Cc3ccc(c(Cl)c3)OCCCCOc3cc(C...,53,,ER,53.0,ER,...,2,36,25,0.4,100.45,2,0,0,7.312584,XAPRSGPLBWSEHV-IBGZPJMESA-N
3,MC-4163,CHEMBL3917415,,N#CC1(NC(=O)[C@@H]2Cc3ccc(c(Cl)c3)OCCCCOc3cc(C...,N#CC1(NC(=O)[C@@H]2Cc3ccc(c(Cl)c3)OCCCCOc3cc(C...,53,,ER,53.0,ER,...,2,36,25,0.4,100.45,2,0,0,7.312584,XAPRSGPLBWSEHV-IBGZPJMESA-N
4,MC-4163,CHEMBL3917415,,N#CC1(NC(=O)[C@@H]2Cc3ccc(c(Cl)c3)OCCCCOc3cc(C...,N#CC1(NC(=O)[C@@H]2Cc3ccc(c(Cl)c3)OCCCCOc3cc(C...,26,,ER,26.0,ER,...,2,36,25,0.4,100.45,2,0,0,7.312584,XAPRSGPLBWSEHV-IBGZPJMESA-N


In [7]:
pampa.head()

Unnamed: 0,ID,Original_ID,Common_name,SMILES,Standardise_SMILES,Value,Unit,Endpoint,Standardized_Value,Standardized_Endpoint,...,Num_H_Donors,Num_Heavy_Atoms,Num_Carbon_Atoms,Fraction_SP3_Carbons,TPSA,Num_Rotatable_Bonds,Num_Charged_Atoms,Net_Charge,Kier_index,InchiKey
0,MC-0085,erythromycin,Erythromycin,CC[C@@H]1[C@@]([C@@H]([C@H](C(=O)[C@@H](C[C@@]...,CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@@H]2C[C@@](C)...,1.3±0.1,nm/s,Papp,-6.886,Log Papp,...,5,51,37,0.945946,193.91,7,0,0,14.712338,ULGZDMOVFRHVEP-RWJQBGPGSA-N
1,MC-0085,T-079,Erythromycin,O([C@@H]1[C@@H](C)[C@H](O[C@@H]2O[C@@H](C)[C@H...,CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@@H]2C[C@@](C)...,0.22,10^-6 cm/s,Papp,-6.658,Log Papp,...,5,51,37,0.945946,193.91,7,0,0,14.712338,ULGZDMOVFRHVEP-RWJQBGPGSA-N
2,MC-0086,clarithromycin,Clarithromycin,CC[C@@H]1[C@@]([C@@H]([C@H](C(=O)[C@@H](C[C@@]...,CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(...,37.8±1.7,nm/s,Papp,-5.423,Log Papp,...,4,52,38,0.947368,182.91,8,0,0,15.313214,AGOYDEPGAOXOCK-KCBOHYOISA-N
3,MC-0087,roxithromycin,Roxithromycin,CC[C@@H]1[C@@]([C@@H]([C@H](/C(=N/OCOCCOC)/[C@...,CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(...,35.6±1.5,nm/s,Papp,-5.449,Log Papp,...,5,58,41,0.95122,216.89,13,0,0,18.80083,RXZBMPWDPOLZGW-XMRMVWPWSA-N
4,MC-0087,T-180,Roxithromycin,O(/N=C\1/[C@H](C)[C@@H](O)[C@@](O)(C)[C@@H](CC...,CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(...,0.76,10^-6 cm/s,Papp,-6.119,Log Papp,...,5,58,41,0.95122,216.89,13,0,0,18.80083,RXZBMPWDPOLZGW-XMRMVWPWSA-N


In [8]:
rrck.head()

Unnamed: 0,ID,Original_ID,Common_name,SMILES,Standardise_SMILES,Value,Unit,Endpoint,Standardized_Value,Standardized_Endpoint,...,Num_H_Donors,Num_Heavy_Atoms,Num_Carbon_Atoms,Fraction_SP3_Carbons,TPSA,Num_Rotatable_Bonds,Num_Charged_Atoms,Net_Charge,Kier_index,InchiKey
0,MC-4033,CHEMBL3286830,Lorlatinib,C[C@H]1Oc2cc(cnc2N)-c2c(nn(C)c2C#N)CN(C)C(=O)c...,C[C@H]1Oc2cc(cnc2N)-c2c(nn(C)c2C#N)CN(C)C(=O)c...,1.3,,ER,1.3,ER,...,1,30,21,0.238095,110.06,0,0,0,4.62136,IIXWYSCJSQVBQM-LLVKDONJSA-N
1,MC-4033,CHEMBL3286830,Lorlatinib,C[C@H]1Oc2cc(cnc2N)-c2c(nn(C)c2C#N)CN(C)C(=O)c...,C[C@H]1Oc2cc(cnc2N)-c2c(nn(C)c2C#N)CN(C)C(=O)c...,1.5,,ER,1.5,ER,...,1,30,21,0.238095,110.06,0,0,0,4.62136,IIXWYSCJSQVBQM-LLVKDONJSA-N
2,MC-4033,CHEMBL3286830,Lorlatinib,C[C@H]1Oc2cc(cnc2N)-c2c(nn(C)c2C#N)CN(C)C(=O)c...,C[C@H]1Oc2cc(cnc2N)-c2c(nn(C)c2C#N)CN(C)C(=O)c...,19.3,10^-6 cm/s,Papp AB,-4.714,Log Papp AB,...,1,30,21,0.238095,110.06,0,0,0,4.62136,IIXWYSCJSQVBQM-LLVKDONJSA-N
3,MC-4033,CHEMBL3286830,Lorlatinib,C[C@H]1Oc2cc(cnc2N)-c2c(nn(C)c2C#N)CN(C)C(=O)c...,C[C@H]1Oc2cc(cnc2N)-c2c(nn(C)c2C#N)CN(C)C(=O)c...,28.0,10^-6 cm/s,Papp BA,-4.553,Log Papp BA,...,1,30,21,0.238095,110.06,0,0,0,4.62136,IIXWYSCJSQVBQM-LLVKDONJSA-N
4,MC-4240,CHEMBL4286522,,Cc1[nH][n+](C)c2c1-c1cnc(N)c(c1)O[C@H](C)c1cc(...,Cc1[nH][n+](C)c2c1-c1cnc(N)c(c1)O[C@H](C)c1cc(...,4.2,,ER,4.2,ER,...,2,29,21,0.285714,88.12,0,1,1,4.486743,ASQDMAQHZGURLM-GFCCVEGCSA-O


In [9]:
rrck.columns

Index(['ID', 'Original_ID', 'Common_name', 'SMILES', 'Standardise_SMILES',
       'Value', 'Unit', 'Endpoint', 'Standardized_Value',
       'Standardized_Endpoint', 'No_symbol_Value', 'Assay', 'Description',
       'Link', 'Source Type', 'Source', 'Citation', 'Publish_Year',
       'Macrocycle_Ring_Size', 'Macrocycle_Free_Amide_Count',
       'Macrocycle_Substituted_Amide_Count', 'Macrocycle_Overall_Amide_Count',
       'Macrocycle_Ring_smiles', 'Macrocycle_Peripheral_smiles',
       'Free_Amide_Ratio', 'Amide_Ratio', 'Num_Rings', 'Num_Aromatic_Rings',
       'cLogP', 'Molecular_Weight', 'Num_H_Acceptors', 'Num_H_Donors',
       'Num_Heavy_Atoms', 'Num_Carbon_Atoms', 'Fraction_SP3_Carbons', 'TPSA',
       'Num_Rotatable_Bonds', 'Num_Charged_Atoms', 'Net_Charge', 'Kier_index',
       'InchiKey'],
      dtype='object')

### merging perm datasets together

In [86]:
print(len(caco2), len(mdck))

1737 642


In [88]:
# 1️⃣ Combine them vertically
combined_df = pd.concat([caco2, mdck, others, pampa, rrck], axis=0, ignore_index=True)

# 2️⃣ Check potential duplicates
# For example, by Standardise_SMILES (or another unique ID column)
combined_perm_df = combined_df.drop_duplicates(subset=["Standardise_SMILES"], keep="first")

combined_perm_df.head()

Unnamed: 0,ID,Original_ID,Common_name,SMILES,Standardise_SMILES,Value,Unit,Endpoint,Standardized_Value,Standardized_Endpoint,...,Num_H_Donors,Num_Heavy_Atoms,Num_Carbon_Atoms,Fraction_SP3_Carbons,TPSA,Num_Rotatable_Bonds,Num_Charged_Atoms,Net_Charge,Kier_index,InchiKey
0,MC-0003,502,,O=C1[C@@H](C(C)C)NC([C@@H](CC2=CC(=CC=C2)C(F)(...,CC(C)[C@H]1NC(=O)[C@@H](Cc2cccc(C(F)(F)F)c2)NC...,5.0,10^-6 cm/s,Papp AB,-5.301,Log Papp AB,...,4,43,31,0.516129,111.8,5,0,0,12.377831,XLGYUJSXIXSLHY-CMTIAEDTSA-N
3,MC-0004,508,,O=C1[C@@H](C(C)C)NC([C@@H](CC2C=CC=C(C=2)Cl)NC...,CC(C)[C@H]1NC(=O)[C@@H](Cc2cccc(Cl)c2)NCCOc2cc...,1.0,10^-6 cm/s,Papp AB,-6.0,Log Papp AB,...,4,41,31,0.516129,111.8,6,0,0,12.902564,KDLJRSICUQNEQC-ZRRKCSAHSA-N
6,MC-0005,511,,Clc1cccc(C[C@H]2NCCOc3ccccc3CCCNC(=O)[C@H](Cn3...,CC(C)[C@H]1NC(=O)[C@@H](Cc2cccc(Cl)c2)NCCOc2cc...,1.0,,ER,1.0,ER,...,4,42,32,0.40625,113.49,5,0,0,11.426567,NPKDYPCFVUJCKG-IUAQSZDVSA-N
9,MC-0006,512,,O=C1[C@@H](C(C)C)NC([C@@H](CC2C=CC=C(Cl)C=2)NC...,CNC[C@@H]1NC(=O)[C@@H](C(C)C)NC(=O)[C@@H](Cc2c...,0.3,10^-6 cm/s,Papp AB,-6.523,Log Papp AB,...,5,39,29,0.482759,120.59,5,0,0,12.044473,OAKBMRJASZUEMA-RMTZWNOUSA-N
12,MC-0007,515,,O=C1[C@@H](C(C)C)NC([C@@H](CC2=CC(=CC=C2)F)NCC...,CC(C)[C@H]1NC(=O)[C@@H](Cc2cccc(F)c2)NCCOc2ccc...,3.0,10^-6 cm/s,Papp AB,-5.523,Log Papp AB,...,4,41,31,0.516129,111.8,5,0,0,12.094712,XWBGQVPRXHOMIT-UAXWJAQVSA-N


In [89]:
len(combined_perm_df)

4553

In [91]:
combined_perm_df.keys()

Index(['ID', 'Original_ID', 'Common_name', 'SMILES', 'Standardise_SMILES',
       'Value', 'Unit', 'Endpoint', 'Standardized_Value',
       'Standardized_Endpoint', 'No_symbol_Value', 'Assay', 'Description',
       'Link', 'Source Type', 'Source', 'Citation', 'Publish_Year',
       'Macrocycle_Ring_Size', 'Macrocycle_Free_Amide_Count',
       'Macrocycle_Substituted_Amide_Count', 'Macrocycle_Overall_Amide_Count',
       'Macrocycle_Ring_smiles', 'Macrocycle_Peripheral_smiles',
       'Free_Amide_Ratio', 'Amide_Ratio', 'Num_Rings', 'Num_Aromatic_Rings',
       'cLogP', 'Molecular_Weight', 'Num_H_Acceptors', 'Num_H_Donors',
       'Num_Heavy_Atoms', 'Num_Carbon_Atoms', 'Fraction_SP3_Carbons', 'TPSA',
       'Num_Rotatable_Bonds', 'Num_Charged_Atoms', 'Net_Charge', 'Kier_index',
       'InchiKey'],
      dtype='object')

## PLAS 5K

In [54]:
plas5k = pd.read_csv('./bfe_data/plas_5k.csv')

plas5k.head()

Unnamed: 0,pdbid,binding_affinity (kcal/mol),binding_affinity_sd (kcal/mol),electrostatic (kcal/mol),electrostatic_sd (kcal/mol),polar_solvation (kcal/mol),polar_solvation_sd (kcal/mol),non_polar_solvation (kcal/mol),non_polar_solvation_sd (kcal/mol),vdW (kcal/mol)
0,6g3f,-0.2557,1.19548,-0.98408,1.12502,-0.34568,0.243,1.26248,1.1208,-0.18842
1,5fpd,-2.61456,2.6715,-1.01432,1.22772,-0.56942,0.40202,1.62432,1.48674,-2.6551
2,6hxe,-0.98558,2.57244,-8.41628,3.52346,-0.81082,0.32616,7.9022,3.00116,0.3393
3,5nqb,-1.0592,1.90178,-36.85828,13.2379,-0.46456,0.24828,34.0379,11.65862,2.22572
4,4q3f,-1.52816,1.87006,-5.36862,3.20528,-0.74246,0.32006,5.44818,2.73016,-0.8653


In [94]:
plas5k.keys()

Index(['pdbid', 'binding_affinity (kcal/mol)',
       'binding_affinity_sd (kcal/mol)', 'electrostatic (kcal/mol)',
       'electrostatic_sd (kcal/mol)', 'polar_solvation (kcal/mol)',
       'polar_solvation_sd (kcal/mol)', 'non_polar_solvation (kcal/mol)',
       'non_polar_solvation_sd (kcal/mol)', 'vdW (kcal/mol)',
       'Standardise_SMILES', 'InchiKey'],
      dtype='object')

In [93]:
len(plas5k)

5000

In [92]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd

def fetch_standardized_smiles(pdbid):
    try:
        url = f"https://files.rcsb.org/download/{pdbid}.pdb"
        r = requests.get(url, timeout=5)
        mol = Chem.MolFromPDBBlock(r.text, sanitize=False)

        if mol:
            return Chem.MolToSmiles(mol, canonical=True)
    except:
        return None


def parallel_fetch(pdb_ids, max_workers=16):
    smiles_map = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(fetch_standardized_smiles, pdbid): pdbid for pdbid in pdb_ids}
        for f in as_completed(futures):
            pdbid = futures[f]
            smiles_map[pdbid] = f.result()
    return smiles_map


from rdkit.Chem import MolToInchiKey, SmilesParserParams, MolFromSmiles
def inchikey(smi):
    try:
        
        params = SmilesParserParams()
        params.removeHs = True
        mol = MolFromSmiles(smi, params)
        return MolToInchiKey(mol, options="-FixedH") 
        I
    except Exception:
        return None

# Usage
pdb_ids = plas5k["pdbid"].unique()
smiles_map = parallel_fetch(pdb_ids)
plas5k["Standardise_SMILES"] = plas5k["pdbid"].map(smiles_map)
plas5k["InchiKey"] = plas5k["Standardise_SMILES"].apply(inchikey)

[09:59:58] 

****
Post-condition Violation
Element 'X' not found
Violation occurred on line 93 in file /Users/runner/work/rdkit-pypi/rdkit-pypi/build/temp.macosx-10.9-x86_64-cpython-311/rdkit/Code/GraphMol/PeriodicTable.h
Failed Expression: anum > -1
****

[10:01:49] 

****
Post-condition Violation
Element 'X' not found
Violation occurred on line 93 in file /Users/runner/work/rdkit-pypi/rdkit-pypi/build/temp.macosx-10.9-x86_64-cpython-311/rdkit/Code/GraphMol/PeriodicTable.h
Failed Expression: anum > -1
****

[10:03:24] Invalid InChI prefix in generating InChI Key
[10:03:25] Invalid InChI prefix in generating InChI Key
[10:03:25] Invalid InChI prefix in generating InChI Key
[10:03:34] Invalid InChI prefix in generating InChI Key
[10:03:34] Invalid InChI prefix in generating InChI Key
[10:03:34] Invalid InChI prefix in generating InChI Key
[10:03:34] Invalid InChI prefix in generating InChI Key
[10:03:34] Invalid InChI prefix in generating InChI Key
[10:03:35] Invalid InChI prefix in gen

In [59]:
plas5k.head()

Unnamed: 0,pdbid,binding_affinity (kcal/mol),binding_affinity_sd (kcal/mol),electrostatic (kcal/mol),electrostatic_sd (kcal/mol),polar_solvation (kcal/mol),polar_solvation_sd (kcal/mol),non_polar_solvation (kcal/mol),non_polar_solvation_sd (kcal/mol),vdW (kcal/mol),Standardise_SMILES,InchiKey
0,6g3f,-0.2557,1.19548,-0.98408,1.12502,-0.34568,0.243,1.26248,1.1208,-0.18842,CC[C@H](C)[C@H](NC(=O)CNC(=O)[C@H](CCCCN)NC(=O...,
1,5fpd,-2.61456,2.6715,-1.01432,1.22772,-0.56942,0.40202,1.62432,1.48674,-2.6551,CCCC[C@H](NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](CC(C...,
2,6hxe,-0.98558,2.57244,-8.41628,3.52346,-0.81082,0.32616,7.9022,3.00116,0.3393,C=CC1=C(C)NC=C1C[C@H](NC(=O)[C@H](CC(C)C)NC(=O...,
3,5nqb,-1.0592,1.90178,-36.85828,13.2379,-0.46456,0.24828,34.0379,11.65862,2.22572,CC[C@H](C)[C@H](NC(=O)CNC(=O)[C@H](CC1=CNC2=C1...,
4,4q3f,-1.52816,1.87006,-5.36862,3.20528,-0.74246,0.32006,5.44818,2.73016,-0.8653,CC[C@H](C)[C@H](NC(=O)[C@H](CCCCN)NC(=O)CNC(=O...,


In [None]:
print("combined_perm_df SMILES examples:")
print(combined_perm_df["Standardise_SMILES"].head(5).tolist())

print("\nplas5k SMILES examples:")
print(plas5k["Standardise_SMILES"].head(5).tolist())

combined_df SMILES examples:
['CC(C)[C@H]1NC(=O)[C@@H](Cc2cccc(C(F)(F)F)c2)NCCOc2ccccc2CCCNC(=O)[C@H](CN(C)C)NC1=O', 'CC(C)[C@H]1NC(=O)[C@@H](Cc2cccc(Cl)c2)NCCOc2ccccc2CCCNC(=O)[C@H](CCN(C)C)NC1=O', 'CC(C)[C@H]1NC(=O)[C@@H](Cc2cccc(Cl)c2)NCCOc2ccccc2CCCNC(=O)[C@H](Cn2cccc2)NC1=O', 'CNC[C@@H]1NC(=O)[C@@H](C(C)C)NC(=O)[C@@H](Cc2cccc(Cl)c2)NCCOc2ccccc2CCCNC1=O', 'CC(C)[C@H]1NC(=O)[C@@H](Cc2cccc(F)c2)NCCOc2ccccc2C[C@H](C)CNC(=O)[C@H](CN(C)C)NC1=O']

plas5k SMILES examples:
['CC[C@H](C)[C@H](NC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@H](CC1=CNC=N1)NC(=O)[C@H](C)NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CC1=CNC=N1)NC(=O)[C@H](C)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CC1=CC=C(O)C=C1)NC(=O)[C@H](CC(=O)O)NC(=O)[C@@H](NC(=O)CNC(=O)[C@@H](NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCSC)NC(=O)[C@H](CC(=O)O)NC(=O)[C@@H](NC(=O)[C@

In [None]:
# join by SMILES
merged_enriched_combined = pd.merge(
    enriched_df,
    combined_perm_df,
    left = 'smiles',
    right = 'SMILES'
 
)

merged_enriched_combined.head()

In [47]:
len(combined_df), len(plas5k)

(4553, 5000)

In [50]:
# join by SMILES
merged_all_perm_plas = pd.merge(
    combined_df,
    plas5k,
    on = 'Standardise_SMILES',
 
)

merged_all_perm_plas.head()

Unnamed: 0,ID,Original_ID,Common_name,SMILES,Standardise_SMILES,Value,Unit,Endpoint,Standardized_Value,Standardized_Endpoint,...,pdbid,binding_affinity (kcal/mol),binding_affinity_sd (kcal/mol),electrostatic (kcal/mol),electrostatic_sd (kcal/mol),polar_solvation (kcal/mol),polar_solvation_sd (kcal/mol),non_polar_solvation (kcal/mol),non_polar_solvation_sd (kcal/mol),vdW (kcal/mol)


In [44]:
len(merged_all_perm_plas)

merged_all_perm_plas['binding_affinity (kcal/mol)'].value_counts(dropna=False)

binding_affinity (kcal/mol)
NaN    4553
Name: count, dtype: int64

In [26]:
common_smiles = set(combined_df["SMILES"]) & set(plas5k["SMILES"])
common_smiles

set()

In [None]:
## TRY STANDARDISING SMILES

def fetch_standardized_smiles(pdbid):
    try:
        url = f"https://files.rcsb.org/download/{pdbid}.pdb"
        r = requests.get(url, timeout=5)
        mol = Chem.MolFromPDBBlock(r.text, sanitize=False)

        if mol:
            return Chem.MolToSmiles(mol, canonical=True)
    except:
        return None

plas5k_standard_10 = plas5k["pdbid"][:10].apply(fetch_standardized_smiles)


common_standard_smiles = set(combined_df["SMILES"]) & set(plas5k_standard_10)
common_standard_smiles

set()

In [None]:
print('combined_df')
combined_df["Standardised_SMILES_CHEM"] = combined_df["SMILES"].apply(standardize_smiles)
print('plas5k')
plas5k["Standardised_SMILES"] = plas5k["SMILES"].apply(standardize_smiles)

In [None]:
common_standard_smiles = set(combined_df["Standardised_SMILES_CHEM"]) & set(plas5k['Standardised_SMILES'])
common_standard_smiles