In [2]:
import os
print('hi')
import requests
import pandas as pd

API_URL = "https://www.ebi.ac.uk/chembl/api/data/molecule.json"
BATCH_SIZE = 1000
TOTAL_RECORDS = 2000000        # total molecules in ChEMBL (approx)
SAMPLE_SIZE = int(TOTAL_RECORDS * 0.10)  # 10% sample (~200k)
LIMIT = BATCH_SIZE
OFFSET = 0

params = {"limit": LIMIT, "offset": OFFSET}
resp = requests.get(API_URL, params=params)
resp.raise_for_status()
data = resp.json()

if "molecules" not in data:
        print("Unexpected response")

molecules = data["molecules"]
print(molecules[0]["molecule_chembl_id"])


hi
CHEMBL6329


In [3]:
molecules[0].keys()



In [4]:
m0 = molecules[0]

for key in m0.keys():
    print(type(m0[key]), key)

<class 'list'> atc_classifications
<class 'int'> availability_type
<class 'NoneType'> biotherapeutic
<class 'int'> chemical_probe
<class 'int'> chirality
<class 'list'> cross_references
<class 'bool'> dosed_ingredient
<class 'NoneType'> first_approval
<class 'int'> first_in_class
<class 'NoneType'> helm_notation
<class 'int'> inorganic_flag
<class 'NoneType'> max_phase
<class 'str'> molecule_chembl_id
<class 'dict'> molecule_hierarchy
<class 'dict'> molecule_properties
<class 'dict'> molecule_structures
<class 'list'> molecule_synonyms
<class 'str'> molecule_type
<class 'int'> natural_product
<class 'bool'> oral
<class 'int'> orphan
<class 'bool'> parenteral
<class 'int'> polymer_flag
<class 'NoneType'> pref_name
<class 'int'> prodrug
<class 'str'> structure_type
<class 'bool'> therapeutic_flag
<class 'bool'> topical
<class 'NoneType'> usan_stem
<class 'NoneType'> usan_stem_definition
<class 'NoneType'> usan_substem
<class 'NoneType'> usan_year
<class 'int'> veterinary
<class 'bool'> w

In [5]:
## Look into each key in m0:
m0_key = "molecule_properties"
print(m0_key)
for key in m0[m0_key].keys():
    print(type(m0[m0_key][key]), key)

molecule_properties
<class 'str'> alogp
<class 'int'> aromatic_rings
<class 'str'> full_molformula
<class 'str'> full_mwt
<class 'int'> hba
<class 'int'> hbd
<class 'int'> heavy_atoms
<class 'str'> mw_freebase
<class 'str'> np_likeness_score
<class 'int'> num_ro5_violations
<class 'str'> psa
<class 'str'> qed_weighted
<class 'str'> ro3_pass
<class 'int'> rtb


In [6]:
from jsonschema import validate
import json
df = pd.DataFrame([
    {"molecule_chembl_id": m["molecule_chembl_id"], "molecule_data": json.dumps(m)}
    for m in molecules
])

df.head()

Unnamed: 0,molecule_chembl_id,molecule_data
0,CHEMBL6329,"{""atc_classifications"": [], ""availability_type..."
1,CHEMBL6328,"{""atc_classifications"": [], ""availability_type..."
2,CHEMBL265667,"{""atc_classifications"": [], ""availability_type..."
3,CHEMBL6362,"{""atc_classifications"": [], ""availability_type..."
4,CHEMBL267864,"{""atc_classifications"": [], ""availability_type..."


In [7]:
df["molecule_data"] = df["molecule_data"].apply(json.loads)
type((df['molecule_data'][0]))

dict

In [10]:
import nest_asyncio
import asyncio

nest_asyncio.apply()

In [11]:
import asyncio
import aiohttp
import pandas as pd
from tqdm.asyncio import tqdm_asyncio
import numpy as np

CHEMBL_API_BASE = "https://www.ebi.ac.uk/chembl/api/data"

async def fetch_pkd(session, chembl_id):
    """Fetch pKd value for one compound asynchronously."""
    url = f"{CHEMBL_API_BASE}/activity.json?molecule_chembl_id={chembl_id}&standard_type=Kd"
    try:
        async with session.get(url, timeout=15) as resp:
            if resp.status != 200:
                return chembl_id, None
            data = await resp.json()
            activities = data.get("activities", [])
            if not activities:
                return chembl_id, None
            
            pkd_vals = [
                float(a["pchembl_value"])
                for a in activities
                if a.get("pchembl_value")
            ]
            return chembl_id, np.mean(pkd_vals) if pkd_vals else None
    except Exception:
        return chembl_id, None


async def fetch_all_pkd(chembl_ids, max_concurrent=20):
    """Fetch pKd values for all compounds concurrently."""
    connector = aiohttp.TCPConnector(limit_per_host=max_concurrent)
    timeout = aiohttp.ClientTimeout(total=600)
    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
        tasks = [fetch_pkd(session, cid) for cid in chembl_ids]
        results = []
        for result in tqdm_asyncio.as_completed(tasks, total=len(tasks), desc="⚡ Fetching pKd"):
            results.append(await result)
        return dict(results)


def fetch_pkd_for_dataframe(df, id_column="chembl_id"):
    """High-speed pKd fetching for a DataFrame using asyncio."""
    chembl_ids = df[id_column].dropna().unique().tolist()
    pkd_dict = asyncio.run(fetch_all_pkd(chembl_ids))

    df["pKd"] = df[id_column].map(pkd_dict)
    df["binding_free_energy"] = df["pKd"].apply(lambda x: 1.364 * x if pd.notnull(x) else None)
    df["binding_free_energy_source"] = df["pKd"].apply(
        lambda x: "chembl_pKd" if pd.notnull(x) else "missing"
    )
    return df


pkd_df = fetch_pkd_for_dataframe(df.rename(columns={"molecule_chembl_id": "chembl_id"}))
pkd_df.head()

⚡ Fetching pKd: 100%|██████████| 1000/1000 [00:11<00:00, 86.37it/s]


Unnamed: 0,chembl_id,molecule_data,pKd,binding_free_energy,binding_free_energy_source
0,CHEMBL6329,"{'atc_classifications': [], 'availability_type...",,,missing
1,CHEMBL6328,"{'atc_classifications': [], 'availability_type...",,,missing
2,CHEMBL265667,"{'atc_classifications': [], 'availability_type...",,,missing
3,CHEMBL6362,"{'atc_classifications': [], 'availability_type...",,,missing
4,CHEMBL267864,"{'atc_classifications': [], 'availability_type...",,,missing


In [13]:
pkd_df['pKd'].value_counts(dropna=False)

pKd
NaN         949
9.260000      1
8.050000      1
4.750000      1
6.335000      1
8.450000      1
7.420000      1
7.109231      1
8.266000      1
6.220000      1
5.975000      1
4.795714      1
9.500000      1
8.533333      1
6.050000      1
5.380000      1
6.890000      1
5.540000      1
4.760000      1
7.350000      1
5.060000      1
8.204545      1
7.620000      1
5.110000      1
7.260000      1
9.349000      1
9.116667      1
7.163500      1
4.040000      1
8.080556      1
7.055882      1
8.740000      1
6.530000      1
4.920000      1
6.430000      1
7.300000      1
6.600000      1
8.100000      1
7.920000      1
4.010000      1
8.900000      1
6.402857      1
7.375000      1
7.200000      1
6.395000      1
5.730000      1
7.090000      1
7.250000      1
5.270000      1
6.817273      1
6.450000      1
7.093333      1
Name: count, dtype: int64

In [14]:
pkd_df['binding_free_energy'].value_counts(dropna=False)

binding_free_energy
NaN          949
12.630640      1
10.980200      1
6.479000       1
8.640940       1
11.525800      1
10.120880      1
9.696991       1
11.274824      1
8.484080       1
8.149900       1
6.541354       1
12.958000      1
11.639467      1
8.252200       1
7.338320       1
9.397960       1
7.556560       1
6.492640       1
10.025400      1
6.901840       1
11.191000      1
10.393680      1
6.970040       1
9.902640       1
12.752036      1
12.435133      1
9.771014       1
5.510560       1
11.021878      1
9.624224       1
11.921360      1
8.906920       1
6.710880       1
8.770520       1
9.957200       1
9.002400       1
11.048400      1
10.802880      1
5.469640       1
12.139600      1
8.733497       1
10.059500      1
9.820800       1
8.722780       1
7.815720       1
9.670760       1
9.889000       1
7.188280       1
9.298760       1
8.797800       1
9.675307       1
Name: count, dtype: int64

In [None]:
## Try TDC

# Correct import for multi-instance prediction:
from tdc.multi_pred import DTI

# Then, access the specific BindingDB dataset by name
data = DTI(name='BindingDB_Kd')  # For datasets with Kd units
tdc_df = data.get_data()
print(tdc_df.head())

Downloading...
100%|██████████| 54.4M/54.4M [00:08<00:00, 6.20MiB/s]
Loading...
Done!


    Drug_ID                                            Drug Target_ID  \
0  444607.0       Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1    P00918   
1    4316.0      COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1    P00918   
2    4293.0           NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1    P00918   
3    1611.0    NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2O    P00918   
4    1612.0  COc1ccc(N2CC(O)c3cc(S(N)(=O)=O)sc3S2(=O)=O)cc1    P00918   

                                              Target     Y  
0  MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...  0.46  
1  MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...  0.49  
2  MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...  0.83  
3  MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...  0.20  
4  MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...  0.16  


In [23]:

R = 1.987e-3  # kcal/mol·K
T = 298
tdc_df['binding_free_energy'] = R * T * np.log(tdc_df['Y'] * 1e-9)  # Kd (nM → M)

tdc_df.head()

Unnamed: 0,Drug_ID,Drug,Target_ID,Target,Y,binding_free_energy
0,444607.0,Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.46,-12.730587
1,4316.0,COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.49,-12.693178
2,4293.0,NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.83,-12.381115
3,1611.0,NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2O,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.2,-13.223775
4,1612.0,COc1ccc(N2CC(O)c3cc(S(N)(=O)=O)sc3S2(=O)=O)cc1,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0.16,-13.355904
