In [13]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# In notebooks __file__ is not defined — use the notebook cwd or a relative path instead
file_path = Path.cwd().parent / "data" / "bindingdb_data" / "BindingDB_All.tsv"
# fallback if the file is directly under data/
if not file_path.exists():
    file_path = Path.cwd().parent / "data" / "BindingDB_All.tsv"

assert file_path.exists(), f"{file_path} not found"


In [6]:
chunksize = 100000  # adjust based on memory
filtered_chunks = []

for i, chunk in enumerate(pd.read_csv(file_path, sep="\t", chunksize=chunksize, low_memory=False)):
    print(f"Processing chunk {i} with {len(chunk)} rows...")
    
    # Select only relevant columns (skip if missing)
    cols_to_keep = ['Target Name', 'Ligand SMILES', 'Ki (nM)', 'Kd (nM)', 'IC50 (nM)', 'Reference PMID', 'UniProt ID']
    available_cols = [c for c in cols_to_keep if c in chunk.columns]
    chunk = chunk[available_cols]
    
    filtered_chunks.append(chunk)

# Concatenate all filtered chunks
df = pd.concat(filtered_chunks, ignore_index=True)

print(f"\n✅ Done! Final dataset: {len(df)} rows, {len(df.columns)} columns")

Processing chunk 0 with 100000 rows...
Processing chunk 1 with 100000 rows...
Processing chunk 2 with 100000 rows...
Processing chunk 3 with 100000 rows...
Processing chunk 4 with 100000 rows...
Processing chunk 5 with 100000 rows...
Processing chunk 6 with 100000 rows...
Processing chunk 7 with 100000 rows...
Processing chunk 8 with 100000 rows...
Processing chunk 9 with 100000 rows...
Processing chunk 10 with 100000 rows...
Processing chunk 11 with 100000 rows...
Processing chunk 12 with 100000 rows...
Processing chunk 13 with 100000 rows...
Processing chunk 14 with 100000 rows...
Processing chunk 15 with 100000 rows...
Processing chunk 16 with 100000 rows...
Processing chunk 17 with 100000 rows...
Processing chunk 18 with 100000 rows...
Processing chunk 19 with 100000 rows...
Processing chunk 20 with 100000 rows...
Processing chunk 21 with 100000 rows...
Processing chunk 22 with 100000 rows...
Processing chunk 23 with 100000 rows...
Processing chunk 24 with 100000 rows...
Processing

In [12]:
# Rows where Kd or IC50 are NOT NaN
df_valid = df[df['Kd (nM)'].notna()]

df_valid.head()

Unnamed: 0,Target Name,Ligand SMILES,Ki (nM),Kd (nM),IC50 (nM)
18378,Endochitinase B1,CC(=O)CCCCN1C(=O)c2c(ncn2C)N(C1=O)C,,43000,
18379,Endochitinase B1 [A217G],CC(=O)CCCCN1C(=O)c2c(ncn2C)N(C1=O)C,,77000,
18380,Endochitinase B1 [D175A],CC(=O)CCCCN1C(=O)c2c(ncn2C)N(C1=O)C,,100000,
18381,Endochitinase B1 [D246A],CC(=O)CCCCN1C(=O)c2c(ncn2C)N(C1=O)C,,62000,
18382,Endochitinase B1 [E322A],CC(=O)CCCCN1C(=O)c2c(ncn2C)N(C1=O)C,,45000,


In [21]:
# === STEP 1: Ensure numeric ===
df['Kd (nM)'] = pd.to_numeric(df['Kd (nM)'], errors='coerce')
df['IC50 (nM)'] = pd.to_numeric(df['IC50 (nM)'], errors='coerce')

# === STEP 2: Keep only rows where at least one is present ===
df_filtered = df[df['Kd (nM)'].notna() | df['IC50 (nM)'].notna()].copy()

# === STEP 3: Min-max scaling individually ===
# Kd
kd_min = df_filtered['Kd (nM)'].min(skipna=True)
kd_max = df_filtered['Kd (nM)'].max(skipna=True)
df_filtered.loc[df_filtered['Kd (nM)'].notna(), 'Kd_scaled'] = (
    (df_filtered.loc[df_filtered['Kd (nM)'].notna(), 'Kd (nM)'] - kd_min) / (kd_max - kd_min)
)

# IC50
ic50_min = df_filtered['IC50 (nM)'].min(skipna=True)
ic50_max = df_filtered['IC50 (nM)'].max(skipna=True)
df_filtered.loc[df_filtered['IC50 (nM)'].notna(), 'IC50_scaled'] = (
    (df_filtered.loc[df_filtered['IC50 (nM)'].notna(), 'IC50 (nM)'] - ic50_min) / (ic50_max - ic50_min)
)

df_filtered['Kd_scaled_log'] = np.log10(df_filtered['Kd_scaled'] + 1e-9)
df_filtered['IC50_scaled_log'] = np.log10(df_filtered['IC50_scaled'] + 1e-9)


df_filtered.head()

Unnamed: 0,Target Name,Ligand SMILES,Ki (nM),Kd (nM),IC50 (nM),Kd_log,IC50_log,Kd_scaled,IC50_scaled,Kd_scaled_log,IC50_scaled_log
144,Galactokinase,O=C1CCCC2=C1C1(CCS(=O)(=O)C1)N=C(Nc1nc3ccccc3o...,,,6676.9,,3.824575,,6.6769e-11,,-8.97193
180,Dimer of Gag-Pol polyprotein [489-587],CC(C)[C@H](NC(C)=O)C(=O)N[C@@H](Cc1ccccc1)[C@@...,,,8.5,,0.929419,,8.5e-14,,-8.999963
181,Dimer of Gag-Pol polyprotein [489-587],CCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccccc1)[...,,,177.0,,2.247973,,1.77e-12,,-8.999232
183,Dimer of Gag-Pol polyprotein [489-587],COCCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccccc1...,,,164.0,,2.214844,,1.64e-12,,-8.999288
184,Dimer of Gag-Pol polyprotein [489-587],COCCOCCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccc...,,,67.0,,1.826075,,6.7e-13,,-8.999709
