In [1]:
import os
import glob
import pandas as pd
import tqdm 

def load_all_chembl_activities(base_dir="/mnt/katritch_lab2/aoxu/VLS_benchmark/chembl_affinity/"):
    """Concatenate every per-target filtered ChEMBL activity table under ``base_dir``.

    Files are matched by ``<base_dir>/uniprot_*/*_chembl_activities_filtered.parquet``.
    Each table gets a ``uniprot_id`` column holding the part of its parent
    directory name after the first underscore (``uniprot_P60842`` -> ``P60842``).

    Parameters
    ----------
    base_dir : str
        Directory containing the ``uniprot_*`` sub-directories.

    Returns
    -------
    pandas.DataFrame
        All tables stacked with a fresh integer index, or an empty DataFrame
        when no file matches the pattern.
    """
    pattern = os.path.join(base_dir, "uniprot_*", "*_chembl_activities_filtered.parquet")
    # glob yields paths in arbitrary filesystem order; sorting makes the row
    # order of the combined frame reproducible from run to run.
    paths = sorted(glob.glob(pattern))

    dfs = []
    for path in tqdm.tqdm(paths):
        uniprot_dir = os.path.basename(os.path.dirname(path))   # e.g. "uniprot_P60842"
        uniprot_id = uniprot_dir.split("_", 1)[1]               # "P60842"

        df = pd.read_parquet(path)
        df["uniprot_id"] = uniprot_id
        dfs.append(df)

    if not dfs:
        return pd.DataFrame()

    return pd.concat(dfs, ignore_index=True)

# Load and stack all per-target activity tables from the default base directory.
df_all = load_all_chembl_activities()


100%|██████████| 837/837 [00:05<00:00, 141.25it/s]


In [2]:
# Distinct UniProt accessions present in the combined activity table.
contained_uniprotid = df_all["uniprot_id"].unique().tolist()

# Pdbids

In [3]:
# Chain-level PDB -> UniProt mapping table (presumably the SIFTS
# `pdb_chain_uniprot` export — confirm provenance). Lines starting with "#"
# are skipped and the literal string "None" is read as missing.
uniprot_pdbfiles_df = pd.read_csv(
    "pdb_chain_uniprot.tsv",
    sep="\t",
    comment="#",
    na_values=["None"],
    compression="infer",
    dtype={
        "PDB": "string",
        "CHAIN": "string",
        "SP_PRIMARY": "string",
    },
    # Infer each remaining column's dtype from the whole file rather than per
    # chunk, so columns that mix numbers and text do not raise a DtypeWarning.
    low_memory=False,
)
# Keep only the targets that have ChEMBL activity data. `.copy()` detaches the
# result from the full table so later column assignments do not write to a slice.
uniprot_pdbfiles_df = uniprot_pdbfiles_df.loc[
    uniprot_pdbfiles_df["SP_PRIMARY"].isin(contained_uniprotid)
].copy()

  uniprot_pdbfiles_df = pd.read_csv(


In [4]:
# Inspect the column names of the mapping table.
uniprot_pdbfiles_df.columns

Index(['PDB', 'CHAIN', 'SP_PRIMARY', 'RES_BEG', 'RES_END', 'PDB_BEG',
       'PDB_END', 'SP_BEG', 'SP_END'],
      dtype='object')

# Mapping the uniprotid to pdbid

In [5]:
# assuming df has columns: 'PDB', 'CHAIN', 'SP_PRIMARY'

# Normalize both ID columns to upper case. `.assign` builds a new frame rather
# than writing columns into a frame that came from boolean filtering, which
# avoids pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
uniprot_pdbfiles_df = uniprot_pdbfiles_df.assign(
    PDB=uniprot_pdbfiles_df["PDB"].str.upper(),
    SP_PRIMARY=uniprot_pdbfiles_df["SP_PRIMARY"].str.upper(),
)

# {UniProt accession: set of PDB IDs}; rows missing either ID are ignored.
uniprot_to_pdb = (
    uniprot_pdbfiles_df.dropna(subset=["SP_PRIMARY", "PDB"])
      .groupby("SP_PRIMARY")["PDB"]
      .apply(set)
      .to_dict()
)


# CCD and smi

In [6]:
# Tab-separated, header-less table read as three columns:
# SMILES, component ID, component name.
ccd_smi_df = pd.read_csv(
    "Components-smiles-stereo-oe.smi",
    sep="\t",
    names=["Smiles", "CID", "name"],
    # Treat only empty fields as missing. With pandas' default NA strings the
    # component ID "NA" (sodium ion) would be read as NaN instead of text and
    # could never be looked up by ID.
    keep_default_na=False,
    na_values=[""],
)

In [7]:
# {component ID: SMILES}; if an ID occurs more than once, its last SMILES wins.
dict_ccd_smi = {
    cid: smiles
    for cid, smiles in zip(ccd_smi_df["CID"], ccd_smi_df["Smiles"])
}

# PDB ID to set of CCD ligand IDs

In [8]:
# Tab-separated ligand annotation table, read with pandas' default settings.
# NOTE(review): default NA parsing turns a literal "NA" ligand ID into NaN —
# confirm whether that matters for this file.
ccd_pdbid_df = pd.read_csv("interacting_chains_with_ligand_functions.tsv", sep="\t")

In [9]:

# {upper-case PDB ID: set of ligand IDs}; rows missing either field are skipped.
dict_pdbid_ccd = {
    pdbid.upper(): set(ligand_ids)
    for pdbid, ligand_ids in (
        ccd_pdbid_df.dropna(subset=["PDBID", "LigandID"])
                    .groupby("PDBID")["LigandID"]
    )
}

# Clustering the active ligands by similarity to pdb ligand
- [ ] within each uniprotid get all the actives and all CCD smiles in corresponding pdbid
- [ ] if an active's highest similarity to any PDB ligand is below the chosen threshold (0.1, 0.3, 0.5, 0.7, or 0.9), we reject that active from consideration.

In [12]:
import itertools
import tqdm
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.Chem import AllChem, rdFingerprintGenerator
from rdkit import DataStructs

# Morgan bit-vector fingerprint generator shared by every fingerprint below.
morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)

threshold_distance = 0.3  # you choose; NOTE(review): not referenced in this cell

uniprot_min_dist_dict = {}        # {uniprot_id: {smi_active: min_distance}}


def _fingerprint_valid_smiles(smiles_iterable):
    """Return ``[(smiles, fingerprint), ...]`` for the entries RDKit can parse.

    Non-string entries (e.g. NaN from a missing value) and SMILES for which
    ``Chem.MolFromSmiles`` returns ``None`` are skipped; input order is kept.
    """
    smi_fp_pairs = []
    for smi in smiles_iterable:
        if not isinstance(smi, str):
            continue
        mol = Chem.MolFromSmiles(smi)
        if mol is not None:
            smi_fp_pairs.append((smi, morgan_gen.GetFingerprint(mol)))
    return smi_fp_pairs


# Group the actives once instead of re-scanning `df_all` for every protein.
# `unique()` keeps first-appearance order, so results are emitted in the same
# order as a per-protein boolean filter would give.
actives_by_uniprot = (
    df_all.dropna(subset=["canonical_smiles"])
          .groupby("uniprot_id", sort=False)["canonical_smiles"]
          .unique()
)

for i_uniprotid, i_list_pdbid in tqdm.tqdm(uniprot_to_pdb.items()):

    # Ligand IDs seen in any PDB entry of this protein; PDB IDs that have no
    # ligand annotation are skipped.
    i_ccd_ids = set().union(*(dict_pdbid_ccd.get(i_pdbid, ()) for i_pdbid in i_list_pdbid))

    # Distinct reference SMILES for those ligands; IDs without a SMILES entry
    # are skipped. Duplicates cannot change a maximum, so they are dropped.
    i_ref_smiles = {dict_ccd_smi[i_ccd] for i_ccd in i_ccd_ids if i_ccd in dict_ccd_smi}

    pdb_fps = [fp for _, fp in _fingerprint_valid_smiles(i_ref_smiles)]
    if not pdb_fps:
        # No usable reference ligand: the protein gets an empty mapping, so
        # none of its actives receive a distance.
        uniprot_min_dist_dict[i_uniprotid] = {}
        continue

    act_pairs = _fingerprint_valid_smiles(actives_by_uniprot.get(i_uniprotid, ()))

    # Min Tanimoto distance per active = 1 - (highest similarity to any
    # reference ligand). `pdb_fps` is non-empty here, so `max` is always defined.
    uniprot_min_dist_dict[i_uniprotid] = {
        smi_a: 1.0 - max(DataStructs.BulkTanimotoSimilarity(fp_a, pdb_fps))
        for smi_a, fp_a in act_pairs
    }




  0%|          | 0/837 [00:00<?, ?it/s]

  1%|          | 10/837 [00:00<00:50, 16.45it/s][20:10:32] Explicit valence for atom # 0 Be, 4, is greater than permitted
  3%|▎         | 23/837 [00:01<00:59, 13.71it/s][20:10:34] Explicit valence for atom # 24 N, 4, is greater than permitted
[20:10:34] Explicit valence for atom # 25 N, 4, is greater than permitted
  6%|▋         | 53/837 [00:03<01:04, 12.19it/s][20:10:36] Explicit valence for atom # 54 N, 5, is greater than permitted
[20:10:36] Explicit valence for atom # 54 N, 5, is greater than permitted
  8%|▊         | 66/837 [00:04<00:48, 15.91it/s][20:10:37] Explicit valence for atom # 0 B, 4, is greater than permitted
 11%|█         | 90/837 [00:06<00:39, 18.69it/s][20:10:38] Explicit valence for atom # 0 B, 6, is greater than permitted
[20:10:38] Explicit valence for atom # 0 B, 4, is greater than permitted
 11%|█         | 92/837 [00:06<00:44, 16.88it/s][20:10:38] Explicit valence for atom # 0 O, 2, is greater than permitted
[20:10:38] Explicit valence for atom # 0 O, 2, is 

In [28]:
import pandas as pd

# Flatten {uniprot_id: {smiles: distance}} into one (uniprot_id, smiles, distance)
# record per active, then tabulate.
rows = [
    (uid, smi, dist)
    for uid, per_active in uniprot_min_dist_dict.items()
    for smi, dist in per_active.items()
]

df_uniprotid_smiactives_mindist = pd.DataFrame(
    data=rows,
    columns=["uniprot_id", "smi_active", "min_distance_from_pdb_ligand"],
)


In [31]:
# Persist the distance table; the row index is written as the first CSV column.
df_uniprotid_smiactives_mindist.to_csv(
    "/mnt/katritch_lab2/aoxu/VLS_benchmark/df_uniprotid_smiactives_mindist.csv",
    index=True,
)

# Active SMILES that are not supported by a PDB structure will have a large distance

In [29]:
# Show the actives whose distance to the closest PDB ligand exceeds 0.9.
df_uniprotid_smiactives_mindist[
    df_uniprotid_smiactives_mindist["min_distance_from_pdb_ligand"].gt(0.9)
]

Unnamed: 0,uniprot_id,smi_active,min_distance_from_pdb_ligand
645,O00206,Cc1nn(Cc2ccccc2Cl)c(C)c1CN(C)CC(O)CCOc1cccc2[n...,0.901235
649,O00206,COc1ccccc1OCCNCC(O)COc1cccc2[nH]c3ccccc3c12,0.907692
651,O00206,OC(CNc1ccc(-c2ccccc2)cc1)COc1cccc2[nH]c3ccccc3c12,0.903226
652,O00206,OC(CNc1ccc2c(c1)-c1ccccc1C2)COc1cccc2[nH]c3ccc...,0.929577
654,O00206,OC(CNc1ccc2ccccc2c1)COc1cccc2[nH]c3ccccc3c12,0.923077
...,...,...,...
279966,Q9Y6K1,O=C1[C@@H]2CC=CC[C@@H]2C(c2cccc(OCCCCCOc3ccc(-...,0.916667
279967,Q9Y6K1,O=C1[C@@H]2CC=CC[C@@H]2C(c2cccc(OC/C=C\COc3ccc...,0.917431
279968,Q9Y6K1,O=C1[C@@H]2CC=CC[C@@H]2C(c2cccc(OC/C=C/COc3ccc...,0.917431
279969,Q9Y6K1,O=C(COc1cccc(C2=NN(C3CCCCCC3)C(=O)[C@@H]3CC=CC...,0.900901
