In [75]:
import rdkit
import rdkit.Chem as Chem
import efgs
from joblib import Parallel, delayed
import pandas as pd
from rdkit.Chem import AllChem, Descriptors
from rdkit.Chem.Scaffolds import MurckoScaffold
import pickle
import os
import math
from rdkit import DataStructs
import numpy as np

In [47]:
# Load the dataset
data_path = 'chembl_35_fg_scaf.csv'
if os.path.exists(data_path):
    chembl = pd.read_csv(data_path)
else:
    raise FileNotFoundError(f"Dataset not found at {data_path}")

In [48]:
chembl.shape

(2310847, 4)

In [49]:
# Convert each fgs entry to a list
chembl['fgs'] = chembl['fgs'].apply(lambda x: eval(x) if isinstance(x, str) else x)

In [50]:
chembl.shape

(2310847, 4)

In [51]:
# drop rows with empty fgs
chembl = chembl[chembl['fgs'].notna() & (chembl['fgs'].str.len() > 0)]
chembl.shape

(2310581, 4)

In [52]:
# Currate dataset using 4 functional groups
fgs_list = []
for fgs in chembl['fgs']:
    if isinstance(fgs, list):
        fgs_list.extend(fgs)
sorted_fgs = pd.Series(fgs_list).value_counts().head(20).index.tolist()

print(sorted_fgs)


['[Nar]', '[R][O][R]', 'O=[C]([R])[N]([R])[R]', '[F][R]', '[R][N]([R])[R]', '[Cl][R]', '[OH][Cal]', '[R][NH][R]', '[O]=[Car]', '[OH][Car]', '[Sar]', '[Oar]', 'O=[C]([R])[O][R]', 'O=[C](O)[R]', 'O=[S](=O)([R])[N]([R])[R]', 'C=C', 'O=[C]([R])[R]', '[R][S][R]', '[NH2][Car]', '[R][Br]']


In [53]:
print(pd.Series(fgs_list).value_counts().head(50))

[Nar]                               3188978
[R][O][R]                           1322479
O=[C]([R])[N]([R])[R]               1249000
[F][R]                               962042
[R][N]([R])[R]                       699809
[Cl][R]                              502836
[OH][Cal]                            485908
[R][NH][R]                           397184
[O]=[Car]                            305243
[OH][Car]                            282228
[Sar]                                281385
[Oar]                                275334
O=[C]([R])[O][R]                     259821
O=[C](O)[R]                          203151
O=[S](=O)([R])[N]([R])[R]            195132
C=C                                  172048
O=[C]([R])[R]                        148208
[R][S][R]                            132147
[NH2][Car]                           130185
[R][Br]                              104909
C#N                                   99300
O=C([N]([R])[R])[N]([R])[R]           93996
[NH2][Cal]                      

In [54]:
curation_fgs = ['[R][NH][R]', 'O=[C](O)[R]', 'C=C', '[NH2][Car]']
chembl_curated = chembl[chembl['fgs'].apply(lambda x: any(fg in x for fg in curation_fgs))]
chembl_curated.shape

(717003, 4)

In [55]:
# Convert SMILES to RDKit Mol objects
def smiles_to_mol(smiles):
    try:
        return Chem.MolFromSmiles(smiles)
    except Exception as e:
        print(f"Error converting SMILES {smiles}: {e}")
        return None
    

In [None]:
# Convert SMILES to RDKit Mol objects
chembl_curated['mol'] = chembl_curated['smiles'].apply(smiles_to_mol)
chembl_curated.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chembl_curated['mol'] = chembl_curated['smiles'].apply(smiles_to_mol)


In [58]:
# Get the molecular weights of the molecules
chembl_curated['mol_weight'] = chembl_curated['mol'].apply(lambda x: Descriptors.MolWt(x) if x else None)
chembl_curated.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chembl_curated['mol_weight'] = chembl_curated['mol'].apply(lambda x: Descriptors.MolWt(x) if x else None)


Unnamed: 0,smiles,inchikey,fgs,scaffolds,mol,mol_weight
4,Cc1cc(CC#N)cc(C)c1Nc1ccnc(Nc2ccc(C#N)cc2)n1,ZKXJVUBVZRGELZ-UHFFFAOYSA-N,"[C#N, [R][NH][R], [R][NH][R], C#N, [Nar], [Nar]]",c1ccc(Nc2ccnc(Nc3ccccc3)n2)cc1,<rdkit.Chem.rdchem.Mol object at 0x32234b140>,354.417
5,CC[C@H](C)[C@H](NS(=O)(=O)c1ccc(C)cc1)C(=O)N1C...,GDPHYVWXATZNEZ-YYWHXJBOSA-N,"[O=[S](=O)([R])[N]([R])[R], O=[C]([R])[N]([R])...",*=C(CNS(=*)(=*)c1ccccc1)N1CCCCC1,<rdkit.Chem.rdchem.Mol object at 0x32234ad50>,495.642
9,C[C@]12CC[C@H](O)C[C@]1(C=O)CC=C1CCC12,OXLGUMPVPZLFKY-FUEJHIMDSA-N,"[[OH][Cal], O=[CH][R], C=C]",C1=C2CCC2C2CCCCC2C1,<rdkit.Chem.rdchem.Mol object at 0x327e39f50>,220.312
13,CC(C)(C)SC[C@H](N)C(=O)O,VADVRIAPCDFQJU-YFKPBYRVSA-N,"[[R][S][R], [NH2][Cal], O=[C](O)[R]]",,<rdkit.Chem.rdchem.Mol object at 0x372a43060>,177.269
14,C=CCc1cc(/C=C2\CN(C)C/C(=C\c3ccc(O)c(CC=C)c3)C...,IWFTZFJVKREGEG-HOFJZWJUSA-N,"[C=C, C=CC(=O)C=C, [R][N]([R])[R], [OH][Car], ...",*=C1C(=Cc2ccccc2)CNCC1=Cc1ccccc1,<rdkit.Chem.rdchem.Mol object at 0x372a43bc0>,401.506


In [61]:
print(chembl_curated['mol_weight'].describe().astype(str))

count              717003.0
mean     425.38353725629304
std       138.7347772154064
min      28.053999999999995
25%       331.4850000000001
50%      407.49300000000034
75%      492.33100000000013
max      1004.3879999999997
Name: mol_weight, dtype: object


In [70]:
filtered = chembl_curated[
    (chembl_curated['mol_weight'] >= 330) & 
    (chembl_curated['mol_weight'] <= 500)
]
sampled = filtered.sample(n=100000, random_state=42)
sampled[['smiles', 'inchikey', 'fgs', 'scaffolds', 'mol_weight']].to_csv('chembl_35_fg_scaf_curated.csv', index=False)

In [71]:
from rdkit.Chem import rdFingerprintGenerator

mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
def mol_to_fingerprint(mol):
    if mol is None:
        return None
    return mfpgen.GetFingerprint(mol)

In [72]:
sampled['fingerprint'] = sampled['mol'].apply(mol_to_fingerprint)

In [76]:
def fp_to_array(fp):
    arr = np.zeros((1,), dtype=int)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

sampled['fingerprint_array'] = sampled['fingerprint'].apply(
    lambda x: fp_to_array(x) if x is not None else None
)

In [77]:
sampled.head()

Unnamed: 0,smiles,inchikey,fgs,scaffolds,mol,mol_weight,fingerprint,fingerprint_array
1422591,O=C(O)c1cc(NC(=O)c2ccccc2F)cc(NC(=O)c2ccccc2F)c1,ZASQESYJZCAESE-UHFFFAOYSA-N,"[O=[C](O)[R], O=[C]([R])[N]([R])[R], [F][R], O...",*=C(Nc1cccc(NC(=*)c2ccccc2)c1)c1ccccc1,<rdkit.Chem.rdchem.Mol object at 0x349d55620>,396.349,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
860799,COc1cc(/C=C/c2nnc(C(C)c3ccc(F)cc3)[nH]2)ccc1-n...,CRJMPGQRVVAATK-VZUCSPMQSA-N,"[[R][O][R], C=C, [F][R], [Nar], [Nar], [Nar], ...",C(=Cc1nnc(Cc2ccccc2)[nH]1)c1ccc(-n2ccnc2)cc1,<rdkit.Chem.rdchem.Mol object at 0x15bab4970>,403.461,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1314447,COc1cccc(Cc2c(-c3ccccc3)sc(N)c2C(=O)c2ccc(Cl)c...,UPRXVPZBXSLBPJ-UHFFFAOYSA-N,"[[R][O][R], [NH2][Car], O=[C]([R])[R], [Cl][R]...",*=C(c1ccccc1)c1csc(-c2ccccc2)c1Cc1ccccc1,<rdkit.Chem.rdchem.Mol object at 0x35d6798c0>,433.96,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
2221047,COc1c(-c2ccc(F)cc2C)cc(S(=O)(=O)Nc2ccc(F)c(C(=...,YXQVXPBOVNCAAO-UHFFFAOYSA-N,"[[R][O][R], [F][R], O=[S](=O)([R])[N]([R])[R],...",*=S(=*)(Nc1ccccc1)c1cc(-c2ccccc2)cc2ccccc12,<rdkit.Chem.rdchem.Mol object at 0x36c813b50>,483.492,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2230184,CNC(=O)c1cc(Oc2ccc3[nH]c(Nc4ccc(C(F)(F)F)cc4)n...,LZVVSWVIESQYLD-UHFFFAOYSA-N,"[O=[C]([R])[N]([R])[R], [R][O][R], [R][NH][R],...",c1ccc(Nc2nc3cc(Oc4ccncc4)ccc3[nH]2)cc1,<rdkit.Chem.rdchem.Mol object at 0x36c83f610>,427.386,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [80]:
# Give each molecule a 4 bit array indicating the presence of each functional group
def fg_to_array(fgs, fg_list):
    fg_array = np.zeros(len(fg_list), dtype=int)
    if isinstance(fgs, list):
        for fg in fgs:
            if fg in fg_list:
                fg_array[fg_list.index(fg)] = 1
    return fg_array

sampled['fg_array'] = sampled['fgs'].apply(lambda x: fg_to_array(x, curation_fgs))

In [81]:
sampled.head()

Unnamed: 0,smiles,inchikey,fgs,scaffolds,mol,mol_weight,fingerprint,fingerprint_array,fg_array
1422591,O=C(O)c1cc(NC(=O)c2ccccc2F)cc(NC(=O)c2ccccc2F)c1,ZASQESYJZCAESE-UHFFFAOYSA-N,"[O=[C](O)[R], O=[C]([R])[N]([R])[R], [F][R], O...",*=C(Nc1cccc(NC(=*)c2ccccc2)c1)c1ccccc1,<rdkit.Chem.rdchem.Mol object at 0x349d55620>,396.349,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0]"
860799,COc1cc(/C=C/c2nnc(C(C)c3ccc(F)cc3)[nH]2)ccc1-n...,CRJMPGQRVVAATK-VZUCSPMQSA-N,"[[R][O][R], C=C, [F][R], [Nar], [Nar], [Nar], ...",C(=Cc1nnc(Cc2ccccc2)[nH]1)c1ccc(-n2ccnc2)cc1,<rdkit.Chem.rdchem.Mol object at 0x15bab4970>,403.461,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0]"
1314447,COc1cccc(Cc2c(-c3ccccc3)sc(N)c2C(=O)c2ccc(Cl)c...,UPRXVPZBXSLBPJ-UHFFFAOYSA-N,"[[R][O][R], [NH2][Car], O=[C]([R])[R], [Cl][R]...",*=C(c1ccccc1)c1csc(-c2ccccc2)c1Cc1ccccc1,<rdkit.Chem.rdchem.Mol object at 0x35d6798c0>,433.96,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1]"
2221047,COc1c(-c2ccc(F)cc2C)cc(S(=O)(=O)Nc2ccc(F)c(C(=...,YXQVXPBOVNCAAO-UHFFFAOYSA-N,"[[R][O][R], [F][R], O=[S](=O)([R])[N]([R])[R],...",*=S(=*)(Nc1ccccc1)c1cc(-c2ccccc2)cc2ccccc12,<rdkit.Chem.rdchem.Mol object at 0x36c813b50>,483.492,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0]"
2230184,CNC(=O)c1cc(Oc2ccc3[nH]c(Nc4ccc(C(F)(F)F)cc4)n...,LZVVSWVIESQYLD-UHFFFAOYSA-N,"[O=[C]([R])[N]([R])[R], [R][O][R], [R][NH][R],...",c1ccc(Nc2nc3cc(Oc4ccncc4)ccc3[nH]2)cc1,<rdkit.Chem.rdchem.Mol object at 0x36c83f610>,427.386,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0]"
