In [None]:
import pandas as pd
from rdkit.Chem import Descriptors
import pickle
import os
import numpy as np

from fg_funcs import mol_to_fingerprint, safe_mol_from_smiles, fg_to_array, fp_to_array

In [3]:
# Load the dataset
data_path = 'chembl_35_fg_scaf.csv'
if os.path.exists(data_path):
    chembl = pd.read_csv(data_path)
else:
    raise FileNotFoundError(f"Dataset not found at {data_path}")

In [None]:
# Convert each fgs entry to a list
chembl['fgs'] = chembl['fgs'].apply(lambda x: eval(x) if isinstance(x, str) else x)

# drop rows with empty fgs
chembl = chembl[chembl['fgs'].notna() & (chembl['fgs'].str.len() > 0)]

In [8]:
# Currate dataset using 4 functional groups
fgs_list = []
for fgs in chembl['fgs']:
    if isinstance(fgs, list):
        fgs_list.extend(fgs)
sorted_fgs = pd.Series(fgs_list).value_counts().head(20).index.tolist()

print(sorted_fgs)


['[Nar]', '[R][O][R]', 'O=[C]([R])[N]([R])[R]', '[F][R]', '[R][N]([R])[R]', '[Cl][R]', '[OH][Cal]', '[R][NH][R]', '[O]=[Car]', '[OH][Car]', '[Sar]', '[Oar]', 'O=[C]([R])[O][R]', 'O=[C](O)[R]', 'O=[S](=O)([R])[N]([R])[R]', 'C=C', 'O=[C]([R])[R]', '[R][S][R]', '[NH2][Car]', '[R][Br]']


In [9]:
print(pd.Series(fgs_list).value_counts().head(50))

[Nar]                               3188978
[R][O][R]                           1322479
O=[C]([R])[N]([R])[R]               1249000
[F][R]                               962042
[R][N]([R])[R]                       699809
[Cl][R]                              502836
[OH][Cal]                            485908
[R][NH][R]                           397184
[O]=[Car]                            305243
[OH][Car]                            282228
[Sar]                                281385
[Oar]                                275334
O=[C]([R])[O][R]                     259821
O=[C](O)[R]                          203151
O=[S](=O)([R])[N]([R])[R]            195132
C=C                                  172048
O=[C]([R])[R]                        148208
[R][S][R]                            132147
[NH2][Car]                           130185
[R][Br]                              104909
C#N                                   99300
O=C([N]([R])[R])[N]([R])[R]           93996
[NH2][Cal]                      

In [10]:
curation_fgs = ['[R][NH][R]', 'O=[C](O)[R]', 'C=C', '[NH2][Car]']
chembl_curated = chembl[chembl['fgs'].apply(lambda x: any(fg in x for fg in curation_fgs))]
chembl_curated.shape

(717003, 4)

In [11]:
# Convert SMILES to RDKit Mol objects
chembl_curated['mol'] = chembl_curated['smiles'].apply(safe_mol_from_smiles)
chembl_curated.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chembl_curated['mol'] = chembl_curated['smiles'].apply(safe_mol_from_smiles)


Unnamed: 0,smiles,inchikey,fgs,scaffolds,mol
4,Cc1cc(CC#N)cc(C)c1Nc1ccnc(Nc2ccc(C#N)cc2)n1,ZKXJVUBVZRGELZ-UHFFFAOYSA-N,"[C#N, [R][NH][R], [R][NH][R], C#N, [Nar], [Nar]]",c1ccc(Nc2ccnc(Nc3ccccc3)n2)cc1,<rdkit.Chem.rdchem.Mol object at 0x31ed79690>
5,CC[C@H](C)[C@H](NS(=O)(=O)c1ccc(C)cc1)C(=O)N1C...,GDPHYVWXATZNEZ-YYWHXJBOSA-N,"[O=[S](=O)([R])[N]([R])[R], O=[C]([R])[N]([R])...",*=C(CNS(=*)(=*)c1ccccc1)N1CCCCC1,<rdkit.Chem.rdchem.Mol object at 0x31ed78ac0>
9,C[C@]12CC[C@H](O)C[C@]1(C=O)CC=C1CCC12,OXLGUMPVPZLFKY-FUEJHIMDSA-N,"[[OH][Cal], O=[CH][R], C=C]",C1=C2CCC2C2CCCCC2C1,<rdkit.Chem.rdchem.Mol object at 0x31ed79460>
13,CC(C)(C)SC[C@H](N)C(=O)O,VADVRIAPCDFQJU-YFKPBYRVSA-N,"[[R][S][R], [NH2][Cal], O=[C](O)[R]]",,<rdkit.Chem.rdchem.Mol object at 0x31ed793f0>
14,C=CCc1cc(/C=C2\CN(C)C/C(=C\c3ccc(O)c(CC=C)c3)C...,IWFTZFJVKREGEG-HOFJZWJUSA-N,"[C=C, C=CC(=O)C=C, [R][N]([R])[R], [OH][Car], ...",*=C1C(=Cc2ccccc2)CNCC1=Cc1ccccc1,<rdkit.Chem.rdchem.Mol object at 0x31ed79380>


In [12]:
# Get the molecular weights of the molecules
chembl_curated['mol_weight'] = chembl_curated['mol'].apply(lambda x: Descriptors.MolWt(x) if x else None)
chembl_curated.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chembl_curated['mol_weight'] = chembl_curated['mol'].apply(lambda x: Descriptors.MolWt(x) if x else None)


Unnamed: 0,smiles,inchikey,fgs,scaffolds,mol,mol_weight
4,Cc1cc(CC#N)cc(C)c1Nc1ccnc(Nc2ccc(C#N)cc2)n1,ZKXJVUBVZRGELZ-UHFFFAOYSA-N,"[C#N, [R][NH][R], [R][NH][R], C#N, [Nar], [Nar]]",c1ccc(Nc2ccnc(Nc3ccccc3)n2)cc1,<rdkit.Chem.rdchem.Mol object at 0x31ed79690>,354.417
5,CC[C@H](C)[C@H](NS(=O)(=O)c1ccc(C)cc1)C(=O)N1C...,GDPHYVWXATZNEZ-YYWHXJBOSA-N,"[O=[S](=O)([R])[N]([R])[R], O=[C]([R])[N]([R])...",*=C(CNS(=*)(=*)c1ccccc1)N1CCCCC1,<rdkit.Chem.rdchem.Mol object at 0x31ed78ac0>,495.642
9,C[C@]12CC[C@H](O)C[C@]1(C=O)CC=C1CCC12,OXLGUMPVPZLFKY-FUEJHIMDSA-N,"[[OH][Cal], O=[CH][R], C=C]",C1=C2CCC2C2CCCCC2C1,<rdkit.Chem.rdchem.Mol object at 0x31ed79460>,220.312
13,CC(C)(C)SC[C@H](N)C(=O)O,VADVRIAPCDFQJU-YFKPBYRVSA-N,"[[R][S][R], [NH2][Cal], O=[C](O)[R]]",,<rdkit.Chem.rdchem.Mol object at 0x31ed793f0>,177.269
14,C=CCc1cc(/C=C2\CN(C)C/C(=C\c3ccc(O)c(CC=C)c3)C...,IWFTZFJVKREGEG-HOFJZWJUSA-N,"[C=C, C=CC(=O)C=C, [R][N]([R])[R], [OH][Car], ...",*=C1C(=Cc2ccccc2)CNCC1=Cc1ccccc1,<rdkit.Chem.rdchem.Mol object at 0x31ed79380>,401.506


In [None]:
filtered = chembl_curated[
    (chembl_curated['mol_weight'] <= 500)
]

filtered[['smiles', 'inchikey', 'fgs', 'scaffolds', 'mol_weight']].to_csv('chembl_35_fg_scaf_curated.csv', index=False)

In [None]:
# Create fingerprints and functional group arrays
filtered['fingerprint'] = filtered['mol'].apply(mol_to_fingerprint)

filtered['fingerprint_array'] = filtered['fingerprint'].apply(
    lambda x: fp_to_array(x) if x is not None else None
)

filtered['fg_array'] = filtered['fgs'].apply(lambda x: fg_to_array(x, curation_fgs))

# Convert the fingerprint to numpy arrays
filtered['fingerprint_array'] = filtered['fingerprint_array'].apply(lambda x: x if isinstance(x, np.ndarray) else np.zeros((2048,), dtype=int))

# Convert the fg_array to numpy arrays
filtered['fg_array'] = filtered['fg_array'].apply(lambda x: x if isinstance(x, np.ndarray) else np.zeros((len(curation_fgs),), dtype=int))

# Remove molecules with more than 1 of the functional groups
filtered['fg_array'] = filtered['fg_array'].apply(lambda x: x if np.sum(x) <= 1 else np.zeros((len(curation_fgs),), dtype=int))

# Remove any rows where fg_array is all zeros
filtered = filtered[filtered['fg_array'].apply(lambda x: np.any(x))]

# Sample 100,000 molecules
sampled = filtered.sample(n=100000, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['fingerprint'] = filtered['mol'].apply(mol_to_fingerprint)


In [21]:
# Save the processed data to a pickle file
output_file = 'chembl_35_fg_scaf_curated.pkl'
with open(output_file, 'wb') as f:
    pickle.dump(sampled, f)
print(f"Processed data saved to {output_file}")

Processed data saved to chembl_35_fg_scaf_curated.pkl
