In [116]:
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
from tqdm import tqdm

# Read the BindingDB table; only its 'SMILES' column is used for the check.
df = pd.read_csv('BindingDB_excluding_bad_conformers.csv')

# Positional tail of the SMILES column, starting at row 138122.
# NOTE: despite the variable name, nothing is de-duplicated here.
smiles = df['SMILES']
unique_smiles = smiles.iloc[138122:]

In [44]:
# Preview the slice that will be checked (pandas truncates the display of a long Series).
unique_smiles

138122     CC(C)(C)NC(=O)C1CCC2C3CCC4NC(=O)C=CC4(C)C3CCC12C
138123    CC(=O)N[C@@H]1[C@H](OCCCCCCN)O[C@@H](CO)[C@@H]...
138124    CC(=O)N[C@@H]1[C@H](OCCCCCCNC(=O)CCCC[C@@H]2SC...
138125    CC(=O)N[C@@H]1[C@H](OCCCCCCN)O[C@@H](CO)[C@@H]...
138126    CC(=O)N[C@@H]1[C@H](OCCCCCCNC(=O)CCCC[C@@H]2SC...
                                ...                        
458849                    CCOc1ccc(CCc2cc(OC)cc(OC)c2)cc1OC
458850                     COc1cc(O)cc(CCc2ccc3c(c2)OCO3)c1
458851                        COc1ccc(CCc2cc(O)cc(OC)c2)cc1
458852                     COc1cc(O)cc(CCc2ccc(O)c(OC)c2)c1
458853                     CN(C)CCCN1c2ccccc2Sc2ccc(Cl)cc21
Name: SMILES, Length: 320732, dtype: object

In [11]:
def check_valid_conformers(smiles_list, seed=0):
    """Split SMILES strings by whether RDKit can embed a 3D conformer for them.

    Each entry is parsed with ``Chem.MolFromSmiles``, explicit hydrogens are
    added, and ``AllChem.EmbedMolecule`` is run once with a fixed random seed.
    An entry is reported as invalid when parsing returns ``None``, when the
    embedding returns ``-1``, or when any step raises an ordinary exception.

    Args:
        smiles_list: Iterable of SMILES strings (e.g. a list or pandas Series).
        seed: Passed to ``EmbedMolecule`` as ``randomSeed`` so that repeated
            runs are reproducible.

    Returns:
        A ``(valid_smiles, invalid_smiles)`` tuple of lists.  Every input
        entry lands in exactly one of the two lists, in input order.
    """
    valid_smiles = []
    invalid_smiles = []
    for smiles in tqdm(smiles_list, desc='Checking conformers'):
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                embedded = False
            else:
                mol = Chem.AddHs(mol)
                # EmbedMolecule returns -1 when no conformer could be generated.
                embedded = AllChem.EmbedMolecule(mol, randomSeed=seed) != -1
        except Exception:
            # Best-effort on purpose: any RDKit failure marks the entry as
            # invalid.  `Exception` rather than a bare `except` so that
            # KeyboardInterrupt can still stop a long-running check.
            embedded = False
        (valid_smiles if embedded else invalid_smiles).append(smiles)
    return valid_smiles, invalid_smiles



In [45]:
def check_valid_conformers(smiles_list, seed=0,
                           valid_path='valid_smile.txt',
                           invalid_path='invalid_smile.txt'):
    """Classify SMILES by conformer embeddability and log the results to disk.

    Each entry is parsed with ``Chem.MolFromSmiles``, explicit hydrogens are
    added, and ``AllChem.EmbedMolecule`` is run once with a fixed random seed.
    An entry is invalid when parsing returns ``None``, when the embedding
    returns ``-1``, or when any step raises an ordinary exception.  Results
    are written to the two text files as they are produced (one SMILES per
    line), so partial progress is on disk if the run stops early.

    Args:
        smiles_list: Iterable of SMILES strings (e.g. a list or pandas Series).
        seed: Passed to ``EmbedMolecule`` as ``randomSeed`` for reproducibility.
        valid_path: File that receives the valid SMILES; it is truncated first.
        invalid_path: File that receives the invalid SMILES; it is truncated
            first.

    Returns:
        A ``(valid_smiles, invalid_smiles)`` tuple of lists.  Every input
        entry is recorded exactly once, in input order.
    """
    valid_smiles = []
    invalid_smiles = []
    # `with` closes (and therefore flushes) both files even when the loop is
    # interrupted or raises.
    with open(valid_path, 'w') as valid_smiles_file, \
            open(invalid_path, 'w') as invalid_smiles_file:
        for smiles in tqdm(smiles_list, desc='Checking conformers'):
            try:
                mol = Chem.MolFromSmiles(smiles)
                if mol is None:
                    embedded = False
                else:
                    mol = Chem.AddHs(mol)
                    # EmbedMolecule returns -1 when no conformer could be generated.
                    embedded = AllChem.EmbedMolecule(mol, randomSeed=seed) != -1
            except Exception:
                # Best-effort on purpose: any RDKit failure marks the entry as
                # invalid.  `Exception` rather than a bare `except` so that
                # KeyboardInterrupt can still stop a long-running check.
                embedded = False

            # Record the outcome in one place so an entry can never be
            # appended or written twice.
            if embedded:
                valid_smiles.append(smiles)
                valid_smiles_file.write(str(smiles) + '\n')
            else:
                invalid_smiles.append(smiles)
                invalid_smiles_file.write(str(smiles) + '\n')

    return valid_smiles, invalid_smiles

In [None]:
# Run the conformer check over every entry of the `unique_smiles` slice.
# NOTE: slow -- the progress bar captured for this cell estimates roughly
# 12.5 hours for the 320,732 entries.
valid_smiles, invalid_smiles = check_valid_conformers(unique_smiles)


Checking conformers:   0%|        | 1026/320732 [02:15<12:33:54,  7.07it/s]

In [21]:
# Report how many SMILES failed and how many passed the check,
# one count per line (invalid first, then valid).
print(len(invalid_smiles), len(valid_smiles), sep='\n')

0
458857


In [22]:
# List every SMILES that failed the conformer check, one per line.
# (The valid ones are intentionally not printed.)
print("Invalid SMILES strings:")
if invalid_smiles:
    print('\n'.join(str(smile) for smile in invalid_smiles))

Invalid SMILES strings:


In [111]:
# Three hard-coded SMILES strings collected into `smi_list`.
# NOTE(review): the first two entries look identical except for one
# `[C@@H]` / `[C@H]` centre -- presumably stereoisomers; confirm.
# NOTE: `smi_list` is reassigned from list1 + list2 + list3 in a later cell.
smi_list = [
'COC(=O)N[C@@H]1C(O)C(OC2C(O)[C@H](O[C@@H]3C(CO)O[C@@H](OCCCO[C@H]4CCC[C@@H](OCCCO[C@@H]5OC(CO)[C@@H](O[C@@H]6OC(CO)[C@H](O)C(OC7C(O)[C@@H](NC(=O)OC)C([C@H](O)[C@H](O)CO)O[C@H]7C(=O)O)C6O)C(O[C@@H]6OC(C)[C@H](O)C(O)C6O)C5Nc5ccc6ccccc6c5)C4)C(Nc4ccc5ccccc5c4)C3O[C@@H]3OC(C)[C@H](O)C(O)C3O)OC(CO)[C@@H]2O)[C@H](C(=O)[O-])OC1[C@H](O)[C@H](O)CO',
'COC(=O)N[C@@H]1C(O)C(OC2C(O)[C@H](O[C@@H]3C(CO)O[C@@H](OCCCO[C@H]4CCC[C@H](OCCCO[C@@H]5OC(CO)[C@@H](O[C@@H]6OC(CO)[C@H](O)C(OC7C(O)[C@@H](NC(=O)OC)C([C@H](O)[C@H](O)CO)O[C@H]7C(=O)O)C6O)C(O[C@@H]6OC(C)[C@H](O)C(O)C6O)C5Nc5ccc6ccccc6c5)C4)C(Nc4ccc5ccccc5c4)C3O[C@@H]3OC(C)[C@H](O)C(O)C3O)OC(CO)[C@@H]2O)[C@H](C(=O)[O-])OC1[C@H](O)[C@H](O)CO',
'COC(=O)CCCCCCCCO[C@@H]1OC(CO)[C@@H](O[C@@H]2O[C@@H](CO)[C@H](O)[C@H](O[C@]3(C(=O)O)C[C@@H](O)[C@@H](NC(C)=O)[C@@H](C(O)C(O)CO)O3)[C@@H]2O)[C@@H](O[C@@H]2O[C@H](C)[C@@H](O)[C@H](O)C2O)[C@@H]1NC(C)=O'
]

In [117]:
# Take positions 393090..393349 (end-exclusive slice) of the SMILES column as
# a plain Python list; the cell displays how many entries were selected.
list11 = df['SMILES'].tolist()
list1 = list11[393090:393350]
len(list1)

260

In [118]:
# Take positions 408400..409049 (end-exclusive slice) of the SMILES column as
# a plain Python list; the cell displays how many entries were selected.
list22 = df['SMILES'].tolist()
list2 = list22[408400:409050]
len(list2)

650

In [119]:
# Take positions 425345..425849 (end-exclusive slice) of the SMILES column as
# a plain Python list; the cell displays how many entries were selected.
list33 = df['SMILES'].tolist()
list3 = list33[425345:425850]
len(list3)

505

In [120]:
# Merge the three row-range selections into a single exclusion list
# (order: list1, list2, list3) and display its length.
smi_list = [*list1, *list2, *list3]
len(smi_list)

1415

In [121]:
# Keep only the rows whose SMILES is not listed in `smi_list`.
# Matching is by SMILES value, so every row carrying a listed string is
# dropped, wherever it sits in the table.
df1 = df[~df['SMILES'].isin(smi_list)]

In [122]:
# (rows, columns) of the filtered table.
df1.shape

(457321, 5)

In [None]:
# Save the filtered dataset without the index column.
# NOTE(review): this is the same filename the notebook reads `df` from, so the
# input file is overwritten in place -- confirm this is intended.
df1.to_csv('BindingDB_excluding_bad_conformers.csv', index=False)

In [53]:
# Load previously saved SMILES from a text file, one entry per line, into a
# list of strings.
# NOTE(review): check_valid_conformers in this notebook writes to
# 'valid_smile.txt' (no trailing "s") -- confirm which file is meant here.
with open('valid_smiles.txt', 'r') as file:
    valid = file.read().splitlines()

In [23]:
# Write the valid SMILES to a text file, one entry per line, so the file can
# be read back into a list with `file.read().splitlines()`.
# The `with` block closes the file even if the write fails part-way.
with open('valid_smiles_1000.txt', 'w') as f:
    f.writelines(str(entry) + '\n' for entry in valid_smiles)