In [1]:
from rdkit.Chem.rdMolDescriptors import CalcAUTOCORR3D, CalcRDF

from rdkit import Chem
from rdkit.Chem import AllChem

from rdkit.Chem import MolToSmiles,MolFromSmiles

from rdkit.Chem import rdMolDescriptors, MACCSkeys, rdmolops

from padelpy.functions import from_smiles

mol = MolFromSmiles("CC(=O)N[C@@H](Cc1ccccc1)C(=O)O")
mol1 = MolFromSmiles("CC(=O)NC(Cc1ccccc1)C(=O)O")

def generate_conformers(mol, n_conf,maxIters):
    i=0
    nConfGen=n_conf      #número de conformeros máximo gerados para cada molécula
    pruneRmsThreshVal=1.0

    m2=Chem.AddHs(mol)
     # run ETKDG 
    cids=AllChem.EmbedMultipleConfs(m2, 
                                    numConfs=nConfGen, 
                                    pruneRmsThresh=pruneRmsThreshVal,
                                    ignoreSmoothingFailures=True,
                                    numThreads=4,
                                   useRandomCoords=True)
    res =AllChem.MMFFOptimizeMoleculeConfs(m2, 
                                            maxIters=maxIters, 
                                            numThreads=4) 
    #print('not_converged  /  energy')
    #print(res)
    rmslist = []
    AllChem.AlignMolConformers(m2, RMSlist=rmslist)
    
    return m2

def storeInQueue(f):
  def wrapper(*args):
    my_queue.put(f(*args))
  return wrapper

@storeInQueue
def featurizeAUTOCORR3D(smile):
    
    mol = MolFromSmiles(smile)
    m2 = generate_conformers(mol,10,2000)
    des1 = CalcAUTOCORR3D(m2)
    
    return des1

@storeInQueue
def featurize(smile):
    
    mol = MolFromSmiles(smile)
    start = time.time()
    m2 = generate_conformers(mol,3,100)
    end = time.time()
    #print("generating conformer: %f"%(end-start))
    start = time.time()
    des1 = CalcRDF(m2)
    end = time.time()
    #print("generating desc: %f"%(end-start))
    
    return des1

@storeInQueue
def featurize_padel(smile):
    
    fingerprints = from_smiles(smile,d_2d=True, d_3d=True)
    
    return fingerprints
    

from threading import Thread, Event
import queue


my_queue = queue.Queue()

In [39]:
import rdkit
rdkit.__version__

'2020.09.1'

In [2]:
from deepbioDBpy.API import DeepBioAPI

from deepbioDBpy.entities import DeepBioCompound, DeepBioDataset

dataset = DeepBioDataset(["sweet"],"smiles")
dataset.download_dataset_for_file("sweetness_dataset.csv")

27949


In [7]:
import pandas as pd

dset = pd.read_csv("sweetness_dataset.csv")

In [8]:
dset = dset.drop(dset[dset["sweet"] < 1].index)

In [5]:
not_sweet = dset[dset["sweet"] == 2]
not_sweet_sample = not_sweet.sample(n=1800, random_state=1)

In [6]:
sweet = dset[dset["sweet"] == 1]

In [7]:
final_set = sweet.append(not_sweet_sample)
final_set

Unnamed: 0,id,smiles,sweet
3,36,[O-][N+](=O)c1ccccc1,1.0
4,93,CNc1ccccc1C(=O)OC,1.0
6,26,OC(=O)[C@@H]1CCCN1,1.0
8,31,OS(=O)(=O)NC1CCCCC1,1.0
9,69,OCC(O)C(O)C(O)C(=O)CO,1.0
...,...,...,...
4676,4769,C=C(CCc1ccccc1)c1ccccc1,2.0
7789,7965,CCC=CCC=CCC=CCC=CCC=CCCCCCC(=O)O[C@@H](CO)COC(...,2.0
22431,22408,CCCCCCCCC=CCC=CCC=CCCCC(=O)OCC(CO)OC(=O)CCC=CC...,2.0
22544,22521,CCCCCC=CCC=CCC=CCC=CCCCCCC(=O)OC(CO)COC(=O)CCC...,2.0


In [26]:
from deepbioDBpy.API import DeepBioAPI
import time
from queue import Empty
import numpy as np

smiles = dset["smiles"]



descriptors = []
print(len(smiles))
start = time.time()
y=[]
empties = 0
i=0
ids = []
for l,row in dset.iterrows():
    i+=1
    
    smile = row["smiles"]
    j = row["id"]
    sweet = row["sweet"]
        
    if i%100 == 0:
        print("checkpoint: %d"%i)
    
    
    try :
        action_thread = Thread(target=featurize,args = (smile,))
        action_thread.start()
        
        fp = my_queue.get(True,timeout=60)
        
    
    except Empty:
        print('error in smile: ' + str(smile))
        fp = np.empty(80, dtype=float)
        fp[:] = np.NaN
        fp = fp.tolist()
        
    X = np.asarray( fp, dtype=np.float)
    
    descriptors.append( X)
    
    y.append(sweet)
    ids.append(j)
    
dataset = np.asarray(descriptors)

    
end = time.time()

print(end-start)

28398


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


checkpoint: 100
checkpoint: 200


KeyboardInterrupt: 

In [13]:
len(dataset)

3687

In [12]:
np.savetxt("test_numpy.csv", dataset, delimiter=",")

Exception in thread Thread-3615:
Traceback (most recent call last):
  File "/home/joao/anaconda3/envs/DeepMol/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/home/joao/anaconda3/envs/DeepMol/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-1-efa7eb2dd971>", line 38, in wrapper
    my_queue.put(f(*args))
  File "<ipython-input-1-efa7eb2dd971>", line 45, in featurize
    m2 = generate_conformers(mol,10,2000)
  File "<ipython-input-1-efa7eb2dd971>", line 28, in generate_conformers
    numThreads=4)
ValueError: Bad Conformer Id

Exception in thread Thread-4029:
Traceback (most recent call last):
  File "/home/joao/anaconda3/envs/DeepMol/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/home/joao/anaconda3/envs/DeepMol/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-1-efa7eb2dd971>", line 38, 

In [None]:
import time
smiles = []
mols = []
sid=[]
sweets=[]
i=0
start = time.time()
for l,row in dset.iloc[:1000].iterrows():
    i+=1
    
    smile = row["smiles"]
    j = row["id"]
    sweet = row["sweet"]

    mol = MolFromSmiles(smile)
    
    try:
        m2 = generate_conformers(mol,10,2000)
    
    except:
        m2 = mol
        
    smiles.append(smile)
    sid.append(j)
    mols.append(m2)
    sweets.append(sweet)
    if i%100 == 0:
        print("checkpoint: %d"%i)
    
    
writer = Chem.SDWriter('test.sdf')

for n in range(len(mols)):
    mols[n].SetProp("_Name","%s"%sid[n])
    mols[n].SetProp("_SourceID","%s"%sid[n])
    mols[n].SetProp("_SMILES","%s"%smiles[n])
    mols[n].SetProp("_SWEET","%s"%sweets[n])
    writer.write(mols[n])
writer.close()

end = time.time()

print(end-start)

checkpoint: 100
checkpoint: 200
checkpoint: 300
checkpoint: 400
checkpoint: 500
checkpoint: 600
checkpoint: 700


In [10]:
from rdkit.Chem import PandasTools

frame = PandasTools.LoadSDF('test.sdf',smilesName='SMILES',molColName='Molecule',includeFingerprints=True)

frame.info

<bound method DataFrame.info of       ID                                             SMILES  \
0     18           CC(C)C[C@H](N)C(=O)N[C@@H](CC(C)C)C(=O)O   
1    110                                 COc1ccc(C(C)=O)cc1   
2     36                               O=[N+]([O-])c1ccccc1   
3     93                                  CNc1ccccc1C(=O)OC   
4     16                        CC(C)C[C@H](NC(=O)CN)C(=O)O   
..   ...                                                ...   
495  407                            CC=C(C)C(=O)OCCc1ccccc1   
496  539          COC(=O)C(NC(=O)C(N)CC(=O)O)C(=O)OC1CCCCC1   
497  434                             COc1cc(C=CC(C)=O)ccc1O   
498  430                            CC(C)C(=O)OCC=Cc1ccccc1   
499  607  CC1(C(=O)O)CCC2(C)CCC3(C)C(=CCC4C5(C=O)CCC(OC6...   

                                              Molecule  
0    <img data-content="rdkit/molecule" src="data:i...  
1    <img data-content="rdkit/molecule" src="data:i...  
2    <img data-content="rdkit/molecule" 

In [11]:
from rdkit.Chem import SDMolSupplier

supplier = SDMolSupplier("test.sdf")
mols, attempts = [], 0
while not mols and attempts < 10:
    mols = list(supplier)
    attempts += 1
print(f"Loaded {len(mols)} molecules after {attempts} attempts.")

for mol in mols:
    des1 = CalcRDF(mol)
    print(des1)
    

Loaded 500 molecules after 2 attempts.
[0.012, 10.836, 0.01, 13.418, 3.068, 3.128, 4.19, 6.659, 1.801, 4.646, 3.916, 2.136, 1.68, 0.995, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.017, 11.205, 0.017, 14.741, 3.896, 4.017, 4.927, 8.249, 1.961, 5.467, 4.097, 2.428, 2.238, 1.326, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.009, 10.322, 0.005, 11.898, 2.352, 2.28, 3.444, 5.412, 1.676, 3.75, 3.76, 1.886, 1.201, 0.711, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.017, 11.192, 0.017, 14.705, 3.878, 3.999, 4.912, 8.217, 1.958, 5.447, 4.094, 2.423, 2.23, 1.321, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.006, 10.028, 0.003, 10.852, 1.83, 1.61, 2.863, 4.582, 1.6, 3.103, 3.618, 1.658, 0.764, 0.452, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.015, 11.44, 0.015, 14.975, 3.779, 3.889, 4.866, 7.911, 1.929, 5.523, 

In [1]:
from rdkit.Chem import SDMolSupplier

def load_sdf_file(file):

    supplier = SDMolSupplier(file)
    mols, attempts = [], 0

    while not mols and attempts < 10:
        mols = list(supplier)
        attempts += 1
    print(f"Loaded {len(mols)} molecules after {attempts} attempts.")

    return mols

In [4]:
from rdkit import Chem

from random import sample

mols = load_sdf_file("./sweet_all.sdf")


writer = Chem.SDWriter("./data/sweet_all.sdf")

ids_sweet = []
ids_not_sweet = []
y = []

for mol in mols:
    if float(mol.GetProp("_SWEET")) == 2:
        
        ids_not_sweet.append(int(mol.GetProp("_SourceID")))
        mol.SetProp("_SWEET",str(0))
    else:
        ids_sweet.append(int(mol.GetProp("_SourceID")))
    
    #writer.write(mol)

#writer.close()

#idx = sample(ids_not_sweet,1800)
#ids_to_mantain = idx + ids_sweet

Loaded 25795 molecules after 2 attempts.


In [5]:
sweet = 0
not_sweet = 0

for mol in mols:
    if float(mol.GetProp("_SWEET")) == 0:
        not_sweet+=1
    else:
        sweet+=1
        ids_sweet.append(int(mol.GetProp("_SourceID")))
    writer.write(mol)
        
writer.close()

In [7]:
not_sweet

23918

In [None]:
boo = np.where(descript==0).all(axis=1)
indexes_to_remove = [index[0] for index in np.argwhere(boo==True)]

In [3]:
mols = load_sdf_file("./sweet_all.sdf")

import pandas as pd

sweet = []
ids = []
smiles_lst = []

for mol in mols:
    if float(mol.GetProp("_SWEET")) == 2:
        smiles = mol.GetProp("_SMILES")
        smiles_lst.append(smiles)
        sweet.append(0)
        ids.append(int(mol.GetProp("_SourceID")))
    else:
        smiles = mol.GetProp("_SMILES")
        smiles_lst.append(smiles)
        sweet.append(1)
        ids.append(int(mol.GetProp("_SourceID")))
        
dset = pd.DataFrame({"DeepBio_id": ids, "smiles": smiles_lst, "sweet": sweet})

dset.to_csv("deepbio_dataset_3d_convertable.csv",index=False)

Loaded 25795 molecules after 2 attempts.


Unnamed: 0,DeepBio_id,smiles,sweet
0,18,CC(C)C[C@H](N)C(=O)N[C@@H](CC(C)C)C(O)=O,0
1,110,COc1ccc(cc1)C(C)=O,0
2,36,[O-][N+](=O)c1ccccc1,1
3,93,CNc1ccccc1C(=O)OC,1
4,16,CC(C)C[C@H](NC(=O)CN)C(O)=O,0
...,...,...,...
25790,28394,[2H]C([2H])([2H])OC(=O)C(Cc1ccccc1)NC(=O)C(N)C...,1
25791,28395,COC(=O)C(Cc1ccccc1)NC(=O)C(N)C(C)C(=O)O,1
25792,28396,COC(=O)C(NC(=O)C(N)CC(=O)O)C(C)c1ccccc1,1
25793,28397,OC[C@H]1O[C@](O)(CO)[C@@H](O)[C@@H]1O,1
