In [25]:
%load_ext autoreload
%autoreload 2

import numpy as np
import rdkit
from rdkit import Chem

import h5py, ast, pickle

# If run on CUDA node, you must select a GPU, otherwise comment the 2 following lines out!
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1 # GPU ID

from ddc_pub import ddc_v3 as ddc

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1 # GPU ID


In [3]:
# Import existing (trained) model
# Ignore UserWarning(s) about non-serializable keyword arguments
# NOTE: the path is relative to the notebook's working directory; loading took
# about a minute in the recorded run, so avoid re-running this cell needlessly.
model_name = "models/heteroencoder/9784435"
model = ddc.DDC(model_name=model_name)

Initializing model in test mode.
Loading model.


  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '


Loading finished in 67 seconds.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Encoder_Inputs (InputLayer)     (None, 138, 35)      0                                            
__________________________________________________________________________________________________
mol_to_latent_model (Model)     (None, 512)          3238400     Encoder_Inputs[0][0]             
__________________________________________________________________________________________________
Decoder_Inputs (InputLayer)     (None, 137, 35)      0                                            
__________________________________________________________________________________________________
latent_to_states_model (Model)  [(None, 512), (None, 2117632     mol_to_latent_model[1][0]        
_____________________________________________________________________________

In [4]:
# Input SMILES to auto-encode
smiles_in = ['Cc1cccn2c(CN(C)C3CCCc4ccccc43)c(C(=O)N3CCOCC3)nc12',
             'COC(=O)NN=C(c1ccc(O)cc1)C1C(=O)N(C)C(=O)N(C)C1=O',
             'CCc1cc(CC)nc(OCCCn2c3c(c4cc(-c5nc(C)no5)ccc42)CC(F)(F)CC3)n1',
             'Cc1ccc2c(C(=O)Nc3ccccc3)c(SSc3c(C(=O)Nc4ccccc4)c4ccc(C)cc4n3C)n(C)c2c1',
             'Cc1cccc(-c2ccccc2)c1Oc1nc(O)nc(NCc2ccc3occc3c2)n1',
             'Cn1nnnc1SCC(=O)NN=Cc1ccc(Cl)cc1',
             'COc1cccc(NS(=O)(=O)c2ccc(OC)c(OC)c2)c1',
             'COc1ccc(OC)c(S(=O)(=O)n2nc(C)cc2C)c1',
             'NCCCn1cc(C2=C(c3ccncc3)C(=O)NC2=O)c2ccccc21',
             'CN(C)C(=O)N1CCN(C(c2ccc(Cl)cc2)c2cccnc2)CC1']

# MUST convert SMILES to binary mols for the model to accept them (it re-converts them to SMILES internally)
mols_in = []
for smiles in smiles_in:
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles returns None (instead of raising) when the string cannot be parsed;
    # fail early with a readable message rather than an obscure error inside ToBinary.
    if mol is None:
        raise ValueError("Could not parse input SMILES: %r" % smiles)
    mols_in.append(mol.ToBinary())

In [5]:
# Encode the binary mols into their latent representations:
# first vectorize them, then pass the vectorized batch through the model's transform step.
vectorized_mols = model.vectorize(mols_in)
latent = model.transform(vectorized_mols)

In [26]:
# Convert back to SMILES: decode each latent vector in order and keep only the
# first element of the pair returned by model.predict (the second one is discarded).
smiles_out = [decoded for decoded, _ in map(model.predict, latent)]

In [27]:
# To compare the results, rewrite every entry of smiles_out in place as CANONICAL SMILES.
# Entries that RDKit cannot parse are replaced by the "INVALID" marker.
for position in range(len(smiles_out)):
    parsed = Chem.MolFromSmiles(smiles_out[position])
    smiles_out[position] = Chem.MolToSmiles(parsed, canonical=True) if parsed else "INVALID"

In [28]:
# Display the input SMILES for side-by-side comparison with the decoded output
smiles_in

['Cc1cccn2c(CN(C)C3CCCc4ccccc43)c(C(=O)N3CCOCC3)nc12',
 'COC(=O)NN=C(c1ccc(O)cc1)C1C(=O)N(C)C(=O)N(C)C1=O',
 'CCc1cc(CC)nc(OCCCn2c3c(c4cc(-c5nc(C)no5)ccc42)CC(F)(F)CC3)n1',
 'Cc1ccc2c(C(=O)Nc3ccccc3)c(SSc3c(C(=O)Nc4ccccc4)c4ccc(C)cc4n3C)n(C)c2c1',
 'Cc1cccc(-c2ccccc2)c1Oc1nc(O)nc(NCc2ccc3occc3c2)n1',
 'Cn1nnnc1SCC(=O)NN=Cc1ccc(Cl)cc1',
 'COc1cccc(NS(=O)(=O)c2ccc(OC)c(OC)c2)c1',
 'COc1ccc(OC)c(S(=O)(=O)n2nc(C)cc2C)c1',
 'NCCCn1cc(C2=C(c3ccncc3)C(=O)NC2=O)c2ccccc21',
 'CN(C)C(=O)N1CCN(C(c2ccc(Cl)cc2)c2cccnc2)CC1']

In [29]:
# Display the decoded SMILES (canonical form, or "INVALID" where parsing failed)
smiles_out

['INVALID',
 'COC(=O)NN=C(c1ccc(O)cc1)C1C(=O)N(C)C(=O)N(C)C1=O',
 'CCc1cc(C)nc(-c2ccc3c4c(n(CCCOc5nc(CC)no5)c3c2)CC(F)(F)CC4)n1',
 'Cc1ccc2c(C(=O)Nc3ccccc3)c(SSc3c(C(=O)Nc4ccccc4)n(C)c4cc(C)ccc34)n(C)c2c1',
 'Cc1ccc(-c2ccccc2)c(Oc2c(CNc3cc(N=O)nc(O)n3)ccc3occc23)n1',
 'Cn1nnnc1SCC(=O)NN=Cc1ccc(Cl)cc1',
 'COc1cccc(NS(=O)(=O)c2ccc(OC)c(OC)c2)c1',
 'COc1ccc(OC)c(S(=O)(=O)n2nc(C)cc2C)c1',
 'NCCCn1cc(C2=C(c3ccncc3)C(=O)NC2=O)c2ccccc21',
 'CN(C)C(=O)N1CCN(C(c2ccc(Cl)cc2)c2cccnc2)CC1']

In [32]:
# Validity: fraction of decoded strings that could be parsed
# (unparsable outputs carry the "INVALID" marker in smiles_out).
n_total = len(smiles_out)
valids = sum(1 for smi in smiles_out if smi != "INVALID")
# Guard against an empty result list instead of raising ZeroDivisionError
validity = valids / n_total if n_total else float("nan")
print("Validity: %.2f" % validity)

# Reconstructability: fraction of valid outputs identical to the input at the SAME position.
# Compared pairwise rather than via set intersection, so duplicated molecules are not
# collapsed and an output that happens to equal a *different* input is not counted as a hit.
# NOTE(review): this is a plain string comparison, so smiles_in is assumed to already be
# canonical SMILES (smiles_out is canonicalized before this cell) — confirm for new inputs.
n_reconstructed = sum(1 for smi_in, smi_out in zip(smiles_in, smiles_out) if smi_in == smi_out)
# Guard against the case where no output was valid
reconstructability = n_reconstructed / valids if valids else float("nan")
print("Reconstructability: %.2f" % reconstructability)


Validity: 0.90
Reconstructability: 0.67
