In [1]:
'''%load_ext autoreload
%autoreload 2
# Occupy a GPU for the model to be loaded 
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
# GPU ID, if occupied change to an available GPU ID listed under !nvidia-smi
%env CUDA_VISIBLE_DEVICES=0'''

import numpy as np
import rdkit
from rdkit import Chem
import h5py
import ast
import pickle

from ddc_pub import ddc_v3 as ddc


In [2]:
def get_descriptors(binmols_list, qsar_model=None):
    """Calculate molecular descriptors of SMILES in a list.
    The descriptors are logp, tpsa, mw, qed, hba, hbd and probability of being active towards DRD2.
    
    Returns:
        A np.ndarray of descriptors.
    """
    from tqdm import tqdm_notebook as tqdm
    import rdkit
    from rdkit import Chem, DataStructs
    from rdkit.Chem import Descriptors, rdMolDescriptors, AllChem, QED
    
    descriptors = []
    active_mols = []
    
    for idx, binmol in enumerate(binmols_list):
        mol = Chem.Mol(binmol)
        if mol:
            try:
                logp  = Descriptors.MolLogP(mol)
                tpsa  = Descriptors.TPSA(mol)
                molwt = Descriptors.ExactMolWt(mol)
                hba   = rdMolDescriptors.CalcNumHBA(mol)
                hbd   = rdMolDescriptors.CalcNumHBD(mol)
                qed   = QED.qed(mol)
                
                fp = AllChem.GetMorganFingerprintAsBitVect(mol,2, nBits=2048)
                ecfp4 = np.zeros((2048,))
                DataStructs.ConvertToNumpyArray(fp, ecfp4) 
                active = qsar_model.predict_proba([ecfp4])[0][1]
                descriptors.append([logp, tpsa, molwt, qed, hba, hbd, active]) 
                
            except Exception as e:
                print(e)
        else:
            print("Invalid generation.")
            
    return np.asarray(descriptors)

In [3]:
# Load QSAR model
qsar_model_name = "models/qsar_model.pickle"
with open(qsar_model_name, "rb") as file:
    qsar_model = pickle.load(file)["classifier_sv"]



In [4]:
# Load dataset
dataset_filename = r"C:\Users\Leave\OneDrive - hust.edu.cn\大创\pcko1-Deep-Drug-Coder-d6e7ef3\datasets\CHEMBL25_TRAIN_MOLS.h5"
with h5py.File(dataset_filename, "r") as f:
    binmols = f["mols"][0:256]

In [5]:
# Calculate the descriptors for the molecules in the dataset
# This process takes a lot of time and it's good if the descriptors are
# pre-calculated and stored in a file to load every time
descr = get_descriptors(binmols, qsar_model=qsar_model)

In [6]:
# All apriori known characters of the SMILES in the dataset
charset = "Brc1(-23[nH])45C=NOso#FlS67+89%0"
# Apriori known max length of the SMILES in the dataset
maxlen = 128
# Name of the dataset
name = "ChEMBL25_TRAIN"

dataset_info = {"charset": charset, "maxlen": maxlen, "name": name}

In [7]:
# Initialize a model
model = ddc.DDC(x              = descr,        # input
                y              = binmols,      # output
                dataset_info   = dataset_info, # dataset information
                scaling        = True,         # scale the descriptors
                noise_std      = 0.1,          # std of the noise layer
                lstm_dim       = 512,          # breadth of LSTM layers
                dec_layers     = 3,            # number of decoding layers
                batch_size     = 26)          # batch size for training



Initializing model in train mode.
Input type is 'molecular descriptors'.
Applying scaling on input.
Model received 230 train samples and 26 validation samples.
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Latent_Input (InputLayer)       [(None, 7)]          0                                            
__________________________________________________________________________________________________
Decoder_Inputs (InputLayer)     [(None, 142, 35)]    0                                            
__________________________________________________________________________________________________
latent_to_states_model (Model)  [(None, 512), (None, 36864       Latent_Input[0][0]               
__________________________________________________________________________________________________
batch_model (Model)            

In [8]:
model.fit(epochs              = 30,         # number of epochs
          lr                  = 1e-3,        # initial learning rate for Adam, recommended
          model_name          = "new_model", # base name to append the checkpoints with
          checkpoint_dir      = "",          # save checkpoints in the notebook's directory
          mini_epochs         = 10,          # number of sub-epochs within an epoch to trigger lr decay
          save_period         = 50,          # checkpoint frequency (in mini_epochs)
          lr_decay            = True,        # whether to use exponential lr decay or not
          sch_epoch_to_start  = 500,         # mini-epoch to start lr decay (bypassed if lr_decay=False)
          sch_lr_init         = 1e-3,        # initial lr, should be equal to lr (bypassed if lr_decay=False)
          sch_lr_final        = 1e-6,        # final lr before finishing training (bypassed if lr_decay=False)
          patience            = 25)          # patience for Keras' ReduceLROnPlateau (bypassed if lr_decay=True)


Model trained with dataset ChEMBL25_TRAIN that has maxlen=138 and charset=Brc1(-23[nH])45C=NOso#FlS67+89%0 for 30 epochs.
noise_std: 0.100000, lstm_dim: 512, dec_layers: 3, td_dense_dim: 0, batch_size: 26, codelayer_dim: 7, lr: 0.001000.

Epoch 00001: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 1/300
1/0 - 11s - loss: 4.1368 - val_loss: 31.7256

Epoch 00002: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 2/300
1/0 - 0s - loss: 2.5051 - val_loss: 32.7391

Epoch 00003: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 3/300
1/0 - 0s - loss: 1.9811 - val_loss: 34.7723

Epoch 00004: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 4/300
1/0 - 0s - loss: 2.1096 - val_loss: 31.7657

Epoch 00005: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 5/300
1/0 - 0s - loss: 1.6876 - val_loss: 29.8915

Epoch 00006: LearningRateScheduler reducing learning r

1/0 - 1s - loss: 0.8029 - val_loss: 6.9959

Epoch 00057: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 57/300
1/0 - 0s - loss: 0.8077 - val_loss: 7.0209

Epoch 00058: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 58/300
1/0 - 0s - loss: 0.7073 - val_loss: 7.1531

Epoch 00059: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 59/300
1/0 - 0s - loss: 0.7677 - val_loss: 6.8854

Epoch 00060: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 60/300
1/0 - 1s - loss: 0.7129 - val_loss: 6.6967

Epoch 00061: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 61/300
1/0 - 0s - loss: 0.7792 - val_loss: 6.9234

Epoch 00062: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 62/300
1/0 - 0s - loss: 0.7250 - val_loss: 6.7923

Epoch 00063: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 63/300
1/0 - 0s - lo

1/0 - 0s - loss: 0.6181 - val_loss: 5.3960

Epoch 00115: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 115/300
1/0 - 0s - loss: 0.5802 - val_loss: 5.5236

Epoch 00116: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 116/300
1/0 - 0s - loss: 0.6404 - val_loss: 5.5349

Epoch 00117: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 117/300
1/0 - 0s - loss: 0.5765 - val_loss: 5.4436

Epoch 00118: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 118/300
1/0 - 0s - loss: 0.6015 - val_loss: 5.6332

Epoch 00119: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 119/300
1/0 - 0s - loss: 0.6114 - val_loss: 5.6094

Epoch 00120: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 120/300
1/0 - 0s - loss: 0.5762 - val_loss: 5.4248

Epoch 00121: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 121/300
1/0 - 

1/0 - 0s - loss: 0.5254 - val_loss: 5.1843

Epoch 00173: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 173/300
1/0 - 0s - loss: 0.5310 - val_loss: 5.4209

Epoch 00174: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 174/300
1/0 - 0s - loss: 0.4836 - val_loss: 5.4513

Epoch 00175: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 175/300
1/0 - 0s - loss: 0.6152 - val_loss: 5.2216

Epoch 00176: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 176/300
1/0 - 0s - loss: 0.5885 - val_loss: 5.0621

Epoch 00177: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 177/300
1/0 - 0s - loss: 0.5314 - val_loss: 4.9513

Epoch 00178: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 178/300
1/0 - 0s - loss: 0.5840 - val_loss: 5.0041

Epoch 00179: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 179/300
1/0 - 

1/0 - 0s - loss: 0.4981 - val_loss: 5.0373

Epoch 00231: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 231/300
1/0 - 0s - loss: 0.4803 - val_loss: 4.8361

Epoch 00232: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 232/300
1/0 - 0s - loss: 0.5262 - val_loss: 4.8297

Epoch 00233: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 233/300
1/0 - 0s - loss: 0.4861 - val_loss: 4.7638

Epoch 00234: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 234/300
1/0 - 0s - loss: 0.4581 - val_loss: 4.9047

Epoch 00235: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 235/300
1/0 - 0s - loss: 0.5180 - val_loss: 4.9415

Epoch 00236: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 236/300
1/0 - 0s - loss: 0.5472 - val_loss: 4.7851

Epoch 00237: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 237/300
1/0 - 

1/0 - 0s - loss: 0.4626 - val_loss: 4.6397

Epoch 00289: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 289/300
1/0 - 0s - loss: 0.4634 - val_loss: 4.6556

Epoch 00290: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 290/300
1/0 - 0s - loss: 0.4451 - val_loss: 4.7905

Epoch 00291: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 291/300
1/0 - 0s - loss: 0.4530 - val_loss: 4.5426

Epoch 00292: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 292/300
1/0 - 0s - loss: 0.4961 - val_loss: 4.4927

Epoch 00293: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 293/300
1/0 - 0s - loss: 0.5290 - val_loss: 4.6992

Epoch 00294: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 294/300
1/0 - 0s - loss: 0.5060 - val_loss: 4.7625

Epoch 00295: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 295/300
1/0 - 

In [9]:
# Save the final model
model.save("models/model_0605_2")

Model saved.
Elapsed time: 0.214 seconds.
