### CNN Model with SCFP for the target SR-p53 (based on paper): Training

In [3]:
import time, gc, os
import datetime
import numpy as np
import cupy as cp
from rdkit import Chem
from SCFP_functions.feature import *
import SCFP_functions.SCFPfunctions as Mf
import SCFP_functions.SCFPmodel as Mm
from SCFP_functions.constants import lensize
from SCFP_functions.constants import scfp_model_args as args
import chainer
from chainer import training
from chainer import datasets
from chainer.training import extensions

In [4]:
#-------------------------------
# GPU check
# Pick the array backend once: CuPy when a non-negative GPU id is configured
# in args["gpu"], NumPy otherwise. Later cells build their arrays through `xp`.
xp = cp if args["gpu"] >= 0 else np
if xp is cp:
    print('GPU mode')

GPU mode


### Model: Training

In [None]:
# Build the training dataset: read the SMILES file, featurize every molecule
# whose kekulized SMILES fits within args["atomsize"], and wrap the resulting
# feature/label arrays in a Chainer TupleDataset.
print('Making Training  Dataset...')
file = "data/tox21_SR-p53/tox21_data_train.smiles"
print('Loading smiles: ', file)
# NOTE(review): this file is read with a space delimiter while the test-set
# cell uses a tab -- confirm that both files really differ in format.
smi = Chem.SmilesMolSupplier(file, delimiter=' ', titleLine=False)
# The supplier yields None for entries it could not parse; drop those.
mols = [mol for mol in smi if mol is not None]

F_list = []
T_list = []
for mol in mols:
    # Molecules whose SMILES string is longer than the configured size are skipped.
    if len(Chem.MolToSmiles(mol, kekuleSmiles=True, isomericSmiles=True)) > args["atomsize"]:
        print("SMILES is too long. This mol will be ignored.")
        continue
    F_list.append(mol_to_feature(mol, -1, args["atomsize"]))
    # The label is taken from the molecule's '_Name' property.
    T_list.append(mol.GetProp('_Name'))

# NOTE(review): features and labels are shuffled by two independent calls.
# The feature/label pairing only survives if Mf.random_list applies the same
# permutation on every call (e.g. a fixed seed) -- confirm in SCFPfunctions.
Mf.random_list(F_list)
Mf.random_list(T_list)

# NOTE(review): args["n_out"] is used both as the label width and as the
# second axis of the feature tensor, whereas the test-set cell hard-codes 1
# for both -- presumably n_out == 1; verify before changing n_out.
data_f = xp.asarray(F_list, dtype=cp.float32).reshape(-1, args["n_out"], args["atomsize"], lensize)
data_t = xp.asarray(T_list, dtype=cp.int32).reshape(-1, args["n_out"])
print(data_t.shape, data_f.shape)
train_dataset = datasets.TupleDataset(data_f, data_t)

In [None]:
# Build the test dataset: read the SMILES file, featurize every molecule
# whose kekulized SMILES fits within args["atomsize"], and wrap the resulting
# feature/label arrays in a Chainer TupleDataset.
print('Making Test Dataset...')
file = "data/tox21_SR-p53/tox21_data_test.smiles"
print('Loading smiles: ', file)
# NOTE(review): this file is read with a tab delimiter while the training-set
# cell uses a space -- confirm that both files really differ in format.
smi = Chem.SmilesMolSupplier(file, delimiter='\t', titleLine=False)
# The supplier yields None for entries it could not parse; drop those.
mols = [mol for mol in smi if mol is not None]

F_list = []
T_list = []
for mol in mols:
    # Molecules whose SMILES string is longer than the configured size are skipped.
    if len(Chem.MolToSmiles(mol, kekuleSmiles=True, isomericSmiles=True)) > args["atomsize"]:
        print("SMILES is too long. This mol will be ignored.")
        continue
    F_list.append(mol_to_feature(mol, -1, args["atomsize"]))
    # The label is taken from the molecule's '_Name' property.
    T_list.append(mol.GetProp('_Name'))

# NOTE(review): features and labels are shuffled by two independent calls.
# The feature/label pairing only survives if Mf.random_list applies the same
# permutation on every call (e.g. a fixed seed) -- confirm in SCFPfunctions.
Mf.random_list(F_list)
Mf.random_list(T_list)

# One label column per molecule and a singleton second axis on the features.
data_f = xp.asarray(F_list, dtype=cp.float32).reshape(-1, 1, args["atomsize"], lensize)
data_t = xp.asarray(T_list, dtype=cp.int32).reshape(-1, 1)
print(data_t.shape, data_f.shape)
test_dataset = datasets.TupleDataset(data_f, data_t)

In [8]:
# reset memory
# Remove the notebook-level names left over from dataset construction and run
# a garbage-collection pass. gc.collect() is the cell's last expression, so its
# return value is what the cell displays.
# NOTE(review): the arrays behind data_f / data_t are presumably still
# referenced by train_dataset / test_dataset, so only the Python lists and the
# RDKit molecules are expected to be reclaimed here -- confirm if memory matters.
del mol, mols, data_f, data_t, F_list, T_list
gc.collect()

0

In [9]:
# Set up a neural network to train
# All hyperparameters come from the shared `args` dict and are passed
# positionally, in the order Mm.CNN is called with throughout this notebook.
# NOTE(review): the k*/s*/f* entries presumably describe the kernel, stride and
# filter settings of the successive layers -- confirm against SCFPmodel.CNN.
model = Mm.CNN(
    args["atomsize"], lensize,
    args["k1"], args["s1"], args["f1"],
    args["k2"], args["s2"],
    args["k3"], args["s3"], args["f3"],
    args["k4"], args["s4"],
    args["n_hid"], args["n_out"],
)

In [10]:
# When a GPU id (>= 0) is configured, select that device first and only then
# move the model onto it; with a negative id the model is left where it is.
if args["gpu"] >= 0:
    chainer.cuda.get_device_from_id(args["gpu"]).use()
    model.to_gpu()
    print("Copied the model to the GPU!")

Copied the model to the GPU!


In [11]:
# Adam optimizer constructed without arguments (library defaults) and attached
# to the model. setup() is the cell's last expression, so its return value is
# what the cell displays.
optimizer = chainer.optimizers.Adam()
optimizer.setup(model)

<chainer.optimizers.adam.Adam at 0x1e81d6387f0>

In [12]:
# Directory that receives the snapshots and log files written by the trainer.
output_dir = "model_output"
# exist_ok=True keeps this cell re-runnable: without it a second execution
# (or a directory left over from an earlier run) raises FileExistsError.
os.makedirs(output_dir, exist_ok=True)

# Training batches are reshuffled every epoch.
train_iter = chainer.iterators.SerialIterator(train_dataset, batch_size=args["batchsize"], shuffle=True)
# Evaluation makes a single pass (repeat=False) in a fixed order, so the
# validation batches are the same at every epoch.
test_iter = chainer.iterators.SerialIterator(test_dataset, batch_size=args["batchsize"], repeat=False, shuffle=False)
updater = training.StandardUpdater(train_iter, optimizer, device=args["gpu"])
# Stop after args["epoch"] epochs; all extension output goes to output_dir.
trainer = training.Trainer(updater, (args["epoch"], 'epoch'), out=output_dir)

# Evaluate the model with the test dataset for each epoch
trainer.extend(extensions.Evaluator(test_iter, model, device=args["gpu"]))
# Take a snapshot for each specified epoch
trainer.extend(extensions.snapshot_object(model, 'model_snapshot_{.updater.epoch}'), trigger=(args["frequency"], 'epoch'))
# Write a log of evaluation statistics for each epoch
trainer.extend(extensions.LogReport(trigger=(1, 'epoch'), log_name='log_epoch'))
# Write a second, finer-grained log every 10 iterations
trainer.extend(extensions.LogReport(trigger=(10, 'iteration'), log_name='log_iteration'))
# Print selected entries of the log to stdout
trainer.extend(extensions.PrintReport(['epoch', 'elapsed_time', 'main/loss', 'validation/main/loss', 'main/accuracy', 'validation/main/accuracy']))
# Print a progress bar to stdout
trainer.extend(extensions.ProgressBar())

In [13]:
# Run the training
# START / END bracket the whole trainer.run() call so the wall-clock duration
# can be reported once training has finished.
START = time.time()
trainer.run()
END = time.time()

# A timedelta renders as H:MM:SS[.ffffff] when formatted.
print('Training done. Total time was {}'.format(datetime.timedelta(seconds=END - START)))

  cuda.cudnn.convolution_forward(


epoch       elapsed_time  main/loss   validation/main/loss  main/accuracy  validation/main/accuracy
     total [..................................................]  0.09%
this epoch [######################............................] 44.61%
       100 iter, 0 epoch / 500 epochs
       inf iters/sec. Estimated time to finish: 0:00:00.
     total [..................................................]  0.18%
this epoch [############################################......] 89.21%
       200 iter, 0 epoch / 500 epochs
    125.31 iters/sec. Estimated time to finish: 0:14:52.914635.
1           4.54267       0.262389    0.199369              0.928611       0.9375                    
     total [..................................................]  0.27%
this epoch [################..................................] 33.82%
       300 iter, 1 epoch / 500 epochs
    117.44 iters/sec. Estimated time to finish: 0:15:51.924353.
     total [..................................................]  0.36%
th