In [1]:
import sys
# Add the parent directory to the module search path so that the
# `logics_pack` package imported in the next cell can be resolved
# (assumes the package lives one level above this notebook — TODO confirm).
sys.path.append('..')

In [2]:
from logics_pack import global_settings, chemistry, drugex, predictor, reward_functions
from logics_pack import analysis, smiles_vocab, smiles_lstm
import pandas as pd
import numpy as np
import json
import torch

# Mapping from string keys (e.g. 'PROJECT_DIR', 'SMILES_TOKENS_PATH') to the
# project's file locations, built relative to the parent directory.
project_paths = global_settings.build_project_paths(project_dir='../')
# Experiment-settings object constructed from the path stored under
# 'EXPERIMENT_SETTINGS_JSON'; queried with get_setting() and modified with
# update_setting() in the cells below.
expset_obj = global_settings.ExperimentSettings(project_paths['EXPERIMENT_SETTINGS_JSON'])

Perform DrugEx fine-tuning to build the agent generator

In [3]:
# Settings object handed to drugex.DrugEx_training and reused by the
# analysis cells further down.
config = global_settings.Object()
proj_dir = project_paths['PROJECT_DIR']
best_cv = expset_obj.get_setting("pik3ca-pred-best-cv")

# --- inputs: token vocabulary, pre-trained prior, activity predictor ---
config.tokens_path = project_paths['SMILES_TOKENS_PATH']
config.pretrain_setting_path = project_paths['PRETRAIN_SETTING_JSON']
config.pretrained_model_path = proj_dir + 'model-prior/prior_e10.ckpt'
config.featurizer = predictor.featurizer
# The predictor file name is completed with the 'pik3ca-pred-best-cv' setting.
config.predictor_path = proj_dir + "model-pik3ca/predictor/pik3ca_rfr_cv%s.pkl" % best_cv

# --- outputs: per-epoch checkpoint / sample file templates (%d = epoch) ---
config.save_ckpt_fmt = proj_dir + 'model-pik3ca/drugex/pik3ca_drugex_e%d.ckpt'
config.sample_fmt = proj_dir + 'model-pik3ca/drugex/pik3ca_drugex_e%d.txt'

# --- training schedule and saving ---
config.max_epoch = 4500
config.save_period = 90
config.save_size = 20000
config.train_batch_size = 128
config.sampling_bs = 256
config.finetune_lr = 0.0002

# --- reward and remaining hyperparameters (interpreted by drugex.DrugEx_training) ---
config.rewarding = reward_functions.pAff_to_reward_t2
config.scaler = 10
config.beta = 0.1
config.epsilon = 0.1

config.device_name = 'cpu'

In [None]:
# perform fine-tuning
# NOTE: this cell has no execution count in the saved notebook, i.e. it was
# not run in this session. The cells below read checkpoint and sample files
# from config.save_ckpt_fmt / config.sample_fmt, presumably written by an
# earlier run of this call — TODO confirm.
drugex.DrugEx_training(config)

Load the DrugEx agent generator and sample some examples

In [4]:
# Rebuild the SMILES vocabulary and tokenizer from the token file.
vocab_obj = smiles_vocab.Vocabulary(init_from_file=config.tokens_path)
smtk = smiles_vocab.SmilesTokenizer(vocab_obj)

# The generator's layer sizes are read from the pre-training settings JSON.
with open(config.pretrain_setting_path, 'r') as setting_file:
    model_setting = json.load(setting_file)

# load agent model (epoch=4230)
# NOTE(review): torch.load relies on pickle; only load checkpoints from a
# trusted source.
agent_epoch = 4230
agent_ckpt = torch.load(config.save_ckpt_fmt % agent_epoch, map_location='cpu')
lstm_agent = smiles_lstm.SmilesLSTMGenerator(
    vocab_obj,
    model_setting['emb_size'],
    model_setting['hidden_units'],
    device_name='cpu',
)
# Kept as the cell's last expression so the key-matching report is displayed.
lstm_agent.lstm.load_state_dict(agent_ckpt['model_state_dict'])

<All keys matched successfully>

In [5]:
# sampling
ssplr = analysis.SafeSampler(lstm_agent, batch_size=16)
generated_smiles = ssplr.sample_clean(50, maxlen=150)
display(generated_smiles)

['CN1C(=O)N(CC(N)=O)CC1(C)c1ccc(OCc2ccc3ccc([N+](=O)[O-])cc3n2)cc1',
 'C=CC(=O)NC1=CC(=O)CN1c1nc(-c2ccc(OCc3ccc(C#N)cc3)cc2)c2cc(OC)c(OC)cc2n1',
 'CSc1ncc(C(=NO)c2ccc(COc3ccc(C(C)C)cc3)cc2)c2c1CCC2',
 'C=C1C(OC(=O)c2ccc(OCc3ccccc3)cc2)CCC2(C)C1CCC1(C)C2CC(OC(=O)CC2CC2)C(C(=O)OCC)C1C(C)=O',
 'COc1ccc(OCc2cc(C(=O)N3CCS(=O)(=O)CC3)no2)cc1',
 'COc1cc(NCC(C)NCCc2ccc(OCc3ccccc3)cc2)cc(OC)c1OC',
 'CNc1nc(C)c(-c2cccc(OC)c2OC)c(C(=O)O)n1',
 'COc1ccc(CCN(CC(=O)NO)C(=O)c2snc(C(=O)N3CCOCC3)c2-c2cccs2)cc1-n1cc(C(F)(F)F)cc1C(C)(C)C',
 'COc1ccc(C(=O)NCC(=O)NCCc2ccccn2)cc1Cl',
 'COC(=O)N1CCN(c2cccc(OCc3csc(-c4ccccc4)n3)c2)CC1',
 'Oc1nc2cc(Cl)cc(Cl)c2c(-c2ccc(OCc3cn(-c4ccccc4)nn3)cc2)c1C=Nc1ncc(Cc2cc(F)cc(F)c2)s1',
 'COc1cccc(N)c1NS(=O)(=O)C=Cc1ccc(OCc2ccc3ccccc3n2)cc1',
 'O=C1CC(Nc2cccc(O)c2)C(c2ccc(OCc3c(F)cccc3Cl)cc2)N1c1cccc(CO)c1',
 'CCOC(=O)C1CCN(S(=O)(=O)c2ccc(OC)cc2[N+](=O)[O-])CC1c1ccccc1NS(=O)(=O)c1ccc(OCc2ccccc2)cc1',
 'COc1cc2c(Oc3ccc(NC(=O)c4c(Cl)cncc4Cl)cc3F)ccnc2cc1OCCCN1CCC(F)(F)CC1',
 

Build auxiliary files for the evaluation phase

In [6]:
# Per-epoch file templates for the evaluation artefacts (%d = epoch number).
eval_root = project_paths['PROJECT_DIR']
# valid & canonical smiles
config.vc_fmt = eval_root + 'model-pik3ca/drugex/pik3ca_drugex_vc_e%d.smi'
# fingerprints saved as npy
config.npfps_fmt = eval_root + 'model-pik3ca/drugex/pik3ca_drugex_npfps_e%d.npy'
# Frechet ChemNet vectors saved as npy
config.fcvec_fmt = eval_root + 'model-pik3ca/drugex/pik3ca_drugex_fcvec_e%d.npy'

# Epochs to evaluate. Only epoch 4230 is processed here; to sweep every saved
# checkpoint instead, use
#     list(range(0, config.max_epoch + 1, config.save_period))
epochs = [4230]

In [7]:
import os
# Hide all CUDA devices before `fcd` is imported, so that the TensorFlow
# backend it loads (see this cell's log output) runs on the CPU.
# Keep this assignment above the `import fcd` line.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # use tensorflow cpu

import fcd
from logics_pack import frechet_chemnet
# Reference model that is later passed to fcd.get_predictions to compute
# ChemNet vectors for the generated SMILES.
fc_ref_model = fcd.load_ref_model()

Using TensorFlow backend.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


2023-03-23 14:23:07.430230: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2023-03-23 14:23:07.443789: E tensorflow/stream_executor/cuda/cuda_driver.cc:318] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-03-23 14:23:07.443839: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: shepherd5
2023-03-23 14:23:07.443847: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: shepherd5
2023-03-23 14:23:07.443986: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 465.19.1
2023-03-23 14:23:07.444022: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 465.19.1
2023-03-23 14:23:07.444029: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 465.19.1
2023-03-23 14:23:07.445375: I tensorflow/core/platform/cpu_featu




In [9]:
# For every evaluated epoch: read the sampled SMILES, keep the valid canonical
# ones, and store their fingerprints and ChemNet vectors on disk.
for epo in epochs:
    print(epo)

    # Sampled SMILES are stored one per line.
    with open(config.sample_fmt % epo, 'r') as sample_file:
        sampled_smiles = [row.strip() for row in sample_file]

    valid_canons, invalid_ids = chemistry.get_valid_canons(sampled_smiles)
    print("- count invalids: ", len(invalid_ids))

    # Write the valid canonical SMILES back out, newline-terminated.
    with open(config.vc_fmt % epo, 'w') as vc_file:
        vc_file.write(''.join(smi + '\n' for smi in valid_canons))

    # Fingerprints, converted to a NumPy-storable form before saving.
    rdk_fps = chemistry.get_fps_from_smilist(valid_canons)
    np.save(config.npfps_fmt % epo, chemistry.rdk2npfps(rdk_fps))

    # ChemNet vectors
    chemnet_vecs = fcd.get_predictions(fc_ref_model, valid_canons)
    np.save(config.fcvec_fmt % epo, chemnet_vecs)

4230
- count invalids:  336


Evaluate FCD and OTD on the validation set and pick the best epoch

In [10]:
# loading validation dataset
with open(project_paths['PIK3CA_FOLD_JSON'], 'r') as f:
    pik3_folds = json.load(f)
data_npfps = np.load(project_paths['PIK3CA_DATA_FP'])
data_fcvecs = np.load(project_paths['PIK3CA_DATA_FCVEC'])

val_fold_id = expset_obj.get_setting('pik3ca-pred-best-cv')
val_npfps = data_npfps[pik3_folds[val_fold_id]]
val_rdkfps = chemistry.np2rdkfps(val_npfps)
val_fcvecs = data_fcvecs[pik3_folds[val_fold_id]]

dsize = len(val_rdkfps)  # demand size for OT
ssize = dsize*global_settings.OT_CALC_REPEATS  # supply size for repeated OT   

In [11]:
# Per-epoch validation metrics, appended in the same order as `epochs`.
ot_repeats = global_settings.OT_CALC_REPEATS
val_fcd_list = []
val_otd_list = []
for epo in epochs:
    print(epo)

    # FCD: computed from the validation and generated ChemNet vectors.
    generated_fcvecs = np.load(config.fcvec_fmt % epo)
    val_fcd_list.append(frechet_chemnet.fcd_calculation(val_fcvecs, generated_fcvecs))

    # OTD: only the first `ssize` generated fingerprints are needed.
    generated_npfps = np.load(config.npfps_fmt % epo)[:ssize]
    generated_rdkfps = chemistry.np2rdkfps(generated_npfps)
    # similarity matrix, row: generated, col: validation data
    simmat = analysis.calculate_simmat(generated_rdkfps, val_rdkfps)
    distmat = analysis.transport_distmat(analysis.tansim_to_dist, simmat, ot_repeats)
    _, _, otds = analysis.repeated_optimal_transport(distmat, repeat=ot_repeats)
    # The mean of the third returned value is recorded as this epoch's OTD.
    val_otd_list.append(np.mean(otds))

4230


In [12]:
# validation FCDxOTD: element-wise product of the two per-epoch metrics
val_FCDxOTD = np.array(val_fcd_list) * np.array(val_otd_list)
# Guard against stale notebook state: if `epochs` was edited after the
# evaluation cell ran, the scores no longer line up with the epoch list and
# the index below would silently point at the wrong epoch.
assert len(val_FCDxOTD) == len(epochs), \
    "metric lists and `epochs` are out of sync; re-run the evaluation cell"
# find the best epoch (lowest FCDxOTD).
# np.argmin returns the index of the first NaN when one is present, which
# would register an epoch with an undefined score as the best one.
# np.nanargmin ignores NaN entries and raises ValueError if every score is NaN.
best_epoch = epochs[np.nanargmin(val_FCDxOTD)]
# register the best epoch
expset_obj.update_setting('pik3ca-drugex-best-epoch', best_epoch)

In [13]:
# Read the setting back to confirm which epoch was registered as the best.
print(expset_obj.get_setting('pik3ca-drugex-best-epoch'))

4230
