In [1]:
import sys

# Put the parent directory on the import path so the project packages imported in the
# next cell can be found (presumably `logics_pack` lives there — confirm).
# Guarded so that re-running this cell does not keep appending duplicate entries.
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
from logics_pack import global_settings, chemistry, logics, predictor, analysis, smiles_vocab, smiles_lstm
import pandas as pd
import numpy as np
import json
import torch

# Build the lookup of project file locations, using the parent directory as the project dir.
project_paths = global_settings.build_project_paths(project_dir='../')
# Experiment-settings handle; the cells below read values with get_setting() and record the
# selected epoch with update_setting().
expset_obj = global_settings.ExperimentSettings(project_paths['EXPERIMENT_SETTINGS_JSON'])

Perform LOGICS fine-tuning to build the agent generator

In [3]:
# Settings object handed to LOGICS fine-tuning
config = global_settings.Object()

# None -> no ablation, i.e. the full LOGICS model is used
config.ablation = None

# values shared by several entries below
project_dir = project_paths['PROJECT_DIR']
best_cv = expset_obj.get_setting("pik3ca-pred-best-cv")

# --- inputs: vocabulary, pre-trained prior and activity predictor ---
config.tokens_path = project_paths['SMILES_TOKENS_PATH']
config.pretrain_setting_path = project_paths['PRETRAIN_SETTING_JSON']
config.pretrained_model_path = project_dir + 'model-prior/prior_e10.ckpt'
config.featurizer = predictor.featurizer
config.predictor_path = project_dir + ("model-pik3ca/predictor/pik3ca_rfr_cv%s.pkl" % best_cv)

# --- schedule and output templates (the epoch number fills the %d slot) ---
config.max_epoch = 200
config.save_period = 4
config.save_ckpt_fmt = project_dir + 'model-pik3ca/logics/pik3ca_logics_e%d.ckpt'
config.sample_fmt = project_dir + 'model-pik3ca/logics/pik3ca_logics_e%d.txt'
config.memory_fmt = project_dir + 'model-pik3ca/logics/pik3ca_logics_mem_e%d.csv'

# --- sizes: generation and experience sizes both follow save_size ---
config.memory_size = 100000
config.save_size = 20000
config.gen_size = config.exp_size = config.save_size

# --- optimisation / sampling ---
config.finetune_lr = 1e-4
config.finetune_bs = 32
config.sampling_bs = 256

# --- device ---
config.device_name = 'cpu'

In [None]:
# perform fine-tuning
# NOTE(review): this cell carries no execution count in the saved notebook, i.e. it was not
# run in this session. The later cells read files at the config.*_fmt paths, which are
# presumably written by this call — confirm in logics.LOGICS_training.
logics.LOGICS_training(config)

Load the LOGICS agent generator and sample some examples

In [4]:
# Rebuild the vocabulary / tokenizer from the token file named in the config.
vocab_obj = smiles_vocab.Vocabulary(init_from_file=config.tokens_path)
smtk = smiles_vocab.SmilesTokenizer(vocab_obj)

# Network dimensions (emb_size, hidden_units) come from the pre-training settings JSON.
with open(config.pretrain_setting_path, 'r') as f:
    model_setting = json.load(f)

# Load the agent checkpoint of the last fine-tuning epoch.
# The epoch and the device are taken from `config` instead of being repeated as the literals
# 200 / 'cpu', so this cell keeps following the fine-tuning configuration when it is edited.
# NOTE(review): assumes a checkpoint exists for config.max_epoch (the case for the current
# max_epoch=200 / save_period=4 if a checkpoint is saved every save_period epochs — confirm).
agent_ckpt = torch.load(config.save_ckpt_fmt % config.max_epoch, map_location=config.device_name)
lstm_agent = smiles_lstm.SmilesLSTMGenerator(
    vocab_obj,
    model_setting['emb_size'],
    model_setting['hidden_units'],
    device_name=config.device_name,
)
# Kept as the cell's last expression so the key-matching report is displayed.
lstm_agent.lstm.load_state_dict(agent_ckpt['model_state_dict'])

<All keys matched successfully>

In [5]:
# sampling
# Draw a small set of SMILES from the loaded agent for a visual sanity check.
# NOTE(review): presumably 50 is the number of molecules requested, `maxlen` caps the generated
# length, and `sample_clean` keeps only valid strings — confirm in analysis.SafeSampler.
ssplr = analysis.SafeSampler(lstm_agent, batch_size=16)
generated_smiles = ssplr.sample_clean(50, maxlen=150)
display(generated_smiles)

['CC1(N2CCc3c(-c4cnc(=N)[nH]c4)nc(N4CCOCC4)nc32)CCN(C(=O)c2cccc(C(=O)N3CCCC3)c2)C1',
 'CC1(N2CCc3c(-c4cnc(=N)[nH]c4)nc(N4CCOCC4)nc32)CCN(Cc2ncccc2F)C1',
 'CC1COCCN1c1nc(-c2cnc(=N)[nH]c2)c2c(n1)N(C1(C)CCN(S(=O)(=O)c3cn(C)cn3)CC1)CC2',
 'CC1(N2CCc3c(-c4cnc(=N)[nH]c4)nc(N4CCOC(CCF)C4)nc32)CCN(C(=O)c2cccc(C(F)(F)F)c2Cl)C1',
 'CC(SNC(=O)c1ccc(S(=O)(=O)N2CCOCC2)cc1)C(=O)Nc1ccc(S(=O)(=O)N2CCOCC2)cc1',
 'CC1(N2CCc3c(-c4cnc(=N)[nH]c4)nc(NCC4CC4)nc32)CCN(C(=O)c2ccc(NS(=O)(=O)C(F)(F)F)cc2F)C1',
 'CC1(N2CCc3c(-c4cnc(=N)[nH]c4)nc(N4CCOCC4)nc32)CCN(C(=O)c2cc(O)ccc2O)C1',
 'COC(=O)c1ccc(COc2ccc(CCNC(=O)N3CCN(C(=O)OC(C)(C)C)CC3)cc2)cc1',
 'CN(C)C(=O)c1ccc(S(=O)(=O)NCCc2ccc(OCc3ccc(Cl)c(Cl)c3)cc2)cc1',
 'CC1(N2CCc3c(-c4cnc(=N)[nH]c4)nc(N4CCOCC4)nc32)CCN(Cc2ccc(F)cn2)C1',
 'CC(C)(C)OC(=O)N1CCC(C(=O)NCCc2ccc(OCc3cccc(Cl)c3)cc2)CC1',
 'CC1(N2CCc3c(-c4cnc(=N)[nH]c4)nc(N4CCOCC4)nc32)CCN(C(=O)C2CCN(S(=O)(=O)Cc3ccsc3)C2)C1',
 'CC(C)OC(=O)N1CCC(Oc2ccc(OCc3ccc4c(c3)OCO4)cn2)C1',
 'COC(=O)c1cccc(COc2cccc(CCNS(=O

Build subsidiary files for the evaluation phase

In [6]:
# Per-epoch output path templates for the evaluation artefacts (the epoch number fills %d).
config.vc_fmt = project_paths['PROJECT_DIR'] + 'model-pik3ca/logics/pik3ca_logics_vc_e%d.smi'  # save valid & canonical smiles
config.npfps_fmt = project_paths['PROJECT_DIR'] + 'model-pik3ca/logics/pik3ca_logics_npfps_e%d.npy'  # save fingerprint in npy
config.fcvec_fmt = project_paths['PROJECT_DIR'] + 'model-pik3ca/logics/pik3ca_logics_fcvec_e%d.npy'  # save Frechet ChemNet vectors

# Epochs to post-process and evaluate. Only epoch 200 is listed here, so the best-epoch
# search further down has a single candidate. The commented-out line below sweeps
# 0..max_epoch in steps of save_period instead.
# epochs = list(range(0, config.max_epoch+1, config.save_period))
epochs = [200]

In [7]:
import os
# Hide CUDA devices from this process; set before the TensorFlow-backed `fcd` import below.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # use tensorflow cpu

import fcd
from logics_pack import frechet_chemnet
# Reference model that is later passed to fcd.get_predictions to obtain ChemNet vectors.
fc_ref_model = fcd.load_ref_model()

Using TensorFlow backend.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


2023-03-23 13:34:22.617318: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2023-03-23 13:34:22.649769: E tensorflow/stream_executor/cuda/cuda_driver.cc:318] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-03-23 13:34:22.649803: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: shepherd5
2023-03-23 13:34:22.649811: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: shepherd5
2023-03-23 13:34:22.649908: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 465.19.1
2023-03-23 13:34:22.649940: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 465.19.1
2023-03-23 13:34:22.649948: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 465.19.1
2023-03-23 13:34:22.650826: I tensorflow/core/platform/cpu_featu




In [8]:
# For each selected epoch: canonicalise the sampled SMILES, then cache their fingerprints
# and ChemNet vectors on disk for the evaluation cells.
for epo in epochs:
    print(epo)

    # raw samples of this epoch, one SMILES per line
    with open(config.sample_fmt % epo, 'r') as sample_file:
        sampled = [row.strip() for row in sample_file]

    valid_canons, invalid_ids = chemistry.get_valid_canons(sampled)
    print("- count invalids: ", len(invalid_ids))

    # valid & canonical SMILES, newline-terminated
    with open(config.vc_fmt % epo, 'w') as vc_file:
        vc_file.writelines(smi + '\n' for smi in valid_canons)

    # fingerprints, stored as a numpy array
    rdk_fps = chemistry.get_fps_from_smilist(valid_canons)
    np.save(config.npfps_fmt % epo, chemistry.rdk2npfps(rdk_fps))

    # ChemNet vectors
    chemnet_vecs = fcd.get_predictions(fc_ref_model, valid_canons)
    np.save(config.fcvec_fmt % epo, chemnet_vecs)

200
- count invalids:  90


Evaluate FCD and OTD on the validation set and pick the best epoch

In [12]:
# Validation set = the PIK3CA fold whose id is stored under 'pik3ca-pred-best-cv'
with open(project_paths['PIK3CA_FOLD_JSON'], 'r') as fold_file:
    pik3_folds = json.load(fold_file)
data_npfps = np.load(project_paths['PIK3CA_DATA_FP'])
data_fcvecs = np.load(project_paths['PIK3CA_DATA_FCVEC'])

# pick the rows of that fold out of the full-dataset arrays
val_fold_id = expset_obj.get_setting('pik3ca-pred-best-cv')
val_indices = pik3_folds[val_fold_id]
val_npfps = data_npfps[val_indices]
val_fcvecs = data_fcvecs[val_indices]
val_rdkfps = chemistry.np2rdkfps(val_npfps)

# sizes of the optimal-transport problem
dsize = len(val_rdkfps)  # demand size for OT
ssize = dsize * global_settings.OT_CALC_REPEATS  # supply size for repeated OT

In [13]:
# Validation metrics per evaluated epoch, aligned index-by-index with `epochs`
val_fcd_list = []
val_otd_list = []
ot_repeats = global_settings.OT_CALC_REPEATS

for epo in epochs:
    print(epo)

    # FCD between the validation and the generated ChemNet vectors
    gen_fcvecs = np.load(config.fcvec_fmt % epo)
    val_fcd_list.append(frechet_chemnet.fcd_calculation(val_fcvecs, gen_fcvecs))

    # OTD: only the first `ssize` generated fingerprints are needed
    gen_rdkfps = chemistry.np2rdkfps(np.load(config.npfps_fmt % epo)[:ssize])
    simmat = analysis.calculate_simmat(gen_rdkfps, val_rdkfps)  # row:gen, col:data
    distmat = analysis.transport_distmat(analysis.tansim_to_dist, simmat, ot_repeats)
    _, _, motds = analysis.repeated_optimal_transport(distmat, repeat=ot_repeats)
    val_otd_list.append(np.mean(motds))

200


In [14]:
# Score every evaluated epoch by the product FCD x OTD on the validation set
val_FCDxOTD = np.asarray(val_fcd_list) * np.asarray(val_otd_list)
# the epoch with the smallest product is taken as the best one
best_idx = np.argmin(val_FCDxOTD)
best_epoch = epochs[best_idx]
# record the choice in the experiment settings
expset_obj.update_setting('pik3ca-logics-best-epoch', best_epoch)

In [16]:
# Read the setting back to confirm that the best epoch was registered.
print(expset_obj.get_setting('pik3ca-logics-best-epoch'))

200
