In [1]:
import sys
# Add the parent directory to the module search path so the imports in the next
# cell can resolve (presumably this is where `logics_pack` lives; confirm).
sys.path.append('..')

In [2]:
from logics_pack import global_settings, chemistry, generator, predictor, analysis, smiles_vocab, smiles_lstm
import pandas as pd
import numpy as np
import json
import torch

# Look up the project's file locations, using '../' as the project directory.
project_paths = global_settings.build_project_paths(project_dir='../')

# Experiment-wide settings object backed by the settings JSON; it is read
# (get_setting) and written (update_setting) further down in this notebook.
experiment_settings_json = project_paths['EXPERIMENT_SETTINGS_JSON']
expset_obj = global_settings.ExperimentSettings(experiment_settings_json)

Perform vanilla GPC (VGPC) fine-tuning to build the agent generator

In [3]:
# Settings container that is handed to generator.VanillaGPC_training.
config = global_settings.Object()

project_dir = project_paths['PROJECT_DIR']
best_cv = expset_obj.get_setting("kor-pred-best-cv")

# --- inputs: token file, pretraining settings/checkpoint, featurizer, predictor ---
config.tokens_path = project_paths['SMILES_TOKENS_PATH']
config.pretrain_setting_path = project_paths['PRETRAIN_SETTING_JSON']
config.pretrained_model_path = project_dir + 'model-prior/prior_e10.ckpt'
config.featurizer = predictor.featurizer
# the predictor file is chosen by the registered "kor-pred-best-cv" setting
config.predictor_path = project_dir + "model-kor/predictor/kor_rfr_cv%s.pkl" % best_cv

# --- schedule and per-epoch output templates (filled with the epoch via `% epoch`) ---
config.max_epoch = 50
config.save_period = 1
vgpc_epoch_stem = project_dir + 'model-kor/vgpc/kor_vgpc_e%d'
config.save_ckpt_fmt = vgpc_epoch_stem + '.ckpt'
config.sample_fmt = vgpc_epoch_stem + '.txt'

# --- sizes and optimisation hyper-parameters ---
# NOTE(review): exact meaning of each size is defined by VanillaGPC_training; confirm there.
config.save_size = 20000
config.gen_size = config.save_size
config.high_score_size = int(config.gen_size / 5)  # one fifth of gen_size
config.finetune_lr = 5e-5
config.finetune_bs = 128
config.sampling_bs = 256

config.device_name = 'cpu'

In [None]:
# perform fine-tuning
# NOTE(review): this cell has no execution count in the saved notebook. The cells
# below read files from config.save_ckpt_fmt / config.sample_fmt, which are
# presumably written by this call; run it first on a fresh checkout.
generator.VanillaGPC_training(config)

Load the VGPC agent generator and sample some examples

In [4]:
# Vocabulary and tokenizer built from the token file referenced by the config.
vocab_obj = smiles_vocab.Vocabulary(init_from_file=config.tokens_path)
smtk = smiles_vocab.SmilesTokenizer(vocab_obj)

# Network sizes come from the pretraining settings JSON.
with open(config.pretrain_setting_path, 'r') as setting_file:
    model_setting = json.load(setting_file)

# load agent model (epoch=20)
agent_ckpt_path = config.save_ckpt_fmt % 20
agent_ckpt = torch.load(agent_ckpt_path, map_location='cpu')
lstm_agent = smiles_lstm.SmilesLSTMGenerator(
    vocab_obj,
    model_setting['emb_size'],
    model_setting['hidden_units'],
    device_name='cpu',
)
# Kept as the last expression so the key-matching report is shown as cell output.
lstm_agent.lstm.load_state_dict(agent_ckpt['model_state_dict'])

<All keys matched successfully>

In [5]:
# Draw a small batch of example SMILES from the loaded agent and show them.
ssplr = analysis.SafeSampler(
    lstm_agent,
    batch_size=16,
)
generated_smiles = ssplr.sample_clean(
    50,
    maxlen=150,
)
display(generated_smiles)

['CC(C)CC(NC(=O)C(CC(C)C)NC(=O)C(CCCc1ccccc1NC(=O)C(N)CCCNC(=N)N)NC(=O)c1ccnc(-c2ccc(-c3ccc(C(=N)NC)cc3)cc2)c1)C(N)=O',
 'CCCNC(=O)CCC(CC(CCC(N)C(=O)O)C(=O)NC(CCCCN)C(=O)NC(=O)C(CC(C)C)NC(=O)C(N)CCCNC(N)=O)C(=O)NC(C)C(N)=O',
 'CC(C)CC(NC(=O)C(Cc1ccccc1)OC(=O)N(CC(C)(C)C)C(C)(Cc1ccccc1)C(N)=O)C(CCCCN)NC(=O)C(C)(C)C(N)CC(=O)NC(C(=O)O)C(C)C',
 'CS(=O)(=O)NC1CCCC(NCC(=O)NC(CCCNC(=N)N)C(N)=O)C(C(N)=O)C(CC(C)(C)C)C1',
 'CC(CN1CC2CN(C(=O)Cc3cnccs3)CCC2C1)NC(N)=O',
 'CC(C)CC(NC(=O)C(=O)NCc1ccccc1)C(=O)NC(CCCC(N)CS)C(=O)N1CCCCC1',
 'CC(C)CC(NC(=O)C(CCCCN)NC(=O)C(N)CC(=O)NCCCCCN)C(=O)NC(CC(N)=O)C(N)=O',
 'CC(C)CC(NC(=O)C(CC(C)C)NC(=O)C(CCCN=C(N)N)NC(=O)C(N)CCCCN)C(=O)NC(CC(C)C)C(=O)NC(Cc1ccccc1)C(N)=O',
 'CC(C)CC(NC(=O)C(CCCNC(=N)N)NC(=O)C(N)Cc1ccc(O)cc1CC1C(=O)NC(=O)C(C)NC(=O)C(O)C(C)N1C(=O)OCc2ccccc2)C1NC(C)(C)CC(C)C',
 'CC(C)CC(NC(=O)C(CCCNC(=N)N)NC(=O)OC(C)(C)C)C(=O)NC(C(=O)NC(CCCC(N)C(=O)O)C(=O)NC(CC(C)C)C(=O)NN)C(=O)NC(Cc1ccc(O)cc1)C(N)=O',
 'CC(C)C(C(N)=O)N(Cc1ccccc1F)S(=O)(=O)c1ccccc1-c1

Build subsidiary files for the evaluation phase

In [8]:
# Per-epoch path templates for the evaluation artifacts (filled via `% epoch`).
vgpc_eval_prefix = project_paths['PROJECT_DIR'] + 'model-kor/vgpc/kor_vgpc_'
# valid & canonical SMILES, written as plain text lines
# NOTE(review): the '.ckpt' extension is unusual for a text file; kept as-is because
# other code may rely on this path.
config.vc_fmt = vgpc_eval_prefix + 'vc_e%d.ckpt'
# fingerprints saved as .npy
config.npfps_fmt = vgpc_eval_prefix + 'npfps_e%d.npy'
# Frechet ChemNet vectors saved as .npy
config.fcvec_fmt = vgpc_eval_prefix + 'fcvec_e%d.npy'

# Epochs to evaluate. Full sweep over every saved epoch:
# epochs = list(range(0, config.max_epoch+1, config.save_period))
epochs = [20]

In [9]:
import os
# Hide CUDA devices before the imports below so the TensorFlow backend stays on CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # use tensorflow cpu

# NOTE(review): these imports presumably must stay after the environment variable
# is set (the cell output reports the TensorFlow backend loading here); confirm
# before moving them into the top-level import cell.
import fcd
from logics_pack import frechet_chemnet
# Reference model passed to fcd.get_predictions in a later cell.
fc_ref_model = fcd.load_ref_model()

Using TensorFlow backend.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


2023-03-22 10:37:32.333419: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2023-03-22 10:37:32.368792: E tensorflow/stream_executor/cuda/cuda_driver.cc:318] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-03-22 10:37:32.368827: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: shepherd5
2023-03-22 10:37:32.368835: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: shepherd5
2023-03-22 10:37:32.368923: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 465.19.1
2023-03-22 10:37:32.368959: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 465.19.1
2023-03-22 10:37:32.368966: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 465.19.1
2023-03-22 10:37:32.370248: I tensorflow/core/platform/cpu_featu




In [12]:
# For each evaluated epoch: read the sampled SMILES, keep the valid canonical ones,
# and store them together with their fingerprints and ChemNet vectors.
for epo in epochs:
    print(epo)

    # generated samples, one per line
    with open(config.sample_fmt % epo, 'r') as sample_file:
        gens = [raw_line.strip() for raw_line in sample_file]

    vcs, invids = chemistry.get_valid_canons(gens)
    print("- count invalids: ", len(invids))

    # valid & canonical SMILES as text, one per line
    with open(config.vc_fmt % epo, 'w') as vc_file:
        vc_file.writelines(smi + '\n' for smi in vcs)

    # fingerprints converted to a numpy array before saving
    fps = chemistry.get_fps_from_smilist(vcs)
    npfps = chemistry.rdk2npfps(fps)
    np.save(config.npfps_fmt % epo, npfps)

    # ChemNet vectors
    fcvecs = fcd.get_predictions(fc_ref_model, vcs)
    np.save(config.fcvec_fmt % epo, fcvecs)

20
- count invalids:  2432


Evaluate FCD (Frechet ChemNet Distance) and OTD (optimal transport distance) on the validation set, and pick the best epoch

In [13]:
# Load the fold assignment plus the precomputed fingerprints / ChemNet vectors
# of the dataset, then slice out the validation fold.
with open(project_paths['KOR_FOLD_JSON'], 'r') as fold_file:
    kor_folds = json.load(fold_file)
data_npfps = np.load(project_paths['KOR_DATA_FP'])
data_fcvecs = np.load(project_paths['KOR_DATA_FCVEC'])

# the validation fold is the one registered under 'kor-pred-best-cv'
val_fold_id = expset_obj.get_setting('kor-pred-best-cv')
val_indices = kor_folds[val_fold_id]
val_npfps = data_npfps[val_indices]
val_fcvecs = data_fcvecs[val_indices]
val_rdkfps = chemistry.np2rdkfps(val_npfps)

# demand size for OT
dsize = len(val_rdkfps)
# supply size for repeated OT
ssize = dsize * global_settings.OT_CALC_REPEATS

In [14]:
# Collect one FCD value and one mean OTD value per evaluated epoch.
val_fcd_list, val_otd_list = [], []
ot_repeats = global_settings.OT_CALC_REPEATS

for epo in epochs:
    print(epo)

    # FCD between the validation vectors and the generated ones
    gen_fcvecs = np.load(config.fcvec_fmt % epo)
    fcdval = frechet_chemnet.fcd_calculation(val_fcvecs, gen_fcvecs)
    val_fcd_list.append(fcdval)

    # OTD: only the first `ssize` generated fingerprints are needed
    gen_npfps = np.load(config.npfps_fmt % epo)[:ssize]
    gen_rdkfps = chemistry.np2rdkfps(gen_npfps)
    # similarity matrix, rows = generated, columns = validation data
    simmat = analysis.calculate_simmat(gen_rdkfps, val_rdkfps)
    distmat = analysis.transport_distmat(analysis.tansim_to_dist, simmat, ot_repeats)
    _, _, motds = analysis.repeated_optimal_transport(distmat, repeat=ot_repeats)
    val_otd_list.append(np.mean(motds))

20


In [15]:
# Guard against stale kernel state: both score lists must hold exactly one entry
# per element of `epochs` (e.g. `epochs` edited without re-running the evaluation
# cell), otherwise the argmin index below would point at the wrong epoch.
assert len(val_fcd_list) == len(val_otd_list) == len(epochs), (
    "score lists are out of sync with `epochs`; re-run the evaluation cell"
)

# validation FCDxOTD: one combined score per evaluated epoch
val_FCDxOTD = np.array(val_fcd_list)*np.array(val_otd_list)

# np.argmin returns the position of a NaN if one is present, which would silently
# register a broken epoch in the settings file, so insist on finite scores.
assert np.all(np.isfinite(val_FCDxOTD)), "non-finite FCDxOTD score; cannot pick a best epoch"

# find the best epoch (smallest FCDxOTD)
best_epoch = epochs[np.argmin(val_FCDxOTD)]
# register the best epoch
expset_obj.update_setting('kor-vgpc-best-epoch', best_epoch)

In [16]:
# Read the value back through the settings object to confirm it was registered.
print(expset_obj.get_setting('kor-vgpc-best-epoch'))

20
