In [1]:
import sys
# Add the parent directory to the module search path; presumably this is where
# the `logics_pack` package imported in the next cell lives -- TODO confirm.
sys.path.append('..')

In [2]:
from logics_pack import global_settings, chemistry, predictor, reward_functions, ahc
from logics_pack import analysis, smiles_vocab, smiles_lstm
import pandas as pd
import numpy as np
import json
import torch

# Table of project paths, built with the parent directory as the project root.
project_paths = global_settings.build_project_paths(project_dir='../')
# Experiment settings object constructed from the experiment-settings JSON path;
# this notebook reads "pik3ca-pred-best-cv" from it and writes
# "pik3ca-ahc-best-epoch" back to it.
expset_obj = global_settings.ExperimentSettings(project_paths['EXPERIMENT_SETTINGS_JSON'])

Perform AHC fine-tuning to build the agent generator

In [3]:
# ---- AHC fine-tuning configuration (passed to ahc.AHC_training) ----
project_dir = project_paths['PROJECT_DIR']
predictor_cv = expset_obj.get_setting("pik3ca-pred-best-cv")  # CV fold id used in the predictor file name

config = global_settings.Object()

# -- inputs: token vocabulary, pretraining settings, prior checkpoint, activity predictor --
config.tokens_path = project_paths['SMILES_TOKENS_PATH']
config.pretrain_setting_path = project_paths['PRETRAIN_SETTING_JSON']
config.pretrained_model_path = project_dir + 'model-prior/prior_e10.ckpt'
config.featurizer = predictor.featurizer
config.predictor_path = project_dir + ("model-pik3ca/predictor/pik3ca_rfr_cv%s.pkl" % predictor_cv)

# -- schedule and outputs --
# NOTE: for the reinforcement learning models an "epoch" is really one training batch.
config.max_epoch = 2000
config.save_period = 20
config.save_size = 20000
config.save_ckpt_fmt = project_dir + 'model-pik3ca/ahc/pik3ca_ahc_e%d.ckpt'  # filled with the epoch number
config.sample_fmt = project_dir + 'model-pik3ca/ahc/pik3ca_ahc_e%d.txt'  # filled with the epoch number

# -- reward shaping --
config.sigma = 15
config.topk = 0.5  # AHC parameter
config.nbmax = 100  # DF2 parameter
config.minscore = 0.5  # DF2 parameter
config.dfmode = "linear"  # DF2 parameter
config.rewarding = reward_functions.pAff_to_reward_t2

# -- optimisation and sampling --
config.train_batch_size = 128
config.finetune_lr = 0.0003
config.sampling_bs = 256

# -- hardware: machine-specific GPU index, adjust for the host in use --
config.device_name = 'cuda:5'

In [5]:
# Launch AHC fine-tuning with the `config` object defined in the previous cell.
# NOTE(review): presumably this writes a checkpoint and a sample file (via
# config.save_ckpt_fmt / config.sample_fmt) every config.save_period steps --
# confirm in ahc.AHC_training.
ahc.AHC_training(config)

NVIDIA GeForce RTX 3090 with CUDA capability sm_86 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70.
If you want to use the NVIDIA GeForce RTX 3090 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



--- 0 ---
---
uniq valid count:  118
avg pkx:  6.855987551099112
avg filtered scores  0.015686489011443632
size _scaffolds  1
top-k reward mean:  0.10765486610391704
---
uniq valid count:  121
avg pkx:  6.867995350395105
avg filtered scores  0.020046727496503294
size _scaffolds  3
top-k reward mean:  0.09804605950046331
---
uniq valid count:  122
avg pkx:  6.836356482833382
avg filtered scores  0.010330820826429169
size _scaffolds  4
top-k reward mean:  0.09781661121383697
---
uniq valid count:  119
avg pkx:  6.954357406708952
avg filtered scores  0.044714232041536066
size _scaffolds  7
top-k reward mean:  0.14437540334134796
--- 20 ---
---
uniq valid count:  120
avg pkx:  6.935081905932523
avg filtered scores  0.03932174487035397
size _scaffolds  11
top-k reward mean:  0.12940818965420442
---
uniq valid count:  124
avg pkx:  6.980177473060254
avg filtered scores  0.05172374232276404
size _scaffolds  14
top-k reward mean:  0.15150972304877525
---
uniq valid count:  126
avg pkx:  6.9961

---
uniq valid count:  125
avg pkx:  7.520235189315609
avg filtered scores  0.2010808037955658
size _scaffolds  860
top-k reward mean:  0.39133637251558256
---
uniq valid count:  121
avg pkx:  7.5387270505103485
avg filtered scores  0.20699886968201647
size _scaffolds  877
top-k reward mean:  0.3783748397920328
--- 320 ---
---
uniq valid count:  125
avg pkx:  7.404230001671778
avg filtered scores  0.1662713773744824
size _scaffolds  905
top-k reward mean:  0.3440935503524881
---
uniq valid count:  128
avg pkx:  7.599075630087027
avg filtered scores  0.2165311370757356
size _scaffolds  931
top-k reward mean:  0.39931084365855685
---
uniq valid count:  125
avg pkx:  7.461086320000328
avg filtered scores  0.18377495567521293
size _scaffolds  955
top-k reward mean:  0.38231620347159806
--- 340 ---
---
uniq valid count:  126
avg pkx:  7.490738887145167
avg filtered scores  0.19410099616885945
size _scaffolds  982
top-k reward mean:  0.3783565364698244
---
uniq valid count:  125
avg pkx:  7.

--- 620 ---
---
uniq valid count:  122
avg pkx:  7.707835089944392
avg filtered scores  0.2481947016158467
size _scaffolds  2281
top-k reward mean:  0.4470323260518414
---
uniq valid count:  122
avg pkx:  7.775987063401642
avg filtered scores  0.27139174767599755
size _scaffolds  2307
top-k reward mean:  0.45858640105837206
---
uniq valid count:  122
avg pkx:  7.649006033185646
avg filtered scores  0.23526040219716635
size _scaffolds  2328
top-k reward mean:  0.4355031906342657
--- 640 ---
---
uniq valid count:  122
avg pkx:  7.769946403668254
avg filtered scores  0.2676614780099762
size _scaffolds  2359
top-k reward mean:  0.4555066490227994
---
uniq valid count:  125
avg pkx:  7.673794707366155
avg filtered scores  0.23554342357046695
size _scaffolds  2388
top-k reward mean:  0.439934367968435
---
uniq valid count:  121
avg pkx:  7.733272542242122
avg filtered scores  0.25167830485677056
size _scaffolds  2416
top-k reward mean:  0.43669819519572983
--- 660 ---
---
uniq valid count:  

---
uniq valid count:  127
avg pkx:  7.87376019814195
avg filtered scores  0.29265424545879043
size _scaffolds  3683
top-k reward mean:  0.4688478881459617
--- 940 ---
---
uniq valid count:  127
avg pkx:  7.868115457666291
avg filtered scores  0.28240921276612385
size _scaffolds  3706
top-k reward mean:  0.46565220473383745
---
uniq valid count:  124
avg pkx:  7.804610655508649
avg filtered scores  0.27891395413239306
size _scaffolds  3742
top-k reward mean:  0.4648183333330236
---
uniq valid count:  125
avg pkx:  7.886116817123704
avg filtered scores  0.29201172630057753
size _scaffolds  3766
top-k reward mean:  0.4705338805191646
--- 960 ---
---
uniq valid count:  124
avg pkx:  7.954084246517507
avg filtered scores  0.3085601914902076
size _scaffolds  3792
top-k reward mean:  0.47068896488080936
---
uniq valid count:  126
avg pkx:  7.859416456880588
avg filtered scores  0.2835962686053291
size _scaffolds  3809
top-k reward mean:  0.4616810257918733
---
uniq valid count:  122
avg pkx:

--- 1240 ---
---
uniq valid count:  127
avg pkx:  8.083204555255131
avg filtered scores  0.3512470954288843
size _scaffolds  5070
top-k reward mean:  0.47478747733641324
---
uniq valid count:  126
avg pkx:  7.893590369415044
avg filtered scores  0.29015549096044085
size _scaffolds  5103
top-k reward mean:  0.4721484240265748
---
uniq valid count:  125
avg pkx:  7.982001625608754
avg filtered scores  0.32224978042305047
size _scaffolds  5136
top-k reward mean:  0.4694856663086504
--- 1260 ---
---
uniq valid count:  127
avg pkx:  7.997623088397712
avg filtered scores  0.3236311807383455
size _scaffolds  5163
top-k reward mean:  0.4719200375018034
---
uniq valid count:  126
avg pkx:  7.9124083864198544
avg filtered scores  0.301914094595957
size _scaffolds  5195
top-k reward mean:  0.469652091373457
---
uniq valid count:  126
avg pkx:  8.062890257885538
avg filtered scores  0.343679080904568
size _scaffolds  5235
top-k reward mean:  0.4711674818524249
---
uniq valid count:  123
avg pkx:  

---
uniq valid count:  126
avg pkx:  8.057403702489825
avg filtered scores  0.3376438493656346
size _scaffolds  6505
top-k reward mean:  0.46919517531916155
---
uniq valid count:  126
avg pkx:  7.906507958280849
avg filtered scores  0.3016134372349057
size _scaffolds  6535
top-k reward mean:  0.4646925682021604
--- 1560 ---
---
uniq valid count:  126
avg pkx:  8.134213119684228
avg filtered scores  0.3572007495878772
size _scaffolds  6562
top-k reward mean:  0.48066984255345513
---
uniq valid count:  125
avg pkx:  8.021220846774696
avg filtered scores  0.32874674358446876
size _scaffolds  6581
top-k reward mean:  0.4774644504349004
---
uniq valid count:  127
avg pkx:  7.9264531800254066
avg filtered scores  0.30719374170763947
size _scaffolds  6611
top-k reward mean:  0.4757815011931168
---
uniq valid count:  123
avg pkx:  8.055288523251459
avg filtered scores  0.3329081787115059
size _scaffolds  6642
top-k reward mean:  0.4731646044558854
--- 1580 ---
---
uniq valid count:  124
avg pk

--- 1860 ---
---
uniq valid count:  125
avg pkx:  8.070247071775771
avg filtered scores  0.3420380578369315
size _scaffolds  8049
top-k reward mean:  0.4747439534088246
---
uniq valid count:  126
avg pkx:  7.842172200842816
avg filtered scores  0.2839233982649458
size _scaffolds  8077
top-k reward mean:  0.4657187019725077
---
uniq valid count:  128
avg pkx:  8.063567358409136
avg filtered scores  0.3393913087188447
size _scaffolds  8110
top-k reward mean:  0.4816159687147431
---
uniq valid count:  123
avg pkx:  8.094588052613979
avg filtered scores  0.34687145639423134
size _scaffolds  8138
top-k reward mean:  0.4751732145735129
--- 1880 ---
---
uniq valid count:  127
avg pkx:  8.074994820711025
avg filtered scores  0.34164395147128324
size _scaffolds  8173
top-k reward mean:  0.4780517916530675
---
uniq valid count:  123
avg pkx:  8.03324533257771
avg filtered scores  0.33085953842900645
size _scaffolds  8208
top-k reward mean:  0.4759126295868209
---
uniq valid count:  126
avg pkx: 

Build subsidiary files for the evaluation phase

In [4]:
import os
# Hide every CUDA device from this process so the TensorFlow backend used by
# `fcd` runs on CPU. Keep this assignment ahead of the `import fcd` line below.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # use tensorflow cpu

import fcd
import pickle
from logics_pack import frechet_chemnet
# Reference model handed to fcd.get_predictions to produce ChemNet vectors.
fc_ref_model = fcd.load_ref_model()

# Path templates for the per-epoch evaluation artefacts; '%d' is filled with the epoch number.
project_dir = project_paths['PROJECT_DIR']
config.vc_fmt = project_dir + 'model-pik3ca/ahc/pik3ca_ahc_vc_e%d.smi'  # valid & canonical smiles
config.npfps_fmt = project_dir + 'model-pik3ca/ahc/pik3ca_ahc_npfps_e%d.npy'  # fingerprints as npy
config.fcvec_fmt = project_dir + 'model-pik3ca/ahc/pik3ca_ahc_fcvec_e%d.npy'  # Frechet ChemNet vectors

# Every saved epoch: 0, save_period, 2*save_period, ..., max_epoch (inclusive).
epochs = [*range(0, config.max_epoch + 1, config.save_period)]
np.array(epochs)

Using TensorFlow backend.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


2023-07-07 11:40:33.083825: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2023-07-07 11:40:33.122431: E tensorflow/stream_executor/cuda/cuda_driver.cc:318] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-07-07 11:40:33.122498: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: shepherd6
2023-07-07 11:40:33.122517: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: shepherd6
2023-07-07 11:40:33.122700: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 465.19.1
2023-07-07 11:40:33.122770: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 465.19.1
2023-07-07 11:40:33.122789: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 465.19.1
2023-07-07 11:40:33.124395: I tensorflow/core/platform/cpu_featu




array([   0,   20,   40,   60,   80,  100,  120,  140,  160,  180,  200,
        220,  240,  260,  280,  300,  320,  340,  360,  380,  400,  420,
        440,  460,  480,  500,  520,  540,  560,  580,  600,  620,  640,
        660,  680,  700,  720,  740,  760,  780,  800,  820,  840,  860,
        880,  900,  920,  940,  960,  980, 1000, 1020, 1040, 1060, 1080,
       1100, 1120, 1140, 1160, 1180, 1200, 1220, 1240, 1260, 1280, 1300,
       1320, 1340, 1360, 1380, 1400, 1420, 1440, 1460, 1480, 1500, 1520,
       1540, 1560, 1580, 1600, 1620, 1640, 1660, 1680, 1700, 1720, 1740,
       1760, 1780, 1800, 1820, 1840, 1860, 1880, 1900, 1920, 1940, 1960,
       1980, 2000])

In [5]:
# For each saved epoch: read the sampled SMILES, keep the valid canonical ones,
# and cache their fingerprints and ChemNet vectors on disk.
for epo in epochs:
    print(epo)

    # one generated SMILES per line
    with open(config.sample_fmt % epo, 'r') as sample_file:
        gens = [raw.strip() for raw in sample_file]

    vcs, invids = chemistry.get_valid_canons(gens)
    print("- count invalids: ", len(invids))

    # valid & canonical SMILES, newline-terminated
    with open(config.vc_fmt % epo, 'w') as vc_file:
        vc_file.writelines(smi + '\n' for smi in vcs)

    # fingerprints, stored as a numpy array
    fps = chemistry.get_fps_from_smilist(vcs)
    np.save(config.npfps_fmt % epo, chemistry.rdk2npfps(fps))

    # ChemNet vectors
    fcvecs = fcd.get_predictions(fc_ref_model, vcs)
    np.save(config.fcvec_fmt % epo, fcvecs)

0
- count invalids:  985
20
- count invalids:  1022
40
- count invalids:  978
60
- count invalids:  970
80
- count invalids:  1038
100
- count invalids:  1035
120
- count invalids:  846
140
- count invalids:  822
160
- count invalids:  806
180
- count invalids:  787
200
- count invalids:  827
220
- count invalids:  819
240
- count invalids:  812
260
- count invalids:  706
280
- count invalids:  721
300
- count invalids:  664
320
- count invalids:  696
340
- count invalids:  728
360
- count invalids:  626
380
- count invalids:  626
400
- count invalids:  631
420
- count invalids:  596
440
- count invalids:  605
460
- count invalids:  731
480
- count invalids:  598
500
- count invalids:  649
520
- count invalids:  556
540
- count invalids:  562
560
- count invalids:  547
580
- count invalids:  540
600
- count invalids:  581
620
- count invalids:  613
640
- count invalids:  560
660
- count invalids:  531
680
- count invalids:  510
700
- count invalids:  500
720
- count invalids:  494
740


Evaluate FCD and OTD on the validation set, and pick the best epoch

In [5]:
# Validation fold id recorded in the experiment settings under
# "pik3ca-pred-best-cv"; used in the next cell as the key into the fold table.
vfold = expset_obj.get_setting("pik3ca-pred-best-cv")
vfold

'2'

In [6]:
# Build the validation-set reference (active molecules only) used for FCD/OTD,
# and load the activity predictor used for PredAct.
affinity_data = pd.read_csv(project_paths['PIK3CA_DATA_PATH'])

# data split info: `folds[vfold]` is used below as positional row indices
with open(project_paths['PIK3CA_FOLD_JSON'], 'r') as f:
    folds = json.load(f)

# retrieve validation set
val_ids = folds[vfold]
val_data = affinity_data.iloc[val_ids]

# get validation set actives (vsa): rows whose affinity exceeds the activity threshold
vsa_data = val_data[val_data['affinity']>global_settings.PIK3CA_ACT_THRS]

# A bare `len(vsa_data)` in the middle of a cell is never displayed (only the
# last expression of a cell is), so report the count explicitly.
print("validation set actives: ", len(vsa_data))
if len(vsa_data) == 0:
    # Fail here with a clear message; with no reference molecules the FCD/OTD
    # computations further down cannot produce a meaningful result.
    raise ValueError(
        "no validation molecule in fold %r exceeds PIK3CA_ACT_THRS; "
        "cannot build the validation reference set" % (vfold,)
    )

vsa_smis = vsa_data['smiles'].tolist()
vsa_rdkfps = chemistry.get_fps_from_smilist(vsa_smis)
vsa_fc_vecs = fcd.get_predictions(fc_ref_model, vsa_smis)

dsize = len(vsa_rdkfps)  # demand size for OT
ssize = dsize*global_settings.OT_CALC_REPEATS  # supply size for repeated OT

# load predictor for PredAct (avg. predicted activity) calculation
# NOTE(review): this rebinds the name `predictor`, which the import cell bound to
# the `logics_pack.predictor` module; re-running the config cell
# (`predictor.featurizer`) after this cell will then use the unpickled object.
# The name is kept because the evaluation cell calls `predictor.predict(...)`.
# SECURITY: pickle.load can execute arbitrary code -- only load trusted files.
with open(config.predictor_path, 'rb') as f:
    predictor = pickle.load(f)

In [7]:
# Per-epoch validation metrics, appended in the same order as `epochs`.
val_fcd_list, val_otd_list, predact_list = [], [], []
ot_repeats = global_settings.OT_CALC_REPEATS  # loop-invariant, looked up once

for epo in epochs:
    print(epo)

    # FCD: cached ChemNet vectors of the generation vs. the validation actives
    gen_fcvecs = np.load(config.fcvec_fmt % epo)
    val_fcd_list.append(frechet_chemnet.fcd_calculation(gen_fcvecs, vsa_fc_vecs))

    # OTD: only the first `ssize` generated fingerprints are needed
    gen_npfps = np.load(config.npfps_fmt % epo)[:ssize]
    gen_rdkfps = chemistry.np2rdkfps(gen_npfps)
    simmat = analysis.calculate_simmat(gen_rdkfps, vsa_rdkfps)  # row:gen, col:data
    distmat = analysis.transport_distmat(analysis.tansim_to_dist, simmat, ot_repeats)
    _, _, motds = analysis.repeated_optimal_transport(distmat, repeat=ot_repeats)
    val_otd_list.append(np.mean(motds))

    # PredAct: mean predicted activity over the same fingerprints
    predact_list.append(np.mean(predictor.predict(gen_npfps)))

0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000
1020
1040
1060
1080
1100
1120
1140
1160
1180
1200
1220
1240
1260
1280
1300
1320
1340
1360
1380
1400
1420
1440
1460
1480
1500
1520
1540
1560
1580
1600
1620
1640
1660
1680
1700
1720
1740
1760
1780
1800
1820
1840
1860
1880
1900
1920
1940
1960
1980
2000


In [8]:
# validation FCDxOTD
val_FCDxOTD = np.array(val_fcd_list)*np.array(val_otd_list)
# dataframe for validation performance
v_perf = pd.DataFrame(epochs, columns=['epoch'])
v_perf['v-OTDxFCD'] = val_FCDxOTD
v_perf['v-OTD'] = val_otd_list
v_perf['v-FCD'] = val_fcd_list
v_perf['PredAct'] = predact_list
v_perf

Unnamed: 0,epoch,v-OTDxFCD,v-OTD,v-FCD,PredAct
0,0,238.407451,6.002111,39.720600,6.841045
1,20,232.473061,5.965062,38.972450,6.898469
2,40,235.180997,5.953102,39.505623,6.958623
3,60,227.234618,5.884525,38.615627,7.005883
4,80,226.084201,5.892840,38.365917,7.044240
...,...,...,...,...,...
96,1920,306.170447,5.970956,51.276621,8.068756
97,1940,306.752297,5.984850,51.254805,8.108896
98,1960,300.842114,5.971910,50.376199,8.093584
99,1980,298.363747,5.956041,50.094304,8.042051


In [9]:
# we are only interested in epochs that achieved PredAct > (activity threshold)
subv = v_perf[v_perf['PredAct']>global_settings.PIK3CA_ACT_THRS].copy()

# find the best epoch
vbest = subv.loc[subv['v-OTDxFCD'].idxmin()]
print(vbest)

# register the best epoch
expset_obj.update_setting('pik3ca-ahc-best-epoch', int(vbest['epoch']))

epoch        1660.000000
v-OTDxFCD     295.779663
v-OTD           5.934146
v-FCD          49.843674
PredAct         8.043140
Name: 83, dtype: float64


In [19]:
# Save the per-epoch validation performance table as CSV, without the row index.
v_perf.to_csv(project_paths['PROJECT_DIR']+'model-pik3ca/pik3ca-ahc-vperf.csv', index=False)