In [1]:
import sys

# Make the parent directory importable so that `logics_pack` can be imported from this notebook.
# The membership check keeps the cell idempotent: re-running it does not pile up duplicate entries.
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
import pandas as pd
import numpy as np
import pickle
import json
from logics_pack import global_settings, analysis, chemistry, evaluation, frechet_chemnet
import fcd

# Resolve the project paths relative to the parent directory, then open the experiment settings
# stored under the 'EXPERIMENT_SETTINGS_JSON' entry.
project_paths = global_settings.build_project_paths(project_dir='../')
settings_json_path = project_paths['EXPERIMENT_SETTINGS_JSON']
expset_obj = global_settings.ExperimentSettings(settings_json_path)

# Display the names of the available settings
expset_obj.get_keys()

Using TensorFlow backend.


dict_keys(['kor-pred-best-cv', 'pik3ca-pred-best-cv', 'kor-logics-best-epoch', 'kor-vgpc-best-epoch', 'kor-segler-best-epoch', 'kor-reinv-best-epoch', 'kor-drugex-best-epoch', 'pik3ca-logics-best-epoch', 'pik3ca-segler-best-epoch', 'pik3ca-vgpc-best-epoch', 'pik3ca-reinv-best-epoch', 'pik3ca-drugex-best-epoch'])

In [3]:
## Optional: run this cell if a CuDNN library error occurs.
## Setting CUDA_VISIBLE_DEVICES to '-1' hides the GPUs from CUDA, so the cells below run without one.
import os
os.environ.update({'CUDA_VISIBLE_DEVICES': '-1'})

In [4]:
# Load the reference model shipped with the `fcd` package.
# It is passed to `fcd.get_predictions` further down to compute the vectors of the test set actives.
fc_ref_model = fcd.load_ref_model()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


2023-04-12 16:21:11.707773: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2023-04-12 16:21:11.749494: E tensorflow/stream_executor/cuda/cuda_driver.cc:318] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-04-12 16:21:11.749526: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: shepherd7
2023-04-12 16:21:11.749532: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: shepherd7
2023-04-12 16:21:11.749637: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 465.19.1
2023-04-12 16:21:11.749665: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 465.19.1
2023-04-12 16:21:11.749670: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 465.19.1
2023-04-12 16:21:11.750618: I tensorflow/core/platform/cpu_featu




In [5]:
# 20k molecules were sampled and saved for each model
SAMPLE_SIZE = 20000
# number of samples to be used for the intdiv calculation
INTDIV_SIZE = 1000

# Empty table that the evaluation below fills out: one row per metric, one column per model
metrics = ['validity','uniqueness','novelty','diversity','PredAct','PwSim','FCD','OTD']
model_names = ['prior','vgpc','segler','reinv','drugex','logics']
perf_table = pd.DataFrame(index=metrics, columns=model_names)

In [6]:
# Recorded epoch for each model.
# The prior's epoch is fixed here; every other model reads its best epoch from the
# experiment settings under the key 'pik3ca-<model>-best-epoch'.
model_e = {'prior': 10}
model_e.update({
    name: int(expset_obj.get_setting('pik3ca-{}-best-epoch'.format(name)))
    for name in ('vgpc', 'segler', 'reinv', 'drugex', 'logics')
})
model_e

{'prior': 10,
 'vgpc': 16,
 'segler': 150,
 'reinv': 21000,
 'drugex': 4230,
 'logics': 200}

In [7]:
# Generation file paths.
# Folder of each PIK3CA model under model-pik3ca/; note that the 'reinv' files live in 'reinvent/'.
_MODEL_SUBDIRS = {
    'vgpc': 'vgpc',
    'segler': 'segler',
    'reinv': 'reinvent',
    'drugex': 'drugex',
    'logics': 'logics',
}


def _generation_paths(kind, ext):
    """Return a dict mapping each model name to its saved file of the given kind.

    kind: tag embedded in the file name ('vc', 'npfps' or 'fcvec').
    ext:  file extension without the dot ('smi' or 'npy').
    The epoch in each file name is taken from `model_e`.
    """
    root = project_paths['PROJECT_DIR']
    # the prior lives in its own folder and its file name has no 'pik3ca_' prefix
    paths = {'prior': root + 'model-prior/prior_{}_e{}.{}'.format(kind, model_e['prior'], ext)}
    for name, subdir in _MODEL_SUBDIRS.items():
        relative = 'model-pik3ca/{}/pik3ca_{}_{}_e{}.{}'.format(subdir, name, kind, model_e[name], ext)
        paths[name] = root + relative
    return paths


paths_vc = _generation_paths('vc', 'smi')
paths_npfps = _generation_paths('npfps', 'npy')
paths_fc_vecs = _generation_paths('fcvec', 'npy')

In [8]:
# Load the pre-training dataset: one entry per line, with surrounding whitespace removed
with open(project_paths['PRETRAINING_DATA_PATH'], 'r') as pret_file:
    pret_smis = list(map(str.strip, pret_file))
len(pret_smis)

1583442

In [9]:
# Load the predictor that belongs to the best cross-validation setting recorded in the experiment settings
project_dir = project_paths['PROJECT_DIR']
best_cv = expset_obj.get_setting("pik3ca-pred-best-cv")
pred_path = project_dir + "model-pik3ca/predictor/pik3ca_rfr_cv{}.pkl".format(best_cv)
# NOTE: unpickling can execute arbitrary code, so only load predictor files from a trusted source
with open(pred_path, 'rb') as pred_file:
    predictor = pickle.load(pred_file)
predictor

RandomForestRegressor()

In [10]:
# Load the test set actives (tsa)
affinity_data = pd.read_csv(project_paths['PIK3CA_DATA_PATH'])
with open(project_paths['PIK3CA_FOLD_JSON'], 'r') as fold_file:
    folds = json.load(fold_file)

# The fold file is keyed by the fold index as a string; its values are used as row positions
test_ids = folds[str(global_settings.TEST_FOLD_IDX)]
test_data = affinity_data.iloc[test_ids]

# Actives are the test rows whose affinity is above the PIK3CA activity threshold
is_active = test_data['affinity'].gt(global_settings.PIK3CA_ACT_THRS)
tsa_data = test_data.loc[is_active]
tsa_smis = tsa_data['smiles'].tolist()

# Representations of the actives that the evaluation configs below take as reference data
tsa_rdkfps = chemistry.get_fps_from_smilist(tsa_smis)
tsa_fc_vecs = fcd.get_predictions(fc_ref_model, tsa_smis)

In [11]:
# Build one evaluation config object per model from its saved generation files
evcons = {}
for mn in model_names:
    # iterate the file directly instead of materialising an intermediate list with readlines()
    with open(paths_vc[mn], 'r') as f:
        vc_smis = [line.strip() for line in f]
    npfps = np.load(paths_npfps[mn])
    fc_vecs = np.load(paths_fc_vecs[mn])
    evc = evaluation.EvalConfig(
            ssize=SAMPLE_SIZE, vc_smis=vc_smis, npfps=npfps, simmat_size=INTDIV_SIZE, fc_vecs=fc_vecs,
            # Pass the SMILES list rather than the whole `tsa_data` DataFrame, so that data_smis lines up
            # with data_rdkfps / data_fc_vecs, which were both computed from `tsa_smis`.
            # NOTE(review): EvalConfig is defined outside this notebook - confirm it expects a list of SMILES.
            data_smis=tsa_smis, data_rdkfps=tsa_rdkfps, data_fc_vecs=tsa_fc_vecs,
            ot_repeats=global_settings.OT_CALC_REPEATS
    )
    evcons[mn] = evc

In [12]:
# Evaluate every model and fill out its column of the performance table
for mn in model_names:
    print(mn)
    va, uni, nov, div = evaluation.eval_standard(evcons[mn], pret_smis)
    predact, pwsim, fcdval, otdval = evaluation.eval_optimization(evcons[mn], predictor)
    scores = {
        'validity': va,
        'uniqueness': uni,
        'novelty': nov,
        'diversity': div,
        'PredAct': predact,
        'PwSim': pwsim,
        'FCD': fcdval,
        'OTD': otdval,
    }
    # Write through a single .loc[row, column] call. The chained form perf_table[mn][metric] = value
    # assigns into an intermediate object and does not update perf_table under pandas Copy-on-Write.
    for metric, value in scores.items():
        perf_table.loc[metric, mn] = value

prior
vgpc
segler
reinv
drugex
logics


In [13]:
# Display the filled-out performance table (rows: metrics, columns: models)
perf_table

Unnamed: 0,prior,vgpc,segler,reinv,drugex,logics
validity,0.95365,0.8338,0.97105,0.99815,0.9832,0.9955
uniqueness,0.999266,0.997841,0.94645,0.655312,0.996135,0.715671
novelty,0.948318,0.994892,0.99913,0.933726,0.997498,0.994877
diversity,0.88942,0.824073,0.782522,0.789715,0.802702,0.737681
PredAct,6.846708,8.049363,8.753931,8.83186,8.390269,9.539011
PwSim,0.100009,0.111354,0.120689,0.174694,0.11307,0.187271
FCD,40.912529,43.751357,45.795805,32.597269,44.000946,29.669013
OTD,6.011624,5.934928,5.754742,4.456088,5.881252,4.273728
