In [None]:
import sys
# Put the parent directory on the module search path.
# NOTE(review): presumably this is where the `logics_pack` package imported below lives — confirm.
sys.path.append('..')

In [None]:
from logics_pack import global_settings, chemistry, generator, analysis, smiles_vocab, smiles_lstm
import pandas as pd
import numpy as np
import json
import torch

# Mapping of named project locations, built relative to the parent directory ('../');
# the cells below index it by key (e.g. 'KOR_DATA_PATH', 'CHEMBL_DATA_PATH').
project_paths = global_settings.build_project_paths(project_dir='../')
# Experiment settings object constructed from the settings JSON path; queried later via get_setting().
expset_obj = global_settings.ExperimentSettings(project_paths['EXPERIMENT_SETTINGS_JSON'])

We first build the pre-training dataset. 

From the ChEMBL dataset, we exclude the molecules that appear in the validation and test sets of the predictor bioassay datasets (KOR and PIK3CA).

In [None]:
# bioassay tables for the two targets (the 'smiles' column is read in the next cell)
kor_aff = pd.read_csv(project_paths['KOR_DATA_PATH'])
pik3ca_aff = pd.read_csv(project_paths['PIK3CA_DATA_PATH'])

# fold split dictionaries, one JSON file per target
with open(project_paths['KOR_FOLD_JSON'], 'r') as kor_fold_file:
    kor_fs = json.load(kor_fold_file)
with open(project_paths['PIK3CA_FOLD_JSON'], 'r') as pik3ca_fold_file:
    pik3ca_fs = json.load(pik3ca_fold_file)

# validation fold idx per target, stringified because it is used as a key into the fold dictionaries
kvf = str(expset_obj.get_setting('kor-pred-best-cv'))
pvf = str(expset_obj.get_setting('pik3ca-pred-best-cv'))

# test fold idx: the same global constant for both targets
ktf = str(global_settings.TEST_FOLD_IDX)
ptf = str(global_settings.TEST_FOLD_IDX)

In [None]:
# Build the pre-training dataset: every ChEMBL SMILES except those belonging to the
# validation or test fold of either bioassay (KOR, PIK3CA).

# positional row indices of the KOR validation + test folds, and the SMILES at those rows
kor_vinds, kor_tinds = np.array(kor_fs[kvf]), np.array(kor_fs[ktf])
kor_excl = np.append(kor_vinds, kor_tinds)  # validation and test of KOR
kor_excl_smis = kor_aff['smiles'].iloc[kor_excl].tolist()  # exclusion from KOR

# same for PIK3CA
pik3ca_vinds, pik3ca_tinds = np.array(pik3ca_fs[pvf]), np.array(pik3ca_fs[ptf])
pik3ca_excl = np.append(pik3ca_vinds, pik3ca_tinds)   # validation and test of PIK3CA
pik3ca_excl_smis = pik3ca_aff['smiles'].iloc[pik3ca_excl].tolist()  # exclusion from PIK3CA

# combined exclusion list (new list; the per-target lists are left untouched) and a set for O(1) lookup
excl_smis = kor_excl_smis + pik3ca_excl_smis
set_excl_smis = set(excl_smis)

# one SMILES per line; surrounding whitespace/newlines stripped
with open(project_paths['CHEMBL_DATA_PATH'], 'r') as f:
    new_chembl = [line.strip() for line in f]

# Exclude the molecules from bioassay validation and test.
# dict.fromkeys() de-duplicates while keeping first-seen file order, so the written dataset is
# identical on every run. (Iterating a set of strings depends on per-process hash randomization,
# which made the output order differ between kernels.) Blank lines are dropped rather than
# written out as an empty entry.
# NOTE(review): exclusion is by exact string match — assumes ChEMBL and the bioassay tables use
# the same SMILES canonicalization; verify.
prior_smis = [
    smi for smi in dict.fromkeys(new_chembl)
    if smi and smi not in set_excl_smis
]
with open(project_paths['PRETRAINING_DATA_PATH'], 'w') as f:
    f.writelines(smi + '\n' for smi in prior_smis)

Perform pre-training to build the prior generator.

In [None]:
# Settings object handed to generator.pretrain() for training the prior generator.
config = global_settings.Object()

# -- inputs: training data, token list, and pre-training settings JSON
config.dataset_path = project_paths['PRETRAINING_DATA_PATH']
config.tokens_path = project_paths['SMILES_TOKENS_PATH']
config.pretrain_setting_path = project_paths['PRETRAIN_SETTING_JSON']

# -- outputs: %d-style path templates for checkpoints and sample files
config.save_ckpt_fmt = project_paths['PROJECT_DIR'] + 'model-prior/prior_e%d.ckpt'
config.sample_fmt = project_paths['PROJECT_DIR'] + 'model-prior/prior_e%d.txt'

# -- run parameters
config.max_epoch = 20
config.sample_size = 20000
config.device_name = 'cpu'

In [None]:
# perform pre-training
# NOTE(review): with the config above (max_epoch=20, device_name='cpu') this is presumably a
# long-running cell that writes per-epoch files matching save_ckpt_fmt / sample_fmt — confirm
# before relying on Restart & Run All.
generator.pretrain(config)

Load the prior generator (epoch-10 checkpoint) and sample some examples.

In [None]:
# model settings saved for pre-training ('emb_size' and 'hidden_units' are read below)
with open(config.pretrain_setting_path, 'r') as f:
    model_setting = json.load(f)

# vocabulary from the SMILES token file, plus a tokenizer built on it
vocab_obj = smiles_vocab.Vocabulary(init_from_file=config.tokens_path)
smtk = smiles_vocab.SmilesTokenizer(vocab_obj)

# build the generator on CPU with the saved settings ...
lstm_prior = smiles_lstm.SmilesLSTMGenerator(
    vocab_obj,
    model_setting['emb_size'],
    model_setting['hidden_units'],
    device_name='cpu',
)

# ... then restore the weights from the epoch-10 checkpoint (tensors mapped to CPU)
pret_ckpt = torch.load(config.save_ckpt_fmt % 10, map_location='cpu')
lstm_prior.lstm.load_state_dict(pret_ckpt['model_state_dict'])

In [None]:
# sampling
# NOTE(review): no random seed is set in the visible cells, so the generated samples presumably
# differ from run to run — confirm whether that is intended.
ssplr = analysis.SafeSampler(lstm_prior, batch_size=16)
# presumably returns 50 cleaned SMILES of at most 150 tokens — see analysis.SafeSampler.sample_clean
generated_smiles = ssplr.sample_clean(50, maxlen=150)
display(generated_smiles)