In [1]:
import pandas as pd

from SMILESX import main, inference, utils
%load_ext autoreload
%aimport SMILESX
%autoreload 1

%matplotlib inline



#### **Read data file**

In [2]:
# Directory holding the benchmark CSV files. The trailing slash is required:
# the file path is built further down by plain string concatenation.
validation_data_dir = "./validation_data/"

In [3]:
# File extension appended to the dataset file name when the CSV path is assembled.
extension = '.csv'

In [4]:
# Dataset to model. 'FreeSolv', 'ESOL' and 'Lipophilicity' are recognised presets;
# any other value is used verbatim as the CSV file name (without extension).
data_name = 'FreeSolv' # FreeSolv, ESOL, Lipophilicity
# Name of the property column. It is overwritten for the presets above, so it only
# needs to be set by hand when data_name refers to a custom dataset.
prop_tag = ''

In [5]:
# Map each bundled benchmark to its (CSV base name, property column) pair.
# Any other data_name falls through to the default: the name itself becomes the
# file name and the prop_tag chosen by the user is kept unchanged.
data_filename, prop_tag = {
    'FreeSolv': ('FreeSolv_SAMPL', 'expt'),
    'ESOL': ('ESOL_delaney-processed', 'measured log solubility in mols per litre'),
    'Lipophilicity': ('Lipophilicity', 'exp'),
}.get(data_name, (data_name, prop_tag))

In [6]:
# Load the selected benchmark; the path is "<directory><file name><extension>".
sol_data = pd.read_csv("".join((validation_data_dir, data_filename, extension)))

In [7]:
# Preview the first three rows to see which columns the raw file provides.
sol_data.head(3)

Unnamed: 0.1,Unnamed: 0,iupac,smiles,expt,calc
0,0,"4-methoxy-N,N-dimethyl-benzamide",COc1ccc(C(=O)N(C)C)cc1,-11.01,-9.625
1,1,methanesulfonyl chloride,CS(=O)(=O)Cl,-4.87,-6.219
2,2,3-methylbut-1-ene,C=CC(C)C,1.83,2.452


#### **Observation**
* The column containing the SMILES strings must be named `smiles`; the property column is selected by name through `prop_tag`.

#### **Extract relevant data**

In [8]:
# Keep only the SMILES column and the property column named by prop_tag;
# every other column of the raw file is discarded from here on.
sol_data = sol_data[['smiles',prop_tag]]

In [9]:
# Preview the reduced frame (SMILES + property column).
sol_data.head()

Unnamed: 0,smiles,expt
0,COc1ccc(C(=O)N(C)C)cc1,-11.01
1,CS(=O)(=O)Cl,-4.87
2,C=CC(C)C,1.83
3,CCc1cnccn1,-5.45
4,CCCCCCCO,-4.21


In [10]:
# (rows, columns) of the reduced frame, recorded before the SMILES check.
sol_data.shape

(642, 2)

#### **SMILES validity check with RDKit**

In [11]:
# Run the SMILES check on the frame. The helper returns two values: the frame to
# keep working with and a list stored as bad_smiles_list.
# NOTE(review): presumably the returned frame has invalid SMILES removed and
# bad_smiles_list holds the rejected entries — confirm in utils.check_smiles.
sol_data, bad_smiles_list = utils.check_smiles(sol_data)

In [12]:
# Compare with the shape shown before the check to see whether it removed any rows.
sol_data.shape

(642, 2)

In [13]:
# Shape of the values in the second column, i.e. the property column selected via
# prop_tag (assuming the SMILES check preserved the column order — TODO confirm).
sol_data.iloc[:,1].values.shape

(642,)

#### Hyperparameter optimization with GPyOpt (Bayesian optimization)

In [14]:
# Candidate sizes for the search: the powers of two 2**3 ... 2**10 (8 to 1024).
# A denser alternative would be every integer from 1 to 1023.
dhyp_range = [1 << exponent for exponent in range(3, 11)]

# Search space handed to the Bayesian optimiser: the three tuned hyperparameters
# are all discrete and share the same domain (the very same list object).
bounds = [
    {'name': hyperparameter, 'type': 'discrete', 'domain': dhyp_range}
    for hyperparameter in ('lstmunits', 'denseunits', 'embedding')
]

In [15]:
# Launch the SMILES-X training run on the selected dataset. Keyword arguments are
# grouped by the part of the run their names refer to; see the SMILESX
# documentation for their exact semantics.
main.Main(
    # Dataset and output location
    data=sol_data,
    data_name=data_name,
    data_units='',
    outdir="./data/",
    # Splitting / repetition settings
    k_fold_number=3,
    n_seeds=1,
    seed_ref=None,
    augmentation=False,
    # Bayesian optimisation over the search space defined in `bounds`
    bayopt_on=True,
    bayopt_bounds=bounds,
    bayopt_n_rounds=1,
    # Reference hyperparameter values (the *_ref arguments)
    lstmunits_ref=16,
    denseunits_ref=1024,
    embedding_ref=256,
    # Hardware
    n_gpus=4,
    gpus_list=None,
    gpus_debug=False,
    # Training schedule and logging
    patience=50,
    n_epochs=400,
    batchsize_pergpu=None,
    lr_schedule=None,
    lr_min=1e-5,
    lr_max=1e-2,
    verbose=0,
)

4 Physical GPUs, 4 Logical GPUs detected and configured.
4 GPU device(s) will be used.

***SMILES_X starts...***

The SMILES_X process can be followed in the ./data/Main/FreeSolv/Can/2020-03-26_10:17:16_Main.log file.

Remaining time: <0.02 h. Processing the last fold of data...

***SMILES_X has terminated successfully.***



In [16]:
# Build the inference object for the dataset trained above. Despite its name,
# Inference_class holds an Inference *instance* (its .infer method is called later).
# indir points at the same "./data/" directory the training run wrote to.
Inference_class = inference.Inference(
    data_name=data_name,
    data_units='',
    augmentation=False,
    indir="./data/",
    outdir="./data/",
)

4 Physical GPUs, 4 Logical GPUs detected and configured.
4 GPU device(s) will be used.

No data augmentation is required.
Full vocabulary: ['pad', 'unk', 'P', '/', 'Br', 's', 'F', 'O', '4', 'n', '1', '2', ' ', '(', '[C@@H]', '5', '=', '[C@H]', '[C@@]', '[nH]', 'I', '-', 'Cl', '3', '[C@]', '[O-]', 'N', 'C', 'S', '#', ')', '[S+2]', 'c', '[N+]'], of size: 34.

Maximum length of tokenized SMILES: 47 tokens.
***************************************
***SMILES_X for inference initiated.***
***************************************



In [17]:
%%time
pred_from_ens = Inference_class.infer(smiles_list = sol_data.smiles.values.tolist(), check_smiles = True)

**************************************
***SMILES_X for inference starts...***
**************************************

Checking the SMILES list for inference.
Number of enumerated SMILES: 642.
Tokenization of SMILES.

****************************************
***Inference of SMILES property done.***
****************************************

CPU times: user 2.4 s, sys: 131 ms, total: 2.53 s
Wall time: 2.31 s


In [18]:
# Show the predictions: one row per SMILES with the ensemble mean (ens_pred_mean)
# and standard deviation (ens_pred_sd) columns.
pred_from_ens

Unnamed: 0,SMILES,ens_pred_mean,ens_pred_sd
0,COc1ccc(C(=O)N(C)C)cc1,-10.2116,0.387253
1,CS(=O)(=O)Cl,-5.75587,0.833627
2,C=CC(C)C,1.30396,0.166197
3,CCc1cnccn1,-5.83139,0.327533
4,CCCCCCCO,-4.38829,0.0934589
...,...,...,...
637,CCCCCCCC(=O)OC,-2.37421,0.545755
638,C1CCNC1,-5.01799,0.350627
639,O=Cc1ccc(O)cc1,-9.10979,0.316098
640,CCCCCCCCl,0.0445524,0.0957219
