In [1]:
import pandas as pd

from SMILESX import main, inference, utils
%load_ext autoreload
%aimport SMILESX
%autoreload 1

%matplotlib inline



#### **Read data file**

In [2]:
# Folder holding the benchmark CSV files. Keep the trailing slash: the file
# path is built by plain string concatenation when the data is read.
validation_data_dir = "./validation_data/"

In [3]:
# File extension appended to the dataset file name when the path is built.
extension = '.csv'

In [4]:
# Dataset to model. The three names listed on the next line are mapped to their
# file name and property column in the following cell. Any other name is used
# directly as the file name, in which case prop_tag must be set here to the
# name of the property column in that file.
data_name = 'Lipophilicity' # FreeSolv, ESOL, Lipophilicity
prop_tag = ''

In [5]:
# Resolve the CSV file name and the property column for the chosen dataset.
# Each entry maps data_name -> (file name without extension, property column).
known_datasets = {
    'FreeSolv': ('FreeSolv_SAMPL', 'expt'),
    'ESOL': ('ESOL_delaney-processed', 'measured log solubility in mols per litre'),
    'Lipophilicity': ('Lipophilicity', 'exp'),
}

if data_name in known_datasets:
    data_filename, prop_tag = known_datasets[data_name]
else:
    # Custom dataset: the file is named after data_name, and prop_tag keeps
    # the value assigned to it earlier in the notebook.
    data_filename = data_name

In [6]:
# Read the selected dataset; the path is directory + file name + extension,
# joined without any separator.
sol_data = pd.read_csv(f"{validation_data_dir}{data_filename}{extension}")

In [7]:
# Preview the first rows of the raw file to check which columns are available.
sol_data.head(3)

Unnamed: 0.1,Unnamed: 0,CMPD_CHEMBLID,exp,smiles
0,0,CHEMBL596271,3.54,Cn1c(CN2CCN(c3ccc(Cl)cc3)CC2)nc2ccccc21
1,1,CHEMBL1951080,-1.18,COc1cc(OC)c(S(=O)(=O)N2c3ccccc3CCC2C)cc1NC(=O)...
2,2,CHEMBL1771,3.69,COC(=O)[C@H](c1ccccc1Cl)N1CCc2sccc2C1


#### **Observation**
* The column containing the SMILES strings must be named `smiles`.

#### **Extract relevant data**

In [8]:
# Keep only the SMILES column and the property column, in that order.
sol_data = sol_data.loc[:, ['smiles', prop_tag]]

In [9]:
# Confirm that only the SMILES and property columns remain.
sol_data.head()

Unnamed: 0,smiles,exp
0,Cn1c(CN2CCN(c3ccc(Cl)cc3)CC2)nc2ccccc21,3.54
1,COc1cc(OC)c(S(=O)(=O)N2c3ccccc3CCC2C)cc1NC(=O)...,-1.18
2,COC(=O)[C@H](c1ccccc1Cl)N1CCc2sccc2C1,3.69
3,O=C(NC1Cc2ccccc2N(C[C@@H](O)CO)C1=O)c1cc2cc(Cl...,3.37
4,Cc1cccc(C[C@H](NC(=O)c2cc(C(C)(C)C)nn2C)C(=O)N...,3.1


In [10]:
# (rows, columns) of the dataset before the SMILES check.
sol_data.shape

(4200, 2)

#### **SMILES check with RDKit**

In [11]:
# utils.check_smiles returns the data frame to carry forward plus a second
# value kept here as bad_smiles_list.
# NOTE(review): presumably bad_smiles_list holds the entries that failed the
# check — confirm in SMILESX.utils.
sol_data, bad_smiles_list = utils.check_smiles(sol_data)

In [12]:
# Shape after the SMILES check; compare the row count with the shape printed
# before the check to see whether any entries were removed.
sol_data.shape

(4200, 2)

In [13]:
# Shape of the values in the second column (index 1), i.e. the property column
# given the ['smiles', prop_tag] selection made earlier.
sol_data.iloc[:,1].values.shape

(4200,)

#### **Hyperparameter optimization with GPyOpt (Bayesian optimization)**

In [14]:
# Candidate values shared by every hyperparameter: powers of two, 2**3 .. 2**10.
dhyp_range = [2 ** exponent for exponent in range(3, 11)]
# Alternative (denser) grid: dhyp_range = list(range(1, 1024))

# One discrete search dimension per hyperparameter, all over the same grid.
bounds = [
    {'name': hyperparameter, 'type': 'discrete', 'domain': dhyp_range}
    for hyperparameter in ('lstmunits', 'denseunits', 'embedding', 'batchsize')
]

In [None]:
# Launch training on the prepared data. Bayesian optimisation is switched off
# (bayopt_on=False), although the search bounds are still handed over.
# NOTE(review): presumably the *_ref values below are then used as the fixed
# architecture — confirm in SMILESX.main.
# NOTE(review): seed_ref=None — presumably no fixed seed, so runs may not be
# reproducible; confirm and set a seed if reproducibility is required.
main.Main(
    data=sol_data,
    data_name=data_name,
    data_units='',
    bayopt_bounds=bounds,
    k_fold_number=10,
    augmentation=True,
    outdir="./data/",
    bayopt_n_rounds=25,
    bayopt_on=False,
    lstmunits_ref=16,
    denseunits_ref=1024,
    embedding_ref=256,
    seed_ref=None,
    n_gpus=4,  # the recorded run reports 4 GPUs in use; adjust to the local machine
    gpus_list=None,
    gpus_debug=False,
    batchsize_pergpu=128,
    patience=50,
    n_epochs=400,
    lr_schedule=None,
    verbose=0,
)

4 Physical GPUs, 4 Logical GPUs detected and configured.
4 GPU device(s) will be used.

***SMILES_X starts...***

The SMILES_X process can be followed in the ./data/Main/Lipophilicity/Augm/2020-03-18_14:57:23_Main.log file.

Remaining time: 1.46 h. Processing fold #4 of data...

In [13]:
# Predict the property for a handful of SMILES with the trained models.
# 'ABC' and 'DEF' are not valid SMILES; the recorded output lists predictions
# for the first three entries only.
# NOTE(review): training above was launched with k_fold_number=10 while
# inference uses 3 — confirm this difference is intentional.
pred_from_ens = inference.Inference(
    data_name=data_name,
    smiles_list=['CC', 'CCC', 'C=O', 'ABC', 'DEF'],
    data_units='',
    k_fold_number=3,
    augmentation=True,
    outdir="./data/",
)

***SMILES_X for inference starts...***


***Checking the SMILES list for inference***

***Data augmentation.***

Enumerated SMILES: 5

***Tokenization of SMILES.***

Full vocabulary: ['pad', 'unk', 'Cl', '[N+]', '\\', ')', '2', '-', 'c', '1', 'P', 'n', '=', ' ', '[C@@]', 'S', '3', 'Br', '#', '/', '[O-]', '[C@@H]', '[C@H]', '[nH]', '5', '4', 'C', 's', '(', 'N', 'F', '[S+2]', 'I', '[C@]', 'O']
Of size: 35

Maximum length of tokenized SMILES: 51 tokens

***Inference of SMILES property done.***


In [14]:
# Display the inference result returned by inference.Inference.
pred_from_ens

Unnamed: 0,SMILES,ens_pred_mean,ens_pred_sd
0,CC,0.406713,0.0213377
1,CCC,0.43053,0.020964
2,C=O,0.00962523,0.0223681
