# Compute molecular embeddings from SMILES
This notebook shows how to compute molecular embeddings from SMILES strings with the molecular autoencoder, using the 10 example SMILES provided in `example_smiles.csv`.

In [1]:
# General-purpose dependencies
import os
import numpy as np
import pandas as pd

import tensorflow as tf
# Only let TensorFlow log messages at ERROR level through, to keep cell outputs readable
tf.compat.v1.logging.set_verbosity (tf.compat.v1.logging.ERROR)

# cpmolgan package: utility helpers and the inference module (aliased as infr)
import cpmolgan.utils
import cpmolgan.inference as infr
import pkg_resources
# Location of the 'model_weights' resource inside the installed cpmolgan package
WEIGHTS_PATH = pkg_resources.resource_filename('cpmolgan','model_weights')


## Input SMILES

In [2]:
# Read the example compounds (one SMILES per row) and preview the first five rows
cpds = pd.read_csv(filepath_or_buffer='example_smiles.csv')
cpds.head(5)

Unnamed: 0,SMILES
0,C[C@@H](CO)N1C[C@@H](C)[C@@H](CN(C)C(=O)Nc2ccc...
1,CCOC(=O)C(CC)Sc1nc2c(c(=O)[nH]c(=O)n2C)n1C/C=C...
2,C=CCN1C(=O)NC(=O)/C(=C\Nc2ccc(OC)cc2C)C1=O
3,O=C([C@H]1[C@@H]2N[C@@H](Cn3c2ccc(-c2ccccc2)c3...
4,COc1ccc2nc(O)c(CN(CCc3ccccc3)C(C)=O)cc2c1


## 1. Clean SMILES 
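Standardize the SMILES and keep only the compounds whose standardized SMILES is shorter than the model's maximum SMILES length (`infr.max_smiles_length`); all others are discarded.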

In [3]:
# Standardize every input SMILES and store the result next to the original string
cpds["SMILES_standard"] = cpmolgan.utils.clean_smiles_parallel(cpds["SMILES"])

# Boolean mask: True where the standardized SMILES is strictly shorter than the model limit
keep_idx = cpds["SMILES_standard"].map(len) < infr.max_smiles_length

# Drop the over-long compounds and renumber the remaining rows from 0
cpds = cpds.loc[keep_idx].reset_index(drop=True)

## 2. Set compute environment

In [4]:
# Compute-environment switches: edit these two values for your machine.
use_gpu = True      # set to False to hide all GPUs and run on CPU
gpu_device = '15'   # physical GPU index exposed to this process via CUDA_VISIBLE_DEVICES

if use_gpu:
    # Expose only the selected physical GPU; inside the process it is then device '0'.
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu_device
    # Use the tf.compat.v1 namespace (as the logging setup in the import cell does):
    # identical to tf.GPUOptions / tf.Session / tf.ConfigProto on TF 1.x, and still
    # available on TF 2.x where the top-level aliases were removed.
    gpu_options = tf.compat.v1.GPUOptions(visible_device_list='0')
    # NOTE(review): the session object is intentionally not kept; presumably it is
    # created only so the GPU options take effect before the model is loaded — confirm.
    tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
    tf.config.set_soft_device_placement(True)
    tf.debugging.set_log_device_placement(True)
else:
    # '-1' matches no device, so no GPU is visible to this process.
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    


## 3. Load inference model

In [5]:
# Weight file names listed under the 'wgan' key, keyed by the names the path dictionary uses
wgan_weight_files = {
    'C': 'gan_C.h5',
    'D': 'gan_D.h5',
    'G': 'gan_G.h5',
    'condition_encoder': 'gan_condition_encoder.h5',
    'classifier': 'gan_classifier.h5',
}

# Full paths of all weight files, resolved against the packaged weights directory
model_weigth_paths = {
    'autoencoder': os.path.join(WEIGHTS_PATH, 'autoencoder.h5'),
    'wgan': {
        part: os.path.join(WEIGHTS_PATH, file_name)
        for part, file_name in wgan_weight_files.items()
    },
}

# Build the inference model from the weight paths
model = infr.InferenceModel(model_weigth_paths)

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla V100-SXM3-32GB, pci bus id: 0000:e7:00.0, compute capability: 7.0
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device



## 4. Compute SELFIES and remove compounds with invalid SELFIES

In [6]:
# Encode the standardized SMILES; the call returns the SELFIES and a validity mask
smiles = cpds['SMILES_standard'].values.astype(str)
selfies, valid_idx = model.encode_smiles_to_selfies(smiles)
cpds['selfies'] = selfies

# Report how many compounds are flagged invalid, then drop them and renumber the rows
n_invalid = (valid_idx == False).sum()
print("Removing %i compounds with invalid selfies" % n_invalid)
cpds = cpds.loc[valid_idx, :].reset_index(drop=True)


Removing 1 compounds with invalid selfies


## 5. Compute molecular embeddings (latents)

In [8]:
# Encode the SELFIES into latent vectors (one row per compound)
latents = model.encode_selfies_to_latent(cpds['selfies'])

# One column name per latent dimension: MolEmb_0, MolEmb_1, ...
n_dims = latents.shape[1]
embd_cols = ['MolEmb_' + idx for idx in map(str, range(n_dims))]

# Put the embeddings next to the compound annotations; rows line up on the 0..n-1 index
embeddings = pd.DataFrame(latents, columns=embd_cols)
latents = pd.concat([cpds, embeddings], axis='columns')

print('Latents shape', latents.shape)
latents.head()

Latents shape (9, 259)


Unnamed: 0,SMILES,SMILES_standard,selfies,MolEmb_0,MolEmb_1,MolEmb_2,MolEmb_3,MolEmb_4,MolEmb_5,MolEmb_6,...,MolEmb_246,MolEmb_247,MolEmb_248,MolEmb_249,MolEmb_250,MolEmb_251,MolEmb_252,MolEmb_253,MolEmb_254,MolEmb_255
0,C[C@@H](CO)N1C[C@@H](C)[C@@H](CN(C)C(=O)Nc2ccc...,CC1CN(C(C)CO)C(=O)CCCn2cc(nn2)COC1CN(C)C(=O)Nc...,[C][C][C][N][Branch1_3][Branch1_3][C][Branch1_...,-0.033392,0.056579,-0.025619,-0.009468,0.01308,-0.002021,0.015934,...,0.024896,0.005675,0.031988,-0.012734,-0.058496,-0.012953,-0.00691,0.074245,0.029673,-0.0273
1,C=CCN1C(=O)NC(=O)/C(=C\Nc2ccc(OC)cc2C)C1=O,C=CCN1C(=O)NC(=O)C(=CNc2ccc(OC)cc2C)C1=O,[C][=C][C][N][C][Branch1_3][epsilon][=O][N][C]...,-0.006255,0.047208,0.007993,0.049088,-0.038123,-0.019305,0.026855,...,0.008762,0.038505,0.073897,-0.772491,0.056982,-0.028423,0.033767,0.038079,0.054006,0.011346
2,O=C([C@H]1[C@@H]2N[C@@H](Cn3c2ccc(-c2ccccc2)c3...,O=C(C1C2NC(Cn3c2ccc(-c2ccccc2)c3=O)C1CO)N1CCCCC1,[O][=C][Branch2_3][epsilon][#N][C][C][N][C][Br...,-0.084417,-0.040475,-0.035277,0.229187,0.037274,0.015411,-0.089161,...,0.003716,0.016624,0.013589,0.944191,-0.089152,0.013656,-0.061056,-0.024325,0.006532,-0.074826
3,COc1ccc2nc(O)c(CN(CCc3ccccc3)C(C)=O)cc2c1,COc1ccc2nc(O)c(CN(CCc3ccccc3)C(C)=O)cc2c1,[C][O][c][c][c][c][n][c][Branch1_3][epsilon][O...,-0.036326,0.075981,0.02175,0.032379,0.008221,-0.017829,0.029965,...,-0.033604,-0.028315,0.043125,0.324175,0.038988,0.018984,-0.01221,0.025412,0.048986,8.5e-05
4,CC1=C2[C@H]3OC(=O)[C@@H](C)[C@@H]3CC[C@@]2(C)C...,CC1=C2C3OC(=O)C(C)C3CCC2(C)C=CC1=O,[C][C][=C][C][O][C][Branch1_3][epsilon][=O][C]...,-0.008037,-0.029234,-0.00769,0.154807,-0.024859,-0.016883,-0.056083,...,0.050528,0.032958,0.010116,-0.059367,0.051832,0.00437,0.030975,0.004481,0.017562,-0.019272
