# Compute molecular embeddings of generated molecules 

In [1]:
import os
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf
# Limit TensorFlow's Python-side logging to errors before the project modules
# are imported. The `tf.compat.v1` path is used because the bare `tf.logging`
# alias is not available in TensorFlow 2.x.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

import cpmolgan.utils as utils
import cpmolgan.inference as infr
import pkg_resources
# Filesystem path of the `model_weights` resource bundled with the installed
# `cpmolgan` package.
WEIGHTS_PATH = pkg_resources.resource_filename('cpmolgan','model_weights')

import logging
# Notebook-level logging: INFO and above, printed as "<LEVEL> - <message>".
logging.basicConfig(level=logging.INFO, format ='%(levelname)s - %(message)s')
# Verbosity is set a second time, as in the original cell.
# NOTE(review): presumably guards against the level being changed by the
# imports above - confirm whether this second call is still needed.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)


### Arguments


In [2]:
# Pick a cluster between 0 and 19
cluster = '0'

# Notebook configuration:
#   use_gpu                 - whether the compute-environment cell exposes a GPU
#   gpu_device              - value written to CUDA_VISIBLE_DEVICES when use_gpu is True
#   filename_generated_mols - input CSV with the generated molecules
#   output_filename         - derived below; CSV the embeddings are written to
args = {
    'use_gpu': True,
    'gpu_device':'1',
    'filename_generated_mols':f'results/generated_mols/Cluster{cluster}__15000_Valid.csv',
}
# The output path mirrors the input path with 'generated_mols' swapped for
# 'molecular_embeddings'.
args['output_filename'] = args['filename_generated_mols'].replace('generated_mols','molecular_embeddings')

# If the input path does not contain 'generated_mols' the replace above is a
# no-op, and the embedding cell would then treat the input file as an already
# computed result. Fail loudly instead.
if args['output_filename'] == args['filename_generated_mols']:
    raise ValueError(
        "Input path %r does not contain 'generated_mols'; the output path "
        "would overwrite/alias the input file." % args['filename_generated_mols']
    )

# Create the output directory if needed. exist_ok avoids the check-then-create
# race, and an empty dirname (file in the working directory) is skipped.
output_dir = os.path.dirname(args['output_filename'])
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

## 1. Set compute environment 

In [3]:
# Select the devices TensorFlow may use via CUDA_VISIBLE_DEVICES.
# The TF1-style session/config classes are reached through `tf.compat.v1`,
# matching the logging setup at the top of the notebook; the bare
# `tf.GPUOptions` / `tf.Session` / `tf.ConfigProto` names do not exist in
# TensorFlow 2.x.
if args['use_gpu']:
    os.environ['CUDA_VISIBLE_DEVICES'] = args['gpu_device']
    # '0' here indexes the devices left visible by the mask set above,
    # not the physical GPU id.
    gpu_options = tf.compat.v1.GPUOptions(visible_device_list='0')
    # The session object is not kept.
    # NOTE(review): presumably created only so the GPU options are applied
    # when the devices are first initialised - confirm.
    tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
    tf.config.set_soft_device_placement(True)
    tf.debugging.set_log_device_placement(True)
else:
    # '-1' is not a valid device id, so no GPU is exposed to the process.
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

## 2. Load inference model

In [4]:

# Weight files expected under WEIGHTS_PATH: one file for the autoencoder and
# one 'gan_<name>.h5' file for each WGAN component listed here.
wgan_components = ('C', 'D', 'G', 'condition_encoder', 'classifier')

model_weigth_paths = dict(
    autoencoder=os.path.join(WEIGHTS_PATH, 'autoencoder.h5'),
    wgan={
        component: os.path.join(WEIGHTS_PATH, f'gan_{component}.h5')
        for component in wgan_components
    },
)

# Build the inference model from the weight-path mapping.
model = infr.InferenceModel(model_weigth_paths)

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla V100-SXM3-32GB, pci bus id: 0000:36:00.0, compute capability: 7.0



## 3. Read generated molecules

In [5]:
# Load the generated compounds; the first CSV column is used as the index.
generated_mols_path = args['filename_generated_mols']
data = pd.read_csv(generated_mols_path, index_col=0)


## 4. Compute embeddings

In [6]:
# The output CSV doubles as a cache: embeddings are computed and written only
# when it does not exist yet; otherwise the stored table is loaded into `data`.
if os.path.isfile(args['output_filename']):
    logging.info("File already exists")
    data = pd.read_csv(args['output_filename'],index_col=0)
else:
    logging.info("Computing selfies")
    smiles = data.SMILES_standard.values.astype(str)
    selfies, valid_idx = model.encode_smiles_to_selfies(smiles)

    # valid_idx is used below as a boolean mask over both `data` and `selfies`.
    # Count the rejected entries with a logical negation rather than `== False`.
    n_invalid = int(np.logical_not(valid_idx).sum())
    logging.info("Removing %i unvalid selfies", n_invalid)
    # Reset to a 0..n-1 index so the column-wise concat below lines up with
    # the freshly built embeddings frame.
    data = data.loc[valid_idx].reset_index(drop=True)
    selfies = selfies[valid_idx]

    logging.info("Computing latent representations")
    latents = model.encode_selfies_to_latent(selfies)

    # pd.concat(axis=1) aligns on the index and would silently pad with NaN if
    # the row counts differed, so check explicitly before joining.
    if latents.shape[0] != len(data):
        raise ValueError(
            "Got %d latent vectors for %d molecules; cannot join them row-wise."
            % (latents.shape[0], len(data))
        )

    logging.info("Saving latent representations")
    # One 'MolEmb_<i>' column per latent dimension.
    embd_cols = ['MolEmb_'+str(i) for i in range(latents.shape[1])]
    data = pd.concat( [data, pd.DataFrame( columns=embd_cols, data=latents)], axis=1)
    logging.info("Saving file %s", args['output_filename'])
    data.to_csv(args['output_filename'])

INFO - File already exists
