# Generate compounds conditioning on morphological profiles
This notebook exemplifies how to generate SMILES with the GAN model conditioning on 10 morphological profiles provided in `example_profiles.csv`

In [1]:
import os
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf
tf.compat.v1.logging.set_verbosity (tf.compat.v1.logging.ERROR)

import cpmolgan.utils
import cpmolgan.inference as infr
import pkg_resources
WEIGHTS_PATH = pkg_resources.resource_filename('cpmolgan','model_weights')


## Input profiles

In [2]:
profiles = pd.read_csv('example_profiles.csv')
profiles.head()

Unnamed: 0,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,Cells_AreaShape_MajorAxisLength,Cells_AreaShape_MaxFeretDiameter,Cells_AreaShape_MaximumRadius,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_3_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_3_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_3_0,Nuclei_Texture_Variance_RNA_5_0
0,-0.755125,0.683991,-1.585965,-0.997682,-1.268116,0.693606,1.537728,-0.912031,-0.947704,0.06899,...,-0.12376,-0.492436,-0.420035,-0.455986,0.140627,0.397992,0.691084,-0.343249,-0.082334,-0.039919
1,0.036511,1.861095,-0.486261,-0.233587,0.374287,-0.095672,1.264383,-0.566748,-0.125209,0.0,...,0.014178,-1.434791,-0.541411,-0.992724,-0.166136,0.017312,-0.079206,-1.954931,-1.760399,-1.419219
2,0.176099,-0.033308,0.344421,1.742753,1.652603,-0.346675,-0.063462,0.227453,0.519483,-0.38012,...,0.731451,-0.822257,-0.369352,-0.583649,-0.18775,-0.656267,-0.649672,-1.193651,-1.498226,-1.550288
3,-1.020144,0.337245,-2.000214,0.699154,0.824849,-0.89371,-0.080042,-0.529243,-0.541148,-1.175627,...,-0.946836,0.234467,-0.184202,-0.073382,-0.128276,-0.887577,-0.619908,-0.710301,-1.09257,-0.918907
4,0.689864,1.611283,1.835242,1.751271,1.663987,-1.097318,-0.998787,1.0513,1.050965,0.304195,...,0.848972,-0.930476,-1.021213,-1.790375,-0.361636,-0.030407,-0.066948,1.126699,-0.247639,-0.267828


In [3]:
feature_cols , meta_cols = cpmolgan.utils.get_feature_cols(profiles)
profiles[feature_cols].to_csv('example_profiles.csv',index=False)

## 1. Apply quantile transformer

In [4]:
quantile_transformer =  pickle.load( open( os.path.join(WEIGHTS_PATH,'quantile_transformer.pkl'), 'rb' ) )
feature_cols , meta_cols = cpmolgan.utils.get_feature_cols(profiles)
profiles[feature_cols] = quantile_transformer.transform(profiles[feature_cols].values) 
print('Total profiles: %i \nTotal features: %i'%profiles.shape)

Total profiles: 10 
Total features: 1449


## 2. Set compute environment 

In [5]:
use_gpu = True
gpu_device = '15'

if use_gpu:
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu_device
    gpu_options = tf.GPUOptions(visible_device_list='0')
    tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    tf.config.set_soft_device_placement(True)
    tf.debugging.set_log_device_placement(True)
else:
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    


## 4. Load inference model

In [6]:
model_weigth_paths = {
    'autoencoder': os.path.join(WEIGHTS_PATH,'autoencoder.h5'),
    'wgan':{
            'C': os.path.join(WEIGHTS_PATH,'gan_C.h5'),
            'D': os.path.join(WEIGHTS_PATH,'gan_D.h5'),
            'G': os.path.join(WEIGHTS_PATH,'gan_G.h5'),
            'condition_encoder':os.path.join(WEIGHTS_PATH,'gan_condition_encoder.h5'),
            'classifier':os.path.join(WEIGHTS_PATH,'gan_classifier.h5')
            }
}

model = infr.InferenceModel( model_weigth_paths ) 

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla V100-SXM3-32GB, pci bus id: 0000:e7:00.0, compute capability: 7.0



## 5. Generate molecules

In [7]:
random_seed = 10
num_mols_per_profile = 10
generated = infr.generate_compounds_multiple_conditions( model, profiles, feature_cols, meta_cols, seed=random_seed, nsamples=num_mols_per_profile)


100%|██████████| 10/10 [00:08<00:00,  1.17it/s]


## 6. Check validity 

In [8]:
generated['SMILES_standard']= cpmolgan.utils.clean_smiles_parallel( generated.SMILES )
generated['valid'] = generated.SMILES_standard.isnull()==False
print('valid percentage: %.2f'%(generated['valid'].sum()/len(generated)))
generated.head()

valid percentage: 0.48


Unnamed: 0,SMILES,classification_score,SMILES_standard,valid
0,COCCNC(=O)C(=CNC(=O)OCC(c1ccccc1)NC=O),0.93522,COCCNC(=O)C=CNC(=O)OCC(NC=O)c1ccccc1,True
1,COcccc(C(=O)NNC(=O)Nccc(C(F)(F)F)nc(N1CCCCC1)),0.904445,,False
2,O=CN(C12CC(F)C1)N(Cc1ccccc1)CCCN(Cc1cc(F)ccc1F...,0.953433,O=CN1N(Cc2ccccc2)CCCN(Cc2cc(F)ccc2F)CC(=O)C12C...,True
3,C#CCNC(=O)C1C(=O)NCN1CC(=O)C1CCOc2c(nn(C)c2C(C...,0.937077,C#CCNC(=O)C1C(=O)NCN1CC(=O)C1CCOc2c(nn(C)c2C(C...,True
4,COcc1cc(NC(=O)CC(NC(=O)CCc2ccc3c(c2)OCO3)CN)(C...,0.947332,,False
