# Molecular generation conditioned on per-SMILES median profiles from different clusters 

In [1]:
# Imports and one-time process setup. Statement order is kept on purpose:
# some of the lines below depend on names imported just above them.
import os
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf
# Silence TensorFlow messages below ERROR. The tf.compat.v1 path is used once;
# the former second call through the deprecated bare `tf.logging` alias set the
# very same verbosity again and has been dropped.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

import logging
# Notebook-level logging: INFO and above, formatted as "<LEVEL> - <message>".
logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(message)s')

from sklearn.preprocessing import QuantileTransformer
from rdkit import Chem
from rdkit.Chem.Descriptors import qed
import sys
# `sascorer` is imported from the SA_Score folder of RDKit's Contrib directory,
# so that folder has to be on sys.path before the import statement runs.
sys.path.append(os.path.join(Chem.RDConfig.RDContribDir, 'SA_Score'))
import sascorer

import cpmolgan.utils as utils
import cpmolgan.inference as infr
import pkg_resources
# Location of the 'model_weights' resource inside the installed cpmolgan package.
WEIGHTS_PATH = pkg_resources.resource_filename('cpmolgan', 'model_weights')


### Arguments

In [2]:
# Run configuration for the whole notebook; later cells only read from `args`.
args = {
    # Number of valid generated molecules to collect (and save) per cluster.
    "N_valid_per_cluster": 15000,
    # When True the GPU named by 'gpu_device' is used, otherwise GPUs are hidden.
    'use_gpu': True,
    # Value written to CUDA_VISIBLE_DEVICES when 'use_gpu' is True.
    'gpu_device': '1',
    # CSV with the training-set profiles; must contain a 'cluster' column.
    'filename_train_profiles': "results/train_set_cluster_profiles_median_per_smiles.csv",
    # Directory receiving one CSV of generated molecules per cluster.
    "output_dir": "results/generated_mols/",
    # Names 'Cluster0' ... 'Cluster19'; the [0:1] slice restricts this run to
    # the first one. Widen the slice to process more clusters.
    "clusters": ['Cluster'+str(c) for c in range(20)][0:1]
}

# exist_ok=True creates the directory only when missing, without the
# check-then-create race of a separate os.path.isdir() test.
os.makedirs(args["output_dir"], exist_ok=True)


## 1. Set compute environment 

In [3]:
# Device selection is driven by CUDA_VISIBLE_DEVICES, which must be set before
# TensorFlow creates its first session.
if not args['use_gpu']:
    # '-1' matches no CUDA device, so TensorFlow falls back to the CPU.
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
else:
    os.environ['CUDA_VISIBLE_DEVICES'] = args['gpu_device']
    # visible_device_list is expressed relative to the devices left visible by
    # CUDA_VISIBLE_DEVICES, hence '0' rather than args['gpu_device'].
    gpu_options = tf.GPUOptions(visible_device_list='0')
    session_config = tf.ConfigProto(gpu_options=gpu_options)
    # The session object itself is not kept; it is created for its side effect
    # of initialising the device with the options above.
    tf.Session(config=session_config)
    tf.config.set_soft_device_placement(True)
    tf.debugging.set_log_device_placement(True)

## 2. Load inference model

In [4]:
# File name of each WGAN component's weights, keyed by the component name that
# appears under 'wgan' in the path dictionary handed to InferenceModel.
wgan_weight_files = {
    'C': 'gan_C.h5',
    'D': 'gan_D.h5',
    'G': 'gan_G.h5',
    'condition_encoder': 'gan_condition_encoder.h5',
    'classifier': 'gan_classifier.h5',
}

# Absolute paths inside WEIGHTS_PATH: one file for the autoencoder and a nested
# mapping for the WGAN components.
model_weigth_paths = {
    'autoencoder': os.path.join(WEIGHTS_PATH, 'autoencoder.h5'),
    'wgan': {
        component: os.path.join(WEIGHTS_PATH, filename)
        for component, filename in wgan_weight_files.items()
    },
}

model = infr.InferenceModel(model_weigth_paths)

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla V100-SXM3-32GB, pci bus id: 0000:36:00.0, compute capability: 7.0



## 3. Read profiles and apply quantile transformer

In [5]:
# Read data and select specified clusters
data_train = pd.read_csv(args['filename_train_profiles'], index_col=0)
keep_idx = data_train.cluster.isin(args["clusters"])
data_train = data_train[keep_idx].reset_index(drop=True)

# Surface configuration mistakes early: a requested cluster that is absent from
# the CSV would otherwise be skipped without any message.
missing_clusters = sorted(set(args["clusters"]) - set(data_train.cluster.unique()))
if missing_clusters:
    logging.warning('Requested clusters not found in %s: %s',
                    args['filename_train_profiles'], missing_clusters)
if data_train.empty:
    raise ValueError('No profiles left after selecting clusters %s' % args["clusters"])

logging.info('Number of clusters: %i', len(data_train.cluster.unique()))
logging.info('clusters: %a', data_train.cluster.unique())

# Apply quantile transformer
# NOTE(review): pickle.load executes arbitrary code from the file; this is only
# acceptable because the pickle ships with the cpmolgan weights. Do not point
# it at untrusted files.
# The `with` block closes the file handle, which the former bare open() leaked.
with open(os.path.join(WEIGHTS_PATH, 'quantile_transformer.pkl'), 'rb') as transformer_file:
    quantile_transformer = pickle.load(transformer_file)

# feature_cols are the columns that get transformed in place below; info_cols
# are passed unchanged to the generation step later in the notebook.
feature_cols, info_cols = utils.get_feature_cols(data_train)
data_train[feature_cols] = quantile_transformer.transform(data_train[feature_cols].values)
logging.info('Number of Morphological profiles: %i', data_train.shape[0])


INFO - Number of clusters: 1
INFO - clusters: array(['Cluster0'], dtype=object)
INFO - Number of Morphological profiles: 2707


## 4. Generate a fixed number of valid molecules per cluster
We don't want to filter for unique molecules here, because the frequency with which a molecule is generated also provides information.

In [6]:
# Upper bound on the number of conditions (profiles) fed to one generation pass.
MAX_CONDITIONS_PER_BATCH = 500
# Give up on a cluster after this many consecutive passes without a single
# valid molecule, instead of looping forever.
MAX_EMPTY_BATCHES = 10

for cluster in args["clusters"]:

    logging.info("------ %s ---------"%cluster)

    # Define output file and check that it doesn't already exist
    output_file = os.path.join(args["output_dir"], cluster+"__"+str(args["N_valid_per_cluster"])+"_Valid.csv")
    logging.info(output_file)
    if os.path.isfile(output_file):
        logging.warning("File %s already exists. Skipping it \n"%output_file)
        continue

    # All profiles of the current cluster
    all_cluster_data = data_train.loc[data_train["cluster"] == cluster].reset_index(drop=True)
    N = len(all_cluster_data)
    if N == 0:
        logging.warning("No profiles found for %s. Skipping it", cluster)
        continue

    # Adjust the number of generated molecules per condition according to the number of available samples
    sample_cluster_data = N > MAX_CONDITIONS_PER_BATCH
    if N/args["N_valid_per_cluster"] < 1e-2:
        N_per_condition = 100
    else:
        N_per_condition = 10

    # Batches are collected in a list and concatenated once at the end, rather
    # than re-copying a growing DataFrame on every pass.
    batches = []
    n_valid = 0
    empty_batches = 0
    i = 0

    while n_valid < args["N_valid_per_cluster"]:

        # The iteration index doubles as random state: the run stays
        # reproducible, but each pass draws a different subset of conditions.
        # With a constant random_state every pass re-drew the same rows.
        if sample_cluster_data:
            cluster_data = all_cluster_data.sample(MAX_CONDITIONS_PER_BATCH, random_state=i).reset_index(drop=True)
        else:
            cluster_data = all_cluster_data

        # Run generation and keep valid molecules; duplicates are kept on
        # purpose (select_unique=False).
        # NOTE(review): `seed` is assumed to seed the generator's randomness, so
        # it is advanced per pass as well - confirm in cpmolgan.inference.
        temp_generated = infr.generate_compounds_multiple_conditions( model, cluster_data, feature_cols, info_cols, seed=i, nsamples=N_per_condition)
        temp_generated = infr.filter_valid_and_unique(temp_generated, cond_ID_cols=["cluster"], select_unique=False)

        if len(temp_generated) == 0:
            empty_batches += 1
        else:
            empty_batches = 0
            batches.append(temp_generated)
            n_valid += len(temp_generated)

        logging.info("%s iteration %i: %i valid molecules "%(cluster,i,n_valid) )
        i = i +1

        if empty_batches >= MAX_EMPTY_BATCHES:
            break

    if n_valid < args["N_valid_per_cluster"]:
        # Do not write a partial file: its name promises N_valid_per_cluster
        # molecules, and its presence would make later runs skip this cluster.
        logging.error("%s: stopped after %i consecutive batches without valid molecules "
                      "(%i of %i collected). No file written.",
                      cluster, empty_batches, n_valid, args["N_valid_per_cluster"])
        continue

    # Save results
    generated_final = pd.concat(batches) if batches else pd.DataFrame()
    generated_final = generated_final.reset_index(drop=True)
    generated_final = generated_final.iloc[0:args["N_valid_per_cluster"]]
    generated_final.to_csv(output_file)
    

INFO - ------ Cluster0 ---------
INFO - results/generated_mols/Cluster0__15000_Valid.csv

