# Find nearest neighbors between generated molecules and agonists from the ExCAPE database, based on the Dice similarity of Morgan fingerprints

In [1]:
import os
import numpy as np
import pandas as pd
import tqdm

import cpmolgan.utils as utils
import cpmolgan.nearest_neighbors as nn

import logging 
# Show records of level INFO and above, formatted as "<LEVEL> - <message>"
logging.basicConfig(level=logging.INFO, format ='%(levelname)s - %(message)s')


## 1.  Arguments

In [2]:
# Pick one repetition
repetition = ['n1', 'n2', 'n3'][0]

# Run configuration:
#   K                      - number of nearest neighbors requested from the KNN search
#   filename_data_Excape   - CSV file with the ExCAPE agonists
#   generated_dir          - directory with the generated molecules of the chosen repetition
#   generated_ref_filename - file-name template; "GENE" is substituted by the
#                            conditioning gene when the file is loaded
args = {
    'K': 10,
    'filename_data_Excape': '../../data/ExCAPE_ligands_agonist_noTrainSet.csv',
    'generated_dir': f'results/{repetition}_Valid_PassPhysChemFilter/generated_mols',
    'generated_ref_filename': "GENE__20000_Valid_PassPhysChemFilter.csv",
}

# Results go to a "nearest_neighbors" directory that mirrors the generated-molecules path
args["results_dir"] = args["generated_dir"].replace("generated_mols", "nearest_neighbors")
# exist_ok=True creates the directory only when missing, without the
# check-then-create race of a separate os.path.isdir() test
os.makedirs(args["results_dir"], exist_ok=True)


## 2. Define comparison pairs

In [3]:
# Overexpression (OE) genes that are added to every comparison set
top_10_diff_genes = [
    'RAF1', 'JUN', 'ATF4', 'BRAF', 'CEBPA',
    'RELB', 'MEK1', 'PIK3CD', 'AKT3', 'WWTR1',
]
# Genes whose ExCAPE molecules are used as the comparison targets
Excape_genes = [
    "NFKB1", "STAT3", "TP53", "BRCA1", "HSPA5", "CREBBP",
    "STAT1", "HIF1A", "NFKBIA", "PRKAA1", "PDPK1",
]

# One comparison set per ExCAPE gene: that gene on the ExCAPE side, and on the
# OE side the same gene, the "DMSO" entry and all of top_10_diff_genes.
comparison_sets = [
    {"Excape": [target_gene], "OE": [target_gene, "DMSO", *top_10_diff_genes]}
    for target_gene in Excape_genes
]


## 2. Read ExCAPE agonists and compute fingerprints

In [4]:
# Load the ExCAPE table; the first CSV column is used as the index
data_Excape_all = pd.read_csv(args['filename_data_Excape'], index_col=0)

# Number the molecules within each gene (0, 1, 2, ... per Gene_Symbol group)
data_Excape_all["Molecule_ID"] = data_Excape_all.groupby(by="Gene_Symbol").cumcount()

# Keep only the columns that are needed from here on
kept_columns = ["Molecule_ID", "Gene_Symbol", "SMILES_standard"]
data_Excape_all = data_Excape_all[kept_columns]

logging.info("Computing Excape fingerprints")
# Both fingerprint types are computed from the same SMILES array
excape_smiles = data_Excape_all.SMILES_standard.values
data_Excape_all["ecfp"] = nn.smiles_to_ecfps_parallel(excape_smiles)
data_Excape_all["maccs"] = nn.smiles_to_maccs_parallel(excape_smiles)

INFO - Computing Excape fingerprints


## 3. KNNs for generated molecules conditioned on Overexpression profiles
- Data2Excape: For each (OE gene, Excape gene) pair, find the K nearest neighbors of each generated compound among the Excape compounds
- Excape2Data: For each (OE gene, Excape gene) pair, find the K nearest neighbors of each Excape molecule among all generated compounds

### Iterate over the comparison pairs

In [5]:
# For every (Excape gene, OE gene) pair: load the molecules generated for the OE
# gene, compute their fingerprints, run the set-to-set KNN search against the
# ExCAPE molecules of the Excape gene, and write one CSV per search direction.
# Pairs whose two result files already exist are skipped, so an interrupted run
# can be resumed by re-executing this cell.
for comp_set in comparison_sets:
    for gene_Excape in comp_set["Excape"]:
        for gene_OE in comp_set["OE"]:
            logging.info("------ Excape %s vs. OE %s ------", gene_Excape, gene_OE)

            # Define output filenames
            output_ID = "Excape_" + gene_Excape + "__OE_" + gene_OE + "__20000MolsPerGene"
            output_filename_data2Excape = os.path.join(
                args["results_dir"], str(args['K']) + "KNNs_Mols2Excape__" + output_ID + ".csv")
            output_filename_Excape2data = os.path.join(
                args["results_dir"], str(args['K']) + "KNNs_Excape2Mols__" + output_ID + ".csv")

            # Skip only when BOTH outputs are present. Testing just the first
            # file would leave the second one missing forever if a previous run
            # stopped between the two writes.
            if (os.path.isfile(output_filename_data2Excape)
                    and os.path.isfile(output_filename_Excape2data)):
                logging.info("File aleady exist !!!! ")
                continue

            # Collect data for all genes in the current comparison set
            data_Excape = data_Excape_all.loc[data_Excape_all.Gene_Symbol == gene_Excape].reset_index(drop=True)
            # The file-name template carries a "GENE" placeholder for the OE gene
            OE_file = os.path.join(args["generated_dir"], args["generated_ref_filename"].replace("GENE", gene_OE))
            data_OE = pd.read_csv(OE_file, index_col=0)

            # Compute fingerprints
            logging.info("Computing fingerprints")
            data_OE["ecfp"] = nn.smiles_to_ecfps_parallel(data_OE.SMILES_standard.values)
            data_OE["maccs"] = nn.smiles_to_maccs_parallel(data_OE.SMILES_standard.values)

            # Tag both tables with the gene pair they belong to
            data_OE["Gene_Symbol_conditioned"] = gene_OE
            data_Excape["Gene_Symbol_conditioned"] = gene_OE
            data_OE["Gene_Symbol_Excape"] = gene_Excape
            data_Excape["Gene_Symbol_Excape"] = gene_Excape

            # Find nearest neighbors and save results
            logging.info("Computing KNNs")
            knns_data2Excape, knns_Excape2data = nn.set_to_set_knns([data_OE, data_Excape], args['K'])

            logging.info("Saving results")
            knns_data2Excape.to_csv(output_filename_data2Excape)
            logging.info("Saved %s", output_filename_data2Excape)
            knns_Excape2data.to_csv(output_filename_Excape2data)
            logging.info("Saved %s", output_filename_Excape2data)

print("done")

INFO - ------ Excape NFKB1 vs. OE NFKB1 ------
INFO - File aleady exist !!!! 
INFO - ------ Excape NFKB1 vs. OE DMSO ------
INFO - File aleady exist !!!! 
INFO - ------ Excape NFKB1 vs. OE RAF1 ------
INFO - File aleady exist !!!! 
INFO - ------ Excape NFKB1 vs. OE JUN ------
INFO - File aleady exist !!!! 
INFO - ------ Excape NFKB1 vs. OE ATF4 ------
INFO - File aleady exist !!!! 
INFO - ------ Excape NFKB1 vs. OE BRAF ------
INFO - File aleady exist !!!! 
INFO - ------ Excape NFKB1 vs. OE CEBPA ------
INFO - Computing fingerprints
INFO - Computing KNNs
INFO - Saving results
