# Find nearest neighbors between generated molecules from cluster pairs based on cosine distance of molecular embeddings

In [1]:
import os
import numpy as np
import pandas as pd
import tqdm

import cpmolgan.utils as utils
import cpmolgan.nearest_neighbors as nn

import logging 
# Configure root logging: INFO level, records rendered as "<LEVEL> - <message>"
logging.basicConfig(
    format="%(levelname)s - %(message)s",
    level=logging.INFO,
)

## 1. Arguments

In [2]:
# Run configuration shared by the cells below.
args = {
    # Distance metric forwarded to the KNN routine and embedded in output file names
    'metric': 'cosine',
    # Number of neighbors requested; also embedded in output file names
    'K': 5,
    # Directory holding the per-cluster molecular-embedding CSV files
    'mol_embedding_dir': "results/molecular_embeddings",
    # File-name template: the literal "CLUSTER" is replaced by a cluster name
    'generated_ref_filename': "CLUSTER__15000_Valid.csv",
}
# Output directory is derived from the embeddings path by swapping the folder name
args["results_dir"] = args["mol_embedding_dir"].replace("molecular_embeddings", "nearest_neighbors")

# exist_ok=True creates the directory (and parents) only if missing, without the
# check-then-create race of a separate os.path.isdir() test
os.makedirs(args["results_dir"], exist_ok=True)

## 2. Define comparison groups
Comparison against cluster 0 for the 3 MST branches in Figure 2c

In [3]:
# Cluster numbers compared against Cluster0, in the order they are processed.
# The first entry (0) yields the Cluster0-vs-Cluster0 self-comparison.
_compared_cluster_ids = (0, 2, 8, 19, 3, 10, 18, 5, 11, 15)

# One [left, right] pair of cluster names per comparison; left is always Cluster0
comparison_sets = [["Cluster0", "Cluster%d" % cluster_id]
                   for cluster_id in _compared_cluster_ids]
        

## 3. Compute KNNs

In [4]:
# For every [left, right] cluster pair: compute K nearest neighbors between the
# two sets of molecular embeddings and write one CSV per search direction.
# Pairs whose result files are both already on disk are skipped.
for comp_set in comparison_sets:
    cluster_l, cluster_r = comp_set

    # Lazy %-style arguments: the message is only formatted if the record is emitted
    logging.info("------  %s vs. %s ------", cluster_l, cluster_r)

    # Define output filenames (one per direction: left->right and right->left)
    output_ID_l2r = cluster_l + "_" + cluster_r + "__MolEmbedding_" + args['metric']
    output_ID_r2l = cluster_r + "_" + cluster_l + "__MolEmbedding_" + args['metric']
    output_filename_l2r = os.path.join(args["results_dir"], str(args['K']) + "KNNs_" + output_ID_l2r + ".csv")
    output_filename_r2l = os.path.join(args["results_dir"], str(args['K']) + "KNNs_" + output_ID_r2l + ".csv")

    # Skip only when BOTH direction files exist. Checking the left->right file
    # alone would leave the right->left file missing forever if an earlier run
    # was interrupted between the two writes.
    if os.path.isfile(output_filename_l2r) and os.path.isfile(output_filename_r2l):
        logging.info("File aleady exist !!!! ")
        continue

    # Collect data for current comparison set: "CLUSTER" in the file-name
    # template is replaced by the concrete cluster name
    data_filename_l = os.path.join(args["mol_embedding_dir"], args["generated_ref_filename"].replace("CLUSTER", cluster_l))
    data_filename_r = os.path.join(args["mol_embedding_dir"], args["generated_ref_filename"].replace("CLUSTER", cluster_r))
    data_l = pd.read_csv(data_filename_l, index_col=0)
    data_r = pd.read_csv(data_filename_r, index_col=0)
    # Embedding columns are identified by name on the left table only
    # NOTE(review): assumes the right table has the same 'MolEmb' columns - confirm
    embedding_cols = [c for c in data_l.columns if 'MolEmb' in c]

    # Find nearest neighbors and save results
    logging.info("Computing KNNs")
    knns_l2r, knns_r2l = nn.set_to_set_knns_mol_embeddings([data_l, data_r], args['K'], embedding_cols, metric=args['metric'])
    logging.info("Saving results")
    knns_l2r.to_csv(output_filename_l2r)
    logging.info("Saved %s", output_filename_l2r)
    # For a self-comparison (left == right) both paths are identical, so this
    # write overwrites the file saved just above
    knns_r2l.to_csv(output_filename_r2l)
    logging.info("Saved %s", output_filename_r2l)

INFO - ------  Cluster0 vs. Cluster0 ------
INFO - File aleady exist !!!! 
INFO - ------  Cluster0 vs. Cluster2 ------
INFO - File aleady exist !!!! 
INFO - ------  Cluster0 vs. Cluster8 ------
INFO - File aleady exist !!!! 
INFO - ------  Cluster0 vs. Cluster19 ------
INFO - File aleady exist !!!! 
INFO - ------  Cluster0 vs. Cluster3 ------
INFO - File aleady exist !!!! 
INFO - ------  Cluster0 vs. Cluster10 ------
INFO - File aleady exist !!!! 
INFO - ------  Cluster0 vs. Cluster18 ------
INFO - File aleady exist !!!! 
INFO - ------  Cluster0 vs. Cluster5 ------
INFO - File aleady exist !!!! 
INFO - ------  Cluster0 vs. Cluster11 ------
INFO - File aleady exist !!!! 
INFO - ------  Cluster0 vs. Cluster15 ------
INFO - File aleady exist !!!! 
