This notebook performs the most-similar-compound (MSC) search on different datasets.

See the comments in the code cells for more details.

In [8]:
import os
import sys
import glob

import pandas as pd
import openpyxl
import numpy as np
import json

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from scipy import stats
from scipy.spatial.distance import jaccard
import math

from tqdm import tqdm

In [9]:
# Root folder of the project data and outputs. Set the AGREEMENTPRED_MAIN_FOLDER
# environment variable to run on another machine; the original location is the default.
main_folder = os.environ.get("AGREEMENTPRED_MAIN_FOLDER", "/home/data/grd22/AgreementPred_Codes")

MSC search for the Annotated-Compounds sample datasets.

Change `testset_name` to evaluate a different sample dataset (Annocom, Drug, or NP).

In [None]:
# MSC (most-similar-compound) search on the sampled Annotated-Compounds test sets.
# For each sampled test set and each representation, every test compound is compared
# with all compounds outside the test set; its `length` most similar compounds are
# saved together with summary statistics of their similarities.

# Annotated-Compound representations: one table per file, keyed by file name without ".csv"
folder = f"{main_folder}/Data/Representations_Annotated-Compounds"
FP_list = glob.glob(f"{folder}/*")

FP_dict = {}
name_lst = []
for f in sorted(FP_list):
    name = f.replace(f'{folder}/', '').replace('.csv', '')
    FP_dict[name] = pd.read_csv(f).reset_index(drop=True)
    name_lst.append(name)

# Test set name as used in the folder/file names on disk (e.g. "Annocom", "Drug" or "NP")
testset_name = "Drug"

# N of MSCs kept per test compound
length = 30

# Result columns computed for every test compound
result_cols = ['MSCs', 'similarities', 'mean_sim', 'med_sim', 'std_sim', 'min_sim']

for set_num in tqdm(range(1, 21)):

    # NOTE(review): the "_0{set_num}" suffix gives "_010" ... "_020" for sets 10-20;
    # confirm this matches the file names on disk.
    test_df = pd.read_csv(f"{main_folder}/Data/Sample_datasets/{testset_name}_Testset/{testset_name}_sample50_0{set_num}.csv")
    testset = test_df['cid'].tolist()

    folder_path = f'{main_folder}/MSC_Df/MSC_{testset_name}_Sample_{set_num}'
    os.makedirs(folder_path, exist_ok=True)

    for n in name_lst:

        fpdf = FP_dict[n]
        # Features are the columns from position 1 up to (not including) 'cid'
        last_FPcol = list(fpdf.columns).index('cid')

        # Split the table: test compounds are the queries, everything else is searched
        in_testset = fpdf['cid'].isin(testset)
        train = fpdf[~in_testset].reset_index(drop=True)
        cid_lst = train['cid'].tolist()
        MF_test = fpdf[in_testset].reset_index(drop=True)

        train_features = train.iloc[:, 1:last_FPcol].values
        test_features = MF_test.iloc[:, 1:last_FPcol].values

        if n in ['minhashMHFP', 'minhashMAP4']:
            # Compute jaccard similarities for categorical representations
            cos_sim_matrix = np.zeros((test_features.shape[0], train_features.shape[0]))
            for i in range(test_features.shape[0]):
                for j in range(train_features.shape[0]):
                    cos_sim_matrix[i, j] = 1 - jaccard(test_features[i], train_features[j])
        else:
            # Compute cosine similarities for binary and numerical representations
            cos_sim_matrix = cosine_similarity(test_features, train_features)

        rows = []
        for k in MF_test.index:
            cos_lst = cos_sim_matrix[k]
            # Positions of the `length` largest similarities, most similar first
            indices = np.argsort(cos_lst)[-length:][::-1]

            most_similar_compounds = [cid_lst[ind] for ind in indices]
            # Plain Python floats, so the stored list text does not depend on NumPy's
            # scalar repr (NumPy >= 2 would write "np.float64(...)" for each value)
            sim_lst = cos_lst[indices].tolist()

            rows.append((
                str(most_similar_compounds),
                str(sim_lst),
                np.mean(sim_lst),
                np.median(sim_lst),
                np.std(sim_lst),
                min(sim_lst),
            ))

        # Build the result columns in one step instead of growing the frame cell by cell
        stats_df = pd.DataFrame(rows, columns=result_cols, index=MF_test.index)
        MF_test = pd.concat([MF_test[['cid', 'cmpdname']], stats_df], axis=1)
        MF_test.to_csv(f"{folder_path}/{n}.csv", index=False)


For iSEA dataset

In [None]:
# MSC (most-similar-compound) search on the iSEA approved-drug dataset.
# For each sampled test set and each representation, every test compound is compared
# with all compounds outside the test set; its `length` most similar compounds are
# saved together with summary statistics of their similarities.

# iSEA representations: one table per file, keyed by file name without ".csv"
folder = f"{main_folder}/Data/Representations_Isea1157"
FP_list = glob.glob(f"{folder}/*")

FP_dict = {}
name_lst = []
for f in sorted(FP_list):
    name = f.replace(f'{folder}/', '').replace('.csv', '')
    FP_dict[name] = pd.read_csv(f).reset_index(drop=True)
    name_lst.append(name)

# N of MSCs kept per test compound
length = 30

# Result columns computed for every test compound
result_cols = ['MSCs', 'similarities', 'mean_sim', 'med_sim', 'std_sim', 'min_sim']

for set_num in tqdm(range(1, 23)):

    test_df = pd.read_csv(f"{main_folder}/Data/Sample_datasets/Isea_Testset/Isea_sample{set_num}.csv")
    testset = test_df['cid'].tolist()

    folder_path = f'{main_folder}/MSC_iSEA_Df/MSC_Isea_Sample_{set_num}'
    os.makedirs(folder_path, exist_ok=True)

    for n in name_lst:

        fpdf = FP_dict[n]
        # Features are the columns from position 1 up to (not including) 'cid'
        last_FPcol = list(fpdf.columns).index('cid')

        # Split the table: test compounds are the queries, everything else is searched
        in_testset = fpdf['cid'].isin(testset)
        train = fpdf[~in_testset].reset_index(drop=True)
        cid_lst = train['cid'].tolist()
        MF_test = fpdf[in_testset].reset_index(drop=True)

        # Extract feature vectors for train and test sets
        train_features = train.iloc[:, 1:last_FPcol].values
        test_features = MF_test.iloc[:, 1:last_FPcol].values

        if n in ['minhashMHFP', 'minhashMAP4']:
            # Compute jaccard similarities between test and train sets
            cos_sim_matrix = np.zeros((test_features.shape[0], train_features.shape[0]))
            for i in range(test_features.shape[0]):
                for j in range(train_features.shape[0]):
                    cos_sim_matrix[i, j] = 1 - jaccard(test_features[i], train_features[j])
        else:
            # Compute cosine similarities between test and train sets
            cos_sim_matrix = cosine_similarity(test_features, train_features)

        rows = []
        for k in MF_test.index:
            cos_lst = cos_sim_matrix[k]
            # Positions of the `length` largest similarities, most similar first
            indices = np.argsort(cos_lst)[-length:][::-1]

            most_similar_compounds = [cid_lst[ind] for ind in indices]
            # Plain Python floats, so the stored list text does not depend on NumPy's
            # scalar repr (NumPy >= 2 would write "np.float64(...)" for each value)
            sim_lst = cos_lst[indices].tolist()

            rows.append((
                str(most_similar_compounds),
                str(sim_lst),
                np.mean(sim_lst),
                np.median(sim_lst),
                np.std(sim_lst),
                min(sim_lst),
            ))

        # Build the result columns in one step instead of growing the frame cell by cell
        stats_df = pd.DataFrame(rows, columns=result_cols, index=MF_test.index)
        MF_test = pd.concat([MF_test[['cid', 'cmpdname']], stats_df], axis=1)
        MF_test.to_csv(f"{folder_path}/{n}.csv", index=False)


For SIDER dataset

In [None]:
# Leave-one-out MSC (most-similar-compound) search on the SIDER dataset.
# Every compound is used once as the query and compared with all other compounds
# of the same representation; its `length` most similar compounds are saved
# together with summary statistics of their similarities.

# SIDER representations: one table per file, keyed by file name without ".csv"
folder = f"{main_folder}/Data/Representations_SIDER1380"
FP_list = glob.glob(f"{folder}/*")

FP_dict = {}
name_lst = []
for f in sorted(FP_list):
    name = f.replace(f'{folder}/', '').replace('.csv', '')
    FP_dict[name] = pd.read_csv(f).reset_index(drop=True)
    name_lst.append(name)

# Reference compound list: the unique cids of the first representation.
# It is derived from `df` itself so the cell does not depend on variables
# left behind by other cells.
df = FP_dict[name_lst[0]]
df = df.drop_duplicates('cid')
df = df.reset_index(drop=True)
test_data = df[['cid', 'cmpdname']]
test_cids = test_data['cid'].tolist()

folder_path = f'{main_folder}/MSC_SIDER_Df'
# Create the output folder up front so that to_csv cannot fail on a missing directory
os.makedirs(folder_path, exist_ok=True)

# N of MSCs kept per query compound
length = 30

# Columns of the saved result table
result_cols = ['cid', 'cmpdname', 'MSCs', 'similarities', 'mean_sim', 'med_sim', 'std_sim', 'min_sim']

for n in tqdm(name_lst):

    fpdf = FP_dict[n]
    fpdf = fpdf.drop_duplicates('cid')
    # Features are the columns from position 1 up to (not including) 'cid'
    last_FPcol = list(fpdf.columns).index('cid')

    # The candidate pool is the same for every query compound, so build it once
    # per representation instead of once per query
    pool = fpdf[fpdf['cid'].isin(test_cids)].reset_index(drop=True)
    pool_cids = pool['cid'].to_numpy()
    pool_names = pool['cmpdname'].tolist()
    pool_features = pool.iloc[:, 1:last_FPcol].values

    rows = []
    for test_cid in test_cids:
        is_query = pool_cids == test_cid
        if not is_query.any():
            raise ValueError(f"cid {test_cid} is missing from representation '{n}'")
        query_pos = int(np.flatnonzero(is_query)[0])

        # Leave the query compound out of the compounds that are searched
        cid_lst = pool_cids[~is_query].tolist()
        train_features = pool_features[~is_query]
        test_features = pool_features[is_query]

        if n in ['minhashMHFP', 'minhashMAP4']:
            # Compute jaccard similarities for categorical representations
            cos_sim_matrix = np.zeros((test_features.shape[0], train_features.shape[0]))
            for i in range(test_features.shape[0]):
                for j in range(train_features.shape[0]):
                    cos_sim_matrix[i, j] = 1 - jaccard(test_features[i], train_features[j])
        else:
            # Compute cosine similarities for binary and numerical representations
            cos_sim_matrix = cosine_similarity(test_features, train_features)

        # cids are unique after drop_duplicates, so the query is a single row
        cos_lst = cos_sim_matrix[0]
        # Positions of the `length` largest similarities, most similar first
        indices = np.argsort(cos_lst)[-length:][::-1]

        most_similar_compounds = [cid_lst[ind] for ind in indices]
        # Plain Python floats, so the stored list text does not depend on NumPy's
        # scalar repr (NumPy >= 2 would write "np.float64(...)" for each value)
        sim_lst = cos_lst[indices].tolist()

        rows.append((
            test_cid,
            pool_names[query_pos],
            str(most_similar_compounds),
            str(sim_lst),
            np.mean(sim_lst),
            np.median(sim_lst),
            np.std(sim_lst),
            min(sim_lst),
        ))

    final_MF_test = pd.DataFrame(rows, columns=result_cols)
    final_MF_test.to_csv(f"{folder_path}/{n}.csv", index=False)


100%|██████████| 29/29 [21:33<00:00, 44.61s/it]


For Query compounds

In [None]:
# Preview of the query compounds: the first 50 rows of All_Compound.csv,
# reduced to the name, cid and canonical SMILES columns

query_cols = ['cmpdname', 'cid', 'canonicalsmiles']
all_chem = pd.read_csv(f"{main_folder}/Data/All_Compound.csv").iloc[0:50]
all_chem = all_chem.loc[:, query_cols]
all_chem

  all_chem = pd.read_csv(f"{main_folder}/Data/All_Compound.csv")[0:50]


Unnamed: 0,cmpdname,cid,canonicalsmiles
0,Acetyl-DL-carnitine,1,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C
1,[2-(Acetyloxy)-3-carboxypropyl]trimethylazanium,2,CC(=O)OC(CC(=O)O)C[N+](C)(C)C
2,"5,6-Dihydroxycyclohexa-1,3-diene-1-carboxylic ...",3,C1=CC(C(C(=C1)C(=O)O)O)O
3,1-Aminopropan-2-ol,4,CC(CN)O
4,3-Amino-2-oxopropyl phosphate,5,C(C(=O)COP(=O)(O)O)N
5,Dinitrochlorobenzene,6,C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl
6,9-Ethyladenine,7,CCN1C=NC2=C(N=CN=C21)N
7,"(2,3,4,5,6-Pentahydroxycyclohexyl) dihydrogen ...",9,C1(C(C(C(C(C1O)O)OP(=O)(O)O)O)O)O
8,"1,2-Dichloroethane",11,C(CCl)Cl
9,"1,2,3,5-Tetrahydroxybenzene",12,C1=C(C=C(C(=C1O)O)O)O


In [None]:
# MSC (most-similar-compound) search for query compounds.
# Every query compound is compared with all annotated compounds of the same
# representation; its `length` most similar compounds are saved together with
# summary statistics of their similarities.

# Query-compound representations: one table per file, keyed by file name without ".csv"
# NOTE(review): unlike the other inputs this folder is not under "Data/"; confirm the path.
folder = f"{main_folder}/Representations_Query"
FP_list = glob.glob(f"{folder}/*")

FP_dict_test = {}
for f in sorted(FP_list):
    name = f.replace(f'{folder}/', '').replace('.csv', '')
    FP_dict_test[name] = pd.read_csv(f).reset_index(drop=True)

# Annotated-compound representations: the compounds that are searched
folder = f"{main_folder}/Data/Representations_Annotated-Compounds"
FP_list = glob.glob(f"{folder}/*")

FP_dict_train = {}
for f in sorted(FP_list):
    name = f.replace(f'{folder}/', '').replace('.csv', '')
    FP_dict_train[name] = pd.read_csv(f).reset_index(drop=True)

# Only representations available for both the annotated and the query compounds
# can be searched; report the others instead of failing on a missing key later
name_lst = [name for name in FP_dict_train if name in FP_dict_test]
missing = [name for name in FP_dict_train if name not in FP_dict_test]
if missing:
    print("Representations without query data (skipped): ", missing)

trainset = FP_dict_train[name_lst[0]]['cid'].tolist()
print("Number of annotated compounds: ", len(trainset))

testset = FP_dict_test[name_lst[0]]['cid'].tolist()
print("Number of test compounds: ", len(testset))


folder_path = f"{main_folder}/MSC_Query_Df"
# Create the output folder up front so that to_csv cannot fail on a missing directory
os.makedirs(folder_path, exist_ok=True)

# N of MSCs kept per query compound
length = 1

# Result columns computed for every query compound
result_cols = ['MSCs', 'similarities', 'mean_sim', 'med_sim', 'std_sim', 'min_sim']

for n in tqdm(name_lst):

    train = FP_dict_train[n]
    cid_lst = train['cid'].tolist()

    query = FP_dict_test[n]

    # Features are the columns from position 1 up to (not including) 'cid';
    # the position is looked up in each table separately
    train_features = train.iloc[:, 1:list(train.columns).index('cid')].values
    test_features = query.iloc[:, 1:list(query.columns).index('cid')].values

    if n in ['minhashMHFP', 'minhashMAP4']:
        # Compute jaccard similarities for categorical representations
        cos_sim_matrix = np.zeros((test_features.shape[0], train_features.shape[0]))
        for i in range(test_features.shape[0]):
            for j in range(train_features.shape[0]):
                cos_sim_matrix[i, j] = 1 - jaccard(test_features[i], train_features[j])
    else:
        # Compute cosine similarities for binary and numerical representations
        cos_sim_matrix = cosine_similarity(test_features, train_features)

    rows = []
    for k in query.index:
        cos_lst = cos_sim_matrix[k]
        # Positions of the `length` largest similarities, most similar first
        indices = np.argsort(cos_lst)[-length:][::-1]

        most_similar_compounds = [cid_lst[ind] for ind in indices]
        # Plain Python floats, so the stored list text does not depend on NumPy's
        # scalar repr (NumPy >= 2 would write "np.float64(...)" for each value)
        sim_lst = cos_lst[indices].tolist()

        rows.append((
            str(most_similar_compounds),
            str(sim_lst),
            np.mean(sim_lst),
            np.median(sim_lst),
            np.std(sim_lst),
            min(sim_lst),
        ))

    # Results go into a new frame, so the loaded query tables are left unchanged
    stats_df = pd.DataFrame(rows, columns=result_cols, index=query.index)
    MF_test = pd.concat([query[['cid', 'cmpdname']], stats_df], axis=1)
    MF_test.to_csv(f"{folder_path}/{n}.csv", index=False)


Number of annotated compounds:  9721
Number of test compounds:  50


100%|██████████| 29/29 [00:22<00:00,  1.30it/s]
