In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs
from sklearn.metrics.pairwise import pairwise_distances

def hyperbolic_distance(u, v):
    u_norm = np.linalg.norm(u)
    v_norm = np.linalg.norm(v)

    return np.arccosh(1 + ((u @ v) / (u_norm * v_norm)) ** 2)
def tanimoto_distance(u, v):
    intersection = np.bitwise_and(u, v)
    union = np.bitwise_or(u, v)

    return 1 - (np.count_nonzero(intersection) / np.count_nonzero(union))

def find_most_similar_molecules(file1_path: str, file2_path: str):
    # Load the fingerprint files into dataframes
    df1 = pd.read_csv(file1_path)
    df2 = pd.read_csv(file2_path)

	# Drop common SMILES from second dataset if any 
    common_smiles = set(df1['SMILES']).intersection(set(df2['SMILES']))
    df2_filtered = df2[~df2['SMILES'].isin(common_smiles)].reset_index(drop=True)

	# Convert binary string fingerprints to RDKit ExplicitBitVect objects for both datasets
    fingerprints1 = [DataStructs.CreateFromBitString(fp) for fp in df1['Fingerprint']]
    fingerprints2 = [DataStructs.CreateFromBitString(fp) for fp in df2_filtered['Fingerprint']]

	# Convert fingerprint lists to numpy arrays
    fingerprints1_np = np.array([np.array(list(fp)) for fp in fingerprints1])
    fingerprints2_np = np.array([np.array(list(fp)) for fp in fingerprints2])

	# Calculate Hamming distance matrix between two datasets using bitwise similarity measure
    hamming_dist_matrix=pairwise_distances(fingerprints1_np,fingerprints2_np, metric='hamming')
	# Calculate Tanimoto distance matrix using the custom function
    tanimoto_dist_matrix = np.zeros((len(df1), len(df2_filtered)))
    for i in range(len(df1)):
	    for j in range(len(df2_filtered)):
	        tanimoto_dist_matrix[i][j] = tanimoto_distance(fingerprints1_np[i], fingerprints2_np[j])

	# Calculate other distances (cosine distance, hyperbolic distance, and euclidean distance)
    cosine_dist_matrix = pairwise_distances(fingerprints1_np, fingerprints2_np, metric='cosine')
    hyperbolic_dist_matrix = np.zeros((len(df1), len(df2_filtered)))
    euclidean_dist_matrix = pairwise_distances(fingerprints1_np, fingerprints2_np , metric='euclidean')

    for i in range(len(df1)):
	    for j in range(len(df2_filtered)):
	        hyperbolic_dist_matrix[i][j] = hyperbolic_distance(fingerprints1_np[i], fingerprints2_np[j])

	# Create a dictionary to store similarity scores with SMILES strings for each distance metric 
    similarity_scores_dict = {}

    for i in range(len(df1)):
	    similarity_scores_dict[i] ={
	        'Hamming Distance': hamming_dist_matrix[i].min(),
	        'Cosine Distance': cosine_dist_matrix[i].min(),
			'Hyperbolic Distance': hyperbolic_dist_matrix[i].min(),
			'Euclidean Distance': euclidean_dist_matrix[i].min(),
			'Tanimoto Distance': tanimoto_dist_matrix[i].min()
	    }

    # Convert the dictionary to a dataframe
    similarity_df = pd.DataFrame(similarity_scores_dict).transpose()

    # Sort the dataframe based on all distances in ascending order
    sorted_similarity_df = similarity_df.sort_values(['Hamming Distance', 'Cosine Distance',
                                                      'Hyperbolic Distance', 'Euclidean Distance', 'Tanimoto Distance'])

    # Print the most similar SMILES from dataset 2
    most_similar_smiles = df2_filtered.loc[sorted_similarity_df.index[:5], 'SMILES']
    # Print the most similar SMILES and their corresponding distance values from dataset 2
    most_similar_distances = sorted_similarity_df.iloc[:5]

    print("Most similar SMILES:")
    print(most_similar_smiles)
    print("\nDistance Values:")
    print(most_similar_distances)

    return most_similar_smiles, most_similar_distances

# Example usage:
file1_path = "/home/ritesh/Desktop/MOLECULES/new/morgan fp/morgan_mol_fp.csv"
file2_path = "/home/ritesh/Desktop/MOLECULES/new/morgan fp/morgan_SMiles_fp.csv"

most_similar_molecules = find_most_similar_molecules(file1_path, file2_path)

Most similar SMILES:
2                     CC(=O)C(C)=O
5                         CCCCCC=O
11           COC1=C(C=CC(=C1)C=O)O
12                  CC(=O)C1=NCCC1
16    C/C=C/C(=O)C1=C(C)C=CCC1(C)C
Name: SMILES, dtype: object

Distance Values:
    Hamming Distance  Cosine Distance  Hyperbolic Distance  \
2                0.0              0.0                  0.0   
5                0.0              0.0                  0.0   
11               0.0              0.0                  0.0   
12               0.0              0.0                  0.0   
16               0.0              0.0                  0.0   

    Euclidean Distance  Tanimoto Distance  
2                  0.0                0.0  
5                  0.0                0.0  
11                 0.0                0.0  
12                 0.0                0.0  
16                 0.0                0.0  


In [3]:
import pandas as pd
def mapping(file1_path: str, file2_path: str):
    # Load the fingerprint files into dataframes
    dataset1 = pd.read_csv(file1_path)
    dataset2 = pd.read_csv(file2_path)
dataset1 = pd.read_csv(file1_path)
dataset2 = pd.read_csv(file2_path)

# Perform string mapping to find common SMILES between datasets
common_smiles = set(dataset1['SMILES']).intersection(set(dataset2['SMILES']))

# Filter rows from both datasets based on common SMILES
filtered_dataset1 = dataset1[dataset1['SMILES'].isin(common_smiles)]
filtered_dataset2 = dataset2[dataset2['SMILES'].isin(common_smiles)]

# Merge filtered datasets on common SMILES
merged_dataset = pd.merge(filtered_dataset1, filtered_dataset2, on='SMILES', how='inner')

# Save merged dataset to a new CSV file
merged_dataset.to_csv('mapping.csv', index=False)
print(merged_dataset)
file1_path = "/home/ritesh/Desktop/MOLECULES/new/morgan fp/morgan_mol_fp.csv"
file2_path = "/home/ritesh/Desktop/MOLECULES/new/morgan fp/morgan_SMiles_fp.csv"

    Unnamed: 0 RECEPTOR              AGONIST                    SMILES  \
0           10       NaN            Coumarin  C1=CC=C2C(=C1)C=CC(=O)O2   
1           12       NaN             octanal                CCCCCCCC=O   
2           14       NaN      Methyl eugenol   COC1=C(C=C(C=C1)CC=C)OC   
3           14       NaN      Methyl eugenol   COC1=C(C=C(C=C1)CC=C)OC   
4           17       NaN     Ethyl caprylate           CCCCCCCC(=O)OCC   
5           21       NaN           1-Hexanol                   CCCCCCO   
6           29       NaN             nonanal               CCCCCCCCC=O   
7           30       NaN       1-octen-3-one             CCCCCC(=O)C=C   
8           33       NaN       (E)-2-Hexenal               CCC/C=C/C=O   
9           34       NaN            Guaiacol            COC1=CC=CC=C1O   
10          40       NaN  Dimethyl disulfide                      CSSC   
11          43       NaN            Propanal                     CCC=O   

                                     

In [4]:
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs
from sklearn.metrics.pairwise import pairwise_distances

def tanimoto_distance(u, v):
    intersection = np.bitwise_and(u, v)
    union = np.bitwise_or(u, v)

    return 1 - (np.count_nonzero(intersection) / np.count_nonzero(union))

def find_most_similar_molecules(file1_path: str, file2_path: str):
    # Load the fingerprint files into dataframes
    df1 = pd.read_csv(file1_path)
    df2 = pd.read_csv(file2_path)

	# Drop common SMILES from second dataset if any 
    common_smiles = set(df1['SMILES']).intersection(set(df2['SMILES']))
    df2_filtered = df2[~df2['SMILES'].isin(common_smiles)].reset_index(drop=True)

	# Convert binary string fingerprints to RDKit ExplicitBitVect objects for both datasets
    fingerprints1 = [DataStructs.CreateFromBitString(fp) for fp in df1['Fingerprint']]
    fingerprints2 = [DataStructs.CreateFromBitString(fp) for fp in df2_filtered['Fingerprint']]


	# Convert fingerprint lists to numpy arrays
    fingerprints1_np = np.array([np.array(list(fp)) for fp in fingerprints1])
    fingerprints2_np = np.array([np.array(list(fp)) for fp in fingerprints2])
	# Calculate Tanimoto distance matrix using the custom function
    tanimoto_dist_matrix = np.zeros((len(df1), len(df2_filtered)))
    for i in range(len(df1)):
	    for j in range(len(df2_filtered)):
	        tanimoto_dist_matrix[i][j] = tanimoto_distance(fingerprints1_np[i], fingerprints2_np[j])

	# Create a dictionary to store similarity scores with SMILES strings for each distance metric 
    similarity_scores_dict = {}

    for i in range(len(df1)):
	    similarity_scores_dict[i] ={'Tanimoto Distance': tanimoto_dist_matrix[i].min()
	    }

    # Convert the dictionary to a dataframe
    similarity_df = pd.DataFrame(similarity_scores_dict).transpose()

    # Sort the dataframe based on all distances in ascending order
    sorted_similarity_df = similarity_df.sort_values(['Tanimoto Distance'])

    # Print the most similar SMILES from dataset 2
    most_similar_smiles = df2_filtered.loc[sorted_similarity_df.index[:10], 'SMILES']
    most_similar_distances = sorted_similarity_df.iloc[:10]

    print("Most similar SMILES:")
    print(most_similar_smiles)
    print("\nDistance Values:")
    print(most_similar_distances.round(100))

    return most_similar_smiles, most_similar_distances

# Example usage:
file1_path = "/home/ritesh/Desktop/MOLECULES/new/morgan fp/morgan_mol_fp.csv"
file2_path = "/home/ritesh/Desktop/MOLECULES/new/morgan fp/morgan_SMiles_fp.csv"

most_similar_molecules = find_most_similar_molecules(file1_path, file2_path)

Most similar SMILES:
2                     CC(=O)C(C)=O
5                         CCCCCC=O
6                CC1OC(=O)C(=C1C)O
11           COC1=C(C=CC(=C1)C=O)O
12                  CC(=O)C1=NCCC1
19                        CC(C)C=O
17                     CCCC(=O)OCC
16    C/C=C/C(=O)C1=C(C)C=CCC1(C)C
25               CCC1=NC=C(N=C1C)C
31               CC/C=C/CC\C=C\C=O
Name: SMILES, dtype: object

Distance Values:
    Tanimoto Distance
2                 0.0
5                 0.0
6                 0.0
11                0.0
12                0.0
19                0.0
17                0.0
16                0.0
25                0.0
31                0.0


In [5]:
import pandas as pd

def create_similarity_csv(file1_path: str, file2_path: str, output_file: str):
    # Find the most similar molecules
    most_similar_molecules = find_most_similar_molecules(file1_path, file2_path)

    # Get the SMILES from both datasets
    smiles_dataset1 = pd.read_csv(file1_path)['SMILES']
    
    # Combine the most similar SMILES with their corresponding SMILES from dataset 1
    combined_smiles = pd.concat([smiles_dataset1, most_similar_molecules[0]], axis=0)
    
	# Save to CSV file
    combined_smiles.to_csv(output_file, index=False)

# Example usage:
output_file ="/home/ritesh/Desktop/MOLECULES/new/tanimotosimilarity.csv"
create_similarity_csv(file1_path, file2_path, output_file)

Most similar SMILES:
2                     CC(=O)C(C)=O
5                         CCCCCC=O
6                CC1OC(=O)C(=C1C)O
11           COC1=C(C=CC(=C1)C=O)O
12                  CC(=O)C1=NCCC1
19                        CC(C)C=O
17                     CCCC(=O)OCC
16    C/C=C/C(=O)C1=C(C)C=CCC1(C)C
25               CCC1=NC=C(N=C1C)C
31               CC/C=C/CC\C=C\C=O
Name: SMILES, dtype: object

Distance Values:
    Tanimoto Distance
2                 0.0
5                 0.0
6                 0.0
11                0.0
12                0.0
19                0.0
17                0.0
16                0.0
25                0.0
31                0.0


### descriptor calculation

In [1]:
!pip install mordred
!pip install umap



In [2]:
import os,re,sys,pickle,datetime,time,random,itertools
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from scipy import stats
import math
from tqdm import tqdm
import seaborn as sns
import umap
from multiprocessing import freeze_support
import mordred
from mordred import Calculator, descriptors
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA,NMF
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.preprocessing import StandardScaler,MinMaxScaler,PolynomialFeatures
import rdkit
from rdkit import Chem
from rdkit.Chem import DataStructs, AllChem, MolFromSmiles, PandasTools, Descriptors, Draw, PropertyMol, rdmolfiles, rdFMCS
from rdkit import RDConfig
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem.Draw import MolsToGridImage, IPythonConsole, rdMolDraw2D
from rdkit.Chem.Draw.MolDrawing import MolDrawing, DrawingOptions, Font
DrawingOptions.bondLineWidth=1.8
DrawingOptions.includeAtomNumbers=False
size = (150, 150)
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from PIL import Image
randomstate = 42

In [3]:
filepath1 = "/home/ritesh/Desktop/MOLECULES/new/morgan fp/morgan_mol_fp.csv"
filepath2 = "/home/ritesh/Desktop/MOLECULES/new/morgan fp/morgan_SMiles_fp.csv"


In [15]:
filepath2 = "/home/ritesh/Desktop/MOLECULES/new/morgan fp/morgan_SMiles_fp.csv"
mols = pd.read_csv(filepath2)
mols

Unnamed: 0,SMILES,Fingerprint
0,CSCCC=O,0000000000010000000000000000000000000000000000...
1,CCC(C)C=O,0100000000000000000000000000000000000000000000...
2,CC(=O)C(C)=O,0000000000000000000000000000000000000000000000...
3,CCCCC/C=C/C=C/C=O,0000000000000000000000000000000000000000000000...
4,CC1OC(=C(O)C1=O)C,0000000000000000000000000000000000000000000000...
...,...,...
218,CCCSSCCC,0000000000000000000000000000000000000000000000...
219,CCOC(=O)CCC(C)C,0100000000000000000000000000000000000000000000...
220,CCOC(=O)CCSC,0000000000010000000000000000000000000000000000...
221,CCCC[C@@H](CC)CCC([O-])=O,0100000000000000000000000000000000000000000000...


In [16]:
mols = mols.drop_duplicates(subset=['SMILES'])
mols

Unnamed: 0,SMILES,Fingerprint
0,CSCCC=O,0000000000010000000000000000000000000000000000...
1,CCC(C)C=O,0100000000000000000000000000000000000000000000...
2,CC(=O)C(C)=O,0000000000000000000000000000000000000000000000...
3,CCCCC/C=C/C=C/C=O,0000000000000000000000000000000000000000000000...
4,CC1OC(=C(O)C1=O)C,0000000000000000000000000000000000000000000000...
...,...,...
217,CC1=C(C(=CC=C1)C)O,0000000000000000000000000000000000000000000000...
218,CCCSSCCC,0000000000000000000000000000000000000000000000...
219,CCOC(=O)CCC(C)C,0100000000000000000000000000000000000000000000...
220,CCOC(=O)CCSC,0000000000010000000000000000000000000000000000...


In [17]:
mols['rdmol'] = mols['SMILES'].map(lambda x: Chem.MolFromSmiles(x))
mols['inchi'] = mols['rdmol'].map(Chem.MolToInchi)
mols = mols.drop_duplicates(subset="inchi")

print('Number of SMILES after duplicates removed:',len(mols))

mols

Number of SMILES after duplicates removed: 219




























































Unnamed: 0,SMILES,Fingerprint,rdmol,inchi
0,CSCCC=O,0000000000010000000000000000000000000000000000...,<rdkit.Chem.rdchem.Mol object at 0x7f78d2e671b0>,"InChI=1S/C4H8OS/c1-6-4-2-3-5/h3H,2,4H2,1H3"
1,CCC(C)C=O,0100000000000000000000000000000000000000000000...,<rdkit.Chem.rdchem.Mol object at 0x7f78d2e67a70>,"InChI=1S/C5H10O/c1-3-5(2)4-6/h4-5H,3H2,1-2H3"
2,CC(=O)C(C)=O,0000000000000000000000000000000000000000000000...,<rdkit.Chem.rdchem.Mol object at 0x7f78d2f045f0>,InChI=1S/C4H6O2/c1-3(5)4(2)6/h1-2H3
3,CCCCC/C=C/C=C/C=O,0000000000000000000000000000000000000000000000...,<rdkit.Chem.rdchem.Mol object at 0x7f78d2f07290>,InChI=1S/C10H16O/c1-2-3-4-5-6-7-8-9-10-11/h6-1...
4,CC1OC(=C(O)C1=O)C,0000000000000000000000000000000000000000000000...,<rdkit.Chem.rdchem.Mol object at 0x7f78d2f06b90>,"InChI=1S/C6H8O3/c1-3-5(7)6(8)4(2)9-3/h3,8H,1-2H3"
...,...,...,...,...
217,CC1=C(C(=CC=C1)C)O,0000000000000000000000000000000000000000000000...,<rdkit.Chem.rdchem.Mol object at 0x7f79d09c5070>,"InChI=1S/C8H10O/c1-6-4-3-5-7(2)8(6)9/h3-5,9H,1..."
218,CCCSSCCC,0000000000000000000000000000000000000000000000...,<rdkit.Chem.rdchem.Mol object at 0x7f79d09c4040>,"InChI=1S/C6H14S2/c1-3-5-7-8-6-4-2/h3-6H2,1-2H3"
219,CCOC(=O)CCC(C)C,0100000000000000000000000000000000000000000000...,<rdkit.Chem.rdchem.Mol object at 0x7f79d09c5540>,"InChI=1S/C8H16O2/c1-4-10-8(9)6-5-7(2)3/h7H,4-6..."
220,CCOC(=O)CCSC,0000000000010000000000000000000000000000000000...,<rdkit.Chem.rdchem.Mol object at 0x7f79d09c5380>,"InChI=1S/C6H12O2S/c1-3-8-6(7)4-5-9-2/h3-5H2,1-2H3"


In [18]:
def embed(mol):
    mol_with_H = Chem.AddHs(mol)
    Chem.AllChem.EmbedMolecule(mol_with_H)
    Chem.AllChem.MMFFOptimizeMolecule(mol_with_H)
    return mol_with_H

mols['rdmol_optimized'] = mols.rdmol.map(embed)

mols

Unnamed: 0,SMILES,Fingerprint,rdmol,inchi,rdmol_optimized
0,CSCCC=O,0000000000010000000000000000000000000000000000...,<rdkit.Chem.rdchem.Mol object at 0x7f78d2e671b0>,"InChI=1S/C4H8OS/c1-6-4-2-3-5/h3H,2,4H2,1H3",<rdkit.Chem.rdchem.Mol object at 0x7f78d2e67920>
1,CCC(C)C=O,0100000000000000000000000000000000000000000000...,<rdkit.Chem.rdchem.Mol object at 0x7f78d2e67a70>,"InChI=1S/C5H10O/c1-3-5(2)4-6/h4-5H,3H2,1-2H3",<rdkit.Chem.rdchem.Mol object at 0x7f78d2e67f40>
2,CC(=O)C(C)=O,0000000000000000000000000000000000000000000000...,<rdkit.Chem.rdchem.Mol object at 0x7f78d2f045f0>,InChI=1S/C4H6O2/c1-3(5)4(2)6/h1-2H3,<rdkit.Chem.rdchem.Mol object at 0x7f78d2ec9cb0>
3,CCCCC/C=C/C=C/C=O,0000000000000000000000000000000000000000000000...,<rdkit.Chem.rdchem.Mol object at 0x7f78d2f07290>,InChI=1S/C10H16O/c1-2-3-4-5-6-7-8-9-10-11/h6-1...,<rdkit.Chem.rdchem.Mol object at 0x7f78d2ecb920>
4,CC1OC(=C(O)C1=O)C,0000000000000000000000000000000000000000000000...,<rdkit.Chem.rdchem.Mol object at 0x7f78d2f06b90>,"InChI=1S/C6H8O3/c1-3-5(7)6(8)4(2)9-3/h3,8H,1-2H3",<rdkit.Chem.rdchem.Mol object at 0x7f78d2ecb140>
...,...,...,...,...,...
217,CC1=C(C(=CC=C1)C)O,0000000000000000000000000000000000000000000000...,<rdkit.Chem.rdchem.Mol object at 0x7f79d09c5070>,"InChI=1S/C8H10O/c1-6-4-3-5-7(2)8(6)9/h3-5,9H,1...",<rdkit.Chem.rdchem.Mol object at 0x7f79d0dbac70>
218,CCCSSCCC,0000000000000000000000000000000000000000000000...,<rdkit.Chem.rdchem.Mol object at 0x7f79d09c4040>,"InChI=1S/C6H14S2/c1-3-5-7-8-6-4-2/h3-6H2,1-2H3",<rdkit.Chem.rdchem.Mol object at 0x7f79d0dbace0>
219,CCOC(=O)CCC(C)C,0100000000000000000000000000000000000000000000...,<rdkit.Chem.rdchem.Mol object at 0x7f79d09c5540>,"InChI=1S/C8H16O2/c1-4-10-8(9)6-5-7(2)3/h7H,4-6...",<rdkit.Chem.rdchem.Mol object at 0x7f79d0dbad50>
220,CCOC(=O)CCSC,0000000000010000000000000000000000000000000000...,<rdkit.Chem.rdchem.Mol object at 0x7f79d09c5380>,"InChI=1S/C6H12O2S/c1-3-8-6(7)4-5-9-2/h3-5H2,1-2H3",<rdkit.Chem.rdchem.Mol object at 0x7f79d0dbadc0>


In [20]:
calc = Calculator(descriptors) # create calculator for all mordred descriptors (can also specify subtype)

df=calc.pandas(mols['rdmol_optimized'])
print(df)

100%|██████████| 219/219 [00:06<00:00, 34.47it/s] 


                                                   ABC  \
0    module 'numpy' has no attribute 'float'.\n`np....   
1    module 'numpy' has no attribute 'float'.\n`np....   
2    module 'numpy' has no attribute 'float'.\n`np....   
3    module 'numpy' has no attribute 'float'.\n`np....   
4    module 'numpy' has no attribute 'float'.\n`np....   
..                                                 ...   
217  module 'numpy' has no attribute 'float'.\n`np....   
218  module 'numpy' has no attribute 'float'.\n`np....   
219  module 'numpy' has no attribute 'float'.\n`np....   
220  module 'numpy' has no attribute 'float'.\n`np....   
221  module 'numpy' has no attribute 'float'.\n`np....   

                                                 ABCGG  nAcid  nBase  \
0    module 'numpy' has no attribute 'float'.\n`np....      0      0   
1    module 'numpy' has no attribute 'float'.\n`np....      0      0   
2    module 'numpy' has no attribute 'float'.\n`np....      0      0   
3    module 'nu

In [21]:
#Replace mordred errors with NaNs then drop those descriptors/columns
df=df.applymap(lambda x: np.nan if type(x) in [mordred.error.Missing,mordred.error.Error] else x)
df=df.dropna(axis=1)
print(df.shape)

(219, 1118)


In [22]:
#Dropping columns that have a single value
non_zero_std = df.std() != 0
df = df [non_zero_std[non_zero_std].index]
print(df.shape)

(219, 897)


In [23]:
#Dropping highly correlated descriptors: If a pair of descriptors have a Pearson correlation coefficient (r) value greater than the threshold then one descriptor will be removed.
threshold=0.95
df_corr = df.corr().abs()
upper = df_corr.where(np.triu(np.ones(df_corr.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
df = df.drop(to_drop, axis=1)
print(df.shape)

(219, 422)


In [25]:
#saving the csv list
to_save=pd.concat([mols[['SMILES']], df], axis=1)
to_save

Unnamed: 0,SMILES,nAcid,nBase,SpAbs_A,SpMax_A,SpMAD_A,LogEE_A,VR1_A,nAromAtom,nAtom,...,JGI3,JGI4,JGI5,JGI6,JGI7,JGI8,JGI9,JGT10,SRW05,TSRW10
0,CSCCC=O,0,0,6.987918,1.801938,1.164653,2.579830,12.628860,0,14,...,0.041667,0.040000,0.000000,0.000000,0.000000,0.00000,0.00,0.237222,0.000000,28.105124
1,CCC(C)C=O,0,0,6.898979,1.931852,1.149830,2.595800,12.261209,0,16,...,0.031250,0.000000,0.000000,0.000000,0.000000,0.00000,0.00,0.464583,0.000000,29.753427
2,CC(=O)C(C)=O,0,0,6.000000,2.000000,1.000000,2.610845,11.530010,0,12,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00,0.548148,0.000000,30.698690
3,CCCCC/C=C/C=C/C=O,0,0,13.191508,1.931852,1.199228,3.202455,38.458677,0,27,...,0.015625,0.011429,0.009259,0.008163,0.007812,0.00823,0.01,0.145210,0.000000,37.236738
4,CC1OC(=C(O)C1=O)C,0,0,10.692100,2.352843,1.188011,3.113958,26.402276,0,17,...,0.114899,0.000000,0.000000,0.000000,0.000000,0.00000,0.00,0.529429,2.397895,52.444780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217,CC1=C(C(=CC=C1)C)O,0,0,10.891331,2.263821,1.210148,3.099901,26.623093,6,19,...,0.062500,0.083333,0.000000,0.000000,0.000000,0.00000,0.00,0.497685,0.000000,37.975562
218,CCCSSCCC,0,0,9.517541,1.879385,1.189693,2.876615,21.482988,0,22,...,0.025000,0.020000,0.018519,0.020408,0.000000,0.00000,0.00,0.192392,0.000000,32.187603
219,CCOC(=O)CCC(C)C,0,0,11.127090,2.052881,1.112709,3.123647,30.761798,0,26,...,0.046875,0.045714,0.018519,0.027211,0.000000,0.00000,0.00,0.504985,0.000000,37.120200
220,CCOC(=O)CCSC,0,0,10.383983,2.015316,1.153776,3.007982,28.081310,0,21,...,0.053571,0.026667,0.013889,0.020408,0.000000,0.00000,0.00,0.385368,0.000000,35.041491


In [26]:
to_save.to_csv('test_smiles_desc.csv', index = True)

In [35]:
filepath1 = "/home/ritesh/Desktop/MOLECULES/new/morgan fp/morgan_mol_fp.csv"
filepath2 = "/home/ritesh/Desktop/MOLECULES/new/morgan fp/morgan_SMiles_fp.csv"


In [56]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

# Load the CSV files
df1 = pd.read_csv(filepath1)
df2 = pd.read_csv(filepath2)

# Extract the descriptor columns
descriptor_cols = list(df2.columns)[1:]  # Assuming the SMILES column is the first column, adjust if needed

# Get the descriptor values for each file
descriptors1 = df1[descriptor_cols].values
descriptors2 = df2[descriptor_cols].values

# Handle scientific notation values
descriptors1 = np.char.replace(descriptors1.astype(str), 'E', 'e')
descriptors2 = np.char.replace(descriptors2.astype(str), 'E', 'e')

# Convert scientific notation to decimal notation
descriptors1 = np.array([float(d) for d in descriptors1])
descriptors2 = np.array([float(d) for d in descriptors2])

# Clip extreme values
clip_min = -1e9  # Define the minimum clip value based on the valid range of your descriptors
clip_max = 1e9  # Define the maximum clip value based on the valid range of your descriptors
descriptors1 = np.clip(descriptors1, clip_min, clip_max)
descriptors2 = np.clip(descriptors2, clip_min, clip_max)

# Scale the data
scaler = MinMaxScaler()
descriptors1 = scaler.fit_transform(descriptors1.reshape(-1, 1))
descriptors2 = scaler.transform(descriptors2.reshape(-1, 1))

# Perform PCA for dimensionality reduction
pca = PCA(n_components=0.93)  # Retain 93% of the variance
descriptors1 = pca.fit_transform(descriptors1)
descriptors2 = pca.transform(descriptors2)

# Define the distance metrics to use
distance_metrics = ['euclidean', 'cosine', 'manhattan', 'correlation', 'chebyshev', 'canberra', 'braycurtis']  # Add more distance metrics as needed

# Calculate the distances using each metric
for metric in distance_metrics:
    distances = pairwise_distances(descriptors1, descriptors2, metric=metric)

    # Find the top most similar molecule for each molecule in file 1
    most_similar_indices = np.argsort(distances, axis=1)[:, 0]
    most_similar_molecules = df2.iloc[most_similar_indices]['SMILES']

    # Print the results
    print(f"Top most similar molecules using {metric} distance:")
    for i , molecule in enumerate(df1['SMILES']):
        print(f"Molecule {i+1} ({molecule}): {most_similar_molecules.iloc[i]}")
    print()


Top most similar molecules using euclidean distance:
Molecule 1 (CCCCOC(=O)C(C)OC(=O)CCC): CC(C)C1=CC=C(C=C1)C=O
Molecule 2 (CC(C)CC(=O)O): CC(C)C1=CC=C(C=C1)C=O
Molecule 3 (CCCCCCCCC(=O)O): CC(C)C1=CC=C(C=C1)C=O
Molecule 4 (CCC(C)CC(=O)O): CC(C)C1=CC=C(C=C1)C=O
Molecule 5 (CC(C)CCC(=O)O): CC(C)C1=CC=C(C=C1)C=O
Molecule 6 (CCCC(=O)O): CC(C)C1=CC=C(C=C1)C=O
Molecule 7 (CCC(C)C(=O)O): CC(C)C1=CC=C(C=C1)C=O
Molecule 8 (C1CC(C1)C(=O)O): CC(C)C1=CC=C(C=C1)C=O
Molecule 9 (C1=CC=C(C=C1)C=O): CC(C)C1=CC=C(C=C1)C=O
Molecule 10 (CC(C(C)O)O): CC(C)C1=CC=C(C=C1)C=O
Molecule 11 (C1=CC=C2C(=C1)C=CC(=O)O2): CC(C)C1=CC=C(C=C1)C=O
Molecule 12 (CCCCCCCC(=O)O): CC(C)C1=CC=C(C=C1)C=O
Molecule 13 (CCCCCCCC=O): CC(C)C1=CC=C(C=C1)C=O
Molecule 14 (C1=CC=C(C=C1)C(=O)C2=CC=CC=C2): CC(C)C1=CC=C(C=C1)C=O
Molecule 15 (COC1=C(C=C(C=C1)CC=C)OC): CC(C)C1=CC=C(C=C1)C=O
Molecule 16 (CC1=CC=C(C=C1)C(C)C): CC(C)C1=CC=C(C=C1)C=O
Molecule 17 (CCCCCCCC1CCC(=O)O1): CC(C)C1=CC=C(C=C1)C=O
Molecule 18 (CCCCCCCC(=O)OCC): CC(C)C1

In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA


### pca 


In [70]:
df.index = df.index.astype(int)
df.dropna(axis=0,inplace=True)
X_all = np.array(df)
X_all_ids = np.array(df.index)
scaler = StandardScaler()
scaler.fit(X_all)
X_all_sc = scaler.transform(X_all)

pca = PCA(n_components=10)     # choose number of PCs

pca.fit(X_all_sc)
X_all_pca    = pca.transform(X_all_sc)
pca_score = pca.explained_variance_ratio_
pca_values = pca.singular_values_
print('Variance explained by individual PCs:',np.around(pca.explained_variance_ratio_, decimals=2),'\n')
print('Total variance explained by PCs:',np.around(np.sum(pca.explained_variance_ratio_), decimals=2))

Variance explained by individual PCs: [0.16 0.13 0.1  0.08 0.06 0.04 0.03 0.03 0.03 0.02] 

Total variance explained by PCs: 0.7


In [None]:
nclusters = 10     # choose number of clusters

####################################################################################

X_use = X_all_pca
X_main_ids = X_all_ids
X_kmeans = X_all_pca
kmeans = KMeans(n_clusters=nclusters, random_state=42).fit(X_kmeans)
dists = []
for x in range(len(X_kmeans)):
    delta = [X_kmeans[x,i]-kmeans.cluster_centers_[kmeans.labels_[x],i] for i in range(len(X_kmeans[0]))]
    dist = np.linalg.norm(delta)
    dists.append(dist)
clusters = {}
clusterdists = {}
clustermins = []
clusterorders_dict = {}
clusterorders = np.zeros(X_kmeans.shape[0])
for x in range(nclusters):
    clusters[x] = [i for i in range(len(kmeans.labels_)) if kmeans.labels_[i] == x]
    clusterdists[x] = [dists[i] for i in clusters[x]]
    clustermins.append(clusters[x][clusterdists[x].index(min(clusterdists[x]))])
    clusterorders_dict[x] = [y for _,y in sorted(zip(clusterdists[x],clusters[x]))]
    for i in clusters[x]:
        clusterorders[i] = clusterorders_dict[x].index(i)+1
f_ind_1 = 0
f_ind_2 = 1
x_min, x_max = X_kmeans[:, f_ind_1].min() - 2, X_kmeans[:, f_ind_1].max() + 2
y_min, y_max = X_kmeans[:, f_ind_2].min() - 2, X_kmeans[:, f_ind_2].max() + 2
plt.figure(figsize=(9,9))
extent = [x_min,x_max,y_min,y_max]
plt.xticks(fontsize=15) 
plt.yticks(fontsize=15)
plt.xlabel(r"PC1",fontsize=20)
plt.ylabel(r"PC2",fontsize=20)
plt.locator_params(axis='y', nbins=8)
plt.xlim(x_min,x_max)
plt.ylim(y_min,y_max)
mapping = kmeans.labels_
cMap = "tab20c"
plt.scatter(X_kmeans[:,f_ind_1], X_kmeans[:,f_ind_2],c=mapping,cmap=cMap,alpha=.5,marker=".",s=500,edgecolor="black",linewidth=0.5)
main_row_id = list(enumerate(X_main_ids))
clusters_with_ids = {}
for i in range(0,len(clusters)):
    emptylist = []
    for j in range(0,len(clusters[i])):
        emptylist.append(main_row_id[clusters[i][j]][1])
    clusters_with_ids[i] = emptylist
    
### plot ligand ids    
#for i in range(0,nclusters):
#    for j in range(0,len(clusters_with_ids[i])):
#        plt.text(X_use[clusters[i][j],f_ind_1],X_use[clusters[i][j],f_ind_2]," "+str(clusters_with_ids[i][j]),color="blue",fontsize=12,ha="left")

### plot centroid labels
#for i in range(0,nclusters):
#    for j in range(0,len(clusters_with_ids[i])):
#        if clusters[i][j] in clustermins:
#            plt.text(X_use[clusters[i][j],f_ind_1],X_use[clusters[i][j],f_ind_2]," "+str(clusters_with_ids[i][j]),color="red",fontsize=13,ha="right")
    
plt.tight_layout()
#plt.savefig("mordred_pca_plot.png",dpi=300)
plt.show()     
