In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem,DataStructs, MACCSkeys, RDConfig
from rdkit.Chem import DataStructs, Draw
import numpy as np
from rdkit.Chem.Pharm2D.SigFactory import SigFactory
import os
from rdkit.Chem.Pharm2D import Generate
from matplotlib import pyplot as plt
import ast

In [32]:
# Load the compound table from the Excel workbook; later cells read its
# 'name', 'type' and 'smiles' columns.
df=pd.read_excel('./dataset/flurophores_mdrug_natproduct_recon2_151443.xlsx')

In [33]:
# Keep only the rows whose 'type' column equals 'Drug'; the original index
# labels of `df` are preserved on the filtered frame.
df_drug = df.loc[df['type'] == 'Drug']
df_drug

Unnamed: 0,name,type,smiles
0,Orlistat,Drug,CCCCCCCCCCCC(CC1OC(=O)C1CCCCCC)OC(=O)C(CC(C)C)...
1,Aclidinium,Drug,O=C(OC1C[N+]2(CCCOc3ccccc3)CCC1CC2)C(O)(c1cccs...
2,Quinapril,Drug,CCOC(=O)C(CCc1ccccc1)NC(C)C(=O)N1Cc2ccccc2CC1C...
3,Fosaprepitant,Drug,CC(OC1OCCN(Cc2nc(=O)n(P(=O)(O)O)[nH]2)C1c1ccc(...
4,Anisindione,Drug,COc1ccc(C2C(=O)c3ccccc3C2=O)cc1
...,...,...,...
1352,Sulpiride,Drug,CCN1CCCC1CNC(=O)c1cc(S(N)(=O)=O)ccc1OC
1353,Pentosan Polysulfate,Drug,O=S(=O)(O)OC1C(O)COC(OC2COC(O)C(OS(=O)(=O)O)C2...
1354,Argatroban,Drug,CC1CNc2c(cccc2S(=O)(=O)NC(CCCN=C(N)N)C(=O)N2CC...
1355,Meloxicam,Drug,Cc1cnc(NC(=O)C2=C(O)c3ccccc3S(=O)(=O)N2C)s1


In [34]:
def fingerprint_atom_pair_smiles_list(
        smiles,
        len_fingerprint=2048,
        min_atom_pair_len=1,
        max_atom_pair_len=30,
        from_atoms=0,
        ignore_atoms=0,
        atom_invariants=0,
        bits_per_entry=4,
        Chirality=False,
        use_2D=True,
        conf_Id=-1
):
    """Compute hashed atom-pair fingerprints for a collection of SMILES.

    Every SMILES is parsed with ``Chem.MolFromSmiles`` and handed to
    ``AllChem.GetHashedAtomPairFingerprintAsBitVect``.

    Args:
        smiles: Sized iterable of SMILES strings.
        len_fingerprint: Forwarded as ``nBits``.
        min_atom_pair_len: Forwarded as ``minLength``.
        max_atom_pair_len: Forwarded as ``maxLength``.
        from_atoms: Forwarded as ``fromAtoms``.
        ignore_atoms: Forwarded as ``ignoreAtoms``.
        atom_invariants: Forwarded as ``atomInvariants``.
        bits_per_entry: Forwarded as ``nBitsPerEntry``.
        Chirality: Forwarded as ``includeChirality``.
        use_2D: Forwarded as ``use2D``.
        conf_Id: Forwarded as ``confId``.

    Returns:
        list: One entry per input, in input order: the fingerprint's
        ``ToBitString()`` value, or ``None`` where the SMILES could not be
        parsed.
    """
    atom_pair_options = dict(
        nBits=len_fingerprint,
        minLength=min_atom_pair_len,
        maxLength=max_atom_pair_len,
        fromAtoms=from_atoms,
        ignoreAtoms=ignore_atoms,
        atomInvariants=atom_invariants,
        nBitsPerEntry=bits_per_entry,
        includeChirality=Chirality,
        use2D=use_2D,
        confId=conf_Id,
    )

    bit_strings = [None for _ in range(len(smiles))]
    for position, smiles_string in enumerate(smiles):
        molecule = Chem.MolFromSmiles(smiles_string)
        if molecule is None:
            # Unparsable SMILES: keep the None placeholder at this position.
            continue
        bit_vector = AllChem.GetHashedAtomPairFingerprintAsBitVect(
            molecule, **atom_pair_options
        )
        bit_strings[position] = bit_vector.ToBitString()

    return bit_strings


In [35]:
def fingerprint_extended_connectivity_smiles_list(
        smiles,
        radius=2,
        len_fingerprint=2048,
        from_atoms=0,
        atom_invariants=0,
        Chirality=False,
        Bond_Types=True,
        use_Features=False,
        bitInfo=None,
        RedundantEnvironments=False
):
    """Compute Morgan bit-vector fingerprints (default radius 2) for SMILES.

    Every SMILES is parsed with ``Chem.MolFromSmiles`` and handed to
    ``AllChem.GetMorganFingerprintAsBitVect``.

    Args:
        smiles: Sized iterable of SMILES strings.
        radius: Forwarded as ``radius``.
        len_fingerprint: Forwarded as ``nBits``.
        from_atoms: Forwarded as ``fromAtoms``.
        atom_invariants: Forwarded as ``invariants``.
        Chirality: Forwarded as ``useChirality``.
        Bond_Types: Forwarded as ``useBondTypes``.
        use_Features: Forwarded as ``useFeatures``.
        bitInfo: Forwarded as ``bitInfo``; the same object is passed for
            every molecule.
        RedundantEnvironments: Forwarded as ``includeRedundantEnvironments``.

    Returns:
        list: One entry per input, in input order: the fingerprint's
        ``ToBitString()`` value, or ``None`` where the SMILES could not be
        parsed.
    """
    morgan_options = dict(
        radius=radius,
        nBits=len_fingerprint,
        invariants=atom_invariants,
        fromAtoms=from_atoms,
        useChirality=Chirality,
        useBondTypes=Bond_Types,
        useFeatures=use_Features,
        bitInfo=bitInfo,
        includeRedundantEnvironments=RedundantEnvironments,
    )

    bit_strings = [None for _ in range(len(smiles))]
    for position, smiles_string in enumerate(smiles):
        molecule = Chem.MolFromSmiles(smiles_string)
        if molecule is None:
            # Unparsable SMILES: keep the None placeholder at this position.
            continue
        bit_vector = AllChem.GetMorganFingerprintAsBitVect(molecule, **morgan_options)
        bit_strings[position] = bit_vector.ToBitString()

    return bit_strings


In [36]:
def fingerprint_maccs_keys_smiles_list(
        smiles
):
    """Compute MACCS-keys fingerprints for a collection of SMILES.

    Args:
        smiles: Sized iterable of SMILES strings.

    Returns:
        list: One entry per input, in input order: the ``ToBitString()`` value
        of ``MACCSkeys.GenMACCSKeys`` for the parsed molecule, or ``None``
        where ``Chem.MolFromSmiles`` could not parse the SMILES.
    """
    bit_strings = [None for _ in range(len(smiles))]

    for position, smiles_string in enumerate(smiles):
        molecule = Chem.MolFromSmiles(smiles_string)
        if molecule is None:
            # Unparsable SMILES: keep the None placeholder at this position.
            continue
        bit_strings[position] = MACCSkeys.GenMACCSKeys(molecule).ToBitString()

    return bit_strings

In [37]:
def fingerprint_morgan_smiles_list(
        smiles,
        radius=1,
        len_fingerprint=2048,
        from_atoms=0,
        atom_invariants=0,
        Chirality=False,
        Bond_Types=True,
        use_Features=False,
        bitInfo=None,
        RedundantEnvironments=False
):
    """Compute Morgan bit-vector fingerprints (default radius 1) for SMILES.

    Every SMILES is parsed with ``Chem.MolFromSmiles`` and handed to
    ``AllChem.GetMorganFingerprintAsBitVect``.

    Args:
        smiles: Sized iterable of SMILES strings.
        radius: Forwarded as ``radius``.
        len_fingerprint: Forwarded as ``nBits``.
        from_atoms: Forwarded as ``fromAtoms``.
        atom_invariants: Forwarded as ``invariants``.
        Chirality: Forwarded as ``useChirality``.
        Bond_Types: Forwarded as ``useBondTypes``.
        use_Features: Forwarded as ``useFeatures``.
        bitInfo: Forwarded as ``bitInfo``; the same object is passed for
            every molecule.
        RedundantEnvironments: Forwarded as ``includeRedundantEnvironments``.

    Returns:
        list: One entry per input, in input order: the fingerprint's
        ``ToBitString()`` value, or ``None`` where the SMILES could not be
        parsed.
    """
    def _morgan_bits(molecule):
        # All tuning arguments are loop-invariant; only the molecule varies.
        bit_vector = AllChem.GetMorganFingerprintAsBitVect(
            molecule,
            radius=radius,
            nBits=len_fingerprint,
            invariants=atom_invariants,
            fromAtoms=from_atoms,
            useChirality=Chirality,
            useBondTypes=Bond_Types,
            useFeatures=use_Features,
            bitInfo=bitInfo,
            includeRedundantEnvironments=RedundantEnvironments,
        )
        return bit_vector.ToBitString()

    bit_strings = [None for _ in range(len(smiles))]
    for position, smiles_string in enumerate(smiles):
        molecule = Chem.MolFromSmiles(smiles_string)
        if molecule is not None:
            bit_strings[position] = _morgan_bits(molecule)

    return bit_strings


In [38]:
def fingerprint_pharmacophore_smiles_list(smiles_list):
    """Compute 2D pharmacophore fingerprints for a collection of SMILES.

    The feature factory is built from ``BaseFeatures.fdef`` in
    ``RDConfig.RDDataDir`` and the signature factory uses the distance bins
    ``[(0, 3), (3, 6), (6, 10)]``.

    Args:
        smiles_list: Sized iterable of SMILES strings.

    Returns:
        list: One entry per input, in input order: the ``ToBitString()`` value
        of ``Generate.Gen2DFingerprint``, or ``None`` where
        ``Chem.MolFromSmiles`` could not parse the SMILES. A single bad SMILES
        no longer discards the results of every other molecule.
    """
    fingerprint_rep = [None] * len(smiles_list)

    # Both factories are independent of the molecule, so build them once
    # instead of re-reading the feature definition file for every SMILES.
    featFactory = AllChem.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef'))
    sigFactory = SigFactory(featFactory)
    sigFactory.SetBins([(0, 3), (3, 6), (6, 10)])
    sigFactory.Init()

    for i, smiles in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Same convention as the other fingerprint helpers in this
            # notebook: leave None at this position and keep going.
            continue
        AllChem.Compute2DCoords(mol)
        # NOTE(review): the former `GetFeaturesForMol(mol) == []` early exit was
        # dropped; it compared against a list and appears never to have matched
        # (the call is believed to return a tuple - confirm). Molecules without
        # features are therefore still passed to Gen2DFingerprint, as before.
        pharm2d = Generate.Gen2DFingerprint(mol, sigFactory)
        fingerprint_rep[i] = pharm2d.ToBitString()
    return fingerprint_rep

In [39]:
def fingerprint_pubchem_smiles_list(
        smiles,
        minimum_path=1,
        maximum_path=7,
        len_fingerprint=2048,
        BitsPerHash=2,
        useHs=True,
        Density=0.0,
        minimum_size=128,
        branchedPaths=True,
        useBondOrder=True,
        from_atoms=0,
        atom_invariants=0,
        atomBits=None,
        bitInfo=None
        ):
    """Compute ``Chem.RDKFingerprint`` fingerprints for a collection of SMILES.

    Despite the function name, the fingerprint is produced by
    ``Chem.RDKFingerprint``.

    Args:
        smiles: Sized iterable of SMILES strings.
        minimum_path: Forwarded as ``minPath``.
        maximum_path: Forwarded as ``maxPath``.
        len_fingerprint: Forwarded as ``fpSize``.
        BitsPerHash: Forwarded as ``nBitsPerHash``.
        useHs: Forwarded as ``useHs``.
        Density: Forwarded as ``tgtDensity``.
        minimum_size: Forwarded as ``minSize``.
        branchedPaths: Forwarded as ``branchedPaths``.
        useBondOrder: Forwarded as ``useBondOrder``.
        from_atoms: Forwarded as ``fromAtoms``.
        atom_invariants: Forwarded as ``atomInvariants``.
        atomBits: Forwarded as ``atomBits``.
        bitInfo: Forwarded as ``bitInfo``; the same object is passed for
            every molecule.

    Returns:
        list: One entry per input, in input order: the fingerprint's
        ``ToBitString()`` value, or ``None`` where the SMILES could not be
        parsed.
    """
    path_options = dict(
        minPath=minimum_path,
        maxPath=maximum_path,
        fpSize=len_fingerprint,
        nBitsPerHash=BitsPerHash,
        useHs=useHs,
        tgtDensity=Density,
        minSize=minimum_size,
        branchedPaths=branchedPaths,
        useBondOrder=useBondOrder,
        atomInvariants=atom_invariants,
        fromAtoms=from_atoms,
        atomBits=atomBits,
        bitInfo=bitInfo,
    )

    bit_strings = [None for _ in range(len(smiles))]
    for position, smiles_string in enumerate(smiles):
        molecule = Chem.MolFromSmiles(smiles_string)
        if molecule is None:
            # Unparsable SMILES: keep the None placeholder at this position.
            continue
        bit_vector = Chem.RDKFingerprint(molecule, **path_options)
        bit_strings[position] = bit_vector.ToBitString()

    return bit_strings

In [40]:
def fingerprint_substructure_smiles_list(
        smiles
):
    """Compute ``AllChem.PatternFingerprint`` fingerprints for SMILES.

    Args:
        smiles: Sized iterable of SMILES strings.

    Returns:
        list: One entry per input, in input order: the fingerprint's
        ``ToBitString()`` value, or ``None`` where ``Chem.MolFromSmiles``
        could not parse the SMILES.
    """
    bit_strings = [None for _ in range(len(smiles))]

    for position, smiles_string in enumerate(smiles):
        molecule = Chem.MolFromSmiles(smiles_string)
        if molecule is None:
            # Unparsable SMILES: keep the None placeholder at this position.
            continue
        bit_strings[position] = AllChem.PatternFingerprint(molecule).ToBitString()

    return bit_strings


In [41]:
def fingerprint_topological_torsion_smiles_list(
        smiles,
        len_fingerprint=2048,
        target_size=4,
        from_atoms=0,
        atom_invariants=0,
        ignoreAtoms=0,
        Chirality=False
):
    """Compute hashed topological-torsion fingerprints for SMILES.

    Every SMILES is parsed with ``Chem.MolFromSmiles`` and handed to
    ``AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect``.

    Args:
        smiles: Sized iterable of SMILES strings.
        len_fingerprint: Forwarded as ``nBits``.
        target_size: Forwarded as ``targetSize``.
        from_atoms: Forwarded as ``fromAtoms``.
        atom_invariants: Forwarded as ``atomInvariants``.
        ignoreAtoms: Forwarded as ``ignoreAtoms``.
        Chirality: Forwarded as ``includeChirality``.

    Returns:
        list: One entry per input, in input order: the fingerprint's
        ``ToBitString()`` value, or ``None`` where the SMILES could not be
        parsed.
    """
    torsion_options = dict(
        nBits=len_fingerprint,
        targetSize=target_size,
        atomInvariants=atom_invariants,
        fromAtoms=from_atoms,
        ignoreAtoms=ignoreAtoms,
        includeChirality=Chirality,
    )

    bit_strings = [None for _ in range(len(smiles))]
    for position, smiles_string in enumerate(smiles):
        molecule = Chem.MolFromSmiles(smiles_string)
        if molecule is None:
            # Unparsable SMILES: keep the None placeholder at this position.
            continue
        bit_vector = AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect(
            molecule, **torsion_options
        )
        bit_strings[position] = bit_vector.ToBitString()

    return bit_strings


In [42]:
# SMILES column of the drug subset; every fingerprint helper below receives
# this same Series, so all result lists follow its row order.
smiles_list = df_drug['smiles']

# One fingerprint list per fingerprint type, each computed from `smiles_list`.
fingerprint_list_atom_pair  = fingerprint_atom_pair_smiles_list(smiles_list)
fingerprint_list_ecfp = fingerprint_extended_connectivity_smiles_list(smiles_list)
fingerprint_list_maccs = fingerprint_maccs_keys_smiles_list(smiles_list)
fingerprint_list_morgan = fingerprint_morgan_smiles_list(smiles_list)
fingerprint_list_pharmacophore = fingerprint_pharmacophore_smiles_list(smiles_list)
fingerprint_list_pubchem = fingerprint_pubchem_smiles_list(smiles_list)
fingerprint_list_substructure = fingerprint_substructure_smiles_list(smiles_list)
fingerprint_list_topological_torsion = fingerprint_topological_torsion_smiles_list(smiles_list)

In [43]:
def _fingerprints_to_bit_matrix(fingerprints, width):
    """Stack bit strings into a (len(fingerprints), width) 0/1 float matrix."""
    bit_matrix = np.zeros((len(fingerprints), width), dtype=np.float64)
    for row, fingerprint in enumerate(fingerprints):
        if fingerprint:
            # None / empty fingerprints stay as all-zero rows; shorter
            # fingerprints are zero-padded on the right.
            bit_matrix[row, :len(fingerprint)] = np.fromiter(
                (bit == '1' for bit in fingerprint), dtype=bool, count=len(fingerprint)
            )
    return bit_matrix


def similarity_tanimoto_fingerprints(fingerprint_1, fingerprint_2):
    """Pairwise Tanimoto similarity between two collections of bit strings.

    For fingerprints a and b the coefficient is ``n_ab / (n_a + n_b - n_ab)``,
    where ``n_a`` / ``n_b`` count the '1' characters of each string and
    ``n_ab`` counts the positions where both strings have '1'.

    Args:
        fingerprint_1: Sized iterable of '0'/'1' strings (entries may be None).
        fingerprint_2: Sized iterable of '0'/'1' strings (entries may be None).

    Returns:
        numpy.ndarray: float array of shape
        ``(len(fingerprint_1), len(fingerprint_2))``. An entry is 0.0 when the
        denominator is zero, and also when either fingerprint is ``None``
        (previously a ``None`` entry raised ``TypeError``).
    """
    lengths = [
        len(fingerprint)
        for fingerprints in (fingerprint_1, fingerprint_2)
        for fingerprint in fingerprints
        if fingerprint is not None
    ]
    width = max(lengths, default=0)

    bits_1 = _fingerprints_to_bit_matrix(fingerprint_1, width)
    bits_2 = _fingerprints_to_bit_matrix(fingerprint_2, width)

    # Shared-bit counts for every pair in one matrix product; the values are
    # small integers, so the float64 arithmetic is exact.
    n_ab = bits_1 @ bits_2.T
    n_a = bits_1.sum(axis=1)[:, np.newaxis]
    n_b = bits_2.sum(axis=1)[np.newaxis, :]
    union = n_a + n_b - n_ab

    similarities = np.zeros((len(fingerprint_1), len(fingerprint_2)))
    # Pairs with an empty union keep the initial 0.0.
    np.divide(n_ab, union, out=similarities, where=union != 0)
    return similarities

In [44]:
# All-vs-all Tanimoto similarity of the drug set against itself, one square
# matrix per fingerprint type (row/column order follows the fingerprint lists).
similarity_matrix_atom_pair = similarity_tanimoto_fingerprints(fingerprint_list_atom_pair, fingerprint_list_atom_pair)
similarity_matrix_ecfp = similarity_tanimoto_fingerprints(fingerprint_list_ecfp, fingerprint_list_ecfp)
similarity_matrix_maccs_keys = similarity_tanimoto_fingerprints(fingerprint_list_maccs, fingerprint_list_maccs)
similarity_matrix_morgan = similarity_tanimoto_fingerprints(fingerprint_list_morgan, fingerprint_list_morgan)
similarity_matrix_topological_torsion = similarity_tanimoto_fingerprints(fingerprint_list_topological_torsion, fingerprint_list_topological_torsion)
similarity_matrix_substructure = similarity_tanimoto_fingerprints(fingerprint_list_substructure, fingerprint_list_substructure)
similarity_matrix_pubchem = similarity_tanimoto_fingerprints(fingerprint_list_pubchem, fingerprint_list_pubchem)
similarity_matrix_pharmacophore = similarity_tanimoto_fingerprints(fingerprint_list_pharmacophore, fingerprint_list_pharmacophore)

In [45]:
def top_five_similar_smiles(similarity_matrix, i=5, smiles=None):
    """Return, for every row, the SMILES of its ``i`` highest-scoring columns.

    Args:
        similarity_matrix: 2D array-like; column ``k`` must correspond to the
            ``k``-th SMILES (by position) of ``smiles``.
        i: Number of top-scoring columns to keep per row.
        smiles: Optional iterable of SMILES aligned with the matrix columns.
            Defaults to the notebook-level ``df_drug['smiles']``.

    Returns:
        list[list]: One list per matrix row holding the SMILES of the ``i``
        columns with the largest values, highest first.
    """
    if smiles is None:
        smiles = df_drug['smiles']
    # argsort yields positions, so look SMILES up by position. Indexing the
    # Series with `[idx]` is label-based and breaks when the filtered frame's
    # index is not 0..n-1.
    smiles_by_position = list(smiles)

    top_five_smiles = []
    for row in similarity_matrix:
        sorted_indices = np.argsort(row)[::-1]
        top_five_indices = sorted_indices[0:i]
        top_five_smiles.append([smiles_by_position[idx] for idx in top_five_indices])
    return top_five_smiles

# For every drug, the SMILES of its five highest-scoring entries in each
# similarity matrix (default i=5), one list of lists per fingerprint type.
top_five_atom_pair = top_five_similar_smiles(similarity_matrix_atom_pair)
top_five_ecfp = top_five_similar_smiles(similarity_matrix_ecfp)
top_five_maccs_keys = top_five_similar_smiles(similarity_matrix_maccs_keys)
top_five_morgan = top_five_similar_smiles(similarity_matrix_morgan)
top_five_pharmacophore = top_five_similar_smiles(similarity_matrix_pharmacophore)
top_five_pubchem = top_five_similar_smiles(similarity_matrix_pubchem)
top_five_substructure = top_five_similar_smiles(similarity_matrix_substructure)
top_five_topological_torsion = top_five_similar_smiles(similarity_matrix_topological_torsion)


In [46]:
# One row per drug: its own SMILES plus, per fingerprint type, the list of its
# top-five similar SMILES. The row index comes from df_drug['smiles'].
df_similar_smile_top5 = pd.DataFrame({
    'smile': df_drug['smiles'],
    'atom_pair_similar_smile': top_five_atom_pair,
    'ecfp_similar_smile': top_five_ecfp,
    'maccs_keys_similar_smile' : top_five_maccs_keys,
    'morgan_similar_smile' : top_five_morgan,
    'pharmacophore_similar_smile' : top_five_pharmacophore,
    'pubchem_similar_smile' : top_five_pubchem,
    'substructure_similar_smile' : top_five_substructure,
    'topological_torsion_similar_smile' : top_five_topological_torsion
})

# Persist without the index column; the list-valued cells are parsed back with
# ast.literal_eval by the plotting cell further down.
df_similar_smile_top5.to_csv('All_fingerprints.csv', index=False)


In [47]:
# Reload the table written by the previous cell (its rows are in df_drug order).
df_fp = pd.read_csv('All_fingerprints.csv')
# Drug names, still carrying df_drug's index labels.
smiles_name_list = df_drug['name']
# Folder that receives one PNG per drug; created if it does not exist yet.
output_directory = 'Similar_Smiles_wrt_different_fingerprints'
os.makedirs(output_directory, exist_ok=True)

In [48]:
def get_similarity(similarity_matrix, main_index, similar_index):
    """Return the ``similarity_matrix`` entry at (main_index, similar_index)."""
    cell = (main_index, similar_index)
    return similarity_matrix[cell]

# One entry per fingerprint column of the figure: the df_fp column holding the
# neighbour SMILES, the matching similarity matrix, and the column heading.
fingerprint_panels = [
    ('atom_pair_similar_smile', similarity_matrix_atom_pair, 'Atom Pair'),
    ('ecfp_similar_smile', similarity_matrix_ecfp, 'Extended Connectivity'),
    ('maccs_keys_similar_smile', similarity_matrix_maccs_keys, 'MACCS Keys'),
    ('morgan_similar_smile', similarity_matrix_morgan, 'Morgan'),
    ('pharmacophore_similar_smile', similarity_matrix_pharmacophore, 'Pharmacophore'),
    ('pubchem_similar_smile', similarity_matrix_pubchem, 'PubChem'),
    ('substructure_similar_smile', similarity_matrix_substructure, 'Substructure'),
    ('topological_torsion_similar_smile', similarity_matrix_topological_torsion, 'Topological Torsion'),
]
titles = ['Main Smile'] + [heading for _, _, heading in fingerprint_panels]
n_neighbours = 5  # neighbour rows drawn below the heading row

# Row position (not index label) of the first occurrence of every SMILES in
# df_drug. The similarity matrices are indexed by position, so labels from
# `df_drug.index` must not be used to address them.
drug_names = df_drug['name'].tolist()
position_by_smiles = {}
for position, drug_smiles in enumerate(df_drug['smiles']):
    position_by_smiles.setdefault(drug_smiles, position)

for index, row in df_fp.iterrows():
    main_smile = row['smile']
    # df_fp was written in df_drug row order and re-read with a fresh 0..n-1
    # index, so `index` is a row position: use .iloc, not a label lookup.
    main_name = smiles_name_list.iloc[index]
    main_mol = Chem.MolFromSmiles(main_smile)

    # The CSV stores each neighbour list as its Python literal text.
    neighbour_smiles = [ast.literal_eval(row[column]) for column, _, _ in fingerprint_panels]

    fig, axes = plt.subplots(n_neighbours + 1, len(titles), figsize=(20, 24))

    # Heading row.
    for col_idx, title in enumerate(titles):
        ax = axes[0, col_idx]
        ax.text(0.5, 0.5, title, ha='center', va='center', fontsize=14, weight='bold')
        ax.axis('off')

    for row_idx in range(1, n_neighbours + 1):
        rank = row_idx - 1
        for col_idx in range(len(titles)):
            ax = axes[row_idx, col_idx]
            if col_idx == 0:
                # First column repeats the query molecule on every row.
                mol = main_mol
                name = main_name
            else:
                _, similarity_matrix, _ = fingerprint_panels[col_idx - 1]
                candidates = neighbour_smiles[col_idx - 1]
                # Fewer than five neighbours: the remaining cells show N/A.
                similar_smile = candidates[rank] if rank < len(candidates) else None
                mol = Chem.MolFromSmiles(similar_smile) if similar_smile is not None else None
                if mol is not None:
                    similar_position = position_by_smiles[similar_smile]
                    similarity = get_similarity(similarity_matrix, index, similar_position)
                    name = f"{drug_names[similar_position]} ({similarity:.2f})"
                else:
                    name = "N/A"

            if mol is not None:
                ax.imshow(Draw.MolToImage(mol, size=(200, 200)))
            else:
                ax.text(0.5, 0.5, 'N/A', ha='center', va='center', fontsize=12, color='gray')
            ax.axis('off')
            ax.set_title(name, fontsize=10, pad=10)

    plt.tight_layout()
    # A path separator inside a drug name would otherwise point savefig at a
    # sub-directory that does not exist.
    safe_name = str(main_name).replace('/', '_').replace('\\', '_')
    output_file = os.path.join(output_directory, f'{safe_name}.png')
    plt.savefig(output_file, bbox_inches='tight', pad_inches=0.1)
    plt.close(fig)
