##  <span style = "color : lightgreen"> Importing Libraries :</span>

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from rdkit import Chem
import os
from rdkit.Chem import Draw
import matplotlib.pyplot as plt

##  <span style = "color : lightgreen"> Data Reading :</span>

In [3]:

# Load the full compound table from the Excel workbook; later cells read its
# 'name', 'type' and 'smiles' columns.
df = pd.read_excel(
    './dataset/flurophores_mdrug_natproduct_recon2_151443.xlsx'
)

##  <span style = "color : lightgreen"> Extracting Drugs :</span>

In [4]:
# Keep only the rows labelled as drugs.  A boolean filter preserves the parent
# frame's index labels, so the index is rebuilt here: row labels then run
# 0 .. n-1 and coincide with row positions, which keeps label-based lookups
# such as df_drug['smiles'][i] valid for every i in range(len(df_drug)).
df_drug = df.loc[df['type'] == 'Drug'].reset_index(drop=True)
df_drug.head(10)

Unnamed: 0,name,type,smiles
0,Orlistat,Drug,CCCCCCCCCCCC(CC1OC(=O)C1CCCCCC)OC(=O)C(CC(C)C)...
1,Aclidinium,Drug,O=C(OC1C[N+]2(CCCOc3ccccc3)CCC1CC2)C(O)(c1cccs...
2,Quinapril,Drug,CCOC(=O)C(CCc1ccccc1)NC(C)C(=O)N1Cc2ccccc2CC1C...
3,Fosaprepitant,Drug,CC(OC1OCCN(Cc2nc(=O)n(P(=O)(O)O)[nH]2)C1c1ccc(...
4,Anisindione,Drug,COc1ccc(C2C(=O)c3ccccc3C2=O)cc1
5,Sulindac,Drug,CC1=C(CC(=O)O)c2cc(F)ccc2C1=Cc1ccc(S(C)=O)cc1
6,Triamcinolone,Drug,CC12C=CC(=O)C=C1CCC1C3CC(O)C(O)(C(=O)CO)C3(C)C...
7,Methoxsalen,Drug,COc1c2occc2cc2ccc(=O)oc12
8,Aprepitant,Drug,CC(OC1OCCN(Cc2n[nH]c(=O)[nH]2)C1c1ccc(F)cc1)c1...
9,Moexipril,Drug,CCOC(=O)C(CCc1ccccc1)NC(C)C(=O)N1Cc2cc(OC)c(OC...


#  <span style = "color : red"> Graph Edit Distance Between Two SMILES</span>

##  <span style = "color : lightgreen"> Converting Molecular SMILES into Graphs :</span>

In [35]:


def get_graph(mol):
    """Build an undirected, bond-order-weighted NetworkX graph from an RDKit molecule.

    Nodes are the atom indices ``0 .. n_atoms - 1``; each node carries the
    atom's atomic number in its ``'atomic_num'`` attribute.  Every bond becomes
    an edge whose ``'weight'`` attribute is the value returned by
    ``bond.GetBondTypeAsDouble()``.

    Parameters
    ----------
    mol : rdkit.Chem.Mol
        Molecule to convert, e.g. the result of ``Chem.MolFromSmiles``.

    Returns
    -------
    networkx.Graph
        Graph with one node per atom and one weighted edge per bond.

    Raises
    ------
    ValueError
        If ``mol`` is ``None`` (``Chem.MolFromSmiles`` returns ``None`` when a
        SMILES string cannot be parsed).
    """
    if mol is None:
        raise ValueError(
            "get_graph() received None instead of an RDKit molecule; "
            "the SMILES string probably failed to parse."
        )

    atoms = [a.GetAtomicNum() for a in mol.GetAtoms()]

    # Symmetric matrix of bond orders; entries left at 0 mean "no bond".
    am = np.zeros((len(atoms), len(atoms)))
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        am[i, j] = am[j, i] = bond.GetBondTypeAsDouble()

    # Non-zero entries become edges, with the entry stored as 'weight'.
    G = nx.from_numpy_array(am)

    # Keep the element identity on the nodes.  Callers that pass no node_match
    # to graph_edit_distance are unaffected by this extra attribute.
    nx.set_node_attributes(G, dict(enumerate(atoms)), name='atomic_num')

    return G

##  <span style = "color : lightgreen"> Graph Edit Distance Between Two SMILES :</span>

In [36]:
def ged_from_two_smiles(s1,s2,termination_time=None):
    """Graph edit distance between the molecular graphs of two SMILES strings.

    Each SMILES is parsed with RDKit, converted with ``get_graph`` and the pair
    is handed to ``ged_from_two_graphs``, so both entry points share the same
    edge-matching rule (edges match when their ``'weight'`` values are equal).

    Parameters
    ----------
    s1, s2 : str
        SMILES strings of the two molecules.
    termination_time : float or None, optional
        Time budget in seconds forwarded as networkx's ``timeout``.  With a
        budget, the value returned is the best distance found when the budget
        ran out, which may be larger than the exact distance.  ``None`` (the
        default) searches until the exact distance is found.

    Returns
    -------
    float or None
        Whatever ``networkx.graph_edit_distance`` returns for the two graphs.

    Raises
    ------
    ValueError
        If either SMILES string cannot be parsed by RDKit.
    """
    graphs = []
    for smiles in (s1, s2):
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending string instead of deep inside get_graph.
        if mol is None:
            raise ValueError(f"Could not parse SMILES string: {smiles!r}")
        graphs.append(get_graph(mol))

    return ged_from_two_graphs(graphs[0], graphs[1], termination_time)

##  <span style = "color : lightgreen"> Graph Edit Distance Between Two Molecular Graphs :</span>

In [37]:
def ged_from_two_graphs(G1,G2,termination_time):
    """Return the graph edit distance between two weighted graphs.

    Two edges are considered equal when their ``'weight'`` attributes are
    equal.  ``termination_time`` is passed to networkx as ``timeout``.
    """
    def _same_weight(edge_a, edge_b):
        # Edge attribute dicts supplied by networkx for the two candidate edges.
        return edge_a['weight'] == edge_b['weight']

    return nx.graph_edit_distance(
        G1,
        G2,
        edge_match=_same_weight,
        timeout=termination_time,
    )

####  <span style = "color : orange"> Testing With Some Examples :</span>

In [38]:
# Sanity check on two small molecules, with no timeout so the search runs to
# completion.
mol1, mol2 = (Chem.MolFromSmiles(smiles) for smiles in ('CCO', 'C=CC=CC=C'))
G1, G2 = get_graph(mol1), get_graph(mol2)

GED = nx.graph_edit_distance(
    G1,
    G2,
    edge_match=lambda a, b: a['weight'] == b['weight'],
)
print(GED)

7.0


##  <span style = "color : lightgreen"> Graph Edit Distance Between The Drugs In My Dataset :</span>

In [None]:
# Time budget (seconds) per molecule pair; with a budget, each entry is the
# best distance networkx found before the budget ran out.
termination_time=0.1
num_molecules = len(df_drug)
ged_matrix = np.zeros((num_molecules, num_molecules))

# Read the SMILES by position: df_drug is a filtered frame, so its index
# labels are not guaranteed to be 0 .. n-1 and df_drug['smiles'][i] could
# raise KeyError.
drug_smiles = df_drug['smiles'].tolist()

# Parse every molecule and build its graph exactly once, instead of redoing
# the work for every (i, j) pair.
drug_graphs = []
for position, smiles in enumerate(drug_smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(
            f"Could not parse SMILES at row position {position}: {smiles!r}"
        )
    drug_graphs.append(get_graph(mol))

for i, G1 in enumerate(drug_graphs):
    for j, G2 in enumerate(drug_graphs):
        if i == j:
            # A graph is at distance 0 from itself by definition; the matrix
            # is zero-initialised, so skip the time-limited search.
            continue

        # Calculate graph edit distance and store it in the matrix
        ged_matrix[i, j] = ged_from_two_graphs(G1, G2, termination_time)

    # Checkpoint once per finished row.  Writing the whole n x n matrix after
    # every single pair made the file I/O dominate the run time.
    np.savetxt("graph_edit_distance_matrix.csv", ged_matrix, delimiter=",")

# Final write (also covers the empty-dataset case, where the loop never runs).
np.savetxt("graph_edit_distance_matrix.csv", ged_matrix, delimiter=",")

In [39]:
# Reload the raw (header-less) distance matrix, label both axes with the drug
# names, and write the labelled version to a second CSV.
drug_names = df_drug['name'].tolist()

ged_df = pd.read_csv('graph_edit_distance_matrix.csv', header=None)
ged_df.index = drug_names
ged_df.columns = drug_names

ged_df.to_csv('graph_edit_distance.csv')


In [40]:
# Name -> SMILES lookup built from the drug table.
drug_names = df_drug['name'].tolist()
drug_smiles = df_drug['smiles'].tolist()
name_to_smiles = dict(zip(drug_names, drug_smiles))

# Labelled distance matrix; both axes are re-labelled from the drug table so
# they carry exactly the names used as keys above.
ged_matrix = pd.read_csv("graph_edit_distance.csv", index_col=0)
ged_matrix.columns = drug_names
ged_matrix.index = drug_names
print(f"Adjusted GED matrix shape: {ged_matrix.shape}")
print(f"Number of drug names after adjustment: {len(drug_names)}")

top_5_similar_smiles = pd.DataFrame(
    index=ged_matrix.index,
    columns=['Top1', 'Top2', 'Top3', 'Top4', 'Top5'],
)

for index, row in ged_matrix.iterrows():
    # NOTE(review): `row > 0` is what removes the drug itself, but it also
    # removes any *other* drug whose stored distance is exactly 0 -- confirm
    # that this is intended.
    top_5_indices = row[row > 0].nsmallest(5).index
    top_5_smiles = [name_to_smiles[name] for name in top_5_indices]
    # Pad to exactly five entries (adds nothing when five were found).
    top_5_smiles.extend([None] * (5 - len(top_5_smiles)))
    top_5_similar_smiles.loc[index] = top_5_smiles

# Save the results to a new CSV file
top_5_similar_smiles.to_csv("top_5_similar_smiles.csv")
print("Top 5 similar SMILES saved to 'top_5_similar_smiles.csv'")


Adjusted GED matrix shape: (1357, 1357)
Number of drug names after adjustment: 1357
Top 5 similar SMILES saved to 'top_5_similar_smiles.csv'


In [41]:
# Per-drug tables of the five nearest neighbours: their names and distances.
top_5_similar = pd.DataFrame(index=ged_matrix.index, columns=['Top1', 'Top2', 'Top3', 'Top4', 'Top5'])
top_5_similar_distances = pd.DataFrame(index=ged_matrix.index, columns=['Dist1', 'Dist2', 'Dist3', 'Dist4', 'Dist5'])

for index, row in ged_matrix.iterrows():
    # Select the neighbours once and read both names and distances from the
    # same result, so the two tables cannot drift apart.
    # NOTE(review): `row > 0` excludes the drug itself but also any other drug
    # whose stored distance is exactly 0 -- confirm that this is intended.
    nearest = row[row > 0].nsmallest(5)

    # Always hand .loc a plain five-element list, padded with None when fewer
    # than five neighbours exist.
    padding = [None] * (5 - len(nearest))
    top_5_indices = list(nearest.index) + padding
    top_5_distances = list(nearest.values) + padding

    top_5_similar.loc[index] = top_5_indices
    top_5_similar_distances.loc[index] = top_5_distances

# Output directory
output_directory = 'Similar_Smiles_GED'
os.makedirs(output_directory, exist_ok=True)

# Function to draw molecules and save as images
def draw_similar_smiles(main_smile, main_name, similar_smiles, similar_names, distances):
    """Draw a drug next to its nearest neighbours and save the row as a PNG.

    The figure has six panels: the main molecule followed by up to five
    neighbours.  A panel whose molecule is missing or cannot be parsed shows
    a grey "N/A" instead.  The image is written to
    ``<output_directory>/<main_name>.png`` (``output_directory`` is the
    notebook-level variable).

    Parameters
    ----------
    main_smile : str
        SMILES of the reference drug.
    main_name : str
        Name of the reference drug; used in the title and the file name.
    similar_smiles : sequence
        SMILES of the neighbours; ``None``/NaN entries mark empty slots.
    similar_names : sequence
        Names of the neighbours, aligned with ``similar_smiles``.
    distances : sequence
        Distances of the neighbours, aligned with ``similar_smiles``.
    """
    # Only real, non-empty strings are parsed: padded slots may be None or NaN
    # depending on how pandas stored them, and NaN is truthy.
    mols = [Chem.MolFromSmiles(main_smile)] + [
        Chem.MolFromSmiles(smile) if isinstance(smile, str) and smile else None
        for smile in similar_smiles
    ]
    titles = [f"{main_name}\n(Main Smile)"] + [
        f"{name}\n({dist:.2f})" if pd.notna(name) and pd.notna(dist) else "N/A"
        for name, dist in zip(similar_names, distances)
    ]

    # A path separator inside a drug name would otherwise point the output at
    # a sub-directory that does not exist.
    safe_name = str(main_name)
    for separator in (os.sep, os.altsep):
        if separator:
            safe_name = safe_name.replace(separator, '_')
    output_file = os.path.join(output_directory, f'{safe_name}.png')

    fig, axes = plt.subplots(1, 6, figsize=(20, 4))
    try:
        for ax, mol, title in zip(axes, mols, titles):
            if mol is not None:
                img = Draw.MolToImage(mol, size=(300, 300))
                ax.imshow(img)
                ax.set_title(title, fontsize=10)
            else:
                ax.text(0.5, 0.5, 'N/A', ha='center', va='center', fontsize=12, color='gray')
            ax.axis('off')

        # Operate on this figure explicitly rather than on pyplot's implicit
        # "current figure".
        fig.tight_layout()
        fig.savefig(output_file, bbox_inches='tight', pad_inches=0.1)
    finally:
        # Release the figure even when drawing or saving fails; this function
        # is called once per drug, so leaked figures would pile up.
        plt.close(fig)

# Generate and save the images

# Name -> SMILES lookup built once, replacing a full boolean-mask scan of
# df_drug for every lookup.  setdefault keeps the first row for a repeated
# name, matching the previous `.values[0]` behaviour.
smiles_by_name = {}
for drug_name, drug_smile in zip(df_drug['name'], df_drug['smiles']):
    smiles_by_name.setdefault(drug_name, drug_smile)

for main_name in top_5_similar.index:
    main_smile = smiles_by_name[main_name]
    similar_names = top_5_similar.loc[main_name].values
    # Padded slots may come back as None or NaN; pd.notna rejects both,
    # whereas a plain truthiness test lets NaN through.
    similar_smiles = [smiles_by_name[name] if pd.notna(name) else None for name in similar_names]
    distances = top_5_similar_distances.loc[main_name].values
    draw_similar_smiles(main_smile, main_name, similar_smiles, similar_names, distances)