In [8]:
import logging
import random
from collections import Counter
from itertools import chain
from pathlib import Path

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import numpy.linalg
import pandas as pd
import pysmiles

# Suppress every log record up to and including CRITICAL for the whole session.
# NOTE(review): presumably this is here to silence parser warnings while the
# SMILES strings are converted to graphs - confirm before relying on it.
logging.disable(level=logging.CRITICAL)

In [2]:
# Read the molecule table and strip newline characters out of every SMILES string.
# NOTE(review): the path below is absolute and machine-specific; the notebook
# only runs unchanged on a machine that has the file at this exact location.
zinc_csv = "C:/Users/Evint/Documents/Projects/Functional-Group-Analysis/250k_rndm_zinc_drugs_clean_3.csv"
df = pd.read_csv(zinc_csv)
df = df.assign(smiles=df['smiles'].map(lambda raw: raw.replace('\n', '')))
df.head()

Unnamed: 0,smiles,logP,qed,SAS
0,CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1,5.0506,0.702012,2.084095
1,C[C@@H]1CC(Nc2cncc(-c3nncn3C)c2)C[C@@H](C)C1,3.1137,0.928975,3.432004
2,N#Cc1ccc(-c2ccc(O[C@@H](C(=O)N3CCCC3)c3ccccc3)...,4.96778,0.599682,2.470633
3,CCOC(=O)[C@@H]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c...,4.00022,0.690944,2.822753
4,N#CC1=C(SCC(=O)Nc2cccc(Cl)c2)N=C([O-])[C@H](C#...,3.60956,0.789027,4.035182


In [3]:
# Keep only the rows whose "SAS" value is strictly greater than 3.55, then
# pull the surviving SMILES strings out as an array and report how many remain.
sas_mask = df["SAS"] > 3.55
df = df.loc[sas_mask]
smiles = df["smiles"].values
len(smiles)

62087

In [4]:
# Parse every SMILES string into a graph, one graph per molecule, in the same
# order as `smiles`. Hydrogens are left implicit and aromaticity is
# re-evaluated by pysmiles (see the keyword arguments below).
mol_graph = [
    pysmiles.read_smiles(
        smiles_string,
        explicit_hydrogen=False,
        reinterpret_aromatic=True,
    )
    for smiles_string in smiles
]
mol_graph

[<networkx.classes.graph.Graph at 0x2bd870109d0>,
 <networkx.classes.graph.Graph at 0x2bd87013af0>,
 <networkx.classes.graph.Graph at 0x2bdb02d8310>,
 <networkx.classes.graph.Graph at 0x2bdb02d8d60>,
 <networkx.classes.graph.Graph at 0x2bdb02d8bb0>,
 <networkx.classes.graph.Graph at 0x2bdb02d8b80>,
 <networkx.classes.graph.Graph at 0x2bdb027f670>,
 <networkx.classes.graph.Graph at 0x2bdb027f940>,
 <networkx.classes.graph.Graph at 0x2bdb02d8940>,
 <networkx.classes.graph.Graph at 0x2bdb02d88b0>,
 <networkx.classes.graph.Graph at 0x2bdb02d8730>,
 <networkx.classes.graph.Graph at 0x2bdb02d8640>,
 <networkx.classes.graph.Graph at 0x2bdb02d8610>,
 <networkx.classes.graph.Graph at 0x2bdb02d8460>,
 <networkx.classes.graph.Graph at 0x2bdb02d8430>,
 <networkx.classes.graph.Graph at 0x2bdb02d82b0>,
 <networkx.classes.graph.Graph at 0x2bdb02d8ee0>,
 <networkx.classes.graph.Graph at 0x2bdb02d8fd0>,
 <networkx.classes.graph.Graph at 0x2bdb02d9000>,
 <networkx.classes.graph.Graph at 0x2bdb02d91b0>,


In [49]:
# Try each descriptor on the first molecule before wrapping them in a function.
smile = mol_graph[0]

# number of nodes
num_node = smile.number_of_nodes()

# number of edges
num_edge = smile.number_of_edges()

# mean node degree (stored under the name `edge_connect`)
edge_connect = np.mean([degree for _, degree in smile.degree()])

# average node connectivity over all node pairs
node_connect = nx.average_node_connectivity(smile)

# modularity of the partition found by label propagation
mod = nx.community.modularity(
    smile,
    nx.community.label_propagation_communities(smile),
)

# mean closeness centrality over all nodes
cc = np.mean([*nx.closeness_centrality(smile).values()])

# mean betweenness centrality over all nodes
bc = np.mean([*nx.betweenness_centrality(smile).values()])

print(np.round(cc, 4), np.round(bc, 4), num_node, num_edge, edge_connect, node_connect, mod)

[ 5.6753  4.7096  4.5075  4.3948  4.2542  3.8172  3.6432  3.2023  3.1362
  3.071   2.4835 -0.      0.0293  0.1844  0.2176  0.2564  0.3951  0.6128
  0.7122  2.      1.1573  1.8442  1.751   1.3784  1.5663  1.      3.
  1.    ]
0.1864 0.176 28 30 2.142857142857143 1.119047619047619 0.5044444444444445


In [50]:
def get_graph_parameter(smile):
    """Compute seven topological descriptors for one molecular graph.

    Parameters
    ----------
    smile : networkx.Graph
        Graph of a single molecule, as produced by ``pysmiles.read_smiles``.

    Returns
    -------
    list
        ``[cc, bc, num_node, num_edge, edge_connect, node_connect, mod]``:

        * ``cc`` - mean closeness centrality, rounded to 4 decimals
        * ``bc`` - mean betweenness centrality, rounded to 4 decimals
        * ``num_node`` - number of nodes
        * ``num_edge`` - number of edges
        * ``edge_connect`` - mean node degree (the name is historical; this is
          not the graph-theoretic edge connectivity)
        * ``node_connect`` - average node connectivity over all node pairs
        * ``mod`` - modularity of the label-propagation partition

        Values that are undefined for the given graph are returned as NaN:
        the three means for a graph without nodes, and the modularity for a
        graph without edges.
    """
    # number of nodes
    num_node = smile.number_of_nodes()

    # number of edges
    num_edge = smile.number_of_edges()

    # Mean node degree. Guarded so a graph without nodes gives NaN silently
    # instead of NaN plus a RuntimeWarning from np.mean([]).
    degrees = [degree for _, degree in nx.degree(smile)]
    edge_connect = np.mean(degrees) if degrees else np.nan

    # node connectivity of graph
    node_connect = nx.average_node_connectivity(smile)

    # Modularity of the label-propagation partition. networkx normalises by the
    # squared degree sum, so an edge-less graph (e.g. a single-atom molecule)
    # would raise ZeroDivisionError; report NaN for that case instead.
    if num_edge > 0:
        communities = nx.community.label_propagation_communities(smile)
        mod = nx.community.modularity(smile, communities)
    else:
        mod = np.nan

    # closeness centrality (mean over nodes)
    closeness = list(nx.closeness_centrality(smile).values())
    cc = np.mean(closeness) if closeness else np.nan

    # betweenness centrality (mean over nodes)
    betweenness = list(nx.betweenness_centrality(smile).values())
    bc = np.mean(betweenness) if betweenness else np.nan

    return [np.round(cc, 4), np.round(bc, 4), num_node, num_edge, edge_connect, node_connect, mod]

In [58]:
# Run the descriptor function over every molecular graph, preserving order,
# and collect one seven-element list per molecule.
graphs_parameter = list(map(get_graph_parameter, mol_graph))
graphs_parameter

[[0.1864,
  0.176,
  28,
  30,
  2.142857142857143,
  1.119047619047619,
  0.5044444444444445],
 [0.2935, 0.1696, 17, 17, 2.0, 1.0735294117647058, 0.4463667820069205],
 [0.2746,
  0.1848,
  17,
  18,
  2.1176470588235294,
  1.1838235294117647,
  0.43364197530864196],
 [0.2124, 0.204, 21, 21, 2.0, 1.0714285714285714, 0.4988662131519274],
 [0.2211,
  0.1935,
  21,
  24,
  2.2857142857142856,
  1.2619047619047619,
  0.4105902777777778],
 [0.2197, 0.1636, 25, 27, 2.16, 1.1866666666666668, 0.43895747599451296],
 [0.2263,
  0.2119,
  19,
  20,
  2.1052631578947367,
  1.1169590643274854,
  0.45375000000000004],
 [0.2108,
  0.1581,
  27,
  28,
  2.074074074074074,
  1.0854700854700854,
  0.548469387755102],
 [0.2752,
  0.1848,
  17,
  18,
  2.1176470588235294,
  1.1470588235294117,
  0.412037037037037],
 [0.2564,
  0.1924,
  18,
  19,
  2.111111111111111,
  1.1633986928104576,
  0.407202216066482],
 [0.3196, 0.1905, 14, 14, 2.0, 1.10989010989011, 0.39540816326530615],
 [0.1902,
  0.186,
  26,


In [66]:
# Column labels, in the same order as the values returned by get_graph_parameter.
columns = [
    "closeness_centrality",
    "betweeness_centrality",
    "number_of_nodes",
    "number_of_edges",
    "edge_connectivity",
    "node_connectivity",
    "modularity",
]

# One row per molecule, one column per descriptor.
params_df = pd.DataFrame(data=graphs_parameter, columns=columns)
params_df

Unnamed: 0,closeness_centrality,betweeness_centrality,number_of_nodes,number_of_edges,edge_connectivity,node_connectivity,modularity
0,0.1864,0.1760,28,30,2.142857,1.119048,0.504444
1,0.2935,0.1696,17,17,2.000000,1.073529,0.446367
2,0.2746,0.1848,17,18,2.117647,1.183824,0.433642
3,0.2124,0.2040,21,21,2.000000,1.071429,0.498866
4,0.2211,0.1935,21,24,2.285714,1.261905,0.410590
...,...,...,...,...,...,...,...
62082,0.1883,0.2062,24,26,2.166667,1.144928,0.496302
62083,0.2469,0.1875,19,21,2.210526,1.356725,0.397959
62084,0.2014,0.2165,21,24,2.285714,1.352381,0.448785
62085,0.2842,0.1663,18,19,2.111111,1.163399,0.401662


In [65]:
# Write the per-molecule descriptors to disk without the index column.
# to_csv does not create missing parent folders, so make sure the output
# directory exists first; otherwise a fresh checkout fails on this cell.
output_path = Path("Datas/graph_parameter.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
params_df.to_csv(output_path, index=False)