In [1]:
import pysmiles
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
from itertools import chain
from collections import Counter
import random
import logging

logging.disable(logging.CRITICAL)

In [2]:
df = pd.read_csv("C:/Users/Evint/Documents/Projects/Functional-Group-Analysis/250k_rndm_zinc_drugs_clean_3.csv")
df['smiles'] = df['smiles'].apply(lambda s: s.replace('\n', ''))
df.head()

Unnamed: 0,smiles,logP,qed,SAS
0,CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1,5.0506,0.702012,2.084095
1,C[C@@H]1CC(Nc2cncc(-c3nncn3C)c2)C[C@@H](C)C1,3.1137,0.928975,3.432004
2,N#Cc1ccc(-c2ccc(O[C@@H](C(=O)N3CCCC3)c3ccccc3)...,4.96778,0.599682,2.470633
3,CCOC(=O)[C@@H]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c...,4.00022,0.690944,2.822753
4,N#CC1=C(SCC(=O)Nc2cccc(Cl)c2)N=C([O-])[C@H](C#...,3.60956,0.789027,4.035182


In [3]:
df = df[df["SAS"] > 3.55]
smiles = df["smiles"].values
len(smiles)

62087

In [4]:
mol_graph = [pysmiles.read_smiles(smile, explicit_hydrogen=True, reinterpret_aromatic=True) for smile in smiles]
mol_graph

[<networkx.classes.graph.Graph at 0x2429d5c1d80>,
 <networkx.classes.graph.Graph at 0x2429d5c23e0>,
 <networkx.classes.graph.Graph at 0x2429d5c2f50>,
 <networkx.classes.graph.Graph at 0x2429d5c3970>,
 <networkx.classes.graph.Graph at 0x2429d5c1db0>,
 <networkx.classes.graph.Graph at 0x2429d5c1fc0>,
 <networkx.classes.graph.Graph at 0x2429d5c3790>,
 <networkx.classes.graph.Graph at 0x2429d5c3760>,
 <networkx.classes.graph.Graph at 0x2429d5c35b0>,
 <networkx.classes.graph.Graph at 0x2429d5c34c0>,
 <networkx.classes.graph.Graph at 0x2429d5c3340>,
 <networkx.classes.graph.Graph at 0x2429d5c3250>,
 <networkx.classes.graph.Graph at 0x2429d5c3220>,
 <networkx.classes.graph.Graph at 0x2429d5c3070>,
 <networkx.classes.graph.Graph at 0x2429d5c3040>,
 <networkx.classes.graph.Graph at 0x2429d5c2ef0>,
 <networkx.classes.graph.Graph at 0x2429d5c3b20>,
 <networkx.classes.graph.Graph at 0x2429d5c3c10>,
 <networkx.classes.graph.Graph at 0x242f434fa30>,
 <networkx.classes.graph.Graph at 0x242f434f9a0>,


In [5]:
smile = mol_graph[0]
elements = smile.nodes(data='element')
element_list = list([j for i, j in elements])
element_list.count('C')

20

In [6]:
element_list_mols = [smile.nodes(data='element') for smile in mol_graph]
all_elements = np.unique([x for y in [list([j for i, j in elements]) for elements in element_list_mols] for x in y])
all_elements

array(['Br', 'C', 'Cl', 'F', 'H', 'I', 'N', 'O', 'P', 'S'], dtype='<U2')

In [7]:
list_element = list()
for smile in mol_graph:
    elements = smile.nodes(data='element')
    element_list = list([j for i, j in elements])
    list_num_element = [element_list.count(i) for i in all_elements]
    list_element.append(list_num_element)
list_element

[[0, 20, 1, 0, 18, 0, 4, 2, 0, 1],
 [1, 13, 0, 0, 24, 0, 1, 1, 0, 1],
 [1, 12, 0, 0, 20, 0, 3, 1, 0, 0],
 [0, 16, 0, 2, 25, 0, 2, 1, 0, 0],
 [0, 18, 0, 0, 17, 0, 3, 0, 0, 0],
 [0, 19, 0, 0, 27, 0, 4, 2, 0, 0],
 [0, 14, 0, 0, 24, 0, 2, 2, 0, 1],
 [1, 16, 1, 3, 13, 0, 3, 3, 0, 0],
 [0, 11, 0, 0, 20, 0, 3, 3, 0, 0],
 [0, 12, 0, 0, 25, 0, 4, 2, 0, 0],
 [0, 8, 0, 0, 12, 0, 0, 6, 0, 0],
 [0, 21, 0, 0, 24, 0, 3, 2, 0, 0],
 [0, 19, 0, 2, 23, 0, 3, 1, 0, 0],
 [0, 15, 0, 0, 28, 0, 2, 0, 0, 1],
 [0, 10, 0, 0, 19, 0, 2, 3, 0, 1],
 [0, 14, 0, 0, 29, 0, 3, 1, 0, 0],
 [0, 24, 0, 0, 31, 0, 6, 2, 0, 0],
 [0, 14, 0, 1, 20, 0, 2, 1, 0, 0],
 [0, 13, 0, 0, 24, 0, 4, 2, 0, 0],
 [0, 22, 0, 3, 28, 0, 4, 3, 0, 0],
 [0, 11, 0, 0, 16, 0, 2, 2, 0, 0],
 [0, 23, 0, 0, 33, 0, 2, 2, 0, 0],
 [0, 14, 0, 0, 26, 0, 2, 1, 0, 1],
 [0, 20, 1, 0, 15, 0, 6, 1, 0, 1],
 [0, 16, 0, 0, 22, 0, 2, 3, 0, 2],
 [0, 19, 0, 0, 29, 0, 3, 3, 0, 0],
 [0, 16, 0, 0, 23, 0, 4, 1, 0, 0],
 [0, 9, 1, 0, 16, 0, 1, 0, 0, 1],
 [0, 19, 0, 0, 24, 0, 

In [8]:
pd.DataFrame(list_element, columns = all_elements).to_csv("Datas/clear_encoded_data.csv", index=False)