In [2]:
import pandas as pd
import os
import numpy as np

os.chdir('C:\Programming\Github\Project\ChemGCNs\datasets')

# read_dataset

In [3]:
data = pd.read_csv('esol.csv')
data

Unnamed: 0,smiles,logp
0,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...,-0.770
1,Cc1occc1C(=O)Nc2ccccc2,-3.300
2,CC(C)=CCCC(C)=CC(=O),-2.060
3,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43,-7.870
4,c1ccsc1,-1.330
...,...,...
1123,FC(F)(F)C(Cl)Br,-1.710
1124,CNC(=O)ON=C(SC)C(=O)N(C)C,0.106
1125,CCSCCSP(=S)(OC)OC,-3.091
1126,CCC(C)C,-3.180


In [4]:
data_mat = np.array(data)
data_mat.shape
data_mat

array([['OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)C(O)C3O', -0.77],
       ['Cc1occc1C(=O)Nc2ccccc2', -3.3],
       ['CC(C)=CCCC(C)=CC(=O)', -2.06],
       ...,
       ['CCSCCSP(=S)(OC)OC', -3.091],
       ['CCC(C)C', -3.18],
       ['COP(=O)(OC)OC(=CCl)c1cc(Cl)c(Cl)cc1Cl', -4.522]], dtype=object)

In [5]:
target = np.array(data_mat[:, 1], dtype = float) # 기존은 1:3
target

array([-0.77 , -3.3  , -2.06 , ..., -3.091, -3.18 , -4.522])

# smiles_to_mol_graph

In [6]:
from rdkit import Chem

In [7]:
smiles = data_mat[:, 0]

In [8]:
mol = Chem.MolFromSmiles(smiles[0])
adj_mat = Chem.GetAdjacencyMatrix(mol)
adj_mat

array([[0, 1, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 1, 0]], dtype=int32)

In [9]:
mol.GetNumAtoms()

32

# read_atom_prop()

In [10]:
from mendeleev.fetch import fetch_table

# elments table 불러오기
tb_atomic_props = fetch_table('elements')
# elements 개수 array로 저장
sel_prop_names = ['atomic_weight',
                'atomic_radius',
                'atomic_volume',
                'dipole_polarizability',
                'fusion_heat',
                'thermal_conductivity',
                'vdw_radius',
                'en_pauling']
arr_atomic_props = np.nan_to_num(np.array(tb_atomic_props[sel_prop_names], dtype=float))
arr_atomic_props.shape
arr_atomic_nums = np.array(tb_atomic_props['atomic_number'], dtype=int)
arr_atomic_nums

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118])

In [11]:
# utils
# 각 feature들을 평균 0, 표준편차 1로 normalize
def zscore(X):
    if len(X.shape) == 1:
        means = np.mean(X)
        stds = np.std(X)

        for i in range(0, X.shape[0]):
            if stds == 0:
                X[i] = 0
            else:
                X[i] = (X[i] - means) / stds
    else:
        means = np.mean(X, axis=0)
        stds = np.std(X, axis=0)

        for i in range(0, X.shape[0]):
            for j in range(0, X.shape[1]):
                if stds[j] == 0:
                    X[i, j] = 0
                else:
                    X[i, j] = (X[i, j] - means[j]) / stds[j]

    return X

In [12]:
# 각 feature들을 평균 0, 표준편차 1로 normalize
arr_atomic_props = zscore(arr_atomic_props)

In [13]:
# 각 원자 1~118개에 대한 8개에 대한 property (표준화된)
atomic_props_mat = {arr_atomic_nums[i]: arr_atomic_props[i, :] for i in range(0, arr_atomic_nums.shape[0])}
atomic_props_mat

{1: array([-1.62958442, -1.22934387,  0.02130638, -1.12744854, -0.70088858,
        -0.50485004, -0.99842165,  0.98992771]),
 2: array([-1.59603634,  0.07864254,  1.47272519, -1.16649754, -0.7096735 ,
        -0.50524731, -0.62045003, -1.32588047]),
 3: array([-1.56312912,  0.42284949, -0.06069469,  0.86797614, -0.49267844,
         0.6346863 , -0.09128976, -0.29429319]),
 4: array([-1.53991476, -0.12788163, -0.72490329, -0.71196302,  0.20711187,
         2.19952286, -0.45666233,  0.32676446]),
 5: array([-1.51977409, -0.40324719, -0.75770372, -0.92750162,  1.06232771,
        -0.13830525,  0.03470078,  0.8215053 ]),
 6: array([-1.50631947, -0.60977136, -0.70030298, -1.04252221, -0.7096735 ,
        -0.48588212, -0.24247841,  1.35835174]),
 7: array([-1.48395858, -0.67861275,  0.28370978, -1.09128094, -0.7096735 ,
        -0.50694412, -0.43146422,  1.87414539]),
 8: array([-1.46164251, -0.74745414,  0.01310627, -1.11753564, -0.7096735 ,
        -0.50693065, -0.46926138,  2.29520142]),


In [19]:
def read_atom_prop():
#    tb_atomic_props = get_table('elements')
    tb_atomic_props = fetch_table('elements')
#    arr_atomic_nums = np.array(tb_atomic_props['atomic_number'], dtype=np.int)
    arr_atomic_nums = np.array(tb_atomic_props['atomic_number'], dtype=int)
#    arr_atomic_props = np.nan_to_num(np.array(tb_atomic_props[sel_prop_names], dtype=np.float32))
    arr_atomic_props = np.nan_to_num(np.array(tb_atomic_props[sel_prop_names], dtype=float))
    arr_atomic_props = zscore(arr_atomic_props)
    atomic_props_mat = {arr_atomic_nums[i]: arr_atomic_props[i, :] for i in range(0, arr_atomic_nums.shape[0])}

    return atomic_props_mat

atomic_props = read_atom_prop()
len(next(iter(atomic_props.values())))

8

In [None]:
# 원자번호가 1(수소)에 대한 property
atomic_props.get(1)

array([-1.62958442, -1.22934387,  0.02130638, -1.12744854, -0.70088858,
       -0.50485004, -0.99842165,  0.98992771])

In [None]:
node_feat_mat = np.empty([mol.GetNumAtoms(), atomic_props.get(1).shape[0]])  # np.zeros해도 될거같은
node_feat_mat

array([[1.47325736e-311, 1.47325736e-311, 1.47325736e-311,
        1.47325736e-311, 1.47325736e-311, 5.43472210e-323,
        0.00000000e+000, 3.95252517e-323],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.0000000

In [87]:
for atom in mol.GetAtoms():
    print(atom.GetNeighbors())

(<rdkit.Chem.rdchem.Atom object at 0x000002B6478455F0>,)
(<rdkit.Chem.rdchem.Atom object at 0x000002B6478454A0>, <rdkit.Chem.rdchem.Atom object at 0x000002B647856CF0>)
(<rdkit.Chem.rdchem.Atom object at 0x000002B6478455F0>, <rdkit.Chem.rdchem.Atom object at 0x000002B647856CF0>, <rdkit.Chem.rdchem.Atom object at 0x000002B647856D60>)
(<rdkit.Chem.rdchem.Atom object at 0x000002B6478454A0>, <rdkit.Chem.rdchem.Atom object at 0x000002B647856CF0>)
(<rdkit.Chem.rdchem.Atom object at 0x000002B6478455F0>, <rdkit.Chem.rdchem.Atom object at 0x000002B647856CF0>, <rdkit.Chem.rdchem.Atom object at 0x000002B647856D60>)
(<rdkit.Chem.rdchem.Atom object at 0x000002B6478454A0>, <rdkit.Chem.rdchem.Atom object at 0x000002B647856CF0>)
(<rdkit.Chem.rdchem.Atom object at 0x000002B6478455F0>, <rdkit.Chem.rdchem.Atom object at 0x000002B647856CF0>)
(<rdkit.Chem.rdchem.Atom object at 0x000002B6478454A0>, <rdkit.Chem.rdchem.Atom object at 0x000002B647856CF0>, <rdkit.Chem.rdchem.Atom object at 0x000002B647856D60>)
(