In [5]:
import numpy as np
from rdkit import Chem
from rdkit.Chem import Draw
from sklearn.utils import class_weight
from scipy.sparse import csr_matrix
import tensorflow as tf
# import tensorflow.keras as K

import kgcn
import kgcn.data_util
import kgcn.core
import kgcn.layers
from kgcn.data_util import dense_to_sparse
from kgcn.preprocessing import chem
from kgcn.preprocessing.utils import create_adjancy_matrix, create_feature_matrix
from kgcn.default_model import DefaultModel
from kgcn.gcn import get_default_config

In [56]:
# One-hot class label per molecule; entry i belongs to smiles[i].
labels = [[0,1],[1,0], [1,0], [0,1], [1,0], [0,1], [0,1], [0,1], [1,0], [1,0], [0,1], [1,0], [1,0], [1,0], [0,1],
          [0,1], [1,0], [0,1], [0,1],[1,0], [1,0], [1,0], [1,0], [1,0], [0,1], [1,0], [0,1], [1,0], [1,0], [0,1]]
# SMILES strings of the molecules. Entries containing a backslash (cis/trans
# bond markers) are raw strings so that "\C" / "\N" are not read as escapes.
smiles = ['COC1=CC(OC)=C(C=C1)N(CC(=O)NCCSC1=CC=CC=C1)S(=O)(=O)C1=CC=CC=C1',
          'FC1=C(SCC(=O)NC(=O)C2=C(Br)C=CC=C2)C=CC=C1',
          'O=C(CCC1=CC=CC=C1)NC1=CC2=C3N(CCCC3=C1)C(=O)CC2',
          r'COC1=NC=C(NC(=O)\C=C\C2=CC=CC(=C2)C(F)(F)F)C=C1',
          'NC(=O)C1CCN(CC(O)COCC2=C(Cl)C=CC=C2)CC1',
          'CCOC1=C(NC(=O)COC(=O)C2=C(C)OC(C)=C2)C=CC=C1',
          'COC1=C(Cl)SC=C1C(=O)NC1=C(OC)C=CC(OC)=C1',
          'CC(C)N1CNC(NC2=NC3=C(O2)C=CC=C3)=NC1',
          'NS(=O)(=O)C1=CC=C(NC(=O)COC(=O)C2=CC=CO2)C=C1',
          'CS(=O)(=O)CC1=CC=C(C=C1)C(=O)N1CCN(CC1)C(=O)C1=CC=CS1',
          'CCOC(=O)C1=CN=C2C=CC(C)=CC2=C1N(C)CC1=CC=CC=C1',
          'CNC(=S)NC1=CC=CC(=C1)C(=O)OC',
          'CC(NC(=O)CSC1=NNC=N1)C1=C(Cl)C=C(Cl)C=C1',
          'FC1=CC(F)=C(C=C1)S(=O)(=O)NC1=CC=CC=C1C(=O)NN1CCOCC1',
          'COC1=CC=C(C(OC)=C1C(=O)NC(=O)NC1=C(C)C=CC=C1C)[N+]([O-])=O',
          r'COC1=CC(\C=N\NC(=O)C2=CC=CC3=C2C=CC=C3)=CC(OC)=C1O',
          'COC1=C(CC(=O)NC2=NC(C)=CC=C2)C=CC=C1',
          'CC1=NN(C(C)=C1C(=O)OC1=CC=CC=C1)C1=CC=CC=C1',
          r'CCN(CC)C1=CC(O)=C(\C=N\NC(=O)C2=CC3=C(NC=N3)C=C2)C=C1',
          r'N(\N=C\C1=CC=CN=C1)C1=NC=CC=C1',
          'CC(C)N1CCN(CC1)C1=NC=C(C=C1)S(=O)(=O)N1CCOCC1',
          'CCCCOC1=C(Cl)C=C(CSC(N)=N)C=C1',
          'C=CCNC(=O)C(=O)NC1=CC2=C3N(CCCC3=C1)C(=O)CC2',
          'C(N1CCN(CC1)C1=NC2=C(C=CC=C2)N=C1)C1=CC=CC=C1',
          'CC1=CC(Cl)=C(C=C1)C1=NC(=NO1)C1=CC=NC=C1',
          'O=C(CNC(=O)C1=CC=CC=C1)NCC1=NC2=C(N1)C=CC=C2',
          'CC1=CC=CC(OCCSC2=NC3=C(N)N=CNC3=N2)=C1',
          'CC1=CC2=C(NC3=C2CCN2C(=O)OC(C)(C)C32C)C=C1',
          'CN1CCC(CC1)NC1=CC=C(Cl)C=C1',
          'CC1=CC=CC(=C1)C(=O)NC1=C(C(O)=O)C2=C(CCCC2)S1']
# Parse every SMILES string into an RDKit molecule object.
mol_obj_list = [Chem.MolFromSmiles(s) for s in smiles]
# MolFromSmiles returns None for input it cannot parse; stop here with a
# clear message instead of failing later inside the featurisation loop.
unparsed_smiles = [s for s, mol in zip(smiles, mol_obj_list) if mol is None]
if unparsed_smiles:
    raise ValueError(f"RDKit could not parse these SMILES: {unparsed_smiles}")
labels = np.array(labels, dtype=np.float16)
# Labels are looked up by molecule index downstream, so the counts must agree.
if len(labels) != len(mol_obj_list):
    raise ValueError(
        f"Got {len(labels)} labels for {len(mol_obj_list)} molecules; they must match."
    )
# Draw.MolsToGridImage(mol_obj_list, molsPerRow=6, subImgSize=(180, 180))

In [69]:
# NOTE(review): the recorded run of this cell failed with
# "ModuleNotFoundError: No module named 'dgl'", and `smi2infomax` is not
# referenced by any other visible cell -- install the missing dependency
# or drop this import so that Restart & Run All succeeds.
from fingerprints.infofunction import smi2infomax

ModuleNotFoundError: No module named 'dgl'

In [52]:
# labels = labels[:, 0]

In [57]:
# Per-label mask with the same shape as `labels`.
# NOTE(review): the mask is all zeros -- confirm this matches the value kGCN
# expects in "mask_label" for labels that should be used.
mask_labels = np.zeros_like(labels, dtype=np.float16)
# Node-count limit passed to create_feature_matrix and stored as "max_node_num".
atom_num_limit = 50

mol_list = []
adj_list = []
feature_list = []
mol_name_list = []
label_data_list = []
label_mask_list = []

for i, mol in enumerate(mol_obj_list):
    Chem.SanitizeMol(mol, sanitizeOps=Chem.SANITIZE_ADJUSTHS)
    mol_list.append(mol)
    name = f"index_{i}"
    mol_name_list.append(name)
    adj = create_adjancy_matrix(mol)
    feature = create_feature_matrix(mol, atom_num_limit)

    adj_list.append(dense_to_sparse(adj))
    feature_list.append(feature)
    label_data_list.append(labels[i])
    label_mask_list.append(mask_labels[i])

# Compute the balanced class weights once, after all labels are collected
# (previously this was recomputed on every loop iteration and only the last
# result was used).
label_int = np.argmax(label_data_list, axis=1)
cw = class_weight.compute_class_weight(class_weight="balanced",
                                       classes=np.unique(label_int),
                                       y=label_int)

# This dictionary is used as an input of kGCN
obj = {"feature": np.asarray(feature_list),
       # The per-molecule sparse adjacency entries differ in size, so ask for
       # an object array explicitly: NumPy warns about the implicit ragged
       # conversion and newer releases reject it.
       # NOTE(review): assumes dense_to_sparse returns variable-length
       # pieces per molecule -- confirm.
       "adj": np.asarray(adj_list, dtype=object),
       "label": np.asarray(label_data_list),
       "mask_label": np.asarray(label_mask_list),
       "label_dim": labels.shape[1],
       "label_sparse": csr_matrix(np.asarray(label_data_list).astype(float)),
       "mask_label_sparse": csr_matrix(np.asarray(label_mask_list).astype(float)),
       "max_node_num": atom_num_limit,
       "mol_info": {"obj_list": mol_list,
                    "name_list": mol_name_list},
       "class_weight": cw
       }

  "adj": np.asarray(adj_list),


In [65]:
# Show the per-molecule one-hot label rows that were collected into
# `label_data_list` while building the kGCN input.
label_data_list

[array([0., 1.], dtype=float16),
 array([1., 0.], dtype=float16),
 array([1., 0.], dtype=float16),
 array([0., 1.], dtype=float16),
 array([1., 0.], dtype=float16),
 array([0., 1.], dtype=float16),
 array([0., 1.], dtype=float16),
 array([0., 1.], dtype=float16),
 array([1., 0.], dtype=float16),
 array([1., 0.], dtype=float16),
 array([0., 1.], dtype=float16),
 array([1., 0.], dtype=float16),
 array([1., 0.], dtype=float16),
 array([1., 0.], dtype=float16),
 array([0., 1.], dtype=float16),
 array([0., 1.], dtype=float16),
 array([1., 0.], dtype=float16),
 array([0., 1.], dtype=float16),
 array([0., 1.], dtype=float16),
 array([1., 0.], dtype=float16),
 array([1., 0.], dtype=float16),
 array([1., 0.], dtype=float16),
 array([1., 0.], dtype=float16),
 array([1., 0.], dtype=float16),
 array([0., 1.], dtype=float16),
 array([1., 0.], dtype=float16),
 array([0., 1.], dtype=float16),
 array([1., 0.], dtype=float16),
 array([1., 0.], dtype=float16),
 array([0., 1.], dtype=float16)]

In [20]:
class NNModel(DefaultModel):
    """Graph classifier assembled from kGCN graph layers and Keras dense layers.

    Stack: GraphConv -> GraphBatchNormalization -> ReLU -> Dropout ->
    GraphDense -> GraphBatchNormalization -> GraphGather -> tanh ->
    Dense -> BatchNormalization -> ReLU -> Dense(label_dim), trained with a
    class-weighted softmax cross-entropy loss.
    """

    def build_placeholders(self, info, config, batch_size, **kwargs):
        """Request the input placeholders of this network.

        Delegates to ``self.get_placeholders`` with the placeholder names
        listed below and returns whatever that call returns.
        """
        # input data types (placeholders) of this neural network
        placeholder_names = [
            'adjs', 'nodes', 'labels', 'mask', 'dropout_rate',
            'enabled_node_nums', 'is_train', 'features',
        ]
        return self.get_placeholders(info, config, batch_size,
                                     placeholder_names, **kwargs)

    def build_model(self, placeholders, info, config, batch_size, **kwargs):
        """Build the network graph, the loss and the accuracy metric.

        Args:
            placeholders: mapping that provides the "adjs", "features",
                "labels", "mask", "enabled_node_nums" and "dropout_rate"
                tensors.
            info: dataset description; ``adj_channel_num``,
                ``graph_node_num``, ``label_dim`` and ``'class_weight'``
                are read from it.
            config: accepted for interface compatibility; not read here.
            batch_size: accepted for interface compatibility; not read here.

        Returns:
            Tuple ``(logits, predictions, loss, loss, metrics)``. The same
            summed, class-weighted loss tensor is returned in the third and
            fourth positions; ``metrics["correct_count"]`` holds the masked
            number of correct argmax predictions.
        """
        adj_channel_num = info.adj_channel_num
        in_adjs = placeholders["adjs"]
        features = placeholders["features"]
        labels = placeholders["labels"]
        mask = placeholders["mask"]
        enabled_node_nums = placeholders["enabled_node_nums"]
        dropout_rate = placeholders["dropout_rate"]

        # Keras layers are reached through `tf.keras`: the notebook never
        # binds the alias `K`, so `K.layers...` fails on a fresh kernel.
        keras_layers = tf.keras.layers

        layer = features
        # layer: batch_size x graph_node_num x dim
        # 1
        layer = kgcn.layers.GraphConv(
            128,
            adj_channel_num
            )(layer, adj=in_adjs)
        layer = kgcn.layers.GraphBatchNormalization()(
            layer,
            max_node_num=info.graph_node_num,
            enabled_node_nums=enabled_node_nums
            )
        layer = tf.nn.relu(layer)
        layer = keras_layers.Dropout(dropout_rate)(layer)
        # 2
        layer = kgcn.layers.GraphDense(128)(layer)
        layer = kgcn.layers.GraphBatchNormalization()(
            layer,
            max_node_num=info.graph_node_num,
            enabled_node_nums=enabled_node_nums
            )
        layer = kgcn.layers.GraphGather()(layer)
        layer = tf.nn.tanh(layer)
        # 3
        layer = keras_layers.Dense(128)(layer)
        layer = keras_layers.BatchNormalization()(layer)
        layer = tf.nn.relu(layer)
        logits = keras_layers.Dense(info.label_dim)(layer)
        # compute prediction
        predictions = tf.nn.softmax(logits)
        # compute loss
        labels = tf.cast(labels, dtype=tf.float32)
        cw = info['class_weight']
        # Per-sample weight: the class weight selected by the one-hot label
        # (rows whose label is all zeros get weight 0).
        w = tf.reduce_sum(cw * labels, axis=1)
        unweighted_cost = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits)
        weighted_cost = unweighted_cost * w
        loss_to_minimize = tf.reduce_sum(weighted_cost)
        # compute correct count
        metrics = {}
        correct_count = mask * tf.cast(
            tf.equal(tf.argmax(predictions, 1), tf.argmax(labels, 1)),
            tf.float32)
        metrics["correct_count"] = tf.reduce_sum(correct_count)
        return logits, predictions, loss_to_minimize, loss_to_minimize, metrics

In [21]:
# Options handed to kgcn.data_util.build_data together with `obj`.
config = {
    "with_feature": True,
    "with_node_embedding": False,
    "normalize_adj_flag": True,
    "split_adj_flag": False,
    "shuffle_data": False,
}
# build_data returns the training data plus the dataset description `info`.
train_data, info = kgcn.data_util.build_data(config, obj)

[OK] checking #graphs
The number of graphs                   =30
Dimension of a feature                 =81
The maximum number of nodes in a graph =50
The number of nodes in all graphs      =None
Dimension of a label                   =2
The number of adj. matrices in a graph =1



In [22]:
# TF1-compatibility session that is handed to the kGCN core model.
sess = tf.compat.v1.Session()

# Training settings for CoreModel. Note that this rebinds `config`; the
# data-building options defined earlier are no longer reachable by that name.
config = {
    "learning_rate": 0.01,
    "batch_size": 100,
    "param": None,
    "retrain": None,
    "save_model_path": "model",
    "epoch": 5,
    "profile": None,
    "dropout_rate": 0.0,
    "task": "classification",
    "save_interval": 2,
    "embedding_dim": 4,
}

# Wire the network definition into the core model.
model = kgcn.core.CoreModel(sess, config, info)
nn = NNModel()
model.build(nn)

2022-07-14 18:58:14.852676: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-07-14 18:58:14.852761: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [23]:
# Train on `train_data`; the value returned by fit() is shown as the cell
# output (in the recorded run: one dict of cost/accuracy entries per epoch).
# The recorded log reports "count=0" for validation, i.e. no validation
# samples were used in that run.
model.fit(train_data)

2022-07-14 18:58:29.447760: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-07-14 18:58:29.451300: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


#train data =  30


2022-07-14 18:58:30.216073: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-07-14 18:58:30.256252: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:model/model.00000.ckpt is not in all_model_checkpoint_paths. Manually adding it.


2022-07-14 18:58:38.315171: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


epoch 0, training cost 1.50941 (acc=0.566667), validation cost 0 (acc=0) (count=0) ([SAVE] model/model.00000.ckpt) 
[SAVE]  model/model.best.ckpt
INFO:tensorflow:model/model.best.ckpt is not in all_model_checkpoint_paths. Manually adding it.


2022-07-14 18:58:39.300658: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


epoch 1, training cost 1.55244 (acc=0.433333), validation cost 0 (acc=0) (count=0) 
INFO:tensorflow:model/model.00002.ckpt is not in all_model_checkpoint_paths. Manually adding it.


2022-07-14 18:58:39.728797: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


epoch 2, training cost 2.10685 (acc=0.566667), validation cost 0 (acc=0) (count=0) ([SAVE] model/model.00002.ckpt) 


2022-07-14 18:58:40.484099: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


epoch 3, training cost 1.0347 (acc=0.566667), validation cost 0 (acc=0) (count=0) 
INFO:tensorflow:model/model.00004.ckpt is not in all_model_checkpoint_paths. Manually adding it.


2022-07-14 18:58:40.885772: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


epoch 4, training cost 0.989067 (acc=0.433333), validation cost 0 (acc=0) (count=0) ([SAVE] model/model.00004.ckpt) 
[RESTORE]  model/model.best.ckpt
INFO:tensorflow:Restoring parameters from model/model.best.ckpt
[SAVE]  model/model.last.ckpt
INFO:tensorflow:model/model.last.ckpt is not in all_model_checkpoint_paths. Manually adding it.


2022-07-14 18:58:41.561880: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


[{'epoch': 0,
  'validation_cost': 0,
  'training_cost': 1.5094098409016927,
  'save_path': 'model/model.00000.ckpt',
  'validation_accuracy': 0,
  'training_correct_count': 17.0,
  'training_accuracy': 0.5666666666666667},
 {'epoch': 1,
  'validation_cost': 0,
  'training_cost': 1.5524372100830077,
  'save_path': None,
  'validation_accuracy': 0,
  'training_correct_count': 13.0,
  'training_accuracy': 0.43333333333333335},
 {'epoch': 2,
  'validation_cost': 0,
  'training_cost': 2.106853485107422,
  'save_path': 'model/model.00002.ckpt',
  'validation_accuracy': 0,
  'training_correct_count': 17.0,
  'training_accuracy': 0.5666666666666667},
 {'epoch': 3,
  'validation_cost': 0,
  'training_cost': 1.0346997578938801,
  'save_path': None,
  'validation_accuracy': 0,
  'training_correct_count': 17.0,
  'training_accuracy': 0.5666666666666667},
 {'epoch': 4,
  'validation_cost': 0,
  'training_cost': 0.9890669504801433,
  'save_path': 'model/model.00004.ckpt',
  'validation_accuracy': 0

In [24]:
# Run the trained model on the training data and print the raw result
# (in the recorded run: one two-element float32 array per molecule).
train_y = model.pred(train_data)
print(train_y)


2022-07-14 18:59:04.257927: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-07-14 18:59:04.297676: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


[array([0.04835534, 0.95164466], dtype=float32), array([0.06273767, 0.9372624 ], dtype=float32), array([0.05210143, 0.9478985 ], dtype=float32), array([0.055229  , 0.94477093], dtype=float32), array([0.03595885, 0.9640412 ], dtype=float32), array([0.03963828, 0.9603617 ], dtype=float32), array([0.03573409, 0.9642659 ], dtype=float32), array([0.03900567, 0.96099436], dtype=float32), array([0.04662781, 0.95337224], dtype=float32), array([0.04493654, 0.95506346], dtype=float32), array([0.05113424, 0.9488658 ], dtype=float32), array([0.05443053, 0.94556946], dtype=float32), array([0.04048838, 0.95951164], dtype=float32), array([0.03727497, 0.962725  ], dtype=float32), array([0.02980658, 0.97019345], dtype=float32), array([0.04865111, 0.95134884], dtype=float32), array([0.06055751, 0.9394425 ], dtype=float32), array([0.06278741, 0.9372125 ], dtype=float32), array([0.04400392, 0.95599604], dtype=float32), array([0.06848184, 0.9315182 ], dtype=float32), array([0.0218982, 0.9781018], dtype=flo

In [67]:
# Scratch check that a pair of values sums to ~1.
# NOTE(review): presumably one predicted probability pair copied from the
# model.pred output -- confirm, or replace with an assertion on `train_y`.
0.05766264+0.9423374

1.00000004