In [1]:
import numpy as np
import pandas as pd
from os.path import join
from sklearn.decomposition import PCA

import xgboost as xgb
from tensorflow.keras import backend as K
import warnings
warnings.filterwarnings('ignore')

from directory_infomation import *
from functions_and_dicts_data_preprocessing_GNN import *
from build_GNN import *

#### Load model with trained parameters:

In [2]:
# Model parameters
N = 70        # maximum number of nodes
F1 = 32         # feature dimensionality of atoms
F2 = 10         # feature dimensionality of bonds
F = F1+F2

class Linear(layers.Layer):

    def __init__(self, dim=(1,1,42,64)):
        super(Linear, self).__init__()
        w_init = tf.random_normal_initializer()
        self.w = tf.Variable(initial_value = w_init(shape=(dim),
                                                  dtype='float32'),
                             trainable=True)
    def call(self, inputs):
        return tf.matmul(inputs, self.w)
    
    
class Linear_with_bias(layers.Layer):

    def __init__(self, dim):
        super(Linear_with_bias, self).__init__()
        w_init = tf.random_normal_initializer()
        b_init = tf.constant_initializer(0.1)
        self.w = tf.Variable(initial_value = w_init(shape=(dim),
                                                  dtype='float32'),
                             trainable=True)
        self.b = tf.Variable(initial_value = b_init(shape=[self.w.shape[-1]], dtype='float32'), trainable=True)
        
    def call(self, inputs):
        return tf.math.add(tf.matmul(inputs, self.w), self.b)


def DMPNN(l2_reg_conv, l2_reg_fc, learning_rate, D, N, F1, F2, F, drop_rate = 0.15, ada_rho = 0.95):

    # Model definition
    XE_in = Input(shape=(N, N, F), name = "XE", dtype='float32')
    X_in = Input(shape=(N, F1), dtype='float32')
    Extras_in = Input((2), name ="Extras", dtype='float32')
    Unirep_in = Input((20), name ="Unirep", dtype='float32')

    X = tf.reshape(X_in, (-1, N, 1, F1))
    A_in = Input((N, N, 1),name ="A", dtype='float32') # 64 copies of A stacked behind each other
    Wi = Linear((1,1,F,D))
    Wm1 = Linear((1,1,D,D))
    Wm2= Linear((1,1,D,D))
    Wa = Linear((1,D+F1,D))

    W_fc1 = Linear_with_bias([D + 2, 32])
    W_fc2 = Linear_with_bias([32, 16])
    W_fc3=  Linear_with_bias([16, 1])

    OnesN_N = tf.ones((N,N))
    Ones1_N = tf.ones((1,N))

    H0 = relu(Wi(XE_in)) #W*XE

    #only get neighbors in each row: (elementwise multiplication)
    M1 = tf.multiply(H0, A_in)
    M1 = tf.transpose(M1, perm =[0,2,1,3])
    M1 = tf.matmul(OnesN_N, M1)
    M1 = add(inputs= [M1,-tf.transpose(H0, perm =[0,2,1,3])])
    M1 = tf.multiply(M1, A_in)
    H1 = add(inputs = [H0, Wm1(M1)])
    H1 = relu(BatchNormalization(momentum=0.90, trainable=True)(H1))

    M2 = tf.multiply(H1, A_in)
    M2 = tf.transpose(M2, perm =[0,2,1,3])
    M2 = tf.matmul(OnesN_N, M2)
    M2 = add(inputs= [M2,-tf.transpose(H1, perm =[0,2,1,3])])
    M2 = tf.multiply(M2, A_in)
    H2 = add(inputs = [H0, Wm2(M2)]) 
    H2 = relu(BatchNormalization(momentum=0.90, trainable=True)(H2))
    
    M_v = tf.multiply(H2, A_in)
    M_v = tf.matmul(Ones1_N, M_v)
    XM = Concatenate()(inputs= [X, M_v])
    H = relu(Wa(XM))
    h = tf.matmul(Ones1_N, tf.transpose(H, perm= [0,2,1,3]))
    h = tf.reshape(h, (-1,D))
    h_extras = Concatenate()(inputs= [h, Extras_in])
    h_extras = BatchNormalization(momentum=0.90, trainable=True)(h_extras)

    fc1 = relu(W_fc1(h_extras))
    fc1 = BatchNormalization(momentum=0.90, trainable=True)(fc1)
    fc1 = Dropout(drop_rate)(fc1)

    fc2 =relu(W_fc2(fc1))
    fc2 = BatchNormalization(momentum=0.90, trainable=True)(fc2)

    output = W_fc3(fc2)
    
    def total_loss(y_true, y_pred):
        reg_conv_loss = (tf.nn.l2_loss(Wi.w) + tf.nn.l2_loss(Wm1.w)+ tf.nn.l2_loss(Wm2.w) + tf.nn.l2_loss(Wa.w))
        reg_fc_loss = (tf.nn.l2_loss(W_fc1.w) +tf.nn.l2_loss(W_fc2.w) +tf.nn.l2_loss(W_fc3.w))
        mse_loss = tf.keras.losses.MSE(y_true, y_pred)
        return(tf.reduce_mean(mse_loss + l2_reg_conv * reg_conv_loss + l2_reg_fc * reg_fc_loss))

    # Build model
    model = Model(inputs=[XE_in, X_in, A_in, Extras_in, Unirep_in], outputs=output)

    #tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, H1_batch.updates)
    optimizer = Adadelta(lr=learning_rate, rho = ada_rho)

    model.compile(optimizer=optimizer, loss=total_loss, metrics=['mse', "mae"])
    return(model)


batch_size =32
D = 50
learning_rate = 0.05
epochs = 50
l2_reg_fc = 0.01
l2_reg_conv = 0.01
rho = 0.99 


model = DMPNN(l2_reg_conv = l2_reg_conv, l2_reg_fc = l2_reg_fc, learning_rate = learning_rate,
                  D = D, N = N, F1 = F1, F2 = F2, F= F, drop_rate = 0.0, ada_rho = rho)
model.load_weights(join(datasets_dir, "model_weights", "saved_model_GNN_best_hyperparameters"))


Two checkpoint references resolved to different objects (<__main__.Linear_with_bias object at 0x0000024E13186588> and <tensorflow.python.keras.engine.input_layer.InputLayer object at 0x0000024E0CBF7688>).


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x24e13801648>

#### Create new model which can process molecules with up to 250 atoms (instead of just 70) and use the learned weights from the model above:

In [3]:
# Model parameters
N = 100        # maximum number of nodes
F1 = 32         # feature dimensionality of atoms
F2 = 10         # feature dimensionality of bonds
F = F1+F2

class Linear(layers.Layer):

    def __init__(self, dim=(1,1,42,64)):
        super(Linear, self).__init__()
        w_init = tf.random_normal_initializer()
        self.w = tf.Variable(initial_value = w_init(shape=(dim),
                                                  dtype='float32'),
                             trainable=True)
    def call(self, inputs):
        return tf.matmul(inputs, self.w)
    
    
class Linear_with_bias(layers.Layer):

    def __init__(self, dim):
        super(Linear_with_bias, self).__init__()
        w_init = tf.random_normal_initializer()
        b_init = tf.constant_initializer(0.1)
        self.w = tf.Variable(initial_value = w_init(shape=(dim),
                                                  dtype='float32'),
                             trainable=True)
        self.b = tf.Variable(initial_value = b_init(shape=[self.w.shape[-1]], dtype='float32'), trainable=True)
        
    def call(self, inputs):
        return tf.math.add(tf.matmul(inputs, self.w), self.b)


def DMPNN(l2_reg_conv, l2_reg_fc, learning_rate, D, N, F1, F2, F, drop_rate = 0.15, ada_rho = 0.95):

    # Model definition
    XE_in = Input(shape=(N, N, F), name = "XE", dtype='float32')
    X_in = Input(shape=(N, F1), dtype='float32')
    Extras_in = Input((2), name ="Extras", dtype='float32')
    Unirep_in = Input((20), name ="Unirep", dtype='float32')

    X = tf.reshape(X_in, (-1, N, 1, F1))
    A_in = Input((N, N, 1),name ="A", dtype='float32') # 64 copies of A stacked behind each other
    Wi = Linear((1,1,F,D))
    Wm1 = Linear((1,1,D,D))
    Wm2= Linear((1,1,D,D))
    Wa = Linear((1,D+F1,D))

    W_fc1 = Linear_with_bias([D + 2, 32])
    W_fc2 = Linear_with_bias([32, 16])
    W_fc3=  Linear_with_bias([16, 1])

    OnesN_N = tf.ones((N,N))
    Ones1_N = tf.ones((1,N))

    H0 = relu(Wi(XE_in)) #W*XE

    #only get neighbors in each row: (elementwise multiplication)
    M1 = tf.multiply(H0, A_in)
    M1 = tf.transpose(M1, perm =[0,2,1,3])
    M1 = tf.matmul(OnesN_N, M1)
    M1 = add(inputs= [M1,-tf.transpose(H0, perm =[0,2,1,3])])
    M1 = tf.multiply(M1, A_in)
    H1 = add(inputs = [H0, Wm1(M1)])
    H1 = relu(BatchNormalization(momentum=0.90, trainable=True)(H1))

    M2 = tf.multiply(H1, A_in)
    M2 = tf.transpose(M2, perm =[0,2,1,3])
    M2 = tf.matmul(OnesN_N, M2)
    M2 = add(inputs= [M2,-tf.transpose(H1, perm =[0,2,1,3])])
    M2 = tf.multiply(M2, A_in)
    H2 = add(inputs = [H0, Wm2(M2)]) 
    H2 = relu(BatchNormalization(momentum=0.90, trainable=True)(H2))
    
    M_v = tf.multiply(H2, A_in)
    M_v = tf.matmul(Ones1_N, M_v)
    XM = Concatenate()(inputs= [X, M_v])
    H = relu(Wa(XM))
    h = tf.matmul(Ones1_N, tf.transpose(H, perm= [0,2,1,3]))
    h = tf.reshape(h, (-1,D))
    h_extras = Concatenate()(inputs= [h, Extras_in])
    h_extras = BatchNormalization(momentum=0.90, trainable=True)(h_extras)

    fc1 = relu(W_fc1(h_extras))
    fc1 = BatchNormalization(momentum=0.90, trainable=True)(fc1)
    fc1 = Dropout(drop_rate)(fc1)

    fc2 =relu(W_fc2(fc1))
    fc2 = BatchNormalization(momentum=0.90, trainable=True)(fc2)

    output = W_fc3(fc2)
    
    def total_loss(y_true, y_pred):
        reg_conv_loss = (tf.nn.l2_loss(Wi.w) + tf.nn.l2_loss(Wm1.w)+ tf.nn.l2_loss(Wm2.w) + tf.nn.l2_loss(Wa.w))
        reg_fc_loss = (tf.nn.l2_loss(W_fc1.w) +tf.nn.l2_loss(W_fc2.w) +tf.nn.l2_loss(W_fc3.w))
        mse_loss = tf.keras.losses.MSE(y_true, y_pred)
        return(tf.reduce_mean(mse_loss + l2_reg_conv * reg_conv_loss + l2_reg_fc * reg_fc_loss))

    # Build model
    model = Model(inputs=[XE_in, X_in, A_in, Extras_in, Unirep_in], outputs=output)

    #tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, H1_batch.updates)
    optimizer = Adadelta(lr=learning_rate, rho = ada_rho)

    model.compile(optimizer=optimizer, loss=total_loss, metrics=['mse', "mae"])
    return(model)


model_250 = DMPNN(l2_reg_conv = l2_reg_conv, l2_reg_fc = l2_reg_fc, learning_rate = learning_rate,
                  D = D, N = 250, F1 = F1, F2 = F2, F= F, drop_rate = 0.0, ada_rho = rho)
model_250.set_weights = model.weights

#### Calculating fingerprints:

In [5]:
df_subs = pd.read_pickle(join(datasets_dir, "BiGG_data", "df_subs.pkl"))
df_subs_avail = df_subs.loc[df_subs["substrate_available"] == True]
df_subs

Unnamed: 0,metanetx ID,KEGG ID,SMILES,substrate_available
0,MNXM3,C00002,,True
1,MNXM89621,C05345,,True
2,MNXM12,C00010,,True
3,MNXM23,C00022,,True
4,MNXM160,C00092,,True
...,...,...,...,...
4451,,C00479,,False
4452,,C00163,,False
4453,,C00100,,False
4454,,C06044,,False


In [9]:
get_fingerprint_fct = K.function([[model_250.layers[0].input, model_250.layers[26].input,
                                  model_250.layers[3].input, model_250.layers[36].input]],
                                  [model_250.layers[-10].output])

def get_substrate_representations(df):
    df["GNN FP"] = ""
    i = 0
    n = len(df)
    UniRep = np.zeros((64, 20))
    cid_all = list(df["metanetx ID"])
    
    while i*64 <= n:
        if (i+1)*64  <= n:
            XE, X, A, extras = get_representation_input(cid_all[i*64:(i+1)*64])
            representations = get_fingerprint_fct([np.array(XE), np.array(X),np.array(A),
                                                   np.array(extras)])[0]
            df["GNN FP"][i*64:(i+1)*64] = list(representations[:, :52])
        else:
            print(i)
            XE, X, A, extras = get_representation_input(cid_all[-64:])
            representations = get_fingerprint_fct([np.array(XE), np.array(X),np.array(A),
                                                   np.array(extras)])[0]
            df["GNN FP"][-64:] = list(representations[:, :52])
        i += 1
        
    return(df)

input_data_folder = join(datasets_dir, "Bigg_data", "input_data")        
def get_representation_input(cid_list):
    XE = ();
    X = ();
    A = ();
    UniRep = ();
    extras = ();
    # Generate data
    for cid in cid_list:
        X = X + (np.load(join(input_data_folder, cid + '_X.npy')), );
        XE = XE + (np.load(join(input_data_folder, cid + '_XE.npy')), );
        A = A + (np.load(join(input_data_folder, cid + '_A.npy')), );
        extras =  extras + (np.load(join(input_data_folder, cid + '_extras.npy')), );
    return(XE, X, A, extras)

In [10]:
df_subs_avail = get_substrate_representations(df = df_subs_avail)

53


#### Merge df_subs and df_subs_avail:

In [11]:
df_subs_avail = df_subs_avail.drop(columns = ["SMILES", "KEGG ID", "substrate_available"])

Unnamed: 0,metanetx ID,KEGG ID,SMILES,substrate_available,GNN FP
0,MNXM3,C00002,,True,"[3.5004892, 0.0, 0.42342216, 0.023474924, 0.26..."
1,MNXM89621,C05345,,True,"[1.8386872, 0.0, 0.0, 0.007824975, 0.0, 1.4801..."
2,MNXM12,C00010,,True,"[4.6419253, 0.0, 0.42342216, 0.039124876, 0.27..."
3,MNXM23,C00022,,True,"[0.5204832, 0.0, 0.0, 0.01564995, 0.0, 0.22595..."
4,MNXM160,C00092,,True,"[1.9818256, 0.0, 0.0, 0.007824975, 0.0, 1.2651..."
...,...,...,...,...,...
624789,,C00479,,False,
624790,,C00163,,False,
624791,,C00100,,False,
624792,,C06044,,False,


In [13]:
df_subs_avail.to_pickle(join(datasets_dir, "BiGG_data", "df_subs_avail.pkl"))