In [None]:
import os

import pandas as pd
import numpy as np
import math

from rdkit import Chem
from rdkit.Chem import Descriptors


import tensorflow as tf
import keras
from keras.layers import *
from keras.regularizers import *
import keras.backend as K
from keras.models import Model
from keras.models import Sequential
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, Callback
from keras.callbacks import EarlyStopping
from keras import metrics


# os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# config = tf.ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 0.3
# session = tf.Session(config=config)
# K.set_session(session)

In [None]:
def split_dataset(df):
    """Separate the fingerprint columns of ``df`` from all other columns.

    Columns whose name matches the regex ``'fp'`` are returned as the
    fingerprint frame; every remaining column is returned as the label frame.

    Returns:
        (x_fp, y_label): two DataFrames with the same rows as ``df``.
    """
    fingerprint_frame = df.filter(regex='fp')
    # Drop the fingerprint columns by name to obtain everything else.
    fingerprint_names = list(fingerprint_frame.columns)
    label_frame = df.drop(fingerprint_names, axis=1)

    return fingerprint_frame, label_frame

def split_train_test(x, train_frac=0.8, random_state=None):
    """Shuffle the rows of ``x`` and split them into train and test partitions.

    Args:
        x: DataFrame to split row-wise.
        train_frac: fraction of rows placed in the training partition.
            Defaults to 0.8, the value that used to be hard-coded.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. ``None`` keeps the shuffle unseeded.

    Returns:
        (train, test): the first ``round(n_rows * train_frac)`` shuffled rows
        and the remaining shuffled rows.

    Raises:
        ValueError: if ``train_frac`` is not within [0, 1].
    """
    if not 0 <= train_frac <= 1:
        raise ValueError('train_frac must be between 0 and 1, got %r' % (train_frac,))

    # frac=1 returns every row in random order, i.e. a full shuffle.
    shuffled = x.sample(frac=1, random_state=random_state)

    n_train = round(shuffled.shape[0] * train_frac)
    print(n_train)

    return shuffled.iloc[0:n_train, :], shuffled.iloc[n_train:, :]

def split_dataset_descriptor(df, n_desc=100):
    """Split ``df`` into descriptor features and labels, discarding fingerprints.

    Columns whose name matches the regex ``'fp'`` are removed first. Of the
    remaining columns, the first ``n_desc`` (by position) are returned as the
    descriptor frame and everything after them as the label frame.

    Args:
        df: input DataFrame.
        n_desc: number of leading non-fingerprint columns treated as
            descriptors. Defaults to 100, the value that used to be hard-coded.

    Returns:
        (x_desc, y_label): two DataFrames with the same rows as ``df``.

    Raises:
        ValueError: if ``n_desc`` is negative.
    """
    if n_desc < 0:
        raise ValueError('n_desc must be non-negative, got %r' % (n_desc,))

    fingerprint_names = list(df.filter(regex='fp').columns)
    non_fingerprint = df.drop(fingerprint_names, axis=1)

    x_desc = non_fingerprint.iloc[:, 0:n_desc]
    y_label = non_fingerprint.iloc[:, n_desc:]

    return x_desc, y_label

def split_dataset_descriptor_both(df, n_desc=100):
    """Split ``df`` into fingerprint features, descriptor features and labels.

    Columns whose name matches the regex ``'fp'`` form the fingerprint frame.
    Of the remaining columns, the first ``n_desc`` (by position) are returned
    as the descriptor frame and everything after them as the label frame.

    Args:
        df: input DataFrame.
        n_desc: number of leading non-fingerprint columns treated as
            descriptors. Defaults to 100, the value that used to be hard-coded.

    Returns:
        (x_fp, x_desc, y_label): three DataFrames with the same rows as ``df``.

    Raises:
        ValueError: if ``n_desc`` is negative.
    """
    if n_desc < 0:
        raise ValueError('n_desc must be non-negative, got %r' % (n_desc,))

    x_fp = df.filter(regex='fp')
    non_fingerprint = df.drop(list(x_fp.columns), axis=1)

    x_desc = non_fingerprint.iloc[:, 0:n_desc]
    y_label = non_fingerprint.iloc[:, n_desc:]

    return x_fp, x_desc, y_label

In [None]:
###################################
#    Pearson's correlation metric
###################################
def tf_pearson(y_true, y_pred):
    """Streaming Pearson correlation between predictions and targets.

    ``tf.contrib.metrics.streaming_pearson_correlation`` returns a pair; this
    function returns the second element of that pair (the update op in the
    TensorFlow 1.x API), called with predictions first and labels second.

    NOTE(review): ``tf.contrib`` only exists in TensorFlow 1.x.
    """
    _, second_element = tf.contrib.metrics.streaming_pearson_correlation(y_pred, y_true)
    return second_element


###################################
#    Cosine annealing scheduler
#    https://github.com/4uiiurz1/keras-cosine-annealing
###################################
class CosineAnnealingScheduler(Callback):
    """Keras callback that sets the learning rate on a cosine curve each epoch.

    At the start of epoch ``t`` the optimizer learning rate is set to
    ``eta_min + (eta_max - eta_min) * (1 + cos(pi * t / T_max)) / 2``.

    Args:
        T_max: divisor of the epoch index inside the cosine term.
        eta_max: learning rate at epoch 0.
        eta_min: learning rate when the cosine term reaches -1 (default 0).
        verbose: when greater than 0, print the learning rate at each epoch.
    """

    def __init__(self, T_max, eta_max, eta_min=0, verbose=0):
        super(CosineAnnealingScheduler, self).__init__()
        self.T_max, self.eta_max = T_max, eta_max
        self.eta_min, self.verbose = eta_min, verbose

    def on_epoch_begin(self, epoch, logs=None):
        """Compute the scheduled learning rate and write it to the optimizer."""
        optimizer = self.model.optimizer
        if not hasattr(optimizer, 'lr'):
            raise ValueError('Optimizer must have a "lr" attribute.')

        span = self.eta_max - self.eta_min
        lr = self.eta_min + span * (1 + math.cos(math.pi * epoch / self.T_max)) / 2
        K.set_value(optimizer.lr, lr)

        if self.verbose > 0:
            message = '\nEpoch %05d: CosineAnnealingScheduler setting learning rate to %s.'
            print(message % (epoch + 1, lr))

    def on_epoch_end(self, epoch, logs=None):
        """Store the optimizer's current learning rate under ``logs['lr']``."""
        # A falsy ``logs`` (None or empty) is replaced by a fresh local dict.
        target_logs = logs if logs else {}
        target_logs['lr'] = K.get_value(self.model.optimizer.lr)

In [None]:
###################################
#    Model architectures
###################################

def struct_only_model(input_dim):
    """Build and compile the fingerprint-only model.

    Architecture: Input(input_dim) -> Dense(512, relu) -> BatchNorm
    -> Dense(128, relu) -> BatchNorm -> Dense(978, tanh) -> multiply by 10.
    All Dense layers use an L2 kernel regularizer of 0.01.

    The Keras backend session is cleared before the model is built.

    Args:
        input_dim: width of the fingerprint input vector.

    Returns:
        A compiled Keras ``Model`` (MSE loss, Adam with learning rate 1e-4,
        metrics: MSE and ``tf_pearson``).
    """
    K.clear_session()

    input_drug = Input(shape=(input_dim,))

    hidden = Dense(512, input_dim=input_dim, activation='relu', kernel_regularizer=l2(0.01))(input_drug)
    hidden = BatchNormalization()(hidden)
    # This layer consumes the 512-wide output of the previous block, so it must
    # not declare ``input_dim=input_dim``; its input shape is inferred.
    hidden = Dense(128, activation='relu', kernel_regularizer=l2(0.01))(hidden)
    hidden = BatchNormalization()(hidden)
    output = Dense(978, activation='tanh', kernel_regularizer=l2(0.01))(hidden)
    # tanh is bounded to (-1, 1); scaling by 10 widens the output range to (-10, 10).
    output = Lambda(lambda x: 10 * x)(output)

    feature = Model(inputs=input_drug, outputs=output)
    feature.compile(loss="mean_squared_error", optimizer=Adam(0.0001), metrics=[metrics.mse, tf_pearson])

    return feature

def descriptor_only_model(input_dim):
    """Build and compile the descriptor-only model.

    Architecture: Input(input_dim) -> Dense(128, relu) -> BatchNorm
    -> Dense(978, tanh) -> multiply by 10. Dense layers use an L2 kernel
    regularizer of 0.01. The Keras backend session is cleared first.

    Args:
        input_dim: width of the descriptor input vector.

    Returns:
        A compiled Keras ``Model`` (MSE loss, Adam with learning rate 1e-4,
        metrics: MSE and ``tf_pearson``).
    """
    K.clear_session()

    descriptors_in = Input(shape=(input_dim,))

    net = Dense(
        128,
        input_dim=input_dim,
        activation='relu',
        kernel_regularizer=l2(0.01),
    )(descriptors_in)
    net = BatchNormalization()(net)
    net = Dense(978, activation='tanh', kernel_regularizer=l2(0.01))(net)
    # Rescale the bounded tanh output by a factor of 10.
    scaled_out = Lambda(lambda x: 10 * x)(net)

    model = Model(inputs=descriptors_in, outputs=scaled_out)
    model.compile(
        loss="mean_squared_error",
        optimizer=Adam(0.0001),
        metrics=[metrics.mse, tf_pearson],
    )

    return model

def structure_descriptor_model(structure_input, prop_input):
    """Build and compile the two-input (fingerprint + property) model.

    Fingerprint branch: Input -> Dense(512, relu) -> BatchNorm
    -> Dense(128, relu) -> BatchNorm.
    Property branch: Input -> Dense(128, relu) -> BatchNorm.
    The two branches are concatenated and fed to Dense(978, tanh), whose
    output is multiplied by 10. Dense layers use an L2 kernel regularizer of
    0.01. The Keras backend session is cleared first.

    Args:
        structure_input: width of the fingerprint input vector.
        prop_input: width of the property input vector.

    Returns:
        A compiled Keras ``Model`` taking ``[fingerprints, properties]``
        (MSE loss, Adam with learning rate 1e-4, metrics: MSE and
        ``tf_pearson``).
    """
    K.clear_session()

    # Fingerprint branch.
    fp_in = Input(shape=(structure_input,))
    fp_branch = Dense(
        512,
        input_dim=structure_input,
        activation='relu',
        kernel_regularizer=l2(0.01),
    )(fp_in)
    fp_branch = BatchNormalization()(fp_branch)
    fp_branch = Dense(128, activation='relu', kernel_regularizer=l2(0.01))(fp_branch)
    fp_branch = BatchNormalization()(fp_branch)

    # Property branch.
    prop_in = Input(shape=(prop_input,))
    prop_branch = Dense(
        128,
        input_dim=prop_input,
        activation='relu',
        kernel_regularizer=l2(0.01),
    )(prop_in)
    prop_branch = BatchNormalization()(prop_branch)

    # Merge both branches and map to the 978-wide output.
    merged = concatenate([fp_branch, prop_branch])
    head = Dense(978, activation='tanh', kernel_regularizer=l2(0.01))(merged)
    scaled_out = Lambda(lambda x: 10 * x)(head)

    model = Model(inputs=[fp_in, prop_in], outputs=scaled_out)
    model.compile(
        loss="mean_squared_error",
        optimizer=Adam(0.0001),
        metrics=[metrics.mse, tf_pearson],
    )

    return model

### Training

In [None]:
###################################
#    Model train
###################################

# Directory prefix under which model checkpoints and the training-history CSV
# are written. It is joined to file names by plain string concatenation, so
# include a trailing path separator when it is non-empty.
save_path = ''

def train_cv(model_type, input_data, epochs):
    """Build, train and return one of the three models.

    Args:
        model_type: 1 = fingerprints only, 2 = properties only,
            3 = fingerprints + properties.
        input_data: DataFrame in the layout expected by
            ``split_dataset_descriptor_both``.
        epochs: number of epochs passed to ``model.fit``.

    Returns:
        The trained Keras model.

    Raises:
        ValueError: if ``model_type`` is not 1, 2 or 3.

    Side effects:
        Writes a checkpoint ``.h5`` file and ``model_cv_performance.csv``
        under ``save_path``.
    """
    if model_type not in (1, 2, 3):
        raise ValueError('model_type must be 1, 2 or 3, got %r' % (model_type,))

    # NOTE(review): an EarlyStopping(monitor='val_mean_squared_error',
    # patience=30) callback used to be constructed here but was never passed to
    # fit(); add it to `callbacks` below if early stopping is intended.

    x_fp, x_desc, y_train = split_dataset_descriptor_both(input_data)

    # 1. structure only model
    # 2. property only model
    # 3. Structure + property model
    if model_type == 1:
        print('Model with compound fingerprints')
        checkpoint = ModelCheckpoint(save_path+'cv_test_structOnly.h5', monitor='loss', verbose=1, save_best_only=True, mode='min', period=1)
        model = struct_only_model(x_fp.shape[1])
        _x = x_fp

    elif model_type == 2:
        print('Model with compound properties')
        checkpoint = ModelCheckpoint(save_path+'cv_test_propOnly.h5', monitor='loss', verbose=1, save_best_only=True, mode='min', period=1)
        # The model is fed x_desc, so its input width must be the descriptor
        # width (previously the fingerprint width was passed by mistake).
        model = descriptor_only_model(x_desc.shape[1])
        _x = x_desc

    else:
        print('Model with both compound fingerprints and properties')
        checkpoint = ModelCheckpoint(save_path+'cv_test_scaled.h5', monitor='loss', verbose=1, save_best_only=True, mode='min', period=1)
        model = structure_descriptor_model(x_fp.shape[1], x_desc.shape[1])
        _x = [x_fp, x_desc]

    callbacks = [CosineAnnealingScheduler(T_max=20, eta_max=1e-4), checkpoint]

    # Initialise TF local variables before training; the streaming Pearson
    # metric keeps its running statistics in local variables.
    K.get_session().run(tf.local_variables_initializer())

    _y = y_train
    history = model.fit(_x, _y, validation_split=0.1, shuffle=True, verbose=0, batch_size=64, epochs=epochs, callbacks=callbacks)

    # Persist the per-epoch losses/metrics recorded by fit().
    pd.DataFrame(history.history).to_csv(save_path+'model_cv_performance.csv')

    return model

### Prediction 

In [None]:
def model_predict(data, model, model_type):
    """Predict with ``model`` and report per-row correlation against the labels.

    Args:
        data: DataFrame in the layout expected by
            ``split_dataset_descriptor_both``.
        model: trained Keras model matching ``model_type``.
        model_type: 1 = fingerprints only, 2 = properties only,
            3 = fingerprints + properties.

    Returns:
        List with one Pearson correlation coefficient per row of ``data``,
        computed between the label row and the predicted row. The mean of the
        list is printed.

    Raises:
        ValueError: if ``model_type`` is not 1, 2 or 3.
    """
    if model_type not in (1, 2, 3):
        raise ValueError('model_type must be 1, 2 or 3, got %r' % (model_type,))

    x_fp_total, x_desc_total, y_total = split_dataset_descriptor_both(data)

    # 1. structure only model
    # 2. property only model
    # 3. Structure + property model
    if model_type == 1:
        model_inputs = [x_fp_total]
    elif model_type == 2:
        model_inputs = [x_desc_total]
    else:
        model_inputs = [x_fp_total, x_desc_total]
    recon_x = model.predict(model_inputs)

    # Materialise the label array once instead of on every loop iteration.
    y_values = y_total.values
    corr_list = [
        np.corrcoef(y_values[i], recon_x[i])[0][1]
        for i in range(recon_x.shape[0])
    ]
    print('Average Correlation: ', np.mean(corr_list))

    return corr_list

### Example

In [None]:
# Load the example input table.
# `data_path` is a directory prefix joined to the file name by plain string
# concatenation, so include a trailing path separator when it is non-empty.
data_path = ''
input_data = pd.read_csv(data_path+'example_feature_model_input.csv')

In [None]:
def load_model(save_path, model_name):
    """Load a saved Keras model from ``save_path + model_name + '.h5'``.

    ``tf_pearson`` is registered as a custom object so that models compiled
    with that metric can be deserialized.

    Args:
        save_path: directory prefix, concatenated directly with the file name.
        model_name: file name without the ``.h5`` extension.

    Returns:
        The loaded Keras model.
    """
    model_file = save_path + model_name + '.h5'
    test_model = keras.models.load_model(
        model_file,
        custom_objects={'tf_pearson': tf_pearson},
    )

    print(model_name + ' loaded === ')
    return test_model

In [None]:
# Train the combined fingerprint + property model (model_type=3) for 10 epochs.
model = train_cv(model_type=3, input_data=input_data, epochs=10)

In [None]:
# Per-row correlation between each label row and its prediction. Note that this
# evaluates on the same table that was passed to train_cv above.
correlation_list = model_predict(input_data, model, model_type=3)