In [None]:
import os

import pandas as pd
import numpy as np
import math

from rdkit import Chem
from rdkit.Chem import Descriptors


import tensorflow as tf
import keras
from keras.layers import *
from keras.regularizers import *
import keras.backend as K
from keras.models import Model
from keras.models import Sequential
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, Callback
from keras.callbacks import EarlyStopping
from keras import metrics

from tensorflow.keras.utils import Sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.python.keras.backend import set_session
import math

# Restrict this process to CUDA device index 0.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Create a session limited to 30% of the GPU memory and hand it to the Keras backend.
# NOTE(review): tf.ConfigProto / tf.Session are the TensorFlow 1.x spellings; under
# TensorFlow 2.x they are only reachable through tf.compat.v1 - confirm the installed version.
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.3
session = tf.Session(config=config)
K.set_session(session)

import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, precision_recall_curve, average_precision_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix
import seaborn as sns

In [None]:
def find_exp(drug_df, ts_exp, column_name):
    """Attach expression values to each row of ``drug_df``.

    ``drug_df`` is left-joined to ``ts_exp`` by matching ``drug_df[column_name]``
    against ``ts_exp['pubchem']``. Row order of ``drug_df`` is kept, and drugs
    without a match produce a row of NaN.

    Returns the joined frame without its first two columns.
    """
    joined = drug_df.merge(ts_exp, how='left', left_on=column_name, right_on='pubchem')
    # Positional drop of the two leading columns.
    # NOTE(review): presumably these are the drug-id column and 'pubchem'; that only
    # holds when drug_df has a single column and 'pubchem' is ts_exp's first column.
    return joined.iloc[:, 2:]

In [None]:
###################################
#    Data generator
###################################

class custom_dataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that yields batches of drug-pair inputs and labels.

    Each batch is ``([x1, x2, x_se, x_se_one_hot], batch_y)`` where ``x1`` / ``x2``
    are the expression rows looked up (via ``find_exp``) for the ``drug1`` /
    ``drug2`` columns, ``x_se`` holds the ``SE`` column values and
    ``x_se_one_hot`` is their one-hot encoding.

    Parameters
    ----------
    x_set : pandas.DataFrame
        Must contain the columns ``drug1``, ``drug2`` and ``SE``.
    y_label : array-like
        Labels, positionally aligned with the rows of ``x_set``.
    batch_size : int
        Number of rows per batch (the last batch may be smaller).
    exp_df : pandas.DataFrame
        Expression table passed to ``find_exp`` (needs a ``pubchem`` column).
    shuffle : bool, default True
        Shuffle the row order at construction and after every epoch.
    num_classes : int, default 963
        Width of the one-hot side-effect encoding. Previously hard-coded.
    """

    def __init__(self, x_set, y_label, batch_size, exp_df, shuffle=True, num_classes=963):
        self.x = x_set
        self.y = y_label
        self.batch_size = batch_size
        self.indexes = np.arange(len(self.x))
        self.shuffle = shuffle
        self.exp_df = exp_df
        self.num_classes = num_classes
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def on_epoch_end(self):
        """Reshuffle the row order between epochs when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __len__(self):
        """Number of batches per epoch; a trailing partial batch counts as one."""
        return math.ceil(len(self.x)/self.batch_size)

    def __data_generation__(self, x_list):
        """Turn a slice of ``self.x`` into the four model input arrays."""
        x1 = find_exp(x_list[['drug1']], self.exp_df, 'drug1')
        x2 = find_exp(x_list[['drug2']], self.exp_df, 'drug2')

        # Plain ndarray rather than a Series, so the (shuffled) pandas index
        # never travels along with the values into the model.
        x_se = np.asarray(x_list['SE'])
        x_se_one_hot = to_categorical(x_se, num_classes=self.num_classes)

        # NOTE(review): drugs absent from exp_df come back from the left join as
        # NaN rows and are passed on unchanged - confirm every drug is covered.
        x1 = np.array(x1).astype(float)
        x2 = np.array(x2).astype(float)

        return x1, x2, x_se, x_se_one_hot

    def __getitem__(self, idx):
        """Return batch number ``idx`` as ``([x1, x2, x_se, x_se_one_hot], y)``."""
        indexes = self.indexes[idx*self.batch_size:(idx + 1) * self.batch_size]
        batch_x = self.x.iloc[indexes]
        # Positional selection that also works if a pandas Series was passed as
        # y_label (label-based ``Series[indexes]`` could pick the wrong rows).
        batch_y = np.asarray(self.y)[indexes]

        x1, x2, x_se, x_se_one_hot = self.__data_generation__(batch_x)

        return [x1, x2, x_se, x_se_one_hot], batch_y

In [None]:
###################################
#    Classification model architecture
###################################

def gen_classification_model(input_drug_dim, input_se_dim, drug_emb_dim, se_emb_dim, output_dim, margin, num_se=963):
    """Build and compile the drug-pair / side-effect scoring model.

    The model takes four inputs, in this order: expression vector of drug 1,
    expression vector of drug 2, side-effect id(s), and the one-hot side-effect
    vector (used only inside the loss). Its single output is

        ||Mh*d1 + r - Mt*d2|| + ||Mh*d2 + r - Mt*d1||

    where ``d1`` / ``d2`` are the drug embeddings, ``r`` is the side-effect
    embedding and ``Mh`` / ``Mt`` are per-side-effect mapping matrices.
    ``custom_margin_loss`` pushes the summed score of labelled-positive rows
    below that of labelled-negative rows, so a lower score means a more likely
    interaction.

    Parameters
    ----------
    input_drug_dim : int
        Length of each drug expression vector.
    input_se_dim : int
        Length of the side-effect id input (``input_length`` of the embeddings).
    drug_emb_dim, se_emb_dim : int
        Sizes of the drug and side-effect embedding spaces.
    output_dim : int
        Not used by the model; kept so existing calls keep working.
    margin : float
        Margin forwarded to ``custom_margin_loss``.
    num_se : int, default 963
        Number of distinct side effects (embedding vocabulary size and width of
        the one-hot input). Previously hard-coded.

    Returns
    -------
    The compiled Keras ``Model``.
    """
    # classification model
    drug1_exp = Input(shape=(input_drug_dim,))
    drug2_exp = Input(shape=(input_drug_dim,))

    # Layer width is passed positionally, like every other Dense in this function;
    # the former ``output_dim=`` keyword is the deprecated Keras-1 name for ``units``.
    hidden_d1 = Dense(input_drug_dim)(drug1_exp)
    hidden_d1 = BatchNormalization()(hidden_d1)
    hidden_d2 = Dense(input_drug_dim)(drug2_exp)
    hidden_d2 = BatchNormalization()(hidden_d2)

    concat = Concatenate()([hidden_d1, hidden_d2])

    # Sigmoid gates computed from both drugs, applied element-wise to each drug.
    glu1 = Dense(input_drug_dim, activation='sigmoid')(concat)
    glu2 = Dense(input_drug_dim, activation='sigmoid')(concat)

    att_d1 = Multiply()([hidden_d1, glu1])
    att_d2 = Multiply()([hidden_d2, glu2])

    att_d1 = BatchNormalization()(att_d1)
    att_d2 = BatchNormalization()(att_d2)


    # drug embedding
    drug1_emb = Dense(drug_emb_dim, kernel_regularizer=l2(0.001))(att_d1)
    drug2_emb = Dense(drug_emb_dim, kernel_regularizer=l2(0.001))(att_d2)

    # side effect
    input_se = Input(shape=(input_se_dim,))
    se_emb = Embedding(num_se, output_dim=se_emb_dim, input_length=input_se_dim)(input_se)

    # one-hot side effect for metric
    input_se_one_hot = Input(shape=(num_se,))

    # side effect mapping matrix
    # NOTE(review): the Reshape to (se_emb_dim, drug_emb_dim) only fits when
    # input_se_dim == 1 - confirm before using longer side-effect inputs.
    se_head = Embedding(num_se, output_dim=drug_emb_dim*se_emb_dim, input_length=input_se_dim, embeddings_regularizer=l2(0.01))(input_se)
    se_head = Reshape((se_emb_dim, drug_emb_dim))(se_head)
    se_tail = Embedding(num_se, output_dim=drug_emb_dim*se_emb_dim, input_length=input_se_dim, embeddings_regularizer=l2(0.01))(input_se)
    se_tail = Reshape((se_emb_dim, drug_emb_dim))(se_tail)

    # MhH & MtT
    mh_dx = Dot(axes=(2,1))([se_head, drug1_emb])
    mt_dy = Dot(axes=(2,1))([se_tail, drug2_emb])
    mh_dy = Dot(axes=(2,1))([se_head, drug2_emb])
    mt_dx = Dot(axes=(2,1))([se_tail, drug1_emb])

    # || MhH + r - MtT ||
    score1 = add([mh_dx, se_emb])
    score1 = subtract([score1, mt_dy])
    score1 = Lambda(lambda x:K.sqrt(K.sum(K.square(x), axis=-1)))(score1)
    score1 = Reshape((1,))(score1)

    # Same distance with the two drugs swapped, so the score is order-independent.
    score2 = add([mh_dy, se_emb])
    score2 = subtract([score2, mt_dx])
    score2 = Lambda(lambda x:K.sqrt(K.sum(K.square(x), axis=-1)))(score2)
    score2 = Reshape((1,))(score2)

    final_score = add([score1, score2])

    model_classification = Model(inputs=[drug1_exp, drug2_exp, input_se, input_se_one_hot], outputs=final_score)
    # The loss closes over the one-hot input tensor so it can group scores per side effect.
    # NOTE(review): 'accuracy' compares labels with an unbounded distance score and is
    # probably not a meaningful metric here - confirm before reporting it.
    model_classification.compile(loss=[lambda y_true, y_pred: custom_margin_loss(y_true, y_pred, se_one_hot=input_se_one_hot,margin=margin)], \
                                 optimizer=Adam(0.001), metrics=['accuracy'])

    return model_classification

def custom_margin_loss(y_true, y_pred, se_one_hot, margin):
    """Per-side-effect margin (hinge) loss over one batch.

    Scores of rows labelled 1 and scores of rows labelled 0 are summed
    separately for every side-effect column of ``se_one_hot``. For each column
    where the product of the two sums is non-zero, the term
    ``pos_sum - neg_sum + margin`` is clipped at zero; the loss is the sum of
    those terms.

    NOTE(review): assumes ``y_true`` / ``y_pred`` are (batch, 1) and
    ``se_one_hot`` is (batch, n_side_effects) - confirm against the model.
    """
    # |1 - y_true| is 1 for rows labelled 0 and 0 for rows labelled 1.
    negative_weight = K.abs(K.ones_like(y_true) - y_true)
    pos_score = y_true * y_pred
    neg_score = negative_weight * y_pred

    # Sum the positive / negative scores per side-effect column.
    se_pos = K.dot(K.transpose(pos_score), se_one_hot)
    se_neg = K.dot(K.transpose(neg_score), se_one_hot)

    # Keep only the columns whose positive and negative sums are both non-zero.
    se_mask = K.cast(se_pos * se_neg, dtype=bool)
    mask_as_float = K.cast(se_mask, dtype='float32')

    se_pos_score = mask_as_float * se_pos
    se_neg_score = mask_as_float * se_neg

    masked_margin = (K.ones_like(se_pos_score) * mask_as_float) * margin
    score = se_pos_score - se_neg_score + masked_margin

    hinge = K.maximum(K.zeros_like(score), score)
    return K.sum(hinge)

In [None]:
###################################
#    Model checkpoint
###################################

# Directory that receives the checkpoint weights, the final model and the threshold CSV.
# File names are appended with plain string concatenation, so the trailing '/' is required.
model_save_path = '/home/eykim/DDI_model/'

class CustomModelCheckPoint(keras.callbacks.Callback):
    """Callback that checkpoints the best weights and schedules the learning rate.

    At the end of every epoch the monitored metric is compared with the best
    value seen so far (over the lifetime of this callback object, i.e. across
    successive ``fit`` calls that reuse it). Weights are written to
    ``save_path + model_name + '.h5'`` only when the metric improves, so the
    file always holds the best weights rather than the most recent ones.

    Parameters
    ----------
    save_path : str
        Directory prefix; concatenated as-is, so it must end with a separator.
    model_name : str
        File stem of the weights file.
    init_learining_rate : float
        Learning rate restored at every epoch whose index is a multiple of 10.
    decay_rate, decay_steps : float
        Parameters of the per-epoch decay applied in ``on_epoch_begin``.
    save_best_metric : str, default 'val_loss'
        Key looked up in the epoch ``logs``.
    this_max : bool, default False
        True if larger metric values are better, False if smaller are better.
    """

    def __init__(self, save_path, model_name, init_learining_rate, decay_rate, decay_steps, \
                 save_best_metric='val_loss',this_max=False, **kargs):
        super(CustomModelCheckPoint,self).__init__(**kargs)
        # Per-epoch history, keyed by epoch index.
        self.epoch_loss = {}
        self.epoch_val_loss = {}
        self.save_path = save_path
        self.model_name = model_name

        self.init_learining_rate = init_learining_rate
        self.decay_rate = decay_rate
        self.decay_steps = decay_steps
        self.global_step = 0

        self.save_best_metric = save_best_metric
        self.max = this_max
        if this_max:
            self.best = float('-inf')
        else:
            self.best = float('inf')
        # Stays None until the monitored metric improves for the first time.
        self.best_model = None

    def on_epoch_end(self, epoch, logs=None):
        """Record the epoch losses and save the weights if the metric improved."""
        logs = logs or {}

        self.epoch_loss[epoch] = logs.get('loss')
        self.epoch_val_loss[epoch] = logs.get('val_loss')

        metric_value = logs.get(self.save_best_metric)
        if metric_value is None:
            # Metric not reported this epoch (e.g. no validation data): nothing to compare.
            return

        if self.max:
            improved = metric_value > self.best
        else:
            improved = metric_value < self.best

        if improved:
            self.best = metric_value
            # best_model is only a reference to the live model, not a snapshot, so the
            # weights have to be written now, before later epochs overwrite them.
            self.best_model = self.model
            self.best_model.save_weights(self.save_path + self.model_name + '.h5')

    def on_epoch_begin(self, epoch, logs=None):
        """Decay the optimizer learning rate; restore the initial rate every 10th epoch."""
        actual_lr = float(K.get_value(self.model.optimizer.lr))
        # NOTE(review): the factor is applied to the current (already decayed) rate, so
        # the decay compounds from epoch to epoch - confirm this is intended rather than
        # init_learining_rate * decay_rate ** (epoch / decay_steps).
        decayed_learning_rate = actual_lr * self.decay_rate ** (epoch / self.decay_steps)
        K.set_value(self.model.optimizer.lr, decayed_learning_rate)
        if epoch % 10 == 0:
            K.set_value(self.model.optimizer.lr, self.init_learining_rate)

In [None]:
###################################
#    Model evaluation
###################################

# attach predicted scores to the ground-truth rows
def mean_predicted_score(true_df, predicted_y):
    """Return ``true_df`` with a ``predicted_score`` column appended.

    The index of ``true_df`` is discarded and replaced by 0..n-1, so rows are
    paired with ``predicted_y`` purely by position.
    """
    score_column = pd.DataFrame(predicted_y, columns=['predicted_score'])
    truth = true_df.reset_index(drop=True)
    test_pred_result = pd.concat([truth, score_column], axis=1)
    return test_pred_result

def Find_Optimal_Cutoff(target, predicted):
    """Return the ROC threshold where sensitivity and specificity are closest.

    The ROC curve of ``predicted`` against ``target`` is computed and the
    threshold whose ``tpr - (1 - fpr)`` is nearest to zero is selected.

    Returns a one-element list holding that threshold.
    """
    fpr, tpr, threshold = roc_curve(target, predicted)
    positions = np.arange(len(tpr))
    roc = pd.DataFrame({
        'tf': pd.Series(tpr - (1 - fpr), index=positions),
        'threshold': pd.Series(threshold, index=positions),
    })
    # Position of the row whose |tf| is smallest.
    closest = (roc['tf'] - 0).abs().argsort()[:1]
    roc_t = roc.iloc[closest]

    return list(roc_t['threshold'])

def cal_performance(predicted_scores_df):
    """Compute per-side-effect performance and the optimal score threshold.

    ``predicted_scores_df`` needs the columns ``SE``, ``label`` (1 = positive,
    0 = negative) and ``predicted_score``. Scores are treated as distances: a
    row is predicted positive when its score is NOT above the threshold, and
    AUC / AUPR are therefore computed against ``1 - label``.

    Returns a DataFrame with one row per side effect (in order of first
    appearance, indexed 0..n-1) and the columns ``Side effect no.``,
    ``median_pos``, ``median_neg``, ``optimal_thr``, ``SN``, ``SP``, ``PR``,
    ``AUC`` and ``AUPR``. For a side effect with only one label class present,
    the threshold and all metrics are NaN instead of raising.
    """
    columns = ['Side effect no.','median_pos', 'median_neg', 'optimal_thr','SN','SP','PR','AUC','AUPR']
    nan = float('nan')

    def _ratio(numerator, denominator):
        # A ratio with an empty denominator is undefined; report it as NaN.
        return numerator / denominator if denominator else nan

    # Collect one record per side effect and build the frame once at the end,
    # instead of growing a DataFrame with concat inside the loop.
    records = []
    for se in predicted_scores_df.SE.unique():
        df = predicted_scores_df[predicted_scores_df.SE == se]

        med_1 = np.median(df[df.label == 1.0].predicted_score)
        med_0 = np.median(df[df.label == 0.0].predicted_score)

        if df.label.nunique() < 2:
            # ROC-based quantities need both classes.
            records.append({'Side effect no.':se, 'median_pos':med_1, 'median_neg':med_0, 'optimal_thr':nan,
                            'SN':nan, 'SP':nan, 'PR':nan, 'AUC':nan, 'AUPR':nan})
            continue

        inverted_label = 1-df.label
        optimal_thr = Find_Optimal_Cutoff(inverted_label, df.predicted_score)[0]
        # Positive (1) unless the score exceeds the threshold.
        temp_y_opt = (~(df.predicted_score > optimal_thr)).astype(int)
        # labels=[0, 1] guarantees a 2x2 matrix even if one class is never predicted.
        tn, fp, fn, tp = confusion_matrix(df.label, temp_y_opt, labels=[0, 1]).ravel()

        auc = roc_auc_score(inverted_label, df.predicted_score)
        aupr = average_precision_score(inverted_label, df.predicted_score)

        records.append({'Side effect no.':se, 'median_pos':med_1, 'median_neg':med_0, 'optimal_thr':optimal_thr,
                        'SN':_ratio(tp, tp+fn), 'SP':_ratio(tn, tn+fp), 'PR':_ratio(tp, tp+fp),
                        'AUC':auc, 'AUPR':aupr})

    se_performance = pd.DataFrame(records, columns=columns)
    return se_performance

In [None]:
###################################
#    Model setting
###################################

def set_model_options(model_save_path, model_name, margin=1, embedding_size_drug=100, embedding_size_se=100, init_lr=0.0001, decay_rate=0.9, decay_steps=2, input_drug_dim=978):
    """Create the compiled classification model and its checkpoint callback.

    Parameters
    ----------
    model_save_path : str
        Directory prefix for the checkpoint file (concatenated as-is).
    model_name : str
        File stem of the checkpoint.
    margin : float
        Margin of the loss.
    embedding_size_drug, embedding_size_se : int
        Embedding sizes for drugs and side effects.
    init_lr, decay_rate, decay_steps : float
        Learning-rate schedule handed to ``CustomModelCheckPoint``.
    input_drug_dim : int, default 978
        Length of each drug expression vector. Previously hard-coded.

    Returns
    -------
    (model, checkpoint_callback)
    """
    checkpoint= CustomModelCheckPoint(save_path=model_save_path, model_name=model_name, \
                                      init_learining_rate=init_lr, decay_rate=decay_rate, decay_steps=decay_steps)

    # input_se_dim=1: one side-effect id per sample; output_dim is ignored by the builder.
    classification_model = gen_classification_model(input_drug_dim, input_se_dim=1, drug_emb_dim=embedding_size_drug, se_emb_dim=embedding_size_se, \
                                                            output_dim=1, margin=margin)

    print("Classification model set")
    return classification_model, checkpoint

In [None]:
###################################
#    Model train
###################################

def cross_validation(model, train_data, split_frac, sampling_size, callbacks, batch_size, exp_df):
    """Train ``model`` over repeated random hold-out splits and collect thresholds.

    In each of ``sampling_size`` rounds a fraction ``split_frac`` of every
    (SE, label) group of ``train_data`` is held out, the model is trained for
    10 more epochs on the rest (the same model object is trained further every
    round) and the optimal score threshold of every side effect is computed on
    the held-out rows.

    Parameters
    ----------
    model : compiled Keras model
    train_data : pandas.DataFrame
        First three columns are the model inputs (they must include ``drug1``,
        ``drug2`` and ``SE``); must also contain ``label``.
    split_frac : float
        Fraction of each (SE, label) group held out per round.
    sampling_size : int
        Number of rounds.
    callbacks : keras callback
        A single callback, wrapped in a list for ``fit_generator``.
    batch_size : int
    exp_df : pandas.DataFrame
        Expression table for the data generators.

    Returns
    -------
    (model, optimal_threshold) where ``optimal_threshold`` has an ``SE`` column
    with the side-effect ids of ``train_data`` followed by one ``optimal_thr``
    column per round. A side effect missing from a round's held-out set gets
    NaN for that round.
    """
    # Use the side effects of the data that was passed in (not a notebook global) and
    # keep their real ids, so thresholds can later be merged on 'SE' safely.
    se_ids = np.sort(train_data.SE.unique())
    optimal_threshold = pd.DataFrame({'SE': se_ids})

    for n in range(sampling_size):
        print(n+1, ' Sample =======')
        cv_test = train_data.groupby(['SE', 'label']).apply(pd.DataFrame.sample, frac=split_frac)
        cv_test_x = cv_test.reset_index(drop=True).iloc[:,:3]
        cv_test_y = cv_test.reset_index(drop=True).iloc[:,-1]

        # Rows that occur in both frames are removed, leaving the training remainder.
        # NOTE(review): keep=False also removes rows that are duplicated inside
        # train_data itself - confirm the input has no duplicate rows.
        cv_train_data_rest = pd.concat([train_data, cv_test]).drop_duplicates(keep=False, inplace=False)
        cv_train_x = cv_train_data_rest.iloc[:,:3]
        cv_train_y = cv_train_data_rest.iloc[:,3]
        print('Cross validation train, test dataset size: ', cv_train_x.shape, cv_test_x.shape)

        cv_train_gen = custom_dataGenerator(cv_train_x, cv_train_y.values, batch_size=batch_size, exp_df=exp_df)
        cv_test_gen = custom_dataGenerator(cv_test_x, cv_test_y.values, batch_size=batch_size, exp_df=exp_df, shuffle=False)

        # Each "epoch" covers roughly a tenth of the training batches; never drop to 0 steps.
        steps_per_epoch = max(1, cv_train_x.shape[0] // batch_size // 10)

        #======================================================================================================================#
        model.fit_generator(generator=cv_train_gen, steps_per_epoch=steps_per_epoch, validation_data=cv_test_gen, \
                                                   epochs=10, verbose=0, shuffle=True, callbacks=[callbacks])

        cv_test_pred_y = model.predict_generator(generator=cv_test_gen)

        cv_test_prediction_scores = mean_predicted_score(cv_test, cv_test_pred_y)
        cv_test_prediction_perf = cal_performance(cv_test_prediction_scores)

        # Align this round's thresholds with the SE column by side-effect id instead of
        # by row position.
        round_thr = cv_test_prediction_perf.set_index('Side effect no.')['optimal_thr'].astype(float)
        round_thr = round_thr.reindex(se_ids).reset_index(drop=True)
        optimal_threshold = pd.concat([optimal_threshold, round_thr], axis=1)

    return model, optimal_threshold

In [None]:
###################################################################
#    Labeling predicted scores & Calculate prediction performance
###################################################################

def calculate_predicted_label(predicted_score_df, mean_optimal_thr, se_col_name, threshold_col_name):
    """Label each scored row by comparing its score with its side effect's threshold.

    NOTE: this name is defined a second time further down in the same cell; that
    later definition (which additionally returns a ``gap`` column) is the one in
    effect once the cell has run.

    The threshold table is left-joined on ``SE`` == ``se_col_name``; a row gets
    ``predicted_label`` 1 when ``predicted_score`` is strictly below the
    threshold and 0 otherwise.
    """
    merged = predicted_score_df.merge(mean_optimal_thr, how='left', left_on='SE', right_on=se_col_name)
    below_threshold = merged['predicted_score'] < merged[threshold_col_name]
    merged['predicted_label'] = below_threshold.map(int)
    wanted = ['drug1','drug2','SE','label','predicted_label','predicted_score']
    return merged[wanted]

def calculate_test_performance(predicted_scores_df):
    """Compute per-side-effect test performance from already-labelled predictions.

    ``predicted_scores_df`` needs the columns ``SE``, ``label``,
    ``predicted_label`` and ``predicted_score``. Scores are treated as
    distances (lower = more likely positive), so AUC / AUPR are computed
    against ``1 - label``.

    Returns a DataFrame with one row per side effect (in order of first
    appearance, indexed 0..n-1) and the columns ``Side effect no.``, ``SN``,
    ``SP``, ``PR``, ``AUC`` and ``AUPR``. Undefined ratios, and AUC / AUPR of a
    side effect with only one label class present, are NaN instead of raising.
    """
    columns = ['Side effect no.','SN','SP','PR','AUC','AUPR']
    nan = float('nan')

    def _ratio(numerator, denominator):
        # A ratio with an empty denominator is undefined; report it as NaN.
        return numerator / denominator if denominator else nan

    # Collect one record per side effect and build the frame once at the end,
    # instead of growing a DataFrame with concat inside the loop.
    records = []
    for se in predicted_scores_df.SE.unique():
        df = predicted_scores_df[predicted_scores_df.SE == se]

        # labels=[0, 1] guarantees a 2x2 matrix even if a class is absent.
        tn, fp, fn, tp = confusion_matrix(df.label, df.predicted_label, labels=[0, 1]).ravel()

        if df.label.nunique() < 2:
            # Ranking metrics need both classes.
            auc = nan
            aupr = nan
        else:
            auc = roc_auc_score(1-df.label, df.predicted_score)
            aupr = average_precision_score(1-df.label, df.predicted_score)

        records.append({'Side effect no.':se, \
                        'SN':_ratio(tp, tp+fn), 'SP':_ratio(tn, tn+fp), 'PR':_ratio(tp, tp+fp),
                        'AUC':auc, 'AUPR':aupr})

    se_performance = pd.DataFrame(records, columns=columns)
    return se_performance

def calculate_predicted_label(predicted_score_df, mean_optimal_thr, se_col_name, threshold_col_name):
    """Label each scored row and report its distance from the decision threshold.

    The threshold table is left-joined on ``SE`` == ``se_col_name``. A row gets
    ``predicted_label`` 1 when ``predicted_score`` is strictly below its
    threshold and 0 otherwise; ``gap`` is the absolute difference between the
    score and the threshold.
    """
    merged = predicted_score_df.merge(mean_optimal_thr, how='left', left_on='SE', right_on=se_col_name)

    scores = merged['predicted_score']
    thresholds = merged[threshold_col_name]

    merged['predicted_label'] = (scores < thresholds).map(int)
    merged['gap'] = (scores - thresholds).map(abs)

    wanted = ['drug1','drug2','SE','label','predicted_label','predicted_score','gap']
    return merged[wanted]

In [None]:
###################################
#    Model validation (Case 1)
###################################

def external_validation(model, test_x, test_y, optimal_threshold, batch_size, exp_df):
    """Score a held-out set, label it with per-side-effect thresholds and evaluate.

    Parameters
    ----------
    model : trained Keras model
    test_x : pandas.DataFrame
        Model inputs; must contain ``drug1``, ``drug2`` and ``SE``.
    test_y : pandas.Series
        Labels (named ``label``), sharing the index of ``test_x``.
    optimal_threshold : pandas.DataFrame
        Must contain the columns ``SE`` and ``mean_thr``.
    batch_size : int
    exp_df : pandas.DataFrame
        Expression table for the data generator.

    Returns
    -------
    (per-row predictions DataFrame, per-side-effect performance DataFrame)
    """
    # Build the generator from the arguments; it previously read the notebook globals
    # validation_x / validation_y and ignored test_x / test_y.
    test_gen = custom_dataGenerator(test_x, test_y.values, batch_size=batch_size, exp_df=exp_df, shuffle=False)
    pred_test = model.predict_generator(generator=test_gen)

    test_prediction_scores = mean_predicted_score(pd.concat([test_x, test_y], axis=1), pred_test)
    test_prediction_predicted_label_df = calculate_predicted_label(test_prediction_scores, optimal_threshold, se_col_name='SE',threshold_col_name='mean_thr')
    test_prediction_perf_df = calculate_test_performance(test_prediction_predicted_label_df)

    return test_prediction_predicted_label_df, test_prediction_perf_df

### Example

In [None]:
# Predicted expression of TWOSIDES drugs; passed to the data generators as exp_df,
# where find_exp joins it to the drug columns on its 'pubchem' column.
twosides_exp = pd.read_csv('/home/eykim/DDI_model/data/twosides_predicted_expression_scaled.csv')

# Load the example data (Case 1): model inputs (train_x) and labels (train_y).
# NOTE(review): absolute, user-specific paths - adjust data_path when running elsewhere.
data_path = '/home/eykim/DDI_model/data/'
train_x = pd.read_csv(data_path+'ddi_example_x.csv')
train_y = pd.read_csv(data_path+'ddi_example_y.csv')
print('Data loaded === ', train_x.shape, train_y.shape)

In [None]:
# Data split to train & validation sets
# Inputs and labels are joined column-wise, then 10% of every (SE, label) group is
# sampled as the validation set.
# NOTE(review): no random seed is set, so the split differs between runs.
train_data = pd.concat([train_x, train_y], axis=1)
validation_data = train_data.groupby(['SE', 'label']).apply(pd.DataFrame.sample, frac=0.1)

# First three columns are the model inputs, the last column is the label.
validation_x = validation_data.reset_index(drop=True).iloc[:,:3]
validation_y = validation_data.reset_index(drop=True).iloc[:,-1]

# Training remainder: rows occurring in both frames are removed.
# NOTE(review): keep=False also removes rows duplicated inside train_data itself.
train_data_rest = pd.concat([train_data, validation_data]).drop_duplicates(keep=False, inplace=False)

print('Total: ', train_x.shape, 'Train: ', train_data_rest.shape[0], 'Test: ', validation_data.shape[0])

In [None]:
# File stem used for the checkpoint, the final model and the threshold CSV.
model_name = 'ddi_model'
print('Model: ', model_name)

# Build the model and its checkpoint callback, then train with 10 rounds of
# repeated 10% hold-out on the training remainder.
classification_model, callbacks = set_model_options(model_save_path=model_save_path, model_name=model_name)
classification_model, optimal_threshold = cross_validation(classification_model, train_data_rest, split_frac=0.1, sampling_size=10, callbacks=callbacks,batch_size=1024, exp_df=twosides_exp)
print('Model trained === ')

classification_model.save(model_save_path+'final_'+model_name+'.h5')
print('Model saved === ')

# Column 0 is 'SE'; the remaining columns are the per-round thresholds, averaged per row.
optimal_threshold['mean_thr'] = optimal_threshold.iloc[:,1:].mean(axis=1)
optimal_threshold.to_csv(model_save_path+model_name+'_opt_threshold.csv')
print('Optimal threshold saved === ')

# Evaluate on the validation split using the averaged thresholds.
test_prediction_predicted_label_df, test_prediction_perf_df = external_validation(classification_model, validation_x, validation_y, optimal_threshold=optimal_threshold, batch_size=1024, exp_df=twosides_exp)
print('Test set predicted === ')

# NOTE(review): same directory literal as model_save_path.
result_path = '/home/eykim/DDI_model/'

test_prediction_predicted_label_df.to_csv(result_path + 'test_prediction_predicted_label_df.csv')
test_prediction_perf_df.to_csv(result_path+'test_prediction_predicted_perf_df.csv')
print('Test performance saved === ')