In [1]:
import tensorflow as tf
import pandas as pd
import functools
import time

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, roc_auc_score, \
    auc, average_precision_score, pairwise_distances
import scikitplot as skplt
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import pickle
import dill
from functools import partial
from keras.utils import np_utils

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


# read gene features

In [2]:
# Gene feature table; the first CSV column is used as the row index.
# Later cells look rows up with .loc using all_df.index and the splits' "sig_id" values.
genes = pd.read_csv("data/gene_data/gene_features_all.csv",index_col=0)

# Read the training, validation, and test splits

In [3]:
# Signature table (allsigs.csv); the first CSV column becomes the row index.
all_df = pd.read_csv("data/gene_data/allsigs.csv", index_col=0)

# The four validation splits, read in order and kept both as a list and
# under their individual names.
valsets = [
    pd.read_csv("data/gene_data/splits/csv/val_set_1.csv", index_col=0),
    pd.read_csv("data/gene_data/splits/csv/val_set_2.csv", index_col=0),
    pd.read_csv("data/gene_data/splits/csv/val_set_3.csv", index_col=0),
    pd.read_csv("data/gene_data/splits/csv/val_set_4.csv", index_col=0),
]
val1_df, val2_df, val3_df, val4_df = valsets

# Held-out test split.
test_df = pd.read_csv("data/gene_data/splits/csv/test_set.csv", index_col=0)

# only keep labels with more than 3 examples?

In [4]:
# Keep only the rows whose "moa_count" is greater than 3, then restrict the
# gene table to the surviving row labels so both frames stay aligned.
all_df = all_df.loc[all_df["moa_count"].gt(3)]
genes = genes.loc[all_df.index]

# Encode labels

In [5]:
# Fit a LabelEncoder that maps each distinct 'moa_v1' value to an integer class id.
# At this point all_df still contains the test rows (they are removed in a later
# cell), so the encoder is fitted on the labels of every split.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(all_df['moa_v1'])
# One-hot encoding is not done here; the training code calls
# np_utils.to_categorical on the encoded labels of each split.

LabelEncoder()

# First, remove the test set from all_df and genes

In [6]:
# Hold out the test signatures: copy their rows aside first, then remove them
# from the frames that are used for training and validation.
# NOTE: this cell is not re-runnable on the already-reduced frames, because
# DataFrame.drop raises KeyError for labels that are no longer present.
test_ids = test_df["sig_id"]
test_genes = genes.loc[test_ids]
test_sigs = all_df.loc[test_ids]
all_df = all_df.drop(index=test_ids)
genes = genes.drop(index=test_ids)

# Parameter space

In [7]:
# Hyperopt search space for the MLP.
# hp.quniform(label, low, high, q) samples a value quantised to a multiple of q and
# returns it as a float, which is why the objective casts the sizes and epochs with int().
fspace = {
    'dense_size_1' : hp.quniform('dense_size_1', 512,1024,64), # width of the 1st hidden layer
    'dense_size_2' : hp.quniform('dense_size_2', 256,512,64), # width of the 2nd hidden layer
    'dense_size_3' : hp.quniform('dense_size_3', 128,256,64), # width of the 3rd hidden layer
    'lr' : hp.uniform('lr', 0.0005, 0.01),  # Adam learning rate
    'dropout' : hp.uniform('dropout',0.1,0.5),  # one dropout rate, shared by all three hidden layers
    'epochs' : hp.quniform('epochs',5,50,5),  # number of training epochs, in steps of 5
}

In [8]:
import tensorflow as tf
import keras
import keras.backend as K
from keras.layers import Dense, Dropout, Input, Lambda
from keras.models import Model, Sequential, load_model
from keras.callbacks import History, ReduceLROnPlateau,EarlyStopping,ModelCheckpoint

In [13]:
def objective(fspace, all_df, genes, valsets, label_encoder):
    """Hyperopt objective: negative mean validation accuracy of an MLP.

    For every split in ``valsets`` a fresh MLP with three hidden layers is
    trained on the rows of ``genes`` / ``all_df`` that are *not* listed in the
    split and scored on the rows that are.

    Parameters
    ----------
    fspace : dict
        One sampled point of the search space. Must contain the keys
        ``dense_size_1``, ``dense_size_2``, ``dense_size_3``, ``dropout``,
        ``lr`` and ``epochs``.
    all_df : pandas.DataFrame
        Signature table whose row labels are the values found in the splits'
        ``sig_id`` column; the ``moa_v1`` column holds the class label.
    genes : pandas.DataFrame
        Gene features with the same row labels as ``all_df``. The network
        input is fixed at 978 features.
    valsets : list of pandas.DataFrame
        One frame per validation split; its ``sig_id`` column lists the rows
        that form the validation set of that split.
    label_encoder : sklearn.preprocessing.LabelEncoder
        Fitted encoder that maps ``moa_v1`` values to integer class ids.

    Returns
    -------
    dict
        ``{'loss': -mean_accuracy, 'status': STATUS_OK}``; the accuracy is
        negated because hyperopt minimises the loss.
    """
    # hp.quniform yields floats, so the layer widths and epoch count are cast to int.
    hidden_sizes = [int(fspace['dense_size_1']),
                    int(fspace['dense_size_2']),
                    int(fspace['dense_size_3'])]
    dropout_rate = fspace['dropout']  # the same rate is applied after every hidden layer
    learning_rate = fspace['lr']
    n_epochs = int(fspace['epochs'])

    # Size the one-hot targets and the softmax layer from the encoder, so they
    # always agree with each other and with the class ids the encoder produces,
    # even when a split happens not to contain the highest-numbered class.
    n_classes = len(label_encoder.classes_)

    accs = []
    for val_split in valsets:
        # Split the data into the validation rows of this fold and everything else.
        val_ids = val_split["sig_id"]
        val_genes = genes.loc[val_ids]
        val_sigs = all_df.loc[val_ids]
        train_genes = genes.drop(val_ids)
        train_sigs = all_df.drop(val_ids)

        # Integer class ids; only the training targets need to be one-hot encoded.
        encoded_Y_train = label_encoder.transform(train_sigs['moa_v1'])
        encoded_Y_val = label_encoder.transform(val_sigs['moa_v1'])
        train_y = np_utils.to_categorical(encoded_Y_train, num_classes=n_classes)

        # Drop the graph of the previous fold/trial; otherwise every model built
        # during the search stays alive in the backend session.
        K.clear_session()

        # Define the MLP: 978 inputs -> 3 x (Dense + Dropout) -> softmax.
        gene_input = Input(name='gene_input', shape=(978,), dtype='float32')
        hidden = gene_input
        for width in hidden_sizes:
            hidden = Dense(width, activation='relu', kernel_initializer='random_normal')(hidden)
            hidden = Dropout(dropout_rate)(hidden)
        prediction = Dense(n_classes, activation='softmax')(hidden)

        mlp_model = Model(inputs=gene_input, outputs=prediction)
        adam = keras.optimizers.Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999,
                                     decay=0.0, amsgrad=False)
        mlp_model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

        # Training runs without callbacks and without validation data.
        mlp_model.fit(np.array(train_genes), train_y, batch_size=64,
                      epochs=n_epochs,
                      verbose=0, shuffle=True, validation_data=None)

        pred = mlp_model.predict(np.array(val_genes))
        accs.append(accuracy_score(encoded_Y_val, np.argmax(pred, axis=1)))

    ave_acc = np.mean(accs, axis=0)
    return {'loss': -ave_acc, 'status': STATUS_OK}

In [14]:
# Bind the data arguments so that the resulting callable takes only the sampled
# hyper-parameter dict, which is how fmin is given it (fn=fmin_objective) in run_trials.
fmin_objective = partial(objective, all_df = all_df, genes = genes, valsets = valsets, label_encoder = label_encoder)

In [15]:
def run_trials():
    """Run, or resume, the hyperopt search and persist its Trials object.

    If ``MLP_genes.hyperopt`` exists, the saved trials are loaded and the
    search continues for ``trials_step`` additional evaluations. Otherwise a
    new search is started with ``max_trials`` evaluations. The Trials object
    is written back to the same file once ``fmin`` has returned.

    Returns
    -------
    hyperopt.Trials
        The trials object, including all evaluations made by this call.

    Notes
    -----
    Only a missing or unreadable (truncated / non-pickle) trials file leads to
    a fresh search. Any other error, including KeyboardInterrupt, propagates
    instead of being silently swallowed.
    """
    trials_path = "MLP_genes.hyperopt"
    trials_step = 100  # how many additional trials to do after loading saved trials. 1 = save after iteration
    max_trials = 1  # initial max_trials. put something small to not have to wait

    try:  # try to load an already saved trials object, and increase the max
        # NOTE: unpickling can execute arbitrary code; only load a trials file
        # that was written by this notebook.
        with open(trials_path, "rb") as f:
            trials = pickle.load(f)
    except FileNotFoundError:  # create a new trials object and start searching
        trials = Trials()
    except (EOFError, pickle.UnpicklingError) as err:
        print("Could not read saved Trials ({}); starting a new search.".format(err))
        trials = Trials()
    else:
        print("Found saved Trials! Loading...")
        max_trials = len(trials.trials) + trials_step
        print("Rerunning from {} trials to {} (+{}) trials".format(len(trials.trials), max_trials, trials_step))

    best = fmin(fn = fmin_objective, space = fspace, algo=tpe.suggest, max_evals=max_trials, trials=trials)

    print("Best:", best)

    # save the trials object
    with open(trials_path, "wb") as f:
        pickle.dump(trials, f)
    return trials

In [16]:
# Long-running cell: the recorded run below took about 4 h 42 min for 101 trials.
trials = run_trials()

Found saved Trials! Loading...
Rerunning from 1 trials to 101 (+100) trials
100%|██████████████████████████████████████████| 101/101 [4:42:00<00:00, 167.53s/trial, best loss: -0.6265860190229938]
Best: {'dense_size_1': 576.0, 'dense_size_2': 256.0, 'dense_size_3': 128.0, 'dropout': 0.2733924105790248, 'epochs': 25.0, 'lr': 0.000981699073272445}


# Set the best parameters (hard-coded in the cell below)

In [19]:
# Hyper-parameters used for the final training runs. The values are hard-coded,
# not read from the `trials` object.
# NOTE(review): they differ from the "Best:" line printed by the search above
# (576/256/128, dropout ~0.27, lr ~0.00098) -- presumably taken from another
# run; confirm which set is intended.
model_params = {
        'dense_size' : [int(896),int(384),int(128)],  # widths of the three hidden layers
        'dropout' : [0.45,0.45,0.45],  # dropout rate after each hidden layer
        'lr' : 0.00127  # Adam learning rate
    }

In [20]:
# Final training: for every validation split, train an MLP with `model_params`
# on the remaining rows, score it on the split, and predict the test set.
# The per-split test predictions are collected in `test_preds`.

# Size the one-hot targets and the softmax layer from the encoder, so they
# always agree with each other and with the class ids the encoder produces,
# even when a split happens not to contain the highest-numbered class.
n_classes = len(label_encoder.classes_)

accs = []
test_preds = []
for i in range(len(valsets)):
    # split val and train
    val_genes = genes.loc[valsets[i]["sig_id"]]
    val_sigs = all_df.loc[valsets[i]["sig_id"]]
    train_genes = genes.drop(valsets[i]["sig_id"])
    train_sigs = all_df.drop(valsets[i]["sig_id"])

    # one hot encode labels
    encoded_Y_train = label_encoder.transform(train_sigs['moa_v1'])
    encoded_Y_val = label_encoder.transform(val_sigs['moa_v1'])
    train_y = np_utils.to_categorical(encoded_Y_train, num_classes=n_classes)
    val_y = np_utils.to_categorical(encoded_Y_val, num_classes=n_classes)

    # Drop the graph of the previous fold so that models do not accumulate in
    # the backend session (earlier predictions are already plain arrays).
    K.clear_session()

    # define the mlp model: 978 inputs -> 3 x (Dense + Dropout) -> softmax
    gene_input = Input(name='gene_input', shape=(978,), dtype='float32')

    fc1 = Dense(model_params['dense_size'][0], activation='relu', kernel_initializer='random_normal')(gene_input)
    fc1 = Dropout(model_params['dropout'][0])(fc1)
    fc2 = Dense(model_params['dense_size'][1], activation='relu', kernel_initializer='random_normal')(fc1)
    fc2 = Dropout(model_params['dropout'][1])(fc2)
    fc3 = Dense(model_params['dense_size'][2], activation='relu', kernel_initializer='random_normal')(fc2)
    fc3 = Dropout(model_params['dropout'][2])(fc3)

    prediction = Dense(n_classes, activation='softmax')(fc3)
    adam = keras.optimizers.Adam(lr=model_params["lr"], beta_1=0.9, beta_2=0.999, decay=0.0, amsgrad=False)
    mlp_model = Model(inputs=gene_input, outputs=prediction)
    mlp_model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

    # Training runs without callbacks and without validation data.
    # NOTE(review): the epoch count is fixed at 100 here; the 'epochs' value
    # explored by the hyperopt search is not used -- confirm this is intended.
    mlp_model.fit(np.array(train_genes), train_y, batch_size=64,
                  epochs=100,
                  verbose=1, shuffle=True, validation_data=None)

    pred = mlp_model.predict(np.array(val_genes))
    test_pred = mlp_model.predict(np.array(test_genes))
    test_preds.append(test_pred)
    accs.append(accuracy_score(np.argmax(val_y, axis=1), np.argmax(pred, axis=1)))
ave_acc = np.mean(accs, axis=0)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 5

Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 4

Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 2

Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [21]:
# Validation accuracy of each of the models trained above, one value per validation split.
accs

[0.6470588235294118,
 0.5846153846153846,
 0.6122448979591837,
 0.6307692307692307]

# Test performance

In [25]:
# Encode the true test labels with the same encoder that was used for training.
encoded_Y_test = label_encoder.transform(test_sigs['moa_v1'])
# One-hot form of the test labels; it is only used via argmax in the accuracy cell below.
test_y = np_utils.to_categorical(encoded_Y_test)
# Ensemble prediction: average the test-set outputs collected in test_preds
# (one array per validation split) and take the highest-scoring class per row.
test_predictions = np.argmax(np.mean(test_preds,axis = 0),axis=1)

In [26]:
# Test accuracy of the averaged (ensemble) predictions against the true class ids.
accuracy_score(np.argmax(test_y,axis=1), test_predictions)

0.5042735042735043

In [27]:
# Predicted integer class id for every test row (decode with label_encoder.inverse_transform).
test_predictions

array([ 31,  31,  31,  31,  50,  50,  50,  44,  44,  50,  44,  44,  44,
        44,  50,  44,  44,  44,  44,  44,  50,  50,  50,  50,  50,  50,
        50,  50,  50,  50,  50,  50,  50,  50,  50,  50,  50,  50,  52,
        52, 102,  52,  69, 102,  89,   4, 102,  52,  63,  63,  63,  63,
        63,  10,  10,  10,  35,  78,  93,  93,  80,  80,  80,  80,  80,
        93,  80,  80,  80,  80,  80,  80,  17,  14,  14,  58,  14,  14,
       114,  14,  14,  77,  14,  14,  14,  14,  14,  14,  14,  78,  14,
        14,  14,  58,  50,  14,  44,  50,  44,  44,  50,  44,  44,  44,
        44,  44,  62,  45,  18,  46,  45, 102, 102, 102, 119, 104, 104],
      dtype=int64)

In [30]:
# Decode the predicted class ids back to 'moa_v1' label values and store them
# next to the true labels.
test_sigs['predicted'] = label_encoder.inverse_transform(test_predictions.astype(int))
# NOTE: per_drug_acc is defined in a later cell of this file (In [28]); on a
# fresh kernel that cell has to be run before this one.
drug_acc = per_drug_acc(test_sigs)

In [31]:
# Fraction of test drugs counted as correctly classified by per_drug_acc.
drug_acc

0.625

In [28]:
def per_drug_acc(df):
    """Fraction of drugs whose predictions pass a per-drug accuracy threshold.

    Rows are grouped by the ``rdkit`` column (one group per drug). Within a
    group the accuracy is the share of rows where ``predicted`` equals
    ``moa_v1``. A drug counts as a hit when that accuracy is at least
    ``1 / k``, where ``k`` is the number of distinct values in the group's
    ``predicted`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``rdkit``, ``moa_v1`` and ``predicted``.

    Returns
    -------
    float
        ``hits / number_of_drugs``. Returns ``nan`` when ``df`` contains no
        drug at all (e.g. an empty frame) instead of dividing by zero.
        Rows whose ``rdkit`` value is missing are ignored.
    """
    n_drugs = 0
    n_hits = 0
    # A single groupby pass replaces re-filtering the whole frame once per drug.
    for _, rows in df.groupby('rdkit', sort=False):
        n_drugs += 1
        score = (rows['moa_v1'] == rows['predicted']).mean()
        nunique_moa = rows['predicted'].nunique()
        # nunique_moa is 0 only if every prediction in the group is missing;
        # such a drug can never be a hit.
        if nunique_moa and score >= (1 / nunique_moa):
            n_hits += 1
    if n_drugs == 0:
        return float('nan')
    return n_hits / n_drugs


In [90]:
#le.inverse_transform(pred.astype(int))