In [1]:
import tensorflow as tf
# Enable on-demand GPU memory growth (allow_growth) and open a TF1-compat
# session that carries this configuration.
gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
config = tf.compat.v1.ConfigProto(gpu_options=gpu_options)
sess = tf.compat.v1.Session(config=config)

In [3]:
from loaders.Loaders import CSVLoader
from compoundFeaturization.rdkitFingerprints import MorganFingerprint
from featureSelection.baseFeatureSelector import LowVarianceFS
from splitters.splitters import SingletaskStratifiedSplitter
from models.kerasModels import KerasModel
from metrics.Metrics import Metric
from metrics.metricsFunctions import roc_auc_score, precision_score, accuracy_score, confusion_matrix, classification_report


import tensorflow as tf
print(tf.version.VERSION)
from tensorflow.keras.losses import binary_crossentropy

2.4.1


In [4]:
# Load dataset
# Read the preprocessed CSV; the keyword arguments name the molecule, label
# and id columns. The loader gets its own name so that `dataset` only ever
# refers to the dataset object itself.
loader = CSVLoader(dataset_path='preprocessed_dataset_wfoodb.csv',
                   mols_field='Smiles',
                   labels_fields='Class',
                   id_field='ID')
dataset = loader.create_dataset()
# get_shape() prints the shapes itself and returns None, so it is called bare
# rather than inside print().
dataset.get_shape()

Mols_shape:  23290
Features_shape:  X not defined!
Labels_shape:  (23290,)
None


In [5]:
# Featurization
# Replace `dataset` with the output of the Morgan fingerprint featurizer,
# then report the resulting shapes.
featurizer = MorganFingerprint()
dataset = featurizer.featurize(dataset)
dataset.get_shape()

Featurizing datapoint 0
Featurizing datapoint 1000
Featurizing datapoint 2000
Featurizing datapoint 3000
Featurizing datapoint 4000
Featurizing datapoint 5000
Featurizing datapoint 6000


RDKit ERROR: [14:07:15] Explicit valence for atom # 1 Cl, 4, is greater than permitted


error in smile: O=[Cl]=O
Featurizing datapoint 7000


RDKit ERROR: [14:07:18] Explicit valence for atom # 3 B, 4, is greater than permitted
RDKit ERROR: [14:07:18] Explicit valence for atom # 1 Cl, 9, is greater than permitted


error in smile: OB1O[B]2(O)OB(O)O[B](O)(O1)O2
error in smile: O=[Cl-](=O)(=O)=O
Featurizing datapoint 8000
Featurizing datapoint 9000
Featurizing datapoint 10000
Featurizing datapoint 11000
Featurizing datapoint 12000
Featurizing datapoint 13000
Featurizing datapoint 14000


RDKit ERROR: [14:07:36] Explicit valence for atom # 0 P, 11, is greater than permitted


error in smile: [P](OCC=C(C)C)(OCC=C(C)C)(=O)(OP(OCC=C(C)C)(OCC=C(C)C)=O)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)CC=C(C)C
Featurizing datapoint 15000
Featurizing datapoint 16000
Featurizing datapoint 17000
Featurizing datapoint 18000
Featurizing datapoint 19000
Featurizing datapoint 20000
Featurizing datapoint 21000
Featurizing datapoint 22000
Featurizing datapoint 23000
Elements with indexes:  [6257, 7708, 7709, 14244]  were removed due to the presence of NAs!
The elements in question are:  ['O=[Cl]=O' 'OB1O[B]2(O)OB(O)O[B](O)(O1)O2' 'O=[Cl-](=O)(=O)=O'
 '[P](OCC=C(C)C)(OCC=C(C)C)(=O)(OP(OCC=C(C)C)(OCC=C(C)C)=O)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)CC=C(C)C']
Mols_shape:  23286
Features_shape:  (23286, 1024)
Labels_shape:  (23286,)


In [6]:
# Feature selection
# 0.15 is handed straight to LowVarianceFS (presumably its variance
# threshold -- TODO confirm against the class definition).
low_variance_selector = LowVarianceFS(0.15)
dataset = low_variance_selector.featureSelection(dataset)
dataset.get_shape()

Mols_shape:  23286
Features_shape:  (23286, 49)
Labels_shape:  (23286,)


In [7]:
# Data split
# Fractions requested for the train / validation / test partitions.
# NOTE(review): no seed is passed here, so the partition may differ between
# runs -- confirm whether the splitter accepts one.
split_fractions = {'frac_train': 0.6, 'frac_valid': 0.2, 'frac_test': 0.2}
splitter = SingletaskStratifiedSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset=dataset, **split_fractions)

In [8]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import numpy as np


# Number of feature columns in the training split; create_model uses this
# value as the default for its input_dim argument.
input_dim = train_dataset.X.shape[1]


def create_model(optimizer='adam', dropout=0.5, input_dim=input_dim):
    """Build and compile a small feed-forward binary classifier.

    The network is Dense(12, relu) -> Dropout -> Dense(8, relu) ->
    Dense(1, sigmoid), compiled with binary cross-entropy loss and the
    accuracy metric.

    Args:
        optimizer: Optimizer passed to ``model.compile``.
        dropout: Rate given to the Dropout layer after the first hidden layer.
        input_dim: Width of the input vector. The default is the value the
            module-level ``input_dim`` had when this function was defined.

    Returns:
        The compiled ``Sequential`` model.
    """
    layers = [
        Dense(12, input_dim=input_dim, activation='relu'),
        Dropout(dropout),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layers)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model


In [10]:
from models.kerasModels import KerasModel

# Width of the training feature matrix, printed as a sanity check.
input_dim = train_dataset.X.shape[1]
print(input_dim)
# Wrap the create_model builder in the project's KerasModel; epochs, verbose
# and optimizer are supplied as keyword arguments (presumably forwarded to
# training / the builder -- confirm against KerasModel).
model = KerasModel(create_model, epochs = 5, verbose=1, optimizer='adam')

49


In [11]:
# Show the shapes of the training split's feature and label arrays.
print(train_dataset.X.shape, train_dataset.y.shape)


# Train on the training split; fit() receives the dataset object directly.
model.fit(train_dataset)


(13968, 49) (13968,)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [12]:
# Wrap each scoring function in a Metric; the list is passed to model.evaluate.
metric_functions = (roc_auc_score, precision_score, accuracy_score,
                    confusion_matrix, classification_report)
metrics = [Metric(metric_function) for metric_function in metric_functions]


# Score the fitted model on the training split, then on the test split.
for label, split in (('training set score:', train_dataset),
                     ('test set score:', test_dataset)):
    print(label, model.evaluate(split, metrics))

 208/1397 [===>..........................] - ETA: 0s



roc_auc_score: 
 0.5
precision_score: 
 0.0
accuracy_score: 
 0.9435137457044673
confusion_matrix: 
 [[13179     0]
 [  789     0]]
classification_report: 
               precision    recall  f1-score   support

           0       0.94      1.00      0.97     13179
           1       0.00      0.00      0.00       789

    accuracy                           0.94     13968
   macro avg       0.47      0.50      0.49     13968
weighted avg       0.89      0.94      0.92     13968

training set score: {'roc_auc_score': 0.5, 'precision_score': 0.0, 'accuracy_score': 0.9435137457044673, 'confusion_matrix': 3492.0, 'classification_report': None}
 88/466 [====>.........................] - ETA: 0s

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


roc_auc_score: 
 0.5
precision_score: 
 0.0
accuracy_score: 
 0.9432989690721649
confusion_matrix: 
 [[4392    0]
 [ 264    0]]
classification_report: 
               precision    recall  f1-score   support

           0       0.94      1.00      0.97      4392
           1       0.00      0.00      0.00       264

    accuracy                           0.94      4656
   macro avg       0.47      0.50      0.49      4656
weighted avg       0.89      0.94      0.92      4656

test set score: {'roc_auc_score': 0.5, 'precision_score': 0.0, 'accuracy_score': 0.9432989690721649, 'confusion_matrix': 1164.0, 'classification_report': None}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
from parameterOptimization.HyperparameterOpt import HyperparamOpt_Valid, HyperparamOpt_CV
# Hyperparameter optimization against the held-out validation split
optimizer = HyperparamOpt_Valid(create_model)

# Search space: 2 optimizers x 3 dropout rates. The keys match the keyword
# arguments of create_model.
params_dict = {'optimizer' : ['adam', 'rmsprop'],
              'dropout' : [0.2, 0.4, 0.5]}

# Fit on train_dataset and score each candidate with ROC AUC on valid_dataset.
best_model, best_hyperparams, all_results = optimizer.hyperparam_search(params_dict, train_dataset, 
                                                                        valid_dataset, Metric(roc_auc_score))

print('#################')
print(best_hyperparams)
print(best_model)

# Evaluate the selected model on the test split with the full metric list.
# TODO: check the error that occurs when using more metrics.
best_model.evaluate(test_dataset, metrics)

MODE:  classification
Fitting 6 random models from a space of 6 possible models.
Fitting model 1/6
hyperparameters: {'optimizer': 'adam', 'dropout': 0.2}


Using TensorFlow backend.


expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.7999405379626592
Model 1/6, Metric roc_auc_score, Validation set 1: 0.799941
	best_validation_score so far: 0.799941
Fitting model 2/6
hyperparameters: {'optimizer': 'adam', 'dropout': 0.4}




expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.7903166894445796
Model 2/6, Metric roc_auc_score, Validation set 2: 0.790317
	best_validation_score so far: 0.799941
Fitting model 3/6
hyperparameters: {'optimizer': 'adam', 'dropout': 0.5}




expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.7977278058162008
Model 3/6, Metric roc_auc_score, Validation set 3: 0.797728
	best_validation_score so far: 0.799941
Fitting model 4/6
hyperparameters: {'optimizer': 'rmsprop', 'dropout': 0.2}




expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8033026963913381
Model 4/6, Metric roc_auc_score, Validation set 4: 0.803303
	best_validation_score so far: 0.803303
Fitting model 5/6
hyperparameters: {'optimizer': 'rmsprop', 'dropout': 0.4}




expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8100927936684614
Model 5/6, Metric roc_auc_score, Validation set 5: 0.810093
	best_validation_score so far: 0.810093
Fitting model 6/6
hyperparameters: {'optimizer': 'rmsprop', 'dropout': 0.5}




expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8118909360640285
Model 6/6, Metric roc_auc_score, Validation set 6: 0.811891
	best_validation_score so far: 0.811891




roc_auc_score: 
 0.8107335276548482
Best hyperparameters: ('rmsprop', 0.5)
train_score: 0.810734
validation_score: 0.811891
#################
('rmsprop', 0.5)
SklearnModel(mode='/tmp/tmpm9dgju9y',
             model=<tensorflow.python.keras.engine.sequential.Sequential object at 0x7fa6187212e8>)
roc_auc_score: 
 0.8208515310205884
precision_score: 
 0.0
accuracy_score: 
 0.9432989690721649
confusion_matrix: 
 [[4392    0]
 [ 264    0]]
classification_report: 
               precision    recall  f1-score   support

           0       0.94      1.00      0.97      4392
           1       0.00      0.00      0.00       264

    accuracy                           0.94      4656
   macro avg       0.47      0.50      0.49      4656
weighted avg       0.89      0.94      0.92      4656



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'roc_auc_score': 0.8208515310205884,
 'precision_score': 0.0,
 'accuracy_score': 0.9432989690721649,
 'confusion_matrix': 1164.0,
 'classification_report': None}

In [14]:
# Hyperparameter optimization with cross-validation
optimizer = HyperparamOpt_CV(create_model)

# Search the params_dict grid on train_dataset with cv=3 and 'accuracy' as the
# scoring argument.
# NOTE(review): in the recorded run every candidate reached the same accuracy
# (0.943514), so accuracy may not discriminate between settings on this data
# -- consider a different scoring metric.
# NOTE(review): despite its name, best_rf holds a KerasModel (see the printed
# repr below), not a random forest.
best_rf, best_hyperparams, all_results = optimizer.hyperparam_search('keras',
                                                                     params_dict, 
                                                                     train_dataset,  
                                                                     'accuracy', 
                                                                     cv=3,
                                                                     n_iter_search=10)

print('#################')
print(best_hyperparams)
print(best_rf)
# Evaluate the refitted best model on the test split with the full metric list.
best_rf.evaluate(test_dataset, metrics)

MODEL TYPE:  keras




<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x7fa5f8782eb8>

 
 Best accuracy: 0.943514 using {'dropout': 0.2, 'optimizer': 'adam'}

 accuracy: 0.943514 (0.000000) with: {'dropout': 0.2, 'optimizer': 'adam'} 


 accuracy: 0.943514 (0.000000) with: {'dropout': 0.2, 'optimizer': 'rmsprop'} 


 accuracy: 0.943514 (0.000000) with: {'dropout': 0.4, 'optimizer': 'adam'} 


 accuracy: 0.943514 (0.000000) with: {'dropout': 0.4, 'optimizer': 'rmsprop'} 


 accuracy: 0.943514 (0.000000) with: {'dropout': 0.5, 'optimizer': 'adam'} 


 accuracy: 0.943514 (0.000000) with: {'dropout': 0.5, 'optimizer': 'rmsprop'} 

Fitting best model!
#################
{'dropout': 0.2, 'optimizer': 'adam'}
KerasModel(batch_size=None, epochs=None, model_builder=None,
           model_dir='/tmp/tmpbsm9utlr', verbose=None)
roc_auc_score: 
 0.585899983440967




precision_score: 
 0.5217391304347826
accuracy_score: 
 0.9441580756013745
confusion_matrix: 
 [[4348   44]
 [ 216   48]]
classification_report: 
               precision    recall  f1-score   support

           0       0.95      0.99      0.97      4392
           1       0.52      0.18      0.27       264

    accuracy                           0.94      4656
   macro avg       0.74      0.59      0.62      4656
weighted avg       0.93      0.94      0.93      4656



{'roc_auc_score': 0.585899983440967,
 'precision_score': 0.5217391304347826,
 'accuracy_score': 0.9441580756013745,
 'confusion_matrix': 1164.0,
 'classification_report': None}