In [1]:
import tensorflow as tf
# Build the session config with GPU memory growth enabled (allow_growth=True),
# then open the TF1-compatibility session with it.
config = tf.compat.v1.ConfigProto(
    gpu_options=tf.compat.v1.GPUOptions(allow_growth=True)
)
sess = tf.compat.v1.Session(config=config)

In [2]:
from Dataset.Dataset import CSVLoader
from compoundFeaturization.rdkitFingerprints import MorganFingerprint
from featureSelection.baseFeatureSelector import LowVarianceFS
from splitters.splitters import SingletaskStratifiedSplitter
from models.kerasModels import KerasModel
from metrics.Metrics import Metric
from metrics.metricsFunctions import f1_score, roc_auc_score, precision_score, accuracy_score, confusion_matrix, classification_report


import tensorflow as tf
# Record the TensorFlow version this notebook was executed with.
print(tf.__version__)
from tensorflow.keras.losses import binary_crossentropy

2.2.0


In [3]:
# Load the preprocessed CSV into a dataset object.
# NOTE(review): the positional arguments look like (file, SMILES column,
# label columns, id column) -- confirm against CSVLoader's signature.
dataset = CSVLoader(
    'preprocessed_dataset_wfoodb.csv',
    'Smiles',
    ['Class'],
    'ID',
)  # optional, currently disabled: chunk_size=4000
print(dataset.get_shape())

(23290,) (23290,) (0,) (23290,)
((23290,), (23290,), (0,), (23290,))


In [4]:
# Featurization: replace the dataset with its Morgan-fingerprint featurized version.
featurizer = MorganFingerprint()
dataset = featurizer.featurize(dataset)
dataset.get_shape()

Featurizing datapoint 0
Featurizing datapoint 1000
Featurizing datapoint 2000
Featurizing datapoint 3000
Featurizing datapoint 4000
Featurizing datapoint 5000
Featurizing datapoint 6000


RDKit ERROR: [18:10:50] Explicit valence for atom # 1 Cl, 4, is greater than permitted


error in smile: O=[Cl]=O
Featurizing datapoint 7000


RDKit ERROR: [18:10:53] Explicit valence for atom # 3 B, 4, is greater than permitted
RDKit ERROR: [18:10:53] Explicit valence for atom # 1 Cl, 9, is greater than permitted


error in smile: OB1O[B]2(O)OB(O)O[B](O)(O1)O2
error in smile: O=[Cl-](=O)(=O)=O
Featurizing datapoint 8000
Featurizing datapoint 9000
Featurizing datapoint 10000
Featurizing datapoint 11000
Featurizing datapoint 12000
Featurizing datapoint 13000
Featurizing datapoint 14000


RDKit ERROR: [18:11:09] Explicit valence for atom # 0 P, 11, is greater than permitted


error in smile: [P](OCC=C(C)C)(OCC=C(C)C)(=O)(OP(OCC=C(C)C)(OCC=C(C)C)=O)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)CC=C(C)C
Featurizing datapoint 15000
Featurizing datapoint 16000
Featurizing datapoint 17000
Featurizing datapoint 18000
Featurizing datapoint 19000
Featurizing datapoint 20000
Featurizing datapoint 21000
Featurizing datapoint 22000
Featurizing datapoint 23000
Elements with indexes:  [6257, 7708, 7709, 14244]  were removed due to the presence of NAs!
The elements in question are:  ['O=[Cl]=O' 'OB1O[B]2(O)OB(O)O[B](O)(O1)O2' 'O=[Cl-](=O)(=O)=O'
 '[P](OCC=C(C)C)(OCC=C(C)C)(=O)(OP(OCC=C(C)C)(OCC=C(C)C)=O)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)CC=C(C)C']
(23286,) (23286,) (23286, 1024) (23286,)


((23286,), (23286,), (23286, 1024), (23286,))

In [5]:
# Feature selection with the low-variance selector.
# NOTE(review): 0.15 is presumably the variance threshold -- confirm against LowVarianceFS.
low_variance_selector = LowVarianceFS(0.15)
dataset = low_variance_selector.featureSelection(dataset)
dataset.get_shape()

(23286,) (23286,) (23286, 49) (23286,)


((23286,), (23286,), (23286, 49), (23286,))

In [6]:
# Data split: 60% train / 20% validation / 20% test.
split_fractions = {'frac_train': 0.6, 'frac_valid': 0.2, 'frac_test': 0.2}
splitter = SingletaskStratifiedSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset=dataset, **split_fractions
)

In [7]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, GaussianNoise, Conv1D, Flatten, Reshape
from tensorflow.keras.optimizers import Adadelta, Adam, RMSprop
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import numpy as np


#Hyperparameters for the network
#DENSE = 128
#DROPOUT = 0.5
#C1_K  = 8 #Number of kernels/feature extractors for first layer
#C1_S  = 32 #Width of the convolutional mini networks
#C2_K  = 16
#C2_S  = 32

#activation='relu'

# Size of the second axis of the training feature matrix; make_cnn_model below
# captures this value as the default for its `input_dim` argument at definition time.
input_dim = train_dataset.features.shape[1]

def make_cnn_model(input_dim=input_dim,
                   g_noise=0.05,
                   DENSE=128,
                   DROPOUT=0.5,
                   C1_K=8,
                   C1_S=32,
                   C2_K=16,
                   C2_S=32,
                   activation='relu',
                   loss='binary_crossentropy',
                   optimizer='adadelta',
                   learning_rate=0.01,
                   metrics='accuracy'):
    """Build and compile a small 1D-convolutional binary classifier.

    Layer stack: GaussianNoise -> Reshape to (input_dim, 1) -> two Conv1D
    layers ("same" padding) -> Flatten -> Dropout -> Dense -> Dense(1, sigmoid).

    Parameters
    ----------
    input_dim : int
        Number of input features per sample (defaults to the module-level
        ``input_dim`` at definition time).
    g_noise : float
        Value passed to the ``GaussianNoise`` input layer.
    DENSE : int
        Number of units in the hidden dense layer.
    DROPOUT : float
        Rate passed to the ``Dropout`` layer.
    C1_K, C1_S : int
        Number of filters and kernel size of the first ``Conv1D`` layer.
    C2_K, C2_S : int
        Number of filters and kernel size of the second ``Conv1D`` layer.
    activation : str
        Activation used by both convolutions and the hidden dense layer.
    loss : str or callable
        Loss forwarded to ``model.compile``.
    optimizer : str or optimizer instance
        ``'adadelta'``, ``'adam'`` or ``'rmsprop'`` build the matching Keras
        optimizer with ``learning_rate``. Any other value is forwarded to
        ``model.compile`` unchanged (``learning_rate`` is then not applied).
    learning_rate : float
        Learning rate for the optimizers constructed by name.
    metrics : str or list
        Metric(s) forwarded to ``model.compile``; a single string is wrapped
        in a list.

    Returns
    -------
    tensorflow.keras.models.Sequential
        The compiled model.
    """
    model = Sequential()
    # Adding a bit of GaussianNoise also works as regularization
    model.add(GaussianNoise(g_noise, input_shape=(input_dim,)))
    # Conv1D needs a trailing channel axis: (input_dim,) -> (input_dim, 1)
    model.add(Reshape((input_dim, 1)))
    # First two arguments of Conv1D are number of filters + kernel size
    model.add(Conv1D(C1_K, C1_S, activation=activation, padding="same"))
    model.add(Conv1D(C2_K, C2_S, padding="same", activation=activation))
    model.add(Flatten())
    model.add(Dropout(DROPOUT))
    model.add(Dense(DENSE, activation=activation))
    model.add(Dense(1, activation='sigmoid'))

    # Name -> optimizer class. The original code tested for the misspelling
    # 'rsmprop', so 'rmsprop' fell through to the pass-through branch and the
    # requested learning rate was ignored. The misspelled key is kept so any
    # caller relying on it still gets an RMSprop with the given learning rate.
    optimizer_classes = {
        'adadelta': Adadelta,
        'adam': Adam,
        'rmsprop': RMSprop,
        'rsmprop': RMSprop,
    }
    if isinstance(optimizer, str) and optimizer in optimizer_classes:
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        opt = optimizer_classes[optimizer](learning_rate=learning_rate)
    else:
        # Unknown name or a ready-made optimizer instance: let Keras resolve it.
        opt = optimizer

    # Keras expects a list of metrics; accept a bare string for convenience.
    if isinstance(metrics, str):
        metrics = [metrics]

    model.compile(loss=loss, optimizer=opt, metrics=metrics)

    return model

In [8]:
#from models.kerasModels import KerasModel

#input_dim = train_dataset.features.shape[1]
#print(input_dim)
#model = KerasModel(make_cnn_model, epochs = 150, verbose=1)

In [9]:
#print(train_dataset.features.shape, train_dataset.y.shape)


#model.fit(train_dataset)

In [10]:
#metrics = [Metric(roc_auc_score), 
#           Metric(precision_score), 
#           Metric(accuracy_score), 
#           Metric(confusion_matrix), 
#           Metric(classification_report)]

#print('training set score:', model.evaluate(train_dataset, metrics))
#print('test set score:', model.evaluate(test_dataset, metrics))

In [13]:
from parameterOptimization.HyperparameterOpt import GridHyperparamOpt
from sklearn.metrics import f1_score, make_scorer


# Hyperparameter optimization over the arguments of make_cnn_model
optimizer = GridHyperparamOpt(make_cnn_model)

# Candidate values per tunable argument (3 options each).
params_dict = {
    'optimizer': ['adam', 'rmsprop', 'adadelta'],
    'DROPOUT': [0.2, 0.4, 0.5],
    'learning_rate': [0.01, 0.001, 0.0001],
    'activation': ['relu', 'elu', 'selu'],
    'g_noise': [0.01, 0.05, 0.005],
}

# TODO: multiple scoring not working
# scoring = {'f1': make_scorer(f1_score), 'Accuracy': 'accuracy'}
scoring = make_scorer(f1_score)

# The search returns a 3-tuple: (best model, its hyperparameters, all results).
search_result = optimizer.hyperparam_search(
    params_dict,
    train_dataset,
    valid_dataset,
    scoring,
    cv=3,
    n_jobs=1,
    verbose=3,
)
best_model, best_hyperparams, all_results = search_result

#print('#################')
#print(best_hyperparams)
#print(best_model)

Fitting 15 random models from a space of 243 possible models.
Fitting model 1/15
hyperparameters: {'optimizer': 'adam', 'DROPOUT': 0.2, 'learning_rate': 0.01, 'activation': 'relu', 'g_noise': 0.05}
<class 'sklearn.metrics._scorer._PredictScorer'>
METRIC:  make_scorer(f1_score)
Fitting 15 random models from a space of 243 possible models.
(13968, 49) 4656.0
Fitting 3 folds for each of 15 candidates, totalling 45 fits
[CV] optimizer=adam, learning_rate=0.0001, g_noise=0.05, activation=selu, DROPOUT=0.5 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
[CV]  optimizer=adam, learning_rate=0.0001, g_noise=0.05, activation=selu, DROPOUT=0.5, score=0.044, total=   2.8s
[CV] optimizer=adam, learning_rate=0.0001, g_noise=0.05, activation=selu, DROPOUT=0.5 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.8s remaining:    0.0s


[CV]  optimizer=adam, learning_rate=0.0001, g_noise=0.05, activation=selu, DROPOUT=0.5, score=0.041, total=   1.2s
[CV] optimizer=adam, learning_rate=0.0001, g_noise=0.05, activation=selu, DROPOUT=0.5 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.0s remaining:    0.0s


[CV]  optimizer=adam, learning_rate=0.0001, g_noise=0.05, activation=selu, DROPOUT=0.5, score=0.000, total=   1.1s
[CV] optimizer=rmsprop, learning_rate=0.001, g_noise=0.05, activation=relu, DROPOUT=0.4 
[CV]  optimizer=rmsprop, learning_rate=0.001, g_noise=0.05, activation=relu, DROPOUT=0.4, score=0.119, total=   1.3s
[CV] optimizer=rmsprop, learning_rate=0.001, g_noise=0.05, activation=relu, DROPOUT=0.4 
[CV]  optimizer=rmsprop, learning_rate=0.001, g_noise=0.05, activation=relu, DROPOUT=0.4, score=0.151, total=   1.2s
[CV] optimizer=rmsprop, learning_rate=0.001, g_noise=0.05, activation=relu, DROPOUT=0.4 
[CV]  optimizer=rmsprop, learning_rate=0.001, g_noise=0.05, activation=relu, DROPOUT=0.4, score=0.000, total=   1.2s
[CV] optimizer=adam, learning_rate=0.01, g_noise=0.01, activation=relu, DROPOUT=0.4 
[CV]  optimizer=adam, learning_rate=0.01, g_noise=0.01, activation=relu, DROPOUT=0.4, score=0.000, total=   1.1s
[CV] optimizer=adam, learning_rate=0.01, g_noise=0.01, activation=rel

[CV]  optimizer=adam, learning_rate=0.01, g_noise=0.05, activation=relu, DROPOUT=0.5, score=0.000, total=   1.1s
[CV] optimizer=adam, learning_rate=0.01, g_noise=0.05, activation=relu, DROPOUT=0.5 
[CV]  optimizer=adam, learning_rate=0.01, g_noise=0.05, activation=relu, DROPOUT=0.5, score=0.000, total=   1.1s
[CV] optimizer=adam, learning_rate=0.01, g_noise=0.05, activation=relu, DROPOUT=0.5 
[CV]  optimizer=adam, learning_rate=0.01, g_noise=0.05, activation=relu, DROPOUT=0.5, score=0.000, total=   1.1s
[CV] optimizer=adadelta, learning_rate=0.01, g_noise=0.005, activation=elu, DROPOUT=0.2 
[CV]  optimizer=adadelta, learning_rate=0.01, g_noise=0.005, activation=elu, DROPOUT=0.2, score=0.000, total=   1.0s
[CV] optimizer=adadelta, learning_rate=0.01, g_noise=0.005, activation=elu, DROPOUT=0.2 
[CV]  optimizer=adadelta, learning_rate=0.01, g_noise=0.005, activation=elu, DROPOUT=0.2, score=0.000, total=   1.1s
[CV] optimizer=adadelta, learning_rate=0.01, g_noise=0.005, activation=elu, DRO

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:   54.3s finished



 
 Best make_scorer(f1_score): 0.138994 using {'optimizer': 'rmsprop', 'learning_rate': 0.001, 'g_noise': 0.005, 'activation': 'elu', 'DROPOUT': 0.2}

 make_scorer(f1_score): 0.028265 (0.020036) with: {'optimizer': 'adam', 'learning_rate': 0.0001, 'g_noise': 0.05, 'activation': 'selu', 'DROPOUT': 0.5} 


 make_scorer(f1_score): 0.089936 (0.064873) with: {'optimizer': 'rmsprop', 'learning_rate': 0.001, 'g_noise': 0.05, 'activation': 'relu', 'DROPOUT': 0.4} 


 make_scorer(f1_score): 0.000000 (0.000000) with: {'optimizer': 'adam', 'learning_rate': 0.01, 'g_noise': 0.01, 'activation': 'relu', 'DROPOUT': 0.4} 


 make_scorer(f1_score): 0.000000 (0.000000) with: {'optimizer': 'adam', 'learning_rate': 0.0001, 'g_noise': 0.01, 'activation': 'relu', 'DROPOUT': 0.2} 


 make_scorer(f1_score): 0.000000 (0.000000) with: {'optimizer': 'adadelta', 'learning_rate': 0.01, 'g_noise': 0.005, 'activation': 'selu', 'DROPOUT': 0.4} 


 make_scorer(f1_score): 0.042280 (0.038695) with: {'optimizer': 'adade

In [None]:
# List the scorer names scikit-learn accepts for `scoring=`.
# Import the submodule explicitly so this cell does not depend on an earlier
# cell having loaded `sklearn.metrics`.
import sklearn.metrics

try:
    # Public accessor available in newer scikit-learn releases.
    scorer_names = sklearn.metrics.get_scorer_names()
except AttributeError:
    # Older releases only expose the SCORERS mapping.
    scorer_names = sklearn.metrics.SCORERS.keys()
sorted(scorer_names)