In [1]:
import tensorflow as tf
# Ask TensorFlow to grow its GPU memory allocation as needed instead of
# reserving the whole device up front.
gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
config = tf.compat.v1.ConfigProto(gpu_options=gpu_options)
# An InteractiveSession is used here in place of a plain
# tf.compat.v1.Session(config=config).
sess = tf.compat.v1.InteractiveSession(config=config)

In [2]:
from loaders.Loaders import CSVLoader
from compoundFeaturization.rdkitFingerprints import MorganFingerprint
from featureSelection.baseFeatureSelector import LowVarianceFS
from splitters.splitters import SingletaskStratifiedSplitter
from models.kerasModels import KerasModel
from metrics.Metrics import Metric
from metrics.metricsFunctions import f1_score, roc_auc_score, precision_score, accuracy_score, confusion_matrix, classification_report


import tensorflow as tf
print(tf.version.VERSION)
from tensorflow.keras.losses import binary_crossentropy

2.4.1


In [3]:
#Load Dataset
# The loader gets its own name so that `dataset` only ever refers to a dataset
# object (previously it was briefly bound to the CSVLoader itself).
loader = CSVLoader(dataset_path='preprocessed_dataset_wfoodb.csv',
                   mols_field='Smiles',
                   labels_fields='Class',
                   id_field='ID')#, shard_size=4000)
dataset = loader.create_dataset()
# get_shape() prints the shapes itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
dataset.get_shape()

Mols_shape:  23290
Features_shape:  X not defined!
Labels_shape:  (23290,)
None


In [4]:
# Featurization: replace the dataset with its Morgan-fingerprint featurized version.
fingerprint_featurizer = MorganFingerprint()
dataset = fingerprint_featurizer.featurize(dataset)
dataset.get_shape()

Featurizing datapoint 0
Featurizing datapoint 1000
Featurizing datapoint 2000
Featurizing datapoint 3000
Featurizing datapoint 4000
Featurizing datapoint 5000
Featurizing datapoint 6000


RDKit ERROR: [14:32:33] Explicit valence for atom # 1 Cl, 4, is greater than permitted


error in smile: O=[Cl]=O
Featurizing datapoint 7000


RDKit ERROR: [14:32:37] Explicit valence for atom # 3 B, 4, is greater than permitted
RDKit ERROR: [14:32:37] Explicit valence for atom # 1 Cl, 9, is greater than permitted


error in smile: OB1O[B]2(O)OB(O)O[B](O)(O1)O2
error in smile: O=[Cl-](=O)(=O)=O
Featurizing datapoint 8000
Featurizing datapoint 9000
Featurizing datapoint 10000
Featurizing datapoint 11000
Featurizing datapoint 12000
Featurizing datapoint 13000
Featurizing datapoint 14000


RDKit ERROR: [14:32:55] Explicit valence for atom # 0 P, 11, is greater than permitted


error in smile: [P](OCC=C(C)C)(OCC=C(C)C)(=O)(OP(OCC=C(C)C)(OCC=C(C)C)=O)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)CC=C(C)C
Featurizing datapoint 15000
Featurizing datapoint 16000
Featurizing datapoint 17000
Featurizing datapoint 18000
Featurizing datapoint 19000
Featurizing datapoint 20000
Featurizing datapoint 21000
Featurizing datapoint 22000
Featurizing datapoint 23000
Elements with indexes:  [6257, 7708, 7709, 14244]  were removed due to the presence of NAs!
The elements in question are:  ['O=[Cl]=O' 'OB1O[B]2(O)OB(O)O[B](O)(O1)O2' 'O=[Cl-](=O)(=O)=O'
 '[P](OCC=C(C)C)(OCC=C(C)C)(=O)(OP(OCC=C(C)C)(OCC=C(C)C)=O)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)CC=C(C)C']
Mols_shape:  23286
Features_shape:  (23286, 1024)
Labels_shape:  (23286,)


In [5]:
# Feature selection with the low-variance selector (constructed with 0.15).
low_variance_selector = LowVarianceFS(0.15)
dataset = low_variance_selector.featureSelection(dataset)
dataset.get_shape()

Mols_shape:  23286
Features_shape:  (23286, 49)
Labels_shape:  (23286,)


In [6]:
# Data split: 60% train / 20% validation / 20% test.
splitter = SingletaskStratifiedSplitter()
split_fractions = {'frac_train': 0.6, 'frac_valid': 0.2, 'frac_test': 0.2}
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset=dataset, **split_fractions)

In [7]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, GaussianNoise, Conv1D, Flatten, Reshape
from tensorflow.keras.optimizers import Adadelta, Adam, RMSprop
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import numpy as np


# Hyperparameters for the network.
# These are kept for reference only: the same values are the keyword defaults
# of make_cnn_model, which is what actually consumes them.
#DENSE = 128
#DROPOUT = 0.5
#C1_K  = 8 # Number of kernels/feature extractors for the first conv layer
#C1_S  = 32 # Kernel width of the first conv layer
#C2_K  = 16
#C2_S  = 32

#activation='relu'

# Size of the second axis of the training feature matrix; used as the default
# input width of make_cnn_model (bound when that function is defined).
input_dim = train_dataset.X.shape[1]

def make_cnn_model(input_dim=input_dim,
                   g_noise=0.05,
                   DENSE=128,
                   DROPOUT=0.5,
                   C1_K=8,
                   C1_S=32,
                   C2_K=16,
                   C2_S=32,
                   activation='relu',
                   loss='binary_crossentropy',
                   optimizer='adadelta',
                   learning_rate=0.01,
                   metrics='accuracy'):
    """Build and compile a 1D-convolutional network with a single sigmoid output.

    Layer stack: GaussianNoise -> Reshape to (input_dim, 1) -> two Conv1D
    layers -> Flatten -> Dropout -> Dense -> Dense(1, sigmoid).

    Parameters
    ----------
    input_dim : int
        Length of each input feature vector. The default is the value of the
        module-level ``input_dim`` at the time this function was defined.
    g_noise : float
        Standard deviation passed to the GaussianNoise input layer.
    DENSE : int
        Number of units in the hidden Dense layer.
    DROPOUT : float
        Rate passed to the Dropout layer.
    C1_K, C1_S : int
        Number of filters and kernel size of the first Conv1D layer.
    C2_K, C2_S : int
        Number of filters and kernel size of the second Conv1D layer.
    activation : str
        Activation used by both Conv1D layers and the hidden Dense layer.
    loss : str or callable
        Loss forwarded to ``model.compile``.
    optimizer : str or optimizer instance
        ``'adadelta'``, ``'adam'`` or ``'rmsprop'`` build the matching Keras
        optimizer with ``learning_rate``. Anything else is forwarded to
        ``model.compile`` unchanged (``learning_rate`` is then not applied).
    learning_rate : float
        Learning rate for the three named optimizers above.
    metrics : str or list
        Metric(s) forwarded to ``model.compile``; a single string is wrapped
        in a list.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    model = Sequential()
    # Adding a bit of GaussianNoise also works as regularization
    model.add(GaussianNoise(g_noise, input_shape=(input_dim,)))
    # Conv1D expects a channel axis, so give each feature vector one channel
    model.add(Reshape((input_dim, 1)))
    # First two Conv1D arguments are the number of filters and the kernel size
    model.add(Conv1D(C1_K, C1_S, activation=activation, padding="same"))
    model.add(Conv1D(C2_K, C2_S, activation=activation, padding="same"))
    model.add(Flatten())
    model.add(Dropout(DROPOUT))
    model.add(Dense(DENSE, activation=activation))
    model.add(Dense(1, activation='sigmoid'))

    # The original branch tested for the misspelling 'rsmprop', so the correctly
    # spelled 'rmsprop' fell through and was compiled with Keras's default
    # learning rate. Both spellings are accepted so existing callers keep working.
    optimizer_classes = {
        'adadelta': Adadelta,
        'adam': Adam,
        'rmsprop': RMSprop,
        'rsmprop': RMSprop,
    }
    optimizer_cls = optimizer_classes.get(optimizer) if isinstance(optimizer, str) else None
    if optimizer_cls is not None:
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias
        opt = optimizer_cls(learning_rate=learning_rate)
    else:
        opt = optimizer

    metric_list = [metrics] if isinstance(metrics, str) else metrics
    model.compile(loss=loss, optimizer=opt, metrics=metric_list)

    return model

In [8]:
#from models.kerasModels import KerasModel

#input_dim = train_dataset.X.shape[1]
#print(input_dim)
#model = KerasModel(make_cnn_model, epochs = 150, verbose=1)

In [9]:
#print(train_dataset.X.shape, train_dataset.y.shape)


#model.fit(train_dataset)

In [10]:
#metrics = [Metric(roc_auc_score), 
#           Metric(precision_score), 
#           Metric(accuracy_score), 
#           Metric(confusion_matrix), 
#           Metric(classification_report)]

#print('training set score:', model.evaluate(train_dataset, metrics))
#print('test set score:', model.evaluate(test_dataset, metrics))

In [12]:
from parameterOptimization.HyperparameterOpt import HyperparamOpt_Valid
from sklearn.metrics import f1_score, make_scorer


# Hyperparameter optimization.
# Note: `optimizer` here is the search driver, not a Keras optimizer.
optimizer = HyperparamOpt_Valid(make_cnn_model)

# Search space; every key is a keyword argument of make_cnn_model.
params_dict = {
    'optimizer': ['adam', 'rmsprop', 'adadelta'],
    'DROPOUT': [0.2, 0.4, 0.5],
    'learning_rate': [0.01, 0.001, 0.0001],
    'activation': ['relu', 'elu', 'selu'],
    'g_noise': [0.01, 0.05, 0.005],
}

#TODO: multiple scoring not working
#scoring = {'f1': make_scorer(f1_score), 'Accuracy': 'accuracy'}

# Candidate models are scored on the validation set with this metric.
validation_metric = Metric(f1_score)
best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
    params_dict, train_dataset, valid_dataset, validation_metric,
    n_jobs=1, verbose=3)

#print('#################')
#print(best_hyperparams)
#print(best_model)

MODE:  classification
Fitting 15 random models from a space of 243 possible models.
Fitting model 1/15
hyperparameters: {'optimizer': 'adam', 'DROPOUT': 0.2, 'learning_rate': 0.001, 'activation': 'relu', 'g_noise': 0.005}
expected str, bytes or os.PathLike object, not NoneType
f1_score: 
 0.10035842293906809
Model 1/15, Metric f1_score, Validation set 1: 0.100358
	best_validation_score so far: 0.100358
Fitting model 2/15
hyperparameters: {'optimizer': 'adam', 'DROPOUT': 0.2, 'learning_rate': 0.0001, 'activation': 'relu', 'g_noise': 0.005}




expected str, bytes or os.PathLike object, not NoneType
f1_score: 
 0.0
Model 2/15, Metric f1_score, Validation set 2: 0.000000
	best_validation_score so far: 0.100358
Fitting model 3/15
hyperparameters: {'optimizer': 'adam', 'DROPOUT': 0.4, 'learning_rate': 0.0001, 'activation': 'selu', 'g_noise': 0.01}
expected str, bytes or os.PathLike object, not NoneType
f1_score: 
 0.09929078014184398
Model 3/15, Metric f1_score, Validation set 3: 0.099291
	best_validation_score so far: 0.100358
Fitting model 4/15
hyperparameters: {'optimizer': 'rmsprop', 'DROPOUT': 0.2, 'learning_rate': 0.001, 'activation': 'relu', 'g_noise': 0.005}
expected str, bytes or os.PathLike object, not NoneType
f1_score: 
 0.18604651162790697
Model 4/15, Metric f1_score, Validation set 4: 0.186047
	best_validation_score so far: 0.186047
Fitting model 5/15
hyperparameters: {'optimizer': 'rmsprop', 'DROPOUT': 0.4, 'learning_rate': 0.001, 'activation': 'selu', 'g_noise': 0.05}
expected str, bytes or os.PathLike object, no

In [None]:
# Import the submodule explicitly: a bare `import sklearn` only exposes
# `sklearn.metrics` if some earlier cell happened to import it.
import sklearn.metrics

# Newer scikit-learn releases expose get_scorer_names() and no longer ship the
# SCORERS dict; older releases only have SCORERS. Support both.
if hasattr(sklearn.metrics, 'get_scorer_names'):
    scorer_names = sklearn.metrics.get_scorer_names()
else:
    scorer_names = sklearn.metrics.SCORERS.keys()
sorted(scorer_names)