In [1]:
from rdkit import RDLogger
import logging
import warnings

# Keep the notebook output readable: mute RDKit's own log channels, drop
# Python warnings, and raise the root logger threshold to CRITICAL.
RDLogger.DisableLog('rdApp.*')
warnings.filterwarnings("ignore")

logger = logging.getLogger()  # no name given -> the root logger
logger.setLevel(logging.CRITICAL)

from sklearn.metrics import accuracy_score
from deepmol.metrics import Metric
from sklearn.svm import SVC
from deepmol.parameter_optimization import HyperparameterOptimizerValidation

from deepmol.splitters import RandomSplitter
from deepmol.compound_featurization import MorganFingerprint
from deepmol.loaders import SDFLoader

from tensorflow.python.keras.layers import Dropout
from tensorflow.python import keras
from tensorflow.python.keras import layers



2023-03-17 18:40:25.314672: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-17 18:40:25.402708: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-17 18:40:25.402722: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-03-17 18:40:25.421608: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-03-17 18:40:25.817421: W tensorflow/stream_executor/platform/de

In [2]:
# Read the CHEMBL217 conformers from the SDF file; "_ID" identifies each
# molecule and "_Class" is the label column.
sdf_loader = SDFLoader(
    "../data/CHEMBL217_conformers.sdf",
    id_field="_ID",
    labels_fields=["_Class"],
)
dataset = sdf_loader.create_dataset()

# The return value is discarded, so this relies on featurize() attaching the
# Morgan fingerprints to `dataset` itself.
MorganFingerprint().featurize(dataset)

# 80 % train / 10 % validation / 10 % test.
# NOTE(review): no seed is passed here, so the split is presumably different
# on every run - confirm whether the splitter accepts one.
random_splitter = RandomSplitter()
train_dataset, valid_dataset, test_dataset = random_splitter.train_valid_test_split(
    dataset,
    frac_train=0.8,
    frac_valid=0.1,
    frac_test=0.1,
)

2023-03-17 18:40:31,637 — INFO — Assuming classification since there are less than 10 unique y values. If otherwise, explicitly set the mode to 'regression'!


# Hyperparameter tuning with scikit-learn

In [9]:
# Candidate values for the SVC regularisation strength; n_iter_search=2 below
# means only two of these three candidates are tried.
params_dict_svc = {"C": [1.0, 1.2, 0.8]}

# The optimizer receives the estimator class (not an instance) and scores each
# candidate on the validation split; higher accuracy is better.
optimizer = HyperparameterOptimizerValidation(SVC)
best_svm, best_hyperparams, all_results = optimizer.hyperparameter_search(
    train_dataset=train_dataset,
    valid_dataset=valid_dataset,
    metric=Metric(accuracy_score),
    maximize_metric=True,
    n_iter_search=2,
    params_dict=params_dict_svc,
)

2023-03-17 13:17:09,300 — INFO — Fitting 2 random models from a space of 3 possible models.
2023-03-17 13:17:09,301 — INFO — Fitting model 1/2
2023-03-17 13:17:09,302 — INFO — hyperparameters: {'C': 1.0}
2023-03-17 13:17:43,806 — INFO — Model 1/2, Metric accuracy_score, Validation set 1: 0.984356
2023-03-17 13:17:43,807 — INFO — 	best_validation_score so far: 0.984356
2023-03-17 13:17:43,807 — INFO — Fitting model 2/2
2023-03-17 13:17:43,808 — INFO — hyperparameters: {'C': 0.8}
2023-03-17 13:18:19,376 — INFO — Model 2/2, Metric accuracy_score, Validation set 2: 0.982551
2023-03-17 13:18:19,376 — INFO — 	best_validation_score so far: 0.984356
2023-03-17 13:18:56,974 — INFO — Best hyperparameters: {'C': 1.0}
2023-03-17 13:18:56,975 — INFO — train_score: 0.995037
2023-03-17 13:18:56,975 — INFO — validation_score: 0.984356


In [10]:
# Accuracy of the selected SVC on the held-out test split (shown as cell output).
best_svm.evaluate(test_dataset, metrics=[Metric(accuracy_score)])

({'accuracy_score': 0.9819603126879134}, {})

# Hyperparameter tuning with Keras

In [6]:
def create_model(input_dim, optimizer='adam', dropout=0.5):
    """Build and compile a small feed-forward binary classifier.

    Architecture: Dense(64, relu) -> Dropout -> Dense(32, relu) ->
    Dense(1, sigmoid).

    Parameters
    ----------
    input_dim : int or sequence of int
        Number of input features per sample, or the full per-sample input
        shape (without the batch dimension).
    optimizer : str or optimizer instance
        Forwarded unchanged to ``model.compile``.
    dropout : float
        Rate used by the dropout layer placed after the first hidden layer.

    Returns
    -------
    keras.Model
        Model compiled with binary cross-entropy loss and the accuracy metric.
    """
    # Keras documents ``shape`` as a tuple; turn a bare feature count into a
    # 1-tuple instead of relying on implicit coercion, while still accepting
    # an already-iterable shape.
    try:
        input_shape = tuple(input_dim)
    except TypeError:
        input_shape = (input_dim,)

    inputs = layers.Input(shape=input_shape)

    # Hidden stack: 64 units -> dropout -> 32 units.
    x = layers.Dense(64, activation="relu")(inputs)
    x = Dropout(dropout)(x)
    x = layers.Dense(32, activation="relu")(x)

    # Single sigmoid unit: one probability for the binary label.
    task_output = layers.Dense(1, activation="sigmoid")(x)

    model = keras.Model(inputs=inputs, outputs=task_output)
    model.compile(
        optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"]
    )
    return model


In [9]:
# Search space for create_model(): input_dim has a single candidate (the
# feature width of the training matrix), so only dropout and the Keras
# optimizer name vary -> 3 x 2 = 6 combinations, of which n_iter_search=2
# are tried.
params_dict_dense = {
    "input_dim": [train_dataset.X.shape[1]],
    "dropout": [0.5, 0.6, 0.7],
    "optimizer": ["adam", "rmsprop"],
}

# Note: `optimizer` here is the hyperparameter-search object, not the Keras
# optimizer selected through the "optimizer" key above.
optimizer = HyperparameterOptimizerValidation(create_model)
best_dnn, best_hyperparams, all_results = optimizer.hyperparameter_search(
    train_dataset=train_dataset,
    valid_dataset=valid_dataset,
    metric=Metric(accuracy_score),
    maximize_metric=True,
    n_iter_search=2,
    params_dict=params_dict_dense,
)

2023-03-17 18:13:19,910 — INFO — Fitting 2 random models from a space of 6 possible models.
2023-03-17 18:13:19,911 — INFO — Fitting model 1/2
2023-03-17 18:13:19,911 — INFO — hyperparameters: {'input_dim': 2048, 'dropout': 0.5, 'optimizer': 'adam'}
2023-03-17 18:13:20,598 — ERROR — cannot pickle 'weakref' object
2023-03-17 18:13:20,674 — INFO — Model 1/2, Metric accuracy_score, Validation set 1: 0.980144
2023-03-17 18:13:20,674 — INFO — 	best_validation_score so far: 0.980144
2023-03-17 18:13:20,674 — INFO — Fitting model 2/2
2023-03-17 18:13:20,675 — INFO — hyperparameters: {'input_dim': 2048, 'dropout': 0.7, 'optimizer': 'rmsprop'}
2023-03-17 18:13:21,512 — ERROR — cannot pickle 'weakref' object
2023-03-17 18:13:21,584 — INFO — Model 2/2, Metric accuracy_score, Validation set 2: 0.973526
2023-03-17 18:13:21,585 — INFO — 	best_validation_score so far: 0.980144
2023-03-17 18:13:21,790 — INFO — Best hyperparameters: {'input_dim': 2048, 'dropout': 0.5, 'optimizer': 'adam'}
2023-03-17 18

In [10]:
# Accuracy of the selected Keras model on the held-out test split (shown as cell output).
best_dnn.evaluate(test_dataset, metrics=[Metric(accuracy_score)])

({'accuracy_score': 0.9753457606734817}, {})

# Hyperparameter tuning of DeepChem models with cross-validation

In [3]:
from deepmol.parameter_optimization import HyperparameterOptimizerCV
from deepmol.compound_featurization import ConvMolFeat
from sklearn.metrics import roc_auc_score, precision_score
from deepmol.models import DeepChemModel
from deepchem.models import GraphConvModel

# Featurize every split with ConvMolFeat, using a fresh featurizer per split.
# The return values are discarded, so this relies on featurize() updating each
# dataset object itself.
# NOTE(review): that presumably replaces the Morgan fingerprints computed
# earlier, so re-running the SVC/Keras cells after this one would see graph
# features instead - confirm.
for _split_dataset in (train_dataset, valid_dataset, test_dataset):
    ConvMolFeat().featurize(_split_dataset)

def graphconv_builder(graph_conv_layers, batch_size=256, epochs=5):
    """Create a single-task classification GraphConvModel wrapped for DeepMol.

    Parameters
    ----------
    graph_conv_layers
        Forwarded to ``GraphConvModel`` as ``graph_conv_layers``.
    batch_size : int
        Forwarded to ``GraphConvModel`` as ``batch_size``.
    epochs : int
        Forwarded to ``DeepChemModel`` as ``epochs``.

    Returns
    -------
    DeepChemModel
        Wrapper around the freshly built ``GraphConvModel``.
    """
    deepchem_graph_conv = GraphConvModel(
        n_tasks=1,
        graph_conv_layers=graph_conv_layers,
        batch_size=batch_size,
        mode='classification',
    )
    return DeepChemModel(deepchem_graph_conv, epochs=epochs)

# Two candidate layer layouts; with n_iter_search=2 both can be tried.
params_dict_graphconv = {'graph_conv_layers': [[64, 64], [32, 32]]}

# Cross-validated search (cv=2) on the training split only; candidates are
# ranked by ROC-AUC, higher is better. Only the best model is kept - the other
# two returned values are discarded.
model_graph = HyperparameterOptimizerCV(model_builder=graphconv_builder)
best_model, _, _ = model_graph.hyperparameter_search(
    train_dataset=train_dataset,
    metric=Metric(roc_auc_score),
    n_iter_search=2,
    maximize_metric=True,
    cv=2,
    params_dict=params_dict_graphconv,
    model_type="deepchem",
)


2023-03-17 18:46:09,927 — INFO — MODEL TYPE: deepchem
2023-03-17 18:46:09,927 — INFO — Computing Stratified K-fold split


2023-03-17 18:46:12.276353: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-03-17 18:46:12.276372: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2023-03-17 18:46:12.276385: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (bisbii): /proc/driver/nvidia/version does not exist
2023-03-17 18:46:12.276531: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


2023-03-17 18:47:14,884 — INFO — 
 
 Best <deepmol.metrics.metrics.Metric object at 0x7fbf2be4cd00>: 0.977278 using {'graph_conv_layers': [64, 64]}
2023-03-17 18:47:14,885 — INFO — 
 <deepmol.metrics.metrics.Metric object at 0x7fbf2be4cd00>: 0.977278 (0.002598) with: {'graph_conv_layers': [64, 64]} 

2023-03-17 18:47:14,885 — INFO — 
 <deepmol.metrics.metrics.Metric object at 0x7fbf2be4cd00>: 0.965922 (0.002891) with: {'graph_conv_layers': [32, 32]} 



In [12]:
# Metrics reported for the selected GraphConv model on the test split.
metrics = [
    Metric(roc_auc_score),
    Metric(precision_score),
    Metric(accuracy_score),
]

# Predictions on the test split, kept in `test_preds` for later inspection.
test_preds = best_model.predict(test_dataset)

# Last expression of the cell: the evaluation result is shown as cell output.
best_model.evaluate(test_dataset, metrics)


({'roc_auc_score': 0.9898796965934775,
  'precision_score': 0.9402480270574972,
  'accuracy_score': 0.9591100420926038},
 {})