# Introducing Machine Learning models in DeepMol

## Import packages

In [1]:
from rdkit import RDLogger
import logging
import warnings
from deepmol.loaders import SDFLoader
from sklearn.metrics import roc_auc_score, accuracy_score
from deepmol.metrics import Metric

# Keep the tutorial output readable by silencing as much noise as possible.
# Suppress every Python warning, regardless of category.
warnings.filterwarnings("ignore")
# Raise the root logger's threshold so that only CRITICAL records pass through it.
# NOTE(review): INFO lines from DeepMol still show up in later cell outputs, so
# that logger is apparently not governed by the root level — confirm if they
# should be silenced too.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
# Disable the RDKit log channels matching 'rdApp.*'.
RDLogger.DisableLog('rdApp.*')

# Shallow learning models using Scikit-learn

## Let's start by loading the data and splitting it into train and test sets

In [2]:
from deepmol.splitters import RandomSplitter

# Load the molecules from the SDF file, taking IDs from the "_ID" field and
# labels from the "_Class" field.
dataset = SDFLoader("../data/CHEMBL217_conformers.sdf", id_field="_ID", labels_fields=["_Class"]).create_dataset()
random_splitter = RandomSplitter()
# 80/20 random split. The seed is fixed so that the same molecules end up in
# each split on every run; without it the metrics reported further down change
# after each "Restart & Run All".
train_dataset, test_dataset = random_splitter.train_test_split(dataset, frac_train=0.8, seed=123)

2023-06-01 13:38:42,546 — INFO — Assuming classification since there are less than 10 unique y values. If otherwise, explicitly set the mode to 'regression'!


In [3]:
# Log and return the shapes of the training split's molecules, features and labels.
# The features entry is None in the output below; no featurizer has been applied yet.
train_dataset.get_shape()

2023-06-01 13:38:48,946 — INFO — Mols_shape: (13298,)
2023-06-01 13:38:48,947 — INFO — Features_shape: None
2023-06-01 13:38:48,947 — INFO — Labels_shape: (13298,)


((13298,), None, (13298,))

## Let's generate Morgan fingerprints from our data

In [4]:
from deepmol.compound_featurization import MorganFingerprint

# Compute Morgan fingerprints in place for both splits; a fresh featurizer
# (n_jobs=10) is created for each one.
for split in (train_dataset, test_dataset):
    MorganFingerprint(n_jobs=10).featurize(split, inplace=True)

2023-06-01 13:38:55.341803: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-01 13:38:55.423854: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-06-01 13:38:55.423866: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-06-01 13:38:55.900541: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

## Now that we have our data ready, let's train a Random Forest model

In [5]:
from deepmol.models import SklearnModel
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so that training, and therefore the metrics reported below,
# is repeatable across runs.
rf = RandomForestClassifier(random_state=42)
# Wrap the scikit-learn estimator so it can be fitted on a DeepMol dataset.
model = SklearnModel(model=rf)
model.fit(train_dataset)

## Now that we have our model trained, let's make some predictions

In [6]:
# Predict on the held-out test split. The displayed result is a 2-column array,
# presumably the per-class probabilities for each test molecule.
model.predict(test_dataset)

array([[0.8 , 0.2 ],
       [1.  , 0.  ],
       [0.  , 1.  ],
       ...,
       [0.01, 0.99],
       [0.35, 0.65],
       [0.  , 1.  ]])

## And finally, let's evaluate our model according to some metrics

In [7]:
# Score the model on the test split with ROC-AUC and accuracy.
test_metrics = [Metric(metric=roc_auc_score), Metric(metric=accuracy_score)]
model.evaluate(test_dataset, metrics=test_metrics)

({'roc_auc_score': 0.9977580691051172, 'accuracy_score': 0.9828571428571429},
 {})

## DeepMol also allows you to save your models without any effort

In [8]:
# Persist the fitted model under the path "my_model".
# NOTE(review): the Keras and DeepChem sections later save to this same path —
# confirm that overwriting is intended.
model.save("my_model")

## And load them back

In [11]:
# Restore the saved model and evaluate it again on the test split; the scores
# shown below match the ones obtained before saving.
model = SklearnModel.load("my_model")
reload_metrics = [Metric(metric=score_fn) for score_fn in (roc_auc_score, accuracy_score)]
model.evaluate(test_dataset, metrics=reload_metrics)

({'roc_auc_score': 0.9977580691051172, 'accuracy_score': 0.9828571428571429},
 {})

As you can see from the previous example, DeepMol allows you to train and evaluate your models in a very simple way. You can also use any other model from Scikit-learn, such as SVMs, Logistic Regression, etc. You can also use any other featurization method from DeepMol, such as ECFP, GraphConv, etc. Moreover, saving and deploying your models has never been so easy!

# Deep learning models using Keras

## Let's start by extracting some features from our data

In [13]:
# Compute Morgan fingerprints in place for the train and test splits (the same
# featurization used in the Scikit-learn section), with n_jobs=10.
for split in (train_dataset, test_dataset):
    MorganFingerprint(n_jobs=10).featurize(split, inplace=True)

## Now that we have our data ready, let's train a Deep Learning model
In DeepMol, we give the user full flexibility to define the architecture of the model. The only requirement is that the model be defined as a function that takes the input dimension of the data and returns a compiled Keras model. The function can also accept any other parameters that the user wants to tune. In this case, we will define a simple model with two hidden layers and a dropout layer.

In [14]:
from keras.layers import Dense, Dropout
from keras import Sequential

def create_model(input_dim, optimizer='adam', dropout=0.5):
    """Build and compile a small feed-forward binary classifier.

    Args:
        input_dim: Number of input features, passed to the first dense layer.
        optimizer: Optimizer handed to ``compile``.
        dropout: Rate of the dropout layer placed after the first hidden layer.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    # Two ReLU hidden layers (12 and 8 units) with dropout in between,
    # followed by a single sigmoid output unit.
    network = Sequential([
        Dense(12, input_dim=input_dim, activation='relu'),
        Dropout(dropout),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    # Binary cross-entropy matches the single sigmoid output; accuracy is tracked during training.
    network.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return network


## Now that we implemented our model, we can train it

In [None]:
from deepmol.models import KerasModel

# The network's input width is the second dimension of the feature matrix.
input_dim = train_dataset.X.shape[1]
# The extra keyword arguments (optimizer, input_dim) are presumably forwarded to
# create_model — confirm against KerasModel.
model = KerasModel(create_model, epochs=5, verbose=1, optimizer='adam', input_dim=input_dim)
# Fit without rebinding `model`, as the Scikit-learn and DeepChem sections do.
# Assigning the return value of fit() would leave `model` unusable in the cells
# below if fit() does not return the model itself.
model.fit(train_dataset)

Could not connect to 127.0.0.1: 35283
Traceback (most recent call last):
  File "/snap/pycharm-professional/336/plugins/python/helpers/pydev/_pydevd_bundle/pydevd_comm.py", line 463, in start_client
    s.connect((host, port))
ConnectionRefusedError: [Errno 111] Connection refused
Traceback (most recent call last):
  File "/snap/pycharm-professional/336/plugins/python/helpers-pro/jupyter_debug/pydev_jupyter_utils.py", line 81, in attach_to_debugger
    debugger.connect(pydev_localhost.get_localhost(), debugger_port)
  File "/snap/pycharm-professional/336/plugins/python/helpers/pydev/pydevd.py", line 660, in connect
    s = start_client(host, port)
  File "/snap/pycharm-professional/336/plugins/python/helpers/pydev/_pydevd_bundle/pydevd_comm.py", line 463, in start_client
    s.connect((host, port))
ConnectionRefusedError: [Errno 111] Connection refused
Failed to connect to target debugger.


In [16]:
# Predict on the test split with the Keras model. The displayed result is a
# 2-column float32 array, presumably per-class probabilities.
model.predict(test_dataset)



array([[9.8671627e-01, 1.3283747e-02],
       [1.0000000e+00, 4.9822679e-10],
       [5.4292679e-03, 9.9457073e-01],
       ...,
       [5.3464174e-03, 9.9465358e-01],
       [1.9562900e-02, 9.8043710e-01],
       [6.4892769e-03, 9.9351072e-01]], dtype=float32)

In [17]:
# Score the Keras model on the test split with ROC-AUC and accuracy.
keras_metrics = [Metric(metric=roc_auc_score), Metric(metric=accuracy_score)]
model.evaluate(test_dataset, metrics=keras_metrics)



({'roc_auc_score': 0.9959412851927224, 'accuracy_score': 0.9795488721804512},
 {})

In [18]:
# Persist the Keras model under the path "my_model".
# NOTE(review): this is the same path the Scikit-learn section saved to —
# confirm that reusing it is intended.
model.save("my_model")

In [19]:
# Restore the saved Keras model and evaluate it again on the test split; the
# scores shown below match the ones obtained before saving.
model = KerasModel.load("my_model")
reload_metrics = [Metric(metric=score_fn) for score_fn in (roc_auc_score, accuracy_score)]
model.evaluate(test_dataset, metrics=reload_metrics)



({'roc_auc_score': 0.9959412851927224, 'accuracy_score': 0.9795488721804512},
 {})

# Deep learning models using DeepChem models

In [20]:
from deepmol.compound_featurization import ConvMolFeat

# Featurize the training split in place with ConvMolFeat (n_jobs=10); these are
# the features fed to the GraphConvModel trained below.
ConvMolFeat(n_jobs=10).featurize(train_dataset, inplace=True)

In [21]:
# Apply the same ConvMolFeat featurization, in place, to the test split.
ConvMolFeat(n_jobs=10).featurize(test_dataset, inplace=True)

In [22]:
from deepchem.models import GraphConvModel
from deepmol.models import DeepChemModel

# Build the DeepChem graph-convolution network first, then wrap it in DeepMol's
# DeepChemModel and train it for 5 epochs on the training split.
graph_conv = GraphConvModel(graph_conv_layers=[32, 32], dense_layer_size=128, n_tasks=1)
model = DeepChemModel(model=graph_conv, epochs=5, verbose=1)
model.fit(train_dataset)

In [23]:
# Predict on the test split with the DeepChem model. The displayed result is a
# 2-column float32 array, presumably per-class probabilities.
model.predict(test_dataset)

array([[9.9915403e-01, 8.4594759e-04],
       [9.9851429e-01, 1.4855991e-03],
       [5.3278193e-02, 9.4672173e-01],
       ...,
       [4.0388817e-04, 9.9959618e-01],
       [9.7295139e-03, 9.9027050e-01],
       [2.3896188e-02, 9.7610384e-01]], dtype=float32)

In [24]:
# Score the DeepChem model on the test split with ROC-AUC and accuracy.
deepchem_metrics = [Metric(metric=roc_auc_score), Metric(metric=accuracy_score)]
model.evaluate(test_dataset, metrics=deepchem_metrics)

({'roc_auc_score': 0.9941217864209249, 'accuracy_score': 0.9720300751879699},
 {})

In [25]:
# Persist the DeepChem model under the path "my_model".
# NOTE(review): the Scikit-learn and Keras sections saved to this same path —
# confirm that reusing it is intended.
model.save("my_model")

In [26]:
# Restore the saved DeepChem model and evaluate it again on the test split; the
# scores shown below match the ones obtained before saving.
model = DeepChemModel.load("my_model")
reload_metrics = [Metric(metric=score_fn) for score_fn in (roc_auc_score, accuracy_score)]
model.evaluate(test_dataset, metrics=reload_metrics)

({'roc_auc_score': 0.9941217864209249, 'accuracy_score': 0.9720300751879699},
 {})