# Introducing Machine Learning models in DeepMol

## Import packages

In [1]:
from rdkit import RDLogger
import logging
import warnings
from deepmol.loaders import SDFLoader
from sklearn.metrics import roc_auc_score, accuracy_score
from deepmol.metrics import Metric

# Quiet the notebook: turn off RDKit's 'rdApp.*' log channels, ignore Python
# warnings, and raise the root logger threshold to CRITICAL.
RDLogger.DisableLog('rdApp.*')
warnings.filterwarnings("ignore")
logger = logging.getLogger()
logger.setLevel(level=logging.CRITICAL)

# Shallow learning models using Scikit-learn

## Let's start by loading the data and splitting it into train and test sets

In [3]:
from deepmol.splitters import RandomSplitter

# Fixed seed so the 80/20 train/test partition is the same on every run.
SPLIT_SEED = 42

# Read the molecules, their IDs ("_ID") and class labels ("_Class") from the SDF file.
dataset = SDFLoader("../data/CHEMBL217_conformers.sdf", id_field="_ID", labels_fields=["_Class"]).create_dataset()
random_splitter = RandomSplitter()
train_dataset, test_dataset = random_splitter.train_test_split(dataset, frac_train=0.8, seed=SPLIT_SEED)

In [4]:
# Logs and returns the shapes of the molecules, features and labels. The features
# entry is still None at this point (featurization happens in the next section).
train_dataset.get_shape()

2023-05-31 17:49:26,033 — INFO — Mols_shape: (13298,)
2023-05-31 17:49:26,035 — INFO — Features_shape: None
2023-05-31 17:49:26,036 — INFO — Labels_shape: (13298,)


((13298,), None, (13298,))

## Let's generate Morgan fingerprints from our data

In [8]:
from deepmol.compound_featurization import MorganFingerprint

# Featurize both splits the same way; inplace=True stores the fingerprints on each dataset.
for split_dataset in (train_dataset, test_dataset):
    MorganFingerprint(n_jobs=10).featurize(split_dataset, inplace=True)

## Now that we have our data ready, let's train a Random Forest model

In [10]:
from deepmol.models import SklearnModel
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so that repeated runs on the same split build the same trees
# and report the same metrics.
rf = RandomForestClassifier(random_state=42)
# Wrap the scikit-learn estimator so it can be trained directly on a DeepMol dataset.
model = SklearnModel(model=rf)
model.fit(train_dataset)

## Now that we have our model trained, let's make some predictions

In [11]:
# Each row of the output below holds two values for one test molecule; they look
# like per-class probabilities (each row sums to 1).
model.predict(test_dataset)

array([[0.99, 0.01],
       [0.02, 0.98],
       [0.93, 0.07],
       ...,
       [0.02, 0.98],
       [0.9 , 0.1 ],
       [0.99, 0.01]])

## And finally, let's evaluate our model according to some metrics

In [12]:
# Returns a tuple: the first dict maps each metric name to its score on the test
# set; the second dict is empty in the output below.
model.evaluate(test_dataset, metrics=[Metric(metric=roc_auc_score), Metric(metric=accuracy_score)])

({'roc_auc_score': 0.9989268647082491, 'accuracy_score': 0.9888721804511278},
 {})

# Deep learning models using Keras

## Let's start by extracting some features from our data

In [13]:
# Same Morgan fingerprint featurization as in the scikit-learn section, applied to both splits.
for split_dataset in (train_dataset, test_dataset):
    MorganFingerprint(n_jobs=10).featurize(split_dataset, inplace=True)

## Now that we have our data ready, let's train a Deep Learning model
DeepMol gives the user full flexibility to define the model architecture. The only requirement is that the model is defined as a function that receives the input dimension of the data and returns a compiled Keras model. The function can also accept any other parameters the user wants to tune (here, the optimizer and the dropout rate). In this case, we define a simple model with two hidden layers and a dropout layer between them.

In [14]:
from keras.layers import Dense, Dropout
from keras import Sequential

def create_model(input_dim, optimizer='adam', dropout=0.5):
    """Build and compile a small feed-forward Keras classifier.

    Parameters
    ----------
    input_dim : int
        Number of input features, given to the first dense layer.
    optimizer
        Optimizer forwarded unchanged to ``compile``.
    dropout : float
        Rate of the dropout layer placed after the first hidden layer.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    layers = [
        Dense(12, input_dim=input_dim, activation='relu'),
        Dropout(dropout),
        Dense(8, activation='relu'),
        # A single sigmoid unit, paired with the binary cross-entropy loss below.
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return network


## Now that we implemented our model, we can train it

In [16]:
from deepmol.models import KerasModel

# The number of input features is the width of the feature matrix.
input_dim = train_dataset.X.shape[1]
model = KerasModel(create_model, epochs=5, verbose=1, optimizer='adam', input_dim=input_dim)
# Call fit() for its side effect, as the other sections of this notebook do,
# instead of rebinding `model` to whatever fit() happens to return.
model.fit(train_dataset)

Epoch 1/5


2023-05-31 18:00:48.519838: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 108937216 exceeds 10% of free system memory.
2023-05-31 18:00:48.592404: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 108937216 exceeds 10% of free system memory.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [18]:
# The network ends in a single sigmoid unit, yet the output below still has two
# columns per molecule, in the same layout as the Random Forest predictions.
model.predict(test_dataset)



2023-05-31 18:04:34.406302: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 27238400 exceeds 10% of free system memory.




array([[9.9999982e-01, 1.6883784e-07],
       [8.3446503e-07, 9.9999917e-01],
       [9.9984217e-01, 1.5780855e-04],
       ...,
       [4.0531158e-06, 9.9999595e-01],
       [9.4746321e-01, 5.2536760e-02],
       [9.9999994e-01, 3.8245172e-08]], dtype=float32)

In [19]:
# Same two metrics as used for the Random Forest model, computed on the same test split.
model.evaluate(test_dataset, metrics=[Metric(metric=roc_auc_score), Metric(metric=accuracy_score)])



({'roc_auc_score': 0.9978161009369492, 'accuracy_score': 0.9831578947368421},
 {})

# Deep learning models using DeepChem models

In [24]:
from deepmol.compound_featurization import ConvMolFeat

# inplace=True stores the ConvMolFeat features on train_dataset itself, so the
# Morgan fingerprints computed earlier are presumably overwritten — TODO confirm.
ConvMolFeat(n_jobs=10).featurize(train_dataset, inplace=True)

In [25]:
# Featurize the test split with the same featurizer settings as the training split.
ConvMolFeat(n_jobs=10).featurize(test_dataset, inplace=True)

In [26]:
from deepchem.models import GraphConvModel
from deepmol.models import DeepChemModel

# Build the DeepChem graph-convolution model first, then hand it to the DeepMol wrapper.
graph_conv = GraphConvModel(graph_conv_layers=[32, 32], dense_layer_size=128, n_tasks=1)
model = DeepChemModel(model=graph_conv, epochs=5, verbose=1)
model.fit(train_dataset)

In [27]:
# Predictions come back in the same two-column layout as for the previous models (see output below).
model.predict(test_dataset)

array([[9.9998903e-01, 1.0985196e-05],
       [6.2768497e-03, 9.9372309e-01],
       [9.9981922e-01, 1.8080678e-04],
       ...,
       [2.8508517e-04, 9.9971491e-01],
       [9.8996836e-01, 1.0031606e-02],
       [9.9995100e-01, 4.9050355e-05]], dtype=float32)

In [28]:
# Same two metrics as in the earlier sections, evaluated on the same test split.
model.evaluate(test_dataset, metrics=[Metric(metric=roc_auc_score), Metric(metric=accuracy_score)])

({'roc_auc_score': 0.9942084704601963, 'accuracy_score': 0.9663157894736842},
 {})