In [None]:
from rdkit import RDLogger
import logging
import warnings
from deepmol.loaders import SDFLoader
from sklearn.metrics import roc_auc_score, accuracy_score
from deepmol.metrics import Metric

# Keep the notebook output readable: mute RDKit's 'rdApp.*' log channels,
# raise the root logger threshold to CRITICAL, and hide Python warnings.
RDLogger.DisableLog('rdApp.*')
logger = logging.getLogger()  # no name given -> the root logger
logger.setLevel(logging.CRITICAL)
warnings.filterwarnings("ignore")

# Shallow learning models using Scikit-learn

In [2]:
# Read the CHEMBL217 conformers from the SDF file; molecule ids come from the
# "_ID" field and the labels from the "_Class" field.
sdf_loader = SDFLoader(
    "../data/CHEMBL217_conformers.sdf",
    id_field="_ID",
    labels_fields=["_Class"],
)
dataset = sdf_loader.create_dataset()



2023-03-16 16:11:34,297 — INFO — Assuming classification since there are less than 10 unique y values. If otherwise, explicitly set the mode to 'regression'!


In [3]:
from deepmol.splitters import RandomSplitter

# A random split is stochastic: pin the seed so the train/test partition (and
# every score computed from it further down) is the same on each run.
SPLIT_SEED = 42

random_splitter = RandomSplitter()
# 80% of the molecules go to the training set, the remainder to the test set.
train_dataset, test_dataset = random_splitter.train_test_split(
    dataset, frac_train=0.8, seed=SPLIT_SEED
)

In [4]:
# Sanity-check the split: logs and returns the shapes of the molecules, features
# and labels. The features shape is None here because nothing has been featurized yet.
train_dataset.get_shape()

2023-03-16 16:11:34,389 — INFO — Mols_shape: (13298,)
2023-03-16 16:11:34,391 — INFO — Features_shape: None
2023-03-16 16:11:34,393 — INFO — Labels_shape: (13298,)


((13298,), None, (13298,))

In [5]:
from deepmol.compound_featurization import MorganFingerprint

# Compute Morgan fingerprints for both splits with the same featurizer settings.
morgan_featurizer = MorganFingerprint(n_jobs=10)
morgan_featurizer.featurize(train_dataset)
morgan_featurizer.featurize(test_dataset)

2023-03-16 16:11:34.454280: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-16 16:11:34.743194: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-03-16 16:11:35.371405: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-16 16:11:35.371477: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

<deepmol.datasets.datasets.SmilesDataset at 0x7f10dae222b0>

In [6]:
from deepmol.models import SklearnModel
from sklearn.ensemble import RandomForestClassifier

# Fix the forest's random state so that re-running the notebook trains the same
# model; without it every run grows different trees and the scores drift.
rf = RandomForestClassifier(random_state=42)
# Wrap the scikit-learn estimator so it can be fitted on the featurized dataset.
model = SklearnModel(model=rf)
model.fit(train_dataset)

In [7]:
# Predict on the held-out test set; the returned array has one row per sample
# and one score column per class (two columns for this binary task).
model.predict(test_dataset)

array([[0.43, 0.57],
       [0.  , 1.  ],
       [0.05, 0.95],
       ...,
       [0.92, 0.08],
       [0.8 , 0.2 ],
       [0.01, 0.99]])

In [8]:
# Score the random forest on the test set with ROC-AUC and accuracy.
evaluation_metrics = [Metric(metric=roc_auc_score), Metric(metric=accuracy_score)]
model.evaluate(test_dataset, metrics=evaluation_metrics)

({'roc_auc_score': 0.9968300425121696, 'accuracy_score': 0.9819548872180451},
 {})

# Deep learning models using Keras

In [23]:
# Morgan fingerprints again (same featurization as in the Scikit-learn section).
morgan_featurizer = MorganFingerprint(n_jobs=10)
morgan_featurizer.featurize(train_dataset)
morgan_featurizer.featurize(test_dataset)

<deepmol.datasets.datasets.SmilesDataset at 0x7f0fe41c27c0>

In [24]:
from keras.layers import Dense, Dropout
from keras import Sequential

def create_model(input_dim, optimizer='adam', dropout=0.5):
    """Build and compile a small feed-forward binary classifier.

    Args:
        input_dim: Number of input features per sample.
        optimizer: Optimizer handed to ``compile``.
        dropout: Rate of the Dropout layer placed after the first hidden layer.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    # Input -> 12 ReLU units -> dropout -> 8 ReLU units -> 1 sigmoid unit.
    layers = [
        Dense(12, input_dim=input_dim, activation='relu'),
        Dropout(dropout),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layers)
    # Binary cross-entropy matches the single sigmoid output; accuracy is tracked during training.
    network.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return network


In [25]:
from deepmol.models import KerasModel

# The network's input width is the number of feature columns in the training set.
input_dim = train_dataset.X.shape[1]

# KerasModel receives the builder function plus the settings used here
# (5 epochs, verbose output, Adam optimizer, input width).
model = KerasModel(
    create_model,
    epochs=5,
    verbose=1,
    optimizer='adam',
    input_dim=input_dim,
)
model.fit(train_dataset)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [26]:
# Predict on the test set. The result has two columns even though the network
# ends in a single sigmoid unit — presumably KerasModel expands p into
# [1 - p, p]; NOTE(review): confirm against the DeepMol implementation.
model.predict(test_dataset)



array([[9.9999988e-01, 1.0798050e-07],
       [8.6513758e-03, 9.9134862e-01],
       [9.9999732e-01, 2.7008832e-06],
       ...,
       [1.0000000e+00, 6.9377570e-09],
       [1.5693903e-04, 9.9984306e-01],
       [1.0000000e+00, 5.4381460e-10]], dtype=float32)

In [27]:
# Score the Keras network on the test set with ROC-AUC and accuracy.
evaluation_metrics = [Metric(metric=roc_auc_score), Metric(metric=accuracy_score)]
model.evaluate(test_dataset, metrics=evaluation_metrics)



({'roc_auc_score': 0.9982351157325482, 'accuracy_score': 0.9834586466165414},
 {})

# Deep learning models using DeepChem

In [35]:
from deepmol.compound_featurization import ConvMolFeat

# Replace the training set's features with ConvMol features for the graph-convolution model.
conv_mol_featurizer = ConvMolFeat(n_jobs=10)
conv_mol_featurizer.featurize(train_dataset)

<deepmol.datasets.datasets.SmilesDataset at 0x7f0fe4a01bb0>

In [38]:
# Apply the same ConvMol featurization to the test set so it matches the training data.
ConvMolFeat(n_jobs=10).featurize(test_dataset)

<deepmol.datasets.datasets.SmilesDataset at 0x7f0fe41c27c0>

In [36]:
from deepchem.models import GraphConvModel
from deepmol.models import DeepChemModel

# Single-task graph convolution network: two 32-unit graph-conv layers followed
# by a 128-unit dense layer.
graph_conv = GraphConvModel(graph_conv_layers=[32, 32], dense_layer_size=128, n_tasks=1)

# Wrap the DeepChem model so it trains on the DeepMol dataset for 5 epochs.
model = DeepChemModel(model=graph_conv, epochs=5, verbose=1)
model.fit(train_dataset)



In [39]:
# Predict on the ConvMol-featurized test set; as the output shows, each row
# holds two per-class scores.
model.predict(test_dataset)

array([[9.9965996e-01, 3.3993475e-04],
       [3.5753742e-02, 9.6424627e-01],
       [9.9916947e-01, 8.3054311e-04],
       ...,
       [9.9998349e-01, 1.6476351e-05],
       [1.0034339e-02, 9.8996568e-01],
       [9.9939179e-01, 6.0817268e-04]], dtype=float32)

In [40]:
# Score the graph-convolution model on the test set with ROC-AUC and accuracy.
evaluation_metrics = [Metric(metric=roc_auc_score), Metric(metric=accuracy_score)]
model.evaluate(test_dataset, metrics=evaluation_metrics)

({'roc_auc_score': 0.9937101797308633, 'accuracy_score': 0.9530827067669173},
 {})