In [1]:
!pip install --pre deepchem

Collecting deepchem
  Downloading deepchem-2.6.0.dev20211026183818-py3-none-any.whl (610 kB)
[K     |████████████████████████████████| 610 kB 7.6 MB/s 
Installing collected packages: deepchem
Successfully installed deepchem-2.6.0.dev20211026183818


In [2]:
!pip install rdkit-pypi

Collecting rdkit-pypi
  Downloading rdkit_pypi-2021.9.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.4 MB)
[K     |████████████████████████████████| 20.4 MB 1.4 MB/s 
Installing collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2021.9.2


In [3]:
import pandas as pd
import numpy as np
import deepchem as dc

# Display the installed DeepChem version so the recorded outputs below can be
# tied to a specific library build.
dc.__version__

'2.6.0.dev'

# Dataset ChEMBL

ChEMBL is a manually curated database of bioactive molecules with drug-like properties. It brings together chemical, bioactivity and genomic data to aid the translation of genomic information into effective new drugs.
Source: https://www.ebi.ac.uk/chembl/

In [5]:
# Fix NumPy's global random state so NumPy-driven randomness is repeatable.
# NOTE(review): only NumPy's global RNG is seeded here; any other random
# number generators used during training are not affected by this call.
np.random.seed(seed=105)

In [8]:
# Load the ChEMBL benchmark through MoleculeNet.
# The loader returns the task names, a tuple of dataset splits, and the
# transformers that were applied to the data.
chembl_tasks, datasets, transformers = dc.molnet.load_chembl(
    featurizer="ECFP",
    splitter="scaffold",
    set="5thresh",
    shard_size=2000,
)

# Unpack the three splits in the order the loader returns them.
(train_dataset, valid_dataset, test_dataset) = datasets

In [9]:
# Display the training split's repr: the shapes of X, y and w plus the task names.
train_dataset

<DiskDataset X.shape: (19096, 1024), y.shape: (19096, 691), w.shape: (19096, 691), task_names: ['CHEMBL1075051' 'CHEMBL1075104' 'CHEMBL1075145' ... 'CHEMBL6164'
 'CHEMBL6166' 'CHEMBL6184']>

In [11]:
# Preview the first five rows of the test split as a pandas DataFrame.
# The recorded output shows feature columns (X1, X2, ...), weight columns
# (w..., up to w691) and an `ids` column (apparently SMILES strings).
# NOTE(review): to_dataframe() presumably materializes the whole split before
# head() trims it - confirm the cost is acceptable for larger datasets.
test_dataset.to_dataframe().head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23,X24,X25,X26,X27,X28,X29,X30,X31,X32,X33,X34,X35,X36,X37,X38,X39,X40,...,w653,w654,w655,w656,w657,w658,w659,w660,w661,w662,w663,w664,w665,w666,w667,w668,w669,w670,w671,w672,w673,w674,w675,w676,w677,w678,w679,w680,w681,w682,w683,w684,w685,w686,w687,w688,w689,w690,w691,ids
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,O=C(CCCCCCCC(=O)N1CCCN(CC1)C2(C(=O)NC(=O)NC2=O...
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,C(Cc1cccc2ccccc12)N3CCC4(CC3)Oc5ccccc5C=C4
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CCOC(=O)\C(=C\C(=O)C1=CN(Cc2ccc(F)cc2)c3cc(ccc...
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CN(Cc1ccccc1)C(=O)c2ccc(cc2)S(=O)(=O)Nc3ccccc3
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Nc1nc(nc2nc(nn12)c3occc3)N4CCN5C[C@@H](COc6ccc...


# Model Training

In [16]:
# Number of hidden layers in the network.
n_layers = 3

# Fully connected multitask regressor with one output per ChEMBL task.
model = dc.models.MultitaskRegressor(
    len(chembl_tasks),
    # Must match the width of the ECFP feature vectors (X.shape[1] == 1024
    # for the dataset loaded in this notebook).
    n_features=1024,
    # One entry per hidden layer.
    layer_sizes=[1000] * n_layers,
    dropouts=[0.25] * n_layers,
    # Passed as scalars so the same value is applied to every layer,
    # including the output layer. The regressor documents the list form of
    # these two arguments as needing len(layer_sizes) + 1 entries, which the
    # previous `[...] * n_layers` lists did not provide.
    weight_init_stddevs=0.02,
    bias_init_consts=1.0,
    learning_rate=0.0003,
    weight_decay_penalty=0.0001,
    batch_size=100)

In [17]:
# Train on the training split for five epochs. The call is the cell's last
# expression, so the value returned by fit() is displayed as the cell output
# (presumably a training-loss figure - confirm against the DeepChem docs).
model.fit(dataset=train_dataset, nb_epoch=5)


0.12166444605047053

In [29]:
# Evaluation metric: RMS score computed per task and reduced to a single
# number with np.mean (reported under the key 'mean-rms_score').
metric = dc.metrics.Metric(
    metric=dc.metrics.rms_score,
    task_averager=np.mean,
)

In [30]:
# Score the fitted model on the training split. The transformers returned by
# the dataset loader are passed through to evaluate()
# (presumably so scores are computed on un-transformed targets - confirm).
train_scores = model.evaluate(
    dataset=train_dataset,
    metrics=[metric],
    transformers=transformers,
)
print(train_scores)


{'mean-rms_score': 5.832144036624212}


In [31]:
# Score the fitted model on the validation split with the same metric and
# the transformers returned by the dataset loader.
valid_scores = model.evaluate(
    dataset=valid_dataset,
    metrics=[metric],
    transformers=transformers,
)
print(valid_scores)

{'mean-rms_score': 5.808899921988996}
