# Predict binding affinity from a protein-ligand crystal structure

In [1]:
# Packages installed in my conda environment: DeepChem, RDKit, PyTorch, PyTorch Geometric, TensorFlow, PDBFixer, numpy<1.25
# Reference: "Deep Learning for the Life Sciences" by Bharath Ramsundar, Peter Eastman, Patrick Walters, and Vijay Pande
# PDBbind dataset: Cheng, T.J. et al. J. Chem. Inf. Model., 2009, 49, 1079-1093. (PDBbind v.2009)

In [5]:
import deepchem as dc

In [6]:
# Grid featurizer for protein-ligand complexes. The fingerprint/interaction
# types listed below are computed with voxel_width=2.0; flatten=True asks for
# one flat feature vector per complex and sanitize=True asks RDKit to sanitize
# the molecules (see the DeepChem RdkitGridFeaturizer docs for exact semantics).
grid_feature_types = ["ecfp", "splif", "salt_bridge", "hbond"]
featurizer = dc.feat.RdkitGridFeaturizer(
    voxel_width=2.0,
    feature_types=grid_feature_types,
    flatten=True,
    sanitize=True,
)

In [7]:
import numpy as np
# Show the installed NumPy version (the environment notes at the top ask for numpy <1.25).
np.__version__

'1.24.4'

In [10]:
# Load the PDBbind "core" set, featurized with `featurizer`.
# The loader returns the task names, the dataset splits, and the transformers
# it applied.
# NOTE(review): reload=False presumably bypasses the on-disk cache, so every
# run re-featurizes the complexes - confirm against the DeepChem docs.
tasks, datasets, transformers = dc.molnet.load_pdbbind(
    featurizer=featurizer,
    reload=False,
    set_name="core",
)

  return vector / np.linalg.norm(vector)
[10:08:39] Explicit valence for atom # 248 C, 5, is greater than permitted
Mol [H]C([H])(C([H])([H])C([H])([H])[N+]([H])([H])[H])C([H])([H])[C@@]([H])(C=O)[N+]([H])([H])[H].[H]NC1([H])(C([H])([H])O[H])OC1=O.[H]N[C@]([H])(C(=O)N([H])[C@]([H])(C(=O)N([H])[C@]([H])(C=O)C([H])([H])C([H])(C([H])([H])[H])C([H])([H])[H])C([H])([H])[H])C([H])([H])[H].[H]N[C@]([H])(C(=O)N([H])[C@]([H])(C(=O)N([H])[C@]([H])(C=O)[C@]([H])(O[H])C([H])([H])[H])[C@]([H])(O[H])C([H])([H])[H])C([H])([H])O[H].[H]N[C@]([H])(C=O)C([H])(C([H])([H])[H])C([H])([H])[H].[H]N[C@]([H])(C=O)C([H])([H])C([H])(C([H])([H])[H])C([H])([H])[H].[H]N[C@]([H])(C=O)C([H])([H])C([H])(C([H])([H])[H])C([H])([H])[H].[H]N[C@]([H])(C=O)C([H])([H])C([H])(C([H])([H])[H])C([H])([H])[H].[H]N[C@]([H])(C=O)C([H])([H])C([H])([H])C([H])([H])C([H])([H])[N+]([H])([H])[H].[H]N[C@]([H])(C=O)C([H])([H])O[H].[H]N[C@]([H])(C=O)C([H])([H])[H].[H]N[C@]([H])(C=O)[C@]([H])(O[H])C([H])([H])[H].[H]N[C@]([H])(C=O)[C@]([H])(O[

In [12]:
# Task names returned by the loader.
tasks

['-logKd/Ki']

In [13]:
# Transformers returned by the loader; they are passed to model.evaluate further down.
transformers

[<deepchem.trans.transformers.NormalizationTransformer at 0x7fb272d54230>]

In [14]:
# Unpack the three dataset splits returned by the loader.
# (The middle name was previously misspelled as `vaild`.)
train, valid, test = datasets

In [15]:
# Report the dtypes of the training features, labels and weights, in that order.
for split_array in (train.X, train.y, train.w):
    print(split_array.dtype)

float64
float64
float32


In [16]:
# Cast features, labels and weights of the train and test splits to float32
# so all three arrays share one dtype (X and y are float64 as loaded).
x_train, y_train, w_train = (
    arr.astype("float32") for arr in (train.X, train.y, train.w)
)
x_test, y_test, w_test = (
    arr.astype("float32") for arr in (test.X, test.y, test.w)
)

In [17]:
# Verify that the cast to float32 took effect.
x_train.dtype

dtype('float32')

In [19]:
# Shape of the training feature matrix: (n_samples, n_features).
# NOTE(review): the recorded output is (154, 1), i.e. one feature per complex.
# That is unexpected for a flattened grid featurization - confirm the
# featurizer succeeded before trusting any model trained on this data.
train.X.shape

(154, 1)

In [None]:
# Wrap the float32 arrays in in-memory DeepChem datasets for training and testing.
train_dataset = dc.data.NumpyDataset(X=x_train, y=y_train, w=w_train)
test_dataset = dc.data.NumpyDataset(X=x_test, y=y_test, w=w_test)

In [None]:
# Feature-matrix shape; its second dimension is used as n_features when the model is built.
train_dataset.X.shape

In [None]:
# Single-task fully connected regressor. The input width is read from the
# dataset rather than hard-coded, so it follows whatever the featurizer produced.
# NOTE(review): if the feature matrix really has a single column (see the
# recorded train.X.shape), n_features will be 1 - confirm the featurization.
n_features = train_dataset.X.shape[1]
hidden_layer_sizes = [5000, 2000, 1000]
model = dc.models.MultitaskRegressor(
    n_tasks=1,
    n_features=n_features,
    layer_sizes=hidden_layer_sizes,
    dropouts=0.5,
)

In [None]:
# Train on the training split for a fixed number of epochs.
n_epochs = 200
model.fit(train_dataset, nb_epoch=n_epochs)

In [None]:
# Evaluation metric: DeepChem's pearson_r2_score wrapped in a Metric object,
# which is the form model.evaluate is given below.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)

In [None]:
# Score the model on the training split; the loader's transformers are passed
# along so evaluate can account for the label transformation.
train_score = model.evaluate(
    train_dataset,
    metrics=[metric],
    transformers=transformers,
)
print(train_score)

In [None]:
# Score the model on the held-out test split with the same metric and transformers.
test_score = model.evaluate(
    test_dataset,
    metrics=[metric],
    transformers=transformers,
)
print(test_score)

In [None]:
# Shape of the test feature matrix that is fed to predict_on_batch below.
test_dataset.X.shape

In [None]:
# Predict on the whole test feature matrix in one batch and preview the first rows.
# NOTE(review): no transformers are applied here, so the predictions are
# presumably on the same (normalized) scale as test_dataset.y - confirm.
prediction = model.predict_on_batch(test_dataset.X)
prediction[:5]

In [None]:
# First five test labels, for comparison with the first five predictions.
test_dataset.y[:5]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Check the raw prediction shape before flattening it for plotting.
prediction.shape

In [None]:
# Flatten the predictions to a 1-D vector for plotting.
# reshape(-1) lets NumPy infer the length, so this no longer depends on the
# test split having exactly 20 samples. The cell is also safe to re-run.
prediction = prediction.reshape(-1)
prediction.shape

In [None]:
# Scatter plot of true labels against predictions for the test split.
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(
    # Flatten both inputs: seaborn expects 1-D vectors, and the label array
    # may be stored as a column of shape (n_samples, 1).
    x=test_dataset.y.ravel(),
    y=prediction.ravel(),
    # Draw on the axes created above rather than the implicit current axes.
    ax=ax,
)
ax.set_title('pdbbind dataset')
ax.set_xlabel('True')
# Trailing semicolon suppresses the Text(...) repr in the notebook output.
ax.set_ylabel('prediction');