# Predict binding affinity from a protein-ligand crystal structure

In [1]:
# Packages installed in my conda environment: Deepchem, rdkit, pytorch, pytorch-geometric, tensorflow, pdbfixer, numpy <1.25
# Deep Learning for the Life Sciences by Bharath Ramsundar, Peter Eastman, Patrick Walters and Vijay Pande
# pdbbind dataset: Cheng, T.J. et al. J. Chem. Inf. Model., 2009, 49, 1079-1093. (PDBbind v.2009)

In [2]:
import deepchem as dc

2024-01-15 11:14:46.458848: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Skipped loading some PyTorch models, missing a dependency. No module named 'torch'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch'
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'torch'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [3]:
featurizer = dc.feat.RdkitGridFeaturizer(voxel_width=2.0,
                                         feature_types=["ecfp", "splif", "salt_bridge", "hbond"],
                                         flatten = True,
                                         sanitize = True)

In [4]:
tasks, datasets, transformers = dc.molnet.load_pdbbind(featurizer=featurizer, reload=False, set_name = "core")

  return vector / np.linalg.norm(vector)
[11:15:00] Explicit valence for atom # 248 C, 5, is greater than permitted
Mol [H]C([H])(C([H])([H])C([H])([H])[N+]([H])([H])[H])C([H])([H])[C@@]([H])(C=O)[N+]([H])([H])[H].[H]NC1([H])(C([H])([H])O[H])OC1=O.[H]N[C@]([H])(C(=O)N([H])[C@]([H])(C(=O)N([H])[C@]([H])(C=O)C([H])([H])C([H])(C([H])([H])[H])C([H])([H])[H])C([H])([H])[H])C([H])([H])[H].[H]N[C@]([H])(C(=O)N([H])[C@]([H])(C(=O)N([H])[C@]([H])(C=O)[C@]([H])(O[H])C([H])([H])[H])[C@]([H])(O[H])C([H])([H])[H])C([H])([H])O[H].[H]N[C@]([H])(C=O)C([H])(C([H])([H])[H])C([H])([H])[H].[H]N[C@]([H])(C=O)C([H])([H])C([H])(C([H])([H])[H])C([H])([H])[H].[H]N[C@]([H])(C=O)C([H])([H])C([H])(C([H])([H])[H])C([H])([H])[H].[H]N[C@]([H])(C=O)C([H])([H])C([H])(C([H])([H])[H])C([H])([H])[H].[H]N[C@]([H])(C=O)C([H])([H])C([H])([H])C([H])([H])C([H])([H])[N+]([H])([H])[H].[H]N[C@]([H])(C=O)C([H])([H])O[H].[H]N[C@]([H])(C=O)C([H])([H])[H].[H]N[C@]([H])(C=O)[C@]([H])(O[H])C([H])([H])[H].[H]N[C@]([H])(C=O)[C@]([H])(O[

In [23]:
tasks

['-logKd/Ki']

In [24]:
transformers

[<deepchem.trans.transformers.NormalizationTransformer at 0x7f21f1d3b7f0>]

In [25]:
train, vaild, test = datasets

In [26]:
print(train.X.dtype)
print(train.y.dtype)
print(train.w.dtype)

int8
float64
float32


In [27]:
x_train = train.X.astype("float32")
y_train = train.y.astype("float32")
w_train = train.w.astype("float32")

x_test = test.X.astype("float32")
y_test = test.y.astype("float32")
w_test = test.w.astype("float32")

In [28]:
x_train.dtype

dtype('float32')

In [29]:
train.X.shape

(154, 18432)

In [39]:
train_dataset = dc.data.NumpyDataset(x_train, y_train, w_train)
test_dataset = dc.data.NumpyDataset(x_test, y_test, w_test)

In [40]:
train_dataset.X.shape

(154, 18432)

In [43]:
train_dataset.y.shape

(154,)

In [63]:
from deepchem.models import DTNNModel

In [65]:
model = DTNNModel(n_tasks=1,
                  n_embedding=20,
                  n_distance=100,
                  learning_rate=1.0,
                  mode="regression",
                  )

In [68]:
model.fit(train_dataset, nb_epoch=250)

IndexError: too many indices for array: array is 2-dimensional, but 3 were indexed

In [None]:
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)

In [None]:
train_score = model.evaluate(train_dataset, [metric], transformers)
print(train_score)

In [None]:
test_score = model.evaluate(test_dataset, [metric], transformers)
print(test_score)

In [None]:
test_dataset.X.shape

In [None]:
prediction = model.predict_on_batch(test_dataset.X)
prediction[:5]

In [None]:
test_dataset.y[:5]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
prediction.shape

In [None]:
prediction = prediction.reshape(20)
prediction.shape

In [None]:
fig, ax = plt.subplots(figsize=(5,5))
ax = sns.scatterplot(x=test_dataset.y, y=prediction)
ax.set_title('pdbbind dataset')
ax.set_xlabel('True')
ax.set_ylabel('prediction')