In [1]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy import stats
from sklearn.ensemble import RandomForestRegressor

# Start from the FiveThirtyEight style sheet, then override its background
# and edge colours so that on-screen and saved figures are plain white.
plt.style.use('fivethirtyeight')

plt.rcParams.update({
    'axes.facecolor': 'white',
    'axes.edgecolor': 'white',
    'figure.facecolor': 'white',
    'savefig.facecolor': 'white',
})

Load the PDBbind training set.

In [None]:
# Binding-affinity values for the PDBbind training set. The CSV has no header
# row; its first column becomes the index and the single remaining column is
# collapsed into a Series.
# NOTE: `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed
# in pandas 2.0. Squeezing the parsed frame along the column axis is the
# documented replacement and also works on older pandas versions.
pdbbind_training_set_pk = pd.read_csv(
    '../data/pdbbind_training_set_binding_affinity.csv',
    index_col=0,
    header=None,
).squeeze('columns')
pdbbind_training_set = pdbbind_training_set_pk.index

# Feature table for the docked poses, indexed by the first CSV column.
docked_pose_features = pd.read_csv('../data/docked_pose_features.csv', index_col=0)

# Named groups of feature columns, one feature name per line in each file.
feature_name_files = {
    'LB': '../data/lb_feature_names.txt',
    'SB': '../data/sb_feature_names.txt',
    'HB': '../data/hb_feature_names.txt',
}
feature_sets = {}
for set_name, path in feature_name_files.items():
    with open(path) as f:
        # Skip blank lines so they cannot turn into empty feature names that
        # would later fail a column lookup.
        feature_sets[set_name] = pd.Index([line.strip() for line in f if line.strip()])

# Mapping from each complex identifier to the labels of its docked poses.
with open('../data/docked_pose_labels.json') as f:
    docked_pose_labels = json.load(f)

# Flatten the per-complex pose labels for every complex in the training set,
# preserving the training-set order. A complex that is missing from the JSON
# mapping raises KeyError, exactly as before.
pdbbind_training_set_labels = pd.Index([
    label
    for pdb in pdbbind_training_set
    for label in docked_pose_labels[pdb]
])

Load the updated DUD-E diverse set.

In [2]:
targets = ['AKT1', 'CP3A4', 'GCR', 'HIVPR', 'HIVRT', 'KIF11']

# Per-target raw feature dictionaries read straight from JSON:
#   docked_features[target][pose_label]  -> features of one docked pose
#   ligand_features[target][ligand_id]   -> RDKit descriptors of one ligand
docked_features = {}
ligand_features = {}
for target in targets:
    with open(f'../data/{target}_KI_docked_features.json') as f:
        docked_features[target] = json.load(f)
    with open(f'../data/{target}_KI_rdkit_descriptors.json') as f:
        ligand_features[target] = json.load(f)

# Concatenate structure-based and ligand-based features.
# A pose label is expected to start with the ligand identifier followed by
# '_', so the text before the first underscore selects the ligand descriptors.
# On a key clash the ligand descriptor value wins, because it is unpacked last.
features = {}
for target in targets:
    merged_by_pose = {}
    for label, pose_features in docked_features[target].items():
        ligand_id = label.split('_')[0]
        merged_by_pose[label] = {**pose_features, **ligand_features[target][ligand_id]}
    # Transpose so that rows are pose labels and columns are feature names.
    features[target] = pd.DataFrame(merged_by_pose).T

In [None]:
# For every target, read the cleaned activity table (indexed by its first
# column) and keep only the 'pChEMBL Value' column as a Series.
binding_data = {
    target: pd.read_csv(f'../data/{target}_KI_clean.csv', index_col=0)['pChEMBL Value']
    for target in targets
}

### Distribution of binding data

In [None]:
# One histogram panel per target on a 3x2 grid, each tagged with a bold
# panel letter placed just outside the top-left corner of the axes.
fig, axes = plt.subplots(3, 2, figsize=(12, 18))
axes = axes.flatten()
for target, ax, panel_letter in zip(targets, axes, ['A', 'B', 'C', 'D', 'E', 'F']):
    data = binding_data[target].values.ravel()
    # Unit-width bins with integer edges that enclose the observed range.
    lower_edge = int(min(data))
    upper_edge = int(max(data)) + 1
    bin_edges = np.arange(lower_edge, upper_edge + 1, 1)
    ax.hist(data, bins=bin_edges, alpha=0.5)
    ax.set(title=target, xlabel='pChEMBL value', ylabel='Frequency')
    ax.text(-0.1, 1.1, panel_letter, transform=ax.transAxes, size=20, weight='bold')
fig.tight_layout()
# Uncomment to write the figure to disk:
#fig.savefig('../figures/chembl_data_distribution.pdf', dpi=350, bbox_inches='tight')

Get the labels of the pose ranked highest by Smina for each ligand.

In [None]:
# Group the pose labels of every target by ligand: the ligand identifier is
# the part of the label before the first underscore.
# NOTE(review): base_ids is only initialised with empty lists here and is not
# filled in this cell; confirm whether a later cell relies on it.
base_ids = {}
conformer_ids = {}
for target in targets:
    base_ids[target] = []
    poses_by_ligand = {}
    for label in features[target].index:
        poses_by_ligand.setdefault(label.split('_')[0], []).append(label)
    conformer_ids[target] = poses_by_ligand

# Keep the first-listed pose of each ligand.
# NOTE(review): this treats the first pose in the feature table as the one
# ranked highest by Smina, which assumes the labels are stored in rank order.
# Confirm against the data files.
dude_top_docked_poses = {
    target: pd.Index([poses[0] for poses in conformer_ids[target].values()])
    for target in targets
}

### Inter-target validation

In [None]:
# Leave-one-target-out validation: for each held-out target, fit one random
# forest per feature set on the top docked poses of all other targets, then
# predict the affinity of every docked pose of the held-out target.
inter_target_pred = {}
for target in targets:
    train_feature_parts = []
    train_affinity_parts = []
    for other in targets:
        if other == target:
            continue
        top_pose_features = features[other].loc[dude_top_docked_poses[other]]
        # Re-index by ligand identifier (the label text before the first '_')
        # so that feature rows line up with the binding data.
        ligand_ids = pd.Index([pose.split('_')[0] for pose in top_pose_features.index])
        top_pose_features.index = ligand_ids
        train_feature_parts.append(top_pose_features)
        train_affinity_parts.append(binding_data[other].loc[ligand_ids])
    X_train = pd.concat(train_feature_parts, axis='index')
    y_train = pd.concat(train_affinity_parts, axis='index')

    predictions_by_feature_set = {}
    for set_name, columns in feature_sets.items():
        rf = RandomForestRegressor(
            n_estimators=500,
            max_features=0.33,
            oob_score=True,
            random_state=42,
            n_jobs=32,
        )
        rf.fit(X_train.loc[:, columns], y_train)

        # Predictions cover all poses of the held-out target, not only the
        # top-ranked ones, and are indexed by pose label.
        X_test = features[target].loc[:, columns]
        predictions_by_feature_set[set_name] = pd.Series(index=X_test.index, data=rf.predict(X_test))
    inter_target_pred[target] = predictions_by_feature_set