In [None]:
import json

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import TruncatedSVD

class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that also serialises NumPy arrays and NumPy scalars.

    ``json`` only understands built-in Python types. Arrays are converted with
    ``tolist()`` and NumPy scalar types (e.g. ``np.float32``, ``np.int64``)
    with ``item()``. Any other object is deferred to the base class, which
    raises ``TypeError`` for types it cannot serialise.
    """

    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            # NumPy scalars that are not subclasses of a Python number
            # (np.float32, np.int64, ...) are rejected by the stock encoder.
            return obj.item()
        return super().default(obj)

### Load data

In [None]:
# Binding-affinity table (no header row; first column is the index).
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0,
# so the single remaining column is squeezed into a Series explicitly.
pdbbind_training_set_pk = pd.read_csv('../data/pdbbind_training_set_binding_affinity.csv', index_col=0, header=None).squeeze('columns')

# One feature table per pose type, each indexed by its first column
crystal_pose_features = pd.read_csv('../data/crystal_pose_features.csv', index_col=0)
minimised_pose_features = pd.read_csv('../data/minimised_pose_features.csv', index_col=0)
docked_pose_features = pd.read_csv('../data/docked_pose_features.csv', index_col=0)

# Column names belonging to each feature set, one name per line in each file.
# Insertion order (LB, SB, HB) is the order later cells iterate in.
feature_sets = {}
for set_name in ('LB', 'SB', 'HB'):
    with open(f'../data/{set_name.lower()}_feature_names.txt') as f:
        feature_sets[set_name] = pd.Index([line.strip() for line in f])

# We've enumerated the docked poses associated to each PDB structure - these labels are used for cross-validation later
with open('../data/docked_pose_labels.json') as f:
    docked_pose_labels = json.load(f)

### Prepare cross-validation folds

Randomly shuffle and split into five folds - we'll use the same folds across all experiments.

In [None]:
# Shuffle the index reproducibly (fixed seed) and cut it into five consecutive
# slices; the last slice runs to the end, so it absorbs any remainder.
shuffled = pdbbind_training_set_pk.sample(frac=1, replace=False, random_state=42).index
n_test = len(shuffled) // 5
fold_starts = [k * n_test for k in range(5)]
fold_stops = fold_starts[1:] + [None]
folds = [shuffled[start:stop] for start, stop in zip(fold_starts, fold_stops)]

# Full (unshuffled) index, used to derive the training part of each fold
pdbbind_training_set = pdbbind_training_set_pk.index

### Performance of Vina scoring function on CV folds

In [None]:
# Vina predictions for each pose type, read as Series indexed by the first column.
# `squeeze=True` was removed from `read_csv` in pandas 2.0; squeeze explicitly instead.
vina_crystal_pk = pd.read_csv('../results/vina_crystal_predicted_pk.csv', index_col=0).squeeze('columns')
vina_docked_pk = pd.read_csv('../results/vina_docked_predicted_pk.csv', index_col=0).squeeze('columns')
vina_docked_mean_pk = pd.read_csv('../results/vina_docked_mean_predicted_pk.csv', index_col=0).squeeze('columns')
vina_minimized_pk = pd.read_csv('../results/vina_minimized_predicted_pk.csv', index_col=0).squeeze('columns')

# Per-fold Pearson correlation between measured and Vina-predicted values
vina_crystal_fold_pearsonr = []
vina_docked_fold_pearsonr = []
vina_mean_docked_fold_pearsonr = []
vina_minimized_fold_pearsonr = []

for fold in folds:
    y_true = pdbbind_training_set_pk.loc[fold]
    # crystal
    y_pred = vina_crystal_pk.loc[fold]
    vina_crystal_fold_pearsonr.append(stats.pearsonr(y_true, y_pred)[0])
    # docked
    y_pred = vina_docked_pk.loc[fold]
    vina_docked_fold_pearsonr.append(stats.pearsonr(y_true, y_pred)[0])
    # docked (mean)
    y_pred = vina_docked_mean_pk.loc[fold]
    vina_mean_docked_fold_pearsonr.append(stats.pearsonr(y_true, y_pred)[0])
    # minimized
    y_pred = vina_minimized_pk.loc[fold]
    vina_minimized_fold_pearsonr.append(stats.pearsonr(y_true, y_pred)[0])

# Mean of the per-fold correlations
vina_crystal_cv_pearsonr = np.mean(vina_crystal_fold_pearsonr)
vina_docked_cv_pearsonr = np.mean(vina_docked_fold_pearsonr)
vina_mean_docked_cv_pearsonr = np.mean(vina_mean_docked_fold_pearsonr)
vina_minimized_cv_pearsonr = np.mean(vina_minimized_fold_pearsonr)

# Spread of the per-fold correlations (np.std default, i.e. population std)
vina_crystal_cv_pearsonr_stdev = np.std(vina_crystal_fold_pearsonr)
vina_docked_cv_pearsonr_stdev = np.std(vina_docked_fold_pearsonr)
vina_mean_docked_cv_pearsonr_stdev = np.std(vina_mean_docked_fold_pearsonr)
vina_minimized_cv_pearsonr_stdev = np.std(vina_minimized_fold_pearsonr)

In [None]:
# Mean and standard deviation of the per-fold Pearson r, one line per Vina
# variant in the order: crystal, docked, docked (mean), minimized.
for cv_mean, cv_stdev in (
    (vina_crystal_cv_pearsonr, vina_crystal_cv_pearsonr_stdev),
    (vina_docked_cv_pearsonr, vina_docked_cv_pearsonr_stdev),
    (vina_mean_docked_cv_pearsonr, vina_mean_docked_cv_pearsonr_stdev),
    (vina_minimized_cv_pearsonr, vina_minimized_cv_pearsonr_stdev),
):
    print(cv_mean, cv_stdev)

# Cross-validation using crystal poses

For reference, we first establish a benchmark by performing the cross-validation experiment using crystal poses for training and testing.

In [None]:
# Cross-validation on crystal poses: for each feature set, fit a random forest
# on everything outside the fold and score it on the fold itself.
cv_crystal_results = {}

for model, feature_names in feature_sets.items():
    fold_pearsonr, fold_mse = [], []
    for fold in folds:
        index_train = pdbbind_training_set.difference(fold)

        X_train = crystal_pose_features.loc[index_train, feature_names].values
        y_train = pdbbind_training_set_pk.loc[index_train].values.ravel()
        X_test = crystal_pose_features.loc[fold, feature_names].values
        y_test = pdbbind_training_set_pk.loc[fold].values.ravel()

        rf = RandomForestRegressor(n_estimators=500, max_features=0.33, n_jobs=6, random_state=42)
        pred = rf.fit(X_train, y_train).predict(X_test)

        fold_pearsonr.append(stats.pearsonr(y_test, pred)[0])
        fold_mse.append(mean_squared_error(y_test, pred))

    # RMSE is the square root of the mean of the per-fold MSEs
    cv_crystal_results[model] = {
        'pearsonr': np.mean(fold_pearsonr),
        'rmse': np.sqrt(np.mean(fold_mse)),
        'pearsonr_stdev': np.std(fold_pearsonr),
    }

with open('../results/cv_crystal_results.json', 'w') as f:
    json.dump(cv_crystal_results, f)

In [None]:
# Same cross-validation protocol as for crystal poses, but both the training
# and the test features come from the minimised-pose feature table.
cv_minimised_results = {}

for model in feature_sets:
    columns = feature_sets[model]
    fold_pearsonr = []
    fold_mse = []
    for fold in folds:
        index_train = pdbbind_training_set.difference(fold)

        # Features
        X_train = minimised_pose_features.loc[index_train, columns].values
        X_test = minimised_pose_features.loc[fold, columns].values
        # Targets
        y_train = pdbbind_training_set_pk.loc[index_train].values.ravel()
        y_test = pdbbind_training_set_pk.loc[fold].values.ravel()

        rf = RandomForestRegressor(n_estimators=500, max_features=0.33, n_jobs=6, random_state=42)
        rf.fit(X_train, y_train)
        pred = rf.predict(X_test)

        fold_pearsonr.append(stats.pearsonr(y_test, pred)[0])
        fold_mse.append(mean_squared_error(y_test, pred))

    cv_minimised_results[model] = dict(
        pearsonr=np.mean(fold_pearsonr),
        rmse=np.sqrt(np.mean(fold_mse)),
        pearsonr_stdev=np.std(fold_pearsonr),
    )

with open('../results/cv_minimised_results.json', 'w') as f:
    json.dump(cv_minimised_results, f)

### Cross-validation using minimised poses

We repeat the benchmarking exercise, this time using minimized poses of each ligand for training and testing.

### PLEC

In [None]:
from oddt.fingerprints import sparse_to_csr_matrix, csr_matrix_to_sparse, fold, sparse_to_dense
from scipy import sparse

# Training-set identifiers, one per line
with open('../data/pdbbind_training_set.txt') as f:
    train_pdbs = [line.strip() for line in f]

# Sparse PLEC features for the docked poses.
# NOTE(review): entries are paired with `train_pdbs` purely by position, and
# zip() silently truncates to the shorter input - confirm the files line up.
with open('pdbbind_training_set_docked_plec_sparse.json') as f:
    train_features_docked = json.load(f)

plec_docked = dict(zip(train_pdbs, train_features_docked))

# Sparse PLEC features for the crystal poses, paired the same way
with open('pdbbind_training_set_crystal_plec_sparse.json') as f:
    train_features_crystal = json.load(f)

plec_crystal = dict(zip(train_pdbs, train_features_crystal))

### PLECscore

Using the original parameters, i.e. 100 trees.

In [None]:
%%time
plecscore_crystal_fold_predicted = []
plecscore_docked_fold_predicted = []
plecscore_fold_crystal_pearsonr = []
plecscore_fold_docked_pearsonr = []

# One entry per pose type:
# (printed label, PLEC features keyed by PDB, per-fold Pearson r, per-fold predictions)
plecscore_runs = [
    ('Crystal', plec_crystal, plecscore_fold_crystal_pearsonr, plecscore_crystal_fold_predicted),
    ('Docked', plec_docked, plecscore_fold_docked_pearsonr, plecscore_docked_fold_predicted),
]

for i, fold in enumerate(folds):
    print(f'Fold {i+1}')
    fold_train = pdbbind_training_set.difference(fold)
    y_train = pdbbind_training_set_pk.loc[fold_train].values.ravel()
    y_test = pdbbind_training_set_pk.loc[fold].values.ravel()

    for pose_label, plec_by_pdb, pearsonr_log, predicted_log in plecscore_runs:
        training_features = [plec_by_pdb[pdb] for pdb in fold_train]
        test_features = [plec_by_pdb[pdb] for pdb in fold]
        # Expand each sparse fingerprint into one 65536-wide CSR row and stack the rows
        X_train = sparse.vstack([sparse_to_csr_matrix(plec, size=65536) for plec in training_features], format='csr')
        X_test = sparse.vstack([sparse_to_csr_matrix(plec, size=65536) for plec in test_features], format='csr')
        rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=6)
        print(pose_label)
        rf.fit(X_train, y_train)
        pred = rf.predict(X_test)
        rp = stats.pearsonr(y_test, pred)[0]
        pearsonr_log.append(rp)
        # Keep the individual predictions keyed by PDB code
        predicted_log.append(dict(zip(fold, pred)))

In [None]:
%%time
plec_crystal_fold_predicted = []
plec_docked_fold_predicted = []
plec_fold_crystal_pearsonr = []
plec_fold_docked_pearsonr = []

# Same protocol as the 100-tree run, but with 500 trees per forest.
# Each tuple: (printed label, PLEC features keyed by PDB, Pearson r list, predictions list)
plec_runs = (
    ('Crystal', plec_crystal, plec_fold_crystal_pearsonr, plec_crystal_fold_predicted),
    ('Docked', plec_docked, plec_fold_docked_pearsonr, plec_docked_fold_predicted),
)

for i, fold in enumerate(folds):
    print(f'Fold {i+1}')
    fold_train = pdbbind_training_set.difference(fold)
    y_train = pdbbind_training_set_pk.loc[fold_train].values.ravel()
    y_test = pdbbind_training_set_pk.loc[fold].values.ravel()

    for pose_label, plec_by_pdb, fold_rs, fold_predictions in plec_runs:
        training_features = [plec_by_pdb[pdb] for pdb in fold_train]
        test_features = [plec_by_pdb[pdb] for pdb in fold]
        X_train = sparse.vstack([sparse_to_csr_matrix(plec, size=65536) for plec in training_features], format='csr')
        X_test = sparse.vstack([sparse_to_csr_matrix(plec, size=65536) for plec in test_features], format='csr')
        rf = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=6)
        print(pose_label)
        rf.fit(X_train, y_train)
        pred = rf.predict(X_test)
        rp = stats.pearsonr(y_test, pred)[0]
        fold_rs.append(rp)
        fold_predictions.append(dict(zip(fold, pred)))

### PLEC with Truncated SVD

In [None]:
%%time
plec_svd_crystal_fold_predicted = []
plec_svd_docked_fold_predicted = []
plec_svd_fold_crystal_pearsonr = []
plec_svd_fold_docked_pearsonr = []
plec_svd_rdk_crystal_fold_predicted = []
plec_svd_rdk_docked_fold_predicted = []
plec_svd_rdk_fold_crystal_pearsonr = []
plec_svd_rdk_fold_docked_pearsonr = []

# LB feature columns, taken from the crystal-pose table and reused for both pose types
lb_features = crystal_pose_features.loc[pdbbind_training_set, feature_sets['LB']]

# One entry per pose type:
# (printed label, PLEC features keyed by PDB,
#  SVD-only Pearson r / predictions, SVD + LB Pearson r / predictions)
plec_svd_runs = [
    ('Crystal', plec_crystal,
     plec_svd_fold_crystal_pearsonr, plec_svd_crystal_fold_predicted,
     plec_svd_rdk_fold_crystal_pearsonr, plec_svd_rdk_crystal_fold_predicted),
    ('Docked', plec_docked,
     plec_svd_fold_docked_pearsonr, plec_svd_docked_fold_predicted,
     plec_svd_rdk_fold_docked_pearsonr, plec_svd_rdk_docked_fold_predicted),
]

for i, fold in enumerate(folds):
    print(f'Fold {i+1}')
    fold_train = pdbbind_training_set.difference(fold)

    # lb features
    rdk_train = lb_features.loc[fold_train].values
    rdk_test = lb_features.loc[fold].values

    y_train = pdbbind_training_set_pk.loc[fold_train].values.ravel()
    y_test = pdbbind_training_set_pk.loc[fold].values.ravel()

    for (pose_label, plec_by_pdb,
         svd_pearsonr, svd_predicted,
         svd_rdk_pearsonr, svd_rdk_predicted) in plec_svd_runs:
        training_features = [plec_by_pdb[pdb] for pdb in fold_train]
        test_features = [plec_by_pdb[pdb] for pdb in fold]
        X_train = sparse.vstack([sparse_to_csr_matrix(plec, size=65536) for plec in training_features], format='csr')
        X_test = sparse.vstack([sparse_to_csr_matrix(plec, size=65536) for plec in test_features], format='csr')

        # Fit the projection on the training rows only, then apply it to both splits
        svd = TruncatedSVD(n_components=200, random_state=42).fit(X_train)
        X_train = svd.transform(X_train)
        X_test = svd.transform(X_test)

        # The same forest settings are used for crystal and docked poses. The
        # docked SVD-only forest previously omitted max_features=0.33, which
        # made the two pose types incomparable.
        rf = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=8, max_features=0.33)
        print(pose_label)
        rf.fit(X_train, y_train)
        pred = rf.predict(X_test)
        rp = stats.pearsonr(y_test, pred)[0]
        svd_pearsonr.append(rp)
        svd_predicted.append({pdb: score for pdb, score in zip(fold, pred)})

        # add lb features
        X_train_rdk = np.concatenate((X_train, rdk_train), axis=1)
        X_test_rdk = np.concatenate((X_test, rdk_test), axis=1)

        rf = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=8, max_features=0.33)
        rf.fit(X_train_rdk, y_train)
        pred = rf.predict(X_test_rdk)
        rp = stats.pearsonr(y_test, pred)[0]
        svd_rdk_pearsonr.append(rp)
        svd_rdk_predicted.append({pdb: score for pdb, score in zip(fold, pred)})

In [None]:
# Write the per-fold PLEC predictions (one {pdb: score} dict per fold) to disk
for fold_predictions, out_path in (
    (plec_crystal_fold_predicted, '../results/cross_validation_plec_crystal_predictions.json'),
    (plec_docked_fold_predicted, '../results/cross_validation_plec_docked_predictions.json'),
):
    with open(out_path, 'w') as f:
        json.dump(fold_predictions, f, cls=NumpyEncoder)

In [None]:
# Recompute the per-fold Pearson r from the stored crystal-pose PLEC predictions
plec_fold_crystal_pearsonr = []
for fold_scores in plec_crystal_fold_predicted:
    predicted_pk = pd.Series(fold_scores)
    # Look up the measured values in the same order as the predictions
    measured_pk = pdbbind_training_set_pk.loc[predicted_pk.index]
    fold_r = stats.pearsonr(measured_pk, predicted_pk)[0]
    plec_fold_crystal_pearsonr.append(fold_r)

In [None]:
# Recompute the per-fold Pearson r from the stored docked-pose PLEC predictions
plec_fold_docked_pearsonr = []
for fold_scores in plec_docked_fold_predicted:
    predicted_pk = pd.Series(fold_scores)
    # Look up the measured values in the same order as the predictions
    measured_pk = pdbbind_training_set_pk.loc[predicted_pk.index]
    fold_r = stats.pearsonr(measured_pk, predicted_pk)[0]
    plec_fold_docked_pearsonr.append(fold_r)

In [None]:
# Per-fold predictions of a forest trained on the LB feature columns only,
# stored as one Series per fold indexed by the fold's identifiers.
lb_fold_docked_predicted = []

for fold in folds:
    index_train = pdbbind_training_set.difference(fold)

    X_train, X_test = lb_features.loc[index_train].values, lb_features.loc[fold].values
    y_train = pdbbind_training_set_pk.loc[index_train].values.ravel()
    y_test = pdbbind_training_set_pk.loc[fold].values.ravel()

    rf = RandomForestRegressor(n_estimators=500, max_features=0.33, n_jobs=6, random_state=42)
    rf.fit(X_train, y_train)

    pred = pd.Series(data=rf.predict(X_test), index=fold)
    lb_fold_docked_predicted.append(pred)

In [None]:
# Average the docked-pose PLEC prediction with the LB prediction for each
# complex and score the averaged prediction fold by fold.
fold_rps = []
for plec_fold, lb_fold in zip(plec_docked_fold_predicted, lb_fold_docked_predicted):
    # Put the PLEC predictions in the same order as the LB predictions
    pred = pd.Series(plec_fold).loc[lb_fold.index]
    stacked_pred = pd.Series((pred.values + lb_fold.values) / 2, index=pred.index)
    y_true = pdbbind_training_set_pk.loc[pred.index]
    fold_rps.append(stats.pearsonr(y_true, stacked_pred)[0])
np.mean(fold_rps), np.std(fold_rps)

In [None]:
# Average the crystal-pose PLEC prediction with the LB prediction for each
# complex and score the averaged prediction fold by fold.
fold_rps = []
for plec_fold, lb_fold in zip(plec_crystal_fold_predicted, lb_fold_docked_predicted):
    # Put the PLEC predictions in the same order as the LB predictions
    pred = pd.Series(plec_fold).loc[lb_fold.index]
    stacked_pred = pd.Series((pred.values + lb_fold.values) / 2, index=pred.index)
    y_true = pdbbind_training_set_pk.loc[pred.index]
    fold_rps.append(stats.pearsonr(y_true, stacked_pred)[0])
np.mean(fold_rps), np.std(fold_rps)

In [None]:
# NOTE(review): this cell looks like an unfinished draft of the docked-pose
# experiments - `cv_docked_results` and `fold_results` are never filled and
# `pred` is computed but not scored. Only the missing ':' on the `for` line
# (a SyntaxError that stopped the notebook from running top to bottom) is
# fixed here; confirm whether the cell should be completed or removed.
cv_docked_results = {}

for model in ['SB', 'LB']:
    fold_results = []
    for fold in folds:
        index_train = pdbbind_training_set.difference(fold)
        training_pose_labels = []

        # Get the labels for the highest-ranked pose for each training complex
        for pdb in index_train:
            training_pose_labels.append(docked_pose_labels[pdb][0])
        training_pose_labels = pd.Index(training_pose_labels)

        X_train = docked_pose_features.loc[training_pose_labels, feature_sets[model]].values
        y_train = pdbbind_training_set_pk.loc[index_train].values.ravel()

        rf = RandomForestRegressor(n_estimators=500, max_features=0.33, n_jobs=6, random_state=42)
        rf.fit(X_train, y_train)

        # List docked poses for this fold
        fold_dock_labels = []
        for pdb in fold:
            fold_dock_labels.extend(docked_pose_labels[pdb])
        fold_dock_labels = pd.Index(fold_dock_labels)

        X_test = docked_pose_features.loc[fold_dock_labels, feature_sets[model]].values
        y_test = pdbbind_training_set_pk.loc[fold].values.ravel()

        # One predicted score per docked pose, indexed by pose label
        pred = pd.Series(data=rf.predict(X_test), index=fold_dock_labels)

### Experiment 1 - scoring strategy

First we run the cross-validation experiment using different strategies for scoring a ligand when multiple docked poses are available. Three strategies were tested: scoring the pose ranked highest by Smina ("top dock"); scoring all poses and taking the highest score ("all docks max"); and scoring all poses and taking the mean score ("all docks mean"). Models are trained using a single pose for each ligand, minimised using Smina to achieve a single near-native docked pose. We also train and test using crystal poses for reference.

In [None]:
cv_train_minimized_test_top_dock_results = {}
cv_train_minimized_test_all_docks_max_results = {}
cv_train_minimized_test_all_docks_mean_results = {}

for model, feature_names in feature_sets.items():
    fold_pearsonr_max, fold_pearsonr_mean, fold_pearsonr_top = [], [], []
    fold_mse_max, fold_mse_mean, fold_mse_top = [], [], []
    for fold in folds:
        # Train on the minimised pose of every complex outside the fold
        index_train = pdbbind_training_set.difference(fold)
        X_train = minimised_pose_features.loc[index_train, feature_names].values
        y_train = pdbbind_training_set_pk.loc[index_train].values.ravel()
        rf = RandomForestRegressor(n_estimators=500, max_features=0.33, n_jobs=6, random_state=42)
        rf.fit(X_train, y_train)

        # List docked poses for this fold
        fold_dock_labels = pd.Index([label for pdb in fold for label in docked_pose_labels[pdb]])

        X_test = docked_pose_features.loc[fold_dock_labels, feature_names].values
        y_test = pdbbind_training_set_pk.loc[fold].values.ravel()

        # One predicted score per docked pose, indexed by pose label
        pred = pd.Series(data=rf.predict(X_test), index=fold_dock_labels)

        # Score all poses, taking max/mean score for each ligand
        scores_per_ligand = [pred.loc[docked_pose_labels[pdb]] for pdb in fold]
        max_pred = [np.max(pose_scores) for pose_scores in scores_per_ligand]
        mean_pred = [np.mean(pose_scores) for pose_scores in scores_per_ligand]
        fold_pearsonr_max.append(stats.pearsonr(y_test, max_pred)[0])
        fold_pearsonr_mean.append(stats.pearsonr(y_test, mean_pred)[0])
        fold_mse_max.append(mean_squared_error(y_test, max_pred))
        fold_mse_mean.append(mean_squared_error(y_test, mean_pred))

        # Take the score of the pose ranked highest by Smina
        top_pred = [pred.loc[docked_pose_labels[pdb][0]] for pdb in fold]
        fold_pearsonr_top.append(stats.pearsonr(y_test, top_pred)[0])
        fold_mse_top.append(mean_squared_error(y_test, top_pred))

    # Summarise each scoring strategy over the folds
    strategy_summaries = (
        (cv_train_minimized_test_all_docks_max_results, fold_pearsonr_max, fold_mse_max),
        (cv_train_minimized_test_all_docks_mean_results, fold_pearsonr_mean, fold_mse_mean),
        (cv_train_minimized_test_top_dock_results, fold_pearsonr_top, fold_mse_top),
    )
    for strategy_results, pearsonr_values, mse_values in strategy_summaries:
        strategy_results[model] = {'pearsonr': np.mean(pearsonr_values),
                                   'pearsonr_stdev': np.std(pearsonr_values),
                                   'rmse': np.sqrt(np.mean(mse_values))}

In [None]:
# Mean CV Pearson r per test-time scoring strategy (models trained on minimised
# poses), with the crystal train/test result alongside for reference.
pearsonr_sources = {
    'Smina top pose': cv_train_minimized_test_top_dock_results,
    'Maximum pose score': cv_train_minimized_test_all_docks_max_results,
    'Mean pose score': cv_train_minimized_test_all_docks_mean_results,
    'Train-test crystal': cv_crystal_results,
}
test_strategy_pearsonr = pd.DataFrame({
    strategy: {model: results[model]['pearsonr'] for model in feature_sets}
    for strategy, results in pearsonr_sources.items()
})
# Fix the row order, then give the rows display names
test_strategy_pearsonr = test_strategy_pearsonr.loc[['LB','SB','HB']]
test_strategy_pearsonr.index = ['LB model', 'SB model','HB model']
test_strategy_pearsonr.T.to_csv('../results/train_minimised_pose_cv_pearsonr.csv')
test_strategy_pearsonr.T

In [None]:
# Standard deviation of the per-fold Pearson r for the same strategies
pearsonr_stdev_sources = {
    'Smina top pose': cv_train_minimized_test_top_dock_results,
    'Maximum pose score': cv_train_minimized_test_all_docks_max_results,
    'Mean pose score': cv_train_minimized_test_all_docks_mean_results,
    'Train-test crystal': cv_crystal_results,
}
test_strategy_pearsonr_stdev = pd.DataFrame({
    strategy: {model: results[model]['pearsonr_stdev'] for model in feature_sets}
    for strategy, results in pearsonr_stdev_sources.items()
})
test_strategy_pearsonr_stdev = test_strategy_pearsonr_stdev.loc[['LB','SB','HB']]
test_strategy_pearsonr_stdev.index = ['LB model', 'SB model','HB model']
test_strategy_pearsonr_stdev.T

### Experiment 2 - Training on docked poses

Next, we repeat the cross-validation experiment, this time training on the docked pose ranked highest by Smina for each ligand.

In [None]:
cv_train_top_dock_test_top_dock_results = {}
cv_train_top_dock_test_all_docks_max_results = {}
cv_train_top_dock_test_all_docks_mean_results = {}

for model, feature_names in feature_sets.items():
    fold_pearsonr_max, fold_pearsonr_mean, fold_pearsonr_top = [], [], []
    fold_mse_max, fold_mse_mean, fold_mse_top = [], [], []
    for fold in folds:
        index_train = pdbbind_training_set.difference(fold)

        # Get the labels for the highest-ranked pose for each training complex
        training_pose_labels = pd.Index([docked_pose_labels[pdb][0] for pdb in index_train])

        X_train = docked_pose_features.loc[training_pose_labels, feature_names].values
        y_train = pdbbind_training_set_pk.loc[index_train].values.ravel()

        rf = RandomForestRegressor(n_estimators=500, max_features=0.33, n_jobs=6, random_state=42)
        rf.fit(X_train, y_train)

        # List docked poses for this fold
        fold_dock_labels = pd.Index([label for pdb in fold for label in docked_pose_labels[pdb]])

        X_test = docked_pose_features.loc[fold_dock_labels, feature_names].values
        y_test = pdbbind_training_set_pk.loc[fold].values.ravel()

        # One predicted score per docked pose, indexed by pose label
        pred = pd.Series(data=rf.predict(X_test), index=fold_dock_labels)

        # Score all poses, taking max/mean score for each ligand
        max_pred, mean_pred = [], []
        for pdb in fold:
            ligand_scores = pred.loc[docked_pose_labels[pdb]]
            max_pred.append(np.max(ligand_scores))
            mean_pred.append(np.mean(ligand_scores))
        fold_pearsonr_max.append(stats.pearsonr(y_test, max_pred)[0])
        fold_mse_max.append(mean_squared_error(y_test, max_pred))
        fold_pearsonr_mean.append(stats.pearsonr(y_test, mean_pred)[0])
        fold_mse_mean.append(mean_squared_error(y_test, mean_pred))

        # Take the score of the pose ranked highest by Smina
        top_pred = [pred.loc[docked_pose_labels[pdb][0]] for pdb in fold]
        fold_pearsonr_top.append(stats.pearsonr(y_test, top_pred)[0])
        fold_mse_top.append(mean_squared_error(y_test, top_pred))

    cv_train_top_dock_test_all_docks_max_results[model] = {
        'pearsonr': np.mean(fold_pearsonr_max),
        'pearsonr_stdev': np.std(fold_pearsonr_max),
        'rmse': np.sqrt(np.mean(fold_mse_max)),
    }
    cv_train_top_dock_test_all_docks_mean_results[model] = {
        'pearsonr': np.mean(fold_pearsonr_mean),
        'pearsonr_stdev': np.std(fold_pearsonr_mean),
        'rmse': np.sqrt(np.mean(fold_mse_mean)),
    }
    cv_train_top_dock_test_top_dock_results[model] = {
        'pearsonr': np.mean(fold_pearsonr_top),
        'pearsonr_stdev': np.std(fold_pearsonr_top),
        'rmse': np.sqrt(np.mean(fold_mse_top)),
    }

In [None]:
# Mean CV Pearson r per test-time scoring strategy (models trained on the top
# docked pose), with the crystal train/test result alongside for reference.
top_dock_pearsonr_sources = {
    'Smina top pose': cv_train_top_dock_test_top_dock_results,
    'Maximum pose score': cv_train_top_dock_test_all_docks_max_results,
    'Mean pose score': cv_train_top_dock_test_all_docks_mean_results,
    'Train-test crystal': cv_crystal_results,
}
test_strategy_pearsonr = pd.DataFrame({
    strategy: {model: results[model]['pearsonr'] for model in feature_sets}
    for strategy, results in top_dock_pearsonr_sources.items()
})
# Fix the row order, then give the rows display names
test_strategy_pearsonr = test_strategy_pearsonr.loc[['LB','SB','HB']]
test_strategy_pearsonr.index = ['LB model', 'SB model','HB model']
test_strategy_pearsonr.T.to_csv('../results/train_top_docked_pose_cv_pearsonr.csv')
test_strategy_pearsonr.T

In [None]:
# Standard deviation of the per-fold Pearson r for the same strategies
top_dock_pearsonr_stdev_sources = {
    'Smina top pose': cv_train_top_dock_test_top_dock_results,
    'Maximum pose score': cv_train_top_dock_test_all_docks_max_results,
    'Mean pose score': cv_train_top_dock_test_all_docks_mean_results,
    'Train-test crystal': cv_crystal_results,
}
test_strategy_pearsonr_stdev = pd.DataFrame({
    strategy: {model: results[model]['pearsonr_stdev'] for model in feature_sets}
    for strategy, results in top_dock_pearsonr_stdev_sources.items()
})
test_strategy_pearsonr_stdev = test_strategy_pearsonr_stdev.loc[['LB','SB','HB']]
test_strategy_pearsonr_stdev.index = ['LB model', 'SB model','HB model']
test_strategy_pearsonr_stdev.T

### Experiment 3 - Training using multiple poses

Next we again repeat the cross-validation experiment, this time training on all of the docked poses for each ligand. To control for the effect of increasing the size of training set, we also repeat the experiment by training on a number of redundant copies of the top pose for each ligand equal to the number of docked poses.

In [None]:
# Cross-validate on docked poses, training on every docked pose of every training complex
cv_train_all_docks_test_all_docks_results = {}

for model, feature_names in feature_sets.items():
    fold_pearsonr, fold_mse = [], []
    for fold in folds:
        index_train = pdbbind_training_set.difference(fold)

        # All pose labels of all training complexes, flattened
        training_pose_labels = pd.Index([label for pdb in index_train for label in docked_pose_labels[pdb]])

        X_train = docked_pose_features.loc[training_pose_labels, feature_names].values
        # Training affinities are the same for each pose of a ligand; the
        # first four characters of a pose label are used as the lookup key
        training_pose_pdbs = pd.Index([label[:4] for label in training_pose_labels])
        y_train = pdbbind_training_set_pk.loc[training_pose_pdbs].values.ravel()

        rf = RandomForestRegressor(n_estimators=500, max_features=0.33, n_jobs=6, random_state=42)
        rf.fit(X_train, y_train)

        # test on all docks
        fold_pose_labels = pd.Index([label for pdb in fold for label in docked_pose_labels[pdb]])
        X_test = docked_pose_features.loc[fold_pose_labels, feature_names].values
        y_test = pdbbind_training_set_pk.loc[fold].values.ravel()

        pred = pd.Series(data=rf.predict(X_test), index=fold_pose_labels)
        # Each ligand is scored by the highest prediction over its poses
        max_pred = [np.max(pred.loc[docked_pose_labels[pdb]]) for pdb in fold]
        fold_pearsonr.append(stats.pearsonr(y_test, max_pred)[0])
        fold_mse.append(mean_squared_error(y_test, max_pred))

    cv_train_all_docks_test_all_docks_results[model] = {
        'pearsonr': np.mean(fold_pearsonr),
        'rmse': np.sqrt(np.mean(fold_mse)),
        'pearsonr_stdev': np.std(fold_pearsonr),
    }

In [None]:
# Control for training-set size: train on N identical copies of the top pose
# of each ligand, where N is that ligand's number of docked poses.
cv_train_redundant_docks_test_all_docks_results = {}

for model in feature_sets:
    fold_pearsonr = []
    fold_mse = []
    for fold in folds:

        index_train = pdbbind_training_set.difference(fold)

        # This time we want N copies of the pose ranked highest by Smina.
        # The top pose is taken from docked_pose_labels[pdb][0], exactly as in
        # the top-dock experiment, instead of rebuilding the label as
        # label[:4] + '_0', which hard-coded the label format.
        training_pose_labels = []
        training_pose_pdbs = []
        for pdb in index_train:
            poses = docked_pose_labels[pdb]
            # poses[:1] is empty for a ligand without poses, so it is skipped
            training_pose_labels.extend(poses[:1] * len(poses))
            training_pose_pdbs.extend([pdb] * len(poses))
        training_pose_labels = pd.Index(training_pose_labels)
        X_train = docked_pose_features.loc[training_pose_labels, feature_sets[model]].values

        # Training affinities are the same for each pose of a ligand
        training_pose_pdbs = pd.Index(training_pose_pdbs)
        y_train = pdbbind_training_set_pk.loc[training_pose_pdbs].values.ravel()

        rf = RandomForestRegressor(n_estimators=500, max_features=0.33, n_jobs=6, random_state=42)
        rf.fit(X_train, y_train)

        # test on all docks
        fold_pose_labels = []
        for pdb in fold:
            fold_pose_labels.extend(docked_pose_labels[pdb])
        fold_pose_labels = pd.Index(fold_pose_labels)
        X_test = docked_pose_features.loc[fold_pose_labels, feature_sets[model]].values
        y_test = pdbbind_training_set_pk.loc[fold].values.ravel()

        pred = pd.Series(data=rf.predict(X_test), index=fold_pose_labels)
        # Each ligand is scored by the highest prediction over its poses
        max_pred = []
        for pdb in fold:
            max_pred.append(np.max(pred.loc[docked_pose_labels[pdb]]))
        fold_pearsonr.append(stats.pearsonr(y_test, max_pred)[0])
        fold_mse.append(mean_squared_error(y_test, max_pred))

    cv_train_redundant_docks_test_all_docks_results[model] = {'pearsonr': np.mean(fold_pearsonr),
                                                              'rmse': np.sqrt(np.mean(fold_mse)),
                                                              'pearsonr_stdev': np.std(fold_pearsonr)}

In [None]:
# Mean CV Pearson r for the three training regimes; all three are tested by
# taking the maximum score over each ligand's docked poses.
multipose_pearsonr = pd.DataFrame({
    m: {
        'Smina top pose': cv_train_top_dock_test_all_docks_max_results[m]['pearsonr'],
        'All poses': cv_train_all_docks_test_all_docks_results[m]['pearsonr'],
        'Redundant poses': cv_train_redundant_docks_test_all_docks_results[m]['pearsonr'],
    }
    for m in feature_sets
})

# Fix the column order, then give the columns display names
multipose_pearsonr = multipose_pearsonr[['LB', 'SB', 'HB']]
multipose_pearsonr.columns = ['LB model', 'SB model','HB model']
multipose_pearsonr.to_csv('../results/train_multiple_poses_cv_pearsonr.csv')
multipose_pearsonr

In [None]:
# Same layout as the Pearson correlation table, but reporting the 'pearsonr_stdev' entry
# of each result: rows are the training-pose strategies, columns are the feature-set models.
pose_strategy_results = [
    ('Smina top pose', cv_train_top_dock_test_all_docks_max_results),
    ('All poses', cv_train_all_docks_test_all_docks_results),
    ('Redundant poses', cv_train_redundant_docks_test_all_docks_results),
]

pearsonr_stdev_by_model = {}
for m in feature_sets:
    pearsonr_stdev_by_model[m] = {label: results[m]['pearsonr_stdev'] for label, results in pose_strategy_results}

# Fix the column order before giving the columns their display names
multipose_pearsonr_stdev = pd.DataFrame(pearsonr_stdev_by_model)[['LB', 'SB', 'HB']]
multipose_pearsonr_stdev = multipose_pearsonr_stdev.rename(columns={'LB': 'LB model', 'SB': 'SB model', 'HB': 'HB model'})
multipose_pearsonr_stdev

In [None]:
# Load the precomputed RF-Score v2 feature tables for the crystal and docked poses.
# Each table is transposed after construction; the resulting rows are later selected
# with .loc using the training-set index, so the JSON is presumably keyed by PDB code.
with open('pdbbind_training_set_crystal_rfv2.json') as f:
    crystal_rfv2_records = json.load(f)
rfv2_features_crystal = pd.DataFrame(crystal_rfv2_records).T

with open('pdbbind_training_set_docked_rfv2.json') as f:
    docked_rfv2_records = json.load(f)
rfv2_features_docked = pd.DataFrame(docked_rfv2_records).T

### RF-Score v2

In [None]:
%%time

lb_features = crystal_pose_features.loc[pdbbind_training_set, feature_sets['LB']]
rfv2_rdk_features_crystal = pd.concat([rfv2_features_crystal, lb_features], axis='columns')
rfv2_rdk_features_docked = pd.concat([rfv2_features_docked, lb_features], axis='columns')

rfv2_cv_crystal_results = {}
rfv2_cv_docked_results = {}
rfv2_rdk_cv_crystal_results = {}
rfv2_rdk_cv_docked_results = {}

print('RFv2 crytsal')
fold_pearsonr = []
fold_mse = []
for fold in folds:
    index_train = pdbbind_training_set.difference(fold)
    X_train = rfv2_features_crystal.loc[index_train].values
    X_test = rfv2_features_crystal.loc[fold].values
    y_train = pdbbind_training_set_pk.loc[index_train].values.ravel()
    y_test = pdbbind_training_set_pk.loc[fold].values.ravel()
    rf = RandomForestRegressor(n_estimators=500, max_features=14, n_jobs=6, random_state=42) #mtry=14 for rfv2
    rf.fit(X_train, y_train)
    pred = rf.predict(X_test)
    fold_pearsonr.append(stats.pearsonr(y_test, pred)[0])
    fold_mse.append(mean_squared_error(y_test, pred))
rfv2_cv_crystal_results = {'pearsonr': np.mean(fold_pearsonr), 'rmse': np.sqrt(np.mean(fold_mse)), 'pearsonr_stdev': np.std(fold_pearsonr)}

print('RFv2 docked')
fold_pearsonr = []
fold_mse = []
for fold in folds:
    index_train = pdbbind_training_set.difference(fold)
    X_train = rfv2_features_docked.loc[index_train].values
    X_test = rfv2_features_docked.loc[fold].values
    y_train = pdbbind_training_set_pk.loc[index_train].values.ravel()
    y_test = pdbbind_training_set_pk.loc[fold].values.ravel()
    rf = RandomForestRegressor(n_estimators=500, max_features=14, n_jobs=6, random_state=42) #mtry=14 for rfv2
    rf.fit(X_train, y_train)
    pred = rf.predict(X_test)
    fold_pearsonr.append(stats.pearsonr(y_test, pred)[0])
    fold_mse.append(mean_squared_error(y_test, pred))
rfv2_cv_docked_results = {'pearsonr': np.mean(fold_pearsonr), 'rmse': np.sqrt(np.mean(fold_mse)), 'pearsonr_stdev': np.std(fold_pearsonr)}

print('RFv2 rdk crytsal')
fold_pearsonr = []
fold_mse = []
for fold in folds:
    index_train = pdbbind_training_set.difference(fold)
    X_train = rfv2_rdk_features_crystal.loc[index_train].values
    X_test = rfv2_rdk_features_crystal.loc[fold].values
    y_train = pdbbind_training_set_pk.loc[index_train].values.ravel()
    y_test = pdbbind_training_set_pk.loc[fold].values.ravel()
    rf = RandomForestRegressor(n_estimators=500, max_features=0.33, n_jobs=6, random_state=42) #mtry=14 for rfv2
    rf.fit(X_train, y_train)
    pred = rf.predict(X_test)
    fold_pearsonr.append(stats.pearsonr(y_test, pred)[0])
    fold_mse.append(mean_squared_error(y_test, pred))
rfv2_rdk_cv_crystal_results = {'pearsonr': np.mean(fold_pearsonr), 'rmse': np.sqrt(np.mean(fold_mse)), 'pearsonr_stdev': np.std(fold_pearsonr)}

print('RFv2 rdk docked')
fold_pearsonr = []
fold_mse = []
for fold in folds:
    index_train = pdbbind_training_set.difference(fold)
    X_train = rfv2_rdk_features_docked.loc[index_train].values
    X_test = rfv2_rdk_features_docked.loc[fold].values
    y_train = pdbbind_training_set_pk.loc[index_train].values.ravel()
    y_test = pdbbind_training_set_pk.loc[fold].values.ravel()
    rf = RandomForestRegressor(n_estimators=500, max_features=0.33, n_jobs=6, random_state=42) #mtry=14 for rfv2
    rf.fit(X_train, y_train)
    pred = rf.predict(X_test)
    fold_pearsonr.append(stats.pearsonr(y_test, pred)[0])
    fold_mse.append(mean_squared_error(y_test, pred))
rfv2_rdk_cv_docked_results = {'pearsonr': np.mean(fold_pearsonr), 'rmse': np.sqrt(np.mean(fold_mse)), 'pearsonr_stdev': np.std(fold_pearsonr)}