In [None]:
from oddt.toolkits import ob
from joblib import delayed, Parallel
from functools import partial
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from matplotlib import pyplot as plt
from scipy import stats

import json
import numpy as np

class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that also accepts NumPy arrays and NumPy scalars.

    Arrays are written as (nested) lists and NumPy scalar types such as
    ``np.int64`` or ``np.float32`` as the matching built-in Python value.
    Anything else is delegated to ``json.JSONEncoder``, which raises
    ``TypeError`` for unsupported objects.
    """

    def default(self, obj):
        """Return a JSON-serialisable replacement for ``obj``."""
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            # NumPy scalars (e.g. int64, float32, bool_) are not handled by
            # the base encoder; .item() converts them to plain Python values.
            return obj.item()
        return super().default(obj)

# Root of the local PDBbind v2019 download.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
PDBBIND_DIR = '/home/fergus/pdbbind/v2019'

# Both lists come from the same training-set listing: one whitespace-stripped
# entry per line of the file.
with open('../data/pdbbind_training_set.txt') as f:
    pdbs = list(map(str.strip, f))
with open('../data/pdbbind_training_set.txt') as f:
    train_pdbs = list(map(str.strip, f))
    
%matplotlib inline

plt.style.use('fivethirtyeight')

# Applied after the style sheet so these white backgrounds/edges take precedence.
plt.rcParams.update({
    'axes.facecolor': 'white',
    'axes.edgecolor': 'white',
    'figure.facecolor': 'white',
    'savefig.facecolor': 'white',
})

In [None]:
from oddt.scoring.descriptors import close_contacts_descriptor
# Atomic numbers of the element types counted on the ligand and protein side.
ligand_atomic_nums = [6, 7, 8, 9, 15, 16, 17, 35, 53]
protein_atomic_nums = [6, 7, 8, 16]
# Values handed to the descriptor as `cutoff`: 0, 2, ..., 12.
cutoff = list(range(0, 14, 2))
descriptors_func = close_contacts_descriptor(
    cutoff=cutoff,
    protein_types=protein_atomic_nums,
    ligand_types=ligand_atomic_nums,
)

@delayed
def build(pdb):
    """Compute close-contact features for one complex, crystal and docked pose.

    The protein is read from ``{PDBBIND_DIR}/{pdb}/{pdb}_protein.pdb``, the
    crystal ligand from the matching ``_ligand.sdf`` and the docked ligand
    from ``../pdbbind_docked_poses``. Only the first molecule in each file is
    used.

    Because of ``@delayed``, calling ``build(pdb)`` yields a task for
    ``joblib.Parallel`` rather than running the body immediately.

    Returns a tuple ``(crystal_features, docked_features)``.
    """
    complex_dir = f'{PDBBIND_DIR}/{pdb}'
    receptor = next(ob.readfile('pdb', f'{complex_dir}/{pdb}_protein.pdb'))
    receptor.protein = True

    def first_pose_features(sdf_path):
        # Featurise the first molecule of the SDF against the receptor.
        pose = next(ob.readfile('sdf', sdf_path))
        return descriptors_func.build(pose, receptor)[0]

    features_c = first_pose_features(f'{complex_dir}/{pdb}_ligand.sdf')
    features_d = first_pose_features(f'../pdbbind_docked_poses/{pdb}/{pdb}_ligand_docked.sdf')
    return (features_c, features_d)

In [None]:
# Featurise every training complex with six joblib workers; results come back
# in the same order as `pdbs`.
features = Parallel(n_jobs=6, verbose=10)(build(pdb) for pdb in pdbs)

In [None]:
# Each entry of `features` is a (crystal, docked) pair aligned with `pdbs`;
# split the pairs into two dicts keyed by PDB code.
features_crystal = {pdb: pair[0] for pair, pdb in zip(features, pdbs)}
features_docked = {pdb: pair[1] for pair, pdb in zip(features, pdbs)}

In [None]:
# Write both training feature dicts as JSON (arrays are converted by NumpyEncoder).
for out_path, feature_table in (
    ('../features/pdbbind_training_set_docked_rfv2.json', features_docked),
    ('../features/pdbbind_training_set_crystal_rfv2.json', features_crystal),
):
    with open(out_path, 'w') as f:
        json.dump(feature_table, f, cls=NumpyEncoder)

In [None]:
# One whitespace-stripped entry per line of the test-set listing.
with open('../data/pdbbind_test_set.txt') as f:
    test_pdbs = list(map(str.strip, f))

In [None]:
# Featurise every test complex with six joblib workers; results come back in
# the same order as `test_pdbs`.
test_features = Parallel(n_jobs=6, verbose=10)(build(pdb) for pdb in test_pdbs)

In [None]:
# Each entry of `test_features` is a (crystal, docked) pair aligned with
# `test_pdbs`; split the pairs into two dicts keyed by PDB code.
test_features_crystal = {pdb: pair[0] for pair, pdb in zip(test_features, test_pdbs)}
test_features_docked = {pdb: pair[1] for pair, pdb in zip(test_features, test_pdbs)}

# NOTE(review): these files land in the working directory, whereas the training
# features are written under ../features — confirm the intended location.
for out_path, feature_table in (
    ('pdbbind_test_set_docked_rfv2.json', test_features_docked),
    ('pdbbind_test_set_crystal_rfv2.json', test_features_crystal),
):
    with open(out_path, 'w') as f:
        json.dump(feature_table, f, cls=NumpyEncoder)

In [None]:
# Binding affinities as Series indexed by the first CSV column (no header row).
# `read_csv(squeeze=True)` no longer exists in pandas 2.x; squeezing the
# resulting single-column frame gives the same Series on old and new pandas.
pdbbind_training_set_affinity = pd.read_csv(
    '../data/pdbbind_training_set_binding_affinity.csv', index_col=0, header=None
).squeeze('columns')
pdbbind_test_set_affinity = pd.read_csv(
    '../data/pdbbind_test_set_binding_affinity.csv', index_col=0, header=None
).squeeze('columns')

In [None]:
# Wrap the in-memory docked features in a Series (presumably the dict keyed by
# PDB code built earlier in the notebook).
# NOTE(review): a later cell rebinds features_docked to a DataFrame loaded from
# disk, so this cell looks redundant — confirm before removing.
features_docked = pd.Series(features_docked)

In [None]:
# Reload the cached training features; transposing puts one complex per row.
# NOTE(review): the notebook writes these files under ../features but reads
# them from ../data — confirm which directory is current.
with open('../data/pdbbind_training_set_docked_rfv2.json') as f:
    features_docked = pd.DataFrame(json.load(f)).transpose()
with open('../data/pdbbind_training_set_crystal_rfv2.json') as f:
    features_crystal = pd.DataFrame(json.load(f)).transpose()

In [None]:
# Reload the cached test features; transposing puts one complex per row.
# The crystal file previously pointed at './data/', unlike every other input in
# this notebook; both files are now read from '../data/'.
with open('../data/pdbbind_test_set_crystal_rfv2.json') as f:
    test_features_crystal = pd.DataFrame(json.load(f)).T
with open('../data/pdbbind_test_set_docked_rfv2.json') as f:
    test_features_docked = pd.DataFrame(json.load(f)).T

In [None]:
# Binding affinities as Series indexed by the first CSV column (no header row).
# `read_csv(squeeze=True)` no longer exists in pandas 2.x; squeezing the
# resulting single-column frame gives the same Series on old and new pandas.
pdbbind_training_set_affinity = pd.read_csv(
    '../data/pdbbind_training_set_binding_affinity.csv', index_col=0, header=None
).squeeze('columns')
pdbbind_test_set_affinity = pd.read_csv(
    '../data/pdbbind_test_set_binding_affinity.csv', index_col=0, header=None
).squeeze('columns')

In [None]:
# Identifiers of each split, taken from the index of the affinity tables.
pdbbind_training_set, pdbbind_test_set = (
    pdbbind_training_set_affinity.index,
    pdbbind_test_set_affinity.index,
)

In [None]:
# Clusters per sequence-identity threshold. Each line of a bc-<threshold>.out
# file is one cluster; entries are reduced to their first four characters,
# lower-cased.
# The loop variables are deliberately NOT called `cutoff` / `pdbs`: those names
# hold the descriptor bins and the training list used by the featurisation
# cells, and rebinding them here silently breaks re-running those cells.
blast_clusters = {}
for identity_cutoff in [30, 40, 50, 70, 90, 95, 100]:
    with open(f'../data/bc-{identity_cutoff}.out') as f:
        blast_clusters[identity_cutoff] = [
            set(item[:4].lower() for item in line.strip().split()) for line in f
        ]

# For each threshold: training complexes that share a cluster with any test complex.
test_pdb_codes = set(pdbbind_test_set)
test_set_similar_pdbs = {}
for identity_cutoff, clusters in blast_clusters.items():
    similar_pdbs = set()
    for cluster in clusters:
        # One set operation per cluster instead of scanning every
        # (test complex, cluster) pair.
        if not cluster.isdisjoint(test_pdb_codes):
            similar_pdbs.update(cluster)
    test_set_similar_pdbs[identity_cutoff] = pd.Index(sorted(similar_pdbs)).intersection(pdbbind_training_set)

# 'All' removes nothing, i.e. the full training set is used.
test_set_similar_pdbs['All'] = pd.Index([])

In [None]:
# Pairwise ligand similarities, indexed as tanimoto_similarity[test][train].
with open('../data/pdbbind_ligand_tanimoto_similarity.json') as f:
    tanimoto_similarity = json.load(f)

tc_thresholds = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# For each threshold: training complexes whose similarity to at least one test
# complex reaches the threshold.
test_set_similar_ligands = {}
for t in tc_thresholds:
    similar = {
        pdb_train
        for pdb_train in pdbbind_training_set
        if any(
            tanimoto_similarity[pdb_test][pdb_train] >= t
            for pdb_test in pdbbind_test_set
        )
    }
    test_set_similar_ligands[t] = pd.Index(similar)

# 'All' removes nothing, i.e. the full training set is used.
test_set_similar_ligands['All'] = pd.Index([])

In [None]:
%%time
# Results of the sequence-identity experiment, keyed by threshold:
# per-complex predictions and test-set Pearson r for crystal and docked poses.
predicted_crystal_sequence_identity = {}
predicted_docked_sequence_identity = {}
pearsonr_crystal_sequence_identity = {}
pearsonr_docked_sequence_identity = {}

# The test matrices do not depend on the threshold.
X_test_crystal = test_features_crystal.loc[pdbbind_test_set].values
X_test_docked = test_features_docked.loc[pdbbind_test_set].values
y_test = pdbbind_test_set_affinity.loc[pdbbind_test_set].values.ravel()

rf_params = dict(n_estimators=500, max_features=0.33, random_state=42, n_jobs=8, oob_score=True)

for cutoff, excluded in test_set_similar_pdbs.items():
    print(cutoff)
    # Train only on complexes not flagged as similar to the test set.
    training_set = pdbbind_training_set.difference(excluded)
    y_train = pdbbind_training_set_affinity.loc[training_set].values.ravel()
    X_train_crystal = features_crystal.loc[training_set].values
    X_train_docked = features_docked.loc[training_set].values

    rf_crystal = RandomForestRegressor(**rf_params)
    rf_crystal.fit(X_train_crystal, y_train)
    rf_docked = RandomForestRegressor(**rf_params)
    rf_docked.fit(X_train_docked, y_train)

    predicted_crystal = rf_crystal.predict(X_test_crystal)
    predicted_docked = rf_docked.predict(X_test_docked)

    predicted_crystal_sequence_identity[cutoff] = dict(zip(pdbbind_test_set, predicted_crystal))
    predicted_docked_sequence_identity[cutoff] = dict(zip(pdbbind_test_set, predicted_docked))

    pearsonr_crystal_sequence_identity[cutoff] = stats.pearsonr(y_test, predicted_crystal)[0]
    pearsonr_docked_sequence_identity[cutoff] = stats.pearsonr(y_test, predicted_docked)[0]
In [None]:
%%time
# Results of the ligand-similarity experiment, keyed by Tanimoto threshold:
# per-complex predictions and test-set Pearson r for crystal and docked poses.
predicted_crystal_tanimoto = {}
predicted_docked_tanimoto = {}
pearsonr_crystal_tanimoto = {}
pearsonr_docked_tanimoto = {}

# The test matrices do not depend on the threshold.
X_test_crystal = test_features_crystal.loc[pdbbind_test_set].values
X_test_docked = test_features_docked.loc[pdbbind_test_set].values
y_test = pdbbind_test_set_affinity.loc[pdbbind_test_set].values.ravel()

rf_params = dict(n_estimators=500, max_features=0.33, random_state=42, n_jobs=8, oob_score=True)

for cutoff, excluded in test_set_similar_ligands.items():
    print(cutoff)
    # Train only on complexes not flagged as similar to the test set.
    training_set = pdbbind_training_set.difference(excluded)
    y_train = pdbbind_training_set_affinity.loc[training_set].values.ravel()
    X_train_crystal = features_crystal.loc[training_set].values
    X_train_docked = features_docked.loc[training_set].values

    rf_crystal = RandomForestRegressor(**rf_params)
    rf_crystal.fit(X_train_crystal, y_train)
    rf_docked = RandomForestRegressor(**rf_params)
    rf_docked.fit(X_train_docked, y_train)

    predicted_crystal = rf_crystal.predict(X_test_crystal)
    predicted_docked = rf_docked.predict(X_test_docked)

    predicted_crystal_tanimoto[cutoff] = dict(zip(pdbbind_test_set, predicted_crystal))
    predicted_docked_tanimoto[cutoff] = dict(zip(pdbbind_test_set, predicted_docked))

    pearsonr_crystal_tanimoto[cutoff] = stats.pearsonr(y_test, predicted_crystal)[0]
    pearsonr_docked_tanimoto[cutoff] = stats.pearsonr(y_test, predicted_docked)[0]

In [None]:
# Additional per-complex features that the "+ LB" models below append to the
# close-contact descriptors; the first CSV column becomes the index.
# NOTE(review): read from the working directory, unlike the ../data inputs —
# confirm the file location.
lb_features = pd.read_csv('lb_features.csv', index_col=0)

In [None]:
%%time
# Results of the sequence-identity experiment for the model with the extra
# `lb_features` columns, keyed by threshold.
predicted_crystal_rdk_sequence_identity = {}
predicted_docked_rdk_sequence_identity = {}
pearsonr_crystal_rdk_sequence_identity = {}
pearsonr_docked_rdk_sequence_identity = {}

for cutoff in test_set_similar_pdbs:
    print(cutoff)
    # Train only on complexes not flagged as similar to the test set.
    training_set = pdbbind_training_set.difference(test_set_similar_pdbs[cutoff])
    y_train = pdbbind_training_set_affinity.loc[training_set].values.ravel()
    y_test = pdbbind_test_set_affinity.loc[pdbbind_test_set].values.ravel()
    # add lb features
    # The concatenated frames carry integer labels (descriptors) next to string
    # labels (CSV header); recent scikit-learn refuses such mixed column names,
    # so plain arrays are passed, as in the descriptor-only models.
    X_train_crystal = pd.concat(
        [features_crystal.loc[training_set], lb_features.loc[training_set]], axis='columns'
    ).values
    X_test_crystal = pd.concat(
        [test_features_crystal.loc[pdbbind_test_set], lb_features.loc[pdbbind_test_set]], axis='columns'
    ).values
    X_train_docked = pd.concat(
        [features_docked.loc[training_set], lb_features.loc[training_set]], axis='columns'
    ).values
    X_test_docked = pd.concat(
        [test_features_docked.loc[pdbbind_test_set], lb_features.loc[pdbbind_test_set]], axis='columns'
    ).values
    rf_crystal = RandomForestRegressor(n_estimators=500, max_features=0.33,random_state=42, n_jobs=8, oob_score=True)
    rf_crystal.fit(X_train_crystal, y_train)
    rf_docked = RandomForestRegressor(n_estimators=500, max_features=0.33,random_state=42, n_jobs=8, oob_score=True)
    rf_docked.fit(X_train_docked, y_train)

    predicted_crystal = rf_crystal.predict(X_test_crystal)
    predicted_docked = rf_docked.predict(X_test_docked)

    predicted_crystal_rdk_sequence_identity[cutoff] = {pdb: pred for pdb, pred in zip(pdbbind_test_set, predicted_crystal)}
    predicted_docked_rdk_sequence_identity[cutoff] = {pdb: pred for pdb, pred in zip(pdbbind_test_set, predicted_docked)}

    pearsonr_crystal_rdk_sequence_identity[cutoff] = stats.pearsonr(y_test, predicted_crystal)[0]
    pearsonr_docked_rdk_sequence_identity[cutoff] = stats.pearsonr(y_test, predicted_docked)[0]

In [None]:
%%time
# Results of the ligand-similarity experiment for the model with the extra
# `lb_features` columns, keyed by Tanimoto threshold.
predicted_crystal_rdk_tanimoto = {}
predicted_docked_rdk_tanimoto = {}
pearsonr_crystal_rdk_tanimoto = {}
pearsonr_docked_rdk_tanimoto = {}

for cutoff in test_set_similar_ligands:
    print(cutoff)
    # Train only on complexes not flagged as similar to the test set.
    training_set = pdbbind_training_set.difference(test_set_similar_ligands[cutoff])
    y_train = pdbbind_training_set_affinity.loc[training_set].values.ravel()
    y_test = pdbbind_test_set_affinity.loc[pdbbind_test_set].values.ravel()
    # add lb features
    # The concatenated frames carry integer labels (descriptors) next to string
    # labels (CSV header); recent scikit-learn refuses such mixed column names,
    # so plain arrays are passed, as in the descriptor-only models.
    X_train_crystal = pd.concat(
        [features_crystal.loc[training_set], lb_features.loc[training_set]], axis='columns'
    ).values
    X_test_crystal = pd.concat(
        [test_features_crystal.loc[pdbbind_test_set], lb_features.loc[pdbbind_test_set]], axis='columns'
    ).values
    X_train_docked = pd.concat(
        [features_docked.loc[training_set], lb_features.loc[training_set]], axis='columns'
    ).values
    X_test_docked = pd.concat(
        [test_features_docked.loc[pdbbind_test_set], lb_features.loc[pdbbind_test_set]], axis='columns'
    ).values
    rf_crystal = RandomForestRegressor(n_estimators=500, max_features=0.33,random_state=42, n_jobs=8, oob_score=True)
    rf_crystal.fit(X_train_crystal, y_train)
    rf_docked = RandomForestRegressor(n_estimators=500, max_features=0.33,random_state=42, n_jobs=8, oob_score=True)
    rf_docked.fit(X_train_docked, y_train)

    predicted_crystal = rf_crystal.predict(X_test_crystal)
    predicted_docked = rf_docked.predict(X_test_docked)

    predicted_crystal_rdk_tanimoto[cutoff] = {pdb: pred for pdb, pred in zip(pdbbind_test_set, predicted_crystal)}
    predicted_docked_rdk_tanimoto[cutoff] = {pdb: pred for pdb, pred in zip(pdbbind_test_set, predicted_docked)}

    pearsonr_crystal_rdk_tanimoto[cutoff] = stats.pearsonr(y_test, predicted_crystal)[0]
    pearsonr_docked_rdk_tanimoto[cutoff] = stats.pearsonr(y_test, predicted_docked)[0]

In [None]:
# Persist the sequence-identity results (predictions and Pearson r) as JSON.
for results_path, results in (
    ('../results/RFv2_predicted_crystal_sequence_identity.json', predicted_crystal_sequence_identity),
    ('../results/RFv2_predicted_docked_sequence_identity.json', predicted_docked_sequence_identity),
    ('../results/RFv2_pearsonr_crystal_sequence_identity.json', pearsonr_crystal_sequence_identity),
    ('../results/RFv2_pearsonr_docked_sequence_identity.json', pearsonr_docked_sequence_identity),
):
    with open(results_path, 'w') as f:
        json.dump(results, f, cls=NumpyEncoder)

In [None]:
# Persist the ligand-similarity results (predictions and Pearson r) as JSON.
for results_path, results in (
    ('../results/RFv2_predicted_crystal_tanimoto.json', predicted_crystal_tanimoto),
    ('../results/RFv2_predicted_docked_tanimoto.json', predicted_docked_tanimoto),
    ('../results/RFv2_pearsonr_crystal_tanimoto.json', pearsonr_crystal_tanimoto),
    ('../results/RFv2_pearsonr_docked_tanimoto.json', pearsonr_docked_tanimoto),
):
    with open(results_path, 'w') as f:
        json.dump(results, f, cls=NumpyEncoder)

In [None]:
# Persist the sequence-identity results of the model with extra features as JSON.
for results_path, results in (
    ('../results/RFv2_RDK_predicted_crystal_sequence_identity.json', predicted_crystal_rdk_sequence_identity),
    ('../results/RFv2_RDK_predicted_docked_sequence_identity.json', predicted_docked_rdk_sequence_identity),
    ('../results/RFv2_RDK_pearsonr_crystal_sequence_identity.json', pearsonr_crystal_rdk_sequence_identity),
    ('../results/RFv2_RDK_pearsonr_docked_sequence_identity.json', pearsonr_docked_rdk_sequence_identity),
):
    with open(results_path, 'w') as f:
        json.dump(results, f, cls=NumpyEncoder)

In [None]:
# Persist the ligand-similarity results of the model with extra features as JSON.
for results_path, results in (
    ('../results/RFv2_RDK_predicted_crystal_tanimoto.json', predicted_crystal_rdk_tanimoto),
    ('../results/RFv2_RDK_predicted_docked_tanimoto.json', predicted_docked_rdk_tanimoto),
    ('../results/RFv2_RDK_pearsonr_crystal_tanimoto.json', pearsonr_crystal_rdk_tanimoto),
    ('../results/RFv2_RDK_pearsonr_docked_tanimoto.json', pearsonr_docked_rdk_tanimoto),
):
    with open(results_path, 'w') as f:
        json.dump(results, f, cls=NumpyEncoder)

In [None]:
# Reload the sequence-identity results. After the JSON round trip the
# threshold keys are strings (e.g. '30'), not the original ints.
loaded_results = []
for results_path in (
    '../results/RFv2_predicted_crystal_sequence_identity.json',
    '../results/RFv2_predicted_docked_sequence_identity.json',
    '../results/RFv2_pearsonr_crystal_sequence_identity.json',
    '../results/RFv2_pearsonr_docked_sequence_identity.json',
):
    with open(results_path) as f:
        loaded_results.append(json.load(f))

(predicted_crystal_sequence_identity,
 predicted_docked_sequence_identity,
 pearsonr_crystal_sequence_identity,
 pearsonr_docked_sequence_identity) = loaded_results

In [None]:
# Reload the ligand-similarity results. After the JSON round trip the
# threshold keys are strings (e.g. '0.3'), not the original floats.
loaded_results = []
for results_path in (
    '../results/RFv2_predicted_crystal_tanimoto.json',
    '../results/RFv2_predicted_docked_tanimoto.json',
    '../results/RFv2_pearsonr_crystal_tanimoto.json',
    '../results/RFv2_pearsonr_docked_tanimoto.json',
):
    with open(results_path) as f:
        loaded_results.append(json.load(f))

(predicted_crystal_tanimoto,
 predicted_docked_tanimoto,
 pearsonr_crystal_tanimoto,
 pearsonr_docked_tanimoto) = loaded_results

In [None]:
# Reload the sequence-identity results of the model with extra features.
# After the JSON round trip the threshold keys are strings, not ints.
loaded_results = []
for results_path in (
    '../results/RFv2_RDK_predicted_crystal_sequence_identity.json',
    '../results/RFv2_RDK_predicted_docked_sequence_identity.json',
    '../results/RFv2_RDK_pearsonr_crystal_sequence_identity.json',
    '../results/RFv2_RDK_pearsonr_docked_sequence_identity.json',
):
    with open(results_path) as f:
        loaded_results.append(json.load(f))

(predicted_crystal_rdk_sequence_identity,
 predicted_docked_rdk_sequence_identity,
 pearsonr_crystal_rdk_sequence_identity,
 pearsonr_docked_rdk_sequence_identity) = loaded_results

In [None]:
# Reload the ligand-similarity results of the model with extra features.
# After the JSON round trip the threshold keys are strings, not floats.
loaded_results = []
for results_path in (
    '../results/RFv2_RDK_predicted_crystal_tanimoto.json',
    '../results/RFv2_RDK_predicted_docked_tanimoto.json',
    '../results/RFv2_RDK_pearsonr_crystal_tanimoto.json',
    '../results/RFv2_RDK_pearsonr_docked_tanimoto.json',
):
    with open(results_path) as f:
        loaded_results.append(json.load(f))

(predicted_crystal_rdk_tanimoto,
 predicted_docked_rdk_tanimoto,
 pearsonr_crystal_rdk_tanimoto,
 pearsonr_docked_rdk_tanimoto) = loaded_results

In [None]:
import statistical_tests as st

# Bootstrap and permutation-test outputs per sequence-identity threshold.
pearsonr_ci_crystal_si, pearsonr_ci_docked_si = {}, {}
pearsonr_pval_crystal_si, pearsonr_pval_docked_si = {}, {}

# Measured affinities in test-set order; predictions are re-aligned to the
# same order before every comparison.
y_test = pdbbind_test_set_affinity.loc[pdbbind_test_set]
for cutoff, crystal_predictions in predicted_crystal_sequence_identity.items():
    # crystal pose
    y_pred = pd.Series(crystal_predictions).loc[pdbbind_test_set]
    pearsonr_ci_crystal_si[cutoff] = st.bootstrap_pearsonr(y_test, y_pred)
    pearsonr_pval_crystal_si[cutoff] = st.permutation_pearsonr(y_test, y_pred)
    # docked pose
    y_pred = pd.Series(predicted_docked_sequence_identity[cutoff]).loc[pdbbind_test_set]
    pearsonr_ci_docked_si[cutoff] = st.bootstrap_pearsonr(y_test, y_pred)
    pearsonr_pval_docked_si[cutoff] = st.permutation_pearsonr(y_test, y_pred)
    
# Bootstrap and permutation-test outputs per Tanimoto threshold.
pearsonr_ci_crystal_tanimoto, pearsonr_ci_docked_tanimoto = {}, {}
pearsonr_pval_crystal_tanimoto, pearsonr_pval_docked_tanimoto = {}, {}

# Measured affinities in test-set order; predictions are re-aligned to the
# same order before every comparison.
y_test = pdbbind_test_set_affinity.loc[pdbbind_test_set]
for cutoff, crystal_predictions in predicted_crystal_tanimoto.items():
    # crystal pose
    y_pred = pd.Series(crystal_predictions).loc[pdbbind_test_set]
    pearsonr_ci_crystal_tanimoto[cutoff] = st.bootstrap_pearsonr(y_test, y_pred)
    pearsonr_pval_crystal_tanimoto[cutoff] = st.permutation_pearsonr(y_test, y_pred)
    # docked pose
    y_pred = pd.Series(predicted_docked_tanimoto[cutoff]).loc[pdbbind_test_set]
    pearsonr_ci_docked_tanimoto[cutoff] = st.bootstrap_pearsonr(y_test, y_pred)
    pearsonr_pval_docked_tanimoto[cutoff] = st.permutation_pearsonr(y_test, y_pred)

In [None]:
# Bootstrap and permutation-test outputs per sequence-identity threshold for
# the model with extra features.
pearsonr_ci_crystal_rdk_si, pearsonr_ci_docked_rdk_si = {}, {}
pearsonr_pval_crystal_rdk_si, pearsonr_pval_docked_rdk_si = {}, {}

# Measured affinities in test-set order; predictions are re-aligned to the
# same order before every comparison.
y_test = pdbbind_test_set_affinity.loc[pdbbind_test_set]
for cutoff, crystal_predictions in predicted_crystal_rdk_sequence_identity.items():
    # crystal pose
    y_pred = pd.Series(crystal_predictions).loc[pdbbind_test_set]
    pearsonr_ci_crystal_rdk_si[cutoff] = st.bootstrap_pearsonr(y_test, y_pred)
    pearsonr_pval_crystal_rdk_si[cutoff] = st.permutation_pearsonr(y_test, y_pred)
    # docked pose
    y_pred = pd.Series(predicted_docked_rdk_sequence_identity[cutoff]).loc[pdbbind_test_set]
    pearsonr_ci_docked_rdk_si[cutoff] = st.bootstrap_pearsonr(y_test, y_pred)
    pearsonr_pval_docked_rdk_si[cutoff] = st.permutation_pearsonr(y_test, y_pred)
    
# Bootstrap and permutation-test outputs per Tanimoto threshold for the model
# with extra features.
pearsonr_ci_crystal_rdk_tanimoto, pearsonr_ci_docked_rdk_tanimoto = {}, {}
pearsonr_pval_crystal_rdk_tanimoto, pearsonr_pval_docked_rdk_tanimoto = {}, {}

# Measured affinities in test-set order; predictions are re-aligned to the
# same order before every comparison.
y_test = pdbbind_test_set_affinity.loc[pdbbind_test_set]
for cutoff, crystal_predictions in predicted_crystal_rdk_tanimoto.items():
    # crystal pose
    y_pred = pd.Series(crystal_predictions).loc[pdbbind_test_set]
    pearsonr_ci_crystal_rdk_tanimoto[cutoff] = st.bootstrap_pearsonr(y_test, y_pred)
    pearsonr_pval_crystal_rdk_tanimoto[cutoff] = st.permutation_pearsonr(y_test, y_pred)
    # docked pose
    y_pred = pd.Series(predicted_docked_rdk_tanimoto[cutoff]).loc[pdbbind_test_set]
    pearsonr_ci_docked_rdk_tanimoto[cutoff] = st.bootstrap_pearsonr(y_test, y_pred)
    pearsonr_pval_docked_rdk_tanimoto[cutoff] = st.permutation_pearsonr(y_test, y_pred)

In [None]:
# One Series per curve in the figure: element [0] of each bootstrap result,
# indexed by threshold.
# NOTE(review): assumes st.bootstrap_pearsonr returns the coefficient first — confirm.
# Every Series is built from its own dict's items; the "rdk" ones previously
# borrowed their keys from the non-rdk dicts.
pearsonr_crystal_si = pd.Series({c: ci[0] for c, ci in pearsonr_ci_crystal_si.items()})
pearsonr_docked_si = pd.Series({c: ci[0] for c, ci in pearsonr_ci_docked_si.items()})
pearsonr_crystal_rdk_si = pd.Series({c: ci[0] for c, ci in pearsonr_ci_crystal_rdk_si.items()})
pearsonr_docked_rdk_si = pd.Series({c: ci[0] for c, ci in pearsonr_ci_docked_rdk_si.items()})

pearsonr_crystal_tc = pd.Series({c: ci[0] for c, ci in pearsonr_ci_crystal_tanimoto.items()})
pearsonr_docked_tc = pd.Series({c: ci[0] for c, ci in pearsonr_ci_docked_tanimoto.items()})
pearsonr_crystal_rdk_tc = pd.Series({c: ci[0] for c, ci in pearsonr_ci_crystal_rdk_tanimoto.items()})
pearsonr_docked_rdk_tc = pd.Series({c: ci[0] for c, ci in pearsonr_ci_docked_rdk_tanimoto.items()})

# Two panels sharing the y axis: left = sequence-identity thresholds,
# right = Tanimoto thresholds.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6), sharey=True)

#colours = ['#008fd5', '#fc4f30', '#e5ae38']
colours = ['#fc4f30', '#e5ae38']

# (series, axis, legend label, colour, extra line options), in drawing order.
# One colour per model; docked-pose curves are dotted.
curves = [
    (pearsonr_crystal_si, ax1, 'RF-Score v2 (crystal pose)', colours[0], {}),
    (pearsonr_docked_si, ax1, 'RF-Score v2 (docked pose)', colours[0], {'linestyle': ':'}),
    (pearsonr_crystal_rdk_si, ax1, 'RF-Score v2 + LB (crystal pose)', colours[1], {}),
    (pearsonr_docked_rdk_si, ax1, 'RF-Score v2 + LB (docked pose)', colours[1], {'linestyle': ':'}),
    (pearsonr_crystal_tc, ax2, 'RF-Score v2 (crystal pose)', colours[0], {}),
    (pearsonr_docked_tc, ax2, 'RF-Score v2 (docked pose)', colours[0], {'linestyle': ':'}),
    (pearsonr_crystal_rdk_tc, ax2, 'RF-Score v2 + LB (crystal pose)', colours[1], {}),
    (pearsonr_docked_rdk_tc, ax2, 'RF-Score v2 + LB (docked pose)', colours[1], {'linestyle': ':'}),
]
for series, axis, label, colour, line_options in curves:
    series.plot(ax=axis, label=label, alpha=0.7, color=colour, **line_options)

ax1.set_ylim(0.49, 0.86)
for axis, xlabel in (
    (ax1, 'Protein sequence identity threshold'),
    (ax2, 'Ligand Tanimoto similarity threshold'),
):
    axis.set_xlabel(xlabel)
    axis.set_ylabel('Pearson correlation coefficient')

#ax1.axhline(y=pearsonr_vina_docked, label='Vina (docked pose)', color='k', linestyle=':')
#ax2.axhline(y=pearsonr_vina_docked, label='Vina (docked pose)', color='k', linestyle=':')

# Panel letters placed above the top-left corner of each axes.
for panel_axis, panel_letter in ((ax1, 'A'), (ax2, 'B')):
    panel_axis.text(-0.1, 1.1, panel_letter, transform=panel_axis.transAxes, size=20, weight='bold')
fig.tight_layout()
# A single legend, anchored relative to the left axes, serves both panels.
ax1.legend(title='Model', bbox_to_anchor=(1.8, 1.45), ncol=2)

#fig.savefig('../figures/CASF_pearsonr_crystal_dock.png', dpi=350, bbox_inches='tight')

In [None]:
from oddt.scoring.descriptors import close_contacts_descriptor
# Atomic numbers of the element types counted on the ligand and protein side.
ligand_atomic_nums = [6, 7, 8, 9, 15, 16, 17, 35, 53]
protein_atomic_nums = [6, 7, 8, 16]
# Values handed to the descriptor as `cutoff`: 0, 2, ..., 12.
cutoff = list(range(0, 14, 2))
descriptors_func = close_contacts_descriptor(
    cutoff=cutoff,
    protein_types=protein_atomic_nums,
    ligand_types=ligand_atomic_nums,
)

@delayed
def get_rfv2(protein, ligand):
    """Compute close-contact features for one ligand against ``protein``.

    Because of ``@delayed``, calling ``get_rfv2(protein, ligand)`` yields a
    task for ``joblib.Parallel`` rather than running the body immediately.

    Returns the first (only) row produced by ``descriptors_func.build``.
    """
    # The descriptor takes the ligand first and the protein second, exactly as
    # in `build` earlier in this notebook; passing them the other way round
    # featurises the protein as if it were the ligand.
    features = descriptors_func.build(ligand, protein)[0]
    return features

In [None]:
targets = ['AKT1', 'CP3A4', 'GCR', 'HIVPR', 'HIVRT', 'KIF11']

# Features for each target's docked molecules: {target: {molecule title: features}}.
chembl_rfv2 = {}
for target in targets:
    protein = next(ob.readfile('pdb', f'../dude_chembl/dude_data/{target.lower()}/receptor.pdb'))
    protein.protein = True
    ligand_file = f'../dude_chembl/{target}_docked.sdf'

    # Keep only the first record seen for each molecule title.
    ligands = {}
    for mol in ob.readfile('sdf', ligand_file):
        ligands.setdefault(mol.OBMol.GetTitle(), mol)

    results = Parallel(n_jobs=8, verbose=10)(
        get_rfv2(protein, ligand) for ligand in ligands.values()
    )
    chembl_rfv2[target] = dict(zip(ligands, results))

In [None]:
# Persist the per-target feature dict as JSON (arrays are converted by NumpyEncoder).
with open('../data/chembl_rfscorev2_features.json', mode='w') as f:
    json.dump(obj=chembl_rfv2, fp=f, cls=NumpyEncoder)