In this iPython notebook, we will featurize MOR ligand binding simulation by pairwise distances between the ligand and different receptor residues. We will then perform tICA and prospectively build an MSM. 

In [1]:
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

# changing matplotlib the default style
matplotlib.style.use('ggplot')
#matplotlib.rcParams["figure.facecolor"] = "white"
#matplotlib.rcPar|ams["savefig.transparent"] = "True"


In [2]:
import pandas as pd


from PDB_Order_Fixer import PDB_Order_Fixer
import mdtraj as md
import os
import numpy as np
import h5py

import datetime
import glob
import copy
from functools import partial 
import operator
import time

import random 
import subprocess
from subprocess import Popen
import sys
from custom_clusterer import *
from custom_tica import *
from custom_featurizer import *
from pdb_editing import *
from analysis import *
from io_functions import *
#from topology_fixing import *
from subsampling import *
from conversions import *
from custom_msm import *
from grids import *
from docking_analysis import *

from scipy import stats
import os
from efficacy_scripts import *




because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



In [3]:
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [4]:
from sklearn.preprocessing import scale

In [5]:
from detect_intermediates import *
from interpret_tICs import *

In [6]:
from msmbuilder.utils import verbosedump, verboseload


In [7]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.preprocessing import scale
from random import shuffle
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc

In [8]:
def normalize_per_ligand(df):
    new_df = copy.deepcopy(df)
    n_df = new_df.ix[:, [i for i in range(0, new_df.shape[1]) if "difference" not in new_df.columns.values[i].lower()]]
    n_df = pd.DataFrame(preprocessing.scale(n_df.values, axis=1), index=n_df.index, columns=n_df.columns.values.tolist())
    new_df = pd.concat([new_df, n_df], axis=1)
    return(new_df)

In [9]:
def create_named_df(df, worker_pool):
    smiles_list, compound_names, pc_smiles, pc_names, pc_ids = convert_sdfs_to_compounds(["/home/enf/htbc/sdfs/%s.sdf" %f for f in df.index.values], parallel=True, worker_pool=worker_pool)
    smiles_list = [s.strip() for s in smiles_list]
    pc_smiles = [s.strip() for s in pc_smiles]
    new_df = pd.DataFrame(list(zip(smiles_list, compound_names, pc_smiles, pc_names, pc_ids)), index=df.index, columns=["smiles", "pubchem_name", "pubchem_smiles", "alternate_name", "pubchem_ID"])
    return(new_df)

In [10]:
import multiprocessing as mp
mp.cpu_count()

12

In [11]:
from ipyparallel import Client
rc = Client()
print(len(rc.ids))
dview = rc[:]
dview.map(os.chdir, ['/home/enf/b2ar_analysis/conformation']*len(rc.ids))

110


<AsyncMapResult: chdir>

In [12]:
precision = "SP"
docking_dir = "/home/enf/htbc/b2ar/docking_SP_32-stereoisomer_6-ring-conf"
analysis_dir = "%s/analysis" %docking_dir
if not os.path.exists(analysis_dir):
    os.makedirs(analysis_dir)

In [13]:
%load_ext autoreload
%autoreload 2

In [14]:
import analysis
from importlib import reload
reload(analysis)
from analysis import *


precision = "SP"
docking_dir = "/home/enf/htbc/b2ar/docking_SP_32-stereoisomer_6-ring-conf"
analysis_dir = "%s/analysis" %docking_dir
if not os.path.exists(analysis_dir):
    os.makedirs(analysis_dir)

docking_df, poses_df = analyze_docking_results_multiple(docking_dir, precision, "%s/summary.pkl" %docking_dir,
                                                        poses_summary=None, redo=False, reread=False,
write_to_disk=True, worker_pool=None, parallel=True)

#docking_df, poses_df = analyze_docking_results_in_dir(docking_dir, ligands_dir, write_to_disk=True, redo=True)
#docking_df[docking_df.columns] = np.nan_to_num(docking_df[docking_df.columns].values)
docking_df = docking_df.fillna(0.)
zeros_per_row = (docking_df == 0).sum(axis=1)
docking_df = docking_df.loc[zeros_per_row < 5]
#where_zeros = np.where(docking_df.values == 0)
#for row_index in where

#keep_cols = []
#for drug in docking_df.index:
#    if np.where(docking_df.loc[column].values == 0).shape[0] < 3:
#        keep_cols.append(col)
#docking_df = docking_df[keep_cols]

full_docking_df = copy.deepcopy(docking_df)
#full_docking_df = pd.concat([ref_df, docking_df[[c for c in docking_df.columns.values.tolist() if "cluster" in c]]], axis=1)
full_docking_df = full_docking_df[[n for n in full_docking_df.columns if "grid" not in n.lower()]]
full_docking_df[full_docking_df.columns] = np.nan_to_num(full_docking_df[full_docking_df.columns].values)
new_names =  [n.replace("cluster", "State ").replace("_sample0", "") for n in full_docking_df.columns.values.tolist()]
new_names = ["Inactive Crystal", "Active Crystal"] + new_names[2:]
full_docking_df.columns = new_names
full_docking_df.index = [n.lower().strip().replace(" ", "_").replace("β", "beta").replace("α", "alpha").replace("Α", "alpha") for n in full_docking_df.index.values]
full_docking_df = full_docking_df.groupby(full_docking_df.index).first()

In [15]:
docking_df.iloc[:5]

Unnamed: 0,2RH1_prepped,2rh1_grid,3P0G_pymol_prepped_new,3p0g_grid,cluster0_sample0,cluster10_sample0,cluster11_sample0,cluster12_sample0,cluster13_sample0,cluster14_sample0,...,cluster23_sample0,cluster24_sample0,cluster2_sample0,cluster3_sample0,cluster4_sample0,cluster5_sample0,cluster6_sample0,cluster7_sample0,cluster8_sample0,cluster9_sample0
CID_118710935,6.49,0.0,8.49,0.0,9.19,6.61,6.79,8.95,8.71,9.13,...,7.18,7.12,8.74,9.37,8.63,8.48,7.97,8.72,9.82,7.32
CID_9859211,6.39,6.64,7.09,6.92,7.2,5.11,6.19,6.09,6.64,5.64,...,6.16,6.05,6.5,5.97,6.66,5.72,6.81,6.79,6.75,6.24
CID_3406,5.22,4.85,6.11,4.93,5.09,5.44,5.11,5.08,6.56,4.95,...,4.88,4.87,5.86,4.63,5.32,4.81,6.38,5.45,5.31,5.3
CHEMBL361505,9.2,0.0,6.63,0.0,9.33,8.54,8.42,9.26,8.74,6.99,...,9.07,7.84,8.19,9.39,8.84,6.92,10.46,8.95,9.56,8.04
CID_68911,4.97,4.58,3.75,4.19,4.29,4.92,4.92,5.13,6.37,4.81,...,5.81,4.68,3.2,5.39,5.2,5.71,5.45,6.38,6.41,4.88


In [17]:
full_docking_df = copy.deepcopy(docking_df)
#full_docking_df = pd.concat([ref_df, docking_df[[c for c in docking_df.columns.values.tolist() if "cluster" in c]]], axis=1)
full_docking_df = full_docking_df[[n for n in full_docking_df.columns if "grid" not in n.lower()]]
full_docking_df[full_docking_df.columns] = np.nan_to_num(full_docking_df[full_docking_df.columns].values)
new_names =  [n.replace("cluster", "State ").replace("_sample0", "") for n in full_docking_df.columns.values.tolist()]
new_names = ["Inactive Crystal", "Active Crystal"] + new_names[2:]
full_docking_df.columns = new_names
full_docking_df.index = [n.lower().strip().replace(" ", "_").replace("β", "beta").replace("α", "alpha").replace("Α", "alpha") for n in full_docking_df.index.values]
full_docking_df = full_docking_df.groupby(full_docking_df.index).first()

In [18]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 10000)

In [40]:
def add_difference_columns(df):
    new_df = copy.deepcopy(df)
    for i, col_i in enumerate(df.columns):
        print(i)
        for j, col_j in enumerate(df.columns):
           if j <= i: continue
    ##        if "crystal" in col_j.lower() or "crystal" in col_i.lower(): continue
           new_df["%s-%s" %(col_j, col_i)] = new_df[col_j].subtract(new_df[col_i])
    #for i, col_i in enumerate(df.columns):
    #    if col_i != "Active Crystal":
    #        new_df["Active Crystal-%s" %col_i] = new_df["Active Crystal"].subtract(new_df[col_i])
    return(new_df)

In [19]:
agonist_model_dir = "/home/enf/md_simulations/MOR/h8_reimaged/sparse-tICA_t50_n_components2all_residues_4dkl_5c1m_under_cutoff6A-CA-py3-far_2_regularization_wolf_autoShrinkage0pt01-backup/analysis_n_clusters25_random/agonism_refined-wikipedia_SP_1000trees.pkl"
with open(agonist_model_dir, "rb") as f:
    agonist_model = pickle.load(f)

In [20]:
X_df = normalize_per_ligand(full_docking_df)
C_df = X_df[[c for c in X_df.columns if "state" not in c.lower()]]

In [25]:
agonism_test_df = pd.DataFrame(agonist_model["MSM Docking"][0].predict_proba(X_df), index=X_df.index, columns=["antagonist", "agonist"])
agonism_test_df.sort("agonist", ascending=False, inplace=True)

In [22]:
agonism_test_df = pd.DataFrame(agonist_model["Crystal Structures"][0].predict_proba(C_df), index=C_df.index, columns=["antagonist", "agonist"])
agonism_test_df.sort("agonist", ascending=False, inplace=True)

ValueError: Number of features of the model must  match the input. Model n_features is 3 and  input n_features is 8 

In [26]:
agonism_test_df.iloc[:10]

Unnamed: 0,antagonist,agonist
_49729,0.001,0.999
_63869,0.001,0.999
_49262,0.002,0.998
_34540,0.002,0.998
_99912,0.002,0.998
_7800,0.002,0.998
_115411,0.002,0.998
_48133,0.002,0.998
_53682,0.002,0.998
_47132,0.002,0.998


In [27]:
from efficacy_scripts import *
keep_top = 100
smiles_list, compound_names, pc_smiles, pc_names, pc_ids = convert_sdfs_to_compounds(["/home/enf/htbc/sdfs/%s.sdf" %f for f in agonism_test_df.iloc[:keep_top].index.values], parallel=True, worker_pool=dview)
smiles_list = [s.strip() for s in smiles_list]
pc_smiles = [s.strip() for s in pc_smiles]
agonism_names_df = pd.DataFrame(list(zip(smiles_list, compound_names, pc_smiles, pc_names, pc_ids)), index=agonism_test_df.iloc[:keep_top].index, columns=["smiles", "pubchem_name", "pubchem_smiles", "alternate_name", "pubchem_ID"])

Getting SMILES from SDFs...
Done. Now getting compound names from SMILES...
Done. returning compound names.


In [28]:
agonism_names_df.iloc[:10]

Unnamed: 0,smiles,pubchem_name,pubchem_smiles,alternate_name,pubchem_ID
_49729,S1CCN(CC1)C1CCN(CC1)Cc1ccccc1,Ambcb5455818,C1CN(CCC1N2CCSCC2)CC3=CC=CC=C3,Ambcb5455818,30982700
_63869,O=C(C1C(C1(C)C)C=C(Cl)Cl)NCCc1ccccc1,AG-690/10514039,CC1(C(C1C(=O)NCCC2=CC=CC=C2)C=C(Cl)Cl)C,AG-690/10514039,2851986
_49262,CCN(CCN(C)C)CCC(c1ccccc1)C,Ambcb5431960,CCN(CCC(C)C1=CC=CC=C1)CCN(C)C,Ambcb5431960,2846211
_34540,CCOc1cc(ccc1OC)CN1CCN(CC1)c1ccccc1OC,AC1LLIFE,CCOC1=C(C=CC(=C1)CN2CCN(CC2)C3=CC=CC=C3OC)OC,AC1LLIFE,1067955
_99912,CCOC(=O)c1[nH]c(c(c1C)C(=O)NC1CCCC(C1C)C)C,MolPort-007-720-339,CCOC(=O)C1=C(C(=C(N1)C)C(=O)NC2CCCC(C2C)C)C,MolPort-007-720-339,22515588
_7800,N#CSCC(=O)N1c2ccccc2C(CC1(C)C)(C)c1ccccc1,"2-oxo-2-(2,2,4-trimethyl-4-phenyl-3,4-dihydroquinolin-1(2H)-yl)ethyl thiocyanate",CC1(CC(C2=CC=CC=C2N1C(=O)CSC#N)(C)C3=CC=CC=C3)C,"2-oxo-2-(2,2,4-trimethyl-4-phenyl-3,4-dihydroquinolin-1(2H)-yl)ethyl thiocyanate",2886786
_115411,c1ccc(cc1)CN1CCC(CC1)n1cccc1,1-benzyl-4-(1H-pyrrol-1-yl)piperidine,C1CN(CCC1N2C=CC=C2)CC3=CC=CC=C3,1-benzyl-4-(1H-pyrrol-1-yl)piperidine,2779264
_48133,OCCN1CCN(CC1)CCC1=C(C)CCCC1(C)C,AC1O5D3Q,CC1=C(C(CCC1)(C)C)CCN2CCN(CC2)CCO,AC1O5D3Q,6456230
_53682,OCCN(Cc1ccccc1)CCC(c1ccccc1)C,Ambcb5421554,CC(CCN(CCO)CC1=CC=CC=C1)C2=CC=CC=C2,Ambcb5421554,45927593
_47132,CN(Cc1cccc(c1Cl)Cl)CCc1ccccn1,Ambcb5345874,CN(CCC1=CC=CC=N1)CC2=C(C(=CC=C2)Cl)Cl,Ambcb5345874,23826443


In [21]:
analysis_dir = "/home/enf/htbc/b2ar/docking_SP_32-stereoisomer_6-ring-conf/analysis"
affinity_model_dir = "%s/b2ar_affinity_model_1pt0_cutoff_rfr.pkl" %analysis_dir
with open(affinity_model_dir, "rb") as f:
    affinity_model = pickle.load(f)

In [22]:
X_df = full_docking_df.copy()
C_df = X_df[[c for c in X_df.columns if "state" not in c.lower()]]

In [27]:
X_df = full_docking_df.copy()
C_df = X_df[[c for c in X_df.columns if "state" not in c.lower()]]

b2ar_affinity_test_df = pd.DataFrame(affinity_model["MSM Docking"][0].predict_proba(X_df), index=X_df.index, columns=["non-binder", "binder"])
b2ar_affinity_test_df.sort("binder", ascending=False, inplace=True)

In [35]:
affinity_test_df = pd.DataFrame(affinity_model["Crystal Structures"][0].predict_proba(C_df), index=C_df.index, columns=["non-binder", "binder"])
affinity_test_df.sort("binder", ascending=False, inplace=True)

In [33]:
affinity_names_df = create_named_df(affinity_test_df.iloc[:100], dview)

Getting SMILES from SDFs...
Done. Now getting compound names from SMILES...
Done. returning compound names.


In [106]:
affinity_names_df

Unnamed: 0,smiles,pubchem_name,pubchem_smiles,alternate_name,pubchem_ID
_116572,CNCc1cc(ccc1N1CCOCC1)C(F)(F)F,886851-52-1,CNCC1=C(C=CC(=C1)C(F)(F)F)N2CCOCC2,886851-52-1,18525888.0
_76651,OCCC1CNCCN1Cc1ccc(cc1)OCC,2-[1-(4-ethoxybenzyl)-2-piperazinyl]ethanol,CCOC1=CC=C(C=C1)CN2CCNCC2CCO,2-[1-(4-ethoxybenzyl)-2-piperazinyl]ethanol,2989709.0
_48420,CC1CCC(C1)N1CCN(CC1)c1ccc(cc1)Cl,AC1O5D6K,CC1CCC(C1)N2CCN(CC2)C3=CC=C(C=C3)Cl,AC1O5D6K,6456264.0
_33051,NCCC1(CCOC(C1)(C)C)c1ccc(cc1)F,"2-[4-(4-Fluoro-phenyl)-2,2-dimethyl-tetrahydro-pyran-4-yl]-ethylamine",CC1(CC(CCO1)(CCN)C2=CC=C(C=C2)F)C,"2-[4-(4-Fluoro-phenyl)-2,2-dimethyl-tetrahydro-pyran-4-yl]-ethylamine",2835263.0
_71678,COc1ccc(cc1)C1OC(=NN1C(=O)C)c1ccc(cc1)Cl,AKOS003363717,CC(=O)N1C(OC(=N1)C2=CC=C(C=C2)Cl)C3=CC=C(C=C3)OC,AKOS003363717,17380573.0
_14,COc1cc(CN2CCNCC2)cc2c1OCO2,"1-[(7-methoxy-1,3-benzodioxol-5-yl)methyl]piperazine",COC1=CC(=CC2=C1OCO2)CN3CCNCC3,"1-[(7-methoxy-1,3-benzodioxol-5-yl)methyl]piperazine",3128130.0
_112636,COc1cc(OC)ccc1C1C=C(Nc2n1nnn2)c1ccc(cc1)C,AC1NXUBE,CC1=CC=C(C=C1)C2=CC(N3C(=N2)N=NN3)C4=C(C=C(C=C4)OC)OC,AC1NXUBE,5792042.0
_121896,CC(=O)OC1/C=C(/C)\CCC2C(CC3C1C(=C)C(=O)O3)(C)O2,Chrysanthin,CC1=CC(C2C(CC3(C(O3)CC1)C)OC(=O)C2=C)OC(=O)C,Chrysanthin,5860420.0
_47478,COc1cc(CN(C2CCN(CC2)C)C)cc(c1)OC,Oprea1_288259,CN1CCC(CC1)N(C)CC2=CC(=CC(=C2)OC)OC,Oprea1_288259,23885101.0
_24056,COc1ccc(cc1)CCC(C1(O)CCCCC1)O,ST069094,COC1=CC=C(C=C1)CCC(C2(CCCCC2)O)O,ST069094,4375993.0


In [34]:
affinity_names_df

Unnamed: 0,smiles,pubchem_name,pubchem_smiles,alternate_name,pubchem_ID
_116572,CNCc1cc(ccc1N1CCOCC1)C(F)(F)F,886851-52-1,CNCC1=C(C=CC(=C1)C(F)(F)F)N2CCOCC2,886851-52-1,18525888.0
_48420,CC1CCC(C1)N1CCN(CC1)c1ccc(cc1)Cl,AC1O5D6K,CC1CCC(C1)N2CCN(CC2)C3=CC=C(C=C3)Cl,AC1O5D6K,6456264.0
_7320,OC(CN1C(C)(C)CCCC1(C)C)COc1ccccc1,"1-phenoxy-3-(2,2,6,6-tetramethylpiperidin-1-yl)propan-2-ol",CC1(CCCC(N1CC(COC2=CC=CC=C2)O)(C)C)C,"1-phenoxy-3-(2,2,6,6-tetramethylpiperidin-1-yl)propan-2-ol",211983.0
_14,COc1cc(CN2CCNCC2)cc2c1OCO2,"1-[(7-methoxy-1,3-benzodioxol-5-yl)methyl]piperazine",COC1=CC(=CC2=C1OCO2)CN3CCNCC3,"1-[(7-methoxy-1,3-benzodioxol-5-yl)methyl]piperazine",3128130.0
_24056,COc1ccc(cc1)CCC(C1(O)CCCCC1)O,ST069094,COC1=CC=C(C=C1)CCC(C2(CCCCC2)O)O,ST069094,4375993.0
_60769,C=CCOc1ccc2c(c1)c(C(=O)NCc1ccccc1C(F)(F)F)c(n2Cc1ccco1)C,,,,
_76651,OCCC1CNCCN1Cc1ccc(cc1)OCC,2-[1-(4-ethoxybenzyl)-2-piperazinyl]ethanol,CCOC1=CC=C(C=C1)CN2CCNCC2CCO,2-[1-(4-ethoxybenzyl)-2-piperazinyl]ethanol,2989709.0
_77940,O=C(c1ccccc1)ON1C(C)(C)CC(CC1(C)C)NS(=O)(=O)C,ST048187,CC1(CC(CC(N1OC(=O)C2=CC=CC=C2)(C)C)NS(=O)(=O)C)C,ST048187,1242519.0
_112636,COc1cc(OC)ccc1C1C=C(Nc2n1nnn2)c1ccc(cc1)C,AC1NXUBE,CC1=CC=C(C=C1)C2=CC(N3C(=N2)N=NN3)C4=C(C=C(C=C4)OC)OC,AC1NXUBE,5792042.0
_33051,NCCC1(CCOC(C1)(C)C)c1ccc(cc1)F,"2-[4-(4-Fluoro-phenyl)-2,2-dimethyl-tetrahydro-pyran-4-yl]-ethylamine",CC1(CC(CCO1)(CCN)C2=CC=C(C=C2)F)C,"2-[4-(4-Fluoro-phenyl)-2,2-dimethyl-tetrahydro-pyran-4-yl]-ethylamine",2835263.0


In [None]:
import analysis
from importlib import reload
reload(analysis)
from analysis import *

precision = "SP"
base_docking_dir = "/home/enf/md_simulations/B1AR/sparse-tICA_t50_n_components2contact_6pt6_angstroms-CA-py3-far_regularization_wolf_autoShrinkage0pt01-backup/clusterer_25clusters_1samples_samples_kdtree"
docking_dir = "%s/docking_%s/32-stereoisomers_6-ring-conf" %(base_docking_dir, precision)
analysis_dir = "/home/enf/md_simulations/B1AR/sparse-tICA_t50_n_components2contact_6pt6_angstroms-CA-py3-far_regularization_wolf_autoShrinkage0pt01-backup/clusterer_25clusters_1samples_samples_kdtree/analysis"
if not os.path.exists(analysis_dir):
    os.makedirs(analysis_dir)

docking_df, poses_df = analyze_docking_results_multiple(docking_dir, precision, "%s/summary.pkl" %docking_dir,
                                                        poses_summary=None, redo=False, reread=False,
write_to_disk=True, worker_pool=None, parallel=True)

#docking_df, poses_df = analyze_docking_results_in_dir(docking_dir, ligands_dir, write_to_disk=True, redo=True)
#docking_df[docking_df.columns] = np.nan_to_num(docking_df[docking_df.columns].values)
docking_df = docking_df.fillna(0.)
zeros_per_row = (docking_df == 0).sum(axis=1)
docking_df = docking_df.loc[zeros_per_row < 5]
#where_zeros = np.where(docking_df.values == 0)
#for row_index in where

#keep_cols = []
#for drug in docking_df.index:
#    if np.where(docking_df.loc[column].values == 0).shape[0] < 3:
#        keep_cols.append(col)
#docking_df = docking_df[keep_cols]

full_docking_df = copy.deepcopy(docking_df)
#full_docking_df = pd.concat([ref_df, docking_df[[c for c in docking_df.columns.values.tolist() if "cluster" in c]]], axis=1)
full_docking_df = full_docking_df[[n for n in full_docking_df.columns if "grid" not in n.lower()]]
full_docking_df[full_docking_df.columns] = np.nan_to_num(full_docking_df[full_docking_df.columns].values)
new_names =  [n.replace("cluster", "State ").replace("_sample0", "") for n in full_docking_df.columns.values.tolist()]
new_names = ["Inactive Crystal", "Active Crystal"] + new_names[2:]
full_docking_df.columns = new_names
full_docking_df.index = [n.lower().strip().replace(" ", "_").replace("β", "beta").replace("α", "alpha").replace("Α", "alpha") for n in full_docking_df.index.values]
full_docking_df = full_docking_df.groupby(full_docking_df.index).first()

In [56]:
#affinity_model_dir = "%s/b1ar_affinity_model_rfr.pkl" %analysis_dir
affinity_model_dir = "%s/b1ar_affinity_model_bret_mehrdad_rfr.pkl" %analysis_dir

with open(affinity_model_dir, "rb") as f:
    affinity_model = pickle.load(f)


In [57]:
X_df = full_docking_df.copy()
C_df = X_df[[c for c in X_df.columns if "state" not in c.lower()]]

b1ar_affinity_test_df = pd.DataFrame(affinity_model["MSM Docking"][0].predict_proba(X_df), index=X_df.index, columns=["non-binder", "binder"])
b1ar_affinity_test_df.sort("binder", ascending=False, inplace=True)

In [58]:
b1ar_affinity_test_df.loc[[n for n in b1ar_affinity_test_df.index if "ks" in n or "std" in n or "cid_1978" in n or "cid_3869" in n]]

Unnamed: 0,non-binder,binder
ksn010126,0.054,0.946
ksn01014,0.081,0.919
ksn01018,0.082,0.918
ks01509,0.103,0.897
ks01502,0.114,0.886
ks01503,0.117,0.883
ks01501,0.117,0.883
std101c4,0.124,0.876
ks01504,0.157,0.843
ks01506,0.169,0.831


In [41]:
agonism_model_dir = "%s/b1ar_agonism_model_rfr.pkl" %analysis_dir
with open(agonism_model_dir, "rb") as f:
    agonism_model = pickle.load(f)

X_df = full_docking_df.copy()
X_df = add_difference_columns(X_df)
C_df = X_df[[c for c in X_df.columns if "state" not in c.lower()]]


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32


In [42]:
b1ar_agonism_test_df = pd.DataFrame(agonism_model["MSM Docking"][0].predict_proba(X_df), index=X_df.index, columns=["antagonist", "agonist"])
b1ar_agonism_test_df.sort("agonist", ascending=False, inplace=True)

In [43]:
b1ar_agonism_test_df.loc[[n for n in b1ar_agonism_test_df.index if "ks" in n or "std" in n or "cid_1978" in n or "cid_3869" in n]]

Unnamed: 0,antagonist,agonist
ksn01014,0.045,0.955
ks01501,0.067,0.933
ks01506,0.073,0.927
ks01515,0.076,0.924
ks01509,0.082,0.918
ksn01018,0.085,0.915
ks01502,0.088,0.912
ksn010115,0.103,0.897
ks01504,0.105,0.895
ksn010126,0.109,0.891


In [59]:
keep = 1000000
intersecting_ligands = list(set(b1ar_affinity_test_df.index.values.tolist()[:keep]).intersection(set(b1ar_agonism_test_df.index.values.tolist()[:keep])).intersection(set(b2ar_affinity_test_df.index.values.tolist()[:keep])))

In [None]:
#with mehrdad and bret
combined_df = pd.concat([b1ar_affinity_test_df["binder"], b2ar_affinity_test_df["non-binder"], b1ar_agonism_test_df["agonist"]], axis=1).loc[intersecting_ligands]
combined_df.columns = ["b1ar_binder", "b2ar_non_binder", "b1ar_agonist"]
combined_df["joint"] = combined_df.product(axis=1)
combined_df.sort("joint", inplace=True, ascending=False)
combined_df

<rdkit.Chem.rdchem.Mol at 0x7f36cbcbf030>

In [61]:
#with mehrdad and bret
combined_df = pd.concat([b1ar_affinity_test_df["binder"], b2ar_affinity_test_df["non-binder"], b1ar_agonism_test_df["agonist"]], axis=1).loc[intersecting_ligands]
combined_df.columns = ["b1ar_binder", "b2ar_non_binder", "b1ar_agonist"]
combined_df["joint"] = combined_df.product(axis=1)
combined_df.sort("joint", inplace=True, ascending=False)
combined_df

Unnamed: 0,b1ar_binder,b2ar_non_binder,b1ar_agonist,joint
cid_5403,0.867,0.954,0.988,0.817193
cid_838,0.822,0.981,0.953,0.768482
cid_23843,0.858,0.947,0.92,0.747524
cid_10087493,0.888,0.869,0.96,0.740805
cid_3779,0.808,0.983,0.906,0.719603
cid_23844,0.841,0.907,0.941,0.717783
cid_36811,0.791,0.931,0.961,0.707701
cid_4086,0.838,0.906,0.928,0.704564
cid_10443654,0.881,0.837,0.939,0.692416
cid_951,0.811,0.936,0.882,0.669523


In [64]:
mjg_df = combined_df.loc[[n for n in combined_df.index.values.tolist() if "mjg" in n]]
mjg_df

Unnamed: 0,b1ar_binder,b2ar_non_binder,b1ar_agonist,joint
mjg-02,0.778,0.733,0.906,0.516668
mjg-44,0.71,0.726,0.841,0.433502
mjg-61,0.772,0.639,0.851,0.419805
mjg-52,0.691,0.695,0.79,0.379394
mjg-60,0.776,0.547,0.88,0.373535
mjg-08,0.728,0.614,0.826,0.369215
mjg-58,0.735,0.691,0.717,0.364154
mjg-51,0.725,0.559,0.863,0.349752
mjg-03,0.621,0.66,0.832,0.341004
mjg-45,0.777,0.589,0.742,0.339579


In [53]:
combined_df.sort("b1ar_binder", inplace=False)

Unnamed: 0,b1ar_binder,b2ar_non_binder,b1ar_agonist,joint
cid_2119,0.001,0.302,0.132,4e-05
cid_4171,0.001,0.994,0.365,0.000363
cid_59768,0.002,0.995,0.369,0.000734
chembl226348,0.016,0.885,0.401,0.005678
chembl40650,0.018,0.91,0.536,0.00878
cid_4382,0.022,0.993,0.896,0.019574
chembl251392,0.023,0.549,0.481,0.006074
cid_2249,0.025,0.993,0.138,0.003426
cid_4883,0.031,0.999,0.116,0.003592
cid_2405,0.032,0.983,0.129,0.004058


In [45]:
len(intersecting_ligands)

191

In [None]:
names_df = create_named_df(combined_df, None)

Getting SMILES from SDFs...
Done. Now getting compound names from SMILES...
Done. returning compound names.


In [None]:
names_df

Unnamed: 0,smiles,pubchem_name,pubchem_smiles,alternate_name,pubchem_ID
_59621,CCOc1ccc2c(c1)c(C(=O)N1CCN(CC1)CC=Cc1ccccc1)c(n2CCc1ccc(cc1)OC)C,,,,
_2488,COc1c(ccc(c1OC)OC)CN1CCN(CC1)Cc1ccccc1C(F)(F)F,AK-968/13149944,COC1=C(C(=C(C=C1)CN2CCN(CC2)CC3=CC=CC=C3C(F)(F)F)OC)OC,AK-968/13149944,1126253.0
_44195,CCCn1c(nc2c1cccc2)C(c1ccc(cc1)N(C)C)O,CBMicro_014544,CCCN1C2=CC=CC=C2N=C1C(C3=CC=C(C=C3)N(C)C)O,CBMicro_014544,3039564.0
_99190,COc1ccccc1CNC(=O)C1CCCN(C1)Cc1cnn(c1n1cccc1)c1ccccc1,AKOS001893969,COC1=CC=CC=C1CNC(=O)C2CCCN(C2)CC3=C(N(N=C3)C4=CC=CC=C4)N5C=CC=C5,AKOS001893969,22511744.0
_52376,NS(=O)(=O)c1ccc2c(c1)C1C=CCC1C(N2)c1cccc2c1cccc2,CHEMBL412119,C1C=CC2C1C(NC3=C2C=C(C=C3)S(=O)(=O)N)C4=CC=CC5=CC=CC=C54,CHEMBL412119,5038679.0
_123473,CCN(Cc1c(O)ccc2c1cccc2)Cc1cc(Cl)cc(c1O)C,NSC88850,CCN(CC1=C(C(=CC(=C1)Cl)C)O)CC2=C(C=CC3=CC=CC=C32)O,NSC88850,259173.0
_94499,OC(Cn1cc(c2c1cccc2)/C=C/C(=O)c1ccc(cc1)Br)CN1CCOCC1,AE-848/42799179,C1COCCN1CC(CN2C=C(C3=CC=CC=C32)C=CC(=O)C4=CC=C(C=C4)Br)O,CTK7J6616,11835319.0
_88928,CC(=O)C(n1c(CN(Cc2ccccc2)Cc2ccccc2)nc2c1c(=O)[nH]c(=O)n2C)C,AC1NL9P4,CC(C(=O)C)N1C(=NC2=C1C(=O)NC(=O)N2C)CN(CC3=CC=CC=C3)CC4=CC=CC=C4,AC1NL9P4,4976490.0
_49845,c1ccc(cc1)CCN1CCC(CC1)NC(c1ccccc1)Cc1ccccc1,AC1ME1BV,C1CN(CCC1NC(CC2=CC=CC=C2)C3=CC=CC=C3)CCC4=CC=CC=C4,AC1ME1BV,2846316.0
_56627,O=C(N1N=CCC1(CCOCc1ccccc1)C(=O)NC1CCc2c1cccc2)NCc1ccccc1,,,,


In [36]:
names_df = create_named_df(combined_df.iloc[:100], dview)

Getting SMILES from SDFs...
Done. Now getting compound names from SMILES...
Done. returning compound names.


In [37]:
names_df

Unnamed: 0,smiles,pubchem_name,pubchem_smiles,alternate_name,pubchem_ID
_7320,OC(CN1C(C)(C)CCCC1(C)C)COc1ccccc1,"1-phenoxy-3-(2,2,6,6-tetramethylpiperidin-1-yl)propan-2-ol",CC1(CCCC(N1CC(COC2=CC=CC=C2)O)(C)C)C,"1-phenoxy-3-(2,2,6,6-tetramethylpiperidin-1-yl)propan-2-ol",211983.0
_77940,O=C(c1ccccc1)ON1C(C)(C)CC(CC1(C)C)NS(=O)(=O)C,ST048187,CC1(CC(CC(N1OC(=O)C2=CC=CC=C2)(C)C)NS(=O)(=O)C)C,ST048187,1242519.0
_115830,NCc1ccccc1OC1CCOCC1,898289-33-3,C1COCCC1OC2=CC=CC=C2CN,898289-33-3,18525860.0
_42633,CCC(=O)OC1(CCN(C(C1C)C)C)c1ccccc1,CBDivE_014579,CCC(=O)OC1(CCN(C(C1C)C)C)C2=CC=CC=C2,CBDivE_014579,6612424.0
_40201,CN(CC1CCCCC1(OC(=O)C)c1ccccc1)C,AC1LC34P,CC(=O)OC1(CCCCC1CN(C)C)C2=CC=CC=C2,AC1LC34P,546980.0
_121907,COc1ccc2c(c1O)[C@@]13CCN([C@H](C2)[C@@H]3CCC(=O)C1)C,Dihydrothebainone,CN1CCC23CC(=O)CCC2C1CC4=C3C(=C(C=C4)OC)O,AE-562/12222547,5355412.0
_37605,Cc1ccc(cc1)OCCCCCCc1c(C)n[nH]c1C,AG-664/14117620,CC1=CC=C(C=C1)OCCCCCCC2=C(NN=C2C)C,AG-664/14117620,2057791.0
_102267,CN1CCN(CC1)Cc1csc2c1CCCC2,ZINC20514305,CN1CCN(CC1)CC2=CSC3=C2CCCC3,ZINC20514305,22554311.0
_53765,CN(CCCN1C(=O)CSC1c1ccc(cc1)Br)C,STK386756,CN(C)CCCN1C(SCC1=O)C2=CC=C(C=C2)Br,STK386756,2861590.0
_121545,COc1c(ccc(c1OC)OC)CN1CCNCC1,TRIMETAZIDINE,COC1=C(C(=C(C=C1)CN2CCNCC2)OC)OC,TRIMETAZIDINE,21109.0


In [38]:
docking_dir

'/home/enf/md_simulations/MOR/h8_reimaged/sparse-tICA_t50_n_components2all_residues_4dkl_5c1m_under_cutoff6A-CA-py3-far_2_regularization_wolf_autoShrinkage0pt01-backup/all_clusterer_25clusters_1samples_samples_kdtree/docking_SP_32-stereoisomers_6-ring-conf/htbc'

In [None]:
test_df[["pubchem_name", "smiles", "alternate_smiles", "alternate_name"]] = compound_names_df.loc[test_df.index][["pubchem_name", "smiles", "pubchem_smiles", "alternate_name"]]
bret_pred_df = test_df.loc[[n for n in test_df.index.values if n in common_ligands]]
bret_pred_df["label"] = 0.
bret_pred_df["label"].loc[common_ligands] = y
from rdkit.ML.Scoring.Scoring import CalcAUC
CalcAUC(bret_pred_df.values[:,(0,1)], col=0)#, alpha=10)

In [None]:
test_df.loc[[n for n in test_df.index.values if "compd0" in n or "compd11" in n]].sort("class B", ascending=False,inplace=False).iloc[:]