In this iPython notebook, we will featurize MOR ligand binding simulation by pairwise distances between the ligand and different receptor residues. We will then perform tICA and prospectively build an MSM. 

In [1]:
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

# changing matplotlib the default style
matplotlib.style.use('ggplot')
#matplotlib.rcParams["figure.facecolor"] = "white"
#matplotlib.rcPar|ams["savefig.transparent"] = "True"


In [2]:
import pandas as pd


from PDB_Order_Fixer import PDB_Order_Fixer
import mdtraj as md
import os
import numpy as np
import h5py

import datetime
import glob
import copy
from functools import partial 
import operator
import time

import random 
import subprocess
from subprocess import Popen
import sys
from custom_clusterer import *
from custom_tica import *
from custom_featurizer import *
from pdb_editing import *
from analysis import *
from io_functions import *
#from topology_fixing import *
from subsampling import *
from conversions import *
from custom_msm import *
from grids import *
from docking_analysis import *

from scipy import stats
import os
from efficacy_scripts import *




because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



In [3]:
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [4]:
from sklearn.preprocessing import scale

In [5]:
from detect_intermediates import *
from interpret_tICs import *

In [6]:
from msmbuilder.utils import verbosedump, verboseload


In [7]:
from b2ar_feature_types import *
#from b2ar_feature_types import *
from get_variable_names import *
from b2ar_tica_config import *
from residue import Residue, Atom

In [8]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.preprocessing import scale
from random import shuffle
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc

In [9]:

ori_feature_name = copy.deepcopy(feature_name)

In [10]:
#schemes = ["closest-heavy", "CA"]
#feature_name = "%s-CA" %ori_feature_name

In [11]:
rho = 0.01
rho_string = "_rho0pt01"
n_clusters = 25
n_samples = 1
lag_time=5
precision = "XP"

In [12]:
(active_ref_dir, inactive_ref_dir, simulation_ref_dir, scripts_dir,
          ligand_dir, agonist_dir, inverse_agonist_dir, biased_agonist_dir, ref_receptors_dir, whole_trajectory_pnas,
          sasa_file) = get_base_files(base)

tica_dir = get_tica_dir(base, is_sparse, lag_time, n_components, feature_name, 
                                 wolf_string, shrinkage_string, rho_string)
ori_tica_dir = copy.deepcopy(tica_dir)
features_dir = get_features_dir(base, feature_name)

landmarks_dir = get_landmarks_dir(tica_dir)
analysis_dir = get_analysis_dir(tica_dir, n_clusters, sampling_method)
gmm_dir = get_gmm_dir(tica_dir)
rf_dirdir = get_rf_dir(tica_dir)


ref_tica_dir, ref_tica_coords = get_ref_tica_dirs(tica_dir)

graph_file = get_graph_file(tica_dir, msm_lag_time, n_clusters)

pnas_titles =  ["tm6_tm3_dist", "rmsd_npxxy_inactive", "rmsd_npxxy_active", "rmsd_connector_inactive", "rmsd_connector_active"]
pnas_features_dir = analysis_dir


(clusterer_dir, msm_model_dir, macrostate_dir, features_known, model_dir, projected_features_dir,
         projection_operator_dir, ktica_fit_model_filename, ktica_projected_data_filename, nystroem_data_filename,
         mutual_information_csv, pearson_csv) = get_tica_files(base, tica_dir, n_clusters, msm_lag_time, n_macrostates)

(standardized_features_dir, feature_residues_csv, feature_residues_pkl,
          contact_csv, ref_features_dir) = get_feature_files(features_dir)

(kmeans_csv, tica_coords_csv, features_csv, active_rmsd_dir, inactive_rmsd_dir, active_pnas_dir, inactive_pnas_joined, active_pnas_joined,
        clusters_map_file, ktica_clusters_map_file, analysis_file, combined_file, docking_summary, docking_joined, docking_z_scores_csv,
        aggregate_docking, aggregate_docking_joined, docking_pnas_joined, aggregate_docking_pnas, aggregate_docking_pnas_joined, docking_multiple_ligands,
        docking_distances_file, docking_pdf, mmgbsa_docking_distances, pnas_coords, mmgbsa_dir, mmgbsa_csv, mmgbsa_pdf, aggregate_mmgbsa,
        aggregate_mmgbsa_joined, aggregate_mmgbsa_pnas_joined, mmgbsa_z_scores_csv, active_clusters_csv, intermediate_clusters_csv,
        inactive_clusters_csv, pnas_clusters_averages, tica_clusters_averages, tica_classes_csv, tica_samples_csv, subgraph_save_base,
        degree_save_base, degree_map_csv, degree_z_map_csv, aggregate_docking_pnas_degree_z_joined, tic_residue_csv, feature_coefs_csv,
        duplicated_feature_coefs_csv) = get_analysis_files(analysis_dir, n_clusters, tica_dir, tica_dir, sampling_method, n_samples, precision,
                                                           msm_lag_time)

(inactive_pnas_distances_dir, active_pnas_distances_dir, active_pnas_all_distances_dir,
          inactive_pnas_distances_new_csv, active_pnas_distances_new_csv, active_pnas_joined, active_pnas_means, pnas_coords_dir,
          pnas_coords_csv, pnas_all_coords_csv, pnas_coords_hexbin_dir, pnas_coords_co_crystallized_docking_dir,
          pnas_coords_active_colors_dir, user_defined_features_file, reaction_coordinates_trajs_file) = get_pnas_files(whole_trajectory_pnas, pnas_features_dir)

features_dir = get_features_dir(base, feature_name)



graph_file = get_graph_file(tica_dir, msm_lag_time, n_clusters)
(scripts_dir, pymol_fixpdb_dir) = get_script_dir(scripts_dir)
(save_dir, reimaged_dir, mae_dir, combined_reimaged_dir, grid_dir, docking_dir) = get_docking_dirs(tica_dir, n_clusters, n_components, n_samples, sampling_method, precision)


/home/enf/b2ar_analysis/featuresall_residues_2rh1_3sn6_under_cutoff6A
/home/enf/b2ar_analysis/featuresall_residues_2rh1_3sn6_under_cutoff6A


In [13]:
import multiprocessing as mp
mp.cpu_count()

12

In [52]:
from ipyparallel import Client
rc = Client()
print(len(rc.ids))
dview = rc[101:]
dview.map(os.chdir, ['/home/enf/b2ar_analysis/conformation']*len(rc.ids))

225


<AsyncMapResult: chdir>

In [15]:
grid_center = "64.4, 16.9, 11.99"

precision = "SP"
htbc_dir = "/home/enf/htbc/sdfs"
reference_docking_dir = "/home/enf/b2ar_analysis/reference_docking/docking_SP"
reference_grid_dir = "/home/enf/b2ar_analysis/reference_docking/reference_grids/"

dock_ligands_and_receptors(reference_grid_dir, reference_docking_dir,
                           htbc_dir, precision = precision, ext = "-out.maegz",
                           chosen_ligands=None, chosen_receptors=None, parallel=False,
                           grid_ext = ".zip", worker_pool=dview)



NameError: name 'dview' is not defined

In [26]:
from imp import reload
import analysis
reload(analysis)
from analysis import *

import efficacy_scripts
reload(efficacy_scripts)
from efficacy_scripts import *

reference_docking_dir = "/home/enf/b2ar_analysis/reference_docking/docking_SP"
ligands_dir = "/home/enf/htbc/sdfs"
reference_docking_df, reference_poses_df = analyze_docking_results_in_dir(reference_docking_dir, ligands_dir, write_to_disk=False, redo=False)

In [24]:
#import analysis
#reload(analysis)
from analysis import *

#docking_dir = "/home/enf/htbc/b2ar/docking_xp/1_stereoisomer"
#docking_dir = "/home/enf/htbc/b2ar/docking_xp/32-stereoisomers_6-ring-conf"
docking_dir = "/home/enf/htbc/b2ar/docking_SP_32-stereoisomer_6-ring-conf/htbc"
#docking_dir = "/home/enf/htbc/b2ar/docking_SP_1-stereoisomer/bret"

precision = "SP"

docking_df, poses_df = analyze_docking_results_multiple(docking_dir, precision, "%s/summary.csv" %docking_dir, 
ligands=None, poses_summary=None, redo=True, reread=False,
write_to_disk=True, worker_pool=dview, parallel=True)

kept_columns = [n for n in docking_df.columns.values if "grid" not in n.lower()]
docking_df = docking_df[kept_columns]
docking_df = docking_df.apply(pd.to_numeric)
docking_df[docking_df.columns] = np.nan_to_num(docking_df[docking_df.columns].values)

full_docking_df = copy.deepcopy(docking_df)
#full_docking_df = pd.concat([ref_df, docking_df[[c for c in docking_df.columns.values.tolist() if "cluster" in c]]], axis=1)
full_docking_df[full_docking_df.columns] = np.nan_to_num(full_docking_df[full_docking_df.columns].values)
new_names =  [n.replace("cluster", "State ").replace("_sample0", "") for n in full_docking_df.columns.values.tolist()]
full_docking_df.columns = new_names

Analyzing docking results
/home/enf/htbc/b2ar/docking_SP_32-stereoisomer_6-ring-conf/htbc
Obtaining docking scores now...
Obtained ligand arguments.
Examined all ligands.
Parsed all log files.


In [61]:
from sklearn import preprocessing

from importlib import reload
import efficacy_scripts
reload(efficacy_scripts)
from efficacy_scripts import *
         
def replace_columns(X_df):
    old_cols = X_df.columns.values.tolist()
    new_cols = []
    for col in old_cols:
        if "2rh1" in col.lower():
            col = "Inactive Crystal"
        if "3p0g" in col.lower():
            col = "Active Crystal"
        if "null" in col:
            col = "difference"
        if "cluster" in col:
            col = col.replace("cluster", "State ")
        new_cols.append(col)
    X_df.columns = new_cols
    #X_df["difference"] = X_df["Active Crystal"].subtract(X_df["Inactive Crystal"])
    return(X_df)

X_df = copy.deepcopy(full_docking_df)
X_df = replace_columns(X_df)

C_df = X_df[[c for c in X_df.columns.values.tolist() if "State" not in c and "difference" not in c]]

In [84]:
analysis_dir = "/home/enf/b2ar_analysis/sparse-tICA_t5_n_components2all_residues_2rh1_3sn6_under_cutoff6A_regularization_wolf_autoShrinkage_rho0pt01/analysis_n_clusters25_random"
model_dir = '/home/enf/b2ar_analysis/sparse-tICA_t5_n_components2all_residues_2rh1_3sn6_under_cutoff6A_regularization_wolf_autoShrinkage_rho0pt01/analysis_n_clusters25_random/binder_vs_dud-0.3_rfr_trials100_split0.9_normalizeFalse_normalize-axis0False_n-estimators1000_precisionSP.pkl'
with open(model_dir, "rb") as f:
    model = pickle.load(f)
    

In [85]:
y_proba =model['Crystal Structures'][0].predict_proba(C_df.values)
print(y_proba.shape)
aff_test_df = pd.DataFrame(y_proba, index=C_df.index, columns=["P(Non-binder)", "P(Binder)"]).sort("P(Binder)", ascending=False, inplace=False)
aff_test_df.iloc[:100]

(134376, 2)


Unnamed: 0,P(Non-binder),P(Binder)
_119548,0.009,0.991
_113981,0.009,0.991
_113861,0.009,0.991
_84989,0.064,0.936
_43052,0.067219,0.932781
_124391,0.072552,0.927448
_113803,0.072552,0.927448
_27044,0.074,0.926
_70134,0.075,0.925
_93597,0.076,0.924


In [72]:
C_df.loc["_82394"]

Inactive Crystal    9.8
Active Crystal     -0.0
Name: _82394, dtype: float64

In [73]:
aff_test_df.loc[aff_test_df["P(Binder)"] > 0.04].shape

(49237, 2)

In [None]:
#print(smiles_strings[:2])
df = copy.deepcopy(aff_test_df)
df["names"] = ""
df["Active-Innactive"] = C_df["Active Crystal"].subtract(C_df["Inactive Crystal"]).loc[df.index]
sub_df = df.loc[(df["Active-Innactive"] > 0.) & (df["P(Binder)"] > 0.024)]
smiles_strings = convert_compounds_in_dir_to_smiles(sub_df.index.values.tolist(), "/home/enf/htbc/sdfs", parallel=False, worker_pool=dview)
results = convert_smiles_to_compounds(smiles_strings, parallel=False, worker_pool=dview)
names = [c[0] for c in results]
sub_df["names"] = names
sub_df["smiles"] = smiles_strings


In [102]:
print(len(smiles_strings))
print(sub_df.shape)
sub_df

2545
(2545, 5)


Unnamed: 0,P(Non-binder),P(Binder),names,Active-Innactive,smiles
_119947,0.096,0.904,labetalol,1.38,CC(CCc1ccccc1)NCC(c1ccc(c(c1)C(=O)N)O)O\t\n
_119833,0.136,0.864,,0.97,OCNc1cc(ccc1O)[C@@H](CN[C@H](Cc1ccc(cc1)C=O)C)...
_119803,0.224,0.776,fenoterol,1.43,CC(Cc1ccc(cc1)O)NCC(c1cc(O)cc(c1)O)O\t\n
_62434,0.228,0.772,,1.25,CC(CN(c1snc(n1)NCCc1c[nH]c2c1cccc2)CC(O)C)O\t\n
_27513,0.262,0.738,"7-ethyl-1,3-dimethyl-8-[(2-oxo-2-phenylethyl)s...",0.61,CCn1c(SCC(=O)c2ccccc2)nc2c1c(=O)n(c(=O)n2C)C\t\n
_20757,0.266,0.734,CHEMBL1626156,0.62,OCCNCc1cc(Br)ccc1OCc1ccc(cc1)Cl\t\n
_62978,0.268,0.732,,0.59,CC(CN(c1snc(n1)NCc1ccc(cc1)C(F)(F)F)CC(O)C)O\t\n
_113560,0.276,0.724,dobutamine,1.46,CC(CCc1ccc(cc1)O)NCCc1ccc(c(c1)O)O\t\n
_28399,0.28,0.72,ST50917399,1.04,Cc1ccnc(n1)NS(=O)(=O)c1ccc(cc1)NC(=O)C1C2C=CC(...
_62493,0.287,0.713,,1.24,OCCN(c1snc(n1)NCCc1ccccn1)CCO\t\n


In [101]:
reference_docking_df.shape

NameError: name 'reference_docking_df' is not defined

In [34]:
df = reference_docking_df.sort(columns='3p0g_grid', inplace=False, ascending=False)
df[df.columns] = np.nan_to_num(df.values)

In [52]:
df["difference"] = df["3p0g_grid"].subtract(df["2rh1_grid"])
df["mean"] = df["3p0g_grid"].add(df["2rh1_grid"])
df.sort("mean", inplace=True, ascending=False)

In [53]:
df.iloc[:10]

Unnamed: 0,2rh1_grid,3p0g_grid,difference,mean
_46365,11.15,10.35,-0.8,21.5
_43235,10.87,9.7,-1.17,20.57
_62901,10.67,8.85,-1.82,19.52
_119833,8.53,10.85,2.32,19.38
_103698,10.82,8.49,-2.33,19.31
_42622,10.83,8.19,-2.64,19.02
_124773,8.52,10.49,1.97,19.01
_103874,10.1,8.91,-1.19,19.01
_124756,9.39,9.6,0.21,18.99
_120850,9.38,9.51,0.13,18.89


In [46]:
#df = df.sort("3p0g_grid", ascending=False, inplace=False)
#df.iloc[0:10]

Unnamed: 0,2rh1_grid,3p0g_grid,difference
_119833,8.53,10.85,2.32
_114015,7.61,10.79,3.18
_124668,7.6,10.72,3.12
_124773,8.52,10.49,1.97
_113986,8.03,10.39,2.36
_46365,11.15,10.35,-0.8
_119947,8.22,10.19,1.97
_119642,8.15,10.1,1.95
_10391,6.84,10.09,3.25
_120116,6.94,9.95,3.01


In [29]:
#with open('/home/enf/b2ar_analysis/sparse-tICA_t5_n_components2all_residues_2rh1_3sn6_under_cutoff6A_regularization_wolf_autoShrinkage_rho0pt01/analysis_n_clusters25_random/a_vs_g-all-0.66_rfr_trials1000_split0.8_normalizeFalse_normalize-axis0True_n-estimators1000.pkl', "rb") as f:
with open('/home/enf/b2ar_analysis/sparse-tICA_t5_n_components2all_residues_2rh1_3sn6_under_cutoff6A_regularization_wolf_autoShrinkage_rho0pt01/analysis_n_clusters25_random/antagonist_vs_agonist_vs_dud-0.4_rfr_trials100_split0.9_normalizeFalse_normalize-axis0True_n-estimators1000_precisionSP.pkl', "rb") as f:
    model = pickle.load(f)
    


In [30]:
xtal_model = model['Crystal Structures'][0]

bias_prob = xtal_model.predict_proba(df.values)

bias_df = pd.DataFrame(bias_prob, index=df.index, columns=["class A", "class B"]).sort("class B", ascending=False, inplace=False)

NameError: name 'df' is not defined

In [51]:
bias_df.iloc[:10]

#CANNOT DO MUST FIT A SEPARATE SP MODEL!!!!

Unnamed: 0,class A,class B
_119833,0.212,0.788
_72125,0.221,0.779
_130022,0.221,0.779
_134080,0.221,0.779
_26712,0.225,0.775
_87046,0.225,0.775
_75299,0.225,0.775
_24372,0.226,0.774
_5594,0.226,0.774
_84202,0.226,0.774


In [17]:
chembl_df = pd.read_excel("/home/enf/b2ar_analysis/bioactivity-16_0_12_50.xlsx", header=0, index_col=0)#.set_index("CMPD_CHEMBLID")

chembl_df["smiles"] = chembl_df["CANONICAL_SMILES"]
compounds = convert_smiles_to_compounds(chembl_df["smiles"].values.tolist(), parallel=False, worker_pool=dview)
chembl_df["CID"] = [c[3] for c in compounds]
chembl_df["ligand"] = chembl_df.index
chembl_df["ligand"].loc[chembl_df["CID"] != ""] = ["CID_%d" %cid for cid in chembl_df.loc[chembl_df["CID"] != ""]["CID"].values.tolist()]
chembl_df

#write_smiles_files(chembl_df, "/home/enf/b2ar_analysis/all_ligands/32-stereoisomers_6-ring-conf")
#write_smiles_files(chembl_df, "/home/enf/b2ar_analysis/all_ligands/1_stereoisomer")

#undone_ligands = list(set(chembl_df.loc[chembl_df["CID"] == ""].index.values.tolist() +  [n for n in chembl_cid_names if n not in get_ligands("/home/enf/b2ar_analysis/all_ligands/32-stereoisomers_6-ring-conf", ".sdf")]))

#print(len(chembl_cids))
#print(len(undone_ligands))



Writing SMILES files now.
Finished writing SMILES files.
Writing SMILES files now.
Finished writing SMILES files.


In [19]:
with open("/home/enf/b2ar_analysis/all_ligands/chembl_df.pkl", "wb") as f:
    pickle.dump(chembl_df, f, protocol=2)

In [90]:
import efficacy_scripts
reload(efficacy_scripts)
from efficacy_scripts import *



In [72]:
import grids
reload(grids)
from grids import *


bret = pd.read_excel("/home/enf/b2ar_analysis/bret_bias_study2.xlsx", header=0).set_index("EvanName")
#bret = pd.read_csv("/home/enf/b2ar_analysis/bias_analysis/bret_bias_study.csv", header=0).dropna().set_index("EvanName")
bret["B2AR-Arrestin, Mean"].sort(inplace=False)#.subtract(bret["B2AR-Arrestin, Mean"])
cids = [int(cid) for cid in bret["CID"].values if 0 != int(np.nan_to_num(cid))]
dow

In [80]:
len([n for n in get_ligands(all_ligands_dir, ".sdf") if "CID" in n[0:3]])

45

In [80]:


baker_ligands = ["329-63-5", "97879-29-3", "5716-20-1", "86615-41-0", "86615-41-0", "71119-11-4", "5696-15-1", "57775-29-8", "54239-37-1", "21898-19-1", "874882-72-1", "71771-90-9", "49745-95-1", "62-31-7", "1944-12-03", "183814-30-4", "51-30-9", "579-56-6", "244192-94-7", "159182-43-1", "03-10-5588", "5874-97-5", "118457-14-0", "108341-18-0", "770-05-8", "6452-73-9", "13523-86-9", "62929-91-3", "90274-24-1", "23239-51-2", "51022-70-9", "89365-50-4", "39731-05-0", "174689-39-5", "244081-42-3", "23031-32-5", "56776-01-3", "178600-17-4", "129689-30-1", "37000-20-7"]
baker_ligands += ["CGP 20712A", "ICI 89406", "Practolol", "Xamoterol", "Bisoprolol", "Betaxolol", "Atenolol", "ICI 215001", "Acebutolol", "Metoprolol", "CGP 12177", "Labetolol", "Carvedilol", "Pronethalol", "Propranolol", "Sotalol", "CL 316243", "Alprenolol", "Bupranolol", "Nadolol", "Timolol", "ICI 118551", "Salbutamol", "Terbutaline", "Salmeterol"]
wiki_ligands = ["Abediterol", "Amibegron", "Arbutamine", "Arformoterol", "Arotinolol", "Bambuterol", "Befunolol", "Bitolterol", "Bromoacetylalprenololmenthane", "(BAAM)", "Broxaterol", "Buphenine", "Carbuterol", "Cimaterol", "Clenbuterol", "Denopamine", "Deterenol", "Dipivefrine", "Dobutamine", "Dopamine", "Dopexamine", "Ephedrine", "Epinephrine", "(adrenaline)", "Etafedrine", "Etilefrine", "Ethylnorepinephrine", "Fenoterol", "2-Fluoronorepinephrine", "5-Fluoronorepinephrine", "Formoterol", "Hexoprenaline", "Higenamine", "Indacaterol", "Isoetarine", "Isoprenaline", "(isoproterenol)", "N-Isopropyloctopamine", "Isoxsuprine", "Labetalol", "Levonordefrin", "Levosalbutamol", "Mabuterol", "Methoxyphenamine", "Methyldopa", "Norepinephrine", "(noradrenaline)", "Orciprenaline", "Oxyfedrine", "Phenylpropanolamine", "Pirbuterol", "Prenalterol", "Ractopamine", "Procaterol", "Pseudoephedrine", "Quinterenol", "Reproterol", "Rimiterol", "Ritodrine", "Salbutamol", "(albuterol)", "Salmeterol", "Solabegron", "Terbutaline", "Tretoquinol", "Tulobuterol", "Xamoterol", "Zilpaterol", "Zinterol"]
baker_ligands += wiki_ligands
baker_cids = convert_names_to_cids(baker_ligands, None, True)

pubchem_cids = ["76972397", "76972192", "76969093", "76968032", "76966860", "76963246", "76958839", "76956802", "73417104", "73416983", "73416870", "73416846", "73416798", "71587364", "70685132", "56603492", "54545496", "46886609", "43834308", "24906312", "23368803", "16219010", "14010333", "9890216", "9859211", "9841972", "9796663", "6917856", "6603756", "6438331", "5702285", "5702273", "5702098", "5489013", "5489012", "5458580", "5311064", "3035442", "3032600", "657230", "656677", "656634", "441411", "431097", "217067", "166551", "156297", "127126", "122186", "121888", "121877", "114840", "71149", "68603", "65772", "61122", "60789", "56052", "55483", "38286", "37990", "37989", "36283", "29138", "29137", "25654", "13320", "11368", "8239", "5924", "5816", "5807", "5606", "5581", "4567", "4117", "3995", "3783", "3779", "2783", "2755", "2687", "2437", "2309", "838", "91827721", "76969219", "76967915", "76966198", "76965438", "76960798", "73417071", "73416960", "73416949", "73416743", "73416654", "71700183", "53477580", "24866733", "24835641", "16759149", "16654936", "16219912", "16035068", "11954385", "11711522", "11504295", "10378992", "10297443", "9933004", "9887812", "9884233", "9865528", "9861452", "9832292", "9820882", "6917801", "6917710", "6603122", "6440459", "6335907", "5702161", "5486546", "5312115", "3083544", "3040551", "3038500", "3038222", "3034756", "688570", "688563", "688561", "657301", "656601", "441334", "441333", "325003", "189562", "171674", "164652", "155774", "71301", "68658", "65324", "56801", "43590", "43091", "42396", "39859", "36811", "36094", "35330", "35329", "33572", "31728", "31620", "23702", "5403", "5152", "4916", "4845", "4086", "3762", "3609", "3410", "3343", "3306", "2083"]

ligand_df = pd.DataFrame(baker_ligands, columns=["name"])

In [81]:
import grids
reload(grids)
from grids import *

download_sdfs_from_cids([int(s) for s in pubchem_cids] + baker_cids, "/home/enf/b2ar_analysis/all_ligands", worker_pool=None, parallel=True)

In [40]:
smiles_df = pd.read_table("/home/enf/b2ar_analysis/all_ligands/dude_inactives.txt")
x = [str(line).split() for line in smiles_df.values]
smiles_df = pd.DataFrame([l[1] for l in x], index=["b2ar_dude_inactive-%d" %i for i in range(0, len(x))], columns=["smiles"])
smiles_df["ligand"] = smiles_df.index
smiles_df.iloc[:10]

Unnamed: 0,smiles,ligand
b2ar_dude_inactive-0,'c1ccc2c(c1)c3c([nH]2)c(=O)n(c(=O)[nH]3)CCN4CC...,b2ar_dude_inactive-0
b2ar_dude_inactive-1,'c1ccc2c(c1)c3c([nH]2)c(=O)n(c(=O)[nH]3)CCN4CC...,b2ar_dude_inactive-1
b2ar_dude_inactive-2,'c1ccc2c(c1)cccc2CC(=O)Nc3ccc(cc3)CCNC[C@@H](C...,b2ar_dude_inactive-2
b2ar_dude_inactive-3,'c1ccc2c(c1)ccc(n2)CC(=O)Nc3ccc(cc3)CCNC[C@@H]...,b2ar_dude_inactive-3
b2ar_dude_inactive-4,'c1ccc2c(c1)C(C=N2)CC(=O)Nc3ccc(cc3)CCNC[C@@H]...,b2ar_dude_inactive-4
b2ar_dude_inactive-5,'c1ccc2cc(ccc2c1)CC(=O)Nc3ccc(cc3)CCNC[C@@H](C...,b2ar_dude_inactive-5
b2ar_dude_inactive-6,'c1ccc2cnc(cc2c1)CC(=O)Nc3ccc(cc3)CCNC[C@@H](C...,b2ar_dude_inactive-6
b2ar_dude_inactive-7,'c1cc(c(c2c1C(OCC2)CN)O)O,b2ar_dude_inactive-7
b2ar_dude_inactive-8,'c1cc(c(cc1C2(CCCNC2)O)O)O,b2ar_dude_inactive-8
b2ar_dude_inactive-9,'c1ccc(cc1)c2c(=O)c3ccc(cc3oc2c4ccccc4)OCC(CNC...,b2ar_dude_inactive-9


In [16]:
from importlib import reload
import grids
reload(grids)
from grids import *

grid_center = "64.4, 16.9, 11.99"


#reimaged_dir = samples_dir
#mae_dir = reimaged_dir
#remove_ter(reimaged_dir)
#reorder(reimaged_dir)


#agonist_ligands = [a for a in agonist_ligands if "TA" not in a]

#pprep(mae_dir, ref = active_ref_dir, chosen_receptors = chosen_receptors, worker_pool=None, parallel=True)
#generate_grids(mae_dir, grid_center, grid_dir, remove_lig = "BIA", chosen_receptors = chosen_receptors, worker_pool=None, outer_box=25.)

precision = "SP"
htbc_dir = "/home/enf/htbc/sdfs"
docking_dir = "/home/enf/htbc/b2ar/docking_SP_32-stereoisomer_6-ring-conf"
grid_dir = "/home/enf/htbc/b2ar/grids"
all_ligands_dir = "/home/enf/b2ar_analysis/all_ligands/32-stereoisomers_6-ring-conf"

#lig_dir = "/home/enf/htbc/sdfs/32_stereoisomers"
#df = prepare_ligands(lig_dir, exts = [".smi"],
#                     n_ring_conf=6, n_stereoisomers=32,
#                     force_field=16, worker_pool=dview,
#                     parallel=True, redo=False,
#                     smiles_df=None, cid_df=None,
#                     binding_db=None,
#                     return_df=True)

if not os.path.exists(docking_dir):
    os.makedirs(docking_dir)
if not os.path.exists(grid_dir):
    os.makedirs(grid_dir)

#chosen_ligands = df.index.values.tolist()[:2000]
chosen_receptors = ["3P0G_pymol_prepped_new", "2RH1_prepped"] + ["cluster%d_sample0" %i for i in range(0,25)]
chosen_ligands = [n for n in get_ligands(lig_dir, ".mae") if "scaffold" in n]
lig_dir = all_ligands_dir
dock_ligands_and_receptors(grid_dir, docking_dir, lig_dir,
                           precision = precision, ext = "-out.maegz",
                           chosen_ligands=chosen_ligands, chosen_receptors = chosen_receptors,
                           parallel = True, grid_ext = ".zip", worker_pool=None,
                           retry_after_failed=False, timeout=60*1200*1000)
#dock_ligands_and_receptors(grid_dir, docking_dir, lig_dir, precision = precision, ext = "-out.maegz", chosen_ligands=chosen_ligands, chosen_receptors = None, parallel = False, grid_ext = ".zip", worker_pool=dview, retry_after_failed=True, timeout=60*1200)


#dock_ligands_and_receptors(grid_dir, docking_dir,  biased_agonist_dir, precision = precision, ext = "-out.maegz", chosen_ligands = biased_ligands, chosen_receptors = chosen_receptors, parallel = None, grid_ext = ".grd", worker_pool=dview)
#dock_ligands_and_receptors(grid_dir, docking_dir, agonist_dir, precision = precision, ext = "-out.maegz", chosen_ligands = agonist_ligands, chosen_receptors = chosen_receptors, parallel = None, grid_ext = ".grd", worker_pool=dview)




Creating new directories for each ligand.
Done creating directories. Determining which docking jobs to conduct.
About to do 81 Docking computations.
timeout 72000000 $SCHRODINGER/glide /home/enf/htbc/b2ar/docking_SP_32-stereoisomer_6-ring-conf/scaffold1/cluster13_sample0.in -OVERWRITE -WAIT -strict -NOJOBID > cluster13_sample0.log
timeout 72000000 $SCHRODINGER/glide /home/enf/htbc/b2ar/docking_SP_32-stereoisomer_6-ring-conf/scaffold1/2RH1_prepped.in -OVERWRITE -WAIT -strict -NOJOBID > 2RH1_prepped.log
timeout 72000000 $SCHRODINGER/glide /home/enf/htbc/b2ar/docking_SP_32-stereoisomer_6-ring-conf/scaffold1/cluster19_sample0.in -OVERWRITE -WAIT -strict -NOJOBID > cluster19_sample0.log
timeout 72000000 $SCHRODINGER/glide /home/enf/htbc/b2ar/docking_SP_32-stereoisomer_6-ring-conf/scaffold1/cluster24_sample0.in -OVERWRITE -WAIT -strict -NOJOBID > cluster24_sample0.log
timeout 72000000 $SCHRODINGER/glide /home/enf/htbc/b2ar/docking_SP_32-stereoisomer_6-ring-conf/scaffold1/3P0G_pymol_prepped_n

In [17]:
docking_dir

'/home/enf/htbc/b2ar/docking_SP_32-stereoisomer_6-ring-conf'

In [86]:
len([n for n in get_trajectory_files(lig_dir, "-out.maegz") if "CHEMBL" in n])

1280

In [83]:
undone_ligands

['CID_1049240',
 'CHEMBL250144',
 'CHEMBL1221636',
 'CID_9894801',
 'CHEMBL3298213',
 'CID_716312',
 'CID_657272',
 'CID_5104',
 'CID_667688',
 'CID_44287268',
 'CID_4011',
 'CHEMBL508338',
 'CID_913103',
 'CID_5384001',
 'CID_443495',
 'CHEMBL2068577',
 'CID_50878551',
 'CHEMBL1794855',
 'CHEMBL1095305',
 'CID_656665',
 'CID_70692139',
 'CID_49864412',
 'CID_8447',
 'CID_44622849',
 'CID_5566',
 'CHEMBL3099664',
 'CID_52941603',
 'CHEMBL1221801',
 'CID_52943770',
 'CHEMBL1629810',
 'CID_10310047',
 'CID_36324',
 'CID_52943362',
 'CID_2333',
 'CID_9860739',
 'CID_4543',
 'CID_6238',
 'CID_5897',
 'CHEMBL1909062',
 'CHEMBL39334',
 'CHEMBL1800960',
 'CID_24360',
 'CID_5353627',
 'CHEMBL1909065',
 'CHEMBL186247',
 'CHEMBL3099661',
 'CID_445154',
 'CID_18343',
 'CID_174',
 'CID_444034',
 'CID_411735',
 'CID_55283',
 'CID_46889767',
 'CID_15054194',
 'CHEMBL414804',
 'CID_7577',
 'CID_183812',
 'CID_4292932',
 'CID_46889727',
 'CID_27200',
 'CID_9898639',
 'CID_3519',
 'CID_46889699',
 'CID

In [16]:
lig_dir = "/home/enf/b2ar_analysis/all_ligands/32-stereoisomers_6-ring-conf"
[n for n in get_ligands(lig_dir, ".sdf") if "CID" in n]

['CID_10028830',
 'CID_10077130',
 'CID_10087493',
 'CID_10090',
 'CID_10096344',
 'CID_10100',
 'CID_10101',
 'CID_10112',
 'CID_10113978',
 'CID_10133',
 'CID_10178705',
 'CID_10180',
 'CID_10182969',
 'CID_10184653',
 'CID_10220503',
 'CID_10235',
 'CID_10240',
 'CID_102484',
 'CID_10280735',
 'CID_10297443',
 'CID_10311306',
 'CID_10324367',
 'CID_10353067',
 'CID_10378992',
 'CID_10413',
 'CID_10429215',
 'CID_10443654',
 'CID_1046',
 'CID_104741',
 'CID_104747',
 'CID_104758',
 'CID_10482134',
 'CID_104850',
 'CID_104865',
 'CID_1050',
 'CID_105075',
 'CID_10517',
 'CID_10518',
 'CID_1054',
 'CID_10548',
 'CID_1060',
 'CID_10607',
 'CID_10660',
 'CID_10734',
 'CID_10770',
 'CID_107807',
 'CID_107970',
 'CID_108000',
 'CID_108143',
 'CID_108150',
 'CID_10917',
 'CID_1103',
 'CID_110634',
 'CID_110635',
 'CID_1110',
 'CID_11154555',
 'CID_11167602',
 'CID_11219835',
 'CID_11238823',
 'CID_11243969',
 'CID_11250029',
 'CID_11273',
 'CID_11291',
 'CID_11304743',
 'CID_11333',
 'CID_1

In [56]:
chosen_ligands = [n for n in get_ligands(lig_dir) if "cid" in str(n).lower()]
chosen_ligands

['CID_10087493',
 'CID_10353067',
 'CID_10443654',
 'CID_123686',
 'CID_1237',
 'CID_155774',
 'CID_161394',
 'CID_170373',
 'CID_18026',
 'CID_2083',
 'CID_2119',
 'CID_217246',
 'CID_2249',
 'CID_23843',
 'CID_23844',
 'CID_2405',
 'CID_2585',
 'CID_2687',
 'CID_2755',
 'CID_2783',
 'CID_3343',
 'CID_33572',
 'CID_3609',
 'CID_36811',
 'CID_3682',
 'CID_3779',
 'CID_3869',
 'CID_4086',
 'CID_4382',
 'CID_4546390',
 'CID_4828',
 'CID_4883',
 'CID_4930',
 'CID_4946',
 'CID_5241141',
 'CID_5253',
 'CID_5403',
 'CID_5478',
 'CID_5606',
 'CID_5806',
 'CID_681',
 'CID_7436',
 'CID_838',
 'CID_913',
 'CID_951']

In [58]:
get_ligands(lig_dir, ".mae")

['CID_10087493',
 'CID_10353067',
 'CID_10443654',
 'CID_123686',
 'CID_1237',
 'CID_155774',
 'CID_161394',
 'CID_170373',
 'CID_18026',
 'CID_2083',
 'CID_2119',
 'CID_217246',
 'CID_2249',
 'CID_23843',
 'CID_23844',
 'CID_2405',
 'CID_2585',
 'CID_2687',
 'CID_2755',
 'CID_2783',
 'CID_3343',
 'CID_33572',
 'CID_3609',
 'CID_36811',
 'CID_3682',
 'CID_3779',
 'CID_3869',
 'CID_4086',
 'CID_4382',
 'CID_4546390',
 'CID_4828',
 'CID_4883',
 'CID_4930',
 'CID_4946',
 'CID_5241141',
 'CID_5253',
 'CID_5403',
 'CID_5478',
 'CID_5606',
 'CID_5806',
 'CID_681',
 'CID_7436',
 'CID_838',
 'CID_913',
 'CID_951',
 'b2ar_dude_inactive-0',
 'b2ar_dude_inactive-1',
 'b2ar_dude_inactive-10',
 'b2ar_dude_inactive-100',
 'b2ar_dude_inactive-101',
 'b2ar_dude_inactive-102',
 'b2ar_dude_inactive-103',
 'b2ar_dude_inactive-104',
 'b2ar_dude_inactive-105',
 'b2ar_dude_inactive-106',
 'b2ar_dude_inactive-107',
 'b2ar_dude_inactive-108',
 'b2ar_dude_inactive-109',
 'b2ar_dude_inactive-11',
 'b2ar_dude_in

In [None]:
from importlib import reload
import grids
reload(grids)
from grids import *

grid_center = "64.4, 16.9, 11.99"


#reimaged_dir = samples_dir
#mae_dir = reimaged_dir
#remove_ter(reimaged_dir)
#reorder(reimaged_dir)


#agonist_ligands = [a for a in agonist_ligands if "TA" not in a]

#pprep(mae_dir, ref = active_ref_dir, chosen_receptors = chosen_receptors, worker_pool=None, parallel=True)
#generate_grids(mae_dir, grid_center, grid_dir, remove_lig = "BIA", chosen_receptors = chosen_receptors, worker_pool=None, outer_box=25.)

precision = "SP"
htbc_dir = "/home/enf/htbc/sdfs"
docking_dir = "/home/enf/htbc/b2ar/docking_SP_1-stereoisomer/bret"
grid_dir = "/home/enf/htbc/b2ar/grids"

lig_dir = "/home/enf/b2ar_analysis/all_ligands/1_stereoisomer"
#df = prepare_ligands(lig_dir, exts = [".sdf", ".smi"],
#                     n_ring_conf=1, n_stereoisomers=1,
#                     force_field=16, worker_pool=dview,
#                     parallel=True, redo=False,
#                     smiles_df=smiles_df, cid_df=None,
#                     binding_db=None,
#                     return_df=True)

if not os.path.exists(docking_dir):
    os.makedirs(docking_dir)
if not os.path.exists(grid_dir):
    os.makedirs(grid_dir)

#chosen_ligands = df.index.values.tolist()[:2000]
chosen_ligands = [n for n in get_ligands(lig_dir, ".mae") if "inactive" in n]
#lig_dir = all_ligands_dir
dock_ligands_and_receptors(grid_dir, docking_dir, lig_dir,
                           precision = precision, ext = "-out.maegz",
                           chosen_ligands=chosen_ligands, chosen_receptors = None,
                           parallel = False, grid_ext = ".zip", worker_pool=dview,
                           retry_after_failed=True, timeout=60*1200*1000)
#dock_ligands_and_receptors(grid_dir, docking_dir, lig_dir, precision = precision, ext = "-out.maegz", chosen_ligands=chosen_ligands, chosen_receptors = None, parallel = False, grid_ext = ".zip", worker_pool=dview, retry_after_failed=True, timeout=60*1200)


#dock_ligands_and_receptors(grid_dir, docking_dir,  biased_agonist_dir, precision = precision, ext = "-out.maegz", chosen_ligands = biased_ligands, chosen_receptors = chosen_receptors, parallel = None, grid_ext = ".grd", worker_pool=dview)
#dock_ligands_and_receptors(grid_dir, docking_dir, agonist_dir, precision = precision, ext = "-out.maegz", chosen_ligands = agonist_ligands, chosen_receptors = chosen_receptors, parallel = None, grid_ext = ".grd", worker_pool=dview)





In [None]:
from importlib import reload
import grids
reload(grids)
from grids import *

grid_center = "64.4, 16.9, 11.99"


#reimaged_dir = samples_dir
#mae_dir = reimaged_dir
#remove_ter(reimaged_dir)
#reorder(reimaged_dir)


#agonist_ligands = [a for a in agonist_ligands if "TA" not in a]

#pprep(mae_dir, ref = active_ref_dir, chosen_receptors = chosen_receptors, worker_pool=None, parallel=True)
#generate_grids(mae_dir, grid_center, grid_dir, remove_lig = "BIA", chosen_receptors = chosen_receptors, worker_pool=None, outer_box=25.)

precision = "XP"
htbc_dir = "/home/enf/htbc/sdfs/32_stereoisomers"
docking_dir = "/home/enf/htbc/b2ar/docking_xp"
grid_dir = "/home/enf/htbc/b2ar/grids"

df = prepare_ligands(htbc_dir, exts = [".sdf"],
                n_ring_conf=6, n_stereoisomers=32,
               force_field=16, worker_pool=dview,
               parallel=None, redo=False,
              smiles_df=None, cid_df=None,
             binding_db=None,
             return_df=True)

if not os.path.exists(docking_dir):
    os.makedirs(docking_dir)
if not os.path.exists(grid_dir):
    os.makedirs(grid_dir)

#chosen_ligands = df.index.values.tolist()[:2000]
#chosen_ligands = [n for n in get_ligands(all_ligands_dir) if "CID" in n[0:3]]
#print(len(chosen_ligands))
#lig_dir = all_ligands_dir
#dock_ligands_and_receptors(grid_dir, docking_dir, lig_dir, precision = precision, ext = "-out.maegz", chosen_ligands=chosen_ligands, chosen_receptors = None, parallel = False, grid_ext = ".zip", worker_pool=None, retry_after_failed=True, timeout=180)

#dock_ligands_and_receptors(grid_dir, docking_dir,  biased_agonist_dir, precision = precision, ext = "-out.maegz", chosen_ligands = biased_ligands, chosen_receptors = chosen_receptors, parallel = None, grid_ext = ".grd", worker_pool=dview)
#dock_ligands_and_receptors(grid_dir, docking_dir, agonist_dir, precision = precision, ext = "-out.maegz", chosen_ligands = agonist_ligands, chosen_receptors = chosen_receptors, parallel = None, grid_ext = ".grd", worker_pool=dview)





In [34]:
grid_dir

'/home/enf/b2ar_analysis/sparse-tICA_t5_n_components2all_residues_2rh1_3sn6_under_cutoff6A_regularization_wolf_autoShrinkage_rho0pt01/grids_n_clusters25_n_samples1_random'

In [None]:
tica_object = compat_verboseload(projection_operator_dir)
print(tica_object.timescales_)

In [None]:
tic_components_dir = tica_dir
important_contact_features = interpret_tIC_components(projection_operator_dir, tic_components_dir, feature_residues_pkl, n_tica_components=5, percentile=95)

In [None]:
feature_names = compat_verboseload(feature_residues_pkl)

In [None]:
tic_subsampled_features_file = "%s/features_subsampled.pkl" % tica_dir
subsampled_features_dir = os.path.join(tica_dir, "subsampled_features")
if not os.path.exists(subsampled_features_dir): os.makedirs(subsampled_features_dir)
important_contact_features_pruned, important_contact_features_indices = find_non_zero_features(important_contact_features[0], feature_names)
#subsample_features(features_dir, important_contact_features_indices, important_contact_features_pruned, tic_subsampled_features_file)

In [None]:
tica_coords = compat_verboseload(projected_features_dir)
pnas_coords = compat_verboseload(pnas_coords_dir)
for pnas_coord in pnas_coords: pnas_coord[:,0]*=7.14
tica_names = ["tIC.%d" %i for i in range(1,n_components+1)]
pnas_names = ["tm6_tm3_dist", "rmsd_npxxy_inactive", "rmsd_npxxy_active", "rmsd_connector_inactive", "rmsd_connector_active"]

In [None]:
import plots
reload(plots)
from plots import *
#plot_histograms(projected_features_dir, analysis_dir, "tICA histogram", titles=["tIC.%d" %i for i in range(1,n_components+1)])

In [None]:
lag_time = 25
msm_model_dir = "%s/msm_lag_time%d.h5" % (tica_dir, lag_time)
#build_msm(clusterer_tICs_1_2_3_filename, lag_time=lag_time, msm_model_dir=msm_model_dir)
msm_object = compat_verboseload(msm_model_dir)
prior_counts = 0.

In [None]:

#compute_aggregate_scores(docking_multiple_ligands, inverse_agonists = inverse_ligands, summary = aggregate_docking, z_scores_csv = docking_z_scores_csv)
#aggregate_docking_joined_map = convert_csv_to_joined_map(aggregate_docking, aggregate_docking_joined)[0]
#aggregate_docking_means = calc_mean(aggregate_docking_joined_map)
#write_map_to_csv(aggregate_docking_joined, aggregate_docking_means, ["cluster", "mean_aggregate_docking_z_score"])
#r['do.analysis'](tica_dir, analysis_dir, pnas_coords_csv, tica_coords_csv, features_dir, docking_multiple_ligands)
#tics_vs_docking_file = "%s/tICA_vs_docking_carazolol.pdf" % analysis_dir
#plot_tICs_vs_docking(docking_multiple_ligands, tica_coords_csv, tics_vs_docking_file, chosen_ligand="s-carazolol")


In [None]:
projection_operator_dir

In [None]:
alt_pp_tica_dir = "/home/enf/b2ar_analysis/sparse-tICA_t5_n_components25all_residues_2rh1_3sn6_under_cutoff6A_regularization_wolf_autoShrinkage_rho0pt01"
alt_projection_operator_dir = "%s/phi_psi_chi2_allprot_tica_coords.h5" %alt_pp_tica_dir
tic_subsampled_features_file = "%s/features_subsampled.pkl" % alt_pp_tica_dir

alt_important_contact_features = interpret_tIC_components(alt_projection_operator_dir, alt_pp_tica_dir, feature_residues_pkl, n_tica_components=10, percentile=95)

subsampled_features_dir = os.path.join(alt_pp_tica_dir, "subsampled_features")
if not os.path.exists(subsampled_features_dir): os.makedirs(subsampled_features_dir)
alt_important_contact_features_pruned, alt_important_contact_features_indices = find_non_zero_features(alt_important_contact_features[0], feature_names)
if not os.path.exists(tic_subsampled_features_file):
    subsample_features(features_dir, alt_important_contact_features_indices, alt_important_contact_features_pruned, tic_subsampled_features_file)

In [None]:
top_features = load_file(tic_subsampled_features_file)
top_features = [t*10. for t in top_features]

user_defined_coords = compat_verboseload(user_defined_features_file)
user_defined_names = sorted(feature_name_residues_dict.keys())
user_defined_dfs = [pd.DataFrame(t, columns=user_defined_names) for t in user_defined_coords]

tica_dfs = [pd.DataFrame(t, columns=["tIC.%d" %i for i in range(1,n_components+1)]) for t in tica_coords]

all_feature_dfs = [pd.concat([top_features[i], user_defined_dfs[i], tica_dfs[i]], axis=1) for i in range(0, len(top_features))]


In [None]:
all_traj_features_np = [f.values for f in all_feature_dfs]
cluster_features_averages = calculate_cluster_averages_per_feature(clusterer, all_traj_features_np)
cluster_features_averages = pd.DataFrame(cluster_features_averages, columns=all_feature_dfs[0].columns, index=["cluster%d" %i for i in range(0,n_clusters)])

In [None]:
import custom_clusterer
reload(custom_clusterer)
from custom_clusterer import *
feature_name = "Asn148-Leu266_ca_dist"
find_snapshots_within_feature_range(all_feature_dfs, feature_name, [31., 32.], 
                                    get_trajectory_files(traj_dir, traj_ext), analysis_dir,
                                    "%s_31_32" %feature_name, 5, lig_name="BIA", 
                                    structure=None)

In [None]:
cluster_features_averages.loc[(cluster_features_averages["Asn148-Leu266_ca_dist"] > 35.) & (cluster_features_averages["Asn148-Leu266_ca_dist"] < 40.)]["Asn148-Leu266_ca_dist"]

In [None]:
all_features_onehot, names_onehot = multi_onehot_trajectories([t.values for t in all_feature_dfs], all_feature_dfs[0].columns.values.tolist(), subsample=100)
all_features_onehot = [pd.DataFrame(t, columns=names_onehot) for t in all_features_onehot]

In [None]:
[n for n in top_features[0].columns.values.tolist() if "TYR326" in n]

In [None]:
from imp import reload
import efficacy_scripts
reload(efficacy_scripts)
from efficacy_scripts import *

clusterer, cluster_averages, active_clusters, inactive_clusters, biased_ligands, agonist_ligands, inverse_ligands, all_ligands, c, feature_residues, tica_coords, user_defined_coords, pp_n_components, apriori_dfs, tica_dfs, cluster_pnas_averages, cluster_tica_averages, cluster_tica_pnas, top_features, clusters_map, tica_resampled_file, projected_features, num_trajs, features_eq, all_traj_features, samples_indices_file, samples_dir, samples_tica_avg_df, samples_pnas_avg_df, samples_features_avg_df, samples_normalized_features_avg_df, feature_names, feature_strings, samples_pnas_tica, reference_docking = initialize_analysis(clusterer_dir, user_defined_coords, user_defined_names, biased_agonist_dir, agonist_dir, inverse_agonist_dir, docking_dir, precision, docking_multiple_ligands, aggregate_docking, feature_residues_pkl, n_components, all_feature_dfs,
                        lag_time, n_clusters, projected_features_dir, traj_dir, traj_ext, tica_dir,
                        prior_counts, msm_object, analysis_dir, n_samples)

In [None]:
analysis_dir

In [None]:
import msm_resampled
reload(msm_resampled)
from msm_resampled import *

import efficacy_scripts
reload(efficacy_scripts)
from efficacy_scripts import *

docking_cluster_averages = c
all_ligands_dir = "/home/enf/b2ar_analysis/all_ligands"
ligands = get_ligands(all_ligands_dir)
apo_populations, df_agg, aggregate_docking_msm, docking_normalized, ddg_scaled, deltas_tica, delta_delta_g, lig_features_eq, new_populations, bi_msm, num_trajs, features, null_features, classes, agonists, antagonists, labels, X, N, C, y = compute_docking_dg(docking_cluster_averages, msm_object, samples_tica_avg_df, samples_pnas_avg_df, samples_normalized_features_avg_df, important_contact_features, traj_dir, traj_ext, tica_dir, ligands, reference_docking, clusters_map, all_features_onehot, analysis_dir)

In [None]:
lig_continuous_features = msm_reweighted_features_per_ligand(all_feature_dfs, new_populations, bi_msm, 10000, clusters_map, num_trajs, apo_populations, save_dir)

In [None]:
import detect_intermediates
reload(detect_intermediates)
from detect_intermediates import *

import efficacy_scripts
reload(efficacy_scripts)
from efficacy_scripts import *
#compare_feature_to_apo(lig_continuous_features, ["r_isopreterenol", "nebivolol", "3p0g_lig", "r_epinephrine", "s-carvedilol"], "")
ligands = ["r_isopreterenol", "3p0g_lig", "r_epinephrine", 'Ici118551', "s-carazolol", "salbutamol", "salmeterol"]
compare_feature_to_apo(lig_continuous_features, ligands, "s-carazolol", 'Asn148-Leu266_ca_dist')
plot_overall_kde(lig_continuous_features, ligands, 'Asn148-Leu266_ca_dist')

In [None]:
X_binarized = np.zeros((len(lig_features_eq.keys()), lig_features_eq[lig_features_eq.keys()[0]].shape[1]))
for i, lig in enumerate(lig_features_eq.keys()):
    print(i)
    x = lig_features_eq[lig]
    X_binarized[i,:] = x.astype(bool).sum(axis=0).values
    

In [None]:
X_onehot_df = pd.DataFrame(X_binarized, index=lig_features_eq.keys(), columns=names_onehot)

In [None]:
[n for n in names_onehot if "TYR326" in n]

In [None]:
X_onehot_df['rmsd_npxxy_active < 0.334651'].sort(inplace=False).plot(kind='barh')
plt.show()

In [None]:
#X_all_features = np.zeros((len(lig_features_eq.keys()), lig_features_eq[lig_features_eq.keys()[0]].shape[1]))
#for i, lig in enumerate(lig_features_eq.keys()):
#    x = lig_features_eq[lig].mean().values
#    X_all_features[i,:] = x
#X_all_features_df = pd.DataFrame(X_all_features, index=lig_features_eq.keys(), columns=lig_features_eq[lig].columns)
#X_all_features_df = standardize_df(X_all_features_df)

In [None]:
salt_bridge = []
for ligand in common_ligands:
    salt_bridge.append(lig_features_eq[ligand].loc[lig_features_eq[ligand]["Glu268-Arg328_dist"] < 5.].shape[0])
    

In [None]:
salt_bridge = []
for ligand in common_ligands:
    salt_bridge.append(lig_features_eq[ligand].loc[lig_features_eq[ligand]["Asn148-Leu266_dist"] > 37.5].shape[0])
pd.DataFrame(salt_bridge, index=common_ligands, columns=["dist"]).sort(columns="dist", inplace=False)

In [None]:
plt.hist(lig_features_eq["r_isopreterenol"]["Asn148-Leu266_ca_dist"].values, bins=50)
plt.show()
plt.hist(lig_features_eq["s-carazolol"]["Asn148-Leu266_ca_dist"].values, bins=50)
plt.show()
plt.hist(lig_features_eq["practolol"]["Asn148-Leu266_ca_dist"].values, bins=50)
plt.show()
plt.hist(lig_features_eq["practolol"]["tm6_tm3_dist"].values, bins=50)
plt.show()
plt.hist(lig_features_eq["r_isopreterenol"]["tm6_tm3_dist"].values, bins=50)
plt.show()
plt.hist(lig_features_eq["s-carvedilol"]["tm6_tm3_dist"].values, bins=50)
plt.show()

In [None]:
plt.scatter(standardize_df(X_onehot_df.loc[common_ligands][X_onehot_df.columns.values[-13]]).values, bret.loc[common_ligands]["B2AR-Gprotein, Mean"])
plt.show()

In [None]:
df = pd.DataFrame(compute_pearson_matrix(standardize_df(X_onehot_df.loc[common_ligands].transpose()).values.T,bret["B2AR-Gprotein, Mean"].subtract(bret["B2AR-Arrestin, Mean"]).loc[common_agonists].values.reshape((-1,1))), index=X_onehot_df.columns, columns=["correlation"]).sort("correlation", inplace=False)
print(df)

In [None]:
list(feature_name_residues_dict.keys())

In [59]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
cluster_pnas_averages

In [None]:
samples_features_avg_df

In [None]:
bret = pd.read_excel("/home/enf/b2ar_analysis/bret_bias_study2.xlsx", header=0).dropna().set_index("EvanName")
#bret = pd.read_csv("/home/enf/b2ar_analysis/bias_analysis/bret_bias_study.csv", header=0).dropna().set_index("EvanName")
common_ligands = [n for n in bret.index.values if n in delta_delta_g.columns.values]
bret["B2AR-Arrestin, Mean"].sort(inplace=False)#.subtract(bret["B2AR-Arrestin, Mean"])


In [None]:
bret

In [None]:
from sklearn.preprocessing import binarize
import efficacy_scripts
reload(efficacy_scripts)
from efficacy_scripts import *
#arrestin_antagonists = ["s-carvedilol", "nebivolol"]
#non_arrestin_antagonists = [n for n in antagonists if n not in arrestin_antagonists and n not in ["Carvedilol"]]
#y = np.array([1. for i in arrestin_antagonists] + [0. for i in non_arrestin_antagonists]).reshape((-1,1))


total_activity = bret["B2AR-Arrestin, Mean"].loc[common_ligands].add(bret["B2AR-Gprotein, Mean"].loc[common_ligands])
#common_agonists = arrestin_antagonists + non_arrestin_antagonists
#biased_ligands = ["ethylnorepinephrine", "isoetharine", "N-Cyclopentylbutanephrine"]
#non_biased_ligands =  ["r_isopreterenol", "r_epinephrine", "norepinephrine", "zinterol", "orciprenaline", "epinine", "terbutaline", "fenoterol", "procaterol", "formoterol", "salbutamol", "salmeterol"]
#y = np.array([1. for i in biased_ligands] + [0. for i in non_biased_ligands]).reshape((-1,1))
#common_agonists = biased_ligands + non_biased_ligands

common_agonists = total_activity.loc[total_activity > 0.2].index.values
y_ori = bret["B2AR-Arrestin, Mean"].loc[common_agonists].subtract(bret["B2AR-Gprotein, Mean"].loc[common_agonists]).values.reshape((-1,1))

top_clusters = delta_delta_g.index.values
#top_clusters = list(set(delta_delta_g.sort("nebivolol").index.values[:10].tolist() + delta_delta_g.sort("3p0g_lig").index.values[:10].tolist()))
#top_clusters = list(set(delta_delta_g.sort("N-Cyclopentylbutanephrine", inplace=False).index.values[:4].tolist() + delta_delta_g.sort("procaterol", inplace=False).index.values[:4].tolist()))
#agonists_df = [a for a in agonists if a in delta_delta_g.columns.values]
#common_agonists = agonists_df + antagonists
#y = np.array([1. for i in agonists_df] + [0. for i in antagonists]).reshape((-1,1))
#common_agonists = common_ligands
y_arr = bret["B2AR-Arrestin, Mean"].loc[common_agonists].values.reshape((-1,1))
y_gpr = bret["B2AR-Gprotein, Mean"].loc[common_agonists].values.reshape((-1,1))
#y_ori = y_arr - y_gpr
#y_ori = y_arr
#y = y_arr
#y = multi_binarizer(y_gpr, [0.2, 0.8])
y = multi_binarizer(y_ori, [-0.2])
print(y)

C = null_features.loc[common_agonists].values
#X = delta_delta_g.loc[top_clusters][common_agonists].values.T
X = np.hstack([delta_delta_g.loc[top_clusters][common_agonists].values.T, C*-1.0])
#X_scaled = ddg_scaled.loc[top_clusters][common_agonists].values.T
X_scaled = np.hstack([ddg_scaled.loc[top_clusters][common_agonists].values.T, C*-1.0])
#D_scaled = docking_normalized.loc[top_clusters][common_agonists].values.T
D_scaled = np.hstack([docking_normalized.loc[top_clusters][common_agonists].values.T, C*-1.0])
X_diff = np.zeros((X.shape[0], X.shape[1]**2/2))
k=0
for i in range(0,X.shape[1]):
    for j in range(i+1,X.shape[1]):
        X_diff[:,k] = X[:,i] - X[:,j]
        k+=1
X_diff = np.hstack([X_scaled, X_diff, C*-1.0])

all_features_df = pd.concat([ddg_scaled.loc[top_clusters], null_features.multiply(-1.0).transpose(), X_onehot_df.transpose()], axis=0)[common_agonists].transpose()
all_features = all_features_df.values
features = [C*-1.0, X_scaled]
features_y = [C*-1.0, X_scaled]
feature_names = ["Crystal Structures", "Docking ddG Scaled"]#, "Docking ddG and Observables"]

n_trials=1000
a_vs_g_results_t1000_single0pt2_split0pt8_logistic_cv_2f_obs = do_classification_experiment(features, y, feature_names, n_trials, 0.8, regularize=True, model="logistic_cv")
with open("%s/a_vs_g_results_t1000_single0pt2_split0pt8_logistic_cv_2f_obs.pkl" %analysis_dir, "wb") as f:
    pickle.dump(a_vs_g_results_t1000_single0pt2_split0pt8_logistic_cv_2f_obs, f)


analyze_multiclass_experiment(a_vs_g_results_t1000_single0pt2_split0pt8_logistic_cv_2f_obs, 
                              ["Crystal Structures", "MSM Docking"],
                              top_clusters.tolist() + null_features.columns.values.tolist(), common_agonists, analysis_dir,
                              ["Arrestin vs. G Protein", "G Protein Agonists"], X_scaled, 
                              exp_title="Arrestin vs. G Protein Two Class", coef_name="Logistic Coefficient")



In [None]:
all_features_df.loc["r_isopreterenol"]

In [None]:
analysis_dir = "%s_%dsamples" %(analysis_dir, n_samples)
print(analysis_dir)

In [None]:
if not os.path.exists(analysis_dir):
    os.makedirs(analysis_dir)

In [None]:
n_trials = 1000

y = multi_binarizer(y_gpr, [0.33, 0.66])
gprot_results_t1000_multi0pt33_0pt66_split0pt6_logistic_2f = do_classification_experiment(features, y, feature_names, n_trials, 0.6, regularize=False, model="logistic_cv")
with open("%s/gprot_results_t1000_multi0pt33_0pt66_split0pt6_logistic_2f.pkl" %analysis_dir, "wb") as f:
    pickle.dump(gprot_results_t1000_multi0pt33_0pt66_split0pt6_logistic_2f, f)

y = multi_binarizer(y_arr, [0.33, 0.66])
arr_results_t1000_multi0pt33_0pt66_split0pt6_logistic_2f = do_classification_experiment(features, y, feature_names, n_trials, 0.6, regularize=False, model="logistic_cv")
with open("%s/arr_results_t1000_multi0pt33_0pt66_split0pt6_logistic_2f.pkl" %analysis_dir, "wb") as f:
    pickle.dump(arr_results_t1000_multi0pt33_0pt66_split0pt6_logistic_2f, f)

print(np.median(np.array(gprot_results_t1000_multi0pt33_0pt66_split0pt6_logistic_2f["test_roc_aucs"]), axis=0))
print(np.median(np.array(arr_results_t1000_multi0pt33_0pt66_split0pt6_logistic_2f["test_roc_aucs"]), axis=0))
    
    
y = multi_binarizer(y_gpr, [0.2, 0.8])
gprot_results_t1000_multi0pt2_0pt8_split0pt6_logistic_2f = do_classification_experiment(features, y, feature_names, n_trials, 0.6, regularize=False, model="logistic_cv")
with open("%s/gprot_results_t1000_multi0pt2_0pt8_split0pt6_logistic_2f.pkl" %analysis_dir, "wb") as f:
    pickle.dump(gprot_results_t1000_multi0pt2_0pt8_split0pt6_logistic_2f, f)

y = multi_binarizer(y_arr, [0.2, 0.8])
arr_results_t1000_multi0pt2_0pt8_split0pt6_logistic_2f = do_classification_experiment(features, y, feature_names, n_trials, 0.6, regularize=False, model="logistic_cv")
with open("%s/arr_results_t1000_multi0pt2_0pt8_split0pt6_logistic_2f.pkl" %analysis_dir, "wb") as f:
    pickle.dump(arr_results_t1000_multi0pt2_0pt8_split0pt6_logistic_2f, f)

print(np.median(np.array(gprot_results_t1000_multi0pt2_0pt8_split0pt6_logistic_2f["test_roc_aucs"]), axis=0))
print(np.median(np.array(arr_results_t1000_multi0pt2_0pt8_split0pt6_logistic_2f["test_roc_aucs"]), axis=0))   

y = multi_binarizer(y_gpr, [0.5])
gprot_results_t1000_single0pt5_split0pt6_logistic_2f = do_classification_experiment(features, y, feature_names, n_trials, 0.6, regularize=False, model="logistic_cv")
with open("%s/gprot_results_t1000_single0pt5_split0pt6_logistic_2f.pkl" %analysis_dir, "wb") as f:
    pickle.dump(gprot_results_t1000_single0pt5_split0pt6_logistic_2f, f)

y = multi_binarizer(y_arr, [0.5])
arr_results_t1000_single0pt5_split0pt6_logistic_2f = do_classification_experiment(features, y, feature_names, n_trials, 0.6, regularize=False, model="logistic_cv")
with open("%s/arr_results_t1000_single0pt5_split0pt6_logistic_2f.pkl" %analysis_dir, "wb") as f:
    pickle.dump(arr_results_t1000_single0pt5_split0pt6_logistic_2f, f)
    
print(np.median(np.array(gprot_results_t1000_single0pt5_split0pt6_logistic_2f["test_roc_aucs"]), axis=0))
print(np.median(np.array(arr_results_t1000_single0pt5_split0pt6_logistic_2f["test_roc_aucs"]), axis=0))

    
y = multi_binarizer(y_gpr, [0.2])
gprot_results_t1000_single0pt2_split0pt6_logistic_2f = do_classification_experiment(features, y, feature_names, n_trials, 0.6, regularize=False, model="logistic_cv")
with open("%s/gprot_results_t1000_single0pt2_split0pt6_logistic_2f.pkl" %analysis_dir, "wb") as f:
    pickle.dump(gprot_results_t1000_single0pt2_split0pt6_logistic_2f, f)

y = multi_binarizer(y_arr, [0.2])
arr_results_t1000_single0pt2_split0pt6_logistic_2f = do_classification_experiment(features, y, feature_names, n_trials, 0.6, regularize=False, model="logistic_cv")
with open("%s/arr_results_t1000_single0pt2_split0pt6_logistic_2f.pkl" %analysis_dir, "wb") as f:
    pickle.dump(arr_results_t1000_single0pt2_split0pt6_logistic_2f, f)

print(np.median(np.array(gprot_results_t1000_single0pt2_split0pt6_logistic_2f["test_roc_aucs"]), axis=0))
print(np.median(np.array(arr_results_t1000_single0pt2_split0pt6_logistic_2f["test_roc_aucs"]), axis=0))

        
gprot_results_t1000_0pt8_ridge = do_regression_experiment(features, y_gpr, feature_names, n_trials, .8, regularize=False, model="RidgeCV")
with open("%s/gprot_results_t1000_0pt8_ridge.pkl" %analysis_dir, "wb") as f:
    pickle.dump(gprot_results_t1000_0pt8_ridge, f)

arr_results_t1000_0pt8_ridge = do_regression_experiment(features, y_arr, feature_names, n_trials, .8, regularize=False, model="RidgeCV")
with open("%s/arr_results_t1000_0pt8_ridge.pkl" %analysis_dir, "wb") as f:
    pickle.dump(arr_results_t1000_0pt8_ridge, f)

print(np.median(np.array(gprot_results_t1000_0pt8_ridge["test_r2s"]), axis=0))
print(np.median(np.array(arr_results_t1000_0pt8_ridge["test_r2s"]), axis=0))





In [None]:
import efficacy_scripts
reload(efficacy_scripts)
from efficacy_scripts import *
n_trials = 1000

y = multi_binarizer(y_gpr, [0.2, 0.8])
gprot_results_t1000_multi0pt2_split0pt8_logistic_cv_2f_obs = do_classification_experiment(features, y, feature_names, n_trials, 0.8, regularize=True, model="logistic_cv")
with open("%s/gprot_results_t1000_multi0pt2_split0pt8_logistic_cv_2f_obs.pkl" %analysis_dir, "wb") as f:
    pickle.dump(gprot_results_t1000_multi0pt2_split0pt8_logistic_cv_2f_obs, f)

y = multi_binarizer(y_arr, [0.2, 0.8])
arr_results_t1000_multi0pt2_split0pt8_logistic_cv_2f_obs = do_classification_experiment(features, y, feature_names, n_trials, 0.8, regularize=True, model="logistic_cv")
with open("%s/arr_results_t1000_multi0pt2_split0pt8_logistic_cv_2f_obs.pkl" %analysis_dir, "wb") as f:
    pickle.dump(arr_results_t1000_multi0pt2_split0pt8_logistic_cv_2f_obs, f)


y = multi_binarizer(y_gpr, [0.2])
gprot_results_t1000_single0pt2_split0pt8_logistic_cv_2f_obs = do_classification_experiment(features, y, feature_names, n_trials, 0.8, regularize=True, model="logistic_cv")
with open("%s/gprot_results_t1000_single0pt2_split0pt8_logistic_cv_2f_obs.pkl" %analysis_dir, "wb") as f:
    pickle.dump(gprot_results_t1000_single0pt2_split0pt8_logistic_cv_2f_obs, f)

y = multi_binarizer(y_arr, [0.2])
arr_results_t1000_single0pt2_split0pt8_logistic_cv_2f_obs = do_classification_experiment(features, y, feature_names, n_trials, 0.8, regularize=True, model="logistic_cv")
with open("%s/arr_results_t1000_single0pt2_split0pt8_logistic_cv_2f_obs.pkl" %analysis_dir, "wb") as f:
    pickle.dump(arr_results_t1000_single0pt2_split0pt8_logistic_cv_2f_obs, f)



In [None]:
np.median(np.array(a_vs_g_results_t1000_single0pt2_split0pt8_logistic_cv_2f_obs["test_roc_aucs"]), axis=0)


In [None]:
arr_results_t100_single0pt2_split0pt9_rfr_2f_obs["feature_importances"][0]

In [None]:
import efficacy_scripts
reload(efficacy_scripts)
from efficacy_scripts import *
analyze_multiclass_experiment(gprot_results_t100_single0pt2_split0pt9_logistic_cv_2f_obs, 
                              ["Crystal Structures", "MSM Docking"],
                              top_clusters.tolist() + null_features.columns.values.tolist(), common_agonists, analysis_dir,
                              ["GProtein Antagonists", "GProtein Agonists"], X_scaled, 
                              exp_title="GProtein Class", coef_name="Logistic Coefficient")


In [None]:
all_features_df.columns.shape

In [None]:
arr_results_t100_single0pt2_split0pt9_rfr_2f_obs["feature_importances"][0][2].shape

In [None]:
all_features_df.columns.values.tolist()

In [None]:
print(np.median(np.array(gprot_results_t1000_0pt8_ridge["test_r2s"]), axis=0))
print(np.median(np.array(arr_results_t1000_0pt8_ridge["test_r2s"]), axis=0))
print("/n")

print(np.median(np.array(gprot_results_t1000_multi0pt33_0pt66_split0pt6_logistic_2f["test_roc_aucs"]), axis=0))
print(np.median(np.array(arr_results_t1000_multi0pt33_0pt66_split0pt6_logistic_2f["test_roc_aucs"]), axis=0))
print("/n")

print(np.median(np.array(gprot_results_t1000_multi0pt2_0pt8_split0pt6_logistic_2f["test_roc_aucs"]), axis=0))
print(np.median(np.array(arr_results_t1000_multi0pt2_0pt8_split0pt6_logistic_2f["test_roc_aucs"]), axis=0))
print("/n")

print(np.median(np.array(gprot_results_t1000_single0pt5_split0pt6_logistic_2f["test_roc_aucs"]), axis=0))
print(np.median(np.array(arr_results_t1000_single0pt5_split0pt6_logistic_2f["test_roc_aucs"]), axis=0))
print("/n")

print(np.median(np.array(gprot_results_t1000_single0pt2_split0pt6_logistic_2f["test_roc_aucs"]), axis=0))
print(np.median(np.array(arr_results_t1000_single0pt2_split0pt6_logistic_2f["test_roc_aucs"]), axis=0))





In [None]:
import plots
reload(plots)
from plots import *

import efficacy_scripts
reload(efficacy_scripts)
from efficacy_scripts import *


feature_names = top_clusters.tolist() +  ["Inactive Crystal", "Active Crystal", "Crystal Difference"] 
feature_names = [s.replace("cluster", "MSM State ") for s in feature_names]
print(feature_names)

#with open("%s/gprot_results_t1000_multi0pt2_0pt8_split0pt6_logistic_2f.pkl" %analysis_dir, "rb") as f:
#    gprot_results_t1000_multi0pt2_0pt8_split0pt6_logistic_2f = pickle.load(f)
#analyze_multiclass_experiment(gprot_results_t1000_multi0pt2_0pt8_split0pt6_logistic_2f, ["Crystal Structures", "MSM Docking"], feature_names, common_agonists, analysis_dir, ["G Protein Antagonists", "G Protein Partial Agonists", "G Protein Full Agonists"], X_scaled, exp_title="GProtein Three Clas 1000 Trials", coef_name="Logistic Coefficient")


#with open("%s/arr_results_t1000_multi0pt2_0pt8_split0pt6_logistic_2f.pkl" %analysis_dir, "rb") as f:
#    arr_results_t1000_multi0pt2_0pt8_split0pt6_logistic_2f = pickle.load(f)
#analyze_multiclass_experiment(arr_results_t1000_multi0pt2_0pt8_split0pt6_logistic_2f, ["Crystal Structures", "MSM Docking"], feature_names, common_agonists, analysis_dir, ["Arrestin Antagonists", "Arrestin Partial Agonists", "Arrestin Full Agonists"], X_scaled, exp_title="Arrestin Three Class 1000 Trials", coef_name="Logistic Coefficient")

#plt.clf()
#analyze_multiclass_experiment(gprot_results_t100_single0pt2_split0pt6_logistic_2f, ["Crystal Structures", "MSM Docking"], feature_names, common_agonists, analysis_dir, ["G Protein Antagonists", "G Protein Agonists"], X_scaled, exp_title="Two Class", coef_name="Logistic Coefficient")
with open("%s/arr_results_t1000_single0pt2_split0pt6_logistic_2f.pkl" %analysis_dir, "rb") as f:
    arr_results_t1000_single0pt2_split0pt6_logistic_2f = pickle.load(f)
analyze_multiclass_experiment(arr_results_t1000_single0pt2_split0pt6_logistic_2f, ["Crystal Structures", "MSM Docking"],feature_names, common_agonists, analysis_dir, ["Arrestin Antagonists", "Arrestin Agonists"], X_scaled, exp_title="Arrestin Two Class", coef_name="Logistic Coefficient")

with open("%s/gprot_results_t1000_single0pt2_split0pt6_logistic_2f.pkl" %analysis_dir, "rb") as f:
    gprot_results_t1000_single0pt2_split0pt6_logistic_2f = pickle.load(f)
analyze_multiclass_experiment(gprot_results_t1000_single0pt2_split0pt6_logistic_2f, ["Crystal Structures", "MSM Docking"],feature_names, common_agonists, analysis_dir, ["G Protein Antagonists", "G Protein Agonists"], X_scaled, exp_title="G Protein Two Class", coef_name="Logistic Coefficient")



In [None]:
analysis_dir

In [None]:
X_scaled[:,25]

In [None]:
ddg_scaled["salbutamol"].sort(inplace=False
                             )

In [None]:
arr_results_t100_multi_0pt6_logistic = arr_results_t100_multi_0pt_6_rfr 

= arr_results_t1000_multi_0pt_6_rfr 

In [None]:
save_file = "%s/arr_results_t1000_multi_0pt6_logistic.pkl" %(analysis_dir)
with open(save_file, "wb") as f:
    pickle.dump(arr_results_t1000_multi_0pt6_logistic, f)
    

In [None]:
save_file = "%s/gprot_results_t1000_multi_0pt6_logistic.pkl" %(analysis_dir)
with open(save_file, "wb") as f:
    pickle.dump(gprot_results_t1000_multi_0pt6_logistic, f)
    

In [None]:
np.median(np.array(arr_results_t1000_0pt5_md3_ridge["test_r2s"]), axis=0)

In [None]:
np.median(np.array(arr_results_t1000_0pt5_md3_ridge["kendall_pvalues"]), axis=0)

In [None]:
np.median(np.array(arr_results_t1000_0pt5_md3_ridge["kendall_pvalues"]), axis=0)

In [None]:
np.median(np.array(arr_results_t1000_multi_0pt6_logistic["test_roc_aucs"]), axis=0)


In [None]:
plt.hist(y_gpr, bins=50)
plt.show()

In [None]:
from sklearn.preprocessing import binarize
#arrestin_antagonists = ["s-carvedilol", "nebivolol"]
#non_arrestin_antagonists = [n for n in antagonists if n not in arrestin_antagonists and n not in ["Carvedilol"]]
#y = np.array([1. for i in arrestin_antagonists] + [0. for i in non_arrestin_antagonists]).reshape((-1,1))


total_activity = bret["B2AR-Arrestin, Mean"].loc[common_ligands].add(bret["B2AR-Gprotein, Mean"].loc[common_ligands])
#common_agonists = arrestin_antagonists + non_arrestin_antagonists
#biased_ligands = ["ethylnorepinephrine", "isoetharine", "N-Cyclopentylbutanephrine"]
#non_biased_ligands =  ["r_isopreterenol", "r_epinephrine", "norepinephrine", "zinterol", "orciprenaline", "epinine", "terbutaline", "fenoterol", "procaterol", "formoterol", "salbutamol", "salmeterol"]
#y = np.array([1. for i in biased_ligands] + [0. for i in non_biased_ligands]).reshape((-1,1))
#common_agonists = biased_ligands + non_biased_ligands

#common_agonists = total_activity.loc[total_activity > 0.2].index.values
#y_ori = bret["B2AR-Arrestin, Mean"].loc[common_agonists].subtract(bret["B2AR-Gprotein, Mean"].loc[common_agonists]).values.reshape((-1,1))

top_clusters = delta_delta_g.index.values
#top_clusters = list(set(delta_delta_g.sort("nebivolol").index.values[:10].tolist() + delta_delta_g.sort("3p0g_lig").index.values[:10].tolist()))
#top_clusters = list(set(delta_delta_g.sort("N-Cyclopentylbutanephrine", inplace=False).index.values[:4].tolist() + delta_delta_g.sort("procaterol", inplace=False).index.values[:4].tolist()))
#agonists_df = [a for a in agonists if a in delta_delta_g.columns.values]
#common_agonists = agonists_df + antagonists
#y = np.array([1. for i in agonists_df] + [0. for i in antagonists]).reshape((-1,1))
common_agonists = common_ligands
y_arr = bret["B2AR-Arrestin, Mean"].loc[common_agonists].values.reshape((-1,1))
y_gpr = bret["B2AR-Gprotein, Mean"].loc[common_agonists].values.reshape((-1,1))
#y_ori = y_arr - y_gpr
#y_ori = y_arr
y = multi_binarizer(y_arr, [0.33, 0.66])
#y = binarize(y_gpr, threshold=0.2)

C = null_features.loc[common_agonists].values
#X = delta_delta_g.loc[top_clusters][common_agonists].values.T
X = np.hstack([delta_delta_g.loc[top_clusters][common_agonists].values.T, C])
#X_scaled = ddg_scaled.loc[top_clusters][common_agonists].values.T
X_scaled = np.hstack([ddg_scaled.loc[top_clusters][common_agonists].values.T, C])
#D_scaled = docking_normalized.loc[top_clusters][common_agonists].values.T
D_scaled = np.hstack([docking_normalized.loc[top_clusters][common_agonists].values.T, C])
X_diff = np.zeros((X.shape[0], X.shape[1]**2/2))
k=0
for i in range(0,X.shape[1]):
    for j in range(i+1,X.shape[1]):
        X_diff[:,k] = X[:,i] - X[:,j]
        k+=1
X_diff = np.hstack([X_scaled, X_diff, C])

features = [C, D_scaled, X, X_scaled, X_diff]
features_y = [C, D_scaled, X, X_scaled, X_diff]
feature_names = ["Crystal Structures", "Normalized Docking", "Docking ddG", "Docking ddg Scaled", "Docking ddG Differences"]

n_trials = 1000
test_accuracies, test_aucs, test_log_aucs, C_test_aucs, C_test_log_aucs, feature_importances = do_classification_experiment(features, y, feature_names, n_trials, 0.8, regularize=False)
#arr_results_t100_0pt9_md3 = do_regression_experiment(features, y, feature_names, n_trials, 0.9, regularize=False)

In [None]:
save_file = "%s/arr_results_t1000_0pt9_md3.pkl" %analysis_dir
with open(save_file, "wb") as f:
    pickle.dump(arr_results_t1000_0pt9_md3, f)

In [None]:
print(np.median(np.array(arr_results_t1000_0pt9_md3["test_r2s"]), axis=0))

In [None]:
from sklearn.preprocessing import binarize
#arrestin_antagonists = ["s-carvedilol", "nebivolol"]
#non_arrestin_antagonists = [n for n in antagonists if n not in arrestin_antagonists and n not in ["Carvedilol"]]
#y = np.array([1. for i in arrestin_antagonists] + [0. for i in non_arrestin_antagonists]).reshape((-1,1))


total_activity = bret["B2AR-Arrestin, Mean"].loc[common_ligands].add(bret["B2AR-Gprotein, Mean"].loc[common_ligands])
#common_agonists = arrestin_antagonists + non_arrestin_antagonists
#biased_ligands = ["ethylnorepinephrine", "isoetharine", "N-Cyclopentylbutanephrine"]
#non_biased_ligands =  ["r_isopreterenol", "r_epinephrine", "norepinephrine", "zinterol", "orciprenaline", "epinine", "terbutaline", "fenoterol", "procaterol", "formoterol", "salbutamol", "salmeterol"]
#y = np.array([1. for i in biased_ligands] + [0. for i in non_biased_ligands]).reshape((-1,1))
#common_agonists = biased_ligands + non_biased_ligands

common_agonists = total_activity.loc[total_activity > 0.2].index.values
#y_ori = bret["B2AR-Arrestin, Mean"].loc[common_agonists].subtract(bret["B2AR-Gprotein, Mean"].loc[common_agonists]).values.reshape((-1,1))

top_clusters = delta_delta_g.index.values
#top_clusters = list(set(delta_delta_g.sort("nebivolol").index.values[:10].tolist() + delta_delta_g.sort("3p0g_lig").index.values[:10].tolist()))
#top_clusters = list(set(delta_delta_g.sort("N-Cyclopentylbutanephrine", inplace=False).index.values[:4].tolist() + delta_delta_g.sort("procaterol", inplace=False).index.values[:4].tolist()))
#agonists_df = [a for a in agonists if a in delta_delta_g.columns.values]
#common_agonists = agonists_df + antagonists
#y = np.array([1. for i in agonists_df] + [0. for i in antagonists]).reshape((-1,1))
#common_agonists = common_ligands
y_arr = bret["B2AR-Arrestin, Mean"].loc[common_agonists].values.reshape((-1,1))
y_gpr = bret["B2AR-Gprotein, Mean"].loc[common_agonists].values.reshape((-1,1))
y_ori = y_arr - y_gpr
#y_ori = y_arr
#y = y_arr
y = binarize(y_ori, threshold=-0.2)

C = null_features.loc[common_agonists].values
#X = delta_delta_g.loc[top_clusters][common_agonists].values.T
X = np.hstack([delta_delta_g.loc[top_clusters][common_agonists].values.T, C])
#X_scaled = ddg_scaled.loc[top_clusters][common_agonists].values.T
X_scaled = np.hstack([ddg_scaled.loc[top_clusters][common_agonists].values.T, C])
#D_scaled = docking_normalized.loc[top_clusters][common_agonists].values.T
D_scaled = np.hstack([docking_normalized.loc[top_clusters][common_agonists].values.T, C])
X_diff = np.zeros((X.shape[0], X.shape[1]**2/2))
k=0
for i in range(0,X.shape[1]):
    for j in range(i+1,X.shape[1]):
        X_diff[:,k] = X[:,i] - X[:,j]
        k+=1
X_diff = np.hstack([X_scaled, X_diff, C])

features = [C, D_scaled, X, X_scaled, X_diff]
features_y = [C, D_scaled, X, X_scaled]
feature_names = ["Crystal Structures", "Normalized Docking", "Docking ddG", "Docking ddg Scaled", "Docking ddG Differences"]

n_trials = 1000
#arr_classification_results_t1000_0pt6 = do_classification_experiment(features, y, feature_names, n_trials, 0.6, regularize=False)
arr_vs_gprot_classification_results_t1000_0pt8 = do_classification_experiment(features, y, feature_names, n_trials, 0.8, regularize=False)
#arr_results_1000 = do_regression_experiment(features, y, feature_names, n_trials, 0.8, regularize=False)

In [None]:
with open("%s/arr_vs_gprot_classification_results_t100_0pt8.pkl" %analysis_dir, "wb") as f:
    pickle.dump(arr_vs_gprot_classification_results_t100_0pt8, f)

In [None]:
print(np.median(np.array(arr_vs_gprot_classification_results_t100_0pt8["test_aucs"]), axis=0))

In [None]:
plt.hist(y_ori,bins=50)
plt.show()

In [None]:

save_file = "%s/arr_classification_results_t1000_0pt6.pkl" %analysis_dir
with open(save_file, "wb") as f:
    pickle.dump(arr_classification_results_t1000_0pt6, f)

In [None]:
np.median(np.array(arr_classification_results_t1000_0pt6["test_log_aucs"]),axis=0)

In [None]:
from sklearn.preprocessing import binarize
#arrestin_antagonists = ["s-carvedilol", "nebivolol"]
#non_arrestin_antagonists = [n for n in antagonists if n not in arrestin_antagonists and n not in ["Carvedilol"]]
#y = np.array([1. for i in arrestin_antagonists] + [0. for i in non_arrestin_antagonists]).reshape((-1,1))


total_activity = bret["B2AR-Arrestin, Mean"].loc[common_ligands].add(bret["B2AR-Gprotein, Mean"].loc[common_ligands])
#common_agonists = arrestin_antagonists + non_arrestin_antagonists
#biased_ligands = ["ethylnorepinephrine", "isoetharine", "N-Cyclopentylbutanephrine"]
#non_biased_ligands =  ["r_isopreterenol", "r_epinephrine", "norepinephrine", "zinterol", "orciprenaline", "epinine", "terbutaline", "fenoterol", "procaterol", "formoterol", "salbutamol", "salmeterol"]
#y = np.array([1. for i in biased_ligands] + [0. for i in non_biased_ligands]).reshape((-1,1))
#common_agonists = biased_ligands + non_biased_ligands

#common_agonists = total_activity.loc[total_activity > 0.2].index.values
#y_ori = bret["B2AR-Arrestin, Mean"].loc[common_agonists].subtract(bret["B2AR-Gprotein, Mean"].loc[common_agonists]).values.reshape((-1,1))

top_clusters = delta_delta_g.index.values
#top_clusters = list(set(delta_delta_g.sort("nebivolol").index.values[:10].tolist() + delta_delta_g.sort("3p0g_lig").index.values[:10].tolist()))
#top_clusters = list(set(delta_delta_g.sort("N-Cyclopentylbutanephrine", inplace=False).index.values[:4].tolist() + delta_delta_g.sort("procaterol", inplace=False).index.values[:4].tolist()))
#agonists_df = [a for a in agonists if a in delta_delta_g.columns.values]
#common_agonists = agonists_df + antagonists
#y = np.array([1. for i in agonists_df] + [0. for i in antagonists]).reshape((-1,1))
common_agonists = common_ligands
y_arr = bret["B2AR-Arrestin, Mean"].loc[common_agonists].values.reshape((-1,1))
y_gpr = bret["B2AR-Gprotein, Mean"].loc[common_agonists].values.reshape((-1,1))
#y_ori = y_arr - y_gpr
y_ori = y_arr
#y = y_arr
y = binarize(y_arr, threshold=0.2)

C = null_features.loc[common_agonists].values
#X = delta_delta_g.loc[top_clusters][common_agonists].values.T
X = np.hstack([delta_delta_g.loc[top_clusters][common_agonists].values.T, C])
#X_scaled = ddg_scaled.loc[top_clusters][common_agonists].values.T
X_scaled = np.hstack([ddg_scaled.loc[top_clusters][common_agonists].values.T, C])
#D_scaled = docking_normalized.loc[top_clusters][common_agonists].values.T
D_scaled = np.hstack([docking_normalized.loc[top_clusters][common_agonists].values.T, C])
X_diff = np.zeros((X.shape[0], X.shape[1]**2/2))
k=0
for i in range(0,X.shape[1]):
    for j in range(i+1,X.shape[1]):
        X_diff[:,k] = X[:,i] - X[:,j]
        k+=1
X_diff = np.hstack([X_scaled, X_diff, C])

features = [C, D_scaled, X, X_scaled, X_diff]
features_y = [C, D_scaled, X, X_scaled]
feature_names = ["Crystal Structures", "Normalized Docking", "Docking ddG", "Docking ddg Scaled", "Docking ddG Differences"]

n_trials = 1000
test_accuracies, test_aucs, test_log_aucs, C_test_aucs, C_test_log_aucs, feature_importances = do_classification_experiment(features, y, feature_names, n_trials, 0.8, regularize=False)
#arr_auc_results = do_regression_experiment(features, y, feature_names, n_trials, 0.8, regularize=False)

In [None]:
np.median(np.array(test_log_aucs),axis=0)

In [None]:
np.median(np.array(gprot_results["test_r2s"]),axis=0)

In [None]:
gprot_results["test_r2s"]

In [None]:
features_y = copy.deepcopy(features)
features_y.append(y)
train_test_arrays = train_test_split(*features_y, train_size=0.8)
print([a.shape for a in train_test_arrays])

In [None]:
train_test_arrays = train_test_split(*features_y, train_size=0.8)
print([a.shape for a in train_test_arrays])

In [None]:
gprot_results = copy.deepcopy([test_accuracies, test_aucs, test_log_aucs, C_test_aucs, C_test_log_aucs, feature_importances])
#import pickle
gprot_file = "%s/gprot_results_0pt2_XP_no-regularization.pkl"
with open("%s/gprot_results_0pt2_XP_no-regularization.pkl", "wb") as f:
    pickle.dump(gprot_results, f)
with open(gprot_file) as f:
    test_accuracies, test_aucs, test_log_aucs, C_test_aucs, C_test_log_aucs, feature_importances = pickle.load(f)

In [None]:
#arrestin_results = copy.deepcopy([test_accuracies, test_aucs, test_log_aucs, C_test_aucs, C_test_log_aucs, feature_importances])
#import pickle
arrestin_file = "%s/arrestin_results_0pt2_XP_no-regularization.pkl"
#with open("%s/arrestin_results_0pt2_XP_no-regularization.pkl", "wb") as f:
#    pickle.dump(arrestin_results, f)
with open(arrestin_file) as f:
    test_accuracies, test_aucs, test_log_aucs, C_test_aucs, C_test_log_aucs, feature_importances = pickle.load(f)

In [None]:
#arrestin_rfr_results = copy.deepcopy([test_r2s, rfr_feature_importances])
arrestin_rfr_file = "%s/arrestin_results_rfr_XP_no-regularization.pkl"
#with open("%s/arrestin_results_rfr_XP_no-regularization.pkl", "wb") as f:
#    pickle.dump(arrestin_rfr_results, f)
with open(arrestin_rfr_file) as f:
    test_r2s, rfr_feature_importances = pickle.load(f)

In [None]:
#gprot_rfr_results = copy.deepcopy([test_r2s, rfr_feature_importances])
gprot_rfr_file = "%s/gprot_results_rfr_XP_no-regularization.pkl"
#with open("%s/gprot_results_rfr_XP_no-regularization.pkl", "wb") as f:
#    pickle.dump(gprot_rfr_results, f)
with open(gprot_rfr_file) as f:
    test_r2s, rfr_feature_importances = pickle.load(f)

In [None]:
test_aucs = np.array(test_aucs)
plt.clf()
plt.boxplot(test_aucs[:,2]-test_aucs[:,0])
plt.show()

In [None]:
test_log_aucs = np.array(test_log_aucs)
np.percentile(test_aucs[:,2]-test_aucs[:,0], 50)

In [None]:
import sklearn
null_preds = binarize(C[:,2])
sklearn.metrics.roc_auc_score(binarize(y, 0.2).ravel(), null_preds.ravel())

In [None]:
test_aucs = np.array(test_aucs)
print(np.median(test_aucs, axis=0))
print(np.median(test_aucs[:,2] - test_aucs[:,0]))

In [None]:
n_successes = len(np.where(test_aucs[:,2]-test_aucs[:,0] > 0.)[0])
nobs = test_aucs.shape[0]
statsmodels.stats.proportion.proportion_confint(count=n_successes, nobs=nobs, alpha=0.01, method='wilson')


In [None]:
np.median(np.array(test_r2s),axis=0)

In [None]:
arrestin_vs_gprot_results = copy.deepcopy([test_accuracies, test_aucs, test_log_aucs, C_test_aucs, C_test_log_aucs, feature_importances])
import pickle
with open("%s/arrestin_vs_gprot_results_-0pt2_XP.pkl", "wb") as f:
    pickle.dump(arrestin_vs_gprot_results, f)

In [None]:
arrestin_vs_gprot_rfr_results = copy.deepcopy([test_r2s, rfr_feature_importances])
import pickle
with open("%s/arrestin_vs_gprot_rfr_results_XP.pkl", "wb") as f:
    pickle.dump(arrestin_vs_gprot_rfr_results, f)

In [None]:
test_accuracies, test_aucs, test_log_aucs, C_test_aucs, C_test_log_aucs, feature_importances = arrestin_vs_gprot_results

In [None]:
#gprotein_results = copy.deepcopy([test_accuracies, test_aucs, test_log_aucs, C_test_aucs, C_test_log_aucs, feature_importances])

In [None]:
results_dict = gprot_results_t1000_multi_0pt6_logistic
import efficacy_scripts
reload(efficacy_scripts)
from efficacy_scripts import *
importances_df, results_df = analyze_classification_experiment(results_dict["test_roc_aucs"],
                                               results_dict["feature_importances"],
                                               feature_names,
                                               X_scaled, y, pd.concat([ddg_scaled, null_features.transpose()], axis=0),
                                               top_clusters.tolist() + null_features.columns.values.tolist(), 
                                               common_agonists, 
                                                "Predicting G Protein Three Class, 1000 trials, 60:40 Split, XP", analysis_dir)

In [None]:
feature_names

In [None]:
importances_df, results_df = analyze_experiment(test_aucs, test_log_aucs, feature_importances, feature_names,
                        X_scaled, y, pd.concat([ddg_scaled, null_features.transpose()], axis=0) , top_clusters.tolist() + null_features.columns.values.tolist(), common_agonists, "Predicting Arrestin, w Crystal Features, 0.2, SP", analysis_dir)

In [None]:
importances_df, results_df = analyze_regression_experiment(test_r2s, rfr_feature_importances, feature_names,
                        X, y, pd.concat([delta_delta_g, null_features.transpose()], axis=0) , top_clusters.tolist() + null_features.columns.values.tolist(), common_agonists, "Predicting Arrestin, RFR, w Crystal Features, SP", analysis_dir)

In [None]:
def get_top_measurable_features(samples_normalized_features_avg_df, cluster_name):
    import re
    top_features_cluster = []
    #top_features_cluster = samples_normalized_features_avg_df.loc["cluster_name"].loc[samples_normalized_features_avg_df.loc["cluster_name"].abs() > .75].index.values
    #print(top_features_cluster)
    #print(len(top_features_cluster))
    [top_features_cluster.append(pair) for pair in samples_normalized_features_avg_df.loc[cluster_name].abs().sort(inplace=False, ascending=False).index.values[:10]]
    all_features = []
    features = []
    for f in top_features_cluster:
        fs = f.split(",")
        for i in range(0, len(fs)):
            res = int(re.findall(r'\d+', fs[i])[0])
            all_features.append(res)
            if "TRP" in fs[i] or "CYS" in fs[i] or "TYR" in fs[i] or "LYS" in fs[i]:
                features.append(res)
            
    top_features_cluster = sorted(list(set(features)))
    #print(sorted(list(set([int(re.findall(r'\d+', r)[0]) for r in all_features]))))
    #print(top_features_cluster)
    return top_features_cluster, all_features
a, b = get_top_measurable_features(samples_normalized_features_averages_df, 6)

In [None]:
print(b)

In [None]:
ddg_scaled[["norepinephrine", "r_epinephrine", "ethylnorepinephrine", "nebivolol", "s-carvedilol", "s-carazolol"]]

In [None]:
print(np.mean(test_aucs, axis=0))
print(np.median(test_aucs, axis=0))
print(np.mean(test_log_aucs, axis=0))
print(np.median(test_log_aucs, axis=0))



In [None]:
plt.scatter(ddg_scaled.loc["cluster2"][common_agonists], y)

In [None]:
y_ori = bret["B2AR-Gprotein, Mean"].loc[common_agonists].values.reshape((-1,1))
y = binarize(y_ori, threshold=0.05)

X = delta_delta_g.loc[top_clusters][common_agonists].values.T
X_scaled = ddg_scaled.loc[top_clusters][common_agonists].values.T
C = null_features.loc[common_agonists].values
D_scaled = docking_normalized.loc[top_clusters][common_agonists].values.T

X_train = D_scaled
y_train = y
f = np.zeros(X_train.shape[1])

rfr = RandomForestClassifier(n_estimators=1000, max_features='sqrt', n_jobs=-1, oob_score=True)
rfr.fit(X_train, y_train)

#top_indices = np.argsort(rfr.feature_importances_*-1.)[:min(20, X_train.shape[1])]
#rfr = RandomForestClassifier(n_estimators=100, max_features='sqrt', n_jobs=-1, oob_score=True)
#X_train = X_train[:, top_indices]
#rfr.fit(X_train, y_train)
#f[top_indices] = rfr.feature_importances_
#y_pred = rfr.predict(X_train)
#y_score = rfr.predict_proba(X_train)
top_indices=range(0,100)

In [None]:
top_indices

In [None]:
X_scaled.shape

In [None]:
#G Protein, Agonist Results
test_drugs = secret_compounds + ["nebivolol", "s-carvedilol", "xamoterol", "3p0g_lig", "isoetharine", "ethylnorepinephrine", "N-Cyclopentylbutanephrine", "ta-2005", "procaterol"]
X_test = docking_normalized.transpose().loc[test_drugs].values[:, top_indices]
print(X_test)
pd.DataFrame(rfr.predict_proba(X_test), index=test_drugs, columns=["P(Antagonist", "P(Agonist)"])

In [None]:
#ARRESTIN, Agonist Results
X_test = docking_normalized.transpose().loc[test_drugs].values[:, top_indices]
X_test.shape
pd.DataFrame(rfr.predict_proba(X_test), index=test_drugs, columns=["P(Antagonist", "P(Agonist)"])

In [None]:
#G Protein, Full Agonist Results
test_drugs = secret_compounds + ["nebivolol", "s-carvedilol", "xamoterol", "3p0g_lig", "isoetharine", "ethylnorepinephrine", "N-Cyclopentylbutanephrine"]
X_test = docking_normalized.transpose().loc[test_drugs].values[:, top_indices]
pd.DataFrame(rfr.predict_proba(X_test), index=test_drugs, columns=["P(Antagonist", "P(Agonist)"])

In [None]:
#ARRESTIN, Agonist Results
X_test = docking_normalized.transpose().loc[test_drugs].values[:, top_indices]
X_test.shape
pd.DataFrame(rfr.predict_proba(X_test), index=test_drugs, columns=["P(Antagonist", "P(Agonist)"])

In [None]:
common_agonists = total_activity.loc[total_activity > 0.2].index.values
plt.scatter(docking_normalized.loc["cluster21"][common_agonists], bret["B2AR-Gprotein, Mean"].subtract(bret["B2AR-Arrestin, Mean"])[common_agonists])

In [None]:
len(common_ligands)

In [None]:
#ARRESTIN, Partial Agonist Results
X_test = docking_normalized.transpose().loc[test_drugs].values[:, top_indices]
X_test.shape
pd.DataFrame(rfr.predict_proba(X_test), index=test_drugs, columns=["P(Antagonist", "P(Agonist)"])

In [None]:
#ARRESTIN, Full Agonist Results
X_test = docking_normalized.transpose().loc[test_drugs].values[:, top_indices]
X_test.shape
pd.DataFrame(rfr.predict_proba(X_test), index=test_drugs, columns=["P(Antagonist", "P(Agonist)"])

In [None]:
top_indices

In [None]:
ddg_scaled.transpose().loc[secret_compounds]

In [None]:
plot_clustermap(docking_normalized[secret_compounds].iloc[top_indices].transpose(), save_file="%s/mehrdad_clustermap.pdf" %(save_dir), method='average', z_score=None)



In [None]:
np.median(test_aucs, axis=0)

In [None]:
plt.scatter(ddg_scaled.loc["cluster11"][common_agonists], y)



In [None]:
arrestin_top = [16, 80, 43, 21, 84, 38, 44, 6, 13, 99]
gprot_top = [44, 6, 83, 4, 76, 99, 62, 92, 39, 80]

arrestin_only = sorted(list(set(arrestin_top).difference(set(gprot_top))))
print(arrestin_only)
gprot_only = sorted(list(set(gprot_top).difference(set(arrestin_top))))
print(gprot_only)
both = sorted(list(set(arrestin_top).intersection(set(gprot_top))))
print(both)


In [None]:
samples_pnas_tica.loc[importances_df.index.values.tolist()[:5]]

In [None]:
importances_df

In [None]:
import sklearn
reload(sklearn)
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import binarize
from sklearn import cross_validation

test_accuracies = []
test_aucs = []
test_log_aucs = []
C_test_aucs = []
C_test_log_aucs = []
n_trials = 10
feature_importances = []
total_activity = bret["B2AR-Arrestin, Mean"].loc[common_ligands].add(bret["B2AR-Gprotein, Mean"].loc[common_ligands])
common_agonists = total_activity.loc[total_activity > 0.1].index.values

#biased_ligands = ["ethylnorepinephrine", "isoetharine", "N-Cyclopentylbutanephrine"]
#biased_ligands += ["nebivolol", "s-carvedilol"]
#non_biased_ligands =  ["r_isopreterenol", "r_epinephrine", "norepinephrine", "zinterol", "orciprenaline", "epinine", "terbutaline", "fenoterol", "procaterol", "formoterol", "salbutamol", "salmeterol"]
#non_biased_ligands += ["s-carazolol", "Ici215001", "bisoprolol", "timolol", "s-atenolol"]
#non_biased_ligands = [n for n in df.columns.values.tolist() if n not in biased_ligands and "Carvedilol" not in n]
#common_agonists = biased_ligands + non_biased_ligands
#common_agonists = common_ligands
#top_clusters = ["cluster80", "cluster62", "cluster11", "cluster21", "cluster16", "cluster43", "cluster38"]
#differences = np.zeros((len(top_clusters), len(top_clusters)))
#for i, cluster in enumerate(top_clusters):
#    for j in range(i, len(top_clusters)):
#        differences[i][j] = 

#top_clusters = importances_df.index.values
top_clusters = delta_delta_g.index.values

y_ori = bret["B2AR-Arrestin, Mean"].loc[common_agonists].subtract(bret["B2AR-Gprotein, Mean"].loc[common_agonists]).values.reshape((-1,1))
#y_ori = np.vstack([y_ori, np.ones(3).reshape((-1,1))])
#common_agonists = common_agonists.tolist() + biased_ligands

X = delta_delta_g.loc[top_clusters][common_agonists].values.T
X_scaled = ddg_scaled.loc[top_clusters][common_agonists].values.T
C = null_features.loc[common_agonists].values
D_scaled = docking_normalized.loc[top_clusters][common_agonists].values.T

#y = np.array([1. for n in biased_ligands] + [0. for n in non_biased_ligands]).reshape((-1,1))
#print(y_ori)
#y_ori = bret["B2AR-Arrestin, Mean"].loc[common_agonists].values.reshape((-1,1))
#plt.hist(y_ori, bins=25)
y = binarize(y_ori, threshold=-0.2)

features = [C, X_scaled, D_scaled]
features_y = [C, X_scaled, D_scaled, y]
feature_names = ["Crystal Structures", "MSM ddG", "Docking"]

In [None]:
xt = ddg_scaled[biased_ligands].values.T
xt_preds = []


for j in range(0,n_trials):
    print(j)
    aucs = []
    log_aucs = []
    train_test_arrays = sklearn.cross_validation.train_test_split(*features_y, train_size=0.8, stratify=y) 
    y_train = train_test_arrays[2*len(features)]
    y_test = train_test_arrays[2*len(features) + 1]
    feature_importance = []
    
    for i in range(0, len(features)):
        X_train = train_test_arrays[2*i]
        X_test = train_test_arrays[2*i+1]

        sc = StandardScaler()
        sc.fit(X_train)
        X_train = sc.transform(X_train)
        X_test = sc.transform(X_test)

        rfr = RandomForestClassifier(n_estimators=100, max_features='sqrt', max_depth=3, n_jobs=-1, oob_score=True)
        rfr.fit(X_train, y_train)
        #top_indices = np.argsort(rfr.feature_importances_*-1.)[:min(10, X.shape[1])]
        feature_importance.append(rfr.feature_importances_)
        #rfr = RandomForestClassifier(n_estimators=10, max_features=None, n_jobs=-1, oob_score=True)
        #X_train = X_train[:, top_indices]
        #X_test = X_test[:, top_indices]
        #rfr.fit(X_train, y_train)
        #f = np.zeros(X.shape[1])
        #f[top_indices] = rfr.feature_importances_
        #feature_importance.append(f)
        
        if i == 1:
            xt_preds.append(rfr.predict(xt))
        
        y_pred = rfr.predict(X_test)
        y_score = rfr.predict_proba(X_test)
        auc, logauc = compute_auc(y_test, y_score)
        aucs.append(auc)
        log_aucs.append(logauc)  
    feature_importances.append(feature_importance)
    test_aucs.append(aucs)
    test_log_aucs.append(log_aucs)

In [None]:
biased_ligands = ["ethylnorepinephrine", "isoetharine", "N-Cyclopentylbutanephrine"]

non_biased_ligands =  ["r_isopreterenol", "r_epinephrine", "norepinephrine", "zinterol", "orciprenaline", "epinine", "terbutaline", "fenoterol", "procaterol", "formoterol", "salbutamol", "salmeterol"]

ddg_scaled.loc[importances_df.index.values[:5]][biased_ligands + non_biased_ligands]

In [None]:
ddg_scaled.sort("procaterol", inplace=False).iloc[:10]

In [None]:
plt.scatter(ddg_scaled.loc["cluster36"][common_agonists], y)

In [None]:
y

In [None]:
import sklearn
reload(sklearn)
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import binarize
from sklearn import cross_validation

test_accuracies = []
test_aucs = []
test_log_aucs = []
C_test_aucs = []
C_test_log_aucs = []
n_trials = 1000
feature_importances = []
reg = []
total_activity = bret["B2AR-Arrestin, Mean"].loc[common_ligands].add(bret["B2AR-Gprotein, Mean"].loc[common_ligands])
common_agonists = total_activity.loc[total_activity > 0.3].index.values

#biased_ligands = ["ethylnorepinephrine", "isoetharine", "N-Cyclopentylbutanephrine", "3p0g_lig"]
#biased_ligands = ["nebivolol", "s-carvedilol"]
#non_biased_ligands =  ["r_isopreterenol", "r_epinephrine", "norepinephrine", "zinterol", "orciprenaline", "epinine", "terbutaline", "fenoterol", "procaterol", "formoterol", "salbutamol", "salmeterol"]
#non_biased_ligands = ["s-carazolol", "Ici215001", "bisoprolol", "timolol", "s-atenolol"]
#non_biased_ligands = [n for n in df.columns.values.tolist() if n not in biased_ligands and "Carvedilol" not in n]
#common_agonists = biased_ligands + non_biased_ligands

X = delta_delta_g[common_agonists].values.T
X_scaled = ddg_scaled[common_agonists].values.T
C = null_features.loc[common_agonists].values

#y = np.array([1. for n in biased_ligands] + [0. for n in non_biased_ligands]).reshape((-1,1))
#y_ori = bret["B2AR-Arrestin, Mean"].loc[common_ligands].divide(bret["B2AR-Arrestin, Mean"].loc[common_agonists].add(bret["B2AR-Gprotein, Mean"].loc[common_agonists])).values.reshape((-1,1))
y_ori = bret["B2AR-Arrestin, Mean"].loc[common_agonists].values.reshape((-1,1))
plt.hist(y_ori, bins=25)
y = binarize(y_ori, threshold=0.2) 

features = [C, X, X_scaled, docking_normalized[common_agonists].values.T]
features_y = [C, X, X_scaled, docking_normalized[common_agonists].values.T, y]
feature_names = ["Crystal Structures", "MSM States", "Normalized MSM States", "Normalized Docking"]
  

for j in range(0,n_trials):
    print(j)
    aucs = []
    log_aucs = []
    train_test_arrays = sklearn.cross_validation.train_test_split(*features_y, train_size=0.8, stratify=y) 
    y_train = train_test_arrays[2*len(features)]
    y_test = train_test_arrays[2*len(features) + 1]
    feature_importance = []
    r = []
    
    for i in range(0, len(features)):
        X_train = train_test_arrays[2*i]
        X_test = train_test_arrays[2*i+1]

        sc = StandardScaler()
        sc.fit(X_train)
        X_train = sc.transform(X_train)
        X_test = sc.transform(X_test)
        
        cs = np.logspace(-3., 20.)
        rfr = LogisticRegressionCV(Cs=cs, penalty='l2')
        rfr.fit(X_train, y_train)
        feature_importance.append(rfr.coef_)
        y_pred = rfr.predict(X_test)
        y_score = rfr.predict_proba(X_test)
        auc, logauc = compute_auc(y_test, y_score)
        aucs.append(auc)
        log_aucs.append(logauc)  
        r.append(rfr.C_)
    reg.append(r)
    feature_importances.append(feature_importance)
    test_aucs.append(aucs)
    test_log_aucs.append(log_aucs)

In [None]:
plt.scatter(docking_normalized.loc["cluster80"][common_ligands], -1.0*bret.loc[common_ligands]["B2AR-Gprotein, Mean"].subtract(bret.loc[common_ligands]["B2AR-Arrestin, Mean"]))

In [None]:
auc_df = pd.DataFrame(np.array(test_aucs), columns=feature_names)
auc_df.plot(kind='box')

In [None]:
auc_df.median(axis=0)

In [None]:
normalized_docking_importances = [f[1] for f in feature_importances]

In [None]:
importances_df = make_importances_df(normalized_docking_importances, top_clusters)
importances_df


In [None]:
from sklearn.preprocessing import binarize
X = ddg_scaled[common_ligands].values.T
y = bret["B2AR-Gprotein, Mean"].loc[common_ligands].values.reshape((-1,1))
y = binarize(y, threshold=0.5)
print(np.shape(y))
from sklearn.svm import l1_min_c
from sklearn import linear_model

#cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3)
cs = np.logspace(-3., 20.)
print("Computing regularization path ...")
clf = linear_model.LogisticRegression(C=1.0, penalty='l2', tol=1e-6)
coefs_ = []
for c in cs:
    clf.set_params(C=c)
    clf.fit(X, y)
    coefs_.append(clf.coef_.ravel().copy())

coefs_ = pd.DataFrame(np.array(coefs_), columns=ddg_scaled.index.values, index=np.log10(cs))
#coefs_[list(set(inactive_clusters.tolist()).intersection(set(importances_df.iloc[10:20].index.values.tolist())))].plot()
coefs_[importances_df.index.values[:5]].plot()
#plt.plot(np.log10(cs), coefs_)
#ymin, ymax = plt.ylim()
###plt.xlabel('log(C)')
#plt.ylabel('Coefficients')
##plt.title('Logistic Regression Path')
#plt.axis('tight')
#plt.show()

In [None]:
samples_pnas_tica.loc[importances_df.index.values[:10]]

In [None]:
plt.scatter(deltas_tica.loc[importances_df.index.values[:10]]["tIC.6"], coefs_[importances_df.index.values[:10]].values[49])

In [None]:
plot_clustermap(ddg_scaled[common_agonists.tolist()].loc[importances_df.index.values.tolist()[:5]].transpose(), save_file="%s/msm_n-clusters%d_lag-time%d_tICs%d.pdf" %(tica_dir, n_clusters, msm_lag_time, n_components), method='average')



In [None]:
samples_pnas_tica.loc[samples_pnas_tica["tm6_tm3_dist"] < 18.0].loc[importances_df.iloc[0:5].index].dropna()

In [None]:
ddg_scaled["nebivolol"].subtract(ddg_scaled["s-carazolol"]).sort(inplace=False).iloc[:10]

In [None]:
ddg_scaled["s-carvedilol"].subtract(ddg_scaled["s-carazolol"]).sort(inplace=False).iloc[:10]

In [None]:
samples_pnas_tica.loc[["cluster74", "cluster69", "cluster13", "cluster12", "cluster66"]]

In [None]:
import plots
reload(plots)
from plots import *
#plot_importances_barh(importances_df.values, importances_df.index.values, "MSM State Importance in Arrestin Prediction", "Feature Importance", "MSM State", "%s/arrestin_0pt5_classification_rfr.pdf" %(tica_dir), n_features=50)
importances_df.iloc[0:25].plot(kind='barh')
plt.xlabel("Feature Importance")
plt.ylabel("MSM State")
plt.title("Importance of MSM States in Predicting Arrestin Activity")
plt.savefig("%s/msm_%dstates_arrestin_0pt2_agonists_only_classification_rfr.pdf" %(tica_dir, n_clusters))
#plt.clf()


In [None]:
X = docking_normalized[common_agonists].values.T
y = bret["B2AR-Arrestin, Mean"].subtract(bret["B2AR-Gprotein, Mean"]).loc[common_agonists].values.reshape((-1,1))
y = binarize(y, threshold=-0.2)
print(y)
from sklearn import linear_model

cs = np.logspace(-3., 200.)
clf = linear_model.LogisticRegression(C=1.0, penalty='l2', tol=1e-6)
coefs_ = []
for c in cs:
    clf.set_params(C=c)
    clf.fit(X, y)
    coefs_.append(clf.coef_.ravel().copy())

coefs_ = pd.DataFrame(np.array(coefs_), columns=ddg_scaled.index.values, index=np.log10(cs))
coefs_[importances_df.iloc[0:10].loc[samples_pnas_tica["tm6_tm3_dist"] < 9.].index].plot()
plt.xlabel("Log Regularization Parameter")
plt.ylabel("Coefficient for Arrestin Activity")
plt.title("Logistic Regression Coefficient in Predicting Arrestin Activity")
plt.savefig("%s/msm_%dstates_arrestin_0pt2_agonists_only_classification_logistic.pdf" %(tica_dir, n_clusters))
#plt.clf()

In [None]:
np.shape(coefs_)

In [None]:
print(np.median(np.nan_to_num(test_aucs)))
print(np.median(np.nan_to_num(C_test_aucs)))
print(np.median(np.nan_to_num(test_log_aucs)))
print(np.median(np.nan_to_num(C_test_log_aucs)))

In [None]:
lr = LinearRegression()
states = importances_df.index.values.tolist()
model = lr.fit(X, y_ori)
pd.DataFrame(model.coef_.T, index=delta_delta_g.index, columns=["importance"]).loc[states]#.sort("importance", inplace=False)

In [None]:
model.coef_.shape

In [None]:
import seaborn
reload(seaborn)
import seaborn as sns
plt.style.use('ggplot')
plt.figure(figsize=(5, 5))
sns.set_style("darkgrid")
g = (auc_df
    .pipe((sns.boxplot, 'data'), orient='v', showfliers=True))
g.set_xticklabels(auc_df.columns.values, rotation=90)
sns.despine()
plt.title("AUC for Arrestin Prediction")
plt.ylabel("Frequency AUCs over Random Splits")
plt.xlabel("Featurization")
plt.show()
plt.savefig("%s/auc_arrestin_prediction_all_ligands_0pt2_cutoff.pdf" %tica_dir)

In [None]:
corr_matrix = compute_pearson_matrix(delta_delta_g[common_agonists].values.T, y)
corr_df = pd.DataFrame(model.coef_.T, index=delta_delta_g.index.values, columns=["Correlation"]).sort("Correlation",inplace=False)
#corr_df.loc[["cluster80", "cluster16", "cluster43", "cluster44"]].plot(kind='barh')
corr_df.loc[importances_df.index.values[:20]].sort("Correlation", inplace=False).plot(kind='barh')#, figsize=(5,20))
plt.xlabel("Pearsson Correlation with Arrestin Activity")
plt.ylabel("MSM State")
plt.title("Correlation of MSM States with Arrestin Activity")


In [None]:
samples_pnas_tica.loc[corr_df.loc[importances_df.index.values[:20]].sort("Correlation", inplace=False).index.values]

In [None]:
df = copy.deepcopy(aggregate_docking_msm)
df[df.columns.values] = scale(df.values)
plt.scatter(df[common_ligands].loc["cluster13"].values, bret["B2AR-Arrestin, Mean"].loc[common_ligands].values.T)

In [None]:
bret

In [None]:
import sklearn
reload(sklearn)
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import binarize
from sklearn import cross_validation

test_accuracies = []
test_aucs = []
test_log_aucs = []
C_test_aucs = []
C_test_log_aucs = []
n_trials = 100
feature_importances = []

for j in range(0,n_trials):
    print(j)

    X = delta_delta_g[common_ligands].values.T
    C = null_features.loc[common_ligands].values
    y = bret["B2AR-Arrestin, Mean"].loc[common_ligands].values.reshape((-1,1))
    y = binarize(y, threshold=0.5)

    X_train, X_test, y_train, y_test, C_train, C_test = sklearn.cross_validation.train_test_split(X, y, C, train_size=0.8, stratify=y)
    
    sc = StandardScaler()
    sc.fit(X_train)
    X_train = sc.transform(X_train)
    X_test = sc.transform(X_test)

    sc = StandardScaler()
    sc.fit(C_train)
    C_train = sc.transform(C_train)
    C_test = sc.transform(C_test)
 
    rfr = RandomForestClassifier(n_estimators=100, max_features='sqrt', max_depth=2, n_jobs=-1, oob_score=True)
    rfr.fit(X_train, y_train)
    feature_importances.append(rfr.feature_importances_)
    y_pred = rfr.predict(X_test)
    test_accuracies.append(np.sqrt(np.mean(np.square(y_test-y_pred.reshape((-1,1))))))
    y_score = rfr.predict_proba(X_test)
    auc, logauc = compute_auc(y_test, y_score)
    test_aucs.append(auc)
    test_log_aucs.append(logauc)
    
    rfr = RandomForestClassifier(n_estimators=100, max_features='sqrt', max_depth=2, n_jobs=-1, oob_score=True)
    rfr.fit(C_train, y_train)
    C_y_pred = rfr.predict(C_test)
    y_score = rfr.predict_proba(C_test)
    auc, logauc = compute_auc(y_test, y_score)
    C_test_aucs.append(auc)
    C_test_log_aucs.append(logauc)

    
    

    

In [None]:
import seaborn
reload(seaborn)
import seaborn as sns
auc_df = pd.DataFrame(np.vstack([test_aucs, C_test_aucs]).T, columns=["MSM States", "Crystal Structures"])

plt.style.use('ggplot')
plt.figure(figsize=(5, 5))
sns.set_style("darkgrid")
g = (auc_df
    .pipe((sns.boxplot, 'data'), orient='v', showfliers=True))
#g.set_xticklabels(experiments.columns.values, rotation=90)
sns.despine()
plt.title("AUC for G Protein Prediction")
plt.ylabel("Frequency AUCs over Random Splits")
plt.xlabel("Featurization")
plt.show()
plt.savefig("%s/msm_n-states%d_auc_gprot_prediction_cutoff0pt5.pdf" %(tica_dir, n_clusters))

In [None]:
importances_df = make_importances_df(feature_importances, delta_delta_g.index.values.tolist())
importances_df



In [None]:
import sklearn
from sklearn.linear_model import LinearRegression, Lasso
model = Lasso(alpha=0.0001)                                
model.fit(ddg_scaled[common_agonists].values.T, bret.loc[common_agonists]["B2AR-Arrestin, Mean"].subtract(bret.loc[common_agonists]["B2AR-Gprotein, Mean"].values))
pd.DataFrame(model.coef_, index=ddg_scaled.index, columns=["importance"]).sort("importance", inplace=False)

In [None]:
print(np.median(np.nan_to_num(test_aucs)))
print(np.median(np.nan_to_num(C_test_aucs)))
print(np.median(np.nan_to_num(test_log_aucs)))
print(np.median(np.nan_to_num(C_test_log_aucs)))

In [None]:
import plots
reload(plots)
from plots import *
#plot_importances_barh(importances_df.values, importances_df.index.values, "MSM State Importance in Arrestin Prediction", "Feature Importance", "MSM State", "%s/arrestin_0pt5_classification_rfr.pdf" %(tica_dir), n_features=50)
importances_df.iloc[0:25].plot(kind='barh')
plt.xlabel("Feature Importance")
plt.ylabel("MSM State")
plt.title("Importance of MSM States in Predicting G Protein Activity")
#plt.savefig("%s/msm_%dstates_gprot_0pt5_classification_rfr.pdf" %(tica_dir, n_clusters))
#plt.clf()



In [None]:
corr_matrix = compute_pearson_matrix(ddg_scaled[common_ligands].values.T, y)
corr_df = pd.DataFrame(model.coef_, index=ddg_scaled.index.values, columns=["Correlation"])
corr_df.loc[importances_df.index.values[:10]].sort("Correlation", inplace=False).plot(kind='barh')
plt.xlabel("Pearsson Correlation with G Protein Activity")
plt.ylabel("MSM State")
plt.title("Correlation of MSM States with G Protein Activity")
#plt.savefig("%s/msm_%dstates_gprot_0pt5_classification_correlations.pdf" %(tica_dir, n_clusters))
#plt.clf()




In [None]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)