AI/ML-directed high-throughput virtual screening of probes that selectively bind PFOS, with SDS as the interferent

Siva Dasetty, Max Topel | Ferguson Lab


Imports

In [46]:
import os
import os.path
import sys
import importlib
import yaml

import subprocess

from time import time
from datetime import datetime

import numpy as np
import pandas as pd
import math

import scipy

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Draw.MolDrawing import MolDrawing, DrawingOptions
from rdkit.Chem import Draw
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem import Descriptors
from rdkit.Chem.Fragments import *


Read SI tables for your analysis

In [54]:
#### Load the per-cycle tables of binding free energies and binding constants

simdir = '../Data/JCED_data_each_cycle/'

# Labels applied to every table; header=0 below means the first non-comment
# row of each file is treated as a header and replaced by these names.
column_names = ['SMILES', 'MW', 'Sensitivity', 'Sensitivity_SDS',
                'Selectivity', 'Kb_PFOS', 'Kb_SDS', 'Kb_PFOS_by_Kb_SDS']

## NOTE: Sensitivity is defined as negative deltaG
cycle_data = {}
for c in range(0, 11):
    # One tab-separated file per cycle; lines starting with '#' are skipped.
    table_path = '{}./cycle-{}.txt'.format(simdir, c)
    cycle_data[c] = pd.read_csv(table_path, sep='\t', names=column_names,
                                comment='#', header=0)

In [60]:
# Display the table loaded for the last cycle (key 10) as a quick sanity check.
cycle_data[10]

Unnamed: 0,SMILES,MW,Sensitivity,Sensitivity_SDS,Selectivity,Kb_PFOS,Kb_SDS,Kb_PFOS_by_Kb_SDS
0,C[N+](C)(C)CCCCCCCCCC(F)(F)C(F)(F)F,304.21,2.6$\pm$0.2,6.9$\pm$0.2,-4.3$\pm$0.2,7.6$\pm$0.5,24.3$\pm$1.3,0.3$\pm$0.0
1,CCCCCCCCC(Br)(Br)C(Br)(Br)C(Br)(Br)C(Br)(Br)P(C)C,853.5,6.0$\pm$0.5,8.2$\pm$0.5,-2.2$\pm$0.7,41.3$\pm$9.0,57.0$\pm$4.0,0.7$\pm$0.2
2,CCCCCCCCC(Br)(Br)C(Br)(Br)C(Br)(Br)C(Br)(Br)C(...,992.35,5.9$\pm$0.4,8.2$\pm$0.5,-2.4$\pm$0.6,43.5$\pm$6.0,78.5$\pm$12.8,0.6$\pm$0.1
3,CPCCCC(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)C(Cl)...,615.65,5.3$\pm$0.8,7.6$\pm$0.6,-2.3$\pm$1.0,30.4$\pm$8.8,53.6$\pm$10.5,0.6$\pm$0.2
4,PCCCCCCCC(Br)(Br)C(Br)(Br)C(Br)(Br)C(Br)(Br)C(...,1229.04,6.3$\pm$0.6,10.0$\pm$0.1,-3.7$\pm$0.6,52.5$\pm$11.8,160.3$\pm$3.5,0.3$\pm$0.1
5,CCCCC(Br)(Br)C(Br)(Br)C(Br)(Br)C(Br)(Br)C(Br)(...,1278.92,5.0$\pm$0.5,5.0$\pm$0.7,0.0$\pm$0.8,33.8$\pm$5.6,26.2$\pm$5.5,1.3$\pm$0.3
6,CNC(Br)(Br)C(Br)(Br)C(Br)(Br)C(Br)(Br)C(Br)(Br...,1403.75,3.2$\pm$0.5,6.4$\pm$0.4,-3.2$\pm$0.7,16.5$\pm$3.4,45.7$\pm$6.3,0.4$\pm$0.1
7,CP(C)C(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)C(Cl)...,731.55,4.9$\pm$0.4,7.9$\pm$0.9,-3.1$\pm$1.0,25.6$\pm$3.3,78.5$\pm$28.0,0.3$\pm$0.1
8,CPCCCCCC(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)C(C...,807.55,7.4$\pm$0.7,10.1$\pm$0.7,-2.7$\pm$1.0,69.4$\pm$13.6,152.4$\pm$39.5,0.5$\pm$0.1
9,CN(C)CCCCCC(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)...,804.6,7.6$\pm$0.1,8.8$\pm$0.5,-1.2$\pm$0.5,69.4$\pm$3.1,97.0$\pm$14.3,0.7$\pm$0.1


Split each `average $\pm$ error` entry into separate average and error columns for convenience.

In [56]:

# LaTeX plus-minus marker separating the average from its error in the SI tables.
# Written as a raw string: in a plain literal '\p' is an invalid escape sequence
# and Python emits a warning for it.
PM_SEPARATOR = r'$\pm$'

# (column in cycle_data, suffix of the Avg_/SE_ columns in formatted_cycle_data)
VALUE_COLUMNS = [
    ('Sensitivity', 'Sensitivity'),
    ('Sensitivity_SDS', 'Sensitivity_SDS'),
    ('Selectivity', 'Selectivity'),
    ('Kb_PFOS', 'Kb_PFOS'),
    ('Kb_SDS', 'Kb_SDS'),
    ('Kb_PFOS_by_Kb_SDS', 'Kb_ratio'),
]


def split_avg_and_error(entries, separator=PM_SEPARATOR):
    """Split 'average<separator>error' strings into two float arrays.

    Parameters
    ----------
    entries : iterable of str
        Values of the form '<average><separator><error>'.
    separator : str
        Text between the average and the error.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        Float arrays holding the averages and the errors, in input order.

    Raises
    ------
    IndexError
        If an entry does not contain the separator.
    ValueError
        If either part of an entry cannot be parsed as a float.
    """
    pieces = [entry.split(separator) for entry in entries]
    avg = np.array([piece[0] for piece in pieces], dtype=float)
    err = np.array([piece[1] for piece in pieces], dtype=float)
    return avg, err


# One DataFrame per cycle with numeric Avg_*/SE_* columns. SMILES and MW are
# carried over unchanged from cycle_data.
formatted_cycle_data = {}
for c in range(0, 11):

    formatted_data = {'SMILES': cycle_data[c]['SMILES'], 'MW': cycle_data[c]['MW']}

    for source_column, suffix in VALUE_COLUMNS:
        avg, err = split_avg_and_error(cycle_data[c][source_column])
        formatted_data['Avg_' + suffix] = avg
        formatted_data['SE_' + suffix] = err

    formatted_cycle_data[c] = pd.DataFrame(formatted_data)
    

In [62]:
# Display the formatted table for the last cycle (key 10).
formatted_cycle_data[10]

Unnamed: 0,SMILES,MW,Avg_Sensitivity,SE_Sensitivity,Avg_Sensitivity_SDS,SE_Sensitivity_SDS,Avg_Selectivity,SE_Selectivity,Avg_Kb_PFOS,SE_Kb_PFOS,Avg_Kb_SDS,SE_Kb_SDS,Avg_Kb_ratio,SE_Kb_ratio
0,C[N+](C)(C)CCCCCCCCCC(F)(F)C(F)(F)F,304.21,2.6,0.2,6.9,0.2,-4.3,0.2,7.6,0.5,24.3,1.3,0.3,0.0
1,CCCCCCCCC(Br)(Br)C(Br)(Br)C(Br)(Br)C(Br)(Br)P(C)C,853.5,6.0,0.5,8.2,0.5,-2.2,0.7,41.3,9.0,57.0,4.0,0.7,0.2
2,CCCCCCCCC(Br)(Br)C(Br)(Br)C(Br)(Br)C(Br)(Br)C(...,992.35,5.9,0.4,8.2,0.5,-2.4,0.6,43.5,6.0,78.5,12.8,0.6,0.1
3,CPCCCC(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)C(Cl)...,615.65,5.3,0.8,7.6,0.6,-2.3,1.0,30.4,8.8,53.6,10.5,0.6,0.2
4,PCCCCCCCC(Br)(Br)C(Br)(Br)C(Br)(Br)C(Br)(Br)C(...,1229.04,6.3,0.6,10.0,0.1,-3.7,0.6,52.5,11.8,160.3,3.5,0.3,0.1
5,CCCCC(Br)(Br)C(Br)(Br)C(Br)(Br)C(Br)(Br)C(Br)(...,1278.92,5.0,0.5,5.0,0.7,0.0,0.8,33.8,5.6,26.2,5.5,1.3,0.3
6,CNC(Br)(Br)C(Br)(Br)C(Br)(Br)C(Br)(Br)C(Br)(Br...,1403.75,3.2,0.5,6.4,0.4,-3.2,0.7,16.5,3.4,45.7,6.3,0.4,0.1
7,CP(C)C(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)C(Cl)...,731.55,4.9,0.4,7.9,0.9,-3.1,1.0,25.6,3.3,78.5,28.0,0.3,0.1
8,CPCCCCCC(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)C(C...,807.55,7.4,0.7,10.1,0.7,-2.7,1.0,69.4,13.6,152.4,39.5,0.5,0.1
9,CN(C)CCCCCC(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)...,804.6,7.6,0.1,8.8,0.5,-1.2,0.5,69.4,3.1,97.0,14.3,0.7,0.1


Read the raw data, which store $\Delta G$ and $K_{b}$ at higher floating-point precision

In [58]:
#### read previously computed and newly computed delta G data

simdir = '../Data/GPR_training_data'

# Column labels shared by the deltaG and Kb files (the files carry no header row
# that is used here; lines starting with '#' are skipped).
RAW_COLUMN_NAMES = ["Method", "Analyte", "lprobe", "lflourines", "lhydrogen", "hhydrogen",
                    "hcarbons", "Probe", "Avg", "SD", "smiles", "formula"]


def _read_raw_table(directory, quantity, cycle):
    """Read one whitespace-delimited raw-data file.

    Parameters
    ----------
    directory : str
        Folder containing the raw files.
    quantity : str
        Quantity tag used in the file name ('deltaG' or 'Kb').
    cycle : int
        Cycle index appended to the file name.

    Returns
    -------
    pd.DataFrame
        Table with the columns listed in RAW_COLUMN_NAMES.
    """
    path = (directory + '/wsmiles_' + quantity
            + '-linearProbesBlockArea-b3-pbmd-4cvs_cycle_' + str(cycle) + '.txt')
    # sep=r'\s+' replaces the deprecated delim_whitespace=True.
    return pd.read_csv(path, sep=r'\s+', names=RAW_COLUMN_NAMES, comment="#")


new_Kb_data_cycle_PFOS = []
new_Kb_data_cycle_PFOS_SD = []
new_Kb_data_cycle_SDS = []
new_Kb_data_cycle_SDS_SD = []

new_deltaG_data_cycle_PFOS = []
new_deltaG_data_cycle_PFOS_SD = []
new_deltaG_data_cycle_SDS = []
new_deltaG_data_cycle_SDS_SD = []

smiles_cycle = []

# Each file is read once per cycle and then split by analyte; every list ends
# up with one array per cycle, in cycle order.
# NOTE(review): PFOS and SDS rows are collected separately in file order; any
# elementwise combination of the two presumably relies on both analytes
# listing the probes in the same order -- confirm against the data files.
for c in range(0, 11):

    new_deltaG_data = _read_raw_table(simdir, 'deltaG', c)
    deltaG_pfos_rows = new_deltaG_data[new_deltaG_data["Analyte"] == 'PFOS']
    deltaG_sds_rows = new_deltaG_data[new_deltaG_data["Analyte"] == 'SDS']

    new_deltaG_data_cycle_PFOS.append(deltaG_pfos_rows.Avg.values)
    new_deltaG_data_cycle_PFOS_SD.append(deltaG_pfos_rows.SD.values)
    new_deltaG_data_cycle_SDS.append(deltaG_sds_rows.Avg.values)
    new_deltaG_data_cycle_SDS_SD.append(deltaG_sds_rows.SD.values)

    new_Kb_data = _read_raw_table(simdir, 'Kb', c)
    Kb_pfos_rows = new_Kb_data[new_Kb_data["Analyte"] == 'PFOS']
    Kb_sds_rows = new_Kb_data[new_Kb_data["Analyte"] == 'SDS']

    new_Kb_data_cycle_PFOS.append(Kb_pfos_rows.Avg.values)
    new_Kb_data_cycle_PFOS_SD.append(Kb_pfos_rows.SD.values)
    # SMILES strings are taken from the PFOS rows of the Kb file.
    smiles_cycle.append(Kb_pfos_rows.smiles.values)
    new_Kb_data_cycle_SDS.append(Kb_sds_rows.Avg.values)
    new_Kb_data_cycle_SDS_SD.append(Kb_sds_rows.SD.values)
                    
            
            
cycles = 11

sensitivity_cycle = []
sensitivity_cycle_SD = []
sds_sensitivity_cycle = []
sds_sensitivity_cycle_SD = []
selectivity_cycle = []
selectivity_cycle_SD = []

# Energy scale applied to every average AND every error below. With kT = 1 the
# values keep the units of the input files.
# NOTE(review): the commented-out value looks like kT at 300 K in kJ/mol -- confirm.
kT = 1 #300*0.008314
for c in range(cycles):

    deltaG_pfos = np.array(new_deltaG_data_cycle_PFOS[c]).flatten()
    deltaG_pfos_sd = np.array(new_deltaG_data_cycle_PFOS_SD[c]).flatten()
    deltaG_sds = np.array(new_deltaG_data_cycle_SDS[c]).flatten()
    deltaG_sds_sd = np.array(new_deltaG_data_cycle_SDS_SD[c]).flatten()

    # Sensitivity is the negated deltaG, so the sign flips for averages only.
    sensitivity_cycle.append(-deltaG_pfos/kT)
    sensitivity_cycle_SD.append(deltaG_pfos_sd/kT)

    sds_sensitivity_cycle.append(-deltaG_sds/kT)
    sds_sensitivity_cycle_SD.append(deltaG_sds_sd/kT)

    # Selectivity = -(deltaG_PFOS - deltaG_SDS); the two errors are combined
    # in quadrature.
    selectivity = deltaG_pfos - deltaG_sds
    selectivityError = np.sqrt(deltaG_pfos_sd**2 + deltaG_sds_sd**2)

    selectivity_cycle.append(-selectivity/kT)
    # Scaled by kT like every other quantity so the error stays consistent
    # with the average if kT is changed from 1.
    selectivity_cycle_SD.append(selectivityError/kT)
    
# Per-cycle accumulators: binding constants for each analyte and their ratio.
kb_sensitivity_cycle, kb_sensitivity_cycle_SD = [], []
sds_kb_sensitivity_cycle, sds_kb_sensitivity_cycle_SD = [], []
kb_selectivity_cycle, kb_selectivity_cycle_SD = [], []

kT = 1 #300*0.008314  (not referenced inside the loop below)
for c in range(cycles):

    kb_pfos = np.array(new_Kb_data_cycle_PFOS[c]).flatten()
    kb_pfos_sd = np.array(new_Kb_data_cycle_PFOS_SD[c]).flatten()
    kb_sds = np.array(new_Kb_data_cycle_SDS[c]).flatten()
    kb_sds_sd = np.array(new_Kb_data_cycle_SDS_SD[c]).flatten()

    kb_sensitivity_cycle.append(kb_pfos)
    kb_sensitivity_cycle_SD.append(kb_pfos_sd)

    sds_kb_sensitivity_cycle.append(kb_sds)
    sds_kb_sensitivity_cycle_SD.append(kb_sds_sd)

    # Ratio Kb_PFOS / Kb_SDS; its error adds the relative errors of numerator
    # and denominator in quadrature and scales by the ratio.
    kb_selectivity = kb_pfos/kb_sds
    relative_pfos = new_Kb_data_cycle_PFOS_SD[c]/new_Kb_data_cycle_PFOS[c]
    relative_sds = new_Kb_data_cycle_SDS_SD[c]/new_Kb_data_cycle_SDS[c]
    kb_selectivityError = kb_selectivity * np.sqrt(relative_pfos**2 + relative_sds**2)

    kb_selectivity_cycle.append(kb_selectivity)
    kb_selectivity_cycle_SD.append(kb_selectivityError)
    
    

# Output column -> list holding one array per cycle. The order here fixes the
# column order of each resulting DataFrame (SMILES comes first).
raw_numeric_sources = [
    ('Avg_Sensitivity', sensitivity_cycle), ('SE_Sensitivity', sensitivity_cycle_SD),
    ('Avg_Sensitivity_SDS', sds_sensitivity_cycle), ('SE_Sensitivity_SDS', sds_sensitivity_cycle_SD),
    ('Avg_Selectivity', selectivity_cycle), ('SE_Selectivity', selectivity_cycle_SD),
    ('Avg_Kb_PFOS', kb_sensitivity_cycle), ('SE_Kb_PFOS', kb_sensitivity_cycle_SD),
    ('Avg_Kb_SDS', sds_kb_sensitivity_cycle), ('SE_Kb_SDS', sds_kb_sensitivity_cycle_SD),
    ('Avg_Kb_ratio', kb_selectivity_cycle), ('SE_Kb_ratio', kb_selectivity_cycle_SD),
]

# Assemble one DataFrame per cycle from the per-cycle arrays computed above.
raw_cycle_data = {}
for c in range(0, 11):

    raw_data = {'SMILES': smiles_cycle[c]}
    for column_name, per_cycle_values in raw_numeric_sources:
        raw_data[column_name] = np.array(per_cycle_values[c])

    raw_cycle_data[c] = pd.DataFrame(raw_data)
    

In [61]:
# Display the raw-precision table for the last cycle (key 10).
raw_cycle_data[10]

Unnamed: 0,SMILES,Avg_Sensitivity,SE_Sensitivity,Avg_Sensitivity_SDS,SE_Sensitivity_SDS,Avg_Selectivity,SE_Selectivity,Avg_Kb_PFOS,SE_Kb_PFOS,Avg_Kb_SDS,SE_Kb_SDS,Avg_Kb_ratio,SE_Kb_ratio
0,C[N+](C)(C)CCCCCCCCCC(F)(F)C(F)(F)F,2.62033,0.162713,6.927085,0.180464,-4.306754,0.242987,7.629013,0.483082,24.344509,1.312122,0.313377,0.026059
1,CCCCCCCCC(Br)(Br)C(Br)(Br)C(Br)(Br)C(Br)(Br)P(C)C,5.996775,0.522804,8.166229,0.503269,-2.169454,0.725674,41.287716,8.978479,56.985148,4.018063,0.724535,0.165634
2,CCCCCCCCC(Br)(Br)C(Br)(Br)C(Br)(Br)C(Br)(Br)C(...,5.86518,0.432162,8.230304,0.470073,-2.365123,0.63854,43.476765,6.001344,78.506293,12.757433,0.5538,0.118079
3,CPCCCC(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)C(Cl)...,5.264021,0.839485,7.596323,0.559248,-2.332302,1.008708,30.360693,8.806649,53.571105,10.539968,0.566736,0.19864
4,PCCCCCCCC(Br)(Br)C(Br)(Br)C(Br)(Br)C(Br)(Br)C(...,6.310388,0.623836,9.994089,0.118218,-3.683701,0.634939,52.510292,11.832361,160.289631,3.482588,0.327596,0.074161
5,CCCCC(Br)(Br)C(Br)(Br)C(Br)(Br)C(Br)(Br)C(Br)(...,4.990227,0.473081,4.988264,0.685507,0.001963,0.832902,33.814979,5.627856,26.196514,5.458241,1.29082,0.344221
6,CNC(Br)(Br)C(Br)(Br)C(Br)(Br)C(Br)(Br)C(Br)(Br...,3.206299,0.513938,6.361708,0.42309,-3.155409,0.665686,16.509793,3.3857,45.719407,6.267076,0.361111,0.089074
7,CP(C)C(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)C(Cl)...,4.851293,0.410463,7.902541,0.904446,-3.051249,0.993229,25.565963,3.31963,78.474295,27.966249,0.325788,0.123569
8,CPCCCCCC(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)C(C...,7.397476,0.691405,10.140398,0.655473,-2.742923,0.952726,69.444035,13.640178,152.366109,39.452632,0.455771,0.148127
9,CN(C)CCCCCC(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)C(Cl)(Cl)...,7.586552,0.099883,8.788789,0.473082,-1.202237,0.483511,69.40709,3.145704,97.044238,14.258008,0.715211,0.109967
