# DATA PREPARATION

### Imports

In [1]:
# Essentials
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Import custom class and functions
from lib.descriptors import QMDescriptors
from lib.utils import combine_data, format_compound_name, calculate_redox_potential

In [3]:
# Reload imported modules automatically before each cell executes, so edits
# to the local `lib` package are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

### Load experimental data

In [4]:
# Load the experimental measurements from the OpenDocument spreadsheet
file_experimental = 'data/TFKs_experimental_data.ods'
experimental = pd.read_excel(io=file_experimental)

In [5]:
# MHC values of exactly 0 are swapped for a small positive number
# (presumably so that a later log transform of MHC stays finite — confirm).
# Only the MHC column is touched: a frame-wide replace would also rewrite
# legitimate zeros in unrelated columns (e.g. clogP or LLE).
experimental['MHC'] = experimental['MHC'].replace(0, 0.0001)

In [6]:
# Build the compound identifier from the lab-journal code
# (the displayed rows show e.g. 'TFK-483' becoming 'TFK483')
experimental['Name'] = experimental.Labjournal.apply(format_compound_name)
experimental.head()

Unnamed: 0,Labjournal,Group,Compounds (paper),SMILES,MIC [uM] M.vaccae,MIC [uM] M.smegmatis,clogP (SwissADME),HeLa CC50 [uM],HUVEC GI50 [uM],K-562 GI50 [uM],MHC,LLE,Name
0,TFK-483,Nitrile,15,O=C(N=C(N1CCC2(OC[C@H](C)O2)CC1)S3)C4=C3C([N+]...,0.015963,0.128734,1.58,">128,7",49.9,94.4,11.505857,6.21,TFK483
1,TFK-471,Halogenid,16,O=C1C2=CC(I)=CC([N+]([O-])=O)=C2SC(N3CCC4(OC[C...,0.006336,0.10219,2.49,">102,1",">102,1",">102,1",0.453893,5.7,TFK471
2,TFK-449,Halogenid,17,O=C1C2=CC(Br)=CC([N+]([O-])=O)=C2SC(N3CCC4(OC[...,0.003618,0.056525,2.45,>113,>113,>113,0.935009,5.99,TFK449
3,TFK-520,Carboxylic acid,19,O=C(N=C(N1CCC2(OC[C@H](C)O2)CC1)S3)C4=C3C([N+]...,0.98,15.316755,1.33,">122,7",">122,7",">122,7",0.0001,3.484138,TFK520
4,TFK-543,Esters,21a,O=C(N=C(N1CCC2(OC[C@H](C)O2)CC1)S3)C4=C3C([N+]...,0.237291,3.701735,1.72,">118,6",80.6,66.9,2.583682,4.9,TFK543


### Extract local descriptors for nitro-BTZ core

The calculated QM properties are retrieved from the file generated by Jaguar (using `qm_descriptors.py` from Schrödinger). 

In [7]:
# File with atomic indices (given by Maestro)
file_atoms = 'data/TFKs_atoms.txt'
# File with the calculated QM properties
file_prop = 'data/TFKs_qm_descriptors.csv'

In [8]:
# Build the descriptor helper from the QM property file and the atom-index
# file, then extract the per-atom (local) descriptors.
# NOTE(review): judging by the displayed output, the result is indexed by
# compound name with one column per descriptor/atom pair (cesp_*, mul_*).
descriptors = QMDescriptors(file_prop, file_atoms)
atomic = descriptors.get_local_descriptors()

In [9]:
# Display the local-descriptor table
atomic

Unnamed: 0_level_0,cesp_C2,cesp_C4,cesp_C4a,cesp_C5,cesp_C6,cesp_C7,cesp_C8,cesp_C8a,cesp_N,cesp_N3,...,mul_C5,mul_C6,mul_C7,mul_C8,mul_C8a,mul_N,mul_N3,mul_O1,mul_O2,mul_S1
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BTZ043,0.5001,0.69611,0.08219,-0.21726,0.04107,-0.36865,0.29266,-0.13566,0.68881,-0.61053,...,-0.05721,-0.20978,-0.05668,-0.02104,-0.03853,0.1982,-0.22687,-0.20075,-0.19977,0.21979
TFK449,0.49523,0.71525,0.00921,-0.00412,-0.08248,-0.15406,0.24678,-0.17155,0.66054,-0.6112,...,0.05028,-0.47763,-0.01032,-0.03974,-0.00397,0.18482,-0.20646,-0.20289,-0.19834,0.20474
TFK471,0.49527,0.74005,-0.12248,0.20042,-0.33651,0.04339,0.12578,-0.06857,0.67922,-0.62301,...,0.0224,-0.4737,-0.05204,-0.0361,0.0005,0.18313,-0.2087,-0.20524,-0.19549,0.19534
TFK483,0.50982,0.69106,0.06901,-0.2092,0.10271,-0.35286,0.26291,-0.10461,0.70261,-0.59924,...,-0.04525,0.02296,-0.03366,-0.0363,-0.0328,0.18033,-0.22013,-0.19678,-0.18834,0.2329
TFK520,0.50261,0.68648,0.09763,-0.33027,0.22398,-0.4806,0.29893,-0.17094,0.70996,-0.56997,...,-0.08134,-0.13453,-0.06531,-0.0233,-0.0376,0.16441,-0.2362,-0.2095,-0.20861,0.20248
TFK543,0.45402,0.64689,0.02258,-0.102,-0.0805,-0.31976,0.26662,-0.12888,0.62437,-0.54703,...,-0.05882,-0.12362,-0.094,-0.04021,-0.04606,0.13377,-0.15236,-0.15415,-0.15632,0.1841
TFK659,0.47192,0.69895,0.10102,-0.13746,-0.09538,-0.31835,0.33475,-0.22731,0.64395,-0.60244,...,-0.07972,-0.06395,-0.11056,0.01408,-0.08448,0.1741,-0.20447,-0.20211,-0.20593,0.22671
TFK660,0.5002,0.70642,0.12861,-0.27042,0.12787,-0.43339,0.42033,-0.27444,0.62962,-0.60452,...,-0.06844,-0.08947,-0.07695,0.00663,-0.08183,0.17402,-0.23613,-0.20427,-0.20203,0.23321
TFK661,0.47976,0.7376,0.07108,-0.16925,0.11387,-0.40806,0.34912,-0.23938,0.66404,-0.63641,...,-0.06771,-0.05656,-0.06933,-0.03056,-0.06808,0.18874,-0.22088,-0.20591,-0.20318,0.2438
TFK7092,0.49121,0.72835,0.09235,-0.1607,0.01071,-0.40521,0.40297,-0.2531,0.63944,-0.62672,...,-0.05264,-0.10266,-0.07819,-0.04257,-0.02049,0.18819,-0.21664,-0.19805,-0.19808,0.1903


In [10]:
# Combine calculated properties with experimental values.
# The compound names are moved from the index into a regular 'Name' column
# before the frame is handed to `combine_data`. The guard keeps the cell safe
# to re-run: a second reset_index would fail because 'Name' already exists
# as a column.
if 'Name' not in atomic.columns:
    atomic = atomic.reset_index(names='Name')
atomic_full = combine_data(atomic, experimental)
atomic_full.head()

Unnamed: 0,Name,MIC_Mv,MIC_Ms,MHC,pMIC_Mv,pMHC,cesp_C2,cesp_C4,cesp_C4a,cesp_C5,...,mul_C5,mul_C6,mul_C7,mul_C8,mul_C8a,mul_N,mul_N3,mul_O1,mul_O2,mul_S1
0,BTZ043,0.000348,0.058,1.0,9.458775,-0.0,0.5001,0.69611,0.08219,-0.21726,...,-0.05721,-0.20978,-0.05668,-0.02104,-0.03853,0.1982,-0.22687,-0.20075,-0.19977,0.21979
1,TFK449,0.003618,0.056525,0.935009,8.441581,0.029184,0.49523,0.71525,0.00921,-0.00412,...,0.05028,-0.47763,-0.01032,-0.03974,-0.00397,0.18482,-0.20646,-0.20289,-0.19834,0.20474
2,TFK471,0.006336,0.10219,0.453893,8.1982,0.343046,0.49527,0.74005,-0.12248,0.20042,...,0.0224,-0.4737,-0.05204,-0.0361,0.0005,0.18313,-0.2087,-0.20524,-0.19549,0.19534
3,TFK483,0.015963,0.128734,11.505857,7.796885,-1.060919,0.50982,0.69106,0.06901,-0.2092,...,-0.04525,0.02296,-0.03366,-0.0363,-0.0328,0.18033,-0.22013,-0.19678,-0.18834,0.2329
4,TFK520,0.98,15.316755,0.0001,6.008774,4.0,0.50261,0.68648,0.09763,-0.33027,...,-0.08134,-0.13453,-0.06531,-0.0233,-0.0376,0.16441,-0.2362,-0.2095,-0.20861,0.20248


### Extract global descriptors (CDFT)

In [11]:
# Retrieve the molecule-level (global) descriptors and display them
molecular = descriptors.get_global_descriptors()
molecular

Unnamed: 0_level_0,r_j_HOMO_Energy_Hartree,r_j_LUMO_Energy_Hartree,r_j_HOMO-LUMO_Gap_Hartree,Chemical_potential,Electrophilicity_index
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BTZ043,-0.25982,-0.12815,0.13167,-0.193985,0.142896
dehalBTZ,-0.25523,-0.12213,0.1331,-0.18868,0.133735
TFK449,-0.25535,-0.1269,0.12845,-0.191125,0.142191
TFK471,-0.25188,-0.12591,0.12597,-0.188895,0.141626
TFK483,-0.26057,-0.12932,0.13125,-0.194945,0.144775
TFK520,-0.25026,-0.11962,0.13064,-0.18494,0.130905
TFK543,-0.25419,-0.12374,0.13045,-0.188965,0.136864
TFK659,-0.25688,-0.12524,0.13164,-0.19106,0.138651
TFK660,-0.25662,-0.12486,0.13176,-0.19074,0.138061
TFK661,-0.25608,-0.12381,0.13227,-0.189945,0.136384


In [12]:
# Combine the two dataframes
# Strip the 'r_j_' prefix from the column labels (a no-op once removed)
molecular.columns = molecular.columns.str.removeprefix('r_j_')
# Move the compound names from the index into a 'Name' column. The guard keeps
# the cell safe to re-run: a second reset_index would fail because 'Name'
# already exists as a column.
if 'Name' not in molecular.columns:
    molecular = molecular.reset_index(names='Name')
molecular_full = combine_data(molecular, experimental)

In [13]:
# Add pMIC for M. smegmatis (for comparison later)
# pMIC = -log10(MIC in mol/L); the MIC column is reported in uM, hence / 1e6.
col = [x for x in experimental.columns if 'smeg' in x]
# Exactly one source column must match; otherwise the assignment below would
# either fail obscurely or pick up the wrong data.
if len(col) != 1:
    raise ValueError(
        f"Expected exactly one M. smegmatis MIC column, found: {col}"
    )
# Select the column as a Series (not a one-column DataFrame) before assigning
experimental['pMIC_Ms'] = -np.log10(experimental[col[0]] / 1e6)

In [14]:
# Append clogP and pMIC_Ms from the experimental table, matching rows on 'Name'
extra_columns = ['clogP (SwissADME)', 'pMIC_Ms']

# `experimental` is left indexed by 'Name' after this cell (a later cell turns
# the index back into a column). The guard allows the cell to be re-run.
if experimental.index.name != 'Name':
    experimental.set_index('Name', inplace=True)

# NOTE(review): concat along axis=1 keeps the union of both indexes, so a
# compound found in only one of the two tables becomes a row with NaN in the
# other table's columns — confirm this is intended.
molecular_full = pd.concat(
    (
        # Drop copies of the extra columns left by an earlier run of this cell
        # so that re-executing it does not duplicate them
        molecular_full.set_index('Name').drop(columns=extra_columns,
                                              errors='ignore'),
        experimental[extra_columns],
    ),
    axis=1,
).reset_index(names='Name')

In [15]:
# Preview the global descriptors together with the appended clogP and pMIC_Ms
molecular_full.head()

Unnamed: 0,Name,MIC_Mv,MIC_Ms,MHC,pMIC_Mv,pMHC,HOMO_Energy_Hartree,LUMO_Energy_Hartree,HOMO-LUMO_Gap_Hartree,Chemical_potential,Electrophilicity_index,clogP (SwissADME),pMIC_Ms
0,BTZ043,0.000348,0.058,1.0,9.458775,-0.0,-0.25982,-0.12815,0.13167,-0.193985,0.142896,2.87,7.236572
1,dehalBTZ,1.1,8.59,0.000627,5.958607,3.202449,-0.25523,-0.12213,0.1331,-0.18868,0.133735,1.75,5.066007
2,TFK449,0.003618,0.056525,0.935009,8.441581,0.029184,-0.25535,-0.1269,0.12845,-0.191125,0.142191,2.45,7.247761
3,TFK471,0.006336,0.10219,0.453893,8.1982,0.343046,-0.25188,-0.12591,0.12597,-0.188895,0.141626,2.49,6.990591
4,TFK483,0.015963,0.128734,11.505857,7.796885,-1.060919,-0.26057,-0.12932,0.13125,-0.194945,0.144775,1.58,6.890307


In [16]:
# Move 'Name' back from the index into a regular column. The guard keeps the
# cell safe to re-run: once 'Name' is a column again, a second reset_index
# would raise because the column already exists.
if experimental.index.name == 'Name':
    experimental.reset_index(inplace=True, names='Name')

### Extract reaction energetics

In [17]:
# Load the reaction energetics (autodE), ordered by compound label
file_dE = 'data/TFKs_reactivity_BH4_autode.csv'
reactivity = pd.read_csv(file_dE).sort_values(by='Cmpd')

In [18]:
# Use 'Name' as the compound column and reformat each label without hyphen
reactivity = (
    reactivity
    .rename(columns={'Cmpd': 'Name'})
    .assign(Name=lambda frame: frame['Name'].apply(format_compound_name))
)
reactivity.head()

Unnamed: 0,Name,dE,dE‡,dG,dG‡
12,BTZ043,9.218489,8.724466,9.634038,15.263565
15,TFK449,10.471704,9.30576,11.048474,16.426274
7,TFK471,10.7046,9.231779,11.560231,16.295228
13,TFK483,6.262366,6.7833,7.301913,13.49966
1,TFK520,17.059218,12.291914,18.269437,19.740947


In [19]:
# Combine with experimental
# (the displayed result holds MIC_Mv, MIC_Ms, MHC, pMIC_Mv and pMHC next to
# the reaction energetics, one row per compound)
reactivity_full = combine_data(reactivity, experimental)
reactivity_full.head()

Unnamed: 0,Name,MIC_Mv,MIC_Ms,MHC,pMIC_Mv,pMHC,dE,dE‡,dG,dG‡
0,BTZ043,0.000348,0.058,1.0,9.458775,-0.0,9.218489,8.724466,9.634038,15.263565
1,TFK449,0.003618,0.056525,0.935009,8.441581,0.029184,10.471704,9.30576,11.048474,16.426274
2,TFK471,0.006336,0.10219,0.453893,8.1982,0.343046,10.7046,9.231779,11.560231,16.295228
3,TFK483,0.015963,0.128734,11.505857,7.796885,-1.060919,6.262366,6.7833,7.301913,13.49966
4,TFK520,0.98,15.316755,0.0001,6.008774,4.0,17.059218,12.291914,18.269437,19.740947


### Extract covalent docking data

In [20]:
# Load the covalent docking results
covdock_file = 'data/TFKs_CovDock.csv'
covdock = pd.read_csv(filepath_or_buffer=covdock_file)

In [21]:
# Keep only the compound title and the two scores of interest
cols = ['Title', 'MMGBSA dG Bind', 'docking score']
covdock = covdock[cols].copy()

In [22]:
# Use 'Name' as the compound column and reformat each label without hyphen
covdock = (
    covdock
    .rename(columns={'Title': 'Name'})
    .assign(Name=lambda frame: frame['Name'].apply(format_compound_name))
)
covdock.head()

Unnamed: 0,Name,MMGBSA dG Bind,docking score
0,BTZ043,-53.98,-7.011
1,TFK471,-47.94,-5.586
2,TFK449,-48.93,-5.553
3,TFK483,-51.66,-6.195
4,TFK520,-33.8,-5.107


In [23]:
# Combine with experimental
# (the displayed result holds MIC_Mv, MIC_Ms, MHC, pMIC_Mv and pMHC next to
# the MM-GBSA binding energy and docking score)
covdock_full = combine_data(covdock, experimental)
covdock_full.head()

Unnamed: 0,Name,MIC_Mv,MIC_Ms,MHC,pMIC_Mv,pMHC,MMGBSA dG Bind,docking score
0,BTZ043,0.000348,0.058,1.0,9.458775,-0.0,-53.98,-7.011
1,TFK471,0.006336,0.10219,0.453893,8.1982,0.343046,-47.94,-5.586
2,TFK449,0.003618,0.056525,0.935009,8.441581,0.029184,-48.93,-5.553
3,TFK483,0.015963,0.128734,11.505857,7.796885,-1.060919,-51.66,-6.195
4,TFK520,0.98,15.316755,0.0001,6.008774,4.0,-33.8,-5.107


### Extract permeability data

In [24]:
# Load the membrane permeability results
perm_file = 'data/TFKs_membrane_perm.csv'
perm = pd.read_csv(filepath_or_buffer=perm_file)

In [25]:
# Use 'Name' as the compound column and reformat each label without hyphen
perm = (
    perm
    .rename(columns={'title': 'Name'})
    .assign(Name=lambda frame: frame['Name'].apply(format_compound_name))
)
perm.head()

Unnamed: 0,Name,r_adme_Log_Perm_RRCK,r_adme_Membrane_dG_Insert,r_adme_Membrane_Penalty,r_adme_Membrane_HDLD,r_adme_Membrane_Energy,r_adme_Solvent_Energy,r_adme_Membrane_GB,r_adme_Solvent_GB,r_adme_Volume,r_adme_Membrane_HDLD_GB
0,BTZ043,-4.539085,1.178409,0.1028,1.075609,-3.844548,-4.920157,-3.127464,-10.319747,707.770699,7.192283
1,TFK449,-4.59003,3.014551,0.1447,2.869851,-9.659625,-12.529475,-3.159509,-9.768251,671.756133,6.608742
2,TFK471,-4.652516,4.082418,0.2748,3.807618,-3.290078,-7.097697,-2.973296,-9.485235,677.490823,6.511939
3,TFK483,-4.646541,4.159544,0.0255,4.134044,1.906698,-2.227346,-4.494394,-12.304219,693.816791,7.809824
4,TFK520,-5.842402,17.529522,6.6327,10.896822,-10.130156,-21.026978,-5.418847,-19.573772,705.272627,14.154925


In [26]:
# Combine with experimental
# (the displayed result holds MIC_Mv, MIC_Ms, MHC, pMIC_Mv and pMHC next to
# the r_adme_* permeability columns)
perm_full = combine_data(perm, experimental)
perm_full.head()

Unnamed: 0,Name,MIC_Mv,MIC_Ms,MHC,pMIC_Mv,pMHC,r_adme_Log_Perm_RRCK,r_adme_Membrane_dG_Insert,r_adme_Membrane_Penalty,r_adme_Membrane_HDLD,r_adme_Membrane_Energy,r_adme_Solvent_Energy,r_adme_Membrane_GB,r_adme_Solvent_GB,r_adme_Volume,r_adme_Membrane_HDLD_GB
0,BTZ043,0.000348,0.058,1.0,9.458775,-0.0,-4.539085,1.178409,0.1028,1.075609,-3.844548,-4.920157,-3.127464,-10.319747,707.770699,7.192283
1,TFK449,0.003618,0.056525,0.935009,8.441581,0.029184,-4.59003,3.014551,0.1447,2.869851,-9.659625,-12.529475,-3.159509,-9.768251,671.756133,6.608742
2,TFK471,0.006336,0.10219,0.453893,8.1982,0.343046,-4.652516,4.082418,0.2748,3.807618,-3.290078,-7.097697,-2.973296,-9.485235,677.490823,6.511939
3,TFK483,0.015963,0.128734,11.505857,7.796885,-1.060919,-4.646541,4.159544,0.0255,4.134044,1.906698,-2.227346,-4.494394,-12.304219,693.816791,7.809824
4,TFK520,0.98,15.316755,0.0001,6.008774,4.0,-5.842402,17.529522,6.6327,10.896822,-10.130156,-21.026978,-5.418847,-19.573772,705.272627,14.154925


### Calculate redox potentials

In [27]:
# Read energies from redox calculations (autodE) and derive the reduction
# potentials; the result displayed below has the columns 'Name',
# 'dG_aq (Ha)' and 'Ered (V)'
file_energies = 'data/TFKs_redox_autode.csv'
redox = calculate_redox_potential(file_energies)

In [28]:
# Reformat each compound label without hyphen
redox = redox.assign(Name=redox['Name'].apply(format_compound_name))
redox.head()

Unnamed: 0,Name,dG_aq (Ha),Ered (V)
0,BTZ043,-0.138759,3.774525
1,TFK449,-0.137629,3.743783
2,TFK471,-0.136911,3.72427
3,TFK483,-0.14024,3.814829
4,TFK520,-0.135824,3.694697


In [29]:
# Combine with experimental
# (the displayed result holds MIC_Mv, MIC_Ms, MHC, pMIC_Mv and pMHC next to
# 'dG_aq (Ha)' and 'Ered (V)')
redox_full = combine_data(redox, experimental)
redox_full.head()

Unnamed: 0,Name,MIC_Mv,MIC_Ms,MHC,pMIC_Mv,pMHC,dG_aq (Ha),Ered (V)
0,BTZ043,0.000348,0.058,1.0,9.458775,-0.0,-0.138759,3.774525
1,TFK449,0.003618,0.056525,0.935009,8.441581,0.029184,-0.137629,3.743783
2,TFK471,0.006336,0.10219,0.453893,8.1982,0.343046,-0.136911,3.72427
3,TFK483,0.015963,0.128734,11.505857,7.796885,-1.060919,-0.14024,3.814829
4,TFK520,0.98,15.316755,0.0001,6.008774,4.0,-0.135824,3.694697


In [30]:
# Shift the computed reduction potentials by a fixed reference value.
# NOTE(review): 4.28 V is presumably the absolute potential of the standard
# hydrogen electrode — confirm and cite the source.
reference_potential = 4.28
redox_full['E_vs_SHE'] = redox_full['Ered (V)'] - reference_potential

## Save compiled data

In [31]:
# Define dataframes
dfs = [atomic_full, molecular_full, reactivity_full, 
       covdock_full, perm_full, redox_full]
# Define labels (used as sheet names, in the same order as `dfs`)
labels = ['Atomic', 'Molecular', 'Reactivity', 
          'CovDock', 'Permeability', 'Redox']

# zip() would silently drop the surplus entries if the two lists ever got
# out of sync, so fail loudly instead
if len(dfs) != len(labels):
    raise ValueError(
        f"Got {len(dfs)} dataframes but {len(labels)} sheet labels"
    )

# Make sure the output folder exists; writing into a missing directory fails
output_file = 'results/Full_SI_datasets.xlsx'
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

# Save all dfs to a single Excel file, one sheet per dataset
with pd.ExcelWriter(output_file) as writer:
    for df, lab in zip(dfs, labels):
        df.to_excel(writer, sheet_name=lab, index=False)