-
Notifications
You must be signed in to change notification settings - Fork 24
/
vina_dock.py
executable file
·473 lines (400 loc) · 21.9 KB
/
vina_dock.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
#!/usr/bin/env python3
# authors; Aleksandra Nikonenko, Pavel Polishchuk
import argparse
import os
import re
import sqlite3
import subprocess
import sys
import tempfile
import traceback
from functools import partial
from multiprocessing import Pool, Manager, cpu_count
import dask
from dask import bag
from dask.distributed import Lock as daskLock, Client
from meeko import MoleculePreparation
from meeko import obutils
from openbabel import openbabel as ob
from rdkit import Chem
from rdkit.Chem import AllChem
from read_input import read_input
from vina import Vina
class RawTextArgumentDefaultsHelpFormatter(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
pass
def cpu_type(x):
return max(1, min(int(x), cpu_count()))
def filepath_type(x):
if x:
return os.path.abspath(x)
else:
return x
def add_protonation(db_fname):
'''
Protonate SMILES by Chemaxon cxcalc utility to get molecule ionization states at pH 7.4.
Parse console output and update db.
:param conn:
:return:
'''
conn = sqlite3.connect(db_fname)
try:
cur = conn.cursor()
protonate, done = list(cur.execute('SELECT protonation, protonation_done FROM setup'))[0]
if protonate and not done:
smiles_dict = cur.execute('SELECT smi, id FROM mols')
if not smiles_dict:
sys.stderr.write(f'no molecules to protonate')
return
smiles, mol_ids = zip(*smiles_dict)
with tempfile.NamedTemporaryFile(suffix='.smi', mode='w', encoding='utf-8') as tmp:
tmp.writelines(['\n'.join(smiles)])
tmp.seek(0)
cmd_run = f"cxcalc majormicrospecies -H 7.4 -f smiles -M -K '{tmp.name}'"
smiles_protonated = subprocess.check_output(cmd_run, shell=True).decode().split()
for mol_id, smi_protonated in zip(mol_ids, smiles_protonated):
cur.execute("""UPDATE mols
SET smi_protonated = ?
WHERE
id = ?
""", (Chem.MolToSmiles(Chem.MolFromSmiles(smi_protonated), isomericSmiles=True), mol_id))
conn.commit()
cur.execute('UPDATE setup SET protonation_done = 1')
conn.commit()
finally:
conn.close()
def mk_prepare_ligand_string(molecule_string, build_macrocycle=True, add_water=False, merge_hydrogen=True,
add_hydrogen=False, pH_value=None, verbose=False, mol_format='SDF'):
mol = obutils.load_molecule_from_string(molecule_string, molecule_format=mol_format)
if pH_value is not None:
mol.CorrectForPH(float(pH_value))
if add_hydrogen:
mol.AddHydrogens()
charge_model = ob.OBChargeModel.FindType("Gasteiger")
charge_model.ComputeCharges(mol)
m = Chem.MolFromMolBlock(molecule_string)
amide_rigid = len(m.GetSubstructMatch(Chem.MolFromSmarts('[C;!R](=O)[#7]([!#1])[!#1]'))) == 0
preparator = MoleculePreparation(merge_hydrogens=merge_hydrogen, macrocycle=build_macrocycle,
hydrate=add_water, amide_rigid=amide_rigid)
#additional parametrs
#rigidify_bonds_smarts=[], rigidify_bonds_indices=[])
try:
preparator.prepare(mol)
except Exception:
sys.stderr.write('Warning. Incorrect mol object to convert to pdbqt. Continue. \n')
traceback.print_exc()
return None
if verbose:
preparator.show_setup()
return preparator.write_pdbqt_string()
def ligand_preparation(smi, seed=0):
def convert2mol(m):
def gen_conf(mol, useRandomCoords, randomSeed):
params = AllChem.ETKDGv3()
params.useRandomCoords = useRandomCoords
params.randomSeed = randomSeed
conf_stat = AllChem.EmbedMolecule(mol, params)
return mol, conf_stat
if not m:
return None
m = Chem.AddHs(m, addCoords=True)
m, conf_stat = gen_conf(m, useRandomCoords=False, randomSeed=seed)
if conf_stat == -1:
# if molecule is big enough and rdkit cannot generate a conformation - use params.useRandomCoords = True
m, conf_stat = gen_conf(m, useRandomCoords=True, randomSeed=seed)
if conf_stat == -1:
return None
AllChem.UFFOptimizeMolecule(m, maxIters=100)
return Chem.MolToMolBlock(m)
try:
mol = Chem.MolFromSmiles(smi)
mol_conf_sdf = convert2mol(mol)
except TypeError:
sys.stderr.write(f'incorrect SMILES {smi} for converting to molecule\n')
return None
if mol_conf_sdf is None:
return None
mol_conf_pdbqt = mk_prepare_ligand_string(mol_conf_sdf,
build_macrocycle=False,
# can do it True, but there is some problem with >=7-chains mols
add_water=False, merge_hydrogen=True, add_hydrogen=False,
# pH_value=7.4, can use this opt but some results are different in comparison to chemaxon
verbose=False, mol_format='SDF')
return mol_conf_pdbqt
def fix_pdbqt(pdbqt_block):
pdbqt_fixed = []
for line in pdbqt_block.split('\n'):
if not line.startswith('HETATM') and not line.startswith('ATOM'):
pdbqt_fixed.append(line)
continue
atom_type = line[12:16].strip()
# autodock vina types
if 'CA' in line[77:79]: #Calcium is exception
atom_pdbqt_type = 'CA'
else:
atom_pdbqt_type = re.sub('D|A', '', line[77:79]).strip() # can add meeko macrocycle types (G and \d (CG0 etc) in the sub expression if will be going to use it
if re.search('\d', atom_type[0]) or len(atom_pdbqt_type) == 2: #1HG or two-letter atom names such as CL,FE starts with 13
atom_format_type = '{:<4s}'.format(atom_type)
else: # starts with 14
atom_format_type = ' {:<3s}'.format(atom_type)
line = line[:12] + atom_format_type + line[16:]
pdbqt_fixed.append(line)
return '\n'.join(pdbqt_fixed)
def docking(ligands_pdbqt_string, receptor_pdbqt_fname, center, box_size, exhaustiveness, seed, n_poses, ncpu):
'''
:param ligands_pdbqt_string: str or list of strings
:param receptor_pdbqt_fname:
:param center: (x_float,y_float,z_float)
:param box_size: (size_x_int, size_y_int, size_z_int)
:param n_poses: int
:param ncpu: int
:return: (score_top, pdbqt_string_block)
'''
v = Vina(sf_name='vina', cpu=ncpu, seed=seed, no_refine=False, verbosity=0)
v.set_receptor(rigid_pdbqt_filename=receptor_pdbqt_fname)
v.set_ligand_from_string(ligands_pdbqt_string)
v.compute_vina_maps(center=center, box_size=box_size, spacing=1)
v.dock(exhaustiveness=exhaustiveness, n_poses=n_poses)
return v.energies(n_poses=1)[0][0], v.poses(n_poses=1)
def pdbqt2molblock(pdbqt_block, smi, mol_id):
mol_block = None
mol = Chem.MolFromPDBBlock('\n'.join([i[:66] for i in pdbqt_block.split('MODEL')[1].split('\n')]), removeHs=False, sanitize=False)
if mol:
try:
template_mol = Chem.MolFromSmiles(smi)
# explicit hydrogends are removed from carbon atoms (chiral hydrogens) to match pdbqt mol,
# e.g. [NH3+][C@H](C)C(=O)[O-]
template_mol = Chem.AddHs(template_mol, explicitOnly=True,
onlyOnAtoms=[a.GetIdx() for a in template_mol.GetAtoms() if
a.GetAtomicNum() != 6])
mol = AllChem.AssignBondOrdersFromTemplate(template_mol, mol)
Chem.SanitizeMol(mol)
Chem.AssignStereochemistry(mol, cleanIt=True, force=True, flagPossibleStereoCenters=True)
mol.SetProp('_Name', mol_id)
mol_block = Chem.MolToMolBlock(mol)
except Exception:
sys.stderr.write(f'Could not assign bond orders while parsing PDB: {mol_id}\n')
return mol_block
def process_mol_docking(mol_id, smi, receptor_pdbqt_fname, center, box_size, dbname, seed, exhaustiveness, n_poses, ncpu,
lock=None):
def insert_data(dbname, pdbqt_out, score, mol_block, mol_id):
with sqlite3.connect(dbname) as conn:
conn.execute("""UPDATE mols
SET pdb_block = ?,
docking_score = ?,
mol_block = ?,
time = CURRENT_TIMESTAMP
WHERE
id = ?
""", (pdbqt_out, score, mol_block, mol_id))
ligand_pdbqt = ligand_preparation(smi, seed)
if ligand_pdbqt is None:
return mol_id
score, pdbqt_out = docking(ligands_pdbqt_string=ligand_pdbqt, receptor_pdbqt_fname=receptor_pdbqt_fname,
center=center, box_size=box_size, exhaustiveness=exhaustiveness, seed=seed, n_poses=n_poses, ncpu=ncpu)
mol_block = pdbqt2molblock(pdbqt_out, smi, mol_id)
if mol_block is None:
pdbqt_out = fix_pdbqt(pdbqt_out)
mol_block = pdbqt2molblock(pdbqt_out, smi, mol_id)
if mol_block:
sys.stderr.write('PDBQT was fixed\n')
if lock is not None: # multiprocessing
with lock:
insert_data(dbname, pdbqt_out, score, mol_block, mol_id)
else: # dask
with daskLock(dbname):
insert_data(dbname, pdbqt_out, score, mol_block, mol_id)
return mol_id
def iter_docking(dbname, receptor_pdbqt_fname, protein_setup, protonation, exhaustiveness, seed, n_poses, ncpu, use_dask,
add_sql=None):
'''
This function should update output db with docked poses and scores. Docked poses should be stored as pdbqt (source)
and mol block. All other post-processing will be performed separately.
:param dbname:
:param receptor_pdbqt_fname:
:param protein_setup: text file with vina grid box parameters
:param protonation: True or False
:param exhaustiveness: int
:param seed: int
:param n_poses: int
:param ncpu: int
:param use_dask: indicate whether or not using dask cluster
:type use_dask: bool
:param add_sql: string with additional selection requirements which will be concatenated to the main SQL query
with AND operator, e.g. "iteration = 1".
:return:
'''
def get_param_from_config(config_fname):
config = {}
with open(config_fname) as inp:
for line in inp:
if not line.strip():
continue
param_name, value = line.replace(' ', '').split('=')
config[param_name] = float(value)
center, box_size = (config['center_x'], config['center_y'], config['center_z']),\
(config['size_x'], config['size_y'], config['size_z'])
return center, box_size
with sqlite3.connect(dbname) as conn:
cur = conn.cursor()
smi_field_name = 'smi_protonated' if protonation else 'smi'
sql = f"SELECT id, {smi_field_name} FROM mols WHERE docking_score IS NULL"
if isinstance(add_sql, str) and add_sql:
sql += f" AND {add_sql}"
smiles_dict = dict(cur.execute(sql))
if not smiles_dict:
return
center, box_size = get_param_from_config(protein_setup)
if use_dask:
i = 0
b = bag.from_sequence(smiles_dict.items(), npartitions=len(smiles_dict))
for i, mol_id in enumerate(b.starmap(process_mol_docking,
receptor_pdbqt_fname=receptor_pdbqt_fname, center=center,
box_size=box_size, dbname=dbname, exhaustiveness=exhaustiveness,
seed=seed, n_poses=n_poses, ncpu=1).compute(),
1):
if i % 100 == 0:
sys.stderr.write(f'\r{i} molecules were docked')
sys.stderr.write(f'\r{i} molecules were docked\n')
else:
pool = Pool(ncpu)
manager = Manager()
lock = manager.Lock()
i = 0
for i, mol_id in enumerate(pool.starmap(partial(process_mol_docking, dbname=dbname,
receptor_pdbqt_fname=receptor_pdbqt_fname,
center=center, box_size=box_size, seed=seed,
exhaustiveness=exhaustiveness, n_poses=n_poses, ncpu=1, lock=lock),
smiles_dict.items()), 1):
if i % 100 == 0:
sys.stderr.write(f'\r{i} molecules were docked')
sys.stderr.write(f'\r{i} molecules were docked\n')
def create_db(db_fname, input_fname, protonation, pdbqt_fname, protein_setup_fname, prefix):
conn = sqlite3.connect(db_fname)
cur = conn.cursor()
cur.execute("""CREATE TABLE IF NOT EXISTS mols
(
id TEXT PRIMARY KEY,
smi TEXT,
smi_protonated TEXT,
docking_score REAL,
pdb_block TEXT,
mol_block TEXT,
time TEXT
)""")
conn.commit()
data = [(f'{prefix}-{mol_name}' if prefix else mol_name,
Chem.MolToSmiles(mol, isomericSmiles=True))
for mol, mol_name in read_input(input_fname)]
cur.executemany(f'INSERT INTO mols (id, smi) VALUES(?, ?)', data)
conn.commit()
cur.execute("""CREATE TABLE IF NOT EXISTS setup
(
protonation INTEGER,
protonation_done INTEGER DEFAULT 0,
protein_pdbqt TEXT,
protein_setup TEXT
)""")
conn.commit()
pdbqt_string = open(pdbqt_fname).read()
setup_string = open(protein_setup_fname).read()
cur.execute('INSERT INTO setup VALUES (?,?,?,?)', (int(protonation), 0, pdbqt_string, setup_string))
conn.commit()
conn.close()
def save_sdf(db_fname):
sdf_fname = os.path.splitext(db_fname)[0] + '.sdf'
with open(sdf_fname, 'wt') as w:
conn = sqlite3.connect(db_fname)
cur = conn.cursor()
for mol_block, mol_name, score in cur.execute('SELECT mol_block, id, docking_score '
'FROM mols '
'WHERE docking_score IS NOT NULL '
'AND mol_block IS NOT NULL'):
w.write(mol_block + '\n')
w.write(f'> <ID>\n{mol_name}\n\n')
w.write(f'> <docking_score>\n{score}\n\n')
w.write('$$$$\n')
sys.stderr.write(f'Best poses were saved to {sdf_fname}\n')
def main():
parser = argparse.ArgumentParser(description='Perform docking of input molecules using Vina 1.2. The script '
'automates the whole pipeline: protonate molecules, creates '
'3D structures, converts to PDBQT format, run docking using a single '
'machine (multiprocessing) or a cluster of servers (dask), stores '
'the best scores and poses in PDBQT and MOL formats to DB.\n\n'
'It has multiple dependencies:\n'
' - rdkit - conda install -c conda-forge rdkit\n'
' - vina - conda install -c conda-forge -c ccsb-scripps vina or pip install vina\n'
' - openbabel - conda install -c conda-forge openbabel\n'
' - meeko - pip install git+https://github.com/forlilab/Meeko@7b1a60d9451eabaeb16b08a4a497cf8e695acc63\n'
' - Chemaxon cxcalc utility\n\n'
'To run on a single machine:\n'
' vina_dock.py -i input.smi -o output.db -p protein.pdbqt -s config.txt -c 4 -v\n\n'
'To run on several machines using dask ssh-cluster (on PBS system):\n'
' dask-ssh --hostfile $PBS_NODEFILE --nprocs 32 --nthreads 1 &\n'
' sleep 10\n'
' vina_dock.py -i input.smi -o output.db -p protein.pdbqt -s config.txt -c 4 -v --hostfile $PBS_NODEFILE\n\n'
' config.txt contains coordinates of a gridbox\n'
' $PBS_NODEFILE contains the list of addresses of servers\n',
formatter_class=RawTextArgumentDefaultsHelpFormatter)
parser.add_argument('-i', '--input', metavar='FILENAME', required=False, type=filepath_type,
help='input file with molecules (SMI, SDF, SDF.GZ, PKL). Maybe be omitted if output DB exists. '
'In this case calculations will be continued from interrupted point.')
parser.add_argument('-o', '--output', metavar='FILENAME', required=True, type=filepath_type,
help='output SQLite DB with scores and poses in PDBQT and MOL formats. It also stores '
'other information (input structures, protein pdbqt file and grid box config). '
'If output DB exists all other inputs will be ignored and calculations will be continued.')
parser.add_argument('--no_protonation', action='store_true', default=False,
help='disable protonation of molecules before docking. Protonation requires installed '
'cxcalc chemaxon utility. It will be omitted if output DB exists.')
parser.add_argument('-p', '--protein', metavar='protein.pdbqt', required=False, type=filepath_type,
help='input PDBQT file with a prepared protein. It will be omitted if output DB exists.')
parser.add_argument('-s', '--protein_setup', metavar='protein.log', required=False, type=filepath_type,
help='input text file with Vina docking setup. It will be omitted if output DB exists.')
parser.add_argument('-e', '--exhaustiveness', metavar='INTEGER', required=False, type=int, default=8,
help='exhaustiveness of docking search.')
parser.add_argument('--sdf', action='store_true', default=False,
help='save best docked poses to SDF file with the same name as output DB. Can be used with DB '
'of previously docked molecules to retrieve SDF file.')
parser.add_argument('--hostfile', metavar='FILENAME', required=False, type=filepath_type, default=None,
help='text file with addresses of nodes of dask SSH cluster. The most typical, it can be '
'passed as $PBS_NODEFILE variable from inside a PBS script. The first line in this file '
'will be the address of the scheduler running on the standard port 8786. If omitted, '
'calculations will run on a single machine as usual.')
parser.add_argument('--tmpdir', metavar='DIRNAME', required=False, type=filepath_type, default=None,
help='path to a dir where to store temporary files accessible to a program. If use dask this '
'argument must be specified because dask cannot access ordinary tmp locations.')
parser.add_argument('--seed', metavar='INTEGER', required=False, type=int, default=0,
help='seed to make results reproducible.')
parser.add_argument('--prefix', metavar='STRING', required=False, type=str, default=None,
help='prefix which will be added to all names. This might be useful if multiple runs are made '
'which will be analyzed together.')
parser.add_argument('--n_poses', default=50, type=int,
help='the number of poses generated by Vina, but only the top one will be stored in the output db.'
'This parameter seems affect docking results to find better top pose.')
parser.add_argument('-c', '--ncpu', default=1, type=cpu_type,
help='number of cpus.')
parser.add_argument('-v', '--verbose', action='store_true', default=False,
help='print progress to STDERR.')
args = parser.parse_args()
if args.tmpdir is not None:
tempfile.tempdir = args.tmpdir
if args.hostfile is not None:
dask.config.set({'distributed.scheduler.allowed-failures': 30})
dask_client = Client(open(args.hostfile).readline().strip() + ':8786')
if not os.path.isfile(args.output):
create_db(args.output, args.input, not args.no_protonation, args.protein, args.protein_setup, args.prefix)
add_protonation(args.output)
conn = sqlite3.connect(args.output)
protein = tempfile.NamedTemporaryFile(suffix='.pdbqt', mode='w', encoding='utf-8')
protein.write(list(conn.execute('SELECT protein_pdbqt FROM setup'))[0][0])
protein.flush()
setup = tempfile.NamedTemporaryFile(suffix='.txt', mode='w', encoding='utf-8')
setup.write(list(conn.execute('SELECT protein_setup FROM setup'))[0][0])
setup.flush()
protonation = list(conn.execute('SELECT protonation FROM setup'))[0][0]
iter_docking(dbname=args.output, receptor_pdbqt_fname=protein.name, protein_setup=setup.name,
protonation=protonation, exhaustiveness=args.exhaustiveness, seed=args.seed, n_poses=args.n_poses, ncpu=args.ncpu,
use_dask=args.hostfile is not None)
if args.sdf:
save_sdf(args.output)
if __name__ == '__main__':
main()