# **prepare current working directory (cwd)**

In [None]:
%%bash
# remove everything except this notebook
# WARNING: destructive. `shopt -s extglob` enables the !(pattern) glob, which here
# expands to every entry of the current directory whose name is not exactly
# "notebook.ipynb"; all of those entries are deleted recursively.
# NOTE(review): assumes this notebook file is literally named "notebook.ipynb" and
# that the kernel's cwd is the notebook's own directory -- confirm before running.
shopt -s extglob
rm -rf !("notebook.ipynb")
# to test any error in the terminal, use:
# ipython3 -c "%run notebook.ipynb"

In [None]:
# import tools
import sys, os; sys.path.append(os.path.realpath(os.path.join(os.getcwd(), "../../")))
from tools import *
# while updating modules
# import importlib; importlib.reload(visualize_molecules)
# import importlib; importlib.reload(utils)

import ipywidgets
import glob
import ast
import re
from tqdm import tqdm
import pandas as pd

import rdkit
from rdkit import Chem
import meeko
from meeko import MoleculePreparation
from meeko import PDBQTMolecule
from meeko import RDKitMolCreate
from pymol import cmd
from vina import Vina

import shutil
from pdbfixer import PDBFixer
from openmm.app import PDBFile

# Global variables
# dirs_dict maps a logical directory role to its path (relative to the cwd).
# Only the receptor directories are active in this notebook; the commented-out
# entries are roles that this notebook does not use.
dirs_dict = {
             # 'raw_ligands':'./raw_ligands/', 
             # 'prepared_ligands':'./prepared_ligands/', 
             'raw_receptors':'./raw_receptors/', 
             'prepared_receptors':'./prepared_receptors/', 
             # 'vina_scoring':'./vina_scoring/', 
             # 'vina_docking':'./vina_docking/'
            }
# NOTE(review): vina_seed is not referenced anywhere else in the visible notebook;
# presumably kept for parity with sibling notebooks -- confirm.
vina_seed = 1

In [None]:
# create empty directories
# Passes the role -> path mapping defined in dirs_dict to the project helper.
# NOTE(review): the helper presumably creates (or empties) each listed path; its
# implementation is not visible here -- confirm.
directory_scraping.prepare_directory_from_dict(dirs_dict)

In [None]:
# list directory before execution
# (shells out to the external `tree` command, which must be installed)
!tree .

# **read and prepare receptors with pockets**

## read pdb files from the download directories and copy them into raw_receptors

In [None]:
external_pdb_dir = "../../../*_mutations/*_downloads/"

file_paths, file_names, subdirectory_names = directory_scraping.list_files_in_dir_recursively(external_pdb_dir, search_pattern='*.pdb')

for i,file_path in enumerate(tqdm(file_paths)):
    file_name = file_names[i]
    subdirectory_name = subdirectory_names[i]
    # shutil.copy(pdb_file, dirs_dict['raw_receptors'])
    shutil.copy(file_path, dirs_dict['raw_receptors'] + subdirectory_name + '_' + file_name)
    
# 112 mutations * approx 6 files per mutation = approx 672 files
!ls -1q {dirs_dict['raw_receptors']} | wc -l

## clean receptors and convert them to pdbqt

For comments and more information about this section, see the notebook in the directory 'example03 - prepare receptors'.

In [None]:
receptor_paths, receptor_names = directory_scraping.list_files_in_dir(directory=dirs_dict['raw_receptors'], search_pattern='*.pdb*')

errors_and_warnings = []
for i,receptor_path in enumerate(tqdm(receptor_paths)):
    # print("\n", "-"*50, "\n\n", receptor_path, "\n")
    receptor_name = receptor_names[i];
    # check if output pdbqt file already exists
    if os.path.exists(dirs_dict['prepared_receptors'] + receptor_name + ".pdbqt"):
        continue
    # Summarizes the contents of a PDB file, like the wc command in UNIX.
    # !pdb_wc $receptor_path
    # -------------------------------------------------------------------------------
    # step01: remove extra chains if more than one
    fixer = PDBFixer(filename=receptor_path)
    numChains = len(list(fixer.topology.chains()))
    if numChains > 1:
        step01_out = dirs_dict['prepared_receptors'] + receptor_name + "_step01.pdb"
        # !pdb_selchain -A {receptor_path} > {step01_out}
        fixer.removeChains(range(1, numChains))
        PDBFile.writeFile(fixer.topology, fixer.positions, open(step01_out, 'w'))
        errors_and_warnings.append(("warning", "n_chains=" + str(numChains), receptor_name))
    else:
        step01_out = receptor_path
    # continue if last step didn't work
    if not os.path.exists(step01_out):
        errors_and_warnings.append(("error", "performing step_01", receptor_name))
        continue
    # -------------------------------------------------------------------------------
    # step02: remove water molecules
    # [-U]  cleanup type:
    #              'nphs': merge charges and remove non-polar hydrogens
    #              'lps': merge charges and remove lone pairs
    #              'waters': remove water residues
    #              'nonstdres': remove chains composed entirely of residues of
    #                       types other than the standard 20 amino acids
    #              'deleteAltB': remove XX@B atoms and rename XX@A atoms->XX
    #              (default is 'nphs_lps_waters_nonstdres')
    step02_out = dirs_dict['prepared_receptors'] + receptor_name + "_step02.pdbqt"
    !prepare_receptor -r {step01_out} -o {step02_out} -U 'waters'
    # continue if last step didn't work
    if not os.path.exists(step02_out):
        errors_and_warnings.append(("error", "performing step_02", receptor_name))
        continue
    # -------------------------------------------------------------------------------
    # [-e]  delete every nonstd residue from any chain
    #               'True': any residue whose name is not in this list:
    #                       ['CYS','ILE','SER','VAL','GLN','LYS','ASN', 
    #                       'PRO','THR','PHE','ALA','HIS','GLY','ASP', 
    #                       'LEU', 'ARG', 'TRP', 'GLU', 'TYR','MET', 
    #                       'HID', 'HSP', 'HIE', 'HIP', 'CYX', 'CSS']
    #               will be deleted from any chain. 
    #               NB: there are no  nucleic acid residue names at all 
    #               in the list and no metals. 
    #              (default is False which means not to do this)
    step03_out = dirs_dict['prepared_receptors'] + receptor_name + "_step03.pdbqt"
    !prepare_receptor -r {step02_out} -o {step03_out} -e
    # continue if last step didn't work
    if not os.path.exists(step03_out):
        errors_and_warnings.append(("error", "performing step_03", receptor_name))
        continue
    # -------------------------------------------------------------------------------
    # [-A]  type(s) of repairs to make: 
    #              'bonds_hydrogens': build bonds and add hydrogens 
    #              'bonds': build a single bond from each atom with no bonds to its closest neighbor
    #              'hydrogens': add hydrogens
    #              'checkhydrogens': add hydrogens only if there are none already
    #              'None': do not make any repairs 
    #              (default is 'None')
    step04_out = dirs_dict['prepared_receptors'] + receptor_name + "_step04.pdbqt"
    !prepare_receptor -r {step03_out} -o {step04_out} -A 'hydrogens'
    # continue if last step didn't work
    if not os.path.exists(step04_out):
        errors_and_warnings.append(("error", "performing step_04", receptor_name))
        continue
    # -------------------------------------------------------------------------------
    # change name to last step and remove all middle-stage files
    !mv {step04_out} {dirs_dict['prepared_receptors']}{receptor_name}".pdbqt"
    !rm {dirs_dict['prepared_receptors']}*_step*
    # -------------------------------------------------------------------------------
# print and count files in output dir
%ls {dirs_dict['prepared_receptors']} -l
!ls -1q {dirs_dict['prepared_receptors']} | wc -l

In [None]:
# Display the (severity, message, receptor_name) tuples collected while preparing receptors.
errors_and_warnings
# to list only the distinct messages instead, use:
# set([x[1] for x in errors_and_warnings])

## find pockets in receptors

For comments and more information about this section, see the notebook in the directory 'example07 - find pockets in prepared_receptors'.

In [None]:
# List prepared_receptors
# NOTE: this rebinds receptor_paths / receptor_names (which held the raw receptors
# in the preparation cell) to the files in prepared_receptors matching '*.pdb*'.
receptor_paths, receptor_names = directory_scraping.list_files_in_dir(directory=dirs_dict['prepared_receptors'], search_pattern='*.pdb*')

In [None]:
# Execute fpocket for each receptor
for i,receptor_path in enumerate(tqdm(receptor_paths)):
    # print("\n", "-"*50, "\n\n", receptor_path, "\n")
    receptor_name = receptor_names[i]; #print(receptor_name)
    # -------------------------------------------------------------------------------
    # step01: compute pockets for each receptor
    !fpocket -f {receptor_path} -d > 'prepared_receptors/fp_'{receptor_name}'.csv'
    # print("fpocket finished running")
print("\n", "-"*50, "\n")
%ls {dirs_dict['prepared_receptors']} -l

In [None]:
# number of prepared receptor files that were passed to fpocket
len(receptor_paths)

In [None]:
# Summarize fpocket data into tables
def _natural_sort_key(path):
    """Sort key that orders the digit runs of a file name numerically.

    Plain lexicographic sorting places 'pocket10' before 'pocket2'; because the
    loop below derives cav_id from the position in the sorted list, that would
    pair pockets with the wrong table rows as soon as there are 10 or more.
    """
    # re.split with a capture group alternates text / digit tokens, so tokens at
    # the same position always have the same type and compare safely
    return [int(token) if token.isdigit() else token
            for token in re.split(r'(\d+)', os.path.basename(path))]


list_pockets_data = []
for receptor_path, receptor_name in zip(tqdm(receptor_paths), receptor_names):
    # NOTE(review): assumes fpocket wrote its results to '<receptor_name>_out/'
    # next to the prepared receptor and that the .pqr files found there sort into
    # cav_id order -- confirm against the fpocket version in use.
    pocket_paths = sorted(glob.glob(dirs_dict['prepared_receptors'] + receptor_name + '_out/*.pqr'),
                          key=_natural_sort_key)
    pockets_data = pd.read_csv(dirs_dict['prepared_receptors']+'fp_' + receptor_name + '.csv',sep=' ',index_col=False).set_index('cav_id')
    for cav_id, pocket_path in enumerate(pocket_paths, start=1):
        # fail loudly instead of silently appending a row for an unknown pocket
        if cav_id not in pockets_data.index:
            raise KeyError("cav_id " + str(cav_id) + " not found in fpocket table of " + receptor_name)
        # compute the box center and size out from the pocket
        pocket_object = str(cav_id)
        cmd.load(filename=pocket_path,format='pqr',object=pocket_object)
        try:
            center,size = utils.getbox(selection=pocket_object,extending=0,software='vina')
        finally:
            # Drop the object again: the same name is reused for every receptor, and
            # loading into an existing PyMOL object appends a new state, which could
            # let the box of this pocket include pockets of earlier receptors.
            cmd.delete(pocket_object)
        center = [center['center_x'], center['center_y'], center['center_z']]
        size = [size['size_x'], size['size_y'], size['size_z']]
        size_max = max(size)
        pockets_data.loc[cav_id,'receptor_name'] = receptor_name
        # center / size are stored as the string form of a 3-element list
        pockets_data.loc[cav_id,'center'] = str(center)
        pockets_data.loc[cav_id,'size'] = str(size)
        pockets_data.loc[cav_id,'size_max'] = size_max
    # make the index unique across receptors: '<receptor_name>_<cav_id>'
    # (the index keeps its name 'cav_id', which becomes the first CSV column header)
    pockets_data.index = receptor_name + '_' + pockets_data.index.map(str)
    # append it to list of dataframes
    list_pockets_data.append(pockets_data)

# Merge pockets data and sort it by the drugability score
merged_pockets_data = pd.concat(list_pockets_data).sort_values(by=['drug_score'], ascending=False)
# Save merged data to csv file (tab-separated; it is re-read by the next cell)
merged_pockets_data.to_csv(dirs_dict['prepared_receptors']+'merged_pockets_data.csv', sep='\t')

In [None]:
# Reload the merged table from disk and show it in full, restricted to the columns
# needed to decide which pockets will be used from here onwards.
merged_pockets_csv = dirs_dict['prepared_receptors'] + 'merged_pockets_data.csv'
# other columns available in the table if needed:
# 'volume', 'hydrophobicity_score', 'volume_score', 'charge_score', 'polarity_score'
columns_to_keep = ['cav_id', 'drug_score', 'receptor_name', 'center', 'size', 'size_max']
merged_pockets_data = pd.read_csv(merged_pockets_csv, sep='\t')[columns_to_keep]
# temporarily lift pandas' row / column display limits for this one output
with pd.option_context('display.max_columns', None, 'display.max_rows', None):
    display(merged_pockets_data)

# **export output to a safe directory**

In [None]:
# Copy everything inside prepared_receptors to a location outside the cwd, because
# the final cell deletes the contents of the cwd.
# NOTE(review): assumes the destination directory already exists -- confirm.
!cp -r {dirs_dict['prepared_receptors']}* ../../../data/prev_prepared_molecules/prev_prepared_receptors/

In [None]:
%%bash
# remove everything except this notebook
# WARNING: destructive. Deletes every entry of the current directory whose name is
# not exactly "notebook.ipynb", including raw_receptors and prepared_receptors;
# run the export cell first if the results are needed.
shopt -s extglob
rm -rf !("notebook.ipynb")

# clear all output and save this notebook