Merge pull request #426 from ExcitedStates/new_qfit_ligand

New qfit ligand
ExcitedStates · May 15, 2024 · 2216f61 · 2216f61
2 parents 9376358 + 13a4ab1
commit 2216f61
Show file tree

Hide file tree

Showing 9 changed files with 995 additions and 317 deletions.
diff --git a/README.md b/README.md
@@ -7,6 +7,7 @@ qFit is a collection of programs for modeling multi-conformer protein structures
 
 Electron density maps obtained from high-resolution X-ray diffraction data are a spatial and temporal average of all conformations within the crystal. qFit evaluates an extremely large number of combinations of sidechain conformers, backbone fragments and small-molecule ligands to locally explain the electron density.
 
+
 ## Installation
 
 1) Download qFit
@@ -101,6 +102,25 @@ After *multiconformer_model2.pdb* has been generated, refine this model using:
 
 More advanced features of qFit (modeling single residue, more advanced options, and further explainations) are explained in [TUTORIAL](example/TUTORIAL.md).
 
+To model alternate conformations of ligands using qFit, execute the following command:
+
+`qfit_ligand [COMPOSITE_OMIT_MAP_FILE] [PDB_FILE] -l [LABELS] [SELECTION] -sm [SMILES]`
+
+This command facilitates the incorporation of alternate ligand conformations into your protein model. The results are outputted to two files: *multiconformer_ligand_bound_with_protein.pdb*, which is the multiconformer model of the protein-ligand complex, and *multiconformer_ligand_only.pdb*, which is the multiconformer model of the ligand alone. 
+
+After running qFit-ligand, it is recommended to perform a final refinement using the script found in [scripts](scripts/post). Run this in the same directory as your models.
+
+If you wish to specify the number of ligand conformers for qFit to sample, use the flag `-nc [NUM_CONFS]`. The default number is set to 10,000. 
+
+Using the example 4MS6:
+
+`qfit_ligand example/qfit_ligand_example/4ms6_composit_map.mtz example/qfit_ligand_example/4ms6.pdb -l 2FOFCWT,PH2FOFCWT A,702 -sm 'C1C[C@H](NC1)C(=O)CCC(=O)N2CCC[C@H]2C(=O)O' -nc 10000`
+
+To refine *multiconformer_ligand_bound_with_protein.pdb*, use the following command
+
+`qfit_final_refine_ligand.sh 4ms6.mtz`
+
+
 ## Citations
 If you use this software, please cite: 
 - [Wankowicz SA, Ravikumar A, Sharma S, Riley BT, Raju A, Hogan DW, van den Bedem H, Keedy DA, & Fraser JS. Uncovering Protein Ensembles: Automated Multiconformer Model Building for X-ray Crystallography and Cryo-EM. bioRxiv. (2023).](https://www.biorxiv.org/content/10.1101/2023.06.28.546963v2.abstract)

diff --git a/environment.yml b/environment.yml
@@ -12,3 +12,4 @@ dependencies:
   - pyparsing>=2.2.0
   - cvxopt
   - tqdm
+  - rdkit
diff --git a/example/README.md b/example/README.md
@@ -87,11 +87,19 @@ Using the example 3K0N, spawning 30 parallel processes:
 To model alternate conformers of ligands, the command line tool *qfit_ligand*
 should be used:
 
-`qfit_ligand [COMPOSITE_OMIT_MAP_FILE] -l [LABEL] [PDB_FILE] [CHAIN,LIGAND]`
+`qfit_ligand [COMPOSITE_OMIT_MAP_FILE] -l [LABEL] [PDB_FILE] [CHAIN,LIGAND] -sm [SMILES]`
 
 Where *LIGAND* corresponds to the numeric identifier of the ligand on the PDB
-(aka res. number). The main output file is named *multiconformer_model_[CHAIN]_[LIGAND].pdb*
+(aka res. number). The main output file is named *multiconformer_ligand_bound_with_protein.pdb*
+
+
+If you wish to specify the number of ligand conformers for qFit to sample, use the flag `-nc [NUM_CONFS]`. The default number is set to 10,000. 
 
 Using the example 4MS6:
 
-`qfit_ligand qfit_ligand_example/4ms6_composite_map.mtz -l 2FOFCWT,PH2FOFCWT qfit_ligand_example/4ms6.pdb A,702`
+`qfit_ligand qfit_ligand_example/4ms6_composite_map.mtz -l 2FOFCWT,PH2FOFCWT qfit_ligand_example/4ms6.pdb A,702 -sm 'C1C[C@H](NC1)C(=O)CCC(=O)N2CCC[C@H]2C(=O)O' -nc 10000`
+
+
+To refine *multiconformer_ligand_bound_with_protein.pdb*, use the following command
+
+`qfit_final_refine_ligand.sh 4ms6.mtz`
diff --git a/requirements.txt b/requirements.txt
@@ -3,3 +3,4 @@ cvxpy
 pyscipopt
 scipy
 scikit-learn
+rdkit
diff --git a/scripts/post/qfit_final_refine_ligand.sh b/scripts/post/qfit_final_refine_ligand.sh
@@ -0,0 +1,173 @@
+#!/bin/bash
+# This script works with Phenix version 1.20.
+
+qfit_usage() {
+  echo >&2 "Usage:";
+  echo >&2 "  $0 mapfile.mtz [multiconformer_ligand_bound_with_protein.pdb] [multiconformer_ligand_only.pdb]";
+  echo >&2 "";
+  echo >&2 "mapfile.mtz, multiconformer_ligand_bound_with_protein.pdb, and multiconformer_ligand_only.pdb MUST exist in this directory.";
+  echo >&2 "Outputs will be written to mapfile_qFit.{pdb|mtz|log}.";
+  exit 1;
+}
+
+#___________________________SOURCE__________________________________
+# Check that Phenix is loaded
+if [ -z "${PHENIX}" ]; then
+  echo >&2 "I require PHENIX but it's not loaded.";
+  echo >&2 "Please \`source phenix_env.sh\` from where it is installed.";
+  exit 1;
+else
+  export PHENIX_OVERWRITE_ALL=true;
+fi
+
+# Check that qFit is loaded.
+command -v remove_duplicates >/dev/null 2>&1 || {
+  echo >&2 "I require qFit (remove_duplicates) but it's not loaded.";
+  echo >&2 "Please activate the environment where qFit is installed.";
+  echo >&2 "   e.g. conda activate qfit"
+  exit 1;
+}
+
+# Assert required files exist
+mapfile=$1
+multiconf=${2:-multiconformer_ligand_bound_with_protein.pdb}
+multiconf_lig=${2:-multiconformer_ligand_only.pdb}
+echo "mapfile              : ${mapfile} $([[ -f ${mapfile} ]] || echo '[NOT FOUND]')";
+echo "qfit unrefined model : ${multiconf} $([[ -f ${multiconf} ]] || echo '[NOT FOUND]')";
+echo "qfit unrefined ligand model : ${multiconf_lig} $([[ -f ${multiconf_lig} ]] || echo '[NOT FOUND]')";
+
+
+echo "";
+if [[ ! -f "${mapfile}" ]] || [[ ! -f "${multiconf}" ]]; then
+  qfit_usage;
+fi
+pdb_name="${mapfile%.mtz}"
+echo >&2 "PDB name is "${pdb_name}". ";
+echo "${pdb_name}"
+
+
+#__________________________________DETERMINE RESOLUTION AND (AN)ISOTROPIC REFINEMENT__________________________________
+mtzmetadata=`phenix.mtz.dump "${pdb_name}.mtz"`
+resrange=`grep "Resolution range:" <<< "${mtzmetadata}"`
+
+echo "${resrange}"
+
+res=`echo "${resrange}" | cut -d " " -f 4 | cut -c 1-5`
+res1000=`echo $res | awk '{tot = $1*1000}{print tot }'`
+
+if (( $res1000 < 1550 )); then
+  adp='adp.individual.anisotropic="not (water or element H)"'
+else
+  adp='adp.individual.isotropic=all'
+fi
+
+#__________________________________DETERMINE FOBS v IOBS v FP__________________________________
+# List of Fo types we will check for
+obstypes=("FP" "FOBS" "F-obs" "I" "IOBS" "I-obs" "F(+)" "I(+)" "FSIM")
+
+# Get amplitude fields
+ampfields=`grep -E "amplitude|intensity|F\(\+\)|I\(\+\)" <<< "${mtzmetadata}"`
+ampfields=`echo "${ampfields}" | awk '{$1=$1};1' | cut -d " " -f 1`
+
+# Clear xray_data_labels variable
+xray_data_labels=""
+
+# Is amplitude an Fo?
+for field in ${ampfields}; do
+  # Check field in obstypes
+  if [[ " ${obstypes[*]} " =~ " ${field} " ]]; then
+    # Check SIGFo is in the mtz too!
+    if grep -F -q -w "SIG$field" <<< "${mtzmetadata}"; then
+      xray_data_labels="${field},SIG${field}";
+      break
+    fi
+  fi
+done
+if [ -z "${xray_data_labels}" ]; then
+  echo >&2 "Could not determine Fo field name with corresponding SIGFo in .mtz.";
+  echo >&2 "Was not among "${obstypes[*]}". Please check .mtz file\!";
+  exit 1;
+else
+  echo "data labels: ${xray_data_labels}"
+  # Start writing refinement parameters into a parameter file
+  echo "refinement.input.xray_data.labels=$xray_data_labels" > ${pdb_name}_refine.params
+fi
+
+#_____________________________DETERMINE R FREE FLAGS______________________________
+gen_Rfree=True
+rfreetypes="FREE R-free-flags"
+for field in ${rfreetypes}; do
+  if grep -F -q -w $field <<< "${mtzmetadata}"; then
+    gen_Rfree=False;
+    echo "Rfree column: ${field}";
+    echo "refinement.input.xray_data.r_free_flags.label=${field}" >> ${pdb_name}_refine.params
+    break
+  fi
+done
+echo "refinement.input.xray_data.r_free_flags.generate=${gen_Rfree}" >> ${pdb_name}_refine.params
+
+#__________________________________REMOVE DUPLICATE HET ATOMS__________________________________
+remove_duplicates "${multiconf}"
+
+#__________________________________NORMALIZE OCCUPANCIES________________________________________
+echo "Normalizing occupancies, before refinement "
+
+redistribute_cull_low_occupancies -occ 0.09 "${multiconf}.fixed"
+mv -v "${multiconf}.f_norm.pdb" "${multiconf}.fixed"
+
+
+#________________________________REMOVE TRAILING HYDROGENS___________________________________
+phenix.pdbtools remove="element H" "${multiconf}.fixed"
+
+#__________________________________GET CIF FILE__________________________________
+echo "Getting the cif file with ready_set/elbow" 
+phenix.ready_set hydrogens=false \
+                 trust_residue_code_is_chemical_components_code=true \
+                 pdb_file_name="${multiconf}.f_modified.pdb"
+
+# If ready_set doesn't generate a ligand cif file, use elbow
+if [ ! -f "${multiconf}.f_modified.ligands.cif" ]; then
+  phenix.elbow multiconformer_ligand_only.pdb --output ${multiconf}.f_modified.ligands
+fi
+
+if [ ! -f "${multiconf}.f_modified.ligands.cif" ]; then
+  echo "Ligand CIF generation failed"
+fi
+
+#__________________________________FINAL REFINEMENT__________________________________
+
+# Write refinement parameters into parameters file
+echo "refinement.refine.strategy=*individual_sites *individual_adp *occupancies"  >> ${pdb_name}_refine.params
+echo "refinement.output.prefix=${pdb_name}"      >> ${pdb_name}_refine.params
+echo "refinement.output.serial=2"                >> ${pdb_name}_refine.params
+echo "refinement.main.number_of_macro_cycles=5"  >> ${pdb_name}_refine.params
+echo "refinement.main.nqh_flips=True"            >> ${pdb_name}_refine.params
+echo "refinement.refine.${adp}"                  >> ${pdb_name}_refine.params
+echo "refinement.output.write_maps=False"        >> ${pdb_name}_refine.params
+echo "refinement.hydrogens.refine=riding"        >> ${pdb_name}_refine.params
+echo "refinement.main.ordered_solvent=True"      >> ${pdb_name}_refine.params
+echo "refinement.target_weights.optimize_xyz_weight=true"  >> ${pdb_name}_refine.params
+echo "refinement.target_weights.optimize_adp_weight=true"  >> ${pdb_name}_refine.params
+
+echo "refinement.input.monomers.file_name='${multiconf}.f_modified.ligands.cif'" >> ${pdb_name}_refine.params
+
+phenix.refine  "${multiconf}.f_modified.pdb" \
+               "${pdb_name}.mtz" \
+               "${pdb_name}_refine.params" \
+               --overwrite
+
+#__________________________________NAME FINAL FILES__________________________________
+cp -v "${pdb_name}_002.pdb" "${pdb_name}_qFit_ligand.pdb"
+cp -v "${pdb_name}_002.mtz" "${pdb_name}_qFit_ligand.mtz"
+cp -v "${pdb_name}_002.log" "${pdb_name}_qFit_ligand.log"
+
+#__________________________COMMENTARY FOR USER_______________________________________
+if [ -f "${pdb_name}_002.pdb" ]; then
+   echo ""
+   echo "[qfit_final_refine_ligand] Refinement is complete."
+   echo "                         Please be sure to INSPECT your refined structure, especially all new altconfs."
+   echo "                         The output can be found at ${pdb_name}_qFit_ligand.(pdb|mtz|log) ."
+else
+   echo "Refinement did not complete."
+   echo "Please check for failure reports by examining log files."
+fi
diff --git a/setup.py b/setup.py
@@ -32,6 +32,7 @@ def main():
         "pandas>=1.2",
         "pyparsing>=2.2.0",
         "tqdm>=4.0.0",
+        "rdkit",
     ]
 
     setup(
@@ -70,6 +71,7 @@ def main():
         scripts=[
             "scripts/post/qfit_final_refine_xray.sh",
             "scripts/post/qfit_final_refine_cryoem.sh",
+            "scripts/post/qfit_final_refine_ligand.sh",
             "scripts/post/find_largest_ligand.py",
             "scripts/post/find_altlocs_near_ligand.py",
             "scripts/post/get_lig_chain_res.py",