Merge pull request #488 from Ecogenomics/staging

Merge staging
Ecogenomics · Mar 16, 2023 · ad342d3 · ad342d3
2 parents 4970f4a + 6cce4b1
commit ad342d3
Show file tree

Hide file tree

Showing 7 changed files with 52 additions and 16 deletions.
diff --git a/docs/src/faq.rst b/docs/src/faq.rst
@@ -11,6 +11,14 @@ GTDB-Tk uses the GTDB taxonomy (`https://gtdb.ecogenomic.org/ <https://gtdb.ecog
 This taxonomy is similar, but not identical to NCBI and Silva.
 In many cases the GTDB taxonomy more strictly follows the nomenclatural rules for rank suffixes which is why there is Nitrospirota instead of Nitrospirae.
 
+Can you combine the bacterial and archaeal trees into a single tree?
+--------------------------------------------------------------------
+
+
+The bacterial and archaeal trees are inferred from different marker genes. Currently, the correct rootings of these trees remain an open area of research.
+GTDB-Tk does not provide a tool to merge the trees, but it is possible to artificially combine them by manipulating the Newick files.
+One solution would be to use `DendroPy <https://dendropy.org/>`_, a Python library used for phylogenetic computing.
+
 
 .. _faq_pplacer:
 
@@ -92,6 +100,6 @@ From GTDB-Tk v2.0.0 the conda environment will automatically have FastANI v1.3 i
 
 **Docker:**
 
-From GTDB-Tk v2.2.2 the Docker cotnainer will automatically have FastANI v1.32 installed. Otherwise, manually
+From GTDB-Tk v2.2.2 the Docker container will automatically have FastANI v1.32 installed. Otherwise, manually
 build the container from the `Dockerfile <https://github.com/Ecogenomics/GTDBTk/blob/master/Dockerfile>`_, making
 sure to specify FastANI v1.32.
diff --git a/gtdbtk/__init__.py b/gtdbtk/__init__.py
@@ -29,4 +29,4 @@
 __status__ = 'Production'
 __title__ = 'GTDB-Tk'
 __url__ = 'https://github.com/Ecogenomics/GTDBTk'
-__version__ = '2.2.4'
+__version__ = '2.2.5'
diff --git a/gtdbtk/classify.py b/gtdbtk/classify.py
@@ -331,6 +331,7 @@ def run(self,
             debugopt=False,
             fulltreeopt=False,
             skip_ani_screen=False,
+            genes=False,
             no_mash=False,
             mash_k=Config.MASH_K_VALUE,
             mash_v=Config.MASH_V_VALUE,
@@ -351,7 +352,10 @@ def run(self,
         # rest of the pipeline.
         mash_classified_user_genomes = {}
         if not skip_ani_screen:
-            if not no_mash:
+            if genes:
+                self.logger.warning('The --genes flag is set to True. The ANI screening steps will be skipped.')
+                skip_ani_screen = True
+            elif not no_mash:
                 # if mash_db finishes with a backslash, it should be considered a directory
                 if mash_db.endswith('/'):
                     make_sure_path_exists(mash_db)
@@ -511,8 +515,6 @@ def run(self,
                     user_msa_file = prescreened_msa_file_path
 
 
-
-
                 # Write the RED dictionary to disk (intermediate file).
                 red_dict_file.write()
 
@@ -604,10 +606,10 @@ def run(self,
                                 disappearing_genomes_file.add_genome(disappearing_genome, tree_iter)
 
                         class_level_classification, classified_user_genomes,warning_counter = self._parse_tree(mrca_lowtree, genomes, msa_dict,
-                                                                      percent_multihit_dict, tln_table_summary_file.genomes,
+                                                                      percent_multihit_dict,genes, tln_table_summary_file.genomes,
                                                                       bac_ar_diff, submsa_file_path, red_dict_file.data,
                                                                       summary_file, pplacer_taxonomy_dict,warning_counter,
-                                                                      high_classification, debug_file,skip_ani_screen, debugopt,
+                                                                      high_classification, debug_file, debugopt,
                                                                       tree_mapping_file, tree_iter,
                                                                       tree_mapping_dict_reverse)
 
@@ -1144,9 +1146,9 @@ def _classify_red_topology(self, tree, msa_dict, percent_multihit_dict, trans_ta
 
         return class_level_classification,warning_counter
 
-    def _parse_tree(self, tree, genomes, msa_dict, percent_multihit_dict, trans_table_dict, bac_ar_diff,
+    def _parse_tree(self, tree, genomes, msa_dict, percent_multihit_dict,genes, trans_table_dict, bac_ar_diff,
                     user_msa_file, red_dict, summary_file, pplacer_taxonomy_dict,warning_counter, high_classification,
-                    debug_file,prescreening, debugopt, tree_mapping_file, tree_iter, tree_mapping_dict_reverse):
+                    debug_file, debugopt, tree_mapping_file, tree_iter, tree_mapping_dict_reverse):
         # Genomes can be classified by using FastANI or RED values
         # We go through all leaves of the tree. if the leaf is a user
         # genome we take its parent node and look at all the leaves
@@ -1156,8 +1158,10 @@ def _parse_tree(self, tree, genomes, msa_dict, percent_multihit_dict, trans_tabl
         tt = TreeTraversal()
 
         self.logger.log(Config.LOG_TASK, 'Traversing tree to determine classification method.')
-        fastani_verification, qury_nodes = self._get_fastani_verification(tree, self.reference_ids, tt)
-
+        if genes:
+            fastani_verification = {}
+        else:
+            fastani_verification, qury_nodes = self._get_fastani_verification(tree, self.reference_ids, tt)
 
         #DEBUG: Skip FastANI step
         #fastani_verification = {}
@@ -1176,14 +1180,15 @@ def _parse_tree(self, tree, genomes, msa_dict, percent_multihit_dict, trans_tabl
         else:
             all_fastani_dict = {}
 
-
-
         classified_user_genomes, unclassified_user_genomes,warning_counter = self._sort_fastani_results(
             fastani_verification, pplacer_taxonomy_dict, all_fastani_dict, msa_dict, percent_multihit_dict,
             trans_table_dict, bac_ar_diff,warning_counter, summary_file)
         #if not prescreening:
-        self.logger.info(f'{len(classified_user_genomes):,} genome(s) have '
+        if not genes:
+            self.logger.info(f'{len(classified_user_genomes):,} genome(s) have '
                              f'been classified using FastANI and pplacer.')
+        else:
+            self.logger.info('ANI classification has been skipped (--genes option used).')
 
         if tree_mapping_file:
             for genome in classified_user_genomes.keys():

diff --git a/gtdbtk/cli.py b/gtdbtk/cli.py
@@ -34,7 +34,8 @@ def __temp_dir(group):
 
 def __genes(group):
     group.add_argument('--genes', action='store_true', default=False,
-                       help='indicates input files contain called genes (skip gene calling)')
+                       help='indicates input files contain called genes (skip gene calling).Warning: This flag will also skip '
+                            'the ANI comparison steps (ani_screen and classification).')
 
 
 def __genome_dir(group):
@@ -478,6 +479,7 @@ def get_main_parser():
             __cpus(grp)
             __pplacer_cpus(grp)
             __scratch_dir(grp)
+            __genes(grp)
             __full_tree(grp)
             __min_af(grp)
             __temp_dir(grp)

diff --git a/gtdbtk/config/config.py b/gtdbtk/config/config.py
@@ -7,6 +7,8 @@
 """
 try:
     GENERIC_PATH = os.environ['GTDBTK_DATA_PATH']
+    #expand the variables in the path
+    GENERIC_PATH = os.path.expandvars(GENERIC_PATH)
 except KeyError:
     print('\n' + '=' * 80)
     print(' ERROR '.center(80))

diff --git a/gtdbtk/external/mash.py b/gtdbtk/external/mash.py
@@ -210,7 +210,11 @@ def _load_metadata(self):
 
     def _is_consistent(self):
         """Returns True if the sketch was generated from the genomes."""
-        return set(self.data.keys()) == set(self.genomes.values())
+        # to compare the consistency of the sketch file, we need to compare only the file names, not the full paths
+        # the mash_db can be moved to another folder in the cloud so the full path will be different
+        data_keys = set(map(os.path.basename,self.data.keys()))
+        genomes_values = set(map(os.path.basename,self.genomes.values()))
+        return data_keys == genomes_values
 
     def _generate(self):
         """Generate a new sketch file."""

diff --git a/gtdbtk/main.py b/gtdbtk/main.py
@@ -592,6 +592,7 @@ def classify(self, options,all_classified_ani=False):
                      debugopt=options.debug,
                      fulltreeopt=options.full_tree,
                      skip_ani_screen=options.skip_ani_screen,
+                     genes=options.genes,
                      no_mash=options.no_mash,
                      mash_k=options.mash_k,
                      mash_v=options.mash_v,
@@ -611,6 +612,12 @@ def classify(self, options,all_classified_ani=False):
 
         self.stage_logger.steps.append(classify_step)
 
+        if options.genes:
+            self.logger.warning('The final classification predicted may be less accurate '
+                                'due to the use of amino acid files instead of nucleotide files as input to the pipeline.'
+                                ' Without nucleotides files, the ANI classification step of the workflow has been skipped and therefore'
+                                ' no ANI matches with existing species in GTDB could be reported.')
+
         self.logger.info('Note that Tk classification mode is insufficient for publication of new taxonomic '
                          'designations. New designations should be based on one or more de novo trees, an '
                          'example of which can be produced by Tk in de novo mode.')
@@ -1123,9 +1130,17 @@ def parse_options(self, options):
                             all_classified_ani = True
 
                         self.stage_logger.reset_steps(keep_steps=['ANI screen'])
+                    else:
+                        self.stage_logger.reset_steps()
                 else:
                     self.stage_logger.reset_steps()
 
+            if options.genes:
+                if not options.skip_ani_screen:
+                    self.logger.warning('The --genes flag is set to True. The ANI screening step will be skipped.')
+                    options.skip_ani_screen = True
+
+
             if not options.skip_ani_screen:
                 all_classified_ani,classified_genomes = self.ani_screen(options)