Skip to content

Commit

Permalink
Merge pull request #488 from Ecogenomics/staging
Browse files Browse the repository at this point in the history
Merge staging
  • Loading branch information
pchaumeil committed Mar 16, 2023
2 parents 4970f4a + 6cce4b1 commit ad342d3
Show file tree
Hide file tree
Showing 7 changed files with 52 additions and 16 deletions.
10 changes: 9 additions & 1 deletion docs/src/faq.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@ GTDB-Tk uses the GTDB taxonomy (`https://gtdb.ecogenomic.org/ <https://gtdb.ecog
This taxonomy is similar, but not identical to NCBI and Silva.
In many cases the GTDB taxonomy more strictly follows the nomenclatural rules for rank suffixes which is why there is Nitrospirota instead of Nitrospirae.

Can you combine the bacterial and archaeal trees into a single tree?
--------------------------------------------------------------------


The bacterial and archaeal trees are inferred from different marker genes. Currently, the correct rootings of these trees remain an open area of research.
GTDB-Tk does not provide a tool to merge the trees, but it is possible to artificially combine them by manipulating the Newick files.
One solution would be to use `DendroPy <https://dendropy.org/>`_, a Python library for phylogenetic computing.


.. _faq_pplacer:

Expand Down Expand Up @@ -92,6 +100,6 @@ From GTDB-Tk v2.0.0 the conda environment will automatically have FastANI v1.3 i

**Docker:**

From GTDB-Tk v2.2.2 the Docker cotnainer will automatically have FastANI v1.32 installed. Otherwise, manually
From GTDB-Tk v2.2.2 the Docker container will automatically have FastANI v1.32 installed. Otherwise, manually
build the container from the `Dockerfile <https://github.com/Ecogenomics/GTDBTk/blob/master/Dockerfile>`_, making
sure to specify FastANI v1.32.
2 changes: 1 addition & 1 deletion gtdbtk/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,4 @@
__status__ = 'Production'
__title__ = 'GTDB-Tk'
__url__ = 'https://github.com/Ecogenomics/GTDBTk'
__version__ = '2.2.4'
__version__ = '2.2.5'
29 changes: 17 additions & 12 deletions gtdbtk/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,7 @@ def run(self,
debugopt=False,
fulltreeopt=False,
skip_ani_screen=False,
genes=False,
no_mash=False,
mash_k=Config.MASH_K_VALUE,
mash_v=Config.MASH_V_VALUE,
Expand All @@ -351,7 +352,10 @@ def run(self,
# rest of the pipeline.
mash_classified_user_genomes = {}
if not skip_ani_screen:
if not no_mash:
if genes:
self.logger.warning('The --genes flag is set to True. The ANI screening steps will be skipped.')
skip_ani_screen = True
elif not no_mash:
# if mash_db finishes with a backslash, it should be considered a directory
if mash_db.endswith('/'):
make_sure_path_exists(mash_db)
Expand Down Expand Up @@ -511,8 +515,6 @@ def run(self,
user_msa_file = prescreened_msa_file_path




# Write the RED dictionary to disk (intermediate file).
red_dict_file.write()

Expand Down Expand Up @@ -604,10 +606,10 @@ def run(self,
disappearing_genomes_file.add_genome(disappearing_genome, tree_iter)

class_level_classification, classified_user_genomes,warning_counter = self._parse_tree(mrca_lowtree, genomes, msa_dict,
percent_multihit_dict, tln_table_summary_file.genomes,
percent_multihit_dict,genes, tln_table_summary_file.genomes,
bac_ar_diff, submsa_file_path, red_dict_file.data,
summary_file, pplacer_taxonomy_dict,warning_counter,
high_classification, debug_file,skip_ani_screen, debugopt,
high_classification, debug_file, debugopt,
tree_mapping_file, tree_iter,
tree_mapping_dict_reverse)

Expand Down Expand Up @@ -1144,9 +1146,9 @@ def _classify_red_topology(self, tree, msa_dict, percent_multihit_dict, trans_ta

return class_level_classification,warning_counter

def _parse_tree(self, tree, genomes, msa_dict, percent_multihit_dict, trans_table_dict, bac_ar_diff,
def _parse_tree(self, tree, genomes, msa_dict, percent_multihit_dict,genes, trans_table_dict, bac_ar_diff,
user_msa_file, red_dict, summary_file, pplacer_taxonomy_dict,warning_counter, high_classification,
debug_file,prescreening, debugopt, tree_mapping_file, tree_iter, tree_mapping_dict_reverse):
debug_file, debugopt, tree_mapping_file, tree_iter, tree_mapping_dict_reverse):
# Genomes can be classified by using FastANI or RED values
# We go through all leaves of the tree. if the leaf is a user
# genome we take its parent node and look at all the leaves
Expand All @@ -1156,8 +1158,10 @@ def _parse_tree(self, tree, genomes, msa_dict, percent_multihit_dict, trans_tabl
tt = TreeTraversal()

self.logger.log(Config.LOG_TASK, 'Traversing tree to determine classification method.')
fastani_verification, qury_nodes = self._get_fastani_verification(tree, self.reference_ids, tt)

if genes:
fastani_verification = {}
else:
fastani_verification, qury_nodes = self._get_fastani_verification(tree, self.reference_ids, tt)

#DEBUG: Skip FastANI step
#fastani_verification = {}
Expand All @@ -1176,14 +1180,15 @@ def _parse_tree(self, tree, genomes, msa_dict, percent_multihit_dict, trans_tabl
else:
all_fastani_dict = {}



classified_user_genomes, unclassified_user_genomes,warning_counter = self._sort_fastani_results(
fastani_verification, pplacer_taxonomy_dict, all_fastani_dict, msa_dict, percent_multihit_dict,
trans_table_dict, bac_ar_diff,warning_counter, summary_file)
#if not prescreening:
self.logger.info(f'{len(classified_user_genomes):,} genome(s) have '
if not genes:
self.logger.info(f'{len(classified_user_genomes):,} genome(s) have '
f'been classified using FastANI and pplacer.')
else:
self.logger.info('ANI classification has been skipped (--genes option used).')

if tree_mapping_file:
for genome in classified_user_genomes.keys():
Expand Down
4 changes: 3 additions & 1 deletion gtdbtk/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ def __temp_dir(group):

def __genes(group):
group.add_argument('--genes', action='store_true', default=False,
help='indicates input files contain called genes (skip gene calling)')
help='indicates input files contain called genes (skip gene calling).Warning: This flag will also skip '
'the ANI comparison steps (ani_screen and classification).')


def __genome_dir(group):
Expand Down Expand Up @@ -478,6 +479,7 @@ def get_main_parser():
__cpus(grp)
__pplacer_cpus(grp)
__scratch_dir(grp)
__genes(grp)
__full_tree(grp)
__min_af(grp)
__temp_dir(grp)
Expand Down
2 changes: 2 additions & 0 deletions gtdbtk/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
"""
try:
GENERIC_PATH = os.environ['GTDBTK_DATA_PATH']
#expand the variables in the path
GENERIC_PATH = os.path.expandvars(GENERIC_PATH)
except KeyError:
print('\n' + '=' * 80)
print(' ERROR '.center(80))
Expand Down
6 changes: 5 additions & 1 deletion gtdbtk/external/mash.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,11 @@ def _load_metadata(self):

def _is_consistent(self):
"""Returns True if the sketch was generated from the genomes."""
return set(self.data.keys()) == set(self.genomes.values())
# To check the consistency of the sketch file, compare only the file names, not the full paths:
# the mash_db can be moved to another folder (e.g. in the cloud), so the full paths may differ.
data_keys = set(map(os.path.basename,self.data.keys()))
genomes_values = set(map(os.path.basename,self.genomes.values()))
return data_keys == genomes_values

def _generate(self):
"""Generate a new sketch file."""
Expand Down
15 changes: 15 additions & 0 deletions gtdbtk/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,7 @@ def classify(self, options,all_classified_ani=False):
debugopt=options.debug,
fulltreeopt=options.full_tree,
skip_ani_screen=options.skip_ani_screen,
genes=options.genes,
no_mash=options.no_mash,
mash_k=options.mash_k,
mash_v=options.mash_v,
Expand All @@ -611,6 +612,12 @@ def classify(self, options,all_classified_ani=False):

self.stage_logger.steps.append(classify_step)

if options.genes:
self.logger.warning('The final classification predicted may be less accurate '
'due to the use of amino acid files instead of nucleotide files as input to the pipeline.'
' Without nucleotides files, the ANI classification step of the workflow has been skipped and therefore'
' no ANI matches with existing species in GTDB could be reported.')

self.logger.info('Note that Tk classification mode is insufficient for publication of new taxonomic '
'designations. New designations should be based on one or more de novo trees, an '
'example of which can be produced by Tk in de novo mode.')
Expand Down Expand Up @@ -1123,9 +1130,17 @@ def parse_options(self, options):
all_classified_ani = True

self.stage_logger.reset_steps(keep_steps=['ANI screen'])
else:
self.stage_logger.reset_steps()
else:
self.stage_logger.reset_steps()

if options.genes:
if not options.skip_ani_screen:
self.logger.warning('The --genes flag is set to True. The ANI screening step will be skipped.')
options.skip_ani_screen = True


if not options.skip_ani_screen:
all_classified_ani,classified_genomes = self.ani_screen(options)

Expand Down

0 comments on commit ad342d3

Please sign in to comment.