Skip to content

Commit

Permalink
Small bug fix for tree parsing. Pep8 format
Browse files Browse the repository at this point in the history
  • Loading branch information
pchaumeil committed Sep 14, 2018
1 parent b2eeb1d commit 313756d
Show file tree
Hide file tree
Showing 8 changed files with 207 additions and 187 deletions.
2 changes: 1 addition & 1 deletion gtdbtk/classify.py
Expand Up @@ -242,7 +242,7 @@ def run(self,
# and parent_taxon.split(";")[-1].startswith('g__'))):
while par_node is not None and not parent_taxon:
par_node = par_node.parent_node
if leaf_ref_genome == '':
if leaf_ref_genome is None:
leaf_ref_genomes = [subnd for subnd in par_node.leaf_iter(
) if subnd.taxon.label.replace("'", '')[0:3] in ['RS_', 'GB_', 'UBA']]
if len(leaf_ref_genomes) == 1:
Expand Down
4 changes: 2 additions & 2 deletions gtdbtk/markers.py
Expand Up @@ -133,8 +133,8 @@ def _report_identified_marker_genes(self, gene_dict, outdir, prefix):
sublist = each_mark.split(",")
markerid = sublist[0]

if (markerid not in marker_bac_list_original
and markerid not in marker_arc_list_original):
if (markerid not in marker_bac_list_original and
markerid not in marker_arc_list_original):
continue

if markerid in marker_bac_list_original:
Expand Down
67 changes: 36 additions & 31 deletions gtdbtk/misc.py
Expand Up @@ -16,7 +16,6 @@
###############################################################################

import os
import sys
import logging

import config.config as Config
Expand All @@ -25,60 +24,66 @@


class Misc():

def __init__(self):
"""Initialize."""
self.logger = logging.getLogger('timestamp')
self.logger = logging.getLogger('timestamp')

def trim_msa(self,untrimmed_msa,mask_type,maskid,output_file):
def trim_msa(self, untrimmed_msa, mask_type, maskid, output_file):
if maskid == 'bac' and mask_type == 'reference':
mask = os.path.join(Config.MASK_DIR,Config.MASK_BAC120)
mask = os.path.join(Config.MASK_DIR, Config.MASK_BAC120)
elif maskid == 'arc' and mask_type == 'reference':
mask = os.path.join(Config.MASK_DIR,Config.MASK_AR122)
mask = os.path.join(Config.MASK_DIR, Config.MASK_AR122)
elif mask_type == 'file':
mask = maskid
with open(mask, 'r') as f:
maskstr = f.readline()

outfwriter = open(output_file,'w')
dict_genomes = read_fasta(untrimmed_msa,False)

for k,v in dict_genomes.iteritems():
aligned_seq = ''.join([v[i] for i in xrange(0, len(maskstr)) if maskstr[i]=='1'])

outfwriter = open(output_file, 'w')
dict_genomes = read_fasta(untrimmed_msa, False)

for k, v in dict_genomes.iteritems():
aligned_seq = ''.join([v[i] for i in xrange(
0, len(maskstr)) if maskstr[i] == '1'])
fasta_outstr = ">%s\n%s\n" % (k, aligned_seq)
outfwriter.write(fasta_outstr)
outfwriter.close()
return True
def checkfile(self,file_path,file_name):
self.logger.warning("Check file {}: {}".format(file_name,file_path))

def checkfile(self, file_path, file_name):
self.logger.warning("Check file {}: {}".format(file_name, file_path))
if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
self.logger.warning("{} file..........OK".format(file_name))
else:
self.logger.warning("{} file..........missing".format(file_name))
raise Exception("GTDB-Tk installation is incomplete.")

def checkfolder(self,folder_path,folder_name):
self.logger.warning("Check folder {}: {}".format(folder_name,folder_path))

def checkfolder(self, folder_path, folder_name):
self.logger.warning(
"Check folder {}: {}".format(folder_name, folder_path))
if os.path.isdir(folder_path) and len(os.listdir(folder_path)) > 0:
self.logger.warning("{} dir..........OK".format(folder_name))
else:
self.logger.warning("{} dir..........missing".format(folder_name))
raise Exception("GTDB-Tk installation is incomplete.")

def check_install(self):
try:
self.checkfile(Config.TAXONOMY_FILE,'Taxonomy')
self.checkfile(Config.CONCAT_BAC120,'concat_bac120')
self.checkfile(Config.CONCAT_AR122,'concat_ar122')
self.checkfile(os.path.join(Config.MASK_DIR,Config.MASK_BAC120),'mask_bac120')
self.checkfile(os.path.join(Config.MASK_DIR,Config.MASK_AR122),'mask_ar122')
self.checkfile(Config.TIGRFAM_HMMS,'tirgfam_hmms')
pfam_test_file = os.path.join(Config.PFAM_HMM_DIR,'Pfam-A.hmm')
self.checkfile(pfam_test_file,'pfam_hmms')

self.checkfolder(Config.FASTANI_GENOMES,'fastani_genomes')
self.checkfolder(os.path.join(Config.PPLACER_DIR,Config.PPLACER_BAC120_REF_PKG),'pplacer_bac120')
self.checkfolder(os.path.join(Config.PPLACER_DIR,Config.PPLACER_AR122_REF_PKG),'pplacer_ar122')
self.checkfile(Config.TAXONOMY_FILE, 'Taxonomy')
self.checkfile(Config.CONCAT_BAC120, 'concat_bac120')
self.checkfile(Config.CONCAT_AR122, 'concat_ar122')
self.checkfile(os.path.join(Config.MASK_DIR,
Config.MASK_BAC120), 'mask_bac120')
self.checkfile(os.path.join(Config.MASK_DIR,
Config.MASK_AR122), 'mask_ar122')
self.checkfile(Config.TIGRFAM_HMMS, 'tirgfam_hmms')
pfam_test_file = os.path.join(Config.PFAM_HMM_DIR, 'Pfam-A.hmm')
self.checkfile(pfam_test_file, 'pfam_hmms')

self.checkfolder(Config.FASTANI_GENOMES, 'fastani_genomes')
self.checkfolder(os.path.join(Config.PPLACER_DIR,
Config.PPLACER_BAC120_REF_PKG), 'pplacer_bac120')
self.checkfolder(os.path.join(Config.PPLACER_DIR,
Config.PPLACER_AR122_REF_PKG), 'pplacer_ar122')
except Exception as e:
raise
56 changes: 33 additions & 23 deletions gtdbtk/reroot_tree.py
Expand Up @@ -42,10 +42,10 @@ def root_with_outgroup(self, input_tree, output_tree, outgroup):
Labels of taxa in outgroup.
"""

tree = dendropy.Tree.get_from_path(input_tree,
schema='newick',
rooting='force-rooted',
preserve_underscores=True)
tree = dendropy.Tree.get_from_path(input_tree,
schema='newick',
rooting='force-rooted',
preserve_underscores=True)

outgroup = set(outgroup)
outgroup_in_tree = set()
Expand All @@ -56,57 +56,65 @@ def root_with_outgroup(self, input_tree, output_tree, outgroup):
else:
ingroup_leaves.add(n)

self.logger.info('Identified %d outgroup taxa in the tree.' % len(outgroup_in_tree))
self.logger.info('Identified %d ingroup taxa in the tree.' % len(ingroup_leaves))
self.logger.info('Identified %d outgroup taxa in the tree.' %
len(outgroup_in_tree))
self.logger.info('Identified %d ingroup taxa in the tree.' %
len(ingroup_leaves))

if len(outgroup_in_tree) == 0:
self.logger.warning('No outgroup taxa identified in the tree.')
self.logger.warning('Tree was not rerooted.')
sys.exit(0)

# Since finding the MRCA is a rooted tree operation,
# the tree is first rerooted on an ingroup taxa. This
# ensures the MRCA of the outgroup can be identified
# so long as the outgroup is monophyletic. If the
# outgroup is polyphyletic trying to root on it
# so long as the outgroup is monophyletic. If the
# outgroup is polyphyletic trying to root on it
# is ill defined. To try and pick a "good" root for
# polyphyletic outgroups, random ingroup taxa are
# polyphyletic outgroups, random ingroup taxa are
# selected until two of them give the same size
# lineage. This will, likely, be the smallest
# lineage. This will, likely, be the smallest
# bipartition possible for the given outgroup though
# this is not guaranteed.
mrca = tree.mrca(taxa=outgroup_in_tree)
mrca_leaves = len(mrca.leaf_nodes())
while True:
rnd_ingroup = random.sample(ingroup_leaves, 1)[0]
tree.reroot_at_edge(rnd_ingroup.edge,
length1=0.5 * rnd_ingroup.edge_length,
length2=0.5 * rnd_ingroup.edge_length)
length1=0.5 * rnd_ingroup.edge_length,
length2=0.5 * rnd_ingroup.edge_length)

mrca = tree.mrca(taxa=outgroup_in_tree)
if len(mrca.leaf_nodes()) == mrca_leaves:
break

mrca_leaves = len(mrca.leaf_nodes())

if len(mrca.leaf_nodes()) != len(outgroup_in_tree):
self.logger.info('Outgroup is not monophyletic. Tree will be rerooted at the MRCA of the outgroup.')
self.logger.info('The outgroup consisted of %d taxa, while the MRCA has %d leaf nodes.' % (len(outgroup_in_tree), len(mrca.leaf_nodes())))
self.logger.info(
'Outgroup is not monophyletic. Tree will be rerooted at the MRCA of the outgroup.')
self.logger.info('The outgroup consisted of %d taxa, while the MRCA has %d leaf nodes.' % (
len(outgroup_in_tree), len(mrca.leaf_nodes())))
if len(mrca.leaf_nodes()) == len(tree.leaf_nodes()):
self.logger.warning('The MRCA spans all taxa in the tree.')
self.logger.warning('This indicating the selected outgroup is likely polyphyletic in the current tree.')
self.logger.warning('Polyphyletic outgroups are not suitable for rooting. Try another outgroup.')
self.logger.warning(
'This indicating the selected outgroup is likely polyphyletic in the current tree.')
self.logger.warning(
'Polyphyletic outgroups are not suitable for rooting. Try another outgroup.')
else:
self.logger.info('Outgroup is monophyletic.')

if mrca.edge_length is None:
self.logger.info('Tree appears to already be rooted on this outgroup.')
self.logger.info(
'Tree appears to already be rooted on this outgroup.')
else:
self.logger.info('Rerooting tree.')
tree.reroot_at_edge(mrca.edge,
length1=0.5 * mrca.edge_length,
length2=0.5 * mrca.edge_length)
tree.write_to_path(output_tree, schema='newick', suppress_rooting=True, unquoted_underscores=True)
tree.write_to_path(output_tree, schema='newick',
suppress_rooting=True, unquoted_underscores=True)
self.logger.info('Rerooted tree written to: %s' % output_tree)

def midpoint(self, input_tree, output_tree):
Expand All @@ -120,6 +128,8 @@ def midpoint(self, input_tree, output_tree):
Name of file for rerooted tree.
"""

tree = dendropy.Tree.get_from_path(input_tree, schema='newick', rooting='force-rooted', preserve_underscores=True)
tree = dendropy.Tree.get_from_path(
input_tree, schema='newick', rooting='force-rooted', preserve_underscores=True)
tree.reroot_at_midpoint()
tree.write_to_path(output_tree, schema='newick', suppress_rooting=True, unquoted_underscores=True)
tree.write_to_path(output_tree, schema='newick',
suppress_rooting=True, unquoted_underscores=True)
13 changes: 8 additions & 5 deletions gtdbtk/tools.py
Expand Up @@ -15,9 +15,9 @@

def add_ncbi_prefix(refname):
if refname.startswith("GCF_"):
return "RS_"+refname
return "RS_" + refname
elif refname.startswith("GCA_"):
return "GB_"+refname
return "GB_" + refname
else:
return refname

Expand All @@ -27,7 +27,8 @@ def splitchunks(d, n):
it = iter(d)
for _ in xrange(0, len(d), chunksize):
yield {k: d[k] for k in islice(it, chunksize)}



def splitchunks_list(l, n):
"""Yield successive n-sized chunks from l."""
chunksize = int(math.ceil(len(l) / float(n)))
Expand Down Expand Up @@ -60,12 +61,14 @@ def list_genomes_dir(userdir):
if not os.path.exists(userdir):
raise ValueError('{0} does not exist.'.format(userdir))
else:
onlygenomefiles = {f: os.path.join(userdir, f) for f in os.listdir(userdir) if os.path.isfile(os.path.join(userdir, f))}
onlygenomefiles = {f: os.path.join(userdir, f) for f in os.listdir(
userdir) if os.path.isfile(os.path.join(userdir, f))}
for potential_file in onlygenomefiles:
try:
read_fasta(os.path.join(userdir, potential_file))
except:
raise IOError("{0} is not a fasta file." .format(os.path.join(userdir, potential_file)))
raise IOError("{0} is not a fasta file." .format(
os.path.join(userdir, potential_file)))
return onlygenomefiles


Expand Down
1 change: 1 addition & 0 deletions scripts/remove_user_genomes.py
Expand Up @@ -46,6 +46,7 @@ def removeGenomes(self, inf, outf):
output_file.write(line)
output_file.close()


if __name__ == "__main__":
print __prog_name__ + ' v' + __version__ + ': ' + __prog_desc__
print ' by ' + __author__ + ' (' + __email__ + ')' + '\n'
Expand Down

0 comments on commit 313756d

Please sign in to comment.