Skip to content

Commit

Permalink
Fix to coding density calculation which was off by 1 base per sequence.
Browse files Browse the repository at this point in the history
  • Loading branch information
donovan-h-parks committed May 11, 2021
1 parent 1a8d128 commit d25ba8d
Showing 1 changed file with 21 additions and 17 deletions.
38 changes: 21 additions & 17 deletions checkm/prodigal.py
Expand Up @@ -39,6 +39,7 @@ class ProdigalError(BaseException):

class ProdigalRunner():
"""Wrapper for running prodigal."""

def __init__(self, outDir):
self.logger = logging.getLogger('timestamp')

Expand All @@ -50,15 +51,15 @@ def __init__(self, outDir):
self.gffFile = os.path.join(outDir, DefaultValues.PRODIGAL_GFF)

def run(self, query, bNucORFs=True):

prodigal_input = query

# decompress gzip input files
if prodigal_input.endswith('.gz'):
tmp_dir = tempfile.mkdtemp()
prodigal_input = os.path.join(tmp_dir, os.path.basename(prodigal_input[0:-3]) + '.fna')
writeFasta(seqs, prodigal_input)

# gather statistics about query file
seqs = readFasta(prodigal_input)
totalBases = 0
Expand All @@ -79,17 +80,17 @@ def run(self, query, bNucORFs=True):
procedureStr = 'single' # estimate parameters from data

if bNucORFs:
cmd = ('prodigal -p %s -q -m -f gff -g %d -a %s -d %s -i %s > %s 2> /dev/null' % (procedureStr,
translationTable,
aaGeneFile,
ntGeneFile,
prodigal_input,
gffFile))
cmd = ('prodigal -p %s -q -m -f gff -g %d -a %s -d %s -i %s > %s 2> /dev/null' % (procedureStr,
translationTable,
aaGeneFile,
ntGeneFile,
prodigal_input,
gffFile))
else:
cmd = ('prodigal -p %s -q -m -f gff -g %d -a %s -i %s > %s 2> /dev/null' % (procedureStr,
translationTable,
aaGeneFile,
prodigal_input,
cmd = ('prodigal -p %s -q -m -f gff -g %d -a %s -i %s > %s 2> /dev/null' % (procedureStr,
translationTable,
aaGeneFile,
prodigal_input,
gffFile))

os.system(cmd)
Expand Down Expand Up @@ -129,7 +130,7 @@ def run(self, query, bNucORFs=True):
os.remove(self.gffFile + '.' + str(translationTable))
if bNucORFs:
os.remove(self.ntGeneFile + '.' + str(translationTable))

if prodigal_input.endswith('.gz'):
shutil.rmtree(tmp_dir)

Expand Down Expand Up @@ -160,6 +161,7 @@ def checkForProdigal(self):

class ProdigalFastaParser():
"""Parses prodigal FASTA output."""

def __init__(self):
pass

Expand All @@ -182,6 +184,7 @@ def genePositions(self, filename):

class ProdigalGeneFeatureParser():
"""Parses prodigal FASTA output."""

def __init__(self, filename):
checkFileExists(filename)

Expand All @@ -204,10 +207,10 @@ def __parseGFF(self, filename):
if 'transl_table' in token:
self.translationTable = int(token[token.find('=') + 1:])

if line[0] == '#' or line.strip() == '"':
if line[0] == '#' or line.strip() == '"':
# work around for Prodigal having lines with just a
# quotation on it when FASTA files have Windows style
# line endings
# line endings
continue

lineSplit = line.split('\t')
Expand All @@ -230,10 +233,11 @@ def __buildCodingBaseMask(self, seqId):
"""Build mask indicating which bases in a sequences are coding."""

# safe way to calculate coding bases as it accounts
# for the potential of overlapping genes
# for the potential of overlapping genes; indices adjusted
# to account for GFF file using 1-based indexing
codingBaseMask = np.zeros(self.lastCodingBase[seqId])
for pos in self.genes[seqId].values():
codingBaseMask[pos[0]:pos[1] + 1] = 1
codingBaseMask[pos[0]-1:pos[1]] = 1

return codingBaseMask

Expand Down

0 comments on commit d25ba8d

Please sign in to comment.