Skip to content

Commit

Permalink
Merge branch 'development' of https://github.com/glennhickey/hal into…
Browse files Browse the repository at this point in the history
… development
  • Loading branch information
joelarmstrong committed Jan 17, 2014
2 parents d3b17d1 + 4632ce1 commit fb68c23
Show file tree
Hide file tree
Showing 3 changed files with 639 additions and 17 deletions.
72 changes: 57 additions & 15 deletions alignability/halAlignability.cpp
Expand Up @@ -14,7 +14,7 @@ using namespace std;
using namespace hal;

/** This is a tool that counts the number of other genomes each base in
* a query region is aligned to. Duplications are considered
* a query region is aligned to.
*
* Coordinates are always genome-relative by default (as opposed to
* sequence-relative). The one exception is all methods within the
Expand All @@ -39,14 +39,16 @@ using namespace hal;
* the output stream. */
static void printSequence(ostream& outStream, const Sequence* sequence,
const set<const Genome*>& targetSet,
hal_size_t start, hal_size_t length, hal_size_t step);
hal_size_t start, hal_size_t length, hal_size_t step,
bool countDupes, bool noAncestors);

/** If given genome-relative coordinates, map them to a series of
* sequence subranges */
static void printGenome(ostream& outStream,
const Genome* genome, const Sequence* sequence,
const set<const Genome*>& targetSet,
hal_size_t start, hal_size_t length, hal_size_t step);
hal_size_t start, hal_size_t length, hal_size_t step,
bool countDupes, bool noAncestors);

static const hal_size_t StringBufferSize = 1024;

Expand Down Expand Up @@ -80,7 +82,19 @@ static CLParserPtr initParser()
"(others are excluded) (vist all if empty)",
"\"\"");
optionsParser->addOption("step", "step size", 1);
optionsParser->setDescription("Make alignability wiggle plot for a genome.");
optionsParser->addOptionFlag("countDupes",
"count each other *position* each base aligns "
"to, rather than the number of unique genomes, "
"including paralogies so a genome can be "
"counted multiple times. This will give the "
"height of the MAF column created with hal2maf.",
false);
optionsParser->addOptionFlag("noAncestors",
"do not count ancestral genomes.", false);
optionsParser->setDescription("Make alignability wiggle plot for a genome. "
"By default, this is a count of the number of "
"other unique genomes each base aligns to, "
"including ancestral genomes.");
return optionsParser;
}

Expand All @@ -97,6 +111,8 @@ int main(int argc, char** argv)
hal_size_t start;
hal_size_t length;
hal_size_t step;
bool countDupes;
bool noAncestors;
try
{
optionsParser->parseOptions(argc, argv);
Expand All @@ -109,6 +125,8 @@ int main(int argc, char** argv)
rootGenomeName = optionsParser->getOption<string>("rootGenome");
targetGenomes = optionsParser->getOption<string>("targetGenomes");
step = optionsParser->getOption<hal_size_t>("step");
countDupes = optionsParser->getFlag("countDupes");
noAncestors = optionsParser->getFlag("noAncestors");

if (rootGenomeName != "\"\"" && targetGenomes != "\"\"")
{
Expand Down Expand Up @@ -202,6 +220,13 @@ int main(int argc, char** argv)
}
}

if (refGenome->getNumChildren() != 0 && noAncestors == true)
{
throw hal_exception(string("--noAncestors cannot be used when reference "
"genome (") + refGenome->getName() +
string(") is ancetral"));
}

ofstream ofile;
ostream& outStream = wigPath == "stdout" ? cout : ofile;
if (wigPath != "stdout")
Expand All @@ -215,7 +240,7 @@ int main(int argc, char** argv)
}

printGenome(outStream, refGenome, refSequence, targetSet, start, length,
step);
step, countDupes, noAncestors);

}
catch(hal_exception& e)
Expand All @@ -237,7 +262,8 @@ int main(int argc, char** argv)
* in the target set */
void printSequence(ostream& outStream, const Sequence* sequence,
const set<const Genome*>& targetSet,
hal_size_t start, hal_size_t length, hal_size_t step)
hal_size_t start, hal_size_t length, hal_size_t step,
bool countDupes, bool noAncestors)
{
hal_size_t seqLen = sequence->getSequenceLength();
if (seqLen == 0)
Expand Down Expand Up @@ -275,7 +301,8 @@ void printSequence(ostream& outStream, const Sequence* sequence,
ColumnIteratorConstPtr colIt = sequence->getColumnIterator(&targetSet,
0, pos,
last - 1,
true);
false,
noAncestors);
// note wig coordinates are 1-based for some reason so we shift to right
outStream << "fixedStep chrom=" << sequenceName << " start=" << start + 1
<< " step=" << step << "\n";
Expand All @@ -285,8 +312,11 @@ void printSequence(ostream& outStream, const Sequence* sequence,
// convert to genome coordinates
pos += sequence->getStartPosition();
last += sequence->getStartPosition();
// keep track of unique genomes
set<const Genome*> genomeSet;
while (pos <= last)
{
genomeSet.clear();
hal_size_t count = 0;
/** ColumnIterator::ColumnMap maps a Sequence to a list of bases
* the bases in the map form the alignment column. Some sequences
Expand All @@ -297,14 +327,23 @@ void printSequence(ostream& outStream, const Sequence* sequence,
for (ColumnIterator::ColumnMap::const_iterator i = cmap->begin();
i != cmap->end(); ++i)
{
/** If the sequence belongs to the reference genome, we don't
* count it. So a homology in the same genome is not considered
* here. But paralgous bases in other genomes will each be counted */
if (i->first->getGenome() != genome && !i->second->empty())
if (countDupes == true)
{
++count;
// countDupes enabled: we just count everything
count += i->second->size();
}
else if (!i->second->empty())
{
// just counting unique genomes: add it if there's at least one base
genomeSet.insert(i->first->getGenome());
}
}
if (countDupes == false)
{
count = genomeSet.size();
}
// don't want to include reference base in output
--count;

outStream << count << '\n';

Expand Down Expand Up @@ -352,11 +391,13 @@ void printSequence(ostream& outStream, const Sequence* sequence,
void printGenome(ostream& outStream,
const Genome* genome, const Sequence* sequence,
const set<const Genome*>& targetSet,
hal_size_t start, hal_size_t length, hal_size_t step)
hal_size_t start, hal_size_t length, hal_size_t step,
bool countDupes, bool noAncestors)
{
if (sequence != NULL)
{
printSequence(outStream, sequence, targetSet, start, length, step);
printSequence(outStream, sequence, targetSet, start, length, step, countDupes,
noAncestors);
}
else
{
Expand Down Expand Up @@ -389,7 +430,8 @@ void printGenome(ostream& outStream,
hal_size_t readStart = seqStart >= start ? 0 : start - seqStart;
hal_size_t readLen = min(seqLen - readStart, length);
readLen = min(readLen, length - runningLength);
printSequence(outStream, sequence, targetSet, readStart, readLen, step);
printSequence(outStream, sequence, targetSet, readStart, readLen, step,
countDupes, noAncestors);
runningLength += readLen;
}
}
Expand Down

0 comments on commit fb68c23

Please sign in to comment.