diff --git a/alphabet.h b/alphabet.h index 4db1b73..2b143d2 100644 --- a/alphabet.h +++ b/alphabet.h @@ -207,6 +207,23 @@ static inline float entropyDna5(const String& read) { return ent; } +/** + * Return the DNA complement of the given ASCII char. + */ +static inline char comp(char c) { + switch(c) { + case 'a': return 't'; + case 'A': return 'T'; + case 'c': return 'g'; + case 'C': return 'G'; + case 'g': return 'c'; + case 'G': return 'C'; + case 't': return 'a'; + case 'T': return 'A'; + default: return c; + } +} + extern uint8_t dna4Cat[]; extern uint8_t charToDna5[]; extern uint8_t rcCharToDna5[]; diff --git a/ebwt_search_backtrack.h b/ebwt_search_backtrack.h index 038a67f..8dd79db 100644 --- a/ebwt_search_backtrack.h +++ b/ebwt_search_backtrack.h @@ -2424,7 +2424,7 @@ class EbwtRangeSource : public RangeSource { for(size_t i = 0; i < br->edits_.size(); i++) { Edit e = br->edits_.get(i); cout << (curEbwt_->fw() ? (qlen_ - e.pos - 1) : e.pos) - << "ACGT"[e.chr]; + << (char)e.chr; if(i < br->edits_.size()-1) cout << " "; } cout << endl; @@ -2653,7 +2653,7 @@ class EbwtRangeSource : public RangeSource { curRange_.refcs.clear(); for(size_t i = 0; i < nedits; i++) { curRange_.mms.push_back(qlen_ - br->edits_.get(i).pos - 1); - curRange_.refcs.push_back("ACGTN"[br->edits_.get(i).chr]); + curRange_.refcs.push_back((char)br->edits_.get(i).chr); } addPartialEdits(); curRange_.ebwt = ebwt_; diff --git a/edit.cpp b/edit.cpp index a4e8416..6bb4b17 100644 --- a/edit.cpp +++ b/edit.cpp @@ -11,6 +11,6 @@ using namespace std; ostream& operator<< (ostream& os, const Edit& e) { - os << e.pos << "ACGT"[e.chr]; + os << e.pos << (char)e.chr; return os; } diff --git a/edit.h b/edit.h index 94625ab..ea7c269 100644 --- a/edit.h +++ b/edit.h @@ -18,7 +18,7 @@ * reference, deletion in the reference. */ enum { - EDIT_TYPE_MM = 0, + EDIT_TYPE_MM = 1, EDIT_TYPE_SNP, EDIT_TYPE_INS, EDIT_TYPE_DEL @@ -33,21 +33,21 @@ struct Edit { Edit() : pos(1023) { } Edit(int po, int ch, int ty = EDIT_TYPE_MM) : - type(ty), pos(po), chr(ch) { } + chr(ch), qchr(0), type(ty), pos(po) { } /** * Write Edit to an OutFileBuf. */ void serialize(OutFileBuf& fb) const { - assert_eq(2, sizeof(Edit)); - fb.writeChars((const char*)this, 2); + assert_eq(4, sizeof(Edit)); + fb.writeChars((const char*)this, 4); } /** * Read Edit from a FileBuf. */ void deserialize(FileBuf& fb) { - fb.get((char*)this, 2); + fb.get((char*)this, 4); } /** @@ -56,7 +56,9 @@ struct Edit { int operator< (const Edit &rhs) const { if(pos < rhs.pos) return 1; if(pos > rhs.pos) return 0; - return (chr < rhs.chr)? 1 : 0; + if(chr < rhs.chr) return 1; + if(chr > rhs.chr) return 0; + return (qchr < rhs.qchr)? 1 : 0; } /** @@ -65,6 +67,7 @@ struct Edit { int operator== (const Edit &rhs) const { return(pos == rhs.pos && chr == rhs.chr && + qchr == rhs.qchr && type == rhs.type); } @@ -75,10 +78,11 @@ struct Edit { return pos != 1023; } - uint16_t type : 2; // 1 -> subst, 2 -> ins, 3 -> del, 0 -> empty - uint16_t pos : 10; // position w/r/t search root - uint16_t chr : 2; // character involved (for subst and ins) - uint16_t reserved : 2; // reserved + uint32_t chr : 8; // reference character involved (for subst and ins) + uint32_t qchr : 8; // read character involved (for subst and del + uint32_t type : 4; // 1 -> mm, 2 -> SNP, 3 -> ins, 4 -> del + uint32_t pos : 10; // position w/r/t search root + uint32_t reserved : 2; // padding friend std::ostream& operator<< (std::ostream& os, const Edit& e); }; diff --git a/hit.h b/hit.h index 496ac6b..7f1cb2e 100644 --- a/hit.h +++ b/hit.h @@ -129,7 +129,7 @@ class Hit { hse.expand(); hse.back().type = EDIT_TYPE_MM; hse.back().pos = i; - hse.back().chr = charToDna5[(int)refcs[i]]; + hse.back().chr = refcs[i]; } } } @@ -183,7 +183,7 @@ class Hit { for(size_t j = 0; j < hs[i].size(); j++) { if(hs[i][j].type != EDIT_TYPE_MM) continue; hits[i].mms.set(hs[i][j].pos); - hits[i].refcs[hs[i][j].pos] = "ACGT"[hs[i][j].chr]; + hits[i].refcs[hs[i][j].pos] = hs[i][j].chr; } } } diff --git a/hit_set.cpp b/hit_set.cpp index eb269f8..054f73a 100644 --- a/hit_set.cpp +++ b/hit_set.cpp @@ -43,8 +43,15 @@ void HitSet::reportUpTo(ostream& os, int khits) { const Edit& e = h.edits[i]; os << e.pos; if(e.type == EDIT_TYPE_SNP) os << "S"; - os << ":" << "ACGT"[e.chr] << ">" << seq[e.pos]; - if(i < h.edits.size()-1) os << ","; + os << ":" << (char)e.chr << ">" << (e.qchr != 0 ? (char)e.qchr : (char)seq[e.pos]); + if(i < h.edits.size()-1 || !h.cedits.empty()) os << ","; + } + for(size_t i = 0; i < h.cedits.size(); i++) { + const Edit& e = h.cedits[i]; + os << e.pos; + if(e.type == EDIT_TYPE_SNP) os << "S"; + os << ":" << (char)e.chr << ">" << (e.qchr != 0 ? (char)e.qchr : (char)seq[e.pos]); + if(i < h.cedits.size()-1) os << ","; } os << endl; } diff --git a/hit_set.h b/hit_set.h index 7c8e2cd..b833c1b 100644 --- a/hit_set.h +++ b/hit_set.h @@ -47,6 +47,11 @@ struct HitSetEnt { for(it = edits.begin(); it != edits.end(); it++) { it->serialize(fb); } + sz = cedits.size(); + fb.writeChars((const char*)&sz, 4); + for(it = cedits.begin(); it != cedits.end(); it++) { + it->serialize(fb); + } } /** @@ -70,6 +75,12 @@ struct HitSetEnt { for(uint32_t i = 0; i < sz; i++) { edits[i].deserialize(fb); } + fb.get((char*)&sz, 4); + assert_lt(sz, 1024); + cedits.resize(sz); + for(uint32_t i = 0; i < sz; i++) { + cedits[i].deserialize(fb); + } } /** @@ -127,6 +138,34 @@ struct HitSetEnt { } /** + * Another way to get at an edit. + */ + Edit& editAt(unsigned i) { + return edits[i]; + } + + /** + * Another way to get at a const edit. + */ + const Edit& editAt(unsigned i) const { + return edits[i]; + } + + /** + * Get the ith color edit. + */ + Edit& colorEditAt(unsigned i) { + return cedits[i]; + } + + /** + * Another way to get at an edit. + */ + const Edit& colorEditAt(unsigned i) const { + return cedits[i]; + } + + /** * Return the front entry. */ Edit& front() { @@ -176,6 +215,7 @@ struct HitSetEnt { uint16_t cost; // cost, including stratum uint32_t oms; // # others std::vector edits; // edits to get from reference to subject + std::vector cedits; // color edits to get from reference to subject }; /** @@ -200,12 +240,14 @@ struct HitSet { * Write binary representation of HitSet to an OutFileBuf. */ void serialize(OutFileBuf& fb) const { + fb.write(color ? 1 : 0); uint32_t i = seqan::length(name); assert_gt(i, 0); fb.writeChars((const char*)&i, 4); fb.writeChars(seqan::begin(name), i); i = seqan::length(seq); assert_gt(i, 0); + assert_lt(i, 1024); fb.writeChars((const char*)&i, 4); for(size_t j = 0; j < i; j++) { fb.write("ACGTN"[(int)seq[j]]); @@ -224,6 +266,7 @@ struct HitSet { * Repopulate a HitSet from its binary representation in FileBuf. */ void deserialize(FileBuf& fb) { + color = (fb.get() != 0 ? true : false); uint32_t sz = 0; if(fb.get((char*)&sz, 4) != 4) { seqan::clear(name); @@ -389,6 +432,7 @@ struct HitSet { seqan::clear(seq); seqan::clear(qual); ents.clear(); + color = false; } /** @@ -433,6 +477,7 @@ struct HitSet { seqan::String qual; int8_t maxedStratum; std::vector ents; + bool color; // whether read was orginally in colorspace }; #endif /* HIT_SET_H_ */ diff --git a/range_source.h b/range_source.h index 6a45498..7516820 100644 --- a/range_source.h +++ b/range_source.h @@ -90,20 +90,20 @@ struct EditList { /** * Get most recently added Edit. - */ - const Edit& top() const { - assert_gt(size(), 0); - return get(size()-1); - } - - /** + */ + const Edit& top() const { + assert_gt(size(), 0); + return get(size()-1); + } + + /** * Return true iff no Edits have been added. - */ - bool empty() const { - return size() == 0; - } - - /** + */ + bool empty() const { + return size() == 0; + } + + /** * Set a particular element of the EditList. */ void set(size_t i, const Edit& e) { @@ -161,137 +161,137 @@ struct EditList { union ElimsAndQual { /** - * Assuming qualA/C/G/T are already set, set quallo and quallo2 - * accordingly. - */ - void updateLo() { - flags.quallo = 127; - flags.quallo2 = 127; - if(!flags.mmA) { - if(flags.qualA < flags.quallo) { - flags.quallo2 = flags.quallo; - flags.quallo = flags.qualA; - } else if(flags.qualA == flags.quallo) { - flags.quallo2 = flags.quallo; - } else if(flags.qualA < flags.quallo2) { - flags.quallo2 = flags.qualA; - } - } - if(!flags.mmC) { - if(flags.qualC < flags.quallo) { - flags.quallo2 = flags.quallo; - flags.quallo = flags.qualC; - } else if(flags.qualC == flags.quallo) { - flags.quallo2 = flags.quallo; - } else if(flags.qualC < flags.quallo2) { - flags.quallo2 = flags.qualC; - } - } - if(!flags.mmG) { - if(flags.qualG < flags.quallo) { - flags.quallo2 = flags.quallo; - flags.quallo = flags.qualG; - } else if(flags.qualG == flags.quallo) { - flags.quallo2 = flags.quallo; - } else if(flags.qualG < flags.quallo2) { - flags.quallo2 = flags.qualG; - } - } - if(!flags.mmT) { - if(flags.qualT < flags.quallo) { - flags.quallo2 = flags.quallo; - flags.quallo = flags.qualT; - } else if(flags.qualT == flags.quallo) { - flags.quallo2 = flags.quallo; - } else if(flags.qualT < flags.quallo2) { - flags.quallo2 = flags.qualT; - } - } - assert(repOk()); - } - - /** + * Assuming qualA/C/G/T are already set, set quallo and quallo2 + * accordingly. + */ + void updateLo() { + flags.quallo = 127; + flags.quallo2 = 127; + if(!flags.mmA) { + if(flags.qualA < flags.quallo) { + flags.quallo2 = flags.quallo; + flags.quallo = flags.qualA; + } else if(flags.qualA == flags.quallo) { + flags.quallo2 = flags.quallo; + } else if(flags.qualA < flags.quallo2) { + flags.quallo2 = flags.qualA; + } + } + if(!flags.mmC) { + if(flags.qualC < flags.quallo) { + flags.quallo2 = flags.quallo; + flags.quallo = flags.qualC; + } else if(flags.qualC == flags.quallo) { + flags.quallo2 = flags.quallo; + } else if(flags.qualC < flags.quallo2) { + flags.quallo2 = flags.qualC; + } + } + if(!flags.mmG) { + if(flags.qualG < flags.quallo) { + flags.quallo2 = flags.quallo; + flags.quallo = flags.qualG; + } else if(flags.qualG == flags.quallo) { + flags.quallo2 = flags.quallo; + } else if(flags.qualG < flags.quallo2) { + flags.quallo2 = flags.qualG; + } + } + if(!flags.mmT) { + if(flags.qualT < flags.quallo) { + flags.quallo2 = flags.quallo; + flags.quallo = flags.qualT; + } else if(flags.qualT == flags.quallo) { + flags.quallo2 = flags.quallo; + } else if(flags.qualT < flags.quallo2) { + flags.quallo2 = flags.qualT; + } + } + assert(repOk()); + } + + /** * Set all the non-qual bits of the flags field to 1, indicating * that all outgoing paths are eliminated. */ inline void eliminate() { - join.elims = ((1 << 13) - 1); - } - - bool repOk() const { - uint8_t lo = 127; - uint8_t lo2 = 127; - if(!flags.mmA) { - if(flags.qualA < lo) { - lo2 = lo; - lo = flags.qualA; - } else if(flags.qualA == lo || flags.qualA < lo2) { - lo2 = flags.qualA; - } - } - if(!flags.mmC) { - if(flags.qualC < lo) { - lo2 = lo; - lo = flags.qualC; - } else if(flags.qualC == lo || flags.qualC < lo2) { - lo2 = flags.qualC; - } - } - if(!flags.mmG) { - if(flags.qualG < lo) { - lo2 = lo; - lo = flags.qualG; - } else if(flags.qualG == lo || flags.qualG < lo2) { - lo2 = flags.qualG; - } - } - if(!flags.mmT) { - if(flags.qualT < lo) { - lo2 = lo; - lo = flags.qualT; - } else if(flags.qualT == lo || flags.qualT < lo2) { - lo2 = flags.qualT; - } - } - assert_eq((int)lo, (int)flags.quallo); - assert_eq((int)lo2, (int)flags.quallo2); - return true; + join.elims = ((1 << 13) - 1); + } + + bool repOk() const { + uint8_t lo = 127; + uint8_t lo2 = 127; + if(!flags.mmA) { + if(flags.qualA < lo) { + lo2 = lo; + lo = flags.qualA; + } else if(flags.qualA == lo || flags.qualA < lo2) { + lo2 = flags.qualA; + } + } + if(!flags.mmC) { + if(flags.qualC < lo) { + lo2 = lo; + lo = flags.qualC; + } else if(flags.qualC == lo || flags.qualC < lo2) { + lo2 = flags.qualC; + } + } + if(!flags.mmG) { + if(flags.qualG < lo) { + lo2 = lo; + lo = flags.qualG; + } else if(flags.qualG == lo || flags.qualG < lo2) { + lo2 = flags.qualG; + } + } + if(!flags.mmT) { + if(flags.qualT < lo) { + lo2 = lo; + lo = flags.qualT; + } else if(flags.qualT == lo || flags.qualT < lo2) { + lo2 = flags.qualT; + } + } + assert_eq((int)lo, (int)flags.quallo); + assert_eq((int)lo2, (int)flags.quallo2); + return true; } struct { - uint64_t mmA : 1; // A in ref aligns to non-A char in read - uint64_t mmC : 1; // C in ref aligns to non-C char in read - uint64_t mmG : 1; // G in ref aligns to non-G char in read - uint64_t mmT : 1; // T in ref aligns to non-T char in read - uint64_t snpA : 1; // Same as mmA, but we think it's a SNP and not a miscall - uint64_t snpC : 1; // Same as mmC, but we think it's a SNP and not a miscall - uint64_t snpG : 1; // Same as mmG, but we think it's a SNP and not a miscall - uint64_t snpT : 1; // Same as mmT, but we think it's a SNP and not a miscall - uint64_t insA : 1; // A insertion in reference w/r/t read - uint64_t insC : 1; // C insertion in reference w/r/t read - uint64_t insG : 1; // G insertion in reference w/r/t read - uint64_t insT : 1; // T insertion in reference w/r/t read - uint64_t del : 1; // deletion of read character - uint64_t qualA : 7; // quality penalty for picking A at this position - uint64_t qualC : 7; // quality penalty for picking C at this position - uint64_t qualG : 7; // quality penalty for picking G at this position - uint64_t qualT : 7; // quality penalty for picking T at this position - uint64_t quallo : 7; // lowest quality penalty at this position - uint64_t quallo2 : 7; // 2nd-lowest quality penalty at this position - uint64_t reserved : 9; + uint64_t mmA : 1; // A in ref aligns to non-A char in read + uint64_t mmC : 1; // C in ref aligns to non-C char in read + uint64_t mmG : 1; // G in ref aligns to non-G char in read + uint64_t mmT : 1; // T in ref aligns to non-T char in read + uint64_t snpA : 1; // Same as mmA, but we think it's a SNP and not a miscall + uint64_t snpC : 1; // Same as mmC, but we think it's a SNP and not a miscall + uint64_t snpG : 1; // Same as mmG, but we think it's a SNP and not a miscall + uint64_t snpT : 1; // Same as mmT, but we think it's a SNP and not a miscall + uint64_t insA : 1; // A insertion in reference w/r/t read + uint64_t insC : 1; // C insertion in reference w/r/t read + uint64_t insG : 1; // G insertion in reference w/r/t read + uint64_t insT : 1; // T insertion in reference w/r/t read + uint64_t del : 1; // deletion of read character + uint64_t qualA : 7; // quality penalty for picking A at this position + uint64_t qualC : 7; // quality penalty for picking C at this position + uint64_t qualG : 7; // quality penalty for picking G at this position + uint64_t qualT : 7; // quality penalty for picking T at this position + uint64_t quallo : 7; // lowest quality penalty at this position + uint64_t quallo2 : 7; // 2nd-lowest quality penalty at this position + uint64_t reserved : 9; } flags; struct { - uint64_t elims : 13; // all of the edit-elim flags bundled together - uint64_t quals : 42; // quality of positions - uint64_t reserved : 9; + uint64_t elims : 13; // all of the edit-elim flags bundled together + uint64_t quals : 42; // quality of positions + uint64_t reserved : 9; } join; struct { - uint64_t mmElims : 4; // substitution flags bundled together - uint64_t snpElims : 4; // substitution flags bundled together - uint64_t insElims : 4; // inserts-in-reference flags bundled together - uint64_t delElims : 1; // deletion of read character - uint64_t quals : 42; // quality of positions - uint64_t reserved : 9; + uint64_t mmElims : 4; // substitution flags bundled together + uint64_t snpElims : 4; // substitution flags bundled together + uint64_t insElims : 4; // inserts-in-reference flags bundled together + uint64_t delElims : 1; // deletion of read character + uint64_t quals : 42; // quality of positions + uint64_t reserved : 9; } join2; }; @@ -305,161 +305,167 @@ struct RangeState { /** * Using randomness when picking from among multiple choices, pick - * an edit to make. TODO: Only knows how to pick mismatches for - * now. + * an edit to make. TODO: Only knows how to pick mismatches for + * now. */ - Edit pickEdit(int pos, RandomSource& rand, bool fuzzy, - uint32_t& top, uint32_t& bot, bool& last) + Edit pickEdit(int pos, RandomSource& rand, bool fuzzy, + uint32_t& top, uint32_t& bot, bool& last) { + bool color = false; Edit ret; ret.type = EDIT_TYPE_MM; ret.pos = pos; ret.chr = 0; + ret.qchr = 0; ret.reserved = 0; assert(!eliminated_); - assert(!fuzzy || eq.repOk()); + assert(!fuzzy || eq.repOk()); assert(!eq.flags.mmA || !eq.flags.mmC || !eq.flags.mmG || !eq.flags.mmT); int num = !eq.flags.mmA + !eq.flags.mmC + !eq.flags.mmG + !eq.flags.mmT; assert_leq(num, 4); assert_gt(num, 0); - uint8_t lo2 = eq.flags.quallo2; - if(num == 2) eq.flags.quallo2 = 127; - // Only need to pick randomly if there's a quality tie - if(num > 1 && (!fuzzy || eq.flags.quallo == lo2)) { + uint8_t lo2 = eq.flags.quallo2; + if(num == 2) eq.flags.quallo2 = 127; + // Only need to pick randomly if there's a quality tie + if(num > 1 && (!fuzzy || eq.flags.quallo == lo2)) { last = false; // not the last at this pos // Sum up range sizes and do a random weighted pick uint32_t tot = 0; - bool candA = !eq.flags.mmA; - bool candC = !eq.flags.mmC; - bool candG = !eq.flags.mmG; - bool candT = !eq.flags.mmT; - if(fuzzy) { - candA = (candA && eq.flags.qualA == eq.flags.quallo); - candC = (candC && eq.flags.qualC == eq.flags.quallo); - candG = (candG && eq.flags.qualG == eq.flags.quallo); - candT = (candT && eq.flags.qualT == eq.flags.quallo); - } - ASSERT_ONLY(int origNum = num); - if(candA) { + bool candA = !eq.flags.mmA; + bool candC = !eq.flags.mmC; + bool candG = !eq.flags.mmG; + bool candT = !eq.flags.mmT; + if(fuzzy) { + candA = (candA && eq.flags.qualA == eq.flags.quallo); + candC = (candC && eq.flags.qualC == eq.flags.quallo); + candG = (candG && eq.flags.qualG == eq.flags.quallo); + candT = (candT && eq.flags.qualT == eq.flags.quallo); + } + ASSERT_ONLY(int origNum = num); + if(candA) { assert_gt(bots[0], tops[0]); tot += (bots[0] - tops[0]); assert_gt(num, 0); ASSERT_ONLY(num--); } - if(candC) { + if(candC) { assert_gt(bots[1], tops[1]); tot += (bots[1] - tops[1]); assert_gt(num, 0); ASSERT_ONLY(num--); } - if(candG) { + if(candG) { assert_gt(bots[2], tops[2]); tot += (bots[2] - tops[2]); assert_gt(num, 0); ASSERT_ONLY(num--); } - if(candT) { + if(candT) { assert_gt(bots[3], tops[3]); tot += (bots[3] - tops[3]); assert_gt(num, 0); ASSERT_ONLY(num--); } - assert_geq(num, 0); - assert_lt(num, origNum); + assert_geq(num, 0); + assert_lt(num, origNum); // Throw a dart randomly that hits one of the possible // substitutions, with likelihoods weighted by range size uint32_t dart = rand.nextU32() % tot; - if(candA) { + if(candA) { if(dart < (bots[0] - tops[0])) { // Eliminate A mismatch top = tops[0]; bot = bots[0]; eq.flags.mmA = 1; assert_lt(eq.join2.mmElims, 15); - ret.chr = 0; - if(fuzzy) eq.updateLo(); + ret.chr = color ? '0' : 'A'; + if(fuzzy) eq.updateLo(); return ret; } dart -= (bots[0] - tops[0]); } - if(candC) { + if(candC) { if(dart < (bots[1] - tops[1])) { // Eliminate C mismatch top = tops[1]; bot = bots[1]; eq.flags.mmC = 1; assert_lt(eq.join2.mmElims, 15); - ret.chr = 1; - if(fuzzy) eq.updateLo(); + ret.chr = color ? '1' : 'C'; + if(fuzzy) eq.updateLo(); return ret; } dart -= (bots[1] - tops[1]); } - if(candG) { + if(candG) { if(dart < (bots[2] - tops[2])) { // Eliminate G mismatch top = tops[2]; bot = bots[2]; eq.flags.mmG = 1; assert_lt(eq.join2.mmElims, 15); - ret.chr = 2; - if(fuzzy) eq.updateLo(); + ret.chr = color ? '2' : 'G'; + if(fuzzy) eq.updateLo(); return ret; } dart -= (bots[2] - tops[2]); } - if(candT) { + if(candT) { assert_lt(dart, (bots[3] - tops[3])); // Eliminate T mismatch top = tops[3]; bot = bots[3]; eq.flags.mmT = 1; assert_lt(eq.join2.mmElims, 15); - ret.chr = 3; - if(fuzzy) eq.updateLo(); - } - } else { - if(fuzzy) { - last = (num == 1); - if(eq.flags.qualA == eq.flags.quallo && eq.flags.mmA == 0) { - eq.flags.mmA = 1; - ret.chr = 0; - } else if(eq.flags.qualC == eq.flags.quallo && eq.flags.mmC == 0) { - eq.flags.mmC = 1; - ret.chr = 1; - } else if(eq.flags.qualG == eq.flags.quallo && eq.flags.mmG == 0) { - eq.flags.mmG = 1; - ret.chr = 2; - } else { - assert_eq(eq.flags.qualT, eq.flags.quallo); - assert_eq(0, eq.flags.mmT); - eq.flags.mmT = 1; - ret.chr = 3; + ret.chr = color ? '3' : 'T'; + if(fuzzy) eq.updateLo(); } - top = tops[ret.chr]; - bot = bots[ret.chr]; - eliminated_ = last; - if(!last) eq.updateLo(); + } else { + if(fuzzy) { + last = (num == 1); + int chr = -1; + if(eq.flags.qualA == eq.flags.quallo && eq.flags.mmA == 0) { + eq.flags.mmA = 1; + chr = 0; + } else if(eq.flags.qualC == eq.flags.quallo && eq.flags.mmC == 0) { + eq.flags.mmC = 1; + chr = 1; + } else if(eq.flags.qualG == eq.flags.quallo && eq.flags.mmG == 0) { + eq.flags.mmG = 1; + chr = 2; + } else { + assert_eq(eq.flags.qualT, eq.flags.quallo); + assert_eq(0, eq.flags.mmT); + eq.flags.mmT = 1; + chr = 3; + } + ret.chr = color ? "0123"[chr] : "ACGT"[chr]; + top = tops[chr]; + bot = bots[chr]; + eliminated_ = last; + if(!last) eq.updateLo(); } else { last = true; // last at this pos // There's only one; pick it! + int chr = -1; if(!eq.flags.mmA) { - ret.chr = 0; + chr = 0; } else if(!eq.flags.mmC) { - ret.chr = 1; + chr = 1; } else if(!eq.flags.mmG) { - ret.chr = 2; + chr = 2; } else { assert(!eq.flags.mmT); - ret.chr = 3; + chr = 3; } - top = tops[ret.chr]; - bot = bots[ret.chr]; + ret.chr = color ? "0123"[chr] : "ACGT"[chr]; + top = tops[chr]; + bot = bots[chr]; //assert_eq(15, eq.join2.mmElims); // Mark entire position as eliminated eliminated_ = true; - } - } + } + } return ret; } @@ -496,9 +502,9 @@ struct RangeState { class Branch { typedef std::pair U32Pair; public: - Branch() : - delayedCost_(0), curtailed_(false), exhausted_(false), - prepped_(false), delayedIncrease_(false) { } + Branch() : + delayedCost_(0), curtailed_(false), exhausted_(false), + prepped_(false), delayedIncrease_(false) { } /** * Initialize a new branch object with an empty path. @@ -589,16 +595,16 @@ class Branch { assert(!exhausted_); if(i <= len_ && i < rangesSz_) { assert(ranges_ != NULL); -#ifndef NDEBUG - if(!ranges_[i].eliminated_) { - // Someone must be as-yet-uneliminated - assert(!ranges_[i].eq.flags.mmA || - !ranges_[i].eq.flags.mmC || - !ranges_[i].eq.flags.mmG || - !ranges_[i].eq.flags.mmT); - assert_lt(ranges_[i].eq.flags.quallo, 127); - } -#endif +#ifndef NDEBUG + if(!ranges_[i].eliminated_) { + // Someone must be as-yet-uneliminated + assert(!ranges_[i].eq.flags.mmA || + !ranges_[i].eq.flags.mmC || + !ranges_[i].eq.flags.mmG || + !ranges_[i].eq.flags.mmT); + assert_lt(ranges_[i].eq.flags.quallo, 127); + } +#endif return ranges_[i].eliminated_; } return true; @@ -609,20 +615,20 @@ class Branch { * creating a new Branch object for it and inserting that branch * into the priority queue. Mark that outgoing path from the * parent branch as eliminated. If the second-best outgoing path - * cost more, add the difference to the cost of this branch (since + * cost more, add the difference to the cost of this branch (since * that's the best we can do starting from here from now on). */ Branch* splitBranch(AllocOnlyPool& rpool, AllocOnlyPool& epool, AllocOnlyPool& bpool, uint32_t pmSz, - RandomSource& rand, uint32_t qlen, - uint32_t qualLim, int seedLen, + RandomSource& rand, uint32_t qlen, + uint32_t qualLim, int seedLen, bool qualOrder, const EbwtParams& ep, - const uint8_t* ebwt, bool ebwtFw, - bool fuzzy, - bool verbose, - bool quiet) + const uint8_t* ebwt, bool ebwtFw, + bool fuzzy, + bool verbose, + bool quiet) { assert(!exhausted_); assert(ranges_ != NULL); @@ -650,31 +656,31 @@ class Branch { // position if(!eliminated(i)) { numNotEliminated++; - if(fuzzy) { - assert_lt(ranges_[i].eq.flags.quallo, 127); - if(ranges_[i].eq.flags.quallo2 < 127) { - numNotEliminated++; - } - } + if(fuzzy) { + assert_lt(ranges_[i].eq.flags.quallo, 127); + if(ranges_[i].eq.flags.quallo2 < 127) { + numNotEliminated++; + } + } uint16_t stratum = (rdepth_ + i < seedLen) ? (1 << 14) : 0; - uint16_t cost = stratum; - cost |= (qualOrder ? ranges_[i].eq.flags.quallo : 0); + uint16_t cost = stratum; + cost |= (qualOrder ? ranges_[i].eq.flags.quallo : 0); if(cost < bestCost) { // Demote the old best to the next-best nextCost = bestCost; - if(fuzzy) { - if(ranges_[i].eq.flags.quallo2 < 127) { - nextCost = min( - nextCost, ranges_[i].eq.flags.quallo2 | stratum); - } - } + if(fuzzy) { + if(ranges_[i].eq.flags.quallo2 < 127) { + nextCost = min( + nextCost, ranges_[i].eq.flags.quallo2 | stratum); + } + } // Update the new best bestCost = cost; numTiedPositions = 1; tiedPositions[0] = i; } else if(cost == bestCost) { // As good as the best so far - if(fuzzy) nextCost = cost; + if(fuzzy) nextCost = cost; assert_gt(numTiedPositions, 0); if(numTiedPositions < 3) { tiedPositions[numTiedPositions++] = i; @@ -692,21 +698,21 @@ class Branch { } assert_gt(numNotEliminated, 0); assert_gt(numTiedPositions, 0); - //if(nextCost != 0xffff) assert_gt(nextCost, bestCost); + //if(nextCost != 0xffff) assert_gt(nextCost, bestCost); int r = 0; int pos = tiedPositions[r]; bool last = false; - // Pick an edit from among the edits tied for lowest cost - // (using randomness to break ties). If the selected edit is - // the last remaining one at this position, 'last' is set to - // true. + // Pick an edit from among the edits tied for lowest cost + // (using randomness to break ties). If the selected edit is + // the last remaining one at this position, 'last' is set to + // true. uint32_t top = 0, bot = 0; - Edit e = ranges_[pos].pickEdit(pos + rdepth_, rand, fuzzy, top, bot, last); + Edit e = ranges_[pos].pickEdit(pos + rdepth_, rand, fuzzy, top, bot, last); assert_gt(bot, top); // Create and initialize a new Branch uint16_t newRdepth = rdepth_ + pos + 1; assert_lt((bestCost >> 14), 4); - uint32_t hamadd = (bestCost & ~0xc000); + uint32_t hamadd = (bestCost & ~0xc000); uint16_t depth = pos + rdepth_; assert_geq(depth, depth0_); uint16_t newDepth0 = depth0_; @@ -716,11 +722,11 @@ class Branch { if(depth < depth1_) newDepth0 = depth1_; if(depth < depth2_) newDepth1 = depth2_; if(depth < depth3_) newDepth2 = depth3_; - assert_eq((uint32_t)(cost_ & ~0xc000), (uint32_t)(ham_ + hamadd)); + assert_eq((uint32_t)(cost_ & ~0xc000), (uint32_t)(ham_ + hamadd)); if(!newBranch->init( rpool, epool, id, qlen, newDepth0, newDepth1, newDepth2, newDepth3, - newRdepth, 0, cost_, ham_ + hamadd, + newRdepth, 0, cost_, ham_ + hamadd, top, bot, ep, ebwt, &edits_)) { return NULL; @@ -740,16 +746,16 @@ class Branch { rangesSz_ = 0; } } - } - else if(fuzzy && bestCost != nextCost) { - // We exhausted the last outgoing edge at the current best - // cost; update the best cost to be the next-best - assert_gt(nextCost, bestCost); - delayedCost_ = (cost_ - bestCost + nextCost); - assert_gt(delayedCost_, cost_); - delayedIncrease_ = true; - } - else if(!fuzzy && numTiedPositions == 1 && last) { + } + else if(fuzzy && bestCost != nextCost) { + // We exhausted the last outgoing edge at the current best + // cost; update the best cost to be the next-best + assert_gt(nextCost, bestCost); + delayedCost_ = (cost_ - bestCost + nextCost); + assert_gt(delayedCost_, cost_); + delayedIncrease_ = true; + } + else if(!fuzzy && numTiedPositions == 1 && last) { // We exhausted the last outgoing edge at the current best // cost; update the best cost to be the next-best assert_neq(0xffff, nextCost); @@ -828,7 +834,7 @@ class Branch { if(rdepth_ > 0) { for(size_t i = 0; i < rdepth_; i++) { if(editidx < numEdits && edits_.get(editidx).pos == i) { - ss3 << " " << "acgt"[edits_.get(editidx).chr]; + ss3 << " " << tolower(edits_.get(editidx).chr); editidx++; } else { ss3 << " " << qry[qlen - i - 1]; @@ -841,7 +847,7 @@ class Branch { } for(size_t i = 0; i < len_; i++) { if(editidx < numEdits && edits_.get(editidx).pos == printed) { - ss3 << "acgt"[edits_.get(editidx).chr] << " "; + ss3 << tolower(edits_.get(editidx).chr) << " "; editidx++; } else { ss3 << qry[qlen - printed - 1] << " "; @@ -882,7 +888,7 @@ class Branch { if(!eliminated(i)) { eliminatedStretch = 0; uint16_t stratum = (rdepth_ + i < seedLen) ? (1 << 14) : 0; - uint16_t cost = (qualOrder ? ranges_[i].eq.flags.quallo : 0) | stratum; + uint16_t cost = (qualOrder ? ranges_[i].eq.flags.quallo : 0) | stratum; if(cost < lowestCost) lowestCost = cost; } else if(i < rangesSz_) { eliminatedStretch++; @@ -955,32 +961,32 @@ class Branch { * Set the elims to match the ranges in ranges_[len_], already * calculated by the caller. Only does mismatches for now. */ - int installRanges(int c, int nextc, bool fuzzy, uint32_t qAllow, - const uint8_t* qs) - { + int installRanges(int c, int nextc, bool fuzzy, uint32_t qAllow, + const uint8_t* qs) + { assert(!exhausted_); assert(ranges_ != NULL); RangeState& r = ranges_[len_]; int ret = 0; r.eliminated_ = true; // start with everything eliminated - r.eq.eliminate(); // set all elim flags to 1 - assert_lt(qs[0], 127); - assert_lt(qs[1], 127); - assert_lt(qs[2], 127); - assert_lt(qs[3], 127); - if(!fuzzy) { - assert_eq(qs[0], qs[1]); - assert_eq(qs[0], qs[2]); - assert_eq(qs[0], qs[3]); - r.eq.flags.quallo = qs[0]; - if(qs[0] > qAllow) return 0; - } + r.eq.eliminate(); // set all elim flags to 1 + assert_lt(qs[0], 127); + assert_lt(qs[1], 127); + assert_lt(qs[2], 127); + assert_lt(qs[3], 127); + if(!fuzzy) { + assert_eq(qs[0], qs[1]); + assert_eq(qs[0], qs[2]); + assert_eq(qs[0], qs[3]); + r.eq.flags.quallo = qs[0]; + if(qs[0] > qAllow) return 0; + } // Set one/both of these to true to do the accounting for // insertions and deletions as well as mismatches bool doInserts = false; bool doDeletes = false; // We can proceed on an A - if(c != 0 && r.bots[0] > r.tops[0] && qs[0] <= qAllow) { + if(c != 0 && r.bots[0] > r.tops[0] && qs[0] <= qAllow) { r.eliminated_ = false; r.eq.flags.mmA = 0; // A substitution is an option if(doInserts) r.eq.flags.insA = 0; @@ -988,7 +994,7 @@ class Branch { ret++; } // We can proceed on a C - if(c != 1 && r.bots[1] > r.tops[1] && qs[1] <= qAllow) { + if(c != 1 && r.bots[1] > r.tops[1] && qs[1] <= qAllow) { r.eliminated_ = false; r.eq.flags.mmC = 0; // C substitution is an option if(doInserts) r.eq.flags.insC = 0; @@ -996,7 +1002,7 @@ class Branch { ret++; } // We can proceed on a G - if(c != 2 && r.bots[2] > r.tops[2] && qs[2] <= qAllow) { + if(c != 2 && r.bots[2] > r.tops[2] && qs[2] <= qAllow) { r.eliminated_ = false; r.eq.flags.mmG = 0; // G substitution is an option if(doInserts) r.eq.flags.insG = 0; @@ -1004,27 +1010,27 @@ class Branch { ret++; } // We can proceed on a T - if(c != 3 && r.bots[3] > r.tops[3] && qs[3] <= qAllow) { + if(c != 3 && r.bots[3] > r.tops[3] && qs[3] <= qAllow) { r.eliminated_ = false; r.eq.flags.mmT = 0; // T substitution is an option if(doInserts) r.eq.flags.insT = 0; if(doDeletes && nextc == 3) r.eq.flags.del = 0; ret++; } - if(!r.eliminated_ && fuzzy) { - // Copy quals - r.eq.flags.qualA = qs[0]; - r.eq.flags.qualC = qs[1]; - r.eq.flags.qualG = qs[2]; - r.eq.flags.qualT = qs[3]; - - // Now that the quals are set and the elim flags are set - // according to which Burrows-Wheeler ranges are empty, - // determine best and second-best quals - r.eq.updateLo(); - - assert_lt(r.eq.flags.quallo, 127); - } + if(!r.eliminated_ && fuzzy) { + // Copy quals + r.eq.flags.qualA = qs[0]; + r.eq.flags.qualC = qs[1]; + r.eq.flags.qualG = qs[2]; + r.eq.flags.qualT = qs[3]; + + // Now that the quals are set and the elim flags are set + // according to which Burrows-Wheeler ranges are empty, + // determine best and second-best quals + r.eq.updateLo(); + + assert_lt(r.eq.flags.quallo, 127); + } return ret; } @@ -1158,8 +1164,8 @@ class BranchQueue { public: - BranchQueue(bool verbose, bool quiet) : - sz_(0), branchQ_(), patid_(0), verbose_(verbose), quiet_(quiet) + BranchQueue(bool verbose, bool quiet) : + sz_(0), branchQ_(), patid_(0), verbose_(verbose), quiet_(quiet) { } /** @@ -1280,7 +1286,7 @@ class BranchQueue { TBranchQueue branchQ_; // priority queue of branches uint32_t patid_; bool verbose_; - bool quiet_; + bool quiet_; }; /** @@ -1309,15 +1315,15 @@ class PathManager { public: - PathManager(ChunkPool* cpool_, int *btCnt, bool verbose, bool quiet) : - branchQ_(verbose, quiet), + PathManager(ChunkPool* cpool_, int *btCnt, bool verbose, bool quiet) : + branchQ_(verbose, quiet), cpool(cpool_), bpool(cpool, "branch"), rpool(cpool, "range state"), epool(cpool, "edit"), - minCost(0), btCnt_(btCnt), - verbose_(verbose), - quiet_(quiet) + minCost(0), btCnt_(btCnt), + verbose_(verbose), + quiet_(quiet) { } ~PathManager() { } @@ -1456,11 +1462,11 @@ class PathManager { * If the frontmost branch is a curtailed branch, split off an * extendable branch and add it to the queue. */ - bool splitAndPrep(RandomSource& rand, uint32_t qlen, - uint32_t qualLim, int seedLen, - bool qualOrder, bool fuzzy, - const EbwtParams& ep, const uint8_t* ebwt, - bool ebwtFw) + bool splitAndPrep(RandomSource& rand, uint32_t qlen, + uint32_t qualLim, int seedLen, + bool qualOrder, bool fuzzy, + const EbwtParams& ep, const uint8_t* ebwt, + bool ebwtFw) { if(empty()) return true; // This counts as a backtrack @@ -1495,8 +1501,8 @@ class PathManager { } } Branch* newbr = splitBranch( - f, rand, qlen, qualLim, seedLen, qualOrder, fuzzy, ep, ebwt, - ebwtFw); + f, rand, qlen, qualLim, seedLen, qualOrder, fuzzy, ep, ebwt, + ebwtFw); if(newbr == NULL) { return false; } @@ -1529,14 +1535,14 @@ class PathManager { * Split off an extendable branch from a curtailed branch. */ Branch* splitBranch(Branch* src, RandomSource& rand, uint32_t qlen, - uint32_t qualLim, int seedLen, bool qualOrder, bool fuzzy, - const EbwtParams& ep, const uint8_t* ebwt, - bool ebwtFw) + uint32_t qualLim, int seedLen, bool qualOrder, bool fuzzy, + const EbwtParams& ep, const uint8_t* ebwt, + bool ebwtFw) { Branch* dst = src->splitBranch( rpool, epool, bpool, size(), rand, - qlen, qualLim, seedLen, qualOrder, ep, ebwt, ebwtFw, fuzzy, - verbose_, quiet_); + qlen, qualLim, seedLen, qualOrder, ep, ebwt, ebwtFw, fuzzy, + verbose_, quiet_); assert(dst->repOk()); return dst; } @@ -1568,8 +1574,8 @@ class PathManager { /// Pointer to the aligner's per-read backtrack counter. We /// increment it in splitBranch. int *btCnt_; - bool verbose_; - bool quiet_; + bool verbose_; + bool quiet_; }; /** @@ -1727,7 +1733,7 @@ class SingleRangeSourceDriver : public RangeSourceDriver { HitSinkPerThread* sinkPt, vector >& os, bool verbose, - bool quiet, + bool quiet, bool mate1, uint32_t minCostAdjustment, ChunkPool* pool, @@ -1738,7 +1744,7 @@ class SingleRangeSourceDriver : public RangeSourceDriver { params_(params), fw_(fw), rs_(rs), ebwtFw_(rs_->curEbwt()->fw()), - pm_(pool, btCnt, verbose, quiet) + pm_(pool, btCnt, verbose, quiet) { assert(rs_ != NULL); } @@ -1753,12 +1759,12 @@ class SingleRangeSourceDriver : public RangeSourceDriver { virtual void setQueryImpl(PatternSourcePerThread* patsrc, Range *r) { this->done = false; pm_.reset(patsrc->patid()); - ReadBuf* buf = mate1_ ? &patsrc->bufa() : &patsrc->bufb(); - len_ = buf->length(); - rs_->setQuery(*buf, r); - initRangeSource((fw_ == ebwtFw_) ? buf->qual : buf->qualRev, - buf->fuzzy, buf->alts, - (fw_ == ebwtFw_) ? buf->altQual : buf->altQualRev); + ReadBuf* buf = mate1_ ? &patsrc->bufa() : &patsrc->bufb(); + len_ = buf->length(); + rs_->setQuery(*buf, r); + initRangeSource((fw_ == ebwtFw_) ? buf->qual : buf->qualRev, + buf->fuzzy, buf->alts, + (fw_ == ebwtFw_) ? buf->altQual : buf->altQualRev); assert_gt(len_, 0); if(this->done) return; ASSERT_ONLY(allTops_.clear()); @@ -1866,8 +1872,8 @@ class SingleRangeSourceDriver : public RangeSourceDriver { return fw_; } - virtual void initRangeSource(const String& qual, bool fuzzy, - int alts, const String* altQuals) = 0; + virtual void initRangeSource(const String& qual, bool fuzzy, + int alts, const String* altQuals) = 0; protected: @@ -2050,12 +2056,12 @@ class CostAwareRangeSourceDriver : public RangeSourceDriver { bool strandFix, const TRangeSrcDrPtrVec* rss, bool verbose, - bool quiet, - bool mixesReads) : + bool quiet, + bool mixesReads) : RangeSourceDriver(false), rss_(), active_(), strandFix_(strandFix), lastRange_(NULL), delayedRange_(NULL), patsrc_(NULL), - verbose_(verbose), quiet_(quiet), mixesReads_(mixesReads) + verbose_(verbose), quiet_(quiet), mixesReads_(mixesReads) { if(rss != NULL) { rss_ = (*rss); @@ -2466,7 +2472,7 @@ class CostAwareRangeSourceDriver : public RangeSourceDriver { Range *delayedRange_; PatternSourcePerThread* patsrc_; bool verbose_; - bool quiet_; + bool quiet_; bool mixesReads_; ASSERT_ONLY(std::set allTopsRc_); };