diff --git a/Makefile b/Makefile index 94b9853..ebae9a8 100644 --- a/Makefile +++ b/Makefile @@ -44,7 +44,7 @@ ifeq (1,$(WINDOWS)) SHMEM_DEF = endif -PREFETCH_LOCALITY = 0 +PREFETCH_LOCALITY = 2 PREF_DEF = -DPREFETCH_LOCALITY=$(PREFETCH_LOCALITY) LIBS = diff --git a/ebwt.h b/ebwt.h index 9850ccd..4f12597 100644 --- a/ebwt.h +++ b/ebwt.h @@ -32,11 +32,13 @@ using namespace seqan; #ifndef PREFETCH_LOCALITY // No locality by default -#define PREFETCH_LOCALITY 0 +#define PREFETCH_LOCALITY 2 #endif +#ifndef NEW_FORMAT // From ccnt_lut.cpp, automatically generated by gen_lookup_tables.pl extern uint8_t cCntLUT_4[4][4][256]; +#endif #ifndef VMSG_NL #define VMSG_NL(args...) \ @@ -1391,7 +1393,7 @@ struct SideLocus { _fw(-1), _by(-1), _bp(-1), - _side(NULL) { } + _side(NULL), _oside(NULL) { } /** * Construct from row and other relevant information about the Ebwt. @@ -1422,6 +1424,7 @@ struct SideLocus { lbot._sideNum = ltop._sideNum; lbot._sideByteOff = ltop._sideByteOff; lbot._side = ltop._side; + lbot._oside = ltop._oside; lbot._fw = ltop._fw; lbot._by = lbot._charOff >> 2; assert_lt(lbot._by, (int)sideBwtSz); @@ -1449,6 +1452,7 @@ struct SideLocus { _side = ebwt + _sideByteOff; // prefetch tjside too _fw = _sideNum & 1; // odd-numbered sides are forward + _oside = _side + (_fw? (-128) : (128)); _by = _charOff >> 2; // byte within side assert_lt(_by, (int)ep._sideBwtSz); _bp = _charOff & 3; // bit-pair within byte @@ -1469,17 +1473,24 @@ struct SideLocus { void prefetch() { assert(valid()); #ifndef NO_PREFETCH - // prefetch this side - __builtin_prefetch((const void *)_side, - 0 /* prepare for read */, - PREFETCH_LOCALITY /* no locality */); - // prefetch the other half of this side - __builtin_prefetch((const void *)(_side + 64), - 0 /* prepare for read */, - PREFETCH_LOCALITY /* no locality */); + // prefetch the other 3/4 of this side +// __builtin_prefetch((const void *)(_side + 32), +// 0 /* prepare for read */, +// PREFETCH_LOCALITY /* no locality */); +// __builtin_prefetch((const void *)(_side + 64), +// 0 /* prepare for read */, +// PREFETCH_LOCALITY /* no locality */); +// __builtin_prefetch((const void *)(_side + 96), +// 0 /* prepare for read */, +// PREFETCH_LOCALITY /* no locality */); // prefetch the half of the opposite side that contains the // cumulative character occurrence counts - __builtin_prefetch((const void *)(_side + (_fw? (-64) : 128)), + __builtin_prefetch((const void *)_oside, + 0 /* prepare for read */, + PREFETCH_LOCALITY /* no locality */); + + // prefetch this side + __builtin_prefetch((const void *)_side, 0 /* prepare for read */, PREFETCH_LOCALITY /* no locality */); #endif @@ -1503,7 +1514,8 @@ struct SideLocus { int _fw; // side is forward or backward? int _by; // byte within side (not adjusted for bw sides) int _bp; // bitpair within byte (not adjusted for bw sides) - const uint8_t *_side; // ptr to beginning of top side + const uint8_t *_side; // ptr to beginning of side + const uint8_t *_oside; // ptr to beginning of opposite side }; #include "ebwt_search_backtrack.h" @@ -1698,15 +1710,18 @@ void Ebwt::sanityCheckAll() const { template inline int Ebwt::rowL(const SideLocus& l) const { // Extract and return appropriate bit-pair +#ifdef NEW_FORMAT + return (((uint64_t*)l._side)[l._by >> 3] >> ((((l._by & 7) << 2) + l._bp) << 1)) & 3; +#else return unpack_2b_from_8b(l._side[l._by], l._bp); +#endif } /** * Inline-function version of the above. This does not always seem to * be inlined - * - * Function gets 3.38% in profile */ +#if 0 inline static int pop64(uint64_t x) { x = x - ((x >> 1) & 0x5555555555555555llu); x = (x & 0x3333333333333333llu) + ((x >> 2) & 0x3333333333333333llu); @@ -1716,12 +1731,16 @@ inline static int pop64(uint64_t x) { x = x + (x >> 32); return x & 0x3F; } +#else +// Use gcc's intrinsic popcountll. It's somewhat faster than the above +// routine and hopefully it's smart enough to use a 64-bit popcnt +// instruction if one is available (e.g. on Itanium). +#define pop64(x) __builtin_popcountll(x) +#endif /** * Tricky-bit-bashing bitpair counting for given two-bit value (0-3) * within a 64-bit argument. - * - * Function gets 9.26% in profile */ inline static int countInU64(int c, uint64_t dw) { uint64_t dwA = dw & 0xAAAAAAAAAAAAAAAAllu; @@ -1778,26 +1797,33 @@ inline static void countInU64Ex(uint64_t dw, uint32_t* arrs) { */ template inline uint32_t Ebwt::countUpTo(const SideLocus& l, int c) const { - uint32_t cCnt = 0; - int i = 0; // Count occurrences of c in each 64-bit (using bit trickery); - // note: this seems does not seem to lend a significant boost to - // performance. If you comment out this whole loop (which won't - // affect correctness - it will just cause the following loop to - // take up the slack) then runtime does not change noticeably. - // Someday the countInU64() and pop() functions should be + // Someday countInU64() and pop() functions should be // vectorized/SSE-ized in case that helps. - for(; i+7 < l._by; i += 8) { // 2.98% in profile - cCnt += countInU64(c, *(uint64_t*)&l._side[i]); // 1.28% in profile + uint32_t cCnt = 0; + int i = 0; + for(; i + 7 < l._by; i += 8) { + cCnt += countInU64(c, *(uint64_t*)&l._side[i]); + } +#ifdef NEW_FORMAT + // Calculate number of bit pairs to shift off the end + const int bpShiftoff = 32 - (((l._by & 7) << 2) + l._bp); + if(bpShiftoff < 32) { + assert_lt(bpShiftoff, 32); + const uint64_t sw = (*(uint64_t*)&l._side[i]) << (bpShiftoff << 1); + cCnt += countInU64(c, sw); + if(c == 0) cCnt -= bpShiftoff; // we turned these into As } +#else // Count occurences of c in the rest of the side (using LUT) - for(; i < l._by; i++) { // 2.89% in profile - cCnt += cCntLUT_4[0][c][l._side[i]]; // 2.37% in profile + for(; i < l._by; i++) { + cCnt += cCntLUT_4[0][c][l._side[i]]; } // Count occurences of c in the rest of the byte if(l._bp > 0) { - cCnt += cCntLUT_4[l._bp][c][l._side[i]]; // 1.15% in profile + cCnt += cCntLUT_4[l._bp][c][l._side[i]]; } +#endif return cCnt; } @@ -1820,12 +1846,22 @@ inline void Ebwt::countUpToEx(const SideLocus& l, uint32_t* arrs) const { for(; i+7 < l._by; i += 8) { countInU64Ex(*(uint64_t*)&l._side[i], arrs); } +#ifdef NEW_FORMAT + // Calculate number of bit pairs to shift off the end + const int bpShiftoff = 32 - (((l._by & 7) << 2) + l._bp); + assert_leq(bpShiftoff, 32); + if(bpShiftoff < 32) { + const uint64_t sw = (*(uint64_t*)&l._side[i]) << (bpShiftoff << 1); + countInU64Ex(sw, arrs); + arrs[0] -= bpShiftoff; + } +#else // Count occurences of c in the rest of the side (using LUT) for(; i < l._by; i++) { - arrs[0] += cCntLUT_4[0][0][l._side[i]]; // 0.73% + 0.63% + 0.63% + 0.63% in profile + arrs[0] += cCntLUT_4[0][0][l._side[i]]; arrs[1] += cCntLUT_4[0][1][l._side[i]]; arrs[2] += cCntLUT_4[0][2][l._side[i]]; - arrs[3] += cCntLUT_4[0][3][l._side[i]]; // 0.91% in profile + arrs[3] += cCntLUT_4[0][3][l._side[i]]; } // Count occurences of c in the rest of the byte if(l._bp > 0) { @@ -1834,6 +1870,7 @@ inline void Ebwt::countUpToEx(const SideLocus& l, uint32_t* arrs) const { arrs[2] += cCntLUT_4[l._bp][2][l._side[i]]; arrs[3] += cCntLUT_4[l._bp][3][l._side[i]]; } +#endif } /** @@ -1843,24 +1880,19 @@ inline void Ebwt::countUpToEx(const SideLocus& l, uint32_t* arrs) const { */ template inline uint32_t Ebwt::countFwSide(const SideLocus& l, int c) const { - const EbwtParams& eh = this->_eh; - int by = l._by; - int bp = l._bp; - const uint8_t *ebwtSide = l._side; - uint32_t sideByteOff = l._sideByteOff; assert_lt(c, 4); assert_geq(c, 0); - assert_lt(by, (int)eh._sideBwtSz); - assert_geq(by, 0); - assert_lt(bp, 4); - assert_geq(bp, 0); + assert_lt(l._by, (int)this->_eh._sideBwtSz); + assert_geq(l._by, 0); + assert_lt(l._bp, 4); + assert_geq(l._bp, 0); uint32_t cCnt = countUpTo(l, c); - assert_leq(cCnt, eh._sideBwtLen); - if(c == 0 && sideByteOff <= _zEbwtByteOff && sideByteOff + by >= _zEbwtByteOff) { + assert_leq(cCnt, this->_eh._sideBwtLen); + if(c == 0 && l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) { // Adjust for the fact that we represented $ with an 'A', but // shouldn't count it as an 'A' here - if(sideByteOff + by > _zEbwtByteOff || - sideByteOff + by == _zEbwtByteOff && bp > _zEbwtBpOff) + if(l._sideByteOff + l._by > _zEbwtByteOff || + l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff) { cCnt--; // Adjust for '$' looking like an 'A' } @@ -1868,21 +1900,21 @@ inline uint32_t Ebwt::countFwSide(const SideLocus& l, int c) const { uint32_t ret; // Now factor in the occ[] count at the side break if(c < 2) { - const uint32_t *ac = reinterpret_cast(ebwtSide - 8); - assert_leq(ac[0], eh._numSides * eh._sideBwtLen); // b/c it's used as padding - assert_lt(ac[1], eh._len); + const uint32_t *ac = reinterpret_cast(l._side - 8); + assert_leq(ac[0], this->_eh._numSides * this->_eh._sideBwtLen); // b/c it's used as padding + assert_lt(ac[1], this->_eh._len); ret = ac[c] + cCnt + this->_fchr[c]; } else { - const uint32_t *gt = reinterpret_cast(ebwtSide + eh._sideSz - 8); // next - assert_lt(gt[0], eh._len); assert_lt(gt[1], eh._len); + const uint32_t *gt = reinterpret_cast(l._side + this->_eh._sideSz - 8); // next + assert_lt(gt[0], this->_eh._len); assert_lt(gt[1], this->_eh._len); ret = gt[c-2] + cCnt + this->_fchr[c]; } #ifndef NDEBUG assert_leq(ret, this->_fchr[c+1]); // can't have jumpded into next char's section if(c == 0) { - assert_leq(cCnt, eh._sideBwtLen); + assert_leq(cCnt, this->_eh._sideBwtLen); } else { - assert_lt(ret, eh._bwtLen); + assert_lt(ret, this->_eh._bwtLen); } #endif return ret; @@ -1896,40 +1928,41 @@ inline uint32_t Ebwt::countFwSide(const SideLocus& l, int c) const { template inline void Ebwt::countFwSideEx(const SideLocus& l, uint32_t* arrs) const { - const EbwtParams& eh = this->_eh; - int by = l._by; - int bp = l._bp; - const uint8_t *ebwtSide = l._side; - uint32_t sideByteOff = l._sideByteOff; - assert_lt(by, (int)eh._sideBwtSz); - assert_geq(by, 0); - assert_lt(bp, 4); - assert_geq(bp, 0); + assert_lt(l._by, (int)this->_eh._sideBwtSz); + assert_geq(l._by, 0); + assert_lt(l._bp, 4); + assert_geq(l._bp, 0); countUpToEx(l, arrs); - assert_leq(arrs[0], eh._sideBwtLen); - assert_leq(arrs[1], eh._sideBwtLen); - assert_leq(arrs[2], eh._sideBwtLen); - assert_leq(arrs[3], eh._sideBwtLen); - if(sideByteOff <= _zEbwtByteOff && sideByteOff + by >= _zEbwtByteOff) { +#ifndef NDEBUG + assert_leq(arrs[0], this->_fchr[1]); // can't have jumped into next char's section + assert_leq(arrs[1], this->_fchr[2]); // can't have jumped into next char's section + assert_leq(arrs[2], this->_fchr[3]); // can't have jumped into next char's section + assert_leq(arrs[3], this->_fchr[4]); // can't have jumped into next char's section +#endif + assert_leq(arrs[0], this->_eh._sideBwtLen); + assert_leq(arrs[1], this->_eh._sideBwtLen); + assert_leq(arrs[2], this->_eh._sideBwtLen); + assert_leq(arrs[3], this->_eh._sideBwtLen); + if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) { // Adjust for the fact that we represented $ with an 'A', but // shouldn't count it as an 'A' here - if(sideByteOff + by > _zEbwtByteOff || - sideByteOff + by == _zEbwtByteOff && bp > _zEbwtBpOff) + if(l._sideByteOff + l._by > _zEbwtByteOff || + l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff) { arrs[0]--; // Adjust for '$' looking like an 'A' } } // Now factor in the occ[] count at the side break - const uint32_t *ac = reinterpret_cast(ebwtSide - 8); - const uint32_t *gt = reinterpret_cast(ebwtSide + eh._sideSz - 8); + const uint32_t *ac = reinterpret_cast(l._side - 8); + const uint32_t *gt = reinterpret_cast(l._side + this->_eh._sideSz - 8); #ifndef NDEBUG - assert_leq(ac[0], this->_fchr[1] + eh.sideBwtLen()); + assert_leq(ac[0], this->_fchr[1] + this->_eh.sideBwtLen()); assert_leq(ac[1], this->_fchr[2]-this->_fchr[1]); assert_leq(gt[0], this->_fchr[3]-this->_fchr[2]); assert_leq(gt[1], this->_fchr[4]-this->_fchr[3]); #endif - assert_lt(ac[0], eh._len + eh.sideBwtLen()); assert_lt(ac[1], eh._len); - assert_lt(gt[0], eh._len); assert_lt(gt[1], eh._len); + assert_lt(ac[0], this->_eh._len + this->_eh.sideBwtLen()); assert_lt(ac[1], this->_eh._len); + assert_lt(gt[0], this->_eh._len); assert_lt(gt[1], this->_eh._len); arrs[0] += (ac[0] + this->_fchr[0]); arrs[1] += (ac[1] + this->_fchr[1]); arrs[2] += (gt[0] + this->_fchr[2]); @@ -1949,25 +1982,20 @@ inline void Ebwt::countFwSideEx(const SideLocus& l, uint32_t* arrs) const */ template inline uint32_t Ebwt::countBwSide(const SideLocus& l, int c) const { - const EbwtParams& eh = this->_eh; - int by = l._by; - int bp = l._bp; - const uint8_t *ebwtSide = l._side; - uint32_t sideByteOff = l._sideByteOff; assert_lt(c, 4); assert_geq(c, 0); - assert_lt(by, (int)eh._sideBwtSz); - assert_geq(by, 0); - assert_lt(bp, 4); - assert_geq(bp, 0); + assert_lt(l._by, (int)this->_eh._sideBwtSz); + assert_geq(l._by, 0); + assert_lt(l._bp, 4); + assert_geq(l._bp, 0); uint32_t cCnt = countUpTo(l, c); - if(unpack_2b_from_8b(ebwtSide[by], bp) == c) cCnt++; - assert_leq(cCnt, eh._sideBwtLen); - if(c == 0 && sideByteOff <= _zEbwtByteOff && sideByteOff + by >= _zEbwtByteOff) { + if(rowL(l) == c) cCnt++; + assert_leq(cCnt, this->_eh._sideBwtLen); + if(c == 0 && l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) { // Adjust for the fact that we represented $ with an 'A', but // shouldn't count it as an 'A' here - if(sideByteOff + by > _zEbwtByteOff || - sideByteOff + by == _zEbwtByteOff && bp >= _zEbwtBpOff) + if(l._sideByteOff + l._by > _zEbwtByteOff || + l._sideByteOff + l._by == _zEbwtByteOff && l._bp >= _zEbwtBpOff) { cCnt--; } @@ -1975,21 +2003,21 @@ inline uint32_t Ebwt::countBwSide(const SideLocus& l, int c) const { uint32_t ret; // Now factor in the occ[] count at the side break if(c < 2) { - const uint32_t *ac = reinterpret_cast(ebwtSide + eh._sideSz - 8); - assert_leq(ac[0], eh._numSides * eh._sideBwtLen); // b/c it's used as padding - assert_lt(ac[1], eh._len); + const uint32_t *ac = reinterpret_cast(l._side + this->_eh._sideSz - 8); + assert_leq(ac[0], this->_eh._numSides * this->_eh._sideBwtLen); // b/c it's used as padding + assert_lt(ac[1], this->_eh._len); ret = ac[c] - cCnt + this->_fchr[c]; } else { - const uint32_t *gt = reinterpret_cast(ebwtSide + (2*eh._sideSz) - 8); // next - assert_lt(gt[0], eh._len); assert_lt(gt[1], eh._len); + const uint32_t *gt = reinterpret_cast(l._side + (2*this->_eh._sideSz) - 8); // next + assert_lt(gt[0], this->_eh._len); assert_lt(gt[1], this->_eh._len); ret = gt[c-2] - cCnt + this->_fchr[c]; } #ifndef NDEBUG assert_leq(ret, this->_fchr[c+1]); // can't have jumped into next char's section if(c == 0) { - assert_leq(cCnt, eh._sideBwtLen); + assert_leq(cCnt, this->_eh._sideBwtLen); } else { - assert_lt(ret, eh._bwtLen); + assert_lt(ret, this->_eh._bwtLen); } #endif return ret; @@ -2002,50 +2030,45 @@ inline uint32_t Ebwt::countBwSide(const SideLocus& l, int c) const { */ template inline void Ebwt::countBwSideEx(const SideLocus& l, uint32_t* arrs) const { - const EbwtParams& eh = this->_eh; - int by = l._by; - int bp = l._bp; - const uint8_t *ebwtSide = l._side; - uint32_t sideByteOff = l._sideByteOff; - assert_lt(by, (int)eh._sideBwtSz); - assert_geq(by, 0); - assert_lt(bp, 4); - assert_geq(bp, 0); + assert_lt(l._by, (int)this->_eh._sideBwtSz); + assert_geq(l._by, 0); + assert_lt(l._bp, 4); + assert_geq(l._bp, 0); countUpToEx(l, arrs); - arrs[unpack_2b_from_8b(ebwtSide[by], bp)]++; - assert_leq(arrs[0], eh._sideBwtLen); - assert_leq(arrs[1], eh._sideBwtLen); - assert_leq(arrs[2], eh._sideBwtLen); - assert_leq(arrs[3], eh._sideBwtLen); - if(sideByteOff <= _zEbwtByteOff && sideByteOff + by >= _zEbwtByteOff) { + arrs[rowL(l)]++; + assert_leq(arrs[0], this->_eh._sideBwtLen); + assert_leq(arrs[1], this->_eh._sideBwtLen); + assert_leq(arrs[2], this->_eh._sideBwtLen); + assert_leq(arrs[3], this->_eh._sideBwtLen); + if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) { // Adjust for the fact that we represented $ with an 'A', but // shouldn't count it as an 'A' here - if(sideByteOff + by > _zEbwtByteOff || - sideByteOff + by == _zEbwtByteOff && bp >= _zEbwtBpOff) + if(l._sideByteOff + l._by > _zEbwtByteOff || + l._sideByteOff + l._by == _zEbwtByteOff && l._bp >= _zEbwtBpOff) { arrs[0]--; // Adjust for '$' looking like an 'A' } } // Now factor in the occ[] count at the side break - const uint32_t *ac = reinterpret_cast(ebwtSide + eh._sideSz - 8); - const uint32_t *gt = reinterpret_cast(ebwtSide + (2*eh._sideSz) - 8); + const uint32_t *ac = reinterpret_cast(l._side + this->_eh._sideSz - 8); + const uint32_t *gt = reinterpret_cast(l._side + (2*this->_eh._sideSz) - 8); #ifndef NDEBUG - assert_leq(ac[0], this->_fchr[1] + eh.sideBwtLen()); + assert_leq(ac[0], this->_fchr[1] + this->_eh.sideBwtLen()); assert_leq(ac[1], this->_fchr[2]-this->_fchr[1]); assert_leq(gt[0], this->_fchr[3]-this->_fchr[2]); assert_leq(gt[1], this->_fchr[4]-this->_fchr[3]); #endif - assert_lt(ac[0], eh._len + eh.sideBwtLen()); assert_lt(ac[1], eh._len); - assert_lt(gt[0], eh._len); assert_lt(gt[1], eh._len); + assert_lt(ac[0], this->_eh._len + this->_eh.sideBwtLen()); assert_lt(ac[1], this->_eh._len); + assert_lt(gt[0], this->_eh._len); assert_lt(gt[1], this->_eh._len); arrs[0] = (ac[0] - arrs[0] + this->_fchr[0]); arrs[1] = (ac[1] - arrs[1] + this->_fchr[1]); arrs[2] = (gt[0] - arrs[2] + this->_fchr[2]); arrs[3] = (gt[1] - arrs[3] + this->_fchr[3]); #ifndef NDEBUG - assert_leq(arrs[0], this->_fchr[1]); // can't have jumpded into next char's section - assert_leq(arrs[1], this->_fchr[2]); // can't have jumpded into next char's section - assert_leq(arrs[2], this->_fchr[3]); // can't have jumpded into next char's section - assert_leq(arrs[3], this->_fchr[4]); // can't have jumpded into next char's section + assert_leq(arrs[0], this->_fchr[1]); // can't have jumped into next char's section + assert_leq(arrs[1], this->_fchr[2]); // can't have jumped into next char's section + assert_leq(arrs[2], this->_fchr[3]); // can't have jumped into next char's section + assert_leq(arrs[3], this->_fchr[4]); // can't have jumped into next char's section #endif } @@ -2127,7 +2150,7 @@ inline uint32_t Ebwt::mapLF(const SideLocus& l { uint32_t ret; assert(l._side != NULL); - int c = unpack_2b_from_8b(l._side[l._by], l._bp); + int c = rowL(l); // unpack_2b_from_8b(l._side[l._by], l._bp); assert_lt(c, 4); assert_geq(c, 0); if(l._fw) ret = countFwSide(l, c); // Forward side @@ -2188,7 +2211,7 @@ inline uint32_t Ebwt::mapLF1(const SideLocus& l, int c ASSERT_ONLY(, bool overrideSanity)) const { uint32_t ret; - if(c == 4 || unpack_2b_from_8b(l._side[l._by], l._bp) != c) { // L2 miss? + if(c == 4 || rowL(l) /*unpack_2b_from_8b(l._side[l._by], l._bp)*/ != c) { // L2 miss? return 0xffffffff; } assert_lt(c, 4); @@ -2526,7 +2549,7 @@ inline bool Ebwt::reportReconstruct(const String& query, // Walk along until we reach the next marked row to the left while(((i & offMask) != i) && i != _zOff) { // Not a marked row; walk left one more char - int c = unpack_2b_from_8b(l->_side[l->_by], l->_bp); + int c = rowL(*l); // unpack_2b_from_8b(l->_side[l->_by], l->_bp); appendValue(lbuf, (Dna5)c); uint32_t newi; assert_lt(c, 4); @@ -2580,7 +2603,7 @@ inline bool Ebwt::reportReconstruct(const String& query, uint32_t diff = textoff-jumps; for(size_t j = 0; j < diff; j++) { // Not a marked row; walk left one more char - int c = unpack_2b_from_8b(l->_side[l->_by], l->_bp); + int c = rowL(*l); // unpack_2b_from_8b(l->_side[l->_by], l->_bp); appendValue(lbuf, (Dna5)c); uint32_t newi; assert_lt(c, 4); @@ -2622,7 +2645,7 @@ inline bool Ebwt::reportReconstruct(const String& query, l->prefetch(); for(size_t j = 0; j < right_steps_rounded; j++) { // Not a marked row; walk left one more char - int c = unpack_2b_from_8b(l->_side[l->_by], l->_bp); + int c = rowL(*l); // unpack_2b_from_8b(l->_side[l->_by], l->_bp); appendValue(rbuf, (Dna5)c); uint32_t newi; assert_lt(c, 4); assert_geq(c, 0); @@ -4021,9 +4044,17 @@ void Ebwt::buildToDisk(InorderBlockwiseSA& sa, // Allocate the side buffer; holds a single side as its being // constructed and then written to disk. Reused across all sides. +#ifdef NEW_FORMAT + uint64_t *ebwtSide = NULL; +#else uint8_t *ebwtSide = NULL; +#endif try { +#ifdef NEW_FORMAT + ebwtSide = new uint64_t[sideSz >> 3]; +#else ebwtSide = new uint8_t[sideSz]; +#endif } catch(bad_alloc &e) { cerr << "Out of memory allocating ebwtSide[] in " << "Ebwt::buildToDisk() at " << __FILE__ << ":" @@ -4055,7 +4086,11 @@ void Ebwt::buildToDisk(InorderBlockwiseSA& sa, uint32_t side = 0; // Points to a byte offset from 'side' within ebwt[] where next // char should be written +#ifdef NEW_FORMAT + int sideCur = (eh._sideBwtSz >> 3) - 1; +#else int sideCur = eh._sideBwtSz - 1; +#endif // Whether we're assembling a forward or a reverse bucket bool fw = false; @@ -4084,7 +4119,12 @@ void Ebwt::buildToDisk(InorderBlockwiseSA& sa, ebwtSide[sideCur] = 0; // clear assert_lt(side + sideCur, ebwtTotSz); // Iterate over bit-pairs in the si'th character of the BWT - for(int bpi = 0; bpi < 4; bpi++, si++) { +#ifdef NEW_FORMAT + for(int bpi = 0; bpi < 32; bpi++, si++) +#else + for(int bpi = 0; bpi < 4; bpi++, si++) +#endif + { int bwtChar; bool count = true; if(si <= len) { @@ -4173,22 +4213,45 @@ void Ebwt::buildToDisk(InorderBlockwiseSA& sa, // Append BWT char to bwt section of current side if(fw) { // Forward bucket: fill from least to most +#ifdef NEW_FORMAT + ebwtSide[sideCur] |= ((uint64_t)bwtChar << (bpi << 1)); + if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0); +#else pack_2b_in_8b(bwtChar, ebwtSide[sideCur], bpi); assert_eq((ebwtSide[sideCur] >> (bpi*2)) & 3, bwtChar); +#endif } else { // Backward bucket: fill from most to least +#ifdef NEW_FORMAT + ebwtSide[sideCur] |= ((uint64_t)bwtChar << ((31 - bpi) << 1)); + if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0); +#else pack_2b_in_8b(bwtChar, ebwtSide[sideCur], 3-bpi); assert_eq((ebwtSide[sideCur] >> ((3-bpi)*2)) & 3, bwtChar); +#endif } } // end loop over bit-pairs assert_eq(dollarSkipped ? 3 : 0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3); +#ifdef NEW_FORMAT + assert_eq(0, si & 31); +#else assert_eq(0, si & 3); +#endif if(fw) sideCur++; else sideCur--; - if(sideCur == (int)eh._sideBwtSz) { +#ifdef NEW_FORMAT + if(sideCur == (int)eh._sideBwtSz >> 3) +#else + if(sideCur == (int)eh._sideBwtSz) +#endif + { // Forward side boundary assert_eq(0, si % eh._sideBwtLen); +#ifdef NEW_FORMAT + sideCur = (eh._sideBwtSz >> 3) - 1; +#else sideCur = eh._sideBwtSz - 1; +#endif assert(fw); fw = false; wroteFwBucket = true; // Write 'G' and 'T' assert_leq(occSave[0], occ[2]); diff --git a/ebwt_search.cpp b/ebwt_search.cpp index 11fc5d3..2bb9bea 100644 --- a/ebwt_search.cpp +++ b/ebwt_search.cpp @@ -750,6 +750,9 @@ static void parseOptions(int argc, char **argv) { case 'z': fullIndex = false; break; case ARG_REFIDX: noRefNames = true; break; case ARG_STATEFUL: stateful = true; break; + case ARG_PREFETCH_WIDTH: + prefetchWidth = parseInt(1, "--prewidth must be at least 1"); + break; case ARG_SEED: seed = parseInt(0, "--seed arg must be at least 0"); break; diff --git a/ebwt_search_backtrack.h b/ebwt_search_backtrack.h index 0121bd9..1d854e2 100644 --- a/ebwt_search_backtrack.h +++ b/ebwt_search_backtrack.h @@ -595,19 +595,6 @@ class GreedyDFSRangeSource : public RangeSource { assert_gt(contMan.size(), 0); GreedyDFSContinuation& cont = contMan.front(); assert(cont.prepped); // must have been prepped - uint32_t stackDepth = cont.stackDepth; // depth of the recursion stack; = # mismatches so far - uint32_t depth = cont.depth; // next depth where a post-pair needs to be calculated - uint32_t unrevOff = cont.unrevOff; // depths < unrevOff are unrevisitable - uint32_t oneRevOff = cont.oneRevOff; // depths < oneRevOff are 1-revisitable - uint32_t twoRevOff = cont.twoRevOff; // depths < twoRevOff are 2-revisitable - uint32_t threeRevOff = cont.threeRevOff;// depths < threeRevOff are 3-revisitable - uint32_t top = cont.top; // top arrow in pair prior to 'depth' - uint32_t bot = cont.bot; // bottom arrow in pair prior to 'depth' - uint32_t ham = cont.ham; // weighted hamming distance so far - uint32_t iham = cont.iham; // initial weighted hamming distance - uint32_t* pairs = cont.pairs; // portion of pairs array to be used for this backtrack frame - uint8_t* elims = cont.elims; // portion of elims array to be used for this backtrack frame - bool disableFtab = cont.disableFtab;// // TODO: Also need to save and restore _muts, _mms, _refcs, _chars somehow // Let _foundRange = false; we'll set it to true iff this call @@ -615,13 +602,13 @@ class GreedyDFSRangeSource : public RangeSource { _foundRange = false; // Can't have already exceeded weighted hamming distance threshold - assert_leq(stackDepth, depth); + assert_leq(cont.stackDepth, cont.depth); assert_gt(length(*_qry), 0); assert_leq(_qlen, length(*_qry)); assert_geq(length(*_qual), length(*_qry)); - assert_leq(ham, _qualThresh); - assert_geq(bot, top); - assert_leq(stackDepth, _qlen); + assert_leq(cont.ham, _qualThresh); + assert_geq(cont.bot, cont.top); + assert_leq(cont.stackDepth, _qlen); const Ebwt >& ebwt = *_ebwt; if(_halfAndHalf) { assert_eq(0, _reportPartials); @@ -629,42 +616,42 @@ class GreedyDFSRangeSource : public RangeSource { } if(_reportPartials) { assert(!_halfAndHalf); - assert_leq(stackDepth, _reportPartials); + assert_leq(cont.stackDepth, _reportPartials); } if(_verbose) { - cout << " advance(stackDepth=" << stackDepth << ", " - << "depth=" << depth << ", " - << "top=" << top << ", " - << "bot=" << bot << ", " - << "ham=" << ham << ", " - << "iham=" << iham << ", " - << "pairs=" << pairs << ", " - << "elims=" << (void*)elims << "): \""; - for(int i = (int)depth - 1; i >= 0; i--) { + cout << " advance(stackDepth=" << cont.stackDepth << ", " + << "depth=" << cont.depth << ", " + << "top=" << cont.top << ", " + << "bot=" << cont.bot << ", " + << "ham=" << cont.ham << ", " + << "iham=" << cont.iham << ", " + << "pairs=" << cont.pairs << ", " + << "elims=" << (void*)cont.elims << "): \""; + for(int i = (int)cont.depth - 1; i >= 0; i--) { cout << _chars[i]; } cout << "\"" << endl; } // We can't have already processed the entire read - assert_leq(depth, _qlen); + assert_leq(cont.depth, _qlen); // If we're searching for a half-and-half solution, then // enforce the boundary-crossing constraints here. if(_halfAndHalf && - !halfAndHalfOK(stackDepth, depth, iham)) + !halfAndHalfOK(cont.stackDepth, cont.depth, cont.iham)) { // Continuation is rejected because it violates a boundary- // crossing constraint. contMan.pop(); return false; } - uint32_t cur = _qlen - depth - 1; // current offset into _qry - if(depth < _qlen) { + uint32_t cur = _qlen - cont.depth - 1; // current offset into _qry + if(cont.depth < _qlen) { // If we're still trying to make progress along the length of // the read... if(_verbose) { cout << " cur=" << cur << " \""; - for(int i = (int)depth - 1; i >= 0; i--) { + for(int i = (int)cont.depth - 1; i >= 0; i--) { cout << _chars[i]; } cout << "\""; @@ -680,41 +667,41 @@ class GreedyDFSRangeSource : public RangeSource { // not in the unrevisitable region, and b) its selection would // not cause the quality ceiling (if one exists) to be exceeded bool curIsAlternative = - (depth >= unrevOff) && + (cont.depth >= cont.unrevOff) && (!_considerQuals || - (ham + mmPenalty(_maqPenalty, q) <= _qualThresh)); + (cont.ham + mmPenalty(_maqPenalty, q) <= _qualThresh)); if(_verbose) { cout << " alternative: " << curIsAlternative << endl; } // If c is 'N', then it's a mismatch - if(c == 4 && depth > 0) { + if(c == 4 && cont.depth > 0) { // Force the 'else if(curIsAlternative)' or 'else' // branches below - top = bot = 1; + cont.top = cont.bot = 1; } else if(c == 4) { // We'll take the 'if(top == 0 && bot == 0)' branch below - assert_eq(0, top); - assert_eq(0, bot); + assert_eq(0, cont.top); + assert_eq(0, cont.bot); } // Calculate the ranges for this position - if(top == 0 && bot == 0) { + if(cont.top == 0 && cont.bot == 0) { // Calculate first quartet of ranges using the _fchr[] // array - pairs[0 + 0] = ebwt._fchr[0]; - pairs[0 + 4] = pairs[1 + 0] = ebwt._fchr[1]; - pairs[1 + 4] = pairs[2 + 0] = ebwt._fchr[2]; - pairs[2 + 4] = pairs[3 + 0] = ebwt._fchr[3]; - pairs[3 + 4] = ebwt._fchr[4]; + cont.pairs[0 + 0] = ebwt._fchr[0]; + cont.pairs[0 + 4] = cont.pairs[1 + 0] = ebwt._fchr[1]; + cont.pairs[1 + 4] = cont.pairs[2 + 0] = ebwt._fchr[2]; + cont.pairs[2 + 4] = cont.pairs[3 + 0] = ebwt._fchr[3]; + cont.pairs[3 + 4] = ebwt._fchr[4]; // Update top and bot if(c < 4) { - top = pairTop(pairs, depth, c); - bot = pairBot(pairs, depth, c); + cont.top = pairTop(cont.pairs, cont.depth, c); + cont.bot = pairBot(cont.pairs, cont.depth, c); } } else if(curIsAlternative) { // Clear pairs - memset(&pairs[depth*8], 0, 8 * 4); + memset(&cont.pairs[cont.depth*8], 0, 8 * 4); // Calculate next quartet of ranges. We hope that the // appropriate cache lines are prefetched before now; // otherwise, we're about to take an expensive cache @@ -722,11 +709,12 @@ class GreedyDFSRangeSource : public RangeSource { assert(cont.ltop.valid()); assert(cont.lbot.valid()); ebwt.mapLFEx(cont.ltop, cont.lbot, - &pairs[depth*8], &pairs[(depth*8)+4]); + &cont.pairs[cont.depth*8], + &cont.pairs[(cont.depth*8)+4]); // Update top and bot if(c < 4) { - top = pairTop(pairs, depth, c); - bot = pairBot(pairs, depth, c); + cont.top = pairTop(cont.pairs, cont.depth, c); + cont.bot = pairBot(cont.pairs, cont.depth, c); } } else { // This read position is not a legitimate backtracking @@ -738,8 +726,8 @@ class GreedyDFSRangeSource : public RangeSource { assert(cont.ltop.valid()); assert(cont.lbot.valid()); if(c < 4) { - top = ebwt.mapLF(cont.ltop, c); - bot = ebwt.mapLF(cont.lbot, c); + cont.top = ebwt.mapLF(cont.ltop, c); + cont.bot = ebwt.mapLF(cont.lbot, c); } } @@ -750,57 +738,57 @@ class GreedyDFSRangeSource : public RangeSource { // elims, altNum, eligibleNum, eligibleSz for(int i = 0; i < 4; i++) { if(i == c) continue; // skip the actual query character - uint32_t atop = pairTop(pairs, depth, i); - uint32_t abot = pairBot(pairs, depth, i); + uint32_t atop = pairTop(cont.pairs, cont.depth, i); + uint32_t abot = pairBot(cont.pairs, cont.depth, i); assert_leq(atop, abot); if(abot == atop) continue; // Add a new continuation on the back contMan.expand1(); GreedyDFSContinuation& back = contMan.back(); // Update revisitability boundaries - uint32_t btUnrevOff = unrevOff; - uint32_t btOneRevOff = oneRevOff; - uint32_t btTwoRevOff = twoRevOff; - uint32_t btThreeRevOff = threeRevOff; - if(depth < oneRevOff) btUnrevOff = oneRevOff; - if(depth < twoRevOff) btOneRevOff = twoRevOff; - if(depth < threeRevOff) btTwoRevOff = threeRevOff; + uint32_t btUnrevOff = cont.unrevOff; + uint32_t btOneRevOff = cont.oneRevOff; + uint32_t btTwoRevOff = cont.twoRevOff; + uint32_t btThreeRevOff = cont.threeRevOff; + if(cont.depth < cont.oneRevOff) btUnrevOff = cont.oneRevOff; + if(cont.depth < cont.twoRevOff) btOneRevOff = cont.twoRevOff; + if(cont.depth < cont.threeRevOff) btTwoRevOff = cont.threeRevOff; // TODO: Check if we can use the ftab to jump a // little further into the read // Note the character that we're backtracking on in the // mm array: - _mms[stackDepth] = cur; - _refcs[stackDepth] = i; + _mms[cont.stackDepth] = cur; + _refcs[cont.stackDepth] = i; #ifndef NDEBUG - for(int j = 0; j < (int)stackDepth; j++) { + for(int j = 0; j < (int)cont.stackDepth; j++) { assert_neq(_mms[j], cur) } #endif - _chars[depth] = i; + _chars[cont.depth] = i; // Get a new stretch of the pairs and elims arrays - uint32_t *newPairs = pairs + (_qlen*8); - uint8_t *newElims = elims + (_qlen); - back.init(stackDepth+1, // adding a mismatch - depth+1, + uint32_t *newPairs = cont.pairs + (_qlen*8); + uint8_t *newElims = cont.elims + (_qlen); + back.init(cont.stackDepth+1, // adding a mismatch + cont.depth+1, btUnrevOff, // TODO btOneRevOff, // TODO btTwoRevOff, // TODO btThreeRevOff, // TODO atop, abot, - ham + mmPenalty(_maqPenalty, q), - iham, + cont.ham + mmPenalty(_maqPenalty, q), + cont.iham, newPairs, newElims, - disableFtab); + cont.disableFtab); } } } else { // The continuation had already processed the whole read - depth--; + cont.depth--; cur = 0; } - bool empty = (top == bot); + bool empty = (cont.top == cont.bot); bool hit = (cur == 0 && !empty); // Is this a potential partial-alignment range? @@ -812,8 +800,8 @@ class GreedyDFSRangeSource : public RangeSource { backtrackDespiteMatch = true; // We don't care to report exact partial alignments, only // ones with mismatches. - if(stackDepth > 0) { - reportPartial(stackDepth); + if(cont.stackDepth > 0) { + reportPartial(cont.stackDepth); reportedPartial = true; } // Now continue on to find legitimate partial @@ -822,12 +810,12 @@ class GreedyDFSRangeSource : public RangeSource { // Check whether we've obtained an exact alignment when // we've been instructed not to report exact alignments - bool invalidExact = (hit && stackDepth == 0 && !_reportExacts); + bool invalidExact = (hit && cont.stackDepth == 0 && !_reportExacts); // Set this to true if the only way to make legal progress // is via one or more additional backtracks. if(_halfAndHalf && - !halfAndHalfCheckBounds(stackDepth, depth, _mms, empty)) + !halfAndHalfCheckBounds(cont.stackDepth, cont.depth, _mms, empty)) { // This alignment doesn't satisfy the half-and-half // requirements; reject it @@ -839,10 +827,10 @@ class GreedyDFSRangeSource : public RangeSource { !invalidExact && // not disqualified by no-exact-hits setting !reportedPartial) // not an already-reported partial alignment { - _curRange.top = top; - _curRange.bot = bot; - _curRange.stratum = calcStratum(_mms, stackDepth); - _curRange.numMms = stackDepth; + _curRange.top = cont.top; + _curRange.bot = cont.bot; + _curRange.stratum = calcStratum(_mms, cont.stackDepth); + _curRange.numMms = cont.stackDepth; _curRange.mms = _mms; _curRange.refcs = _refcs; _foundRange = true; @@ -857,8 +845,6 @@ class GreedyDFSRangeSource : public RangeSource { // Push the front continuation forward by one position assert_neq(0, cur); cont.depth++; - cont.top = top; - cont.bot = bot; } return false; } @@ -1133,20 +1119,29 @@ class GreedyDFSRangeSource : public RangeSource { pairs[2 + 4] = pairs[3 + 0] = ebwt._fchr[3]; pairs[3 + 4] = ebwt._fchr[4]; // Update top and bot - if(c < 4) { top = pairTop(pairs, d, c); bot = pairBot(pairs, d, c); } + if(c < 4) { + top = pairTop(pairs, d, c); bot = pairBot(pairs, d, c); + assert_geq(bot, top); + } } else if(curIsAlternative) { // Clear pairs memset(&pairs[d*8], 0, 8 * 4); // Calculate next quartet of ranges ebwt.mapLFEx(ltop, lbot, &pairs[d*8], &pairs[(d*8)+4]); // Update top and bot - if(c < 4) { top = pairTop(pairs, d, c); bot = pairBot(pairs, d, c); } + if(c < 4) { + top = pairTop(pairs, d, c); bot = pairBot(pairs, d, c); + assert_geq(bot, top); + } } else { // This query character is not even a legitimate // alternative (because backtracking here would blow // our mismatch quality budget), so no need to do the // bookkeeping for the entire quartet, just do c - if(c < 4) { top = ebwt.mapLF(ltop, c); bot = ebwt.mapLF(lbot, c); } + if(c < 4) { + top = ebwt.mapLF(ltop, c); bot = ebwt.mapLF(lbot, c); + assert_geq(bot, top); + } } if(top != bot) { // Calculate loci from row indices; do it now so that diff --git a/genomes/.cvsignore b/genomes/.cvsignore new file mode 100644 index 0000000..616d443 --- /dev/null +++ b/genomes/.cvsignore @@ -0,0 +1 @@ +NC_008253.bfa