From c04a125d030e1ea87daa5a4cd1cdad629e30dca1 Mon Sep 17 00:00:00 2001 From: Valentin Antonescu Date: Wed, 9 Apr 2014 18:34:54 -0400 Subject: [PATCH] Added all code coresponding to bowtie2 for LFS. --- SeqAn-1.1/seqan/index/index_sa_lss.h | 2 +- bowtie_inspect.cpp | 24 +- diff_sample.h | 107 ++-- ebwt.cpp | 17 +- ebwt.h | 937 ++++++++++++++++++----------------- ebwt_build.cpp | 1 + filebuf.h | 2 +- multikey_qsort.h | 95 ++-- random_source.h | 17 + ref_read.cpp | 16 +- ref_read.h | 56 ++- reference.h | 142 +++--- scripts/best_verify.pl | 0 scripts/bs_mapability.pl | 0 scripts/build_test.sh | 0 scripts/colorize_fasta.pl | 0 scripts/colorize_fastq.pl | 0 scripts/convert_quals.pl | 0 scripts/fastq_to_tabbed.pl | 0 scripts/gen_dnamasks2colormask.pl | 0 scripts/gen_solqual_lookup.pl | 0 scripts/infer_fraglen.pl | 0 scripts/make_c_elegans.sh | 0 scripts/make_canFam2.sh | 0 scripts/make_d_melanogaster.sh | 0 scripts/make_e_coli.sh | 0 scripts/make_hg18.sh | 0 scripts/make_hg19.sh | 0 scripts/make_mm9.sh | 0 scripts/make_rn4.sh | 0 scripts/make_s_cerevisiae.sh | 0 scripts/mapability.pl | 0 scripts/pe_verify.pl | 0 scripts/random_bowtie_tests.pl | 0 scripts/random_bowtie_tests.sh | 0 scripts/random_bowtie_tests_p.sh | 0 scripts/reconcile_alignments.pl | 0 scripts/reconcile_alignments_pe.pl | 0 shmem.h | 3 + zbox.h | 8 +- 40 files changed, 758 insertions(+), 669 deletions(-) mode change 100644 => 100755 scripts/best_verify.pl mode change 100644 => 100755 scripts/bs_mapability.pl mode change 100644 => 100755 scripts/build_test.sh mode change 100644 => 100755 scripts/colorize_fasta.pl mode change 100644 => 100755 scripts/colorize_fastq.pl mode change 100644 => 100755 scripts/convert_quals.pl mode change 100644 => 100755 scripts/fastq_to_tabbed.pl mode change 100644 => 100755 scripts/gen_dnamasks2colormask.pl mode change 100644 => 100755 scripts/gen_solqual_lookup.pl mode change 100644 => 100755 scripts/infer_fraglen.pl mode change 100644 => 100755 scripts/make_c_elegans.sh mode change 100644 => 100755 scripts/make_canFam2.sh mode change 100644 => 100755 scripts/make_d_melanogaster.sh mode change 100644 => 100755 scripts/make_e_coli.sh mode change 100644 => 100755 scripts/make_hg18.sh mode change 100644 => 100755 scripts/make_hg19.sh mode change 100644 => 100755 scripts/make_mm9.sh mode change 100644 => 100755 scripts/make_rn4.sh mode change 100644 => 100755 scripts/make_s_cerevisiae.sh mode change 100644 => 100755 scripts/mapability.pl mode change 100644 => 100755 scripts/pe_verify.pl mode change 100644 => 100755 scripts/random_bowtie_tests.pl mode change 100644 => 100755 scripts/random_bowtie_tests.sh mode change 100644 => 100755 scripts/random_bowtie_tests_p.sh mode change 100644 => 100755 scripts/reconcile_alignments.pl mode change 100644 => 100755 scripts/reconcile_alignments_pe.pl diff --git a/SeqAn-1.1/seqan/index/index_sa_lss.h b/SeqAn-1.1/seqan/index/index_sa_lss.h index df23b8c..cb9c58a 100644 --- a/SeqAn-1.1/seqan/index/index_sa_lss.h +++ b/SeqAn-1.1/seqan/index/index_sa_lss.h @@ -237,7 +237,7 @@ struct _Context_LSS b=b<" << endl - << " ebwt filename minus trailing .1.ebwt/.2.ebwt" << endl + << " ebwt filename minus trailing .1." + gEbwt_ext + "/.2." + gEbwt_ext << endl << endl << " By default, prints FASTA records of the indexed nucleotide sequences to" << endl << " standard out. With -n, just prints names. With -s, just prints a summary of" << endl @@ -222,7 +222,7 @@ void print_ref_sequences( ostream& fout, bool color, const vector& refnames, - const uint32_t* plen, + const TIndexOffU* plen, const string& adjustedEbwtFileBase) { BitPairReference ref( @@ -282,25 +282,25 @@ void print_index_sequences( TStr cat_ref; ebwt.restore(cat_ref); - uint32_t curr_ref = 0xffffffff; + TIndexOffU curr_ref = OFF_MASK; string curr_ref_seq = ""; - uint32_t curr_ref_len = 0xffffffff; + TIndexOffU curr_ref_len = OFF_MASK; uint32_t last_text_off = 0; size_t orig_len = seqan::length(cat_ref); - uint32_t tlen = 0xffffffff; + TIndexOffU tlen = OFF_MASK; bool first = true; for(size_t i = 0; i < orig_len; i++) { - uint32_t tidx = 0xffffffff; - uint32_t textoff = 0xffffffff; - tlen = 0xffffffff; + TIndexOffU tidx = OFF_MASK; + TIndexOffU textoff = OFF_MASK; + tlen = OFF_MASK; - ebwt.joinedToTextOff(1 /* qlen */, i, tidx, textoff, tlen); + ebwt.joinedToTextOff(1 /* qlen */, (TIndexOffU)i, tidx, textoff, tlen); - if (tidx != 0xffffffff && textoff < tlen) + if (tidx != OFF_MASK && textoff < tlen) { if (curr_ref != tidx) { - if (curr_ref != 0xffffffff) + if (curr_ref != OFF_MASK) { // Add trailing gaps, if any exist if(curr_ref_seq.length() < curr_ref_len) { @@ -315,7 +315,7 @@ void print_index_sequences( first = true; } - uint32_t textoff_adj = textoff; + TIndexOffU textoff_adj = textoff; if(first && textoff > 0) textoff_adj++; if (textoff_adj - last_text_off > 1) curr_ref_seq += string(textoff_adj - last_text_off - 1, 'N'); diff --git a/diff_sample.h b/diff_sample.h index 287cd69..28e25ef 100644 --- a/diff_sample.h +++ b/diff_sample.h @@ -8,6 +8,7 @@ #include "multikey_qsort.h" #include "timer.h" #include "auto_array.h" +#include "btypes.h" using namespace std; using namespace seqan; @@ -525,7 +526,7 @@ class DifferenceCoverSample { _isaPrime(), _dInv(), _log2v(myLog2(_v)), - _vmask(0xffffffff << _log2v), + _vmask(OFF_MASK << _log2v), _logger(__logger) { assert_gt(_d, 0); @@ -547,15 +548,15 @@ class DifferenceCoverSample { size_t sPrimeSz = (len / v) * length(ds); // sPrime, sPrimeOrder, _isaPrime all exist in memory at // once and that's the peak - AutoArray aa(sPrimeSz * 3 + (1024 * 1024 /*out of caution*/)); + AutoArray aa(sPrimeSz * 3 + (1024 * 1024 /*out of caution*/)); return sPrimeSz * 4; // sPrime array } uint32_t v() const { return _v; } uint32_t log2v() const { return _log2v; } uint32_t vmask() const { return _vmask; } - uint32_t modv(uint32_t i) const { return i & ~_vmask; } - uint32_t divv(uint32_t i) const { return i >> _log2v; } + uint32_t modv(TIndexOffU i) const { return (uint32_t)(i & ~_vmask); } + TIndexOffU divv(TIndexOffU i) const { return i >> _log2v; } uint32_t d() const { return _d; } bool verbose() const { return _verbose; } bool sanityCheck() const { return _sanity; } @@ -565,10 +566,10 @@ class DifferenceCoverSample { ostream& log() const { return _logger; } void build(); - uint32_t tieBreakOff(uint32_t i, uint32_t j) const; - int64_t breakTie(uint32_t i, uint32_t j) const; - bool isCovered(uint32_t i) const; - uint32_t rank(uint32_t i) const; + TIndexOffU tieBreakOff(TIndexOffU i, TIndexOffU j) const; + int64_t breakTie(TIndexOffU i, TIndexOffU j) const; + bool isCovered(TIndexOffU i) const; + TIndexOffU rank(TIndexOffU i) const; /** * Print out the suffix array such that every sample offset has its @@ -591,7 +592,7 @@ class DifferenceCoverSample { private: void doBuiltSanityCheck() const; - void buildSPrime(String& sPrime); + void buildSPrime(String& sPrime); bool built() const { return length(_isaPrime) > 0; @@ -611,11 +612,11 @@ class DifferenceCoverSample { String _ds; // samples: idx -> d String _dmap; // delta map uint32_t _d; // |D| - size of sample - String _doffs; // offsets into sPrime/isaPrime for each d idx - String _isaPrime; // ISA' array + String _doffs; // offsets into sPrime/isaPrime for each d idx + String _isaPrime; // ISA' array String _dInv; // Map from d -> idx uint32_t _log2v; - uint32_t _vmask; + TIndexOffU _vmask; ostream& _logger; }; @@ -650,16 +651,16 @@ void DifferenceCoverSample::doBuiltSanityCheck() const { uint32_t v = this->v(); assert(built()); VMSG_NL(" Doing sanity check"); - uint32_t added = 0; + TIndexOffU added = 0; String sorted; - fill(sorted, length(_isaPrime), 0xffffffff, Exact()); + fill(sorted, length(_isaPrime), OFF_MASK, Exact()); for(size_t di = 0; di < this->d(); di++) { uint32_t d = _ds[di]; size_t i = 0; for(size_t doi = _doffs[di]; doi < _doffs[di+1]; doi++, i++) { - assert_eq(0xffffffff, sorted[_isaPrime[doi]]); + assert_eq(OFF_MASK, sorted[_isaPrime[doi]]); // Maps the offset of the suffix to its rank - sorted[_isaPrime[doi]] = v*i + d; + sorted[_isaPrime[doi]] = (TIndexOffU)(v*i + d); added++; } } @@ -678,24 +679,24 @@ void DifferenceCoverSample::doBuiltSanityCheck() const { * Also builds _doffs map. */ template -void DifferenceCoverSample::buildSPrime(String& sPrime) { +void DifferenceCoverSample::buildSPrime(String& sPrime) { const TStr& t = this->text(); const String& ds = this->ds(); - uint32_t tlen = length(t); + TIndexOffU tlen = length(t); uint32_t v = this->v(); uint32_t d = this->d(); assert_gt(v, 2); assert_lt(d, v); // Record where each d section should begin in sPrime - uint32_t tlenDivV = this->divv(tlen); + TIndexOffU tlenDivV = this->divv(tlen); uint32_t tlenModV = this->modv(tlen); - uint32_t sPrimeSz = 0; + TIndexOffU sPrimeSz = 0; assert(empty(_doffs)); reserve(_doffs, d+1, Exact()); assert_eq(capacity(_doffs), d+1); for(uint32_t di = 0; di < d; di++) { // mu mapping - uint32_t sz = tlenDivV + ((ds[di] <= tlenModV) ? 1 : 0); + TIndexOffU sz = tlenDivV + ((ds[di] <= tlenModV) ? 1 : 0); assert_geq(sz, 0); appendValue(_doffs, sPrimeSz); sPrimeSz += sz; @@ -705,7 +706,7 @@ void DifferenceCoverSample::buildSPrime(String& sPrime) { if(tlenDivV > 0) { for(size_t i = 0; i < d; i++) { assert_gt(_doffs[i+1], _doffs[i]); - uint32_t diff = _doffs[i+1] - _doffs[i]; + TIndexOffU diff = _doffs[i+1] - _doffs[i]; assert(diff == tlenDivV || diff == tlenDivV+1); } } @@ -713,20 +714,20 @@ void DifferenceCoverSample::buildSPrime(String& sPrime) { assert_eq(length(_doffs), d+1); // Size sPrime appropriately reserve(sPrime, sPrimeSz+1, Exact()); // reserve extra slot for LS - fill(sPrime, sPrimeSz, 0xffffffff, Exact()); + fill(sPrime, sPrimeSz, OFF_MASK, Exact()); // Slot suffixes from text into sPrime according to the mu // mapping; where the mapping would leave a blank, insert a 0 - uint32_t added = 0; - uint32_t i = 0; - for(uint32_t ti = 0; ti <= tlen; ti += v) { + TIndexOffU added = 0; + TIndexOffU i = 0; + for(uint64_t ti = 0; ti <= tlen; ti += v) { for(uint32_t di = 0; di < d; di++) { - uint32_t tti = ti + ds[di]; + TIndexOffU tti = ti + ds[di]; if(tti > tlen) break; - uint32_t spi = _doffs[di] + i; + TIndexOffU spi = _doffs[di] + i; assert_lt(spi, _doffs[di+1]); assert_leq(tti, tlen); assert_lt(spi, sPrimeSz); - assert_eq(0xffffffff, sPrime[spi]); + assert_eq(OFF_MASK, sPrime[spi]); sPrime[spi] = tti; added++; } i++; @@ -740,11 +741,11 @@ void DifferenceCoverSample::buildSPrime(String& sPrime) { */ template static inline bool suffixSameUpTo(const TStr& host, - uint32_t suf1, - uint32_t suf2, - uint32_t v) + TIndexOffU suf1, + TIndexOffU suf2, + TIndexOffU v) { - for(uint32_t i = 0; i < v; i++) { + for(TIndexOffU i = 0; i < v; i++) { bool endSuf1 = suf1+i >= length(host); bool endSuf2 = suf2+i >= length(host); if((endSuf1 && !endSuf2) || (!endSuf1 && endSuf2)) return false; @@ -768,15 +769,15 @@ void DifferenceCoverSample::build() { uint32_t v = this->v(); assert_gt(v, 2); // Build s' - String sPrime; + String sPrime; VMSG_NL(" Building sPrime"); buildSPrime(sPrime); assert_gt(length(sPrime), 0); assert_leq(length(sPrime), length(t)+1); // +1 is because of the end-cap - uint32_t nextRank = 0; + TIndexOffU nextRank = 0; { VMSG_NL(" Building sPrimeOrder"); - String sPrimeOrder; + String sPrimeOrder; reserve(sPrimeOrder, length(sPrime)+1, Exact()); // reserve extra slot for LS resize(sPrimeOrder, length(sPrime), Exact()); for(size_t i = 0; i < length(sPrimeOrder); i++) { @@ -789,11 +790,11 @@ void DifferenceCoverSample::build() { // Extract backing-store array from sPrime and sPrimeOrder; // the mkeyQSortSuf2 routine works on the array for maximum // efficiency - uint32_t *sPrimeArr = (uint32_t*)begin(sPrime); + TIndexOffU *sPrimeArr = (uint32_t*)begin(sPrime); size_t slen = length(sPrime); assert_eq(sPrimeArr[0], sPrime[0]); assert_eq(sPrimeArr[slen-1], sPrime[slen-1]); - uint32_t *sPrimeOrderArr = (uint32_t*)begin(sPrimeOrder); + TIndexOffU *sPrimeOrderArr = (uint32_t*)begin(sPrimeOrder); assert_eq(sPrimeOrderArr[0], sPrimeOrder[0]); assert_eq(sPrimeOrderArr[slen-1], sPrimeOrder[slen-1]); // Sort sample suffixes up to the vth character using a @@ -818,7 +819,7 @@ void DifferenceCoverSample::build() { // arrays back into sPrime. VMSG_NL(" Allocating rank array"); reserve(_isaPrime, length(sPrime)+1, Exact()); - fill(_isaPrime, length(sPrime), 0xffffffff, Exact()); + fill(_isaPrime, length(sPrime), OFF_MASK, Exact()); assert_gt(length(_isaPrime), 0); { Timer timer(cout, " Ranking v-sort output time: ", this->verbose()); @@ -838,7 +839,7 @@ void DifferenceCoverSample::build() { #ifndef NDEBUG // Check that all ranks are sane for(size_t i = 0; i < length(_isaPrime); i++) { - assert_neq(_isaPrime[i], 0xffffffff); + assert_neq(_isaPrime[i], OFF_MASK); assert_lt(_isaPrime[i], length(_isaPrime)); } #endif @@ -873,7 +874,7 @@ void DifferenceCoverSample::build() { * logic elsewhere. */ template -bool DifferenceCoverSample::isCovered(uint32_t i) const { +bool DifferenceCoverSample::isCovered(TIndexOffU i) const { assert(built()); uint32_t modi = this->modv(i); assert_lt(modi, length(_dInv)); @@ -885,16 +886,16 @@ bool DifferenceCoverSample::isCovered(uint32_t i) const { * among the sample suffixes. */ template -uint32_t DifferenceCoverSample::rank(uint32_t i) const { +TIndexOffU DifferenceCoverSample::rank(TIndexOffU i) const { assert(built()); assert_lt(i, length(this->text())); uint32_t imodv = this->modv(i); assert_neq(0xffffffff, _dInv[imodv]); // must be in the sample - uint32_t ioff = this->divv(i); + TIndexOffU ioff = this->divv(i); assert_lt(ioff, _doffs[_dInv[imodv]+1] - _doffs[_dInv[imodv]]); - uint32_t isaIIdx = _doffs[_dInv[imodv]] + ioff; + TIndexOffU isaIIdx = _doffs[_dInv[imodv]] + ioff; assert_lt(isaIIdx, length(_isaPrime)); - uint32_t isaPrimeI = _isaPrime[isaIIdx]; + TIndexOffU isaPrimeI = _isaPrime[isaIIdx]; assert_leq(isaPrimeI, length(_isaPrime)); return isaPrimeI; } @@ -904,7 +905,7 @@ uint32_t DifferenceCoverSample::rank(uint32_t i) const { * if suffix j is lexicographically greater. */ template -int64_t DifferenceCoverSample::breakTie(uint32_t i, uint32_t j) const { +int64_t DifferenceCoverSample::breakTie(TIndexOffU i, TIndexOffU j) const { assert(built()); assert_neq(i, j); assert_lt(i, length(this->text())); @@ -915,20 +916,20 @@ int64_t DifferenceCoverSample::breakTie(uint32_t i, uint32_t j) const { assert_neq(0xffffffff, _dInv[jmodv]); // must be in the sample uint32_t dimodv = _dInv[imodv]; uint32_t djmodv = _dInv[jmodv]; - uint32_t ioff = this->divv(i); - uint32_t joff = this->divv(j); + TIndexOffU ioff = this->divv(i); + TIndexOffU joff = this->divv(j); assert_lt(dimodv+1, length(_doffs)); assert_lt(djmodv+1, length(_doffs)); // assert_lt: expected (32024) < (0) assert_lt(ioff, _doffs[dimodv+1] - _doffs[dimodv]); assert_lt(joff, _doffs[djmodv+1] - _doffs[djmodv]); - uint32_t isaIIdx = _doffs[dimodv] + ioff; - uint32_t isaJIdx = _doffs[djmodv] + joff; + TIndexOffU isaIIdx = _doffs[dimodv] + ioff; + TIndexOffU isaJIdx = _doffs[djmodv] + joff; assert_lt(isaIIdx, length(_isaPrime)); assert_lt(isaJIdx, length(_isaPrime)); assert_neq(isaIIdx, isaJIdx); // ranks must be unique - uint32_t isaPrimeI = _isaPrime[isaIIdx]; - uint32_t isaPrimeJ = _isaPrime[isaJIdx]; + TIndexOffU isaPrimeI = _isaPrime[isaIIdx]; + TIndexOffU isaPrimeJ = _isaPrime[isaJIdx]; assert_neq(isaPrimeI, isaPrimeJ); // ranks must be unique assert_leq(isaPrimeI, length(_isaPrime)); assert_leq(isaPrimeJ, length(_isaPrime)); @@ -940,7 +941,7 @@ int64_t DifferenceCoverSample::breakTie(uint32_t i, uint32_t j) const { * be compared before the difference cover can break the tie. */ template -uint32_t DifferenceCoverSample::tieBreakOff(uint32_t i, uint32_t j) const { +uint32_t DifferenceCoverSample::tieBreakOff(TIndexOffU i, TIndexOffU j) const { const TStr& t = this->text(); const String& dmap = this->dmap(); assert(built()); diff --git a/ebwt.cpp b/ebwt.cpp index 705c41f..33df260 100644 --- a/ebwt.cpp +++ b/ebwt.cpp @@ -13,6 +13,19 @@ using namespace std; +#ifdef BOWTIE_64BIT_INDEX + +const std::string gEbwt_ext("bt2l"); + +#else + +const std::string gEbwt_ext("bt2"); + +#endif // BOWTIE_64BIT_INDEX + +string gLastIOErrMsg; + + /** * Try to find the Bowtie index specified by the user. First try the * exact path given by the user. Then try the user-provided string @@ -27,7 +40,7 @@ string adjustEbwtBase(const string& cmdline, string str = ebwtFileBase; ifstream in; if(verbose) cout << "Trying " << str << endl; - in.open((str + ".1.ebwt").c_str(), ios_base::in | ios::binary); + in.open((str + ".1." + gEbwt_ext).c_str(), ios_base::in | ios::binary); if(!in.is_open()) { if(verbose) cout << " didn't work" << endl; in.close(); @@ -41,7 +54,7 @@ string adjustEbwtBase(const string& cmdline, } str += ebwtFileBase; if(verbose) cout << "Trying " << str << endl; - in.open((str + ".1.ebwt").c_str(), ios_base::in | ios::binary); + in.open((str + ".1." + gEbwt_ext).c_str(), ios_base::in | ios::binary); if(!in.is_open()) { if(verbose) cout << " didn't work" << endl; in.close(); diff --git a/ebwt.h b/ebwt.h index 4b43fd3..1a9b349 100644 --- a/ebwt.h +++ b/ebwt.h @@ -53,6 +53,7 @@ using namespace seqan; // From ccnt_lut.cpp, automatically generated by gen_lookup_tables.pl extern uint8_t cCntLUT_4[4][4][256]; +extern const std::string gEbwt_ext; static const uint64_t c_table[4] = { 0xffffffffffffffffllu, @@ -99,7 +100,7 @@ class EbwtParams { public: EbwtParams() { } - EbwtParams(uint32_t len, + EbwtParams(TIndexOffU len, int32_t lineRate, int32_t linesPerSide, int32_t offRate, @@ -116,7 +117,7 @@ class EbwtParams { eh._isaRate, eh._ftabChars, eh._color, eh._entireReverse); } - void init(uint32_t len, int32_t lineRate, int32_t linesPerSide, + void init(TIndexOffU len, int32_t lineRate, int32_t linesPerSide, int32_t offRate, int32_t isaRate, int32_t ftabChars, bool color, bool entireReverse) { @@ -130,16 +131,16 @@ class EbwtParams { _linesPerSide = linesPerSide; _origOffRate = offRate; _offRate = offRate; - _offMask = 0xffffffff << _offRate; + _offMask = OFF_MASK << _offRate; _isaRate = isaRate; - _isaMask = 0xffffffff << ((_isaRate >= 0) ? _isaRate : 0); + _isaMask = OFF_MASK << ((_isaRate >= 0) ? _isaRate : 0); _ftabChars = ftabChars; _eftabLen = _ftabChars*2; - _eftabSz = _eftabLen*4; + _eftabSz = _eftabLen*OFF_SIZE; _ftabLen = (1 << (_ftabChars*2))+1; _ftabSz = _ftabLen*4; _offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate; - _offsSz = _offsLen*4; + _offsSz = (uint64_t)_offsLen*OFF_SIZE; _isaLen = (_isaRate == -1)? 0 : ((_bwtLen + (1 << _isaRate) - 1) >> _isaRate); _isaSz = _isaLen*4; _lineSz = 1 << _lineRate; @@ -154,35 +155,35 @@ class EbwtParams { assert(repOk()); } - uint32_t len() const { return _len; } - uint32_t bwtLen() const { return _bwtLen; } - uint32_t sz() const { return _sz; } - uint32_t bwtSz() const { return _bwtSz; } + TIndexOffU len() const { return _len; } + TIndexOffU bwtLen() const { return _bwtLen; } + TIndexOffU sz() const { return _sz; } + TIndexOffU bwtSz() const { return _bwtSz; } int32_t lineRate() const { return _lineRate; } int32_t linesPerSide() const { return _linesPerSide; } int32_t origOffRate() const { return _origOffRate; } int32_t offRate() const { return _offRate; } - uint32_t offMask() const { return _offMask; } + TIndexOffU offMask() const { return _offMask; } int32_t isaRate() const { return _isaRate; } uint32_t isaMask() const { return _isaMask; } int32_t ftabChars() const { return _ftabChars; } uint32_t eftabLen() const { return _eftabLen; } uint32_t eftabSz() const { return _eftabSz; } - uint32_t ftabLen() const { return _ftabLen; } - uint32_t ftabSz() const { return _ftabSz; } - uint32_t offsLen() const { return _offsLen; } - uint32_t offsSz() const { return _offsSz; } - uint32_t isaLen() const { return _isaLen; } - uint32_t isaSz() const { return _isaSz; } + TIndexOffU ftabLen() const { return _ftabLen; } + TIndexOffU ftabSz() const { return _ftabSz; } + TIndexOffU offsLen() const { return _offsLen; } + uint64_t offsSz() const { return _offsSz; } + TIndexOffU isaLen() const { return _isaLen; } + uint64_t isaSz() const { return _isaSz; } uint32_t lineSz() const { return _lineSz; } uint32_t sideSz() const { return _sideSz; } uint32_t sideBwtSz() const { return _sideBwtSz; } uint32_t sideBwtLen() const { return _sideBwtLen; } - uint32_t numSidePairs() const { return _numSidePairs; } - uint32_t numSides() const { return _numSides; } - uint32_t numLines() const { return _numLines; } - uint32_t ebwtTotLen() const { return _ebwtTotLen; } - uint32_t ebwtTotSz() const { return _ebwtTotSz; } + uint32_t numSidePairs() const { return _numSidePairs; } /* check */ + TIndexOffU numSides() const { return _numSides; } + TIndexOffU numLines() const { return _numLines; } + TIndexOffU ebwtTotLen() const { return _ebwtTotLen; } + TIndexOffU ebwtTotSz() const { return _ebwtTotSz; } bool color() const { return _color; } bool entireReverse() const { return _entireReverse; } @@ -192,9 +193,9 @@ class EbwtParams { */ void setOffRate(int __offRate) { _offRate = __offRate; - _offMask = 0xffffffff << _offRate; + _offMask = OFF_MASK << _offRate; _offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate; - _offsSz = _offsLen*4; + _offsSz = (uint64_t) _offsLen*OFF_SIZE; } /** @@ -203,9 +204,9 @@ class EbwtParams { */ void setIsaRate(int __isaRate) { _isaRate = __isaRate; - _isaMask = 0xffffffff << _isaRate; + _isaMask = OFF_MASK << _isaRate; _isaLen = (_bwtLen + (1 << _isaRate) - 1) >> _isaRate; - _isaSz = _isaLen*4; + _isaSz = (uint64_t)_isaLen*OFF_SIZE; } /// Check that this EbwtParams is internally consistent @@ -258,35 +259,35 @@ class EbwtParams { << " reverse: " << _entireReverse << endl; } - uint32_t _len; - uint32_t _bwtLen; - uint32_t _sz; - uint32_t _bwtSz; + TIndexOffU _len; + TIndexOffU _bwtLen; + TIndexOffU _sz; + TIndexOffU _bwtSz; int32_t _lineRate; int32_t _linesPerSide; int32_t _origOffRate; int32_t _offRate; - uint32_t _offMask; + TIndexOffU _offMask; int32_t _isaRate; uint32_t _isaMask; int32_t _ftabChars; uint32_t _eftabLen; uint32_t _eftabSz; - uint32_t _ftabLen; - uint32_t _ftabSz; - uint32_t _offsLen; - uint32_t _offsSz; - uint32_t _isaLen; - uint32_t _isaSz; + TIndexOffU _ftabLen; + TIndexOffU _ftabSz; + TIndexOffU _offsLen; + uint64_t _offsSz; + TIndexOffU _isaLen; + uint64_t _isaSz; uint32_t _lineSz; uint32_t _sideSz; uint32_t _sideBwtSz; uint32_t _sideBwtLen; uint32_t _numSidePairs; - uint32_t _numSides; - uint32_t _numLines; - uint32_t _ebwtTotLen; - uint32_t _ebwtTotSz; + TIndexOffU _numSides; + TIndexOffU _numLines; + TIndexOffU _ebwtTotLen; + TIndexOffU _ebwtTotSz; bool _color; bool _entireReverse; }; @@ -341,8 +342,8 @@ class Ebwt { _fw(__fw), \ _in1(MM_FILE_INIT), \ _in2(MM_FILE_INIT), \ - _zOff(0xffffffff), \ - _zEbwtByteOff(0xffffffff), \ + _zOff(OFF_MASK), \ + _zEbwtByteOff(OFF_MASK), \ _zEbwtBpOff(-1), \ _nPat(0), \ _nFrag(0), \ @@ -399,8 +400,8 @@ class Ebwt { rmap_ = rmap; _useMm = useMm; useShmem_ = useShmem; - _in1Str = in + ".1.ebwt"; - _in2Str = in + ".2.ebwt"; + _in1Str = in + ".1." + gEbwt_ext; + _in2Str = in + ".2." + gEbwt_ext; readIntoMemory( color, // expect colorspace reference? __fw ? -1 : needEntireReverse, // need REF_READ_REVERSE @@ -436,14 +437,14 @@ class Ebwt { const string& file, // base filename for EBWT files bool __fw, bool useBlockwise, - uint32_t bmax, - uint32_t bmaxSqrtMult, - uint32_t bmaxDivN, + TIndexOffU bmax, + TIndexOffU bmaxSqrtMult, + TIndexOffU bmaxDivN, int dcv, vector& is, vector& szs, vector& plens, - uint32_t sztot, + TIndexOffU sztot, const RefReadInParams& refparams, uint32_t seed, int32_t __overrideOffRate = -1, @@ -466,8 +467,8 @@ class Ebwt { ProcessorSupport ps; _usePOPCNTinstruction = ps.POPCNTenabled(); #endif - _in1Str = file + ".1.ebwt"; - _in2Str = file + ".2.ebwt"; + _in1Str = file + ".1." + gEbwt_ext; + _in2Str = file + ".2." + gEbwt_ext; // Open output files ofstream fout1(_in1Str.c_str(), ios::binary); if(!fout1.good()) { @@ -548,9 +549,9 @@ class Ebwt { * Write the rstarts array given the szs array for the reference. */ void szsToDisk(const vector& szs, ostream& os, int reverse) { - size_t seq = 0; - uint32_t off = 0; - uint32_t totlen = 0; + TIndexOffU seq = 0; + TIndexOffU off = 0; + TIndexOffU totlen = 0; for(unsigned int i = 0; i < szs.size(); i++) { if(szs[i].len == 0) continue; if(szs[i].first) off = 0; @@ -560,9 +561,9 @@ class Ebwt { #else if(szs[i].first) seq++; #endif - size_t seqm1 = seq-1; + TIndexOffU seqm1 = seq-1; assert_lt(seqm1, _nPat); - size_t fwoff = off; + TIndexOffU fwoff = off; if(reverse == REF_READ_REVERSE) { // Invert pattern idxs seqm1 = _nPat - seqm1 - 1; @@ -570,9 +571,9 @@ class Ebwt { assert_leq(off + szs[i].len, _plen[seqm1]); fwoff = _plen[seqm1] - (off + szs[i].len); } - writeU32(os, totlen, this->toBe()); // offset from beginning of joined string - writeU32(os, seqm1, this->toBe()); // sequence id - writeU32(os, fwoff, this->toBe()); // offset into sequence + writeU(os, totlen, this->toBe()); // offset from beginning of joined string + writeU(os, seqm1, this->toBe()); // sequence id + writeU(os, fwoff, this->toBe()); // offset into sequence totlen += szs[i].len; off += szs[i].len; } @@ -592,21 +593,21 @@ class Ebwt { vector& is, vector& szs, vector& plens, - uint32_t sztot, + TIndexOffU sztot, const RefReadInParams& refparams, ofstream& out1, ofstream& out2, bool useBlockwise, - uint32_t bmax, - uint32_t bmaxSqrtMult, - uint32_t bmaxDivN, + TIndexOffU bmax, + TIndexOffU bmaxSqrtMult, + TIndexOffU bmaxDivN, int dcv, uint32_t seed) { // Compose text strings into single string VMSG_NL("Calculating joined length"); TStr s; // holds the entire joined reference after call to joinToDisk - uint32_t jlen; + TIndexOffU jlen; jlen = joinedLen(szs); assert_geq(jlen, sztot); VMSG_NL("Writing header"); @@ -663,19 +664,19 @@ class Ebwt { } // Succesfully obtained joined reference string assert_geq(length(s), jlen); - if(bmax != 0xffffffff) { + if(bmax != OFF_MASK) { VMSG_NL("bmax according to bmax setting: " << bmax); } - else if(bmaxSqrtMult != 0xffffffff) { + else if(bmaxSqrtMult != OFF_MASK) { bmax *= bmaxSqrtMult; VMSG_NL("bmax according to bmaxSqrtMult setting: " << bmax); } - else if(bmaxDivN != 0xffffffff) { - bmax = max(jlen / bmaxDivN, 1); + else if(bmaxDivN != OFF_MASK) { + bmax = max(jlen / bmaxDivN, 1); VMSG_NL("bmax according to bmaxDivN setting: " << bmax); } else { - bmax = (uint32_t)sqrt(length(s)); + bmax = (TIndexOffU)sqrt(length(s)); VMSG_NL("bmax defaulted to: " << bmax); } int iter = 0; @@ -718,15 +719,15 @@ class Ebwt { // we would have thrown one eventually as part of // constructing the DifferenceCoverSample dcv <<= 1; - size_t sz = DifferenceCoverSample::simulateAllocs(s, dcv >> 1); + TIndexOffU sz = (TIndexOffU)DifferenceCoverSample::simulateAllocs(s, dcv >> 1); AutoArray tmp(sz); dcv >>= 1; // Likewise with the KarkkainenBlockwiseSA - sz = KarkkainenBlockwiseSA::simulateAllocs(s, bmax); + sz = (TIndexOffU)KarkkainenBlockwiseSA::simulateAllocs(s, bmax); AutoArray tmp2(sz); // Now throw in the 'ftab' and 'isaSample' structures // that we'll eventually allocate in buildToDisk - AutoArray ftab(_eh._ftabLen * 2); + AutoArray ftab(_eh._ftabLen * 2); AutoArray side(_eh._sideSz); // Grab another 20 MB out of caution AutoArray extra(20*1024*1024); @@ -768,7 +769,7 @@ class Ebwt { #else assert_eq(this->_refnames.size(), this->_nPat); #endif - for(size_t i = 0; i < this->_refnames.size(); i++) { + for(TIndexOffU i = 0; i < this->_refnames.size(); i++) { out1 << this->_refnames[i] << endl; } out1 << '\0'; @@ -786,8 +787,8 @@ class Ebwt { * fragments correspond to input sequences - it just cares about * the lengths of the fragments. */ - uint32_t joinedLen(vector& szs) { - uint32_t ret = 0; + TIndexOffU joinedLen(vector& szs) { + TIndexOffU ret = 0; for(unsigned int i = 0; i < szs.size(); i++) { ret += szs[i].len; } @@ -830,18 +831,18 @@ class Ebwt { /// Accessors const EbwtParams& eh() const { return _eh; } - uint32_t zOff() const { return _zOff; } - uint32_t zEbwtByteOff() const { return _zEbwtByteOff; } - int zEbwtBpOff() const { return _zEbwtBpOff; } - uint32_t nPat() const { return _nPat; } - uint32_t nFrag() const { return _nFrag; } - uint32_t* fchr() const { return _fchr; } - uint32_t* ftab() const { return _ftab; } - uint32_t* eftab() const { return _eftab; } - uint32_t* offs() const { return _offs; } - uint32_t* isa() const { return _isa; } - uint32_t* plen() const { return _plen; } - uint32_t* rstarts() const { return _rstarts; } + TIndexOffU zOff() const { return _zOff; } + TIndexOffU zEbwtByteOff() const { return _zEbwtByteOff; } + TIndexOffU zEbwtBpOff() const { return _zEbwtBpOff; } + TIndexOffU nPat() const { return _nPat; } + TIndexOffU nFrag() const { return _nFrag; } + TIndexOffU* fchr() const { return _fchr; } + TIndexOffU* ftab() const { return _ftab; } + TIndexOffU* eftab() const { return _eftab; } + TIndexOffU* offs() const { return _offs; } + uint32_t* isa() const { return _isa; } /* check */ + TIndexOffU* plen() const { return _plen; } + TIndexOffU* rstarts() const { return _rstarts; } uint8_t* ebwt() const { return _ebwt; } const ReferenceMap* rmap() const { return rmap_; } bool toBe() const { return _toBigEndian; } @@ -863,7 +864,7 @@ class Ebwt { assert(_offs != NULL); assert(_isa != NULL); assert(_rstarts != NULL); - assert_neq(_zEbwtByteOff, 0xffffffff); + assert_neq(_zEbwtByteOff, OFF_MASK); assert_neq(_zEbwtBpOff, -1); return true; } else { @@ -872,7 +873,7 @@ class Ebwt { assert(_fchr == NULL); assert(_offs == NULL); assert(_rstarts == NULL); - assert_eq(_zEbwtByteOff, 0xffffffff); + assert_eq(_zEbwtByteOff, OFF_MASK); assert_eq(_zEbwtBpOff, -1); return false; } @@ -930,14 +931,14 @@ class Ebwt { //_plen = NULL; _rstarts = NULL; _ebwt = NULL; - _zEbwtByteOff = 0xffffffff; + _zEbwtByteOff = OFF_MASK; _zEbwtBpOff = -1; } /** * Non-static facade for static function ftabHi. */ - uint32_t ftabHi(uint32_t i) const { + TIndexOffU ftabHi(TIndexOffU i) const { return Ebwt::ftabHi(_ftab, _eftab, _eh._len, _eh._ftabLen, _eh._eftabLen, i); } @@ -951,18 +952,18 @@ class Ebwt { * It's a static member because it's convenient to ask this * question before the Ebwt is fully initialized. */ - static uint32_t ftabHi(uint32_t *ftab, - uint32_t *eftab, - uint32_t len, - uint32_t ftabLen, - uint32_t eftabLen, - uint32_t i) + static TIndexOffU ftabHi(TIndexOffU *ftab, + TIndexOffU *eftab, + TIndexOffU len, + TIndexOffU ftabLen, + TIndexOffU eftabLen, + TIndexOffU i) { assert_lt(i, ftabLen); if(ftab[i] <= len) { return ftab[i]; } else { - uint32_t efIdx = ftab[i] ^ 0xffffffff; + TIndexOffU efIdx = ftab[i] ^ OFF_MASK; assert_lt(efIdx*2+1, eftabLen); return eftab[efIdx*2+1]; } @@ -971,7 +972,7 @@ class Ebwt { /** * Non-static facade for static function ftabLo. */ - uint32_t ftabLo(uint32_t i) const { + TIndexOffU ftabLo(TIndexOffU i) const { return Ebwt::ftabLo(_ftab, _eftab, _eh._len, _eh._ftabLen, _eh._eftabLen, i); } @@ -985,18 +986,18 @@ class Ebwt { * It's a static member because it's convenient to ask this * question before the Ebwt is fully initialized. */ - static uint32_t ftabLo(uint32_t *ftab, - uint32_t *eftab, - uint32_t len, - uint32_t ftabLen, - uint32_t eftabLen, - uint32_t i) + static TIndexOffU ftabLo(TIndexOffU *ftab, + TIndexOffU *eftab, + TIndexOffU len, + TIndexOffU ftabLen, + TIndexOffU eftabLen, + TIndexOffU i) { assert_lt(i, ftabLen); if(ftab[i] <= len) { return ftab[i]; } else { - uint32_t efIdx = ftab[i] ^ 0xffffffff; + TIndexOffU efIdx = ftab[i] ^ OFF_MASK; assert_lt(efIdx*2+1, eftabLen); return eftab[efIdx*2]; } @@ -1010,9 +1011,9 @@ class Ebwt { * _zEbwtBpOff from _zOff. */ void postReadInit(EbwtParams& eh) { - uint32_t sideNum = _zOff / eh._sideBwtLen; - uint32_t sideCharOff = _zOff % eh._sideBwtLen; - uint32_t sideByteOff = sideNum * eh._sideSz; + TIndexOffU sideNum = _zOff / eh._sideBwtLen; + TIndexOffU sideCharOff = _zOff % eh._sideBwtLen; + TIndexOffU sideByteOff = sideNum * eh._sideSz; _zEbwtByteOff = sideCharOff >> 2; assert_lt(_zEbwtByteOff, eh._sideBwtSz); _zEbwtBpOff = sideCharOff & 3; @@ -1091,8 +1092,8 @@ class Ebwt { // Building static TStr join(vector& l, uint32_t seed); - static TStr join(vector& l, vector& szs, uint32_t sztot, const RefReadInParams& refparams, uint32_t seed); - void joinToDisk(vector& l, vector& szs, vector& plens, uint32_t sztot, const RefReadInParams& refparams, TStr& ret, ostream& out1, ostream& out2, uint32_t seed = 0); + static TStr join(vector& l, vector& szs, TIndexOffU sztot, const RefReadInParams& refparams, uint32_t seed); + void joinToDisk(vector& l, vector& szs, vector& plens, TIndexOffU sztot, const RefReadInParams& refparams, TStr& ret, ostream& out1, ostream& out2, uint32_t seed = 0); void buildToDisk(InorderBlockwiseSA& sa, const TStr& s, ostream& out1, ostream& out2); // I/O @@ -1103,29 +1104,29 @@ class Ebwt { // Sanity checking void printRangeFw(uint32_t begin, uint32_t end) const; void printRangeBw(uint32_t begin, uint32_t end) const; - void sanityCheckUpToSide(int upToSide) const; + void sanityCheckUpToSide(TIndexOff upToSide) const; void sanityCheckAll(int reverse) const; void restore(TStr& s) const; void checkOrigs(const vector >& os, bool color, bool mirror) const; // Searching and reporting - void joinedToTextOff(uint32_t qlen, uint32_t off, uint32_t& tidx, uint32_t& textoff, uint32_t& tlen) const; + void joinedToTextOff(TIndexOffU qlen, TIndexOffU off, TIndexOffU& tidx, TIndexOffU& textoff, TIndexOffU& tlen) const; inline bool report(const String& query, String* quals, String* name, bool color, char primer, char trimc, bool colExEnds, int snpPhred, const BitPairReference* ref, const std::vector& mmui32, const std::vector& refcs, size_t numMms, uint32_t off, uint32_t top, uint32_t bot, uint32_t qlen, int stratum, uint16_t cost, uint32_t patid, uint32_t seed, const EbwtSearchParams& params) const; inline bool reportChaseOne(const String& query, String* quals, String* name, bool color, char primer, char trimc, bool colExEnds, int snpPhred, const BitPairReference* ref, const std::vector& mmui32, const std::vector& refcs, size_t numMms, uint32_t i, uint32_t top, uint32_t bot, uint32_t qlen, int stratum, uint16_t cost, uint32_t patid, uint32_t seed, const EbwtSearchParams& params, SideLocus *l = NULL) const; inline bool reportReconstruct(const String& query, String* quals, String* name, String& lbuf, String& rbuf, const uint32_t *mmui32, const char* refcs, size_t numMms, uint32_t i, uint32_t top, uint32_t bot, uint32_t qlen, int stratum, const EbwtSearchParams& params, SideLocus *l = NULL) const; inline int rowL(const SideLocus& l) const; - inline uint32_t countUpTo(const SideLocus& l, int c) const; - inline void countUpToEx(const SideLocus& l, uint32_t* pairs) const; + inline TIndexOffU countUpTo(const SideLocus& l, int c) const; + inline void countUpToEx(const SideLocus& l, TIndexOffU* pairs) const; inline uint32_t countFwSide(const SideLocus& l, int c) const; inline void countFwSideEx(const SideLocus& l, uint32_t *pairs) const; inline uint32_t countBwSide(const SideLocus& l, int c) const; inline void countBwSideEx(const SideLocus& l, uint32_t *pairs) const; - inline uint32_t mapLF(const SideLocus& l ASSERT_ONLY(, bool overrideSanity = false)) const; - inline void mapLFEx(const SideLocus& l, uint32_t *pairs ASSERT_ONLY(, bool overrideSanity = false)) const; - inline void mapLFEx(const SideLocus& ltop, const SideLocus& lbot, uint32_t *tops, uint32_t *bots ASSERT_ONLY(, bool overrideSanity = false)) const; - inline uint32_t mapLF(const SideLocus& l, int c ASSERT_ONLY(, bool overrideSanity = false)) const; - inline uint32_t mapLF1(uint32_t row, const SideLocus& l, int c ASSERT_ONLY(, bool overrideSanity = false)) const; - inline int mapLF1(uint32_t& row, const SideLocus& l ASSERT_ONLY(, bool overrideSanity = false)) const; + inline TIndexOffU mapLF(const SideLocus& l ASSERT_ONLY(, bool overrideSanity = false)) const; + inline void mapLFEx(const SideLocus& l, TIndexOffU *pairs ASSERT_ONLY(, bool overrideSanity = false)) const; + inline void mapLFEx(const SideLocus& ltop, const SideLocus& lbot, TIndexOffU *tops, TIndexOffU *bots ASSERT_ONLY(, bool overrideSanity = false)) const; + inline TIndexOffU mapLF(const SideLocus& l, int c ASSERT_ONLY(, bool overrideSanity = false)) const; + inline TIndexOffU mapLF1(TIndexOffU row, const SideLocus& l, int c ASSERT_ONLY(, bool overrideSanity = false)) const; + inline int mapLF1(TIndexOffU& row, const SideLocus& l ASSERT_ONLY(, bool overrideSanity = false)) const; /// Check that in-memory Ebwt is internally consistent with respect /// to given EbwtParams; assert if not bool inMemoryRepOk(const EbwtParams& eh) const { @@ -1171,25 +1172,25 @@ class Ebwt { MM_FILE _in2; // input fd for secondary index file string _in1Str; // filename for primary index file string _in2Str; // filename for secondary index file - uint32_t _zOff; - uint32_t _zEbwtByteOff; - int _zEbwtBpOff; - uint32_t _nPat; /// number of reference texts - uint32_t _nFrag; /// number of fragments - uint32_t* _plen; - uint32_t* _rstarts; // starting offset of fragments / text indexes + TIndexOffU _zOff; + TIndexOffU _zEbwtByteOff; + TIndexOffU _zEbwtBpOff; + TIndexOffU _nPat; /// number of reference texts + TIndexOffU _nFrag; /// number of fragments + TIndexOffU* _plen; + TIndexOffU* _rstarts; // starting offset of fragments / text indexes // _fchr, _ftab and _eftab are expected to be relatively small // (usually < 1MB, perhaps a few MB if _fchr is particularly large // - like, say, 11). For this reason, we don't bother with writing // them to disk through separate output streams; we - uint32_t* _fchr; - uint32_t* _ftab; - uint32_t* _eftab; // "extended" entries for _ftab + TIndexOffU* _fchr; + TIndexOffU* _ftab; + TIndexOffU* _eftab; // "extended" entries for _ftab // _offs may be extremely large. E.g. for DNA w/ offRate=4 (one // offset every 16 rows), the total size of _offs is the same as // the total size of the input sequence - uint32_t* _offs; - uint32_t* _isa; + TIndexOffU* _offs; + TIndexOffU* _isa; // _ebwt is the Extended Burrows-Wheeler Transform itself, and thus // is at least as large as the input sequence. uint8_t* _ebwt; @@ -1201,7 +1202,13 @@ class Ebwt { char *mmFile2_; EbwtParams _eh; -#ifdef EBWT_STATS +#ifdef BOWTIE_64BIT_INDEX + static const int default_lineRate = 7; +#else + static const int default_lineRate = 6; +#endif + + #ifdef EBWT_STATS uint64_t mapLFExs_; uint64_t mapLFs_; uint64_t mapLFcs_; @@ -1536,7 +1543,7 @@ struct SideLocus { /** * Construct from row and other relevant information about the Ebwt. */ - SideLocus(uint32_t row, const EbwtParams& ep, const uint8_t* ebwt) { + SideLocus(TIndexOffU row, const EbwtParams& ep, const uint8_t* ebwt) { initFromRow(row, ep, ebwt); } @@ -1544,20 +1551,20 @@ struct SideLocus { * Init two SideLocus objects from a top/bot pair, using the result * from one call to initFromRow to possibly avoid a second call. */ - static void initFromTopBot(uint32_t top, - uint32_t bot, + static void initFromTopBot(TIndexOffU top, + TIndexOffU bot, const EbwtParams& ep, const uint8_t* ebwt, SideLocus& ltop, SideLocus& lbot) { - const uint32_t sideBwtLen = ep._sideBwtLen; + const TIndexOffU sideBwtLen = ep._sideBwtLen; const uint32_t sideBwtSz = ep._sideBwtSz; assert_gt(bot, top); ltop.initFromRow(top, ep, ebwt); - uint32_t spread = bot - top; + TIndexOffU spread = bot - top; if(ltop._charOff + spread < sideBwtLen) { - lbot._charOff = ltop._charOff + spread; + lbot._charOff = (uint32_t)(ltop._charOff + spread); lbot._sideNum = ltop._sideNum; lbot._sideByteOff = ltop._sideByteOff; lbot._fw = ltop._fw; @@ -1575,12 +1582,12 @@ struct SideLocus { * Calculate SideLocus based on a row and other relevant * information about the shape of the Ebwt. */ - void initFromRow(uint32_t row, const EbwtParams& ep, const uint8_t* ebwt) { + void initFromRow(TIndexOffU row, const EbwtParams& ep, const uint8_t* ebwt) { const uint32_t sideSz = ep._sideSz; // Side length is hard-coded for now; this allows the compiler // to do clever things to accelerate / and %. - _sideNum = row / 224; - _charOff = row % 224; + _sideNum = row / (48*OFF_SIZE); + _charOff = row % (48*OFF_SIZE); _sideByteOff = _sideNum * sideSz; assert_leq(row, ep._len); assert_leq(_sideByteOff + sideSz, ep._ebwtTotSz); @@ -1589,7 +1596,7 @@ struct SideLocus { 0 /* prepare for read */, PREFETCH_LOCALITY); #endif - // prefetch tjside too + // prefetch this side too _fw = (_sideNum & 1) != 0; // odd-numbered sides are forward _by = _charOff >> 2; // byte within side assert_lt(_by, (int)ep._sideBwtSz); @@ -1618,8 +1625,8 @@ struct SideLocus { return ebwt + _sideByteOff + (_fw? (-128) : (128)); } - uint32_t _sideByteOff; // offset of top side within ebwt[] - uint32_t _sideNum; // index of side + TIndexOffU _sideByteOff; // offset of top side within ebwt[] + TIndexOffU _sideNum; // index of side uint16_t _charOff; // character offset within side bool _fw; // side is forward or backward? int16_t _by; // byte within side (not adjusted for bw sides) @@ -1662,7 +1669,7 @@ void Ebwt::printRangeFw(uint32_t begin, uint32_t end) const { * of a backward side, print the characters at those positions along * with a summary occ[] array. */ -template +template /* check - update */ void Ebwt::printRangeBw(uint32_t begin, uint32_t end) const { assert(isInMemory()); uint32_t occ[] = {0, 0, 0, 0}; @@ -1687,14 +1694,14 @@ void Ebwt::printRangeBw(uint32_t begin, uint32_t end) const { * comparing against the embedded occ[] arrays. */ template -void Ebwt::sanityCheckUpToSide(int upToSide) const { +void Ebwt::sanityCheckUpToSide(TIndexOff upToSide) const { assert(isInMemory()); - uint32_t occ[] = {0, 0, 0, 0}; - ASSERT_ONLY(uint32_t occ_save[] = {0, 0}); - uint32_t cur = 0; // byte pointer + TIndexOffU occ[] = {0, 0, 0, 0}; + ASSERT_ONLY(TIndexOffU occ_save[] = {0, 0}); + TIndexOffU cur = 0; // byte pointer const EbwtParams& eh = this->_eh; bool fw = false; - while(cur < (upToSide * eh._sideSz)) { + while(cur < (TIndexOffU)(upToSide * eh._sideSz)) { assert_leq(cur + eh._sideSz, eh._ebwtTotLen); for(uint32_t i = 0; i < eh._sideBwtSz; i++) { uint8_t by = this->_ebwt[cur + (fw ? i : eh._sideBwtSz-i-1)]; @@ -1710,18 +1717,18 @@ void Ebwt::sanityCheckUpToSide(int upToSide) const { if(fw) { // Finished forward bucket; check saved [G] and [T] // against the two uint32_ts encoded here - ASSERT_ONLY(uint32_t *u32ebwt = reinterpret_cast(&this->_ebwt[cur + eh._sideBwtSz])); - ASSERT_ONLY(uint32_t gs = u32ebwt[0]); - ASSERT_ONLY(uint32_t ts = u32ebwt[1]); + ASSERT_ONLY(TIndexOffU *u32ebwt = reinterpret_cast(&this->_ebwt[cur + eh._sideBwtSz])); + ASSERT_ONLY(TIndexOffU gs = u32ebwt[0]); + ASSERT_ONLY(TIndexOffU ts = u32ebwt[1]); assert_eq(gs, occ_save[0]); assert_eq(ts, occ_save[1]); fw = false; } else { // Finished backward bucket; check current [A] and [C] // against the two uint32_ts encoded here - ASSERT_ONLY(uint32_t *u32ebwt = reinterpret_cast(&this->_ebwt[cur + eh._sideBwtSz])); - ASSERT_ONLY(uint32_t as = u32ebwt[0]); - ASSERT_ONLY(uint32_t cs = u32ebwt[1]); + ASSERT_ONLY(TIndexOffU *u32ebwt = reinterpret_cast(&this->_ebwt[cur + eh._sideBwtSz])); + ASSERT_ONLY(TIndexOffU as = u32ebwt[0]); + ASSERT_ONLY(TIndexOffU cs = u32ebwt[1]); assert(as == occ[0] || as == occ[0]-1); // one 'a' is a skipped '$' and doesn't count toward occ[] assert_eq(cs, occ[1]); ASSERT_ONLY(occ_save[0] = occ[2]); // save gs @@ -1740,7 +1747,7 @@ void Ebwt::sanityCheckAll(int reverse) const { const EbwtParams& eh = this->_eh; assert(isInMemory()); // Check ftab - for(uint32_t i = 1; i < eh._ftabLen; i++) { + for(TIndexOffU i = 1; i < eh._ftabLen; i++) { assert_geq(this->ftabHi(i), this->ftabLo(i-1)); assert_geq(this->ftabLo(i), this->ftabHi(i-1)); assert_leq(this->ftabHi(i), eh._bwtLen+1); @@ -1748,20 +1755,20 @@ void Ebwt::sanityCheckAll(int reverse) const { assert_eq(this->ftabHi(eh._ftabLen-1), eh._bwtLen); // Check offs - int seenLen = (eh._bwtLen + 31) >> 5; - uint32_t *seen; + TIndexOff seenLen = (eh._bwtLen + 31) >> ((TIndexOffU)5); + TIndexOff *seen; try { - seen = new uint32_t[seenLen]; // bitvector marking seen offsets + seen = new TIndexOff[seenLen]; // bitvector marking seen offsets } catch(bad_alloc& e) { cerr << "Out of memory allocating seen[] at " << __FILE__ << ":" << __LINE__ << endl; throw e; } - memset(seen, 0, 4 * seenLen); - uint32_t offsLen = eh._offsLen; - for(uint32_t i = 0; i < offsLen; i++) { + memset(seen, 0, OFF_SIZE * seenLen); + TIndexOffU offsLen = eh._offsLen; + for(TIndexOffU i = 0; i < offsLen; i++) { assert_lt(this->_offs[i], eh._bwtLen); - int w = this->_offs[i] >> 5; - int r = this->_offs[i] & 31; + TIndexOff w = this->_offs[i] >> 5; + TIndexOff r = this->_offs[i] & 31; assert_eq(0, (seen[w] >> r) & 1); // shouldn't have been seen before seen[w] |= (1 << r); } @@ -1771,12 +1778,12 @@ void Ebwt::sanityCheckAll(int reverse) const { assert_gt(this->_nPat, 0); // Check plen, flen - for(uint32_t i = 0; i < this->_nPat; i++) { + for(TIndexOffU i = 0; i < this->_nPat; i++) { assert_geq(this->_plen[i], 0); } // Check rstarts - for(uint32_t i = 0; i < this->_nFrag-1; i++) { + for(TIndexOffU i = 0; i < this->_nFrag-1; i++) { assert_gt(this->_rstarts[(i+1)*3], this->_rstarts[i*3]); if(reverse == REF_READ_REVERSE) { assert(this->_rstarts[(i*3)+1] >= this->_rstarts[((i+1)*3)+1]); @@ -2004,11 +2011,11 @@ inline static void countInU64Ex(uint64_t dw, uint32_t* arrs) { * Function gets 11.09% in profile */ template -inline uint32_t Ebwt::countUpTo(const SideLocus& l, int c) const { +inline TIndexOffU Ebwt::countUpTo(const SideLocus& l, int c) const { // Count occurrences of c in each 64-bit (using bit trickery); // Someday countInU64() and pop() functions should be // vectorized/SSE-ized in case that helps. - uint32_t cCnt = 0; + TIndexOffU cCnt = 0; const uint8_t *side = l.side(this->_ebwt); int i = 0; #if 1 @@ -2071,7 +2078,7 @@ inline uint32_t Ebwt::countUpTo(const SideLocus& l, int c) const { * side up to (but not including) the given byte/bitpair (by/bp). */ template -inline void Ebwt::countUpToEx(const SideLocus& l, uint32_t* arrs) const { +inline void Ebwt::countUpToEx(const SideLocus& l, TIndexOffU* arrs) const { int i = 0; // Count occurrences of c in each 64-bit (using bit trickery); // note: this seems does not seem to lend a significant boost to @@ -2143,7 +2150,7 @@ inline void Ebwt::countUpToEx(const SideLocus& l, uint32_t* arrs) const { * break just prior to the side. */ template -inline uint32_t Ebwt::countFwSide(const SideLocus& l, int c) const { +inline uint32_t Ebwt::countFwSide(const SideLocus& l, int c) const { /* check */ assert_lt(c, 4); assert_geq(c, 0); assert_lt(l._by, (int)this->_eh._sideBwtSz); @@ -2347,8 +2354,8 @@ inline void Ebwt::countBwSideEx(const SideLocus& l, uint32_t* arrs) const template inline void Ebwt::mapLFEx(const SideLocus& ltop, const SideLocus& lbot, - uint32_t *tops, - uint32_t *bots + TIndexOffU *tops, + TIndexOffU *bots ASSERT_ONLY(, bool overrideSanity) ) const { @@ -2389,7 +2396,7 @@ inline void Ebwt::mapLFEx(const SideLocus& ltop, */ template inline void Ebwt::mapLFEx(const SideLocus& l, - uint32_t *arrs + TIndexOffU *arrs ASSERT_ONLY(, bool overrideSanity) ) const { @@ -2417,14 +2424,14 @@ inline void Ebwt::mapLFEx(const SideLocus& l, * Given row i, return the row that the LF mapping maps i to. */ template -inline uint32_t Ebwt::mapLF(const SideLocus& l +inline TIndexOffU Ebwt::mapLF(const SideLocus& l ASSERT_ONLY(, bool overrideSanity) ) const { #ifdef EBWT_STATS const_cast*>(this)->mapLFs_++; #endif - uint32_t ret; + TIndexOffU ret; assert(l.side(this->_ebwt) != NULL); int c = rowL(l); assert_lt(c, 4); @@ -2437,7 +2444,7 @@ inline uint32_t Ebwt::mapLF(const SideLocus& l // Make sure results match up with results from mapLFEx; // be sure to override sanity-checking in the callee, or we'll // have infinite recursion - uint32_t arrs[] = { 0, 0, 0, 0 }; + TIndexOffU arrs[] = { 0, 0, 0, 0 }; mapLFEx(l, arrs, true); assert_eq(arrs[c], ret); } @@ -2450,14 +2457,14 @@ inline uint32_t Ebwt::mapLF(const SideLocus& l * i to on character c. */ template -inline uint32_t Ebwt::mapLF(const SideLocus& l, int c +inline TIndexOffU Ebwt::mapLF(const SideLocus& l, int c ASSERT_ONLY(, bool overrideSanity) ) const { #ifdef EBWT_STATS const_cast*>(this)->mapLFcs_++; #endif - uint32_t ret; + TIndexOffU ret; assert_lt(c, 4); assert_geq(c, 0); if(l._fw) ret = countFwSide(l, c); // Forward side @@ -2468,7 +2475,7 @@ inline uint32_t Ebwt::mapLF(const SideLocus& l, int c // Make sure results match up with results from mapLFEx; // be sure to override sanity-checking in the callee, or we'll // have infinite recursion - uint32_t arrs[] = { 0, 0, 0, 0 }; + TIndexOffU arrs[] = { 0, 0, 0, 0 }; mapLFEx(l, arrs, true); assert_eq(arrs[c], ret); } @@ -2481,15 +2488,15 @@ inline uint32_t Ebwt::mapLF(const SideLocus& l, int c * i to on character c. */ template -inline uint32_t Ebwt::mapLF1(uint32_t row, const SideLocus& l, int c +inline TIndexOffU Ebwt::mapLF1(TIndexOffU row, const SideLocus& l, int c ASSERT_ONLY(, bool overrideSanity) ) const { #ifdef EBWT_STATS const_cast*>(this)->mapLF1cs_++; #endif - if(rowL(l) != c || row == _zOff) return 0xffffffff; - uint32_t ret; + if(rowL(l) != c || row == _zOff) return OFF_MASK; + TIndexOffU ret; assert_lt(c, 4); assert_geq(c, 0); if(l._fw) ret = countFwSide(l, c); // Forward side @@ -2500,7 +2507,7 @@ inline uint32_t Ebwt::mapLF1(uint32_t row, const SideLocus& l, int c // Make sure results match up with results from mapLFEx; // be sure to override sanity-checking in the callee, or we'll // have infinite recursion - uint32_t arrs[] = { 0, 0, 0, 0 }; + TIndexOffU arrs[] = { 0, 0, 0, 0 }; mapLFEx(l, arrs, true); assert_eq(arrs[c], ret); } @@ -2513,7 +2520,7 @@ inline uint32_t Ebwt::mapLF1(uint32_t row, const SideLocus& l, int c * i to on character c. */ template -inline int Ebwt::mapLF1(uint32_t& row, const SideLocus& l +inline int Ebwt::mapLF1(TIndexOffU& row, const SideLocus& l ASSERT_ONLY(, bool overrideSanity) ) const { @@ -2532,7 +2539,7 @@ inline int Ebwt::mapLF1(uint32_t& row, const SideLocus& l // Make sure results match up with results from mapLFEx; // be sure to override sanity-checking in the callee, or we'll // have infinite recursion - uint32_t arrs[] = { 0, 0, 0, 0 }; + TIndexOffU arrs[] = { 0, 0, 0, 0 }; mapLFEx(l, arrs, true); assert_eq(arrs[c], row); } @@ -2547,34 +2554,34 @@ inline int Ebwt::mapLF1(uint32_t& row, const SideLocus& l * sorted list of reference fragment ranges t */ template -void Ebwt::joinedToTextOff(uint32_t qlen, uint32_t off, - uint32_t& tidx, - uint32_t& textoff, - uint32_t& tlen) const +void Ebwt::joinedToTextOff(TIndexOffU qlen, TIndexOffU off, + TIndexOffU& tidx, + TIndexOffU& textoff, + TIndexOffU& tlen) const { - uint32_t top = 0; - uint32_t bot = _nFrag; // 1 greater than largest addressable element - uint32_t elt = 0xffffffff; + TIndexOffU top = 0; + TIndexOffU bot = _nFrag; // 1 greater than largest addressable element + TIndexOffU elt = OFF_MASK; // Begin binary search while(true) { - ASSERT_ONLY(uint32_t oldelt = elt); + ASSERT_ONLY(TIndexOffU oldelt = elt); elt = top + ((bot - top) >> 1); assert_neq(oldelt, elt); // must have made progress - uint32_t lower = _rstarts[elt*3]; - uint32_t upper; + TIndexOffU lower = _rstarts[elt*3]; + TIndexOffU upper; if(elt == _nFrag-1) { upper = _eh._len; } else { upper = _rstarts[((elt+1)*3)]; } assert_gt(upper, lower); - uint32_t fraglen = upper - lower; + TIndexOffU fraglen = upper - lower; if(lower <= off) { if(upper > off) { // not last element, but it's within // off is in this range; check if it falls off if(off + qlen > upper) { // it falls off; signal no-go and return - tidx = 0xffffffff; + tidx = OFF_MASK; assert_lt(elt, _nFrag-1); return; } @@ -2936,14 +2943,14 @@ template void Ebwt::restore(TStr& s) const { assert(isInMemory()); resize(s, this->_eh._len, Exact()); - uint32_t jumps = 0; - uint32_t i = this->_eh._len; // should point to final SA elt (starting with '$') + TIndexOffU jumps = 0; + TIndexOffU i = this->_eh._len; // should point to final SA elt (starting with '$') SideLocus l(i, this->_eh, this->_ebwt); while(i != _zOff) { assert_lt(jumps, this->_eh._len); //if(_verbose) cout << "restore: i: " << i << endl; // Not a marked row; go back a char in the original string - uint32_t newi = mapLF(l); + TIndexOffU newi = mapLF(l); assert_neq(newi, i); s[this->_eh._len - jumps - 1] = rowL(l); i = newi; @@ -2963,7 +2970,7 @@ void Ebwt::checkOrigs(const vector >& os, { TStr rest; restore(rest); - uint32_t restOff = 0; + TIndexOffU restOff = 0; size_t i = 0, j = 0; if(mirror) { // TODO: FIXME @@ -3126,14 +3133,14 @@ void Ebwt::readIntoMemory( } // Read endianness hints from both streams - size_t bytesRead = 0; + uint64_t bytesRead = 0; switchEndian = false; - uint32_t one = readU32(_in1, switchEndian); // 1st word of primary stream + uint32_t one = readU(_in1, switchEndian); // 1st word of primary stream bytesRead += 4; #ifndef NDEBUG - assert_eq(one, readU32(_in2, switchEndian)); // should match! + assert_eq(one, readU(_in2, switchEndian)); // should match! #else - readU32(_in2, switchEndian); + readU(_in2, switchEndian); #endif if(one != 1) { assert_eq((1u<<24), one); @@ -3151,19 +3158,19 @@ void Ebwt::readIntoMemory( } // Reads header entries one by one from primary stream - uint32_t len = readU32(_in1, switchEndian); - bytesRead += 4; - int32_t lineRate = readI32(_in1, switchEndian); + TIndexOffU len = readU(_in1, switchEndian); + bytesRead += OFF_SIZE; + int32_t lineRate = readI(_in1, switchEndian); bytesRead += 4; - int32_t linesPerSide = readI32(_in1, switchEndian); + int32_t linesPerSide = readI(_in1, switchEndian); bytesRead += 4; - int32_t offRate = readI32(_in1, switchEndian); + int32_t offRate = readI(_in1, switchEndian); bytesRead += 4; // TODO: add isaRate to the actual file format (right now, the // user has to tell us whether there's an ISA sample and what the // sampling rate is. int32_t isaRate = _overrideIsaRate; - int32_t ftabChars = readI32(_in1, switchEndian); + int32_t ftabChars = readI(_in1, switchEndian); bytesRead += 4; // chunkRate was deprecated in an earlier version of Bowtie; now // we use it to hold flags. @@ -3210,29 +3217,30 @@ void Ebwt::readIntoMemory( } // Set up overridden suffix-array-sample parameters - uint32_t offsLen = eh->_offsLen; - uint32_t offRateDiff = 0; - uint32_t offsLenSampled = offsLen; + TIndexOffU offsLen = eh->_offsLen; + uint64_t offsSz = eh->_offsSz; + TIndexOffU offRateDiff = 0; + TIndexOffU offsLenSampled = offsLen; if(_overrideOffRate > offRate) { offRateDiff = _overrideOffRate - offRate; } if(offRateDiff > 0) { offsLenSampled >>= offRateDiff; - if((offsLen & ~(0xffffffff << offRateDiff)) != 0) { + if((offsLen & ~(OFF_MASK << offRateDiff)) != 0) { offsLenSampled++; } } // Set up overridden inverted-suffix-array-sample parameters - uint32_t isaLen = eh->_isaLen; - uint32_t isaRateDiff = 0; - uint32_t isaLenSampled = isaLen; + TIndexOffU isaLen = eh->_isaLen; + TIndexOffU isaRateDiff = 0; + TIndexOffU isaLenSampled = isaLen; if(_overrideIsaRate > isaRate) { isaRateDiff = _overrideIsaRate - isaRate; } if(isaRateDiff > 0) { isaLenSampled >>= isaRateDiff; - if((isaLen & ~(0xffffffff << isaRateDiff)) != 0) { + if((isaLen & ~(OFF_MASK << isaRateDiff)) != 0) { isaLenSampled++; } } @@ -3246,8 +3254,8 @@ void Ebwt::readIntoMemory( } // Read nPat from primary stream - this->_nPat = readI32(_in1, switchEndian); - bytesRead += 4; + this->_nPat = readI(_in1, switchEndian); + bytesRead += OFF_SIZE; if(this->_plen != NULL && !_useMm) { // Delete it so that we can re-read it delete[] this->_plen; @@ -3257,9 +3265,9 @@ void Ebwt::readIntoMemory( // Read plen from primary stream if(_useMm) { #ifdef BOWTIE_MM - this->_plen = (uint32_t*)(mmFile[0] + bytesRead); - bytesRead += this->_nPat*4; - lseek(_in1, this->_nPat*4, SEEK_CUR); + this->_plen = (TIndexOffU*)(mmFile[0] + bytesRead); + bytesRead += this->_nPat*OFF_SIZE; + lseek(_in1, this->_nPat*OFF_SIZE, SEEK_CUR); #endif } else { try { @@ -3267,15 +3275,15 @@ void Ebwt::readIntoMemory( cerr << "Reading plen (" << this->_nPat << "): "; logTime(cerr); } - this->_plen = new uint32_t[this->_nPat]; + this->_plen = new TIndexOffU[this->_nPat]; if(switchEndian) { - for(uint32_t i = 0; i < this->_nPat; i++) { - this->_plen[i] = readU32(_in1, switchEndian); + for(TIndexOffU i = 0; i < this->_nPat; i++) { + this->_plen[i] = readU(_in1, switchEndian); } } else { - MM_READ_RET r = MM_READ(_in1, (void*)this->_plen, this->_nPat*4); - if(r != (MM_READ_RET)(this->_nPat*4)) { - cerr << "Error reading _plen[] array: " << r << ", " << (this->_nPat*4) << endl; + MM_READ_RET r = MM_READ(_in1, (void*)this->_plen, this->_nPat*OFF_SIZE); + if(r != (MM_READ_RET)(this->_nPat*OFF_SIZE)) { + cerr << "Error reading _plen[] array: " << r << ", " << (this->_nPat*OFF_SIZE) << endl; throw 1; } } @@ -3294,8 +3302,8 @@ void Ebwt::readIntoMemory( // (i.e. everything up to and including join()). if(justHeader) goto done; - this->_nFrag = readU32(_in1, switchEndian); - bytesRead += 4; + this->_nFrag = readU(_in1, switchEndian); + bytesRead += OFF_SIZE; if(_verbose || startVerbose) { cerr << "Reading rstarts (" << this->_nFrag*3 << "): "; logTime(cerr); @@ -3303,24 +3311,24 @@ void Ebwt::readIntoMemory( assert_geq(this->_nFrag, this->_nPat); if(_useMm) { #ifdef BOWTIE_MM - this->_rstarts = (uint32_t*)(mmFile[0] + bytesRead); - bytesRead += this->_nFrag*4*3; - lseek(_in1, this->_nFrag*4*3, SEEK_CUR); + this->_rstarts = (TIndexOffU*)(mmFile[0] + bytesRead); + bytesRead += this->_nFrag*OFF_SIZE*3; + lseek(_in1, this->_nFrag*OFF_SIZE*3, SEEK_CUR); #endif } else { - this->_rstarts = new uint32_t[this->_nFrag*3]; + this->_rstarts = new TIndexOffU[this->_nFrag*3]; if(switchEndian) { - for(uint32_t i = 0; i < this->_nFrag*3; i += 3) { + for(TIndexOffU i = 0; i < this->_nFrag*3; i += 3) { // fragment starting position in joined reference // string, text id, and fragment offset within text - this->_rstarts[i] = readU32(_in1, switchEndian); - this->_rstarts[i+1] = readU32(_in1, switchEndian); - this->_rstarts[i+2] = readU32(_in1, switchEndian); + this->_rstarts[i] = readU(_in1, switchEndian); + this->_rstarts[i+1] = readU(_in1, switchEndian); + this->_rstarts[i+2] = readU(_in1, switchEndian); } } else { - MM_READ_RET r = MM_READ(_in1, (void *)this->_rstarts, this->_nFrag*4*3); - if(r != (MM_READ_RET)(this->_nFrag*4*3)) { - cerr << "Error reading _rstarts[] array: " << r << ", " << (this->_nFrag*4*3) << endl; + MM_READ_RET r = MM_READ(_in1, (void *)this->_rstarts, this->_nFrag*OFF_SIZE*3); + if(r != (MM_READ_RET)(this->_nFrag*OFF_SIZE*3)) { + cerr << "Error reading _rstarts[] array: " << r << ", " << (this->_nFrag*OFF_SIZE*3) << endl; throw 1; } } @@ -3357,22 +3365,29 @@ void Ebwt::readIntoMemory( } if(shmemLeader) { // Read ebwt from primary stream - MM_READ_RET r = MM_READ(_in1, (void *)this->_ebwt, eh->_ebwtTotLen); - if(r != (MM_READ_RET)eh->_ebwtTotLen) { - cerr << "Error reading ebwt array: returned " << r << ", length was " << (eh->_ebwtTotLen) << endl - << "Your index files may be corrupt; please try re-building or re-downloading." << endl - << "A complete index consists of 6 files: XYZ.1.ebwt, XYZ.2.ebwt, XYZ.3.ebwt," << endl - << "XYZ.4.ebwt, XYZ.rev.1.ebwt, and XYZ.rev.2.ebwt. The XYZ.1.ebwt and " << endl - << "XYZ.rev.1.ebwt files should have the same size, as should the XYZ.2.ebwt and" << endl - << "XYZ.rev.2.ebwt files." << endl; - throw 1; + uint64_t bytesLeft = eh->_ebwtTotLen; + char *pebwt = (char*)this->ebwt(); + + while (bytesLeft>0){ + MM_READ_RET r = MM_READ(_in1, (void *)pebwt, bytesLeft); + if(MM_IS_IO_ERR(_in1,r,bytesLeft)) { + cerr << "Error reading ebwt array: returned " << r << ", length was " << (eh->_ebwtTotLen) << endl + << "Your index files may be corrupt; please try re-building or re-downloading." << endl + << "A complete index consists of 6 files: XYZ.1.ebwt, XYZ.2.ebwt, XYZ.3.ebwt," << endl + << "XYZ.4.ebwt, XYZ.rev.1.ebwt, and XYZ.rev.2.ebwt. The XYZ.1.ebwt and " << endl + << "XYZ.rev.1.ebwt files should have the same size, as should the XYZ.2.ebwt and" << endl + << "XYZ.rev.2.ebwt files." << endl; + throw 1; + } + pebwt += r; + bytesLeft -= r; } if(switchEndian) { uint8_t *side = this->_ebwt; for(size_t i = 0; i < eh->_numSides; i++) { - uint32_t *cums = reinterpret_cast(side + eh->_sideSz - 8); - cums[0] = endianSwapU32(cums[0]); - cums[1] = endianSwapU32(cums[1]); + TIndexOffU *cums = reinterpret_cast(side + eh->_sideSz - 8); + cums[0] = endianSwapU(cums[0]); + cums[1] = endianSwapU(cums[1]); side += this->_eh._sideSz; } } @@ -3385,8 +3400,8 @@ void Ebwt::readIntoMemory( } // Read zOff from primary stream - _zOff = readU32(_in1, switchEndian); - bytesRead += 4; + _zOff = readU(_in1, switchEndian); + bytesRead += OFF_SIZE; assert_lt(_zOff, len); try { @@ -3394,14 +3409,14 @@ void Ebwt::readIntoMemory( if(_verbose || startVerbose) cerr << "Reading fchr (5)" << endl; if(_useMm) { #ifdef BOWTIE_MM - this->_fchr = (uint32_t*)(mmFile[0] + bytesRead); - bytesRead += 5*4; - lseek(_in1, 5*4, SEEK_CUR); + this->_fchr = (TIndexOffU*)(mmFile[0] + bytesRead); + bytesRead += 5*OFF_SIZE; + lseek(_in1, 5*OFF_SIZE, SEEK_CUR); #endif } else { - this->_fchr = new uint32_t[5]; + this->_fchr = new TIndexOffU[5]; for(int i = 0; i < 5; i++) { - this->_fchr[i] = readU32(_in1, switchEndian); + this->_fchr[i] = readU(_in1, switchEndian); assert_leq(this->_fchr[i], len); if(i > 0) assert_geq(this->_fchr[i], this->_fchr[i-1]); } @@ -3414,19 +3429,19 @@ void Ebwt::readIntoMemory( } if(_useMm) { #ifdef BOWTIE_MM - this->_ftab = (uint32_t*)(mmFile[0] + bytesRead); - bytesRead += eh->_ftabLen*4; - lseek(_in1, eh->_ftabLen*4, SEEK_CUR); + this->_ftab = (TIndexOffU*)(mmFile[0] + bytesRead); + bytesRead += eh->_ftabLen*OFF_SIZE; + lseek(_in1, eh->_ftabLen*OFF_SIZE, SEEK_CUR); #endif } else { - this->_ftab = new uint32_t[eh->_ftabLen]; + this->_ftab = new TIndexOffU[eh->_ftabLen]; if(switchEndian) { - for(uint32_t i = 0; i < eh->_ftabLen; i++) - this->_ftab[i] = readU32(_in1, switchEndian); + for(TIndexOffU i = 0; i < eh->_ftabLen; i++) + this->_ftab[i] = readU(_in1, switchEndian); } else { - MM_READ_RET r = MM_READ(_in1, (void *)this->_ftab, eh->_ftabLen*4); - if(r != (MM_READ_RET)(eh->_ftabLen*4)) { - cerr << "Error reading _ftab[] array: " << r << ", " << (eh->_ftabLen*4) << endl; + MM_READ_RET r = MM_READ(_in1, (void *)this->_ftab, eh->_ftabLen*OFF_SIZE); + if(r != (MM_READ_RET)(eh->_ftabLen*OFF_SIZE)) { + cerr << "Error reading _ftab[] array: " << r << ", " << (eh->_ftabLen*OFF_SIZE) << endl; throw 1; } } @@ -3438,24 +3453,24 @@ void Ebwt::readIntoMemory( } if(_useMm) { #ifdef BOWTIE_MM - this->_eftab = (uint32_t*)(mmFile[0] + bytesRead); - bytesRead += eh->_eftabLen*4; - lseek(_in1, eh->_eftabLen*4, SEEK_CUR); + this->_eftab = (TIndexOffU*)(mmFile[0] + bytesRead); + bytesRead += eh->_eftabLen*OFF_SIZE; + lseek(_in1, eh->_eftabLen*OFF_SIZE, SEEK_CUR); #endif } else { - this->_eftab = new uint32_t[eh->_eftabLen]; + this->_eftab = new TIndexOffU[eh->_eftabLen]; if(switchEndian) { - for(uint32_t i = 0; i < eh->_eftabLen; i++) - this->_eftab[i] = readU32(_in1, switchEndian); + for(TIndexOffU i = 0; i < eh->_eftabLen; i++) + this->_eftab[i] = readU(_in1, switchEndian); } else { - MM_READ_RET r = MM_READ(_in1, (void *)this->_eftab, eh->_eftabLen*4); - if(r != (MM_READ_RET)(eh->_eftabLen*4)) { - cerr << "Error reading _eftab[] array: " << r << ", " << (eh->_eftabLen*4) << endl; + MM_READ_RET r = MM_READ(_in1, (void *)this->_eftab, eh->_eftabLen*OFF_SIZE); + if(r != (MM_READ_RET)(eh->_eftabLen*OFF_SIZE)) { + cerr << "Error reading _eftab[] array: " << r << ", " << (eh->_eftabLen*OFF_SIZE) << endl; throw 1; } } } - for(uint32_t i = 0; i < eh->_eftabLen; i++) { + for(TIndexOffU i = 0; i < eh->_eftabLen; i++) { if(i > 0 && this->_eftab[i] > 0) { assert_geq(this->_eftab[i], this->_eftab[i-1]); } else if(i > 0 && this->_eftab[i-1] == 0) { @@ -3498,15 +3513,15 @@ void Ebwt::readIntoMemory( if(!useShmem_) { // Allocate offs_ try { - this->_offs = new uint32_t[offsLenSampled]; + this->_offs = new TIndexOffU[offsLenSampled]; } catch(bad_alloc& e) { cerr << "Out of memory allocating the offs[] array for the Bowtie index." << endl << "Please try again on a computer with more memory." << endl; throw 1; } } else { - shmemLeader = ALLOC_SHARED_U32( - (_in2Str + "[offs]"), offsLenSampled*4, &this->_offs, + shmemLeader = ALLOC_SHARED_U( + (_in2Str + "[offs]"), offsLenSampled*OFF_SIZE, &this->_offs, "offs", (_verbose || startVerbose)); } } @@ -3516,14 +3531,14 @@ void Ebwt::readIntoMemory( // Allocate offs (big allocation) if(switchEndian || offRateDiff > 0) { assert(!_useMm); - const uint32_t blockMaxSz = (2 * 1024 * 1024); // 2 MB block size - const uint32_t blockMaxSzU32 = (blockMaxSz >> 2); // # U32s per block + const TIndexOffU blockMaxSz = (2 * 1024 * 1024); // 2 MB block size + const TIndexOffU blockMaxSzU = (blockMaxSz >> (OFF_SIZE/4 +1)); // # U32s per block char *buf = new char[blockMaxSz]; - for(uint32_t i = 0; i < offsLen; i += blockMaxSzU32) { - uint32_t block = min(blockMaxSzU32, offsLen - i); - MM_READ_RET r = MM_READ(_in2, (void *)buf, block << 2); - if(r != (MM_READ_RET)(block << 2)) { - cerr << "Error reading block of offs array: " << r << ", " << (block << 2) << endl + for(TIndexOffU i = 0; i < offsLen; i += blockMaxSzU) { + TIndexOffU block = min(blockMaxSzU, offsLen - i); + MM_READ_RET r = MM_READ(_in2, (void *)buf, block << (OFF_SIZE/4 + 1)); + if(r != (MM_READ_RET)(block << (OFF_SIZE/4 + 1))) { + cerr << "Error reading block of offs array: " << r << ", " << (block << (OFF_SIZE/4 + 1)) << endl << "Your index files may be corrupt; please try re-building or re-downloading." << endl << "A complete index consists of 6 files: XYZ.1.ebwt, XYZ.2.ebwt, XYZ.3.ebwt," << endl << "XYZ.4.ebwt, XYZ.rev.1.ebwt, and XYZ.rev.2.ebwt. The XYZ.1.ebwt and " << endl @@ -3531,12 +3546,12 @@ void Ebwt::readIntoMemory( << "XYZ.rev.2.ebwt files." << endl; throw 1; } - uint32_t idx = i >> offRateDiff; - for(uint32_t j = 0; j < block; j += (1 << offRateDiff)) { + TIndexOffU idx = i >> offRateDiff; + for(TIndexOffU j = 0; j < block; j += (1 << offRateDiff)) { assert_lt(idx, offsLenSampled); - this->_offs[idx] = ((uint32_t*)buf)[j]; + this->_offs[idx] = ((TIndexOffU*)buf)[j]; if(switchEndian) { - this->_offs[idx] = endianSwapU32(this->_offs[idx]); + this->_offs[idx] = endianSwapU(this->_offs[idx]); } idx++; } @@ -3545,52 +3560,47 @@ void Ebwt::readIntoMemory( } else { if(_useMm) { #ifdef BOWTIE_MM - this->_offs = (uint32_t*)(mmFile[1] + bytesRead); - bytesRead += (offsLen << 2); - lseek(_in2, (offsLen << 2), SEEK_CUR); + _offs.init((TIndexOffU*)(mmFile[1] + bytesRead), offsLen, false); + bytesRead += offsSz; + // Argument to lseek can be 64 bits if compiled with + // _FILE_OFFSET_BITS + MM_SEEK(_in2, offsSz, SEEK_CUR); #endif } else { // If any of the high two bits are set - if((offsLen & 0xf0000000) != 0) { - if(sizeof(char *) <= 4) { - cerr << "Sanity error: sizeof(char *) <= 4 but offsLen is " << hex << offsLen << endl; - throw 1; - } - // offsLen << 4 overflows sometimes, so do it in four reads - char *offs = (char *)this->_offs; - for(int i = 0; i < 16; i++) { - MM_READ_RET r = MM_READ(_in2, (void*)offs, offsLen >> 2); - if(r != (MM_READ_RET)(offsLen >> 2)) { - cerr << "Error reading block of _offs[] array: " << r << ", " << (offsLen >> 2) << endl; - throw 1; - } - offs += (offsLen >> 2); - } - } else { - // Do it all in one read - MM_READ_RET r = MM_READ(_in2, (void*)this->_offs, offsLen << 2); - if(r != (MM_READ_RET)(offsLen << 2)) { - cerr << "Error reading _offs[] array: " << r << ", " << (offsLen << 2) << endl; + // Workaround for small-index mode where MM_READ may + // not be able to handle read amounts greater than 2^32 + // bytes. + uint64_t bytesLeft = offsSz; + char *offs = (char *)this->offs(); + + while(bytesLeft > 0) { + MM_READ_RET r = MM_READ(_in2, (void*)offs, bytesLeft); + if(MM_IS_IO_ERR(_in2,r,bytesLeft)) { + cerr << "Error reading block of _offs[] array: " + << r << ", " << bytesLeft << gLastIOErrMsg << endl; throw 1; } + offs += r; + bytesLeft -= r; } } } { ASSERT_ONLY(Bitset offsSeen(len+1)); - for(uint32_t i = 0; i < offsLenSampled; i++) { + for(TIndexOffU i = 0; i < offsLenSampled; i++) { assert(!offsSeen.test(this->_offs[i])); ASSERT_ONLY(offsSeen.set(this->_offs[i])); assert_leq(this->_offs[i], len); } } - if(useShmem_) NOTIFY_SHARED(this->_offs, offsLenSampled*4); + if(useShmem_) NOTIFY_SHARED(this->_offs, offsLenSampled*OFF_SIZE); } else { // Not the shmem leader - MM_SEEK(_in2, offsLenSampled*4, SEEK_CUR); - if(useShmem_) WAIT_SHARED(this->_offs, offsLenSampled*4); + MM_SEEK(_in2, offsLenSampled*OFF_SIZE, SEEK_CUR); + if(useShmem_) WAIT_SHARED(this->_offs, offsLenSampled*OFF_SIZE); } } @@ -3601,7 +3611,7 @@ void Ebwt::readIntoMemory( } if(!_useMm) { try { - this->_isa = new uint32_t[isaLenSampled]; + this->_isa = new TIndexOffU[isaLenSampled]; } catch(bad_alloc& e) { cerr << "Out of memory allocating the isa[] array for the Bowtie index." << endl << "Please try again on a computer with more memory." << endl; @@ -3611,31 +3621,31 @@ void Ebwt::readIntoMemory( // Read _isa[] if(switchEndian || isaRateDiff > 0) { assert(!_useMm); - for(uint32_t i = 0; i < isaLen; i++) { - if((i & ~(0xffffffff << isaRateDiff)) != 0) { - char tmp[4]; - MM_READ_RET r = MM_READ(_in2, (void *)tmp, 4); - if(r != (MM_READ_RET)4) { + for(TIndexOffU i = 0; i < isaLen; i++) { + if((i & ~(OFF_MASK << isaRateDiff)) != 0) { + char tmp[OFF_SIZE]; + MM_READ_RET r = MM_READ(_in2, (void *)tmp, OFF_SIZE); + if(r != (MM_READ_RET)OFF_SIZE) { cerr << "Error reading a word of the _isa[] array: " << r << ", 4" << endl; throw 1; } } else { - uint32_t idx = i >> isaRateDiff; + TIndexOffU idx = i >> isaRateDiff; assert_lt(idx, isaLenSampled); - this->_isa[idx] = readU32(_in2, switchEndian); + this->_isa[idx] = readU(_in2, switchEndian); } } } else { if(_useMm) { #ifdef BOWTIE_MM - this->_isa = (uint32_t*)(mmFile[1] + bytesRead); + this->_isa = (TIndexOffU*)(mmFile[1] + bytesRead); bytesRead += (isaLen << 2); lseek(_in2, (isaLen << 2), SEEK_CUR); #endif } else { - MM_READ_RET r = MM_READ(_in2, (void *)this->_isa, isaLen*4); - if(r != (MM_READ_RET)(isaLen*4)) { - cerr << "Error reading _isa[] array: " << r << ", " << (isaLen*4) << endl; + MM_READ_RET r = MM_READ(_in2, (void *)this->_isa, isaLen*OFF_SIZE); + if(r != (MM_READ_RET)(isaLen*OFF_SIZE)) { + cerr << "Error reading _isa[] array: " << r << ", " << (isaLen*OFF_SIZE) << endl; throw 1; } } @@ -3643,7 +3653,7 @@ void Ebwt::readIntoMemory( { ASSERT_ONLY(Bitset isasSeen(len+1)); - for(uint32_t i = 0; i < isaLenSampled; i++) { + for(TIndexOffU i = 0; i < isaLenSampled; i++) { assert(!isasSeen.test(this->_isa[i])); ASSERT_ONLY(isasSeen.set(this->_isa[i])); assert_leq(this->_isa[i], len); @@ -3674,28 +3684,28 @@ void Ebwt::readIntoMemory( * file and store them in 'refnames'. */ static inline void -readEbwtRefnames(istream& in, vector& refnames) { +readEbwtRefnames(FILE* fin, vector& refnames) { // _in1 must already be open with the get cursor at the // beginning and no error flags set. - assert(in.good()); - assert_eq((streamoff)in.tellg(), ios::beg); + assert(fin != NULL); + assert_eq(ftello(fin), 0); // Read endianness hints from both streams bool switchEndian = false; - uint32_t one = readU32(in, switchEndian); // 1st word of primary stream + uint32_t one = readU(fin, switchEndian); // 1st word of primary stream if(one != 1) { assert_eq((1u<<24), one); switchEndian = true; } // Reads header entries one by one from primary stream - uint32_t len = readU32(in, switchEndian); - int32_t lineRate = readI32(in, switchEndian); - int32_t linesPerSide = readI32(in, switchEndian); - int32_t offRate = readI32(in, switchEndian); - int32_t ftabChars = readI32(in, switchEndian); + TIndexOffU len = readU(fin, switchEndian); + int32_t lineRate = readI(fin, switchEndian); + int32_t linesPerSide = readI(fin, switchEndian); + int32_t offRate = readI(fin, switchEndian); + int32_t ftabChars = readI(fin, switchEndian); // BTL: chunkRate is now deprecated - int32_t flags = readI32(in, switchEndian); + int32_t flags = readI(fin, switchEndian); bool color = false; bool entireReverse = false; if(flags < 0) { @@ -3706,33 +3716,35 @@ readEbwtRefnames(istream& in, vector& refnames) { // Create a new EbwtParams from the entries read from primary stream EbwtParams eh(len, lineRate, linesPerSide, offRate, -1, ftabChars, color, entireReverse); - uint32_t nPat = readI32(in, switchEndian); // nPat - in.seekg(nPat*4, ios_base::cur); // skip plen + TIndexOffU nPat = readI(fin, switchEndian); // nPat + fseeko(fin, nPat*OFF_SIZE, SEEK_CUR); // Skip rstarts - uint32_t nFrag = readU32(in, switchEndian); - in.seekg(nFrag*4*3, ios_base::cur); + TIndexOffU nFrag = readU(fin, switchEndian); + fseeko(fin, nFrag*OFF_SIZE*3, SEEK_CUR); // Skip ebwt - in.seekg(eh._ebwtTotLen, ios_base::cur); + fseeko(fin, eh._ebwtTotLen, SEEK_CUR); // Skip zOff from primary stream - readU32(in, switchEndian); + readU(fin, switchEndian); // Skip fchr - in.seekg(5 * 4, ios_base::cur); + fseeko(fin, 5 * OFF_SIZE, SEEK_CUR); // Skip ftab - in.seekg(eh._ftabLen*4, ios_base::cur); + fseeko(fin, eh._ftabLen*OFF_SIZE, SEEK_CUR); // Skip eftab - in.seekg(eh._eftabLen*4, ios_base::cur); + fseeko(fin, eh._eftabLen*OFF_SIZE, SEEK_CUR); // Read reference sequence names from primary index file while(true) { char c = '\0'; - in.read(&c, 1); - if(in.eof()) break; + int read_value = 0; + read_value = fgetc(fin); + if(read_value == EOF) break; + c = read_value; if(c == '\0') break; else if(c == '\n') { refnames.push_back(""); @@ -3748,8 +3760,8 @@ readEbwtRefnames(istream& in, vector& refnames) { } // Be kind - in.clear(); in.seekg(0, ios::beg); - assert(in.good()); + fseeko(fin, 0, SEEK_SET); + assert(ferror(fin) == 0); } /** @@ -3758,16 +3770,15 @@ readEbwtRefnames(istream& in, vector& refnames) { */ static inline void readEbwtRefnames(const string& instr, vector& refnames) { - ifstream in; + FILE* fin; // Initialize our primary and secondary input-stream fields - in.open((instr + ".1.ebwt").c_str(), ios_base::in | ios::binary); - if(!in.is_open()) { + fin = fopen((instr + ".1." + gEbwt_ext).c_str(),"rb"); + if(fin == NULL) { throw EbwtFileOpenException("Cannot open file " + instr); } - assert(in.is_open()); - assert(in.good()); - assert_eq((streamoff)in.tellg(), ios::beg); - readEbwtRefnames(in, refnames); + assert_eq(ftello(fin), 0); + readEbwtRefnames(fin, refnames); + fclose(fin); } /** @@ -3776,25 +3787,25 @@ readEbwtRefnames(const string& instr, vector& refnames) { static inline int32_t readFlags(const string& instr) { ifstream in; // Initialize our primary and secondary input-stream fields - in.open((instr + ".1.ebwt").c_str(), ios_base::in | ios::binary); + in.open((instr + ".1." + gEbwt_ext).c_str(), ios_base::in | ios::binary); if(!in.is_open()) { throw EbwtFileOpenException("Cannot open file " + instr); } assert(in.is_open()); assert(in.good()); bool switchEndian = false; - uint32_t one = readU32(in, switchEndian); // 1st word of primary stream + uint32_t one = readU(in, switchEndian); // 1st word of primary stream if(one != 1) { assert_eq((1u<<24), one); assert_eq(1, endianSwapU32(one)); switchEndian = true; } - readU32(in, switchEndian); - readI32(in, switchEndian); - readI32(in, switchEndian); - readI32(in, switchEndian); - readI32(in, switchEndian); - int32_t flags = readI32(in, switchEndian); + readU(in, switchEndian); + readI(in, switchEndian); + readI(in, switchEndian); + readI(in, switchEndian); + readI(in, switchEndian); + int32_t flags = readI(in, switchEndian); return flags; } @@ -3848,30 +3859,30 @@ void Ebwt::writeFromMemory(bool justHeader, // When building an Ebwt, these header parameters are known // "up-front", i.e., they can be written to disk immediately, // before we join() or buildToDisk() - writeI32(out1, 1, be); // endian hint for priamry stream - writeI32(out2, 1, be); // endian hint for secondary stream - writeU32(out1, eh._len, be); // length of string (and bwt and suffix array) - writeI32(out1, eh._lineRate, be); // 2^lineRate = size in bytes of 1 line - writeI32(out1, eh._linesPerSide, be); // not used - writeI32(out1, eh._offRate, be); // every 2^offRate chars is "marked" - writeI32(out1, eh._ftabChars, be); // number of 2-bit chars used to address ftab + writeI(out1, 1, be); // endian hint for priamry stream + writeI(out2, 1, be); // endian hint for secondary stream + writeU(out1, eh._len, be); // length of string (and bwt and suffix array) + writeI(out1, eh._lineRate, be); // 2^lineRate = size in bytes of 1 line + writeI(out1, eh._linesPerSide, be); // not used + writeI(out1, eh._offRate, be); // every 2^offRate chars is "marked" + writeI(out1, eh._ftabChars, be); // number of 2-bit chars used to address ftab int32_t flags = 1; if(eh._color) flags |= EBWT_COLOR; if(eh._entireReverse) flags |= EBWT_ENTIRE_REV; - writeI32(out1, -flags, be); // BTL: chunkRate is now deprecated + writeI(out1, -flags, be); // BTL: chunkRate is now deprecated if(!justHeader) { assert(isInMemory()); // These Ebwt parameters are known after the inputs strings have // been joined() but before they have been built(). These can // written to the disk next and then discarded from memory. - writeU32(out1, this->_nPat, be); - for(uint32_t i = 0; i < this->_nPat; i++) - writeU32(out1, this->_plen[i], be); + writeU(out1, this->_nPat, be); + for(TIndexOffU i = 0; i < this->_nPat; i++) + writeU(out1, this->_plen[i], be); assert_geq(this->_nFrag, this->_nPat); - writeU32(out1, this->_nFrag, be); - for(uint32_t i = 0; i < this->_nFrag*3; i++) - writeU32(out1, this->_rstarts[i], be); + writeU(out1, this->_nFrag, be); + for(TIndexOffU i = 0; i < this->_nFrag*3; i++) + writeU(out1, this->_rstarts[i], be); // These Ebwt parameters are discovered only as the Ebwt is being // built (in buildToDisk()). Of these, only 'offs' and 'ebwt' are @@ -3879,24 +3890,24 @@ void Ebwt::writeFromMemory(bool justHeader, // discarded from memory as it is built; 'offs' is similarly // written to the secondary file and discarded. out1.write((const char *)this->ebwt(), eh._ebwtTotLen); - writeU32(out1, this->zOff(), be); - uint32_t offsLen = eh._offsLen; - for(uint32_t i = 0; i < offsLen; i++) - writeU32(out2, this->_offs[i], be); + writeU(out1, this->zOff(), be); + TIndexOffU offsLen = eh._offsLen; + for(TIndexOffU i = 0; i < offsLen; i++) + writeU(out2, this->_offs[i], be); uint32_t isaLen = eh._isaLen; - for(uint32_t i = 0; i < isaLen; i++) - writeU32(out2, this->_isa[i], be); + for(TIndexOffU i = 0; i < isaLen; i++) + writeU(out2, this->_isa[i], be); // 'fchr', 'ftab' and 'eftab' are not fully determined until the // loop is finished, so they are written to the primary file after // all of 'ebwt' has already been written and only then discarded // from memory. for(int i = 0; i < 5; i++) - writeU32(out1, this->_fchr[i], be); - for(uint32_t i = 0; i < eh._ftabLen; i++) - writeU32(out1, this->ftab()[i], be); - for(uint32_t i = 0; i < eh._eftabLen; i++) - writeU32(out1, this->eftab()[i], be); + writeU(out1, this->_fchr[i], be); + for(TIndexOffU i = 0; i < eh._ftabLen; i++) + writeU(out1, this->ftab()[i], be); + for(TIndexOffU i = 0; i < eh._eftabLen; i++) + writeU(out1, this->eftab()[i], be); } } @@ -3943,23 +3954,23 @@ void Ebwt::writeFromMemory(bool justHeader, assert_eq(_zEbwtBpOff, copy.zEbwtBpOff()); assert_eq(_zEbwtByteOff, copy.zEbwtByteOff()); assert_eq(_nPat, copy.nPat()); - for(uint32_t i = 0; i < _nPat; i++) + for(TIndexOffU i = 0; i < _nPat; i++) assert_eq(this->_plen[i], copy.plen()[i]); assert_eq(this->_nFrag, copy.nFrag()); - for(uint32_t i = 0; i < this->nFrag*3; i++) { + for(TIndexOffU i = 0; i < this->nFrag*3; i++) { assert_eq(this->_rstarts[i], copy.rstarts()[i]); } for(uint32_t i = 0; i < 5; i++) assert_eq(this->_fchr[i], copy.fchr()[i]); - for(uint32_t i = 0; i < eh._ftabLen; i++) + for(TIndexOffU i = 0; i < eh._ftabLen; i++) assert_eq(this->ftab()[i], copy.ftab()[i]); - for(uint32_t i = 0; i < eh._eftabLen; i++) + for(TIndexOffU i = 0; i < eh._eftabLen; i++) assert_eq(this->eftab()[i], copy.eftab()[i]); - for(uint32_t i = 0; i < eh._offsLen; i++) + for(TIndexOffU i = 0; i < eh._offsLen; i++) assert_eq(this->_offs[i], copy.offs()[i]); - for(uint32_t i = 0; i < eh._isaLen; i++) + for(TIndexOffU i = 0; i < eh._isaLen; i++) assert_eq(this->_isa[i], copy.isa()[i]); - for(uint32_t i = 0; i < eh._ebwtTotLen; i++) + for(TIndexOffU i = 0; i < eh._ebwtTotLen; i++) assert_eq(this->ebwt()[i], copy.ebwt()[i]); //copy.sanityCheckAll(); if(_verbose) @@ -4010,7 +4021,7 @@ TStr Ebwt::join(vector& l, uint32_t seed) { template TStr Ebwt::join(vector& l, vector& szs, - uint32_t sztot, + TIndexOffU sztot, const RefReadInParams& refparams, uint32_t seed) { @@ -4061,7 +4072,7 @@ void Ebwt::joinToDisk( vector& l, vector& szs, vector& plens, - uint32_t sztot, + TIndexOffU sztot, const RefReadInParams& refparams, TStr& ret, ostream& out1, @@ -4097,31 +4108,31 @@ void Ebwt::joinToDisk( assert_gt(this->_nPat, 0); assert_geq(this->_nFrag, this->_nPat); this->_rstarts = NULL; - writeU32(out1, this->_nPat, this->toBe()); + writeU(out1, this->_nPat, this->toBe()); assert_eq(plens.size(), this->_nPat); // Allocate plen[] try { - this->_plen = new uint32_t[this->_nPat]; + this->_plen = new TIndexOffU[this->_nPat]; } catch(bad_alloc& e) { cerr << "Out of memory allocating plen[] in Ebwt::join()" << " at " << __FILE__ << ":" << __LINE__ << endl; throw e; } // For each pattern, set plen - for(size_t i = 0; i < plens.size(); i++) { + for(TIndexOffU i = 0; i < plens.size(); i++) { this->_plen[i] = plens[i]; - writeU32(out1, this->_plen[i], this->toBe()); + writeU(out1, this->_plen[i], this->toBe()); } // Write the number of fragments - writeU32(out1, this->_nFrag, this->toBe()); - size_t seqsRead = 0; - ASSERT_ONLY(uint32_t szsi = 0); - ASSERT_ONLY(uint32_t entsWritten = 0); + writeU(out1, this->_nFrag, this->toBe()); + TIndexOffU seqsRead = 0; + ASSERT_ONLY(TIndexOffU szsi = 0); + ASSERT_ONLY(TIndexOffU entsWritten = 0); // For each filebuf for(unsigned int i = 0; i < l.size(); i++) { assert(!l[i]->eof()); bool first = true; - uint32_t patoff = 0; + TIndexOffU patoff = 0; // For each *fragment* (not necessary an entire sequence) we // can pull out of istream l[i]... while(!l[i]->eof()) { @@ -4231,18 +4242,18 @@ void Ebwt::buildToDisk(InorderBlockwiseSA& sa, assert(sa.suffixItrIsReset()); assert_leq((int)ValueSize::VALUE, 4); - uint32_t len = eh._len; - uint32_t ftabLen = eh._ftabLen; - uint32_t sideSz = eh._sideSz; - uint32_t ebwtTotSz = eh._ebwtTotSz; - uint32_t fchr[] = {0, 0, 0, 0, 0}; - uint32_t* ftab = NULL; - uint32_t zOff = 0xffffffff; + TIndexOffU len = eh._len; + TIndexOffU ftabLen = eh._ftabLen; + TIndexOffU sideSz = eh._sideSz; + TIndexOffU ebwtTotSz = eh._ebwtTotSz; + TIndexOffU fchr[] = {0, 0, 0, 0, 0}; + TIndexOffU* ftab = NULL; + TIndexOffU zOff = OFF_MASK; // Save # of occurrences of each character as we walk along the bwt - uint32_t occ[4] = {0, 0, 0, 0}; + TIndexOffU occ[4] = {0, 0, 0, 0}; // Save 'G' and 'T' occurrences between backward and forward buckets - uint32_t occSave[2] = {0, 0}; + TIndexOffU occSave[2] = {0, 0}; // Record rows that should "absorb" adjacent rows in the ftab. // The absorbed rows represent suffixes shorter than the ftabChars @@ -4305,13 +4316,13 @@ void Ebwt::buildToDisk(InorderBlockwiseSA& sa, // Points to the base offset within ebwt for the side currently // being written - uint32_t side = 0; + TIndexOffU side = 0; // Points to a byte offset from 'side' within ebwt[] where next // char should be written #ifdef SIXTY4_FORMAT - int sideCur = (eh._sideBwtSz >> 3) - 1; + TIndexOffU sideCur = (eh._sideBwtSz >> 3) - 1; #else - int sideCur = eh._sideBwtSz - 1; + TIndexOffU sideCur = eh._sideBwtSz - 1; #endif // Whether we're assembling a forward or a reverse bucket @@ -4414,7 +4425,7 @@ void Ebwt::buildToDisk(InorderBlockwiseSA& sa, assert_lt((si >> eh._offRate), eh._offsLen); // Write offsets directly to the secondary output // stream, thereby avoiding keeping them in memory - writeU32(out2, saElt, this->toBe()); + writeU(out2, saElt, this->toBe()); } } else { // Strayed off the end of the SA, now we're just @@ -4479,11 +4490,16 @@ void Ebwt::buildToDisk(InorderBlockwiseSA& sa, // Write 'G' and 'T' assert_leq(occSave[0], occ[2]); assert_leq(occSave[1], occ[3]); - uint32_t *u32side = reinterpret_cast(ebwtSide); + TIndexOffU *u32side = reinterpret_cast(ebwtSide); side += sideSz; assert_leq(side, eh._ebwtTotSz); - u32side[(sideSz >> 2)-2] = endianizeU32(occSave[0], this->toBe()); - u32side[(sideSz >> 2)-1] = endianizeU32(occSave[1], this->toBe()); +#ifdef BOWTIE_64BIT_INDEX + u32side[(sideSz >> 3)-2] = endianizeU(occSave[2], this->toBe()); + u32side[(sideSz >> 3)-1] = endianizeU(occSave[3], this->toBe()); +#else + u32side[(sideSz >> 2)-2] = endianizeU(occSave[2], this->toBe()); + u32side[(sideSz >> 2)-1] = endianizeU(occSave[3], this->toBe()); +#endif // Write forward side to primary file out1.write((const char *)ebwtSide, sideSz); } else if (sideCur == -1) { @@ -4492,11 +4508,16 @@ void Ebwt::buildToDisk(InorderBlockwiseSA& sa, sideCur = 0; assert(!fw); fw = true; // Write 'A' and 'C' - uint32_t *u32side = reinterpret_cast(ebwtSide); + TIndexOffU *u32side = reinterpret_cast(ebwtSide); side += sideSz; assert_leq(side, eh._ebwtTotSz); +#ifdef BOWTIE_64BIT_INDEX + u32side[(sideSz >> 3)-2] = endianizeU32(occ[0], this->toBe()); + u32side[(sideSz >> 3)-1] = endianizeU32(occ[1], this->toBe()); +#else u32side[(sideSz >> 2)-2] = endianizeU32(occ[0], this->toBe()); u32side[(sideSz >> 2)-1] = endianizeU32(occ[1], this->toBe()); +#endif occSave[0] = occ[2]; // save 'G' count occSave[1] = occ[3]; // save 'T' count // Write backward side to primary file @@ -4505,7 +4526,7 @@ void Ebwt::buildToDisk(InorderBlockwiseSA& sa, } VMSG_NL("Exited Ebwt loop"); assert(ftab != NULL); - assert_neq(zOff, 0xffffffff); + assert_neq(zOff, OFF_MASK); if(absorbCnt > 0) { // Absorb any trailing, as-yet-unabsorbed short suffixes into // the last element of ftab @@ -4514,14 +4535,14 @@ void Ebwt::buildToDisk(InorderBlockwiseSA& sa, // Assert that our loop counter got incremented right to the end assert_eq(side, eh._ebwtTotSz); // Assert that we wrote the expected amount to out1 - assert_eq(((uint32_t)out1.tellp() - beforeEbwtOff), eh._ebwtTotSz); + assert_eq(((TIndexOffU)out1.tellp() - beforeEbwtOff), eh._ebwtTotSz); // assert that the last thing we did was write a forward bucket assert(wroteFwBucket); // // Write zOff to primary stream // - writeU32(out1, zOff, this->toBe()); + writeU(out1, zOff, this->toBe()); // // Finish building fchr @@ -4542,24 +4563,24 @@ void Ebwt::buildToDisk(InorderBlockwiseSA& sa, } // Write fchr to primary file for(int i = 0; i < 5; i++) { - writeU32(out1, fchr[i], this->toBe()); + writeU(out1, fchr[i], this->toBe()); } // // Finish building ftab and build eftab // // Prefix sum on ftable - uint32_t eftabLen = 0; + TIndexOffU eftabLen = 0; assert_eq(0, absorbFtab[0]); - for(uint32_t i = 1; i < ftabLen; i++) { + for(TIndexOffU i = 1; i < ftabLen; i++) { if(absorbFtab[i] > 0) eftabLen += 2; } - assert_leq(eftabLen, (uint32_t)eh._ftabChars*2); + assert_leq(eftabLen, (TIndexOffU)eh._ftabChars*2); eftabLen = eh._ftabChars*2; - uint32_t *eftab = NULL; + TIndexOffU *eftab = NULL; try { - eftab = new uint32_t[eftabLen]; - memset(eftab, 0, 4 * eftabLen); + eftab = new TIndexOffU[eftabLen]; + memset(eftab, 0, OFF_SIZE * eftabLen); } catch(bad_alloc &e) { cerr << "Out of memory allocating eftab[] " << "in Ebwt::buildToDisk() at " << __FILE__ << ":" @@ -4567,16 +4588,16 @@ void Ebwt::buildToDisk(InorderBlockwiseSA& sa, throw e; } assert(eftab != NULL); - uint32_t eftabCur = 0; - for(uint32_t i = 1; i < ftabLen; i++) { - uint32_t lo = ftab[i] + Ebwt::ftabHi(ftab, eftab, len, ftabLen, eftabLen, i-1); + TIndexOffU eftabCur = 0; + for(TIndexOffU i = 1; i < ftabLen; i++) { + TIndexOffU lo = ftab[i] + Ebwt::ftabHi(ftab, eftab, len, ftabLen, eftabLen, i-1); if(absorbFtab[i] > 0) { // Skip a number of short pattern indicated by absorbFtab[i] - uint32_t hi = lo + absorbFtab[i]; + TIndexOffU hi = lo + absorbFtab[i]; assert_lt(eftabCur*2+1, eftabLen); eftab[eftabCur*2] = lo; eftab[eftabCur*2+1] = hi; - ftab[i] = (eftabCur++) ^ 0xffffffff; // insert pointer into eftab + ftab[i] = (eftabCur++) ^ OFF_MASK; // insert pointer into eftab assert_eq(lo, Ebwt::ftabLo(ftab, eftab, len, ftabLen, eftabLen, i)); assert_eq(hi, Ebwt::ftabHi(ftab, eftab, len, ftabLen, eftabLen, i)); } else { @@ -4585,22 +4606,22 @@ void Ebwt::buildToDisk(InorderBlockwiseSA& sa, } assert_eq(Ebwt::ftabHi(ftab, eftab, len, ftabLen, eftabLen, ftabLen-1), len+1); // Write ftab to primary file - for(uint32_t i = 0; i < ftabLen; i++) { - writeU32(out1, ftab[i], this->toBe()); + for(TIndexOffU i = 0; i < ftabLen; i++) { + writeU(out1, ftab[i], this->toBe()); } // Write eftab to primary file - for(uint32_t i = 0; i < eftabLen; i++) { - writeU32(out1, eftab[i], this->toBe()); + for(TIndexOffU i = 0; i < eftabLen; i++) { + writeU(out1, eftab[i], this->toBe()); } // Write isa to primary file if(isaSample != NULL) { ASSERT_ONLY(Bitset sawISA(eh._len+1)); - for(uint32_t i = 0; i < eh._isaLen; i++) { - uint32_t s = isaSample[i]; + for(TIndexOffU i = 0; i < eh._isaLen; i++) { + TIndexOffU s = isaSample[i]; assert_leq(s, eh._len); assert(!sawISA.test(s)); ASSERT_ONLY(sawISA.set(s)); - writeU32(out2, s, this->toBe()); + writeU(out2, s, this->toBe()); } delete[] isaSample; } diff --git a/ebwt_build.cpp b/ebwt_build.cpp index e5537f2..095fa25 100644 --- a/ebwt_build.cpp +++ b/ebwt_build.cpp @@ -49,6 +49,7 @@ static int reverseType; static string wrapper; bool color; + static void resetOptions() { verbose = true; // be talkative (default) sanityCheck = 0; // do slow sanity checks diff --git a/filebuf.h b/filebuf.h index 8cbe368..1b19ab9 100644 --- a/filebuf.h +++ b/filebuf.h @@ -561,7 +561,7 @@ class OutFileBuf { const char *name_; FILE *out_; - uint32_t cur_; + size_t cur_; char buf_[BUF_SZ]; // (large) input buffer bool closed_; }; diff --git a/multikey_qsort.h b/multikey_qsort.h index 62decd2..987499c 100644 --- a/multikey_qsort.h +++ b/multikey_qsort.h @@ -9,6 +9,7 @@ #include "alphabet.h" #include "assert_helpers.h" #include "diff_sample.h" +#include "btypes.h" using namespace std; using namespace seqan; @@ -226,7 +227,7 @@ static inline void vecswap2(TVal* s, size_t slen, TVal* s2, TPos i, TPos j, TPos */ template bool assertPartitionedSuf(const THost& host, - uint32_t *s, + TIndexOffU *s, size_t slen, int hi, int pivot, @@ -264,7 +265,7 @@ bool assertPartitionedSuf(const THost& host, */ template bool assertPartitionedSuf2(const THost& host, - uint32_t *s, + TIndexOffU *s, size_t slen, int hi, int pivot, @@ -296,7 +297,7 @@ bool assertPartitionedSuf2(const THost& host, * 'host' is a seemingly legitimate suffix-offset list (at this time, * we just check that it doesn't list any suffix twice). */ -static void sanityCheckInputSufs(uint32_t *s, size_t slen) { +static void sanityCheckInputSufs(TIndexOffU *s, size_t slen) { assert_gt(slen, 0); for(size_t i = 0; i < slen; i++) { // Actually, it's convenient to allow the caller to provide @@ -316,19 +317,19 @@ static void sanityCheckInputSufs(uint32_t *s, size_t slen) { template void sanityCheckOrderedSufs(const T& host, size_t hlen, - uint32_t *s, + TIndexOffU *s, size_t slen, size_t upto, uint32_t lower = 0, - uint32_t upper = 0xffffffff) + uint32_t upper = OFF_MASK) { assert_lt(s[0], hlen); - upper = min(upper, slen-1); + upper = min(upper, slen-1); for(size_t i = lower; i < upper; i++) { // Allow s[i+t] to point off the end of the string; this is // convenient for some callers if(s[i+1] >= hlen) continue; - if(upto == 0xffffffff) { + if(upto == OFF_MASK) { assert(dollarLt(suffix(host, s[i]), suffix(host, s[i+1]))); } else { #ifndef NDEBUG @@ -367,13 +368,13 @@ void sanityCheckOrderedSufs(const T& host, template void mkeyQSortSuf(const T& host, size_t hlen, - uint32_t *s, + TIndexOffU *s, size_t slen, int hi, size_t begin, size_t end, size_t depth, - size_t upto = 0xffffffff) + size_t upto = OFF_MASK) { // Helper for making the recursive call; sanity-checks arguments to // make sure that the problem actually got smaller. @@ -455,12 +456,12 @@ void mkeyQSortSuf(const T& host, */ template void mkeyQSortSuf(const T& host, - uint32_t *s, + TIndexOffU *s, size_t slen, int hi, bool verbose = false, bool sanityCheck = false, - size_t upto = 0xffffffff) + size_t upto = OFF_MASK) { size_t hlen = length(host); assert(!empty(s)); @@ -479,14 +480,14 @@ void mkeyQSortSuf(const T& host, template void mkeyQSortSuf2(const T& host, size_t hlen, - uint32_t *s, + TIndexOffU *s, size_t slen, - uint32_t *s2, + TIndexOffU *s2, int hi, size_t begin, size_t end, size_t depth, - size_t upto = 0xffffffff) + size_t upto = OFF_MASK) { // Helper for making the recursive call; sanity-checks arguments to // make sure that the problem actually got smaller. @@ -570,20 +571,20 @@ void mkeyQSortSuf2(const T& host, */ template void mkeyQSortSuf2(const T& host, - uint32_t *s, + TIndexOffU *s, size_t slen, - uint32_t *s2, + TIndexOffU *s2, int hi, bool verbose = false, bool sanityCheck = false, - size_t upto = 0xffffffff) + size_t upto = OFF_MASK) { size_t hlen = length(host); if(sanityCheck) sanityCheckInputSufs(s, slen); - uint32_t *sOrig = NULL; + TIndexOffU *sOrig = NULL; if(sanityCheck) { - sOrig = new uint32_t[slen]; - memcpy(sOrig, s, 4 * slen); + sOrig = new TIndexOffU[slen]; + memcpy(sOrig, s, OFF_SIZE * slen); } mkeyQSortSuf2(host, hlen, s, slen, s2, hi, (size_t)0, slen, (size_t)0, upto); if(sanityCheck) { @@ -633,7 +634,7 @@ bool sufDcLt(const T1& host, template inline void qsortSufDc(const T& host, size_t hlen, - uint32_t* s, + TIndexOffU* s, size_t slen, const DifferenceCoverSample& dc, size_t begin, @@ -676,7 +677,7 @@ template void mkeyQSortSufDcU8(const T1& seqanHost, const T2& host, size_t hlen, - uint32_t* s, + TIndexOffU* s, size_t slen, const DifferenceCoverSample& dc, int hi, @@ -685,7 +686,7 @@ void mkeyQSortSufDcU8(const T1& seqanHost, { if(sanityCheck) sanityCheckInputSufs(s, slen); mkeyQSortSufDcU8(seqanHost, host, hlen, s, slen, dc, hi, 0, slen, 0, sanityCheck); - if(sanityCheck) sanityCheckOrderedSufs(seqanHost, hlen, s, slen, 0xffffffff); + if(sanityCheck) sanityCheckOrderedSufs(seqanHost, hlen, s, slen, OFF_MASK); } /** @@ -696,13 +697,13 @@ template inline bool sufDcLtU8(const T1& seqanHost, const T2& host, size_t hlen, - uint32_t s1, - uint32_t s2, + size_t s1, + size_t s2, const DifferenceCoverSample& dc, bool sanityCheck = false) { hlen += 0; - uint32_t diff = dc.tieBreakOff(s1, s2); + size_t diff = dc.tieBreakOff((TIndexOffU)s1, (TIndexOffU)s2); assert_lt(diff, dc.v()); assert_lt(diff, hlen-s1); assert_lt(diff, hlen-s2); @@ -730,7 +731,7 @@ template inline void qsortSufDcU8(const T1& seqanHost, const T2& host, size_t hlen, - uint32_t* s, + TIndexOffU* s, size_t slen, const DifferenceCoverSample& dc, size_t begin, @@ -772,7 +773,7 @@ void qsortSufDcU8(const T1& seqanHost, #define SELECTION_SORT_CUTOFF 6 // 5 64-element buckets for bucket-sorting A, C, G, T, $ -static uint32_t bkts[4][4 * 1024 * 1024]; +static TIndexOffU bkts[4][4 * 1024 * 1024]; /** * Straightforwardly obtain a uint8_t-ized version of t[off]. This @@ -813,7 +814,7 @@ static void selectionSortSufDcU8( const T1& seqanHost, const T2& host, size_t hlen, - uint32_t* s, + TIndexOffU* s, size_t slen, const DifferenceCoverSample& dc, uint8_t hi, @@ -840,16 +841,16 @@ static void selectionSortSufDcU8( if(off + s[begin] >= hlen || off + s[begin+1] >= hlen) { - off = 0xffffffff; + off = OFF_MASK; } - if(off != 0xffffffff) { + if(off != OFF_MASK) { if(off < depth) { qsortSufDcU8(seqanHost, host, hlen, s, slen, dc, begin, end, sanityCheck); // It's helpful for debugging if we call this here if(sanityCheck) { sanityCheckOrderedSufs(seqanHost, hlen, s, slen, - 0xffffffff, begin, end); + OFF_MASK, begin, end); } return; } @@ -857,11 +858,11 @@ static void selectionSortSufDcU8( } } assert_leq(v, dc.v()); - uint32_t lim = v; + size_t lim = v; assert_geq(lim, 0); for(size_t i = begin; i < end-1; i++) { - uint32_t targ = i; - uint32_t targoff = depth + s[i]; + size_t targ = i; + size_t targoff = depth + s[i]; for(size_t j = i+1; j < end; j++) { assert_neq(j, targ); uint32_t joff = depth + s[j]; @@ -924,7 +925,7 @@ static void selectionSortSufDcU8( if(i != targ) { ASSERT_SUF_LT(targ, i); // swap i and targ - uint32_t tmp = s[i]; + TIndexOffU tmp = s[i]; s[i] = s[targ]; s[targ] = tmp; } @@ -934,7 +935,7 @@ static void selectionSortSufDcU8( } if(sanityCheck) { sanityCheckOrderedSufs(seqanHost, hlen, s, slen, - 0xffffffff, begin, end); + OFF_MASK, begin, end); } } @@ -943,7 +944,7 @@ static void bucketSortSufDcU8( const T1& seqanHost, const T2& host, size_t hlen, - uint32_t* s, + TIndexOffU* s, size_t slen, const DifferenceCoverSample& dc, uint8_t hi, @@ -974,7 +975,7 @@ static void bucketSortSufDcU8( begin, end, depth, sanityCheck); if(sanityCheck) { sanityCheckOrderedSufs(seqanHost, hlen, s, slen, - 0xffffffff, begin, end); + OFF_MASK, begin, end); } return; } @@ -989,11 +990,11 @@ static void bucketSortSufDcU8( } } assert_eq(cnts[0] + cnts[1] + cnts[2] + cnts[3] + cnts[4], end - begin); - uint32_t cur = begin + cnts[0]; - if(cnts[1] > 0) { memcpy(&s[cur], bkts[0], cnts[1] << 2); cur += cnts[1]; } - if(cnts[2] > 0) { memcpy(&s[cur], bkts[1], cnts[2] << 2); cur += cnts[2]; } - if(cnts[3] > 0) { memcpy(&s[cur], bkts[2], cnts[3] << 2); cur += cnts[3]; } - if(cnts[4] > 0) { memcpy(&s[cur], bkts[3], cnts[4] << 2); } + size_t cur = begin + cnts[0]; + if(cnts[1] > 0) { memcpy(&s[cur], bkts[0], cnts[1] << (OFF_SIZE/4 + 1)); cur += cnts[1]; } + if(cnts[2] > 0) { memcpy(&s[cur], bkts[1], cnts[2] << (OFF_SIZE/4 + 1)); cur += cnts[2]; } + if(cnts[3] > 0) { memcpy(&s[cur], bkts[2], cnts[3] << (OFF_SIZE/4 + 1)); cur += cnts[3]; } + if(cnts[4] > 0) { memcpy(&s[cur], bkts[3], cnts[4] << (OFF_SIZE/4 + 1)); } // This frame is now totally finished with bkts[][], so recursive // callees can safely clobber it; we're not done with cnts[], but // that's local to the stack frame. @@ -1036,7 +1037,7 @@ template void mkeyQSortSufDcU8(const T1& seqanHost, const T2& host, size_t hlen, - uint32_t* s, + TIndexOffU* s, size_t slen, const DifferenceCoverSample& dc, int hi, @@ -1061,7 +1062,7 @@ void mkeyQSortSufDcU8(const T1& seqanHost, // k=(end-begin) qsortSufDcU8(seqanHost, host, hlen, s, slen, dc, begin, end, sanityCheck); if(sanityCheck) { - sanityCheckOrderedSufs(seqanHost, hlen, s, slen, 0xffffffff, begin, end); + sanityCheckOrderedSufs(seqanHost, hlen, s, slen, OFF_MASK, begin, end); } return; } @@ -1070,7 +1071,7 @@ void mkeyQSortSufDcU8(const T1& seqanHost, bucketSortSufDcU8(seqanHost, host, hlen, s, slen, dc, (uint8_t)hi, begin, end, depth, sanityCheck); if(sanityCheck) { - sanityCheckOrderedSufs(seqanHost, hlen, s, slen, 0xffffffff, begin, end); + sanityCheckOrderedSufs(seqanHost, hlen, s, slen, OFF_MASK, begin, end); } return; } diff --git a/random_source.h b/random_source.h index 65ec646..fd99927 100644 --- a/random_source.h +++ b/random_source.h @@ -33,6 +33,23 @@ class RandomSource { return ret; } + uint64_t nextU64() { + assert(inited_); + uint64_t first = nextU32(); + first = first << 32; + uint64_t second = nextU32(); + return first | second; + } + + + size_t nextSizeT() { + if(sizeof(size_t) == 4) { + return nextU32(); + } else { + return nextU64(); + } + } + uint32_t nextU2() { assert(inited_); if(lastOff > 30) { diff --git a/ref_read.cpp b/ref_read.cpp index efb9e3f..63d8490 100644 --- a/ref_read.cpp +++ b/ref_read.cpp @@ -15,7 +15,7 @@ RefRecord fastaRefReadSize(FileBuf& in, static int lastc = '>'; // last character seen // RefRecord params - size_t len = 0; // 'len' counts toward total length + TIndexOffU len = 0; // 'len' counts toward total length // 'off' counts number of ambiguous characters before first // unambiguous character size_t off = 0; @@ -58,7 +58,7 @@ RefRecord fastaRefReadSize(FileBuf& in, // Don't emit a warning, since this might legitimately be // a gap on the end of the final sequence in the file lastc = -1; - return RefRecord(off, len, first); + return RefRecord((TIndexOffU)off, (TIndexOffU)len, first); } } @@ -95,7 +95,7 @@ RefRecord fastaRefReadSize(FileBuf& in, cerr << "Warning: Encountered empty reference sequence" << endl; } lastc = '>'; - return RefRecord(off, 0, first); + return RefRecord((TIndexOffU)off, 0, first); } c = in.get(); if(c == -1) { @@ -106,7 +106,7 @@ RefRecord fastaRefReadSize(FileBuf& in, cerr << "Warning: Encountered empty reference sequence" << endl; } lastc = -1; - return RefRecord(off, 0, first); + return RefRecord((TIndexOffU)off, 0, first); } } assert(!rparms.color || (lc != -1)); @@ -145,7 +145,7 @@ RefRecord fastaRefReadSize(FileBuf& in, // It's an N or a gap lastc = c; assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T'); - return RefRecord(off, len, first); + return RefRecord((TIndexOffU)off, (TIndexOffU)len, first); } else { // Not DNA and not a gap, ignore it #ifndef NDEBUG @@ -162,7 +162,7 @@ RefRecord fastaRefReadSize(FileBuf& in, c = in.get(); } lastc = c; - return RefRecord(off, len, first); + return RefRecord((TIndexOffU)off, (TIndexOffU)len, first); } static void @@ -233,10 +233,10 @@ fastaRefReadSizes(vector& in, vector& plens, const RefReadInParams& rparms, BitpairOutFileBuf* bpout, - int& numSeqs) + TIndexOff& numSeqs) { TIndexOffU unambigTot = 0; - uint32_t bothTot = 0; + size_t bothTot = 0; assert_gt(in.size(), 0); uint32_t both = 0, unambig = 0; // For each input istream diff --git a/ref_read.h b/ref_read.h index 4da766e..27cb7b0 100644 --- a/ref_read.h +++ b/ref_read.h @@ -13,10 +13,40 @@ #include "assert_helpers.h" #include "filebuf.h" #include "word_io.h" +#include "endian_swap.h" using namespace std; using namespace seqan; +class RefTooLongException : public exception { + +public: + RefTooLongException() { +#ifdef BOWTIE_64BIT_INDEX + // This should never happen! + msg = "Error: Reference sequence has more than 2^64-1 characters! " + "Please divide the reference into smaller chunks and index each " + "independently."; +#else + msg = "Error: Reference sequence has more than 2^32-1 characters! " + "Please build a large index by passing the --large-index option " + "to bowtie2-build"; +#endif + } + + ~RefTooLongException() throw() {} + + const char* what() const throw() { + return msg.c_str(); + } + +protected: + + string msg; + +}; + + /** * Encapsulates a stretch of the reference containing only unambiguous * characters. From an ordered list of RefRecords, one can (almost) @@ -26,28 +56,28 @@ using namespace seqan; */ struct RefRecord { RefRecord() : off(), len(), first() { } - RefRecord(uint32_t _off, uint32_t _len, bool _first) : + RefRecord(TIndexOffU _off, TIndexOffU _len, bool _first) : off(_off), len(_len), first(_first) { } RefRecord(FILE *in, bool swap) { assert(in != NULL); - if(!fread(&off, 4, 1, in)) { + if(!fread(&off, OFF_SIZE, 1, in)) { cerr << "Error reading RefRecord offset from FILE" << endl; throw 1; } - if(swap) off = endianSwapU32(off); - if(!fread(&len, 4, 1, in)) { + if(swap) off = endianSwapU(off); + if(!fread(&len, OFF_SIZE, 1, in)) { cerr << "Error reading RefRecord offset from FILE" << endl; throw 1; } - if(swap) len = endianSwapU32(len); + if(swap) len = endianSwapU(len); first = fgetc(in) ? true : false; } RefRecord(int in, bool swap) { - off = readU32(in, swap); - len = readU32(in, swap); + off = readU(in, swap); + len = readU(in, swap); char c; if(!read(in, &c, 1)) { cerr << "Error reading RefRecord 'first' flag" << endl; @@ -57,13 +87,13 @@ struct RefRecord { } void write(std::ostream& out, bool be) { - writeU32(out, off, be); - writeU32(out, len, be); + writeU(out, off, be); + writeU(out, len, be); out.put(first ? 1 : 0); } - uint32_t off; /// Offset of the first character in the record - uint32_t len; /// Length of the record + TIndexOffU off; /// Offset of the first character in the record + TIndexOffU len; /// Length of the record bool first; /// Whether this record is the first for a reference sequence }; @@ -103,7 +133,7 @@ fastaRefReadSizes( vector& plens, const RefReadInParams& rparms, BitpairOutFileBuf* bpout, - int& numSeqs); + TIndexOff& numSeqs); extern void reverseRefRecords( @@ -279,7 +309,7 @@ static RefRecord fastaRefReadAppend(FileBuf& in, } } } - return RefRecord(off, len, first); + return RefRecord((TIndexOffU)off, (TIndexOffU)len, first); } #endif /*ndef REF_READ_H_*/ diff --git a/reference.h b/reference.h index 7425801..bc1c7de 100644 --- a/reference.h +++ b/reference.h @@ -6,6 +6,8 @@ #include "mm.h" #include "shmem.h" #include "timer.h" +#include "btypes.h" + /** * Concrete reference representation that bulk-loads the reference from @@ -51,8 +53,8 @@ class BitPairReference { useShmem_(useShmem), verbose_(verbose) { - string s3 = in + ".3.ebwt"; - string s4 = in + ".4.ebwt"; + string s3 = in + ".3." + gEbwt_ext; + string s4 = in + ".4." + gEbwt_ext; #ifdef BOWTIE_MM int f3, f4; @@ -89,9 +91,9 @@ class BitPairReference { throw 1; } if(mmSweep) { - int sum = 0; + TIndexOff sum = 0; for(off_t i = 0; i < sbuf.st_size; i += 1024) { - sum += (int) mmFile[i]; + sum += (TIndexOff) mmFile[i]; } if(startVerbose) { cerr << " Swept the memory-mapped ref index file; checksum: " << sum << ": "; @@ -119,7 +121,7 @@ class BitPairReference { // Read endianness sentinel, set 'swap' uint32_t one; bool swap = false; - one = readU32(f3, swap); + one = readU(f3, swap); if(one != 1) { if(useMm_) { cerr << "Error: Can't use memory-mapped files when the index is the opposite endianness" << endl; @@ -130,8 +132,8 @@ class BitPairReference { } // Read # records - uint32_t sz; - sz = readU32(f3, swap); + TIndexOffU sz; + sz = readU(f3, swap); if(sz == 0) { cerr << "Error: number of reference records is 0 in " << s3 << endl; throw 1; @@ -144,12 +146,12 @@ class BitPairReference { // Cumulative count of all unambiguous characters on a per- // stretch 8-bit alignment (i.e. count of bytes we need to // allocate in buf_) - uint32_t cumsz = 0; - uint32_t cumlen = 0; - uint32_t unambiglen = 0; - uint32_t maxlen = 0; + TIndexOffU cumsz = 0; + TIndexOffU cumlen = 0; + TIndexOffU unambiglen = 0; + TIndexOffU maxlen = 0; // For each unambiguous stretch... - for(uint32_t i = 0; i < sz; i++) { + for(TIndexOffU i = 0; i < sz; i++) { recs_.push_back(RefRecord(f3, swap)); } for(uint32_t i = 0; i < sz; i++) { @@ -222,7 +224,7 @@ class BitPairReference { logTime(cerr); } // Store a cap entry for the end of the last reference seq - refRecOffs_.push_back(recs_.size()); + refRecOffs_.push_back((TIndexOffU)recs_.size()); refOffs_.push_back(cumsz); if(unambiglen > 0 && (!color || maxlen > 1)) { refApproxLens_.push_back(cumlen); @@ -418,27 +420,27 @@ class BitPairReference { * unambiguous stretches of the target reference sequence. When * there are many records, binary search would be more appropriate. */ - int getBase(uint32_t tidx, uint32_t toff) const { - uint32_t reci = refRecOffs_[tidx]; // first record for target reference sequence - uint32_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq + int getBase(size_t tidx, size_t toff) const { + uint64_t reci = refRecOffs_[tidx]; // first record for target reference sequence + uint64_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq assert_gt(recf, reci); - uint32_t bufOff = refOffs_[tidx]; - uint32_t off = 0; + uint64_t bufOff = refOffs_[tidx]; + uint64_t off = 0; // For all records pertaining to the target reference sequence... - for(uint32_t i = reci; i < recf; i++) { + for(uint64_t i = reci; i < recf; i++) { assert_geq(toff, off); off += recs_[i].off; if(toff < off) { return 4; } assert_geq(toff, off); - uint32_t recOff = off + recs_[i].len; + uint64_t recOff = off + recs_[i].len; if(toff < recOff) { toff -= off; bufOff += toff; assert_lt(bufOff, bufSz_); - const uint32_t bufElt = (bufOff) >> 2; - const uint32_t shift = (bufOff & 3) << 1; + const uint64_t bufElt = (bufOff) >> 2; + const uint64_t shift = (bufOff & 3) << 1; return ((buf_[bufElt] >> shift) & 3); } bufOff += recs_[i].len; @@ -456,19 +458,19 @@ class BitPairReference { * there are many records, binary search would be more appropriate. */ int getStretchNaive(uint32_t *destU32, - uint32_t tidx, - uint32_t toff, - uint32_t count) const + size_t tidx, + size_t toff, + size_t count) const { uint8_t *dest = (uint8_t*)destU32; - uint32_t reci = refRecOffs_[tidx]; // first record for target reference sequence - uint32_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq + uint64_t reci = refRecOffs_[tidx]; // first record for target reference sequence + uint64_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq assert_gt(recf, reci); - uint32_t cur = 0; - uint32_t bufOff = refOffs_[tidx]; - uint32_t off = 0; + uint64_t cur = 0; + uint64_t bufOff = refOffs_[tidx]; + uint64_t off = 0; // For all records pertaining to the target reference sequence... - for(uint32_t i = reci; i < recf; i++) { + for(uint64_t i = reci; i < recf; i++) { assert_geq(toff, off); off += recs_[i].off; for(; toff < off && count > 0; toff++) { @@ -478,15 +480,15 @@ class BitPairReference { if(count == 0) break; assert_geq(toff, off); if(toff < off + recs_[i].len) { - bufOff += (toff - off); // move bufOff pointer forward + bufOff += (TIndexOffU)(toff - off); // move bufOff pointer forward } else { bufOff += recs_[i].len; } off += recs_[i].len; for(; toff < off && count > 0; toff++) { assert_lt(bufOff, bufSz_); - const uint32_t bufElt = (bufOff) >> 2; - const uint32_t shift = (bufOff & 3) << 1; + const uint64_t bufElt = (bufOff) >> 2; + const uint64_t shift = (bufOff & 3) << 1; dest[cur++] = (buf_[bufElt] >> shift) & 3; bufOff++; count--; @@ -512,12 +514,12 @@ class BitPairReference { * there are many records, binary search would be more appropriate. */ int getStretch(uint32_t *destU32, - uint32_t tidx, - uint32_t toff, - uint32_t count) const + size_t tidx, + size_t toff, + size_t count) const { - ASSERT_ONLY(uint32_t origCount = count); - ASSERT_ONLY(uint32_t origToff = toff); + ASSERT_ONLY(size_t origCount = count); + ASSERT_ONLY(size_t origToff = toff); if(count == 0) return 0; uint8_t *dest = (uint8_t*)destU32; #ifndef NDEBUG @@ -526,22 +528,22 @@ class BitPairReference { uint8_t *dest_2 = ((uint8_t*)destU32_2) + off2; #endif destU32[0] = 0x04040404; // Add Ns, which we might end up using later - uint32_t reci = refRecOffs_[tidx]; // first record for target reference sequence - uint32_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq + uint64_t reci = refRecOffs_[tidx]; // first record for target reference sequence + uint64_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq assert_gt(recf, reci); - uint32_t cur = 4; // keep a cushion of 4 bases at the beginning - uint32_t bufOff = refOffs_[tidx]; - uint32_t off = 0; - int offset = 4; + uint64_t cur = 4; // keep a cushion of 4 bases at the beginning + uint64_t bufOff = refOffs_[tidx]; + uint64_t off = 0; + int64_t offset = 4; bool firstStretch = true; // For all records pertaining to the target reference sequence... - for(uint32_t i = reci; i < recf; i++) { - ASSERT_ONLY(uint32_t origBufOff = bufOff); + for(uint64_t i = reci; i < recf; i++) { + ASSERT_ONLY(uint64_t origBufOff = bufOff); assert_geq(toff, off); off += recs_[i].off; assert_gt(count, 0); if(toff < off) { - uint32_t cpycnt = min(off - toff, count); + size_t cpycnt = min((size_t)(off - toff), count); memset(&dest[cur], 4, cpycnt); count -= cpycnt; toff += cpycnt; @@ -564,29 +566,29 @@ class BitPairReference { if(cur & 3) { offset -= (cur & 3); } - uint32_t curU32 = cur >> 2; + uint64_t curU32 = cur >> 2; // Do the initial few bases if(bufOff & 3) { - const uint32_t bufElt = (bufOff) >> 2; - const int low2 = bufOff & 3; + const uint64_t bufElt = (bufOff) >> 2; + const int64_t low2 = bufOff & 3; destU32[curU32] = byteToU32_[buf_[bufElt]]; for(int j = 0; j < low2; j++) { ((char *)(&destU32[curU32]))[j] = 4; } curU32++; offset += low2; - const int chars = 4 - low2; + const int64_t chars = 4 - low2; count -= chars; bufOff += chars; toff += chars; } assert_eq(0, bufOff & 3); - uint32_t bufOffU32 = bufOff >> 2; - uint32_t countLim = count >> 2; - uint32_t offLim = (off - (toff + 4)) >> 2; - uint32_t lim = min(countLim, offLim); + uint64_t bufOffU32 = bufOff >> 2; + uint64_t countLim = count >> 2; + uint64_t offLim = (off - (toff + 4)) >> 2; + uint64_t lim = min(countLim, offLim); // Do the fast thing for as far as possible - for(uint32_t j = 0; j < lim; j++) { + for(uint64_t j = 0; j < lim; j++) { destU32[curU32] = byteToU32_[buf_[bufOffU32++]]; assert_eq(dest[(curU32 << 2) + 0], dest_2[(curU32 << 2) - offset + 0]); assert_eq(dest[(curU32 << 2) + 1], dest_2[(curU32 << 2) - offset + 1]); @@ -604,8 +606,8 @@ class BitPairReference { // Do the slow thing for the rest for(; toff < off && count > 0; toff++) { assert_lt(bufOff, bufSz_); - const uint32_t bufElt = (bufOff) >> 2; - const uint32_t shift = (bufOff & 3) << 1; + const uint64_t bufElt = (bufOff) >> 2; + const uint64_t shift = (bufOff & 3) << 1; dest[cur++] = (buf_[bufElt] >> shift) & 3; bufOff++; count--; @@ -615,8 +617,8 @@ class BitPairReference { // Do the slow thing for(; toff < off && count > 0; toff++) { assert_lt(bufOff, bufSz_); - const uint32_t bufElt = (bufOff) >> 2; - const uint32_t shift = (bufOff & 3) << 1; + const uint64_t bufElt = (bufOff) >> 2; + const uint64_t shift = (bufOff & 3) << 1; dest[cur++] = (buf_[bufElt] >> shift) & 3; bufOff++; count--; @@ -637,11 +639,11 @@ class BitPairReference { #ifndef NDEBUG delete[] destU32_2; #endif - return offset; + return (int)offset; } /// Return the number of reference sequences. - uint32_t numRefs() const { + TIndexOffU numRefs() const { return nrefs_; } @@ -668,7 +670,7 @@ class BitPairReference { } /// Return the lengths of reference sequences. - uint32_t approxLen(uint32_t elt) const { + TIndexOffU approxLen(TIndexOffU elt) const { assert_lt(elt, nrefs_); return refApproxLens_[elt]; } @@ -702,17 +704,17 @@ class BitPairReference { std::vector recs_; /// records describing unambiguous stretches std::vector refApproxLens_; /// approx lens of ref seqs (excludes trailing ambig chars) - std::vector refLens_; /// approx lens of ref seqs (excludes trailing ambig chars) - std::vector refOffs_; /// buf_ begin offsets per ref seq - std::vector refRecOffs_; /// record begin/end offsets per ref seq + std::vector refLens_; /// approx lens of ref seqs (excludes trailing ambig chars) + std::vector refOffs_; /// buf_ begin offsets per ref seq + std::vector refRecOffs_; /// record begin/end offsets per ref seq std::vector expandIdx_; /// map from small idxs (e.g. w/r/t plen) to large ones (w/r/t refnames) std::vector shrinkIdx_; /// map from large idxs to small std::vector isGaps_; /// ref i is all gaps? uint8_t *buf_; /// the whole reference as a big bitpacked byte array uint8_t *sanityBuf_;/// for sanity-checking buf_ - uint32_t bufSz_; /// size of buf_ - uint32_t bufAllocSz_; - uint32_t nrefs_; /// the number of reference sequences + TIndexOffU bufSz_; /// size of buf_ + TIndexOffU bufAllocSz_; + TIndexOffU nrefs_; /// the number of reference sequences uint32_t nNoGapRefs_; /// the number of reference sequences that aren't totally ambiguous bool loaded_; /// whether it's loaded bool sanity_; /// do sanity checking diff --git a/scripts/best_verify.pl b/scripts/best_verify.pl old mode 100644 new mode 100755 diff --git a/scripts/bs_mapability.pl b/scripts/bs_mapability.pl old mode 100644 new mode 100755 diff --git a/scripts/build_test.sh b/scripts/build_test.sh old mode 100644 new mode 100755 diff --git a/scripts/colorize_fasta.pl b/scripts/colorize_fasta.pl old mode 100644 new mode 100755 diff --git a/scripts/colorize_fastq.pl b/scripts/colorize_fastq.pl old mode 100644 new mode 100755 diff --git a/scripts/convert_quals.pl b/scripts/convert_quals.pl old mode 100644 new mode 100755 diff --git a/scripts/fastq_to_tabbed.pl b/scripts/fastq_to_tabbed.pl old mode 100644 new mode 100755 diff --git a/scripts/gen_dnamasks2colormask.pl b/scripts/gen_dnamasks2colormask.pl old mode 100644 new mode 100755 diff --git a/scripts/gen_solqual_lookup.pl b/scripts/gen_solqual_lookup.pl old mode 100644 new mode 100755 diff --git a/scripts/infer_fraglen.pl b/scripts/infer_fraglen.pl old mode 100644 new mode 100755 diff --git a/scripts/make_c_elegans.sh b/scripts/make_c_elegans.sh old mode 100644 new mode 100755 diff --git a/scripts/make_canFam2.sh b/scripts/make_canFam2.sh old mode 100644 new mode 100755 diff --git a/scripts/make_d_melanogaster.sh b/scripts/make_d_melanogaster.sh old mode 100644 new mode 100755 diff --git a/scripts/make_e_coli.sh b/scripts/make_e_coli.sh old mode 100644 new mode 100755 diff --git a/scripts/make_hg18.sh b/scripts/make_hg18.sh old mode 100644 new mode 100755 diff --git a/scripts/make_hg19.sh b/scripts/make_hg19.sh old mode 100644 new mode 100755 diff --git a/scripts/make_mm9.sh b/scripts/make_mm9.sh old mode 100644 new mode 100755 diff --git a/scripts/make_rn4.sh b/scripts/make_rn4.sh old mode 100644 new mode 100755 diff --git a/scripts/make_s_cerevisiae.sh b/scripts/make_s_cerevisiae.sh old mode 100644 new mode 100755 diff --git a/scripts/mapability.pl b/scripts/mapability.pl old mode 100644 new mode 100755 diff --git a/scripts/pe_verify.pl b/scripts/pe_verify.pl old mode 100644 new mode 100755 diff --git a/scripts/random_bowtie_tests.pl b/scripts/random_bowtie_tests.pl old mode 100644 new mode 100755 diff --git a/scripts/random_bowtie_tests.sh b/scripts/random_bowtie_tests.sh old mode 100644 new mode 100755 diff --git a/scripts/random_bowtie_tests_p.sh b/scripts/random_bowtie_tests_p.sh old mode 100644 new mode 100755 diff --git a/scripts/reconcile_alignments.pl b/scripts/reconcile_alignments.pl old mode 100644 new mode 100755 diff --git a/scripts/reconcile_alignments_pe.pl b/scripts/reconcile_alignments_pe.pl old mode 100644 new mode 100755 diff --git a/shmem.h b/shmem.h index 1765d15..42efa99 100644 --- a/shmem.h +++ b/shmem.h @@ -18,11 +18,13 @@ #include #include #include "str_util.h" +#include "btypes.h" extern void notifySharedMem(void *mem, size_t len); extern void waitSharedMem(void *mem, size_t len); +#define ALLOC_SHARED_U allocSharedMem #define ALLOC_SHARED_U8 allocSharedMem #define ALLOC_SHARED_U32 allocSharedMem #define FREE_SHARED shmdt @@ -135,6 +137,7 @@ bool allocSharedMem(std::string fname, #else +#define ALLOC_SHARED_U(...) 0 #define ALLOC_SHARED_U8(...) 0 #define ALLOC_SHARED_U32(...) 0 #define FREE_SHARED(...) diff --git a/zbox.h b/zbox.h index 6f175dd..152f909 100644 --- a/zbox.h +++ b/zbox.h @@ -9,8 +9,8 @@ */ template void calcZ(const T& s, - uint32_t off, - String& z, + TIndexOffU off, + String& z, bool verbose = false, bool sanityCheck = false) { @@ -27,7 +27,7 @@ void calcZ(const T& s, // compare starting at k with prefix starting at 0 size_t ki = k; while(off+ki < length(s) && s[off+ki] == s[off+ki-k]) ki++; - z[k] = ki - k; + z[k] = (TIndexOffU)(ki - k); assert_lt(off+z[k], slen); if(z[k] > 0) { lCur = k; @@ -45,7 +45,7 @@ void calcZ(const T& s, } else if (z[kPrime] > 0) { int q = 0; while (off+q+rCur+1 < length(s) && s[off+q+rCur+1] == s[off+betaLen+q]) q++; - z[k] = betaLen + q; + z[k] = (TIndexOffU)(betaLen + q); assert_lt(off+z[k], slen); rCur = rCur + q; assert_geq(k, lCur);