Permalink
Cannot retrieve contributors at this time
Fetching contributors…
| #ifndef EBWT_H_ | |
| #define EBWT_H_ | |
| #include <stdint.h> | |
| #include <string.h> | |
| #include <algorithm> | |
| #include <iostream> | |
| #include <fstream> | |
| #include <vector> | |
| #include <set> | |
| #include <sstream> | |
| #include <fcntl.h> | |
| #include <errno.h> | |
| #include <stdexcept> | |
| #include <seqan/sequence.h> | |
| #include <seqan/index.h> | |
| #include <sys/stat.h> | |
| #ifdef BOWTIE_MM | |
| #include <sys/mman.h> | |
| #include <sys/shm.h> | |
| #endif | |
| #include "auto_array.h" | |
| #include "shmem.h" | |
| #include "alphabet.h" | |
| #include "assert_helpers.h" | |
| #include "bitpack.h" | |
| #include "blockwise_sa.h" | |
| #include "endian_swap.h" | |
| #include "word_io.h" | |
| #include "random_source.h" | |
| #include "hit.h" | |
| #include "ref_read.h" | |
| #include "threading.h" | |
| #include "bitset.h" | |
| #include "str_util.h" | |
| #include "mm.h" | |
| #include "timer.h" | |
| #include "refmap.h" | |
| #include "color_dec.h" | |
| #include "reference.h" | |
| #ifdef POPCNT_CAPABILITY | |
| #include "processor_support.h" | |
| #endif | |
| using namespace std; | |
| using namespace seqan; | |
| #ifndef PREFETCH_LOCALITY | |
| // No locality by default | |
| #define PREFETCH_LOCALITY 2 | |
| #endif | |
| // From ccnt_lut.cpp, automatically generated by gen_lookup_tables.pl | |
| extern uint8_t cCntLUT_4[4][4][256]; | |
| static const uint64_t c_table[4] = { | |
| 0xffffffffffffffffllu, | |
| 0xaaaaaaaaaaaaaaaallu, | |
| 0x5555555555555555llu, | |
| 0x0000000000000000llu | |
| }; | |
| #ifndef VMSG_NL | |
| #define VMSG_NL(args...) \ | |
| if(this->verbose()) { \ | |
| stringstream tmp; \ | |
| tmp << args << endl; \ | |
| this->verbose(tmp.str()); \ | |
| } | |
| #endif | |
| #ifndef VMSG | |
| #define VMSG(args...) \ | |
| if(this->verbose()) { \ | |
| stringstream tmp; \ | |
| tmp << args; \ | |
| this->verbose(tmp.str()); \ | |
| } | |
| #endif | |
| /** | |
| * Flags describing type of Ebwt. | |
| */ | |
| enum EBWT_FLAGS { | |
| EBWT_COLOR = 2, // true -> Ebwt is colorspace | |
| EBWT_ENTIRE_REV = 4 // true -> reverse Ebwt is the whole | |
| // concatenated string reversed, rather than | |
| // each stretch reversed | |
| }; | |
| extern string gLastIOErrMsg; | |
| inline bool is_read_err(int fdesc, ssize_t ret, size_t count){ | |
| if (ret < 0) { | |
| std::stringstream sstm; | |
| sstm << "ERRNO: " << errno << " ERR Msg:" << strerror(errno) << std::endl; | |
| gLastIOErrMsg = sstm.str(); | |
| return true; | |
| } | |
| return false; | |
| } | |
| /* Checks whether a call to fread() failed or not. */ | |
| inline bool is_fread_err(FILE* file_hd, size_t ret, size_t count){ | |
| if (ferror(file_hd)) { | |
| gLastIOErrMsg = "Error Reading File!"; | |
| return true; | |
| } | |
| return false; | |
| } | |
| /** | |
| * Extended Burrows-Wheeler transform header. This together with the | |
| * actual data arrays and other text-specific parameters defined in | |
| * class Ebwt constitute the entire Ebwt. | |
| */ | |
| class EbwtParams { | |
| public: | |
| EbwtParams() { } | |
| EbwtParams(TIndexOffU len, | |
| int32_t lineRate, | |
| int32_t linesPerSide, | |
| int32_t offRate, | |
| int32_t isaRate, | |
| int32_t ftabChars, | |
| bool color, | |
| bool entireReverse) | |
| { | |
| init(len, lineRate, linesPerSide, offRate, isaRate, ftabChars, color, entireReverse); | |
| } | |
| EbwtParams(const EbwtParams& eh) { | |
| init(eh._len, eh._lineRate, eh._linesPerSide, eh._offRate, | |
| eh._isaRate, eh._ftabChars, eh._color, eh._entireReverse); | |
| } | |
| void init(TIndexOffU len, int32_t lineRate, int32_t linesPerSide, | |
| int32_t offRate, int32_t isaRate, int32_t ftabChars, | |
| bool color, bool entireReverse) | |
| { | |
| _color = color; | |
| _entireReverse = entireReverse; | |
| _len = len; | |
| _bwtLen = _len + 1; | |
| _sz = (len+3)/4; | |
| _bwtSz = (len/4 + 1); | |
| _lineRate = lineRate; | |
| _linesPerSide = linesPerSide; | |
| _origOffRate = offRate; | |
| _offRate = offRate; | |
| _offMask = OFF_MASK << _offRate; | |
| _isaRate = isaRate; | |
| _isaMask = OFF_MASK << ((_isaRate >= 0) ? _isaRate : 0); | |
| _ftabChars = ftabChars; | |
| _eftabLen = _ftabChars*2; | |
| _eftabSz = _eftabLen*OFF_SIZE; | |
| _ftabLen = (1 << (_ftabChars*2))+1; | |
| _ftabSz = _ftabLen*OFF_SIZE; | |
| _offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate; | |
| _offsSz = (uint64_t)_offsLen*OFF_SIZE; | |
| _isaLen = (_isaRate == -1)? 0 : ((_bwtLen + (1 << _isaRate) - 1) >> _isaRate); | |
| _isaSz = _isaLen*OFF_SIZE; | |
| _lineSz = 1 << _lineRate; | |
| _sideSz = _lineSz * _linesPerSide; | |
| _sideBwtSz = _sideSz - 2*OFF_SIZE; | |
| _sideBwtLen = _sideBwtSz*4; | |
| _numSidePairs = (_bwtSz+(2*_sideBwtSz)-1)/(2*_sideBwtSz); | |
| _numSides = _numSidePairs*2; | |
| _numLines = _numSides * _linesPerSide; | |
| _ebwtTotLen = _numSidePairs * (2*_sideSz); | |
| _ebwtTotSz = _ebwtTotLen; | |
| assert(repOk()); | |
| } | |
| TIndexOffU len() const { return _len; } | |
| TIndexOffU bwtLen() const { return _bwtLen; } | |
| TIndexOffU sz() const { return _sz; } | |
| TIndexOffU bwtSz() const { return _bwtSz; } | |
| int32_t lineRate() const { return _lineRate; } | |
| int32_t linesPerSide() const { return _linesPerSide; } | |
| int32_t origOffRate() const { return _origOffRate; } | |
| int32_t offRate() const { return _offRate; } | |
| TIndexOffU offMask() const { return _offMask; } | |
| int32_t isaRate() const { return _isaRate; } | |
| uint32_t isaMask() const { return _isaMask; } | |
| int32_t ftabChars() const { return _ftabChars; } | |
| uint32_t eftabLen() const { return _eftabLen; } | |
| uint32_t eftabSz() const { return _eftabSz; } | |
| TIndexOffU ftabLen() const { return _ftabLen; } | |
| TIndexOffU ftabSz() const { return _ftabSz; } | |
| TIndexOffU offsLen() const { return _offsLen; } | |
| uint64_t offsSz() const { return _offsSz; } | |
| TIndexOffU isaLen() const { return _isaLen; } | |
| uint64_t isaSz() const { return _isaSz; } | |
| uint32_t lineSz() const { return _lineSz; } | |
| uint32_t sideSz() const { return _sideSz; } | |
| uint32_t sideBwtSz() const { return _sideBwtSz; } | |
| uint32_t sideBwtLen() const { return _sideBwtLen; } | |
| uint32_t numSidePairs() const { return _numSidePairs; } /* check */ | |
| TIndexOffU numSides() const { return _numSides; } | |
| TIndexOffU numLines() const { return _numLines; } | |
| TIndexOffU ebwtTotLen() const { return _ebwtTotLen; } | |
| TIndexOffU ebwtTotSz() const { return _ebwtTotSz; } | |
| bool color() const { return _color; } | |
| bool entireReverse() const { return _entireReverse; } | |
| /** | |
| * Set a new suffix-array sampling rate, which involves updating | |
| * rate, mask, sample length, and sample size. | |
| */ | |
| void setOffRate(int __offRate) { | |
| _offRate = __offRate; | |
| _offMask = OFF_MASK << _offRate; | |
| _offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate; | |
| _offsSz = (uint64_t) _offsLen*OFF_SIZE; | |
| } | |
| /** | |
| * Set a new inverse suffix-array sampling rate, which involves | |
| * updating rate, mask, sample length, and sample size. | |
| */ | |
| void setIsaRate(int __isaRate) { | |
| _isaRate = __isaRate; | |
| _isaMask = OFF_MASK << _isaRate; | |
| _isaLen = (_bwtLen + (1 << _isaRate) - 1) >> _isaRate; | |
| _isaSz = (uint64_t)_isaLen*OFF_SIZE; | |
| } | |
| /// Check that this EbwtParams is internally consistent | |
| bool repOk() const { | |
| assert_gt(_len, 0); | |
| assert_gt(_lineRate, 3); | |
| assert_geq(_offRate, 0); | |
| assert_leq(_ftabChars, 16); | |
| assert_geq(_ftabChars, 1); | |
| assert_lt(_lineRate, 32); | |
| assert_lt(_linesPerSide, 32); | |
| assert_lt(_ftabChars, 32); | |
| assert_eq(0, _ebwtTotSz % (2*_lineSz)); | |
| return true; | |
| } | |
| /** | |
| * Pretty-print the header contents to the given output stream. | |
| */ | |
| void print(ostream& out) const { | |
| out << "Headers:" << endl | |
| << " len: " << _len << endl | |
| << " bwtLen: " << _bwtLen << endl | |
| << " sz: " << _sz << endl | |
| << " bwtSz: " << _bwtSz << endl | |
| << " lineRate: " << _lineRate << endl | |
| << " linesPerSide: " << _linesPerSide << endl | |
| << " offRate: " << _offRate << endl | |
| << " offMask: 0x" << hex << _offMask << dec << endl | |
| << " isaRate: " << _isaRate << endl | |
| << " isaMask: 0x" << hex << _isaMask << dec << endl | |
| << " ftabChars: " << _ftabChars << endl | |
| << " eftabLen: " << _eftabLen << endl | |
| << " eftabSz: " << _eftabSz << endl | |
| << " ftabLen: " << _ftabLen << endl | |
| << " ftabSz: " << _ftabSz << endl | |
| << " offsLen: " << _offsLen << endl | |
| << " offsSz: " << _offsSz << endl | |
| << " isaLen: " << _isaLen << endl | |
| << " isaSz: " << _isaSz << endl | |
| << " lineSz: " << _lineSz << endl | |
| << " sideSz: " << _sideSz << endl | |
| << " sideBwtSz: " << _sideBwtSz << endl | |
| << " sideBwtLen: " << _sideBwtLen << endl | |
| << " numSidePairs: " << _numSidePairs << endl | |
| << " numSides: " << _numSides << endl | |
| << " numLines: " << _numLines << endl | |
| << " ebwtTotLen: " << _ebwtTotLen << endl | |
| << " ebwtTotSz: " << _ebwtTotSz << endl | |
| << " reverse: " << _entireReverse << endl; | |
| } | |
| TIndexOffU _len; | |
| TIndexOffU _bwtLen; | |
| TIndexOffU _sz; | |
| TIndexOffU _bwtSz; | |
| int32_t _lineRate; | |
| int32_t _linesPerSide; | |
| int32_t _origOffRate; | |
| int32_t _offRate; | |
| TIndexOffU _offMask; | |
| int32_t _isaRate; | |
| uint32_t _isaMask; | |
| int32_t _ftabChars; | |
| uint32_t _eftabLen; | |
| uint32_t _eftabSz; | |
| TIndexOffU _ftabLen; | |
| TIndexOffU _ftabSz; | |
| TIndexOffU _offsLen; | |
| uint64_t _offsSz; | |
| TIndexOffU _isaLen; | |
| uint64_t _isaSz; | |
| uint32_t _lineSz; | |
| uint32_t _sideSz; | |
| uint32_t _sideBwtSz; | |
| uint32_t _sideBwtLen; | |
| uint32_t _numSidePairs; | |
| TIndexOffU _numSides; | |
| TIndexOffU _numLines; | |
| TIndexOffU _ebwtTotLen; | |
| TIndexOffU _ebwtTotSz; | |
| bool _color; | |
| bool _entireReverse; | |
| }; | |
| /** | |
| * Exception to throw when a file-realted error occurs. | |
| */ | |
| class EbwtFileOpenException : public std::runtime_error { | |
| public: | |
| EbwtFileOpenException(const std::string& msg = "") : | |
| std::runtime_error(msg) { } | |
| }; | |
| /** | |
| * Calculate size of file with given name. | |
| */ | |
| static inline int64_t fileSize(const char* name) { | |
| std::ifstream f; | |
| f.open(name, std::ios_base::binary | std::ios_base::in); | |
| if (!f.good() || f.eof() || !f.is_open()) { return 0; } | |
| f.seekg(0, std::ios_base::beg); | |
| std::ifstream::pos_type begin_pos = f.tellg(); | |
| f.seekg(0, std::ios_base::end); | |
| return static_cast<int64_t>(f.tellg() - begin_pos); | |
| } | |
| // Forward declarations for Ebwt class | |
| struct SideLocus; | |
| template<typename TStr> class EbwtSearchParams; | |
| /** | |
| * Extended Burrows-Wheeler transform data. | |
| * | |
| * An Ebwt may be transferred to and from RAM with calls to | |
| * evictFromMemory() and loadIntoMemory(). By default, a newly-created | |
| * Ebwt is not loaded into memory; if the user would like to use a | |
| * newly-created Ebwt to answer queries, they must first call | |
| * loadIntoMemory(). | |
| */ | |
| template <typename TStr> | |
| class Ebwt { | |
| public: | |
| typedef typename Value<TStr>::Type TAlphabet; | |
| #define Ebwt_INITS \ | |
| _toBigEndian(currentlyBigEndian()), \ | |
| _overrideOffRate(__overrideOffRate), \ | |
| _overrideIsaRate(__overrideIsaRate), \ | |
| _verbose(verbose), \ | |
| _passMemExc(passMemExc), \ | |
| _sanity(sanityCheck), \ | |
| _fw(__fw), \ | |
| _in1(NULL), \ | |
| _in2(NULL), \ | |
| _zOff(OFF_MASK), \ | |
| _zEbwtByteOff(OFF_MASK), \ | |
| _zEbwtBpOff(-1), \ | |
| _nPat(0), \ | |
| _nFrag(0), \ | |
| _plen(NULL), \ | |
| _rstarts(NULL), \ | |
| _fchr(NULL), \ | |
| _ftab(NULL), \ | |
| _eftab(NULL), \ | |
| _offs(NULL), \ | |
| _isa(NULL), \ | |
| _ebwt(NULL), \ | |
| _useMm(false), \ | |
| useShmem_(false), \ | |
| _refnames(), \ | |
| rmap_(NULL), \ | |
| mmFile1_(NULL), \ | |
| mmFile2_(NULL) | |
| #ifdef EBWT_STATS | |
| #define Ebwt_STAT_INITS \ | |
| ,mapLFExs_(0llu), \ | |
| mapLFs_(0llu), \ | |
| mapLFcs_(0llu), \ | |
| mapLF1cs_(0llu), \ | |
| mapLF1s_(0llu) | |
| #else | |
| #define Ebwt_STAT_INITS | |
| #endif | |
| /// Construct an Ebwt from the given input file | |
| Ebwt(const string& in, | |
| int color, | |
| int needEntireReverse, | |
| bool __fw, | |
| int32_t __overrideOffRate = -1, | |
| int32_t __overrideIsaRate = -1, | |
| bool useMm = false, | |
| bool useShmem = false, | |
| bool mmSweep = false, | |
| bool loadNames = false, | |
| const ReferenceMap* rmap = NULL, | |
| bool verbose = false, | |
| bool startVerbose = false, | |
| bool passMemExc = false, | |
| bool sanityCheck = false) : | |
| Ebwt_INITS | |
| Ebwt_STAT_INITS | |
| { | |
| assert(!useMm || !useShmem); | |
| #ifdef POPCNT_CAPABILITY | |
| ProcessorSupport ps; | |
| _usePOPCNTinstruction = ps.POPCNTenabled(); | |
| #endif | |
| rmap_ = rmap; | |
| _useMm = useMm; | |
| useShmem_ = useShmem; | |
| _in1Str = in + ".1." + gEbwt_ext; | |
| _in2Str = in + ".2." + gEbwt_ext; | |
| readIntoMemory( | |
| color, // expect colorspace reference? | |
| __fw ? -1 : needEntireReverse, // need REF_READ_REVERSE | |
| true, // stop after loading the header portion? | |
| &_eh, // params structure to fill in | |
| mmSweep, // mmSweep | |
| loadNames, // loadNames | |
| startVerbose); // startVerbose | |
| // If the offRate has been overridden, reflect that in the | |
| // _eh._offRate field | |
| if(_overrideOffRate > _eh._offRate) { | |
| _eh.setOffRate(_overrideOffRate); | |
| assert_eq(_overrideOffRate, _eh._offRate); | |
| } | |
| // Same with isaRate | |
| if(_overrideIsaRate > _eh._isaRate) { | |
| _eh.setIsaRate(_overrideIsaRate); | |
| assert_eq(_overrideIsaRate, _eh._isaRate); | |
| } | |
| assert(repOk()); | |
| } | |
| /// Construct an Ebwt from the given header parameters and string | |
| /// vector, optionally using a blockwise suffix sorter with the | |
| /// given 'bmax' and 'dcv' parameters. The string vector is | |
| /// ultimately joined and the joined string is passed to buildToDisk(). | |
| Ebwt(int color, | |
| int32_t lineRate, | |
| int32_t linesPerSide, | |
| int32_t offRate, | |
| int32_t isaRate, | |
| int32_t ftabChars, | |
| int nthreads, | |
| const string& file, // base filename for EBWT files | |
| bool __fw, | |
| bool useBlockwise, | |
| TIndexOffU bmax, | |
| TIndexOffU bmaxSqrtMult, | |
| TIndexOffU bmaxDivN, | |
| int dcv, | |
| vector<FileBuf*>& is, | |
| vector<RefRecord>& szs, | |
| vector<uint32_t>& plens, | |
| TIndexOffU sztot, | |
| const RefReadInParams& refparams, | |
| uint32_t seed, | |
| int32_t __overrideOffRate = -1, | |
| int32_t __overrideIsaRate = -1, | |
| bool verbose = false, | |
| bool passMemExc = false, | |
| bool sanityCheck = false) : | |
| Ebwt_INITS | |
| Ebwt_STAT_INITS, | |
| _eh(joinedLen(szs), | |
| lineRate, | |
| linesPerSide, | |
| offRate, | |
| isaRate, | |
| ftabChars, | |
| color, | |
| refparams.reverse == REF_READ_REVERSE) | |
| { | |
| #ifdef POPCNT_CAPABILITY | |
| ProcessorSupport ps; | |
| _usePOPCNTinstruction = ps.POPCNTenabled(); | |
| #endif | |
| _in1Str = file + ".1." + gEbwt_ext; | |
| _in2Str = file + ".2." + gEbwt_ext; | |
| // Open output files | |
| ofstream fout1(_in1Str.c_str(), ios::binary); | |
| if(!fout1.good()) { | |
| cerr << "Could not open index file for writing: \"" << _in1Str << "\"" << endl | |
| << "Please make sure the directory exists and that permissions allow writing by" << endl | |
| << "Bowtie." << endl; | |
| throw 1; | |
| } | |
| ofstream fout2(_in2Str.c_str(), ios::binary); | |
| if(!fout2.good()) { | |
| cerr << "Could not open index file for writing: \"" << _in2Str << "\"" << endl | |
| << "Please make sure the directory exists and that permissions allow writing by" << endl | |
| << "Bowtie." << endl; | |
| throw 1; | |
| } | |
| // Build | |
| initFromVector( | |
| is, | |
| szs, | |
| plens, | |
| sztot, | |
| refparams, | |
| fout1, | |
| fout2, | |
| file, | |
| nthreads, | |
| useBlockwise, | |
| bmax, | |
| bmaxSqrtMult, | |
| bmaxDivN, | |
| dcv, | |
| seed); | |
| // Close output files | |
| fout1.flush(); | |
| int64_t tellpSz1 = (int64_t)fout1.tellp(); | |
| VMSG_NL("Wrote " << fout1.tellp() << " bytes to primary EBWT file: " << _in1Str); | |
| fout1.close(); | |
| bool err = false; | |
| if(tellpSz1 > fileSize(_in1Str.c_str())) { | |
| err = true; | |
| cerr << "Index is corrupt: File size for " << _in1Str << " should have been " << tellpSz1 | |
| << " but is actually " << fileSize(_in1Str.c_str()) << "." << endl; | |
| } | |
| fout2.flush(); | |
| int64_t tellpSz2 = (int64_t)fout2.tellp(); | |
| VMSG_NL("Wrote " << fout2.tellp() << " bytes to secondary EBWT file: " << _in2Str); | |
| fout2.close(); | |
| if(tellpSz2 > fileSize(_in2Str.c_str())) { | |
| err = true; | |
| cerr << "Index is corrupt: File size for " << _in2Str << " should have been " << tellpSz2 | |
| << " but is actually " << fileSize(_in2Str.c_str()) << "." << endl; | |
| } | |
| if(err) { | |
| cerr << "Please check if there is a problem with the disk or if disk is full." << endl; | |
| throw 1; | |
| } | |
| // Reopen as input streams | |
| VMSG_NL("Re-opening _in1 and _in2 as input streams"); | |
| if(_sanity) { | |
| VMSG_NL("Sanity-checking Ebwt"); | |
| assert(!isInMemory()); | |
| readIntoMemory( | |
| color, | |
| __fw ? -1 : refparams.reverse == REF_READ_REVERSE, | |
| false, | |
| NULL, | |
| false, | |
| true, | |
| false); | |
| sanityCheckAll(refparams.reverse); | |
| evictFromMemory(); | |
| assert(!isInMemory()); | |
| } | |
| VMSG_NL("Returning from Ebwt constructor"); | |
| } | |
| bool isPacked(); | |
| /** | |
| * Write the rstarts array given the szs array for the reference. | |
| */ | |
| void szsToDisk(const vector<RefRecord>& szs, ostream& os, int reverse) { | |
| TIndexOffU seq = 0; | |
| TIndexOffU off = 0; | |
| TIndexOffU totlen = 0; | |
| for(unsigned int i = 0; i < szs.size(); i++) { | |
| if(szs[i].len == 0) continue; | |
| if(szs[i].first) off = 0; | |
| off += szs[i].off; | |
| #ifdef ACCOUNT_FOR_ALL_GAP_REFS | |
| if(szs[i].first && szs[i].len > 0) seq++; | |
| #else | |
| if(szs[i].first) seq++; | |
| #endif | |
| TIndexOffU seqm1 = seq-1; | |
| assert_lt(seqm1, _nPat); | |
| TIndexOffU fwoff = off; | |
| if(reverse == REF_READ_REVERSE) { | |
| // Invert pattern idxs | |
| seqm1 = _nPat - seqm1 - 1; | |
| // Invert pattern idxs | |
| assert_leq(off + szs[i].len, _plen[seqm1]); | |
| fwoff = _plen[seqm1] - (off + szs[i].len); | |
| } | |
| writeU<TIndexOffU>(os, totlen, this->toBe()); // offset from beginning of joined string | |
| writeU<TIndexOffU>(os, seqm1, this->toBe()); // sequence id | |
| writeU<TIndexOffU>(os, fwoff, this->toBe()); // offset into sequence | |
| totlen += szs[i].len; | |
| off += szs[i].len; | |
| } | |
| } | |
| /** | |
| * Helper for the constructors above. Takes a vector of text | |
| * strings and joins them into a single string with a call to | |
| * joinToDisk, which does a join (with padding) and writes some of | |
| * the resulting data directly to disk rather than keep it in | |
| * memory. It then constructs a suffix-array producer (what kind | |
| * depends on 'useBlockwise') for the resulting sequence. The | |
| * suffix-array producer can then be used to obtain chunks of the | |
| * joined string's suffix array. | |
| */ | |
| void initFromVector( | |
| vector<FileBuf*>& is, | |
| vector<RefRecord>& szs, | |
| vector<uint32_t>& plens, | |
| TIndexOffU sztot, | |
| const RefReadInParams& refparams, | |
| ofstream& out1, | |
| ofstream& out2, | |
| const string& outfile, | |
| int nthreads, | |
| bool useBlockwise, | |
| TIndexOffU bmax, | |
| TIndexOffU bmaxSqrtMult, | |
| TIndexOffU bmaxDivN, | |
| int dcv, | |
| uint32_t seed) | |
| { | |
| // Compose text strings into single string | |
| VMSG_NL("Calculating joined length"); | |
| TStr s; // holds the entire joined reference after call to joinToDisk | |
| TIndexOffU jlen; | |
| jlen = joinedLen(szs); | |
| assert_geq(jlen, sztot); | |
| VMSG_NL("Writing header"); | |
| writeFromMemory(true, out1, out2); | |
| try { | |
| VMSG_NL("Reserving space for joined string"); | |
| seqan::reserve(s, jlen, Exact()); | |
| VMSG_NL("Joining reference sequences"); | |
| if(refparams.reverse == REF_READ_REVERSE) { | |
| { | |
| Timer timer(cout, " Time to join reference sequences: ", _verbose); | |
| joinToDisk(is, szs, plens, sztot, refparams, s, out1, out2, seed); | |
| } { | |
| Timer timer(cout, " Time to reverse reference sequence: ", _verbose); | |
| vector<RefRecord> tmp; | |
| reverseInPlace(s); | |
| reverseRefRecords(szs, tmp, false, false); | |
| szsToDisk(tmp, out1, refparams.reverse); | |
| } | |
| } else { | |
| Timer timer(cout, " Time to join reference sequences: ", _verbose); | |
| joinToDisk(is, szs, plens, sztot, refparams, s, out1, out2, seed); | |
| szsToDisk(szs, out1, refparams.reverse); | |
| } | |
| // Joined reference sequence now in 's' | |
| } catch(bad_alloc& e) { | |
| // If we throw an allocation exception in the try block, | |
| // that means that the joined version of the reference | |
| // string itself is too larger to fit in memory. The only | |
| // alternatives are to tell the user to give us more memory | |
| // or to try again with a packed representation of the | |
| // reference (if we haven't tried that already). | |
| cerr << "Could not allocate space for a joined string of " << jlen << " elements." << endl; | |
| if(!isPacked() && _passMemExc) { | |
| // Pass the exception up so that we can retry using a | |
| // packed string representation | |
| throw e; | |
| } | |
| // There's no point passing this exception on. The fact | |
| // that we couldn't allocate the joined string means that | |
| // --bmax is irrelevant - the user should re-run with | |
| // ebwt-build-packed | |
| if(isPacked()) { | |
| cerr << "Please try running bowtie-build on a computer with more memory." << endl; | |
| } else { | |
| cerr << "Please try running bowtie-build in packed mode (-p/--packed) or in automatic" << endl | |
| << "mode (-a/--auto), or try again on a computer with more memory." << endl; | |
| } | |
| if(sizeof(void*) == 4) { | |
| cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl | |
| << "this executable is 32-bit." << endl; | |
| } | |
| throw 1; | |
| } | |
| // Succesfully obtained joined reference string | |
| assert_geq(length(s), jlen); | |
| if(bmax != OFF_MASK) { | |
| VMSG_NL("bmax according to bmax setting: " << bmax); | |
| } | |
| else if(bmaxSqrtMult != OFF_MASK) { | |
| bmax *= bmaxSqrtMult; | |
| VMSG_NL("bmax according to bmaxSqrtMult setting: " << bmax); | |
| } | |
| else if(bmaxDivN != OFF_MASK) { | |
| bmax = max<TIndexOffU>(jlen / bmaxDivN, 1); | |
| VMSG_NL("bmax according to bmaxDivN setting: " << bmax); | |
| } | |
| else { | |
| bmax = (TIndexOffU)sqrt(length(s)); | |
| VMSG_NL("bmax defaulted to: " << bmax); | |
| } | |
| int iter = 0; | |
| bool first = true; | |
| // Look for bmax/dcv parameters that work. | |
| while(true) { | |
| if(!first && bmax < 40 && _passMemExc) { | |
| cerr << "Could not find approrpiate bmax/dcv settings for building this index." << endl; | |
| if(!isPacked()) { | |
| // Throw an exception exception so that we can | |
| // retry using a packed string representation | |
| throw bad_alloc(); | |
| } else { | |
| cerr << "Already tried a packed string representation." << endl; | |
| } | |
| cerr << "Please try indexing this reference on a computer with more memory." << endl; | |
| if(sizeof(void*) == 4) { | |
| cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl | |
| << "this executable is 32-bit." << endl; | |
| } | |
| throw 1; | |
| } | |
| if(dcv > 4096) dcv = 4096; | |
| if((iter % 6) == 5 && dcv < 4096 && dcv != 0) { | |
| dcv <<= 1; // double difference-cover period | |
| } else { | |
| bmax -= (bmax >> 2); // reduce by 25% | |
| } | |
| VMSG("Using parameters --bmax " << bmax); | |
| if(dcv == 0) { | |
| VMSG_NL(" and *no difference cover*"); | |
| } else { | |
| VMSG_NL(" --dcv " << dcv); | |
| } | |
| iter++; | |
| try { | |
| { | |
| VMSG_NL(" Doing ahead-of-time memory usage test"); | |
| // Make a quick-and-dirty attempt to force a bad_alloc iff | |
| // we would have thrown one eventually as part of | |
| // constructing the DifferenceCoverSample | |
| dcv <<= 1; | |
| TIndexOffU sz = (TIndexOffU)DifferenceCoverSample<TStr>::simulateAllocs(s, dcv >> 1); | |
| if(nthreads > 1) sz *= (nthreads + 1); | |
| AutoArray<uint8_t> tmp(sz); | |
| dcv >>= 1; | |
| // Likewise with the KarkkainenBlockwiseSA | |
| sz = (TIndexOffU)KarkkainenBlockwiseSA<TStr>::simulateAllocs(s, bmax); | |
| AutoArray<uint8_t> tmp2(sz); | |
| // Now throw in the 'ftab' and 'isaSample' structures | |
| // that we'll eventually allocate in buildToDisk | |
| AutoArray<TIndexOffU> ftab(_eh._ftabLen * 2); | |
| AutoArray<uint8_t> side(_eh._sideSz); | |
| // Grab another 20 MB out of caution | |
| AutoArray<uint32_t> extra(20*1024*1024); | |
| // If we made it here without throwing bad_alloc, then we | |
| // passed the memory-usage stress test | |
| VMSG(" Passed! Constructing with these parameters: --bmax " << bmax << " --dcv " << dcv); | |
| if(isPacked()) { | |
| VMSG(" --packed"); | |
| } | |
| VMSG_NL(""); | |
| } | |
| VMSG_NL("Constructing suffix-array element generator"); | |
| KarkkainenBlockwiseSA<TStr> bsa(s, bmax, nthreads, dcv, seed, _sanity, _passMemExc, _verbose, outfile); | |
| assert(bsa.suffixItrIsReset()); | |
| assert_eq(bsa.size(), length(s)+1); | |
| VMSG_NL("Converting suffix-array elements to index image"); | |
| buildToDisk(bsa, s, out1, out2); | |
| out1.flush(); out2.flush(); | |
| if(out1.fail() || out2.fail()) { | |
| cerr << "An error occurred writing the index to disk. Please check if the disk is full." << endl; | |
| throw 1; | |
| } | |
| break; | |
| } catch(bad_alloc& e) { | |
| if(_passMemExc) { | |
| VMSG_NL(" Ran out of memory; automatically trying more memory-economical parameters."); | |
| } else { | |
| cerr << "Out of memory while constructing suffix array. Please try using a smaller" << endl | |
| << "number of blocks by specifying a smaller --bmax or a larger --bmaxdivn" << endl; | |
| throw 1; | |
| } | |
| } | |
| first = false; | |
| } | |
| assert(repOk()); | |
| // Now write reference sequence names on the end | |
| #ifdef ACCOUNT_FOR_ALL_GAP_REFS | |
| assert_geq(this->_refnames.size(), this->_nPat); | |
| #else | |
| assert_eq(this->_refnames.size(), this->_nPat); | |
| #endif | |
| for(TIndexOffU i = 0; i < this->_refnames.size(); i++) { | |
| out1 << this->_refnames[i] << endl; | |
| } | |
| out1 << '\0'; | |
| out1.flush(); out2.flush(); | |
| if(out1.fail() || out2.fail()) { | |
| cerr << "An error occurred writing the index to disk. Please check if the disk is full." << endl; | |
| throw 1; | |
| } | |
| VMSG_NL("Returning from initFromVector"); | |
| } | |
| /** | |
| * Return the length that the joined string of the given string | |
| * list will have. Note that this is indifferent to how the text | |
| * fragments correspond to input sequences - it just cares about | |
| * the lengths of the fragments. | |
| */ | |
| TIndexOffU joinedLen(vector<RefRecord>& szs) { | |
| TIndexOffU ret = 0; | |
| for(unsigned int i = 0; i < szs.size(); i++) { | |
| ret += szs[i].len; | |
| } | |
| return ret; | |
| } | |
| /// Destruct an Ebwt | |
| ~Ebwt() { | |
| // Only free buffers if we're *not* using memory-mapped files | |
| if(!_useMm) { | |
| // Delete everything that was allocated in read(false, ...) | |
| if(_fchr != NULL) delete[] _fchr; _fchr = NULL; | |
| if(_ftab != NULL) delete[] _ftab; _ftab = NULL; | |
| if(_eftab != NULL) delete[] _eftab; _eftab = NULL; | |
| if(_offs != NULL && !useShmem_) { | |
| delete[] _offs; _offs = NULL; | |
| } else if(_offs != NULL && useShmem_) { | |
| FREE_SHARED(_offs); | |
| } | |
| if(_isa != NULL) delete[] _isa; _isa = NULL; | |
| if(_plen != NULL) delete[] _plen; _plen = NULL; | |
| if(_rstarts != NULL) delete[] _rstarts; _rstarts = NULL; | |
| if(_ebwt != NULL && !useShmem_) { | |
| delete[] _ebwt; _ebwt = NULL; | |
| } else if(_ebwt != NULL && useShmem_) { | |
| FREE_SHARED(_ebwt); | |
| } | |
| } | |
| if (_in1 != NULL) fclose(_in1); | |
| if (_in2 != NULL) fclose(_in2); | |
| #ifdef EBWT_STATS | |
| cout << (_fw ? "Forward index:" : "Mirror index:") << endl; | |
| cout << " mapLFEx: " << mapLFExs_ << endl; | |
| cout << " mapLF: " << mapLFs_ << endl; | |
| cout << " mapLF(c): " << mapLFcs_ << endl; | |
| cout << " mapLF1(c): " << mapLF1cs_ << endl; | |
| cout << " mapLF(c): " << mapLF1s_ << endl; | |
| #endif | |
| } | |
| /// Accessors | |
| const EbwtParams& eh() const { return _eh; } | |
| TIndexOffU zOff() const { return _zOff; } | |
| TIndexOffU zEbwtByteOff() const { return _zEbwtByteOff; } | |
| TIndexOff zEbwtBpOff() const { return _zEbwtBpOff; } | |
| TIndexOffU nPat() const { return _nPat; } | |
| TIndexOffU nFrag() const { return _nFrag; } | |
| TIndexOffU* fchr() const { return _fchr; } | |
| TIndexOffU* ftab() const { return _ftab; } | |
| TIndexOffU* eftab() const { return _eftab; } | |
| TIndexOffU* offs() const { return _offs; } | |
| uint32_t* isa() const { return _isa; } /* check */ | |
| TIndexOffU* plen() const { return _plen; } | |
| TIndexOffU* rstarts() const { return _rstarts; } | |
| uint8_t* ebwt() const { return _ebwt; } | |
| const ReferenceMap* rmap() const { return rmap_; } | |
| bool toBe() const { return _toBigEndian; } | |
| bool verbose() const { return _verbose; } | |
| bool sanityCheck() const { return _sanity; } | |
| vector<string>& refnames() { return _refnames; } | |
| bool fw() const { return _fw; } | |
| #ifdef POPCNT_CAPABILITY | |
| bool _usePOPCNTinstruction; | |
| #endif | |
| /// Return true iff the Ebwt is currently in memory | |
| bool isInMemory() const { | |
| if(_ebwt != NULL) { | |
| assert(_eh.repOk()); | |
| assert(_ftab != NULL); | |
| assert(_eftab != NULL); | |
| assert(_fchr != NULL); | |
| assert(_offs != NULL); | |
| assert(_isa != NULL); | |
| assert(_rstarts != NULL); | |
| assert_neq(_zEbwtByteOff, OFF_MASK); | |
| assert_neq(_zEbwtBpOff, -1); | |
| return true; | |
| } else { | |
| assert(_ftab == NULL); | |
| assert(_eftab == NULL); | |
| assert(_fchr == NULL); | |
| assert(_offs == NULL); | |
| assert(_rstarts == NULL); | |
| assert_eq(_zEbwtByteOff, OFF_MASK); | |
| assert_eq(_zEbwtBpOff, -1); | |
| return false; | |
| } | |
| } | |
| /// Return true iff the Ebwt is currently stored on disk | |
| bool isEvicted() const { | |
| return !isInMemory(); | |
| } | |
| /** | |
| * Load this Ebwt into memory by reading it in from the _in1 and | |
| * _in2 streams. | |
| */ | |
| void loadIntoMemory( | |
| int color, | |
| int needEntireReverse, | |
| bool loadNames, | |
| bool verbose) | |
| { | |
| readIntoMemory( | |
| color, // expect index to be colorspace? | |
| needEntireReverse, // require reverse index to be concatenated reference reversed | |
| false, // stop after loading the header portion? | |
| NULL, // params | |
| false, // mmSweep | |
| loadNames, // loadNames | |
| verbose); // startVerbose | |
| } | |
| /** | |
| * Frees memory associated with the Ebwt. | |
| */ | |
| void evictFromMemory() { | |
| assert(isInMemory()); | |
| if(!_useMm) { | |
| delete[] _fchr; | |
| delete[] _ftab; | |
| delete[] _eftab; | |
| if(!useShmem_) delete[] _offs; | |
| delete[] _isa; | |
| // Keep plen; it's small and the client may want to query it | |
| // even when the others are evicted. | |
| //delete[] _plen; | |
| delete[] _rstarts; | |
| if(!useShmem_) delete[] _ebwt; | |
| } | |
| _fchr = NULL; | |
| _ftab = NULL; | |
| _eftab = NULL; | |
| _offs = NULL; | |
| _isa = NULL; | |
| // Keep plen; it's small and the client may want to query it | |
| // even when the others are evicted. | |
| //_plen = NULL; | |
| _rstarts = NULL; | |
| _ebwt = NULL; | |
| _zEbwtByteOff = OFF_MASK; | |
| _zEbwtBpOff = -1; | |
| } | |
| /** | |
| * Non-static facade for static function ftabHi. | |
| */ | |
| TIndexOffU ftabHi(TIndexOffU i) const { | |
| return Ebwt::ftabHi(_ftab, _eftab, _eh._len, _eh._ftabLen, | |
| _eh._eftabLen, i); | |
| } | |
| /** | |
| * Get "high interpretation" of ftab entry at index i. The high | |
| * interpretation of a regular ftab entry is just the entry | |
| * itself. The high interpretation of an extended entry is the | |
| * second correpsonding ui32 in the eftab. | |
| * | |
| * It's a static member because it's convenient to ask this | |
| * question before the Ebwt is fully initialized. | |
| */ | |
| static TIndexOffU ftabHi(TIndexOffU *ftab, | |
| TIndexOffU *eftab, | |
| TIndexOffU len, | |
| TIndexOffU ftabLen, | |
| TIndexOffU eftabLen, | |
| TIndexOffU i) | |
| { | |
| assert_lt(i, ftabLen); | |
| if(ftab[i] <= len) { | |
| return ftab[i]; | |
| } else { | |
| TIndexOffU efIdx = ftab[i] ^ OFF_MASK; | |
| assert_lt(efIdx*2+1, eftabLen); | |
| return eftab[efIdx*2+1]; | |
| } | |
| } | |
| /** | |
| * Non-static facade for static function ftabLo. | |
| */ | |
| TIndexOffU ftabLo(TIndexOffU i) const { | |
| return Ebwt::ftabLo(_ftab, _eftab, _eh._len, _eh._ftabLen, | |
| _eh._eftabLen, i); | |
| } | |
| /** | |
| * Get "low interpretation" of ftab entry at index i. The low | |
| * interpretation of a regular ftab entry is just the entry | |
| * itself. The low interpretation of an extended entry is the | |
| * first correpsonding ui32 in the eftab. | |
| * | |
| * It's a static member because it's convenient to ask this | |
| * question before the Ebwt is fully initialized. | |
| */ | |
| static TIndexOffU ftabLo(TIndexOffU *ftab, | |
| TIndexOffU *eftab, | |
| TIndexOffU len, | |
| TIndexOffU ftabLen, | |
| TIndexOffU eftabLen, | |
| TIndexOffU i) | |
| { | |
| assert_lt(i, ftabLen); | |
| if(ftab[i] <= len) { | |
| return ftab[i]; | |
| } else { | |
| TIndexOffU efIdx = ftab[i] ^ OFF_MASK; | |
| assert_lt(efIdx*2+1, eftabLen); | |
| return eftab[efIdx*2]; | |
| } | |
| } | |
| /** | |
| * When using read() to create an Ebwt, we have to set a couple of | |
| * additional fields in the Ebwt object that aren't part of the | |
| * parameter list and are not stored explicitly in the file. Right | |
| * now, this just involves initializing _zEbwtByteOff and | |
| * _zEbwtBpOff from _zOff. | |
| */ | |
| void postReadInit(EbwtParams& eh) { | |
| TIndexOffU sideNum = _zOff / eh._sideBwtLen; | |
| TIndexOffU sideCharOff = _zOff % eh._sideBwtLen; | |
| TIndexOffU sideByteOff = sideNum * eh._sideSz; | |
| _zEbwtByteOff = sideCharOff >> 2; | |
| assert_lt(_zEbwtByteOff, eh._sideBwtSz); | |
| _zEbwtBpOff = sideCharOff & 3; | |
| assert_lt(_zEbwtBpOff, 4); | |
| if((sideNum & 1) == 0) { | |
| // This is an even (backward) side | |
| _zEbwtByteOff = eh._sideBwtSz - _zEbwtByteOff - 1; | |
| _zEbwtBpOff = 3 - _zEbwtBpOff; | |
| assert_lt(_zEbwtBpOff, 4); | |
| } | |
| _zEbwtByteOff += sideByteOff; | |
| assert(repOk(eh)); // Ebwt should be fully initialized now | |
| } | |
| /** | |
| * Pretty-print the Ebwt to the given output stream. | |
| */ | |
| void print(ostream& out) const { | |
| print(out, _eh); | |
| } | |
| /** | |
| * Pretty-print the Ebwt and given EbwtParams to the given output | |
| * stream. | |
| */ | |
| void print(ostream& out, const EbwtParams& eh) const { | |
| eh.print(out); // print params | |
| out << "Ebwt (" << (isInMemory()? "memory" : "disk") << "):" << endl | |
| << " zOff: " << _zOff << endl | |
| << " zEbwtByteOff: " << _zEbwtByteOff << endl | |
| << " zEbwtBpOff: " << _zEbwtBpOff << endl | |
| << " nPat: " << _nPat << endl | |
| << " plen: "; | |
| if(_plen == NULL) { | |
| out << "NULL" << endl; | |
| } else { | |
| out << "non-NULL, [0] = " << _plen[0] << endl; | |
| } | |
| out << " rstarts: "; | |
| if(_rstarts == NULL) { | |
| out << "NULL" << endl; | |
| } else { | |
| out << "non-NULL, [0] = " << _rstarts[0] << endl; | |
| } | |
| out << " ebwt: "; | |
| if(_ebwt == NULL) { | |
| out << "NULL" << endl; | |
| } else { | |
| out << "non-NULL, [0] = " << _ebwt[0] << endl; | |
| } | |
| out << " fchr: "; | |
| if(_fchr == NULL) { | |
| out << "NULL" << endl; | |
| } else { | |
| out << "non-NULL, [0] = " << _fchr[0] << endl; | |
| } | |
| out << " ftab: "; | |
| if(_ftab == NULL) { | |
| out << "NULL" << endl; | |
| } else { | |
| out << "non-NULL, [0] = " << _ftab[0] << endl; | |
| } | |
| out << " eftab: "; | |
| if(_eftab == NULL) { | |
| out << "NULL" << endl; | |
| } else { | |
| out << "non-NULL, [0] = " << _eftab[0] << endl; | |
| } | |
| out << " offs: "; | |
| if(_offs == NULL) { | |
| out << "NULL" << endl; | |
| } else { | |
| out << "non-NULL, [0] = " << _offs[0] << endl; | |
| } | |
| } | |
| // Building | |
| static TStr join(vector<TStr>& l, uint32_t seed); | |
| static TStr join(vector<FileBuf*>& l, vector<RefRecord>& szs, TIndexOffU sztot, const RefReadInParams& refparams, uint32_t seed); | |
| void joinToDisk(vector<FileBuf*>& l, vector<RefRecord>& szs, vector<uint32_t>& plens, TIndexOffU sztot, const RefReadInParams& refparams, TStr& ret, ostream& out1, ostream& out2, uint32_t seed = 0); | |
| void buildToDisk(InorderBlockwiseSA<TStr>& sa, const TStr& s, ostream& out1, ostream& out2); | |
| // I/O | |
| void readIntoMemory(int color, int needEntireReverse, bool justHeader, EbwtParams *params, bool mmSweep, bool loadNames, bool startVerbose); | |
| void writeFromMemory(bool justHeader, ostream& out1, ostream& out2) const; | |
| void writeFromMemory(bool justHeader, const string& out1, const string& out2) const; | |
| // Sanity checking | |
| void printRangeFw(uint32_t begin, uint32_t end) const; | |
| void printRangeBw(uint32_t begin, uint32_t end) const; | |
| void sanityCheckUpToSide(TIndexOff upToSide) const; | |
| void sanityCheckAll(int reverse) const; | |
| void restore(TStr& s) const; | |
| void checkOrigs(const vector<String<Dna5> >& os, bool color, bool mirror) const; | |
| // Searching and reporting | |
| void joinedToTextOff(TIndexOffU qlen, TIndexOffU off, TIndexOffU& tidx, TIndexOffU& textoff, TIndexOffU& tlen) const; | |
| inline bool report(const String<Dna5>& query, String<char>* quals, String<char>* name, bool color, char primer, char trimc, bool colExEnds, int snpPhred, const BitPairReference* ref, const std::vector<TIndexOffU>& mmui32, const std::vector<uint8_t>& refcs, size_t numMms, TIndexOffU off, TIndexOffU top, TIndexOffU bot, uint32_t qlen, int stratum, uint16_t cost, uint32_t patid, uint32_t seed, const EbwtSearchParams<TStr>& params) const; | |
| inline bool reportChaseOne(const String<Dna5>& query, String<char>* quals, String<char>* name, bool color, char primer, char trimc, bool colExEnds, int snpPhred, const BitPairReference* ref, const std::vector<TIndexOffU>& mmui32, const std::vector<uint8_t>& refcs, size_t numMms, TIndexOffU i, TIndexOffU top, TIndexOffU bot, uint32_t qlen, int stratum, uint16_t cost, uint32_t patid, uint32_t seed, const EbwtSearchParams<TStr>& params, SideLocus *l = NULL) const; | |
| inline bool reportReconstruct(const String<Dna5>& query, String<char>* quals, String<char>* name, String<Dna5>& lbuf, String<Dna5>& rbuf, const TIndexOffU *mmui32, const char* refcs, size_t numMms, TIndexOffU i, TIndexOffU top, TIndexOffU bot, uint32_t qlen, int stratum, const EbwtSearchParams<TStr>& params, SideLocus *l = NULL) const; | |
| inline int rowL(const SideLocus& l) const; | |
| inline TIndexOffU countUpTo(const SideLocus& l, int c) const; | |
| inline void countUpToEx(const SideLocus& l, TIndexOffU* pairs) const; | |
| inline TIndexOffU countFwSide(const SideLocus& l, int c) const; | |
| inline void countFwSideEx(const SideLocus& l, TIndexOffU *pairs) const; | |
| inline TIndexOffU countBwSide(const SideLocus& l, int c) const; | |
| inline void countBwSideEx(const SideLocus& l, TIndexOffU *pairs) const; | |
| inline TIndexOffU mapLF(const SideLocus& l ASSERT_ONLY(, bool overrideSanity = false)) const; | |
| inline void mapLFEx(const SideLocus& l, TIndexOffU *pairs ASSERT_ONLY(, bool overrideSanity = false)) const; | |
| inline void mapLFEx(const SideLocus& ltop, const SideLocus& lbot, TIndexOffU *tops, TIndexOffU *bots ASSERT_ONLY(, bool overrideSanity = false)) const; | |
| inline TIndexOffU mapLF(const SideLocus& l, int c ASSERT_ONLY(, bool overrideSanity = false)) const; | |
| inline TIndexOffU mapLF1(TIndexOffU row, const SideLocus& l, int c ASSERT_ONLY(, bool overrideSanity = false)) const; | |
| inline int mapLF1(TIndexOffU& row, const SideLocus& l ASSERT_ONLY(, bool overrideSanity = false)) const; | |
| /// Check that in-memory Ebwt is internally consistent with respect | |
| /// to given EbwtParams; assert if not | |
| bool inMemoryRepOk(const EbwtParams& eh) const { | |
| assert_leq(ValueSize<TAlphabet>::VALUE, 4); | |
| assert_geq(_zEbwtBpOff, 0); | |
| assert_lt(_zEbwtBpOff, 4); | |
| assert_lt(_zEbwtByteOff, eh._ebwtTotSz); | |
| assert_lt(_zOff, eh._bwtLen); | |
| assert(_rstarts != NULL); | |
| assert_geq(_nFrag, _nPat); | |
| return true; | |
| } | |
| /// Check that in-memory Ebwt is internally consistent; assert if | |
| /// not | |
| bool inMemoryRepOk() const { | |
| return repOk(_eh); | |
| } | |
| /// Check that Ebwt is internally consistent with respect to given | |
| /// EbwtParams; assert if not | |
| bool repOk(const EbwtParams& eh) const { | |
| assert(_eh.repOk()); | |
| if(isInMemory()) { | |
| return inMemoryRepOk(eh); | |
| } | |
| return true; | |
| } | |
| /// Check that Ebwt is internally consistent; assert if not | |
| bool repOk() const { | |
| return repOk(_eh); | |
| } | |
| bool _toBigEndian; | |
| int32_t _overrideOffRate; | |
| int32_t _overrideIsaRate; | |
| bool _verbose; | |
| bool _passMemExc; | |
| bool _sanity; | |
| bool _fw; // true iff this is a forward index | |
| FILE *_in1; // input fd for primary index file | |
| FILE *_in2; // input fd for secondary index file | |
| string _in1Str; // filename for primary index file | |
| string _in2Str; // filename for secondary index file | |
| TIndexOffU _zOff; | |
| TIndexOffU _zEbwtByteOff; | |
| TIndexOff _zEbwtBpOff; | |
| TIndexOffU _nPat; /// number of reference texts | |
| TIndexOffU _nFrag; /// number of fragments | |
| TIndexOffU* _plen; | |
| TIndexOffU* _rstarts; // starting offset of fragments / text indexes | |
| // _fchr, _ftab and _eftab are expected to be relatively small | |
| // (usually < 1MB, perhaps a few MB if _fchr is particularly large | |
| // - like, say, 11). For this reason, we don't bother with writing | |
| // them to disk through separate output streams; we | |
| TIndexOffU* _fchr; | |
| TIndexOffU* _ftab; | |
| TIndexOffU* _eftab; // "extended" entries for _ftab | |
| // _offs may be extremely large. E.g. for DNA w/ offRate=4 (one | |
| // offset every 16 rows), the total size of _offs is the same as | |
| // the total size of the input sequence | |
| TIndexOffU* _offs; | |
| TIndexOffU* _isa; | |
| // _ebwt is the Extended Burrows-Wheeler Transform itself, and thus | |
| // is at least as large as the input sequence. | |
| uint8_t* _ebwt; | |
| bool _useMm; /// use memory-mapped files to hold the index | |
| bool useShmem_; /// use shared memory to hold large parts of the index | |
| vector<string> _refnames; /// names of the reference sequences | |
| const ReferenceMap* rmap_; /// mapping into another reference coordinate space | |
| char *mmFile1_; | |
| char *mmFile2_; | |
| EbwtParams _eh; | |
| #ifdef BOWTIE_64BIT_INDEX | |
| static const int default_lineRate = 7; | |
| #else | |
| static const int default_lineRate = 6; | |
| #endif | |
| #ifdef EBWT_STATS | |
| uint64_t mapLFExs_; | |
| uint64_t mapLFs_; | |
| uint64_t mapLFcs_; | |
| #endif | |
| private: | |
| ostream& log() const { | |
| return cout; // TODO: turn this into a parameter | |
| } | |
| /// Print a verbose message and flush (flushing is helpful for | |
| /// debugging) | |
| void verbose(const string& s) const { | |
| if(this->verbose()) { | |
| this->log() << s; | |
| this->log().flush(); | |
| } | |
| } | |
| }; | |
| /// Specialization for packed Ebwts - return true | |
| template<> | |
| bool Ebwt<String<Dna, Packed<> > >::isPacked() { | |
| return true; | |
| } | |
| /// By default, Ebwts are not packed | |
| template<typename TStr> | |
| bool Ebwt<TStr>::isPacked() { | |
| return false; | |
| } | |
| /** | |
| * Structure encapsulating search parameters, such as whether and how | |
| * to backtrack and how to deal with multiple equally-good hits. | |
| */ | |
| template<typename TStr> | |
| class EbwtSearchParams { | |
| public: | |
| EbwtSearchParams(HitSinkPerThread& sink, | |
| const vector<String<Dna5> >& texts, | |
| bool fw = true, | |
| bool ebwtFw = true) : | |
| _sink(sink), | |
| _texts(texts), | |
| _patid(0xffffffff), | |
| _fw(fw) { } | |
| HitSinkPerThread& sink() const { return _sink; } | |
| void setPatId(uint32_t patid) { _patid = patid; } | |
| uint32_t patId() const { return _patid; } | |
| void setFw(bool fw) { _fw = fw; } | |
| bool fw() const { return _fw; } | |
| /** | |
| * Report a hit. Returns true iff caller can call off the search. | |
| */ | |
| bool reportHit(const String<Dna5>& query, // read sequence | |
| String<char>* quals, // read quality values | |
| String<char>* name, // read name | |
| bool color, // true -> read is colorspace | |
| char primer, // primer base trimmed from beginning | |
| char trimc, // first color trimmed from beginning | |
| bool colExEnds, // true -> exclude nucleotides at extreme ends after decoding | |
| int snpPhred, // penalty for a SNP | |
| const BitPairReference* ref, // reference (= NULL if not necessary) | |
| const ReferenceMap* rmap, // map to another reference coordinate system | |
| bool ebwtFw, // whether index is forward (true) or mirror (false) | |
| const std::vector<TIndexOffU>& mmui32, // mismatch list | |
| const std::vector<uint8_t>& refcs, // reference characters | |
| size_t numMms, // # mismatches | |
| UPair h, // ref coords | |
| UPair mh, // mate's ref coords | |
| bool mfw, // mate's orientation | |
| uint16_t mlen, // mate length | |
| UPair a, // arrow pair | |
| uint32_t tlen, // length of text | |
| uint32_t qlen, // length of query | |
| int stratum, // alignment stratum | |
| uint16_t cost, // cost of alignment | |
| uint32_t oms, // approx. # other valid alignments | |
| uint32_t patid, | |
| uint32_t seed, | |
| uint8_t mate) const | |
| { | |
| #ifndef NDEBUG | |
| // Check that no two elements of the mms array are the same | |
| for(size_t i = 0; i < numMms; i++) { | |
| for(size_t j = i+1; j < numMms; j++) { | |
| assert_neq(mmui32[i], mmui32[j]); | |
| } | |
| } | |
| #endif | |
| // If ebwtFw is true, then 'query' and 'quals' are reversed | |
| // If _fw is false, then 'query' and 'quals' are reverse complemented | |
| assert(!color || ref != NULL); | |
| assert(quals != NULL); | |
| assert(name != NULL); | |
| assert_eq(mmui32.size(), refcs.size()); | |
| assert_leq(numMms, mmui32.size()); | |
| assert_gt(qlen, 0); | |
| Hit hit; | |
| hit.stratum = stratum; | |
| hit.cost = cost; | |
| hit.patSeq = query; | |
| hit.quals = *quals; | |
| if(!ebwtFw) { | |
| // Re-reverse the pattern and the quals back to how they | |
| // appeared in the read file | |
| ::reverseInPlace(hit.patSeq); | |
| ::reverseInPlace(hit.quals); | |
| } | |
| if(color) { | |
| hit.colSeq = hit.patSeq; | |
| hit.colQuals = hit.quals; | |
| hit.crefcs.resize(qlen, 0); | |
| // Turn the mmui32 and refcs arrays into the mm FixedBitset and | |
| // the refc vector | |
| for(size_t i = 0; i < numMms; i++) { | |
| if (ebwtFw != _fw) { | |
| // The 3' end is on the left but the mm vector encodes | |
| // mismatches w/r/t the 5' end, so we flip | |
| uint32_t off = qlen - mmui32[i] - 1; | |
| hit.cmms.set(off); | |
| hit.crefcs[off] = refcs[i]; | |
| } else { | |
| hit.cmms.set(mmui32[i]); | |
| hit.crefcs[i] = refcs[i]; | |
| } | |
| } | |
| assert(ref != NULL); | |
| char read[1024]; | |
| uint32_t rfbuf[(1024+16)/4]; | |
| ASSERT_ONLY(char rfbuf2[1024]); | |
| char qual[1024]; | |
| char ns[1024]; | |
| char cmm[1024]; | |
| char nmm[1024]; | |
| int cmms = 0; | |
| int nmms = 0; | |
| // TODO: account for indels when calculating these bounds | |
| size_t readi = 0; | |
| size_t readf = seqan::length(hit.patSeq); | |
| size_t refi = 0; | |
| size_t reff = readf + 1; | |
| bool maqRound = false; | |
| for(size_t i = 0; i < qlen + 1; i++) { | |
| if(i < qlen) { | |
| read[i] = (int)hit.patSeq[i]; | |
| qual[i] = mmPenalty(maqRound, phredCharToPhredQual(hit.quals[i])); | |
| } | |
| ASSERT_ONLY(rfbuf2[i] = ref->getBase(h.first, h.second + i)); | |
| } | |
| int offset = ref->getStretch(rfbuf, h.first, h.second, qlen + 1); | |
| char *rf = (char*)rfbuf + offset; | |
| for(size_t i = 0; i < qlen + 1; i++) { | |
| assert_eq(rf[i], rfbuf2[i]); | |
| rf[i] = (1 << rf[i]); | |
| } | |
| decodeHit( | |
| read, // ASCII colors, '0', '1', '2', '3', '.' | |
| qual, // ASCII quals, Phred+33 encoded | |
| readi, // offset of first character within 'read' to consider | |
| readf, // offset of last char (exclusive) in 'read' to consider | |
| rf, // reference sequence, as masks | |
| refi, // offset of first character within 'ref' to consider | |
| reff, // offset of last char (exclusive) in 'ref' to consider | |
| snpPhred, // penalty incurred by a SNP | |
| ns, // decoded nucleotides are appended here | |
| cmm, // where the color mismatches are in the string | |
| nmm, // where nucleotide mismatches are in the string | |
| cmms, // number of color mismatches | |
| nmms);// number of nucleotide mismatches | |
| size_t nqlen = qlen + (colExEnds ? -1 : 1); | |
| seqan::resize(hit.patSeq, nqlen); | |
| seqan::resize(hit.quals, nqlen); | |
| hit.refcs.resize(nqlen); | |
| size_t lo = colExEnds ? 1 : 0; | |
| size_t hi = colExEnds ? qlen : qlen+1; | |
| size_t destpos = 0; | |
| for(size_t i = lo; i < hi; i++, destpos++) { | |
| // Set sequence character | |
| assert_leq(ns[i], 4); | |
| assert_geq(ns[i], 0); | |
| hit.patSeq[destpos] = (Dna5)(int)ns[i]; | |
| // Set initial quality | |
| hit.quals[destpos] = '!'; | |
| // Color mismatches penalize quality | |
| if(i > 0) { | |
| if(cmm[i-1] == 'M') { | |
| if((int)hit.quals[destpos] + (int)qual[i-1] > 126) { | |
| hit.quals[destpos] = 126; | |
| } else { | |
| hit.quals[destpos] += qual[i-1]; | |
| } | |
| } else if((int)hit.colSeq[i-1] != 4) { | |
| hit.quals[destpos] -= qual[i-1]; | |
| } | |
| } | |
| if(i < qlen) { | |
| if(cmm[i] == 'M') { | |
| if((int)hit.quals[destpos] + (int)qual[i] > 126) { | |
| hit.quals[destpos] = 126; | |
| } else { | |
| hit.quals[destpos] += qual[i]; | |
| } | |
| } else if((int)hit.patSeq[i] != 4) { | |
| hit.quals[destpos] -= qual[i]; | |
| } | |
| } | |
| if(hit.quals[destpos] < '!') { | |
| hit.quals[destpos] = '!'; | |
| } | |
| if(nmm[i] != 'M') { | |
| uint32_t off = (uint32_t)i - (colExEnds? 1:0); | |
| if(!_fw) off = (uint32_t)nqlen - off - 1; | |
| assert_lt(off, nqlen); | |
| hit.mms.set(off); | |
| hit.refcs[off] = "ACGT"[ref->getBase(h.first, h.second+i)]; | |
| } | |
| } | |
| if(colExEnds) { | |
| // Extreme bases have been removed; that makes the | |
| // nucleotide alignment one character shorter than the | |
| // color alignment | |
| qlen--; mlen--; | |
| // It also shifts the alignment's offset up by 1 | |
| h.second++; | |
| } else { | |
| // Extreme bases are included; that makes the | |
| // nucleotide alignment one character longer than the | |
| // color alignment | |
| qlen++; mlen++; | |
| } | |
| } else { | |
| // Turn the mmui32 and refcs arrays into the mm FixedBitset and | |
| // the refc vector | |
| hit.refcs.resize(qlen, 0); | |
| for(size_t i = 0; i < numMms; i++) { | |
| if (ebwtFw != _fw) { | |
| // The 3' end is on the left but the mm vector encodes | |
| // mismatches w/r/t the 5' end, so we flip | |
| uint32_t off = qlen - mmui32[i] - 1; | |
| hit.mms.set(off); | |
| hit.refcs[off] = refcs[i]; | |
| } else { | |
| hit.mms.set(mmui32[i]); | |
| hit.refcs[mmui32[i]] = refcs[i]; | |
| } | |
| } | |
| } | |
| // Check the hit against the original text, if it's available | |
| if(_texts.size() > 0) { | |
| assert_lt(h.first, _texts.size()); | |
| FixedBitset<1024> diffs; | |
| // This type of check assumes that only mismatches are | |
| // possible. If indels are possible, then we either need | |
| // the caller to provide information about indel locations, | |
| // or we need to extend this to a more complicated check. | |
| assert_leq(h.second + qlen, length(_texts[h.first])); | |
| for(size_t i = 0; i < qlen; i++) { | |
| assert_neq(4, (int)_texts[h.first][h.second + i]); | |
| // Forward pattern appears at h | |
| if((int)hit.patSeq[i] != (int)_texts[h.first][h.second + i]) { | |
| uint32_t qoff = (uint32_t)i; | |
| // if ebwtFw != _fw the 3' end is on on the | |
| // left end of the pattern, but the diff vector | |
| // should encode mismatches w/r/t the 5' end, | |
| // so we flip | |
| if (_fw) diffs.set(qoff); | |
| else diffs.set(qlen - qoff - 1); | |
| } | |
| } | |
| if(diffs != hit.mms) { | |
| // Oops, mismatches were not where we expected them; | |
| // print a diagnostic message before asserting | |
| cerr << "Expected " << hit.mms.str() << " mismatches, got " << diffs.str() << endl; | |
| cerr << " Pat: " << hit.patSeq << endl; | |
| cerr << " Tseg: "; | |
| for(size_t i = 0; i < qlen; i++) { | |
| cerr << _texts[h.first][h.second + i]; | |
| } | |
| cerr << endl; | |
| cerr << " mmui32: "; | |
| for(size_t i = 0; i < numMms; i++) { | |
| cerr << mmui32[i] << " "; | |
| } | |
| cerr << endl; | |
| cerr << " FW: " << _fw << endl; | |
| cerr << " Ebwt FW: " << ebwtFw << endl; | |
| } | |
| if(diffs != hit.mms) assert(false); | |
| } | |
| hit.h = h; | |
| if(rmap != NULL) rmap->map(hit.h); | |
| hit.patId = ((patid == 0xffffffff) ? _patid : patid); | |
| hit.patName = *name; | |
| hit.mh = mh; | |
| hit.fw = _fw; | |
| hit.mfw = mfw; | |
| hit.mlen = mlen; | |
| hit.oms = oms; | |
| hit.mate = mate; | |
| hit.color = color; | |
| hit.primer = primer; | |
| hit.trimc = trimc; | |
| hit.seed = seed; | |
| assert(hit.repOk()); | |
| return sink().reportHit(hit, stratum); | |
| } | |
| private: | |
| HitSinkPerThread& _sink; | |
| const vector<String<Dna5> >& _texts; // original texts, if available (if not | |
| // available, _texts.size() == 0) | |
| uint32_t _patid; // id of current read | |
| bool _fw; // current read is forward-oriented | |
| }; | |
| /** | |
| * Encapsulates a location in the bwt text in terms of the side it | |
| * occurs in and its offset within the side. | |
| */ | |
| struct SideLocus { | |
| SideLocus() : | |
| _sideByteOff(0), | |
| _sideNum(0), | |
| _charOff(0), | |
| _fw(true), | |
| _by(-1), | |
| _bp(-1) { } | |
| /** | |
| * Construct from row and other relevant information about the Ebwt. | |
| */ | |
| SideLocus(TIndexOffU row, const EbwtParams& ep, const uint8_t* ebwt) { | |
| initFromRow(row, ep, ebwt); | |
| } | |
| /** | |
| * Init two SideLocus objects from a top/bot pair, using the result | |
| * from one call to initFromRow to possibly avoid a second call. | |
| */ | |
| static void initFromTopBot(TIndexOffU top, | |
| TIndexOffU bot, | |
| const EbwtParams& ep, | |
| const uint8_t* ebwt, | |
| SideLocus& ltop, | |
| SideLocus& lbot) | |
| { | |
| const TIndexOffU sideBwtLen = ep._sideBwtLen; | |
| const uint32_t sideBwtSz = ep._sideBwtSz; | |
| assert_gt(bot, top); | |
| ltop.initFromRow(top, ep, ebwt); | |
| TIndexOffU spread = bot - top; | |
| if(ltop._charOff + spread < sideBwtLen) { | |
| lbot._charOff = (uint32_t)(ltop._charOff + spread); | |
| lbot._sideNum = ltop._sideNum; | |
| lbot._sideByteOff = ltop._sideByteOff; | |
| lbot._fw = ltop._fw; | |
| lbot._by = lbot._charOff >> 2; | |
| assert_lt(lbot._by, (int)sideBwtSz); | |
| if(!lbot._fw) lbot._by = sideBwtSz - lbot._by - 1; | |
| lbot._bp = lbot._charOff & 3; | |
| if(!lbot._fw) lbot._bp ^= 3; | |
| } else { | |
| lbot.initFromRow(bot, ep, ebwt); | |
| } | |
| } | |
| /** | |
| * Calculate SideLocus based on a row and other relevant | |
| * information about the shape of the Ebwt. | |
| */ | |
| void initFromRow(TIndexOffU row, const EbwtParams& ep, const uint8_t* ebwt) { | |
| const uint32_t sideSz = ep._sideSz; | |
| // Side length is hard-coded for now; this allows the compiler | |
| // to do clever things to accelerate / and %. | |
| _sideNum = row / (56*OFF_SIZE); | |
| _charOff = row % (56*OFF_SIZE); | |
| _sideByteOff = _sideNum * sideSz; | |
| assert_leq(row, ep._len); | |
| assert_leq(_sideByteOff + sideSz, ep._ebwtTotSz); | |
| #ifndef NO_PREFETCH | |
| __builtin_prefetch((const void *)(ebwt + _sideByteOff), | |
| 0 /* prepare for read */, | |
| PREFETCH_LOCALITY); | |
| #endif | |
| // prefetch this side too | |
| _fw = (_sideNum & 1) != 0; // odd-numbered sides are forward | |
| _by = _charOff >> 2; // byte within side | |
| assert_lt(_by, (int)ep._sideBwtSz); | |
| _bp = _charOff & 3; // bit-pair within byte | |
| if(!_fw) { | |
| _by = ep._sideBwtSz - _by - 1; | |
| _bp ^= 3; | |
| } | |
| } | |
| /// Return true iff this is an initialized SideLocus | |
| bool valid() { | |
| return _bp != -1; | |
| } | |
| /// Make this look like an invalid SideLocus | |
| void invalidate() { | |
| _bp = -1; | |
| } | |
| const uint8_t *side(const uint8_t* ebwt) const { | |
| return ebwt + _sideByteOff; | |
| } | |
| const uint8_t *oside(const uint8_t* ebwt) const { | |
| return ebwt + _sideByteOff + (_fw? (-128) : (128)); | |
| } | |
| TIndexOffU _sideByteOff; // offset of top side within ebwt[] | |
| TIndexOffU _sideNum; // index of side | |
| uint16_t _charOff; // character offset within side | |
| bool _fw; // side is forward or backward? | |
| int16_t _by; // byte within side (not adjusted for bw sides) | |
| int8_t _bp; // bitpair within byte (not adjusted for bw sides) | |
| }; | |
| #include "ebwt_search_backtrack.h" | |
| /////////////////////////////////////////////////////////////////////// | |
| // | |
| // Functions for printing and sanity-checking Ebwts | |
| // | |
| /////////////////////////////////////////////////////////////////////// | |
| /** | |
| * Given a range of positions in the EBWT array within the BWT portion | |
| * of a forward side, print the characters at those positions along | |
| * with a summary occ[] array. | |
| */ | |
| template<typename TStr> | |
| void Ebwt<TStr>::printRangeFw(uint32_t begin, uint32_t end) const { | |
| assert(isInMemory()); | |
| uint32_t occ[] = {0, 0, 0, 0}; | |
| assert_gt(end, begin); | |
| for(uint32_t i = begin; i < end; i++) { | |
| uint8_t by = this->_ebwt[i]; | |
| for(int j = 0; j < 4; j++) { | |
| // Unpack from lowest to highest bit pair | |
| int twoBit = unpack_2b_from_8b(by, j); | |
| occ[twoBit]++; | |
| cout << "ACGT"[twoBit]; | |
| } | |
| assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3); | |
| } | |
| cout << ":{" << occ[0] << "," << occ[1] << "," << occ[2] << "," << occ[3] << "}" << endl; | |
| } | |
| /** | |
| * Given a range of positions in the EBWT array within the BWT portion | |
| * of a backward side, print the characters at those positions along | |
| * with a summary occ[] array. | |
| */ | |
| template<typename TStr> /* check - update */ | |
| void Ebwt<TStr>::printRangeBw(uint32_t begin, uint32_t end) const { | |
| assert(isInMemory()); | |
| uint32_t occ[] = {0, 0, 0, 0}; | |
| assert_gt(end, begin); | |
| for(uint32_t i = end-1; i >= begin; i--) { | |
| uint8_t by = this->_ebwt[i]; | |
| for(int j = 3; j >= 0; j--) { | |
| // Unpack from lowest to highest bit pair | |
| int twoBit = unpack_2b_from_8b(by, j); | |
| occ[twoBit]++; | |
| cout << "ACGT"[twoBit]; | |
| } | |
| assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3); | |
| if(i == 0) break; | |
| } | |
| cout << ":{" << occ[0] << "," << occ[1] << "," << occ[2] << "," << occ[3] << "}" << endl; | |
| } | |
| /** | |
| * Check that the ebwt array is internally consistent up to (and not | |
| * including) the given side index by re-counting the chars and | |
| * comparing against the embedded occ[] arrays. | |
| */ | |
| template<typename TStr> | |
| void Ebwt<TStr>::sanityCheckUpToSide(TIndexOff upToSide) const { | |
| assert(isInMemory()); | |
| TIndexOffU occ[] = {0, 0, 0, 0}; | |
| ASSERT_ONLY(TIndexOffU occ_save[] = {0, 0}); | |
| TIndexOffU cur = 0; // byte pointer | |
| const EbwtParams& eh = this->_eh; | |
| bool fw = false; | |
| while(cur < (TIndexOffU)(upToSide * eh._sideSz)) { | |
| assert_leq(cur + eh._sideSz, eh._ebwtTotLen); | |
| for(uint32_t i = 0; i < eh._sideBwtSz; i++) { | |
| uint8_t by = this->_ebwt[cur + (fw ? i : eh._sideBwtSz-i-1)]; | |
| for(int j = 0; j < 4; j++) { | |
| // Unpack from lowest to highest bit pair | |
| int twoBit = unpack_2b_from_8b(by, fw ? j : 3-j); | |
| occ[twoBit]++; | |
| //if(_verbose) cout << "ACGT"[twoBit]; | |
| } | |
| assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) % 4); | |
| } | |
| assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) % eh._sideBwtLen); | |
| if(fw) { | |
| // Finished forward bucket; check saved [G] and [T] | |
| // against the two uint32_ts encoded here | |
| ASSERT_ONLY(TIndexOffU *u32ebwt = reinterpret_cast<TIndexOffU*>(&this->_ebwt[cur + eh._sideBwtSz])); | |
| ASSERT_ONLY(TIndexOffU gs = u32ebwt[0]); | |
| ASSERT_ONLY(TIndexOffU ts = u32ebwt[1]); | |
| assert_eq(gs, occ_save[0]); | |
| assert_eq(ts, occ_save[1]); | |
| fw = false; | |
| } else { | |
| // Finished backward bucket; check current [A] and [C] | |
| // against the two uint32_ts encoded here | |
| ASSERT_ONLY(TIndexOffU *u32ebwt = reinterpret_cast<TIndexOffU*>(&this->_ebwt[cur + eh._sideBwtSz])); | |
| ASSERT_ONLY(TIndexOffU as = u32ebwt[0]); | |
| ASSERT_ONLY(TIndexOffU cs = u32ebwt[1]); | |
| assert(as == occ[0] || as == occ[0]-1); // one 'a' is a skipped '$' and doesn't count toward occ[] | |
| assert_eq(cs, occ[1]); | |
| ASSERT_ONLY(occ_save[0] = occ[2]); // save gs | |
| ASSERT_ONLY(occ_save[1] = occ[3]); // save ts | |
| fw = true; | |
| } | |
| cur += eh._sideSz; | |
| } | |
| } | |
| /** | |
| * Sanity-check various pieces of the Ebwt | |
| */ | |
| template<typename TStr> | |
| void Ebwt<TStr>::sanityCheckAll(int reverse) const { | |
| const EbwtParams& eh = this->_eh; | |
| assert(isInMemory()); | |
| // Check ftab | |
| for(TIndexOffU i = 1; i < eh._ftabLen; i++) { | |
| assert_geq(this->ftabHi(i), this->ftabLo(i-1)); | |
| assert_geq(this->ftabLo(i), this->ftabHi(i-1)); | |
| assert_leq(this->ftabHi(i), eh._bwtLen+1); | |
| } | |
| assert_eq(this->ftabHi(eh._ftabLen-1), eh._bwtLen); | |
| // Check offs | |
| TIndexOff seenLen = (eh._bwtLen + 31) >> ((TIndexOffU)5); | |
| TIndexOff *seen; | |
| try { | |
| seen = new TIndexOff[seenLen]; // bitvector marking seen offsets | |
| } catch(bad_alloc& e) { | |
| cerr << "Out of memory allocating seen[] at " << __FILE__ << ":" << __LINE__ << endl; | |
| throw e; | |
| } | |
| memset(seen, 0, OFF_SIZE * seenLen); | |
| TIndexOffU offsLen = eh._offsLen; | |
| for(TIndexOffU i = 0; i < offsLen; i++) { | |
| assert_lt(this->_offs[i], eh._bwtLen); | |
| TIndexOff w = this->_offs[i] >> 5; | |
| TIndexOff r = this->_offs[i] & 31; | |
| assert_eq(0, (seen[w] >> r) & 1); // shouldn't have been seen before | |
| seen[w] |= (1 << r); | |
| } | |
| delete[] seen; | |
| // Check nPat | |
| assert_gt(this->_nPat, 0); | |
| // Check plen, flen | |
| for(TIndexOffU i = 0; i < this->_nPat; i++) { | |
| assert_geq(this->_plen[i], 0); | |
| } | |
| // Check rstarts | |
| for(TIndexOffU i = 0; i < this->_nFrag-1; i++) { | |
| assert_gt(this->_rstarts[(i+1)*3], this->_rstarts[i*3]); | |
| if(reverse == REF_READ_REVERSE) { | |
| assert(this->_rstarts[(i*3)+1] >= this->_rstarts[((i+1)*3)+1]); | |
| } else { | |
| assert(this->_rstarts[(i*3)+1] <= this->_rstarts[((i+1)*3)+1]); | |
| } | |
| } | |
| // Check ebwt | |
| sanityCheckUpToSide(eh._numSides); | |
| VMSG_NL("Ebwt::sanityCheck passed"); | |
| } | |
| /////////////////////////////////////////////////////////////////////// | |
| // | |
| // Functions for searching Ebwts | |
| // | |
| /////////////////////////////////////////////////////////////////////// | |
| /** | |
| * Return the final character in row i (i.e. the i'th character in the | |
| * BWT transform). Note that the 'L' in the name of the function | |
| * stands for 'last', as in the literature. | |
| */ | |
| template<typename TStr> | |
| inline int Ebwt<TStr>::rowL(const SideLocus& l) const { | |
| // Extract and return appropriate bit-pair | |
| #ifdef SIXTY4_FORMAT | |
| return (((uint64_t*)l.side(this->_ebwt))[l._by >> 3] >> ((((l._by & 7) << 2) + l._bp) << 1)) & 3; | |
| #else | |
| return unpack_2b_from_8b(l.side(this->_ebwt)[l._by], l._bp); | |
| #endif | |
| } | |
| /** | |
| * Inline-function version of the above. This does not always seem to | |
| * be inlined | |
| */ | |
| #if 0 | |
| // Use gcc's intrinsic popcountll. I don't recommend it because it | |
| // seems to be somewhat slower than the bit-bashing pop64 routine both | |
| // on an AMD server and on an Intel workstation. On the other hand, | |
| // perhaps when the builtin is used GCC is smart enough to insert a | |
| // pop-count instruction on architectures that have one (e.g. Itanium). | |
| // For now, it's disabled. | |
| #define pop64(x) __builtin_popcountll(x) | |
| #elif 0 | |
| __declspec naked int __stdcall pop64 | |
| (uint64_t v) | |
| { | |
| static const uint64_t C55 = 0x5555555555555555ll; | |
| static const uint64_t C33 = 0x3333333333333333ll; | |
| static const uint64_t C0F = 0x0F0F0F0F0F0F0F0Fll; | |
| __asm { | |
| MOVD MM0, [ESP+4] ;v_low | |
| PUNPCKLDQ MM0, [ESP+8] ;v | |
| MOVQ MM1, MM0 ;v | |
| PSRLD MM0, 1 ;v >> 1 | |
| PAND MM0, [C55] ;(v >> 1) & 0x55555555 | |
| PSUBD MM1, MM0 ;w = v - ((v >> 1) & 0x55555555) | |
| MOVQ MM0, MM1 ;w | |
| PSRLD MM1, 2 ;w >> 2 | |
| PAND MM0, [C33] ;w & 0x33333333 | |
| PAND MM1, [C33] ;(w >> 2) & 0x33333333 | |
| PADDD MM0, MM1 ;x = (w & 0x33333333) + | |
| ; ((w >> 2) & 0x33333333) | |
| MOVQ MM1, MM0 ;x | |
| PSRLD MM0, 4 ;x >> 4 | |
| PADDD MM0, MM1 ;x + (x >> 4) | |
| PAND MM0, [C0F] ;y = (x + (x >> 4) & 0x0F0F0F0F) | |
| PXOR MM1, MM1 ;0 | |
| PSADBW (MM0, MM1) ;sum across all 8 bytes | |
| MOVD EAX, MM0 ;result in EAX per calling | |
| ; convention | |
| EMMS ;clear MMX state | |
| RET 8 ;pop 8-byte argument off stack | |
| ; and return | |
| } | |
| } | |
| #elif 0 | |
| // Use a bytewise LUT version of popcount. This is slower than the | |
| // bit-bashing pop64 routine both on an AMD server and on an Intel | |
| // workstation. It seems to be about the same speed as the GCC builtin | |
| // on Intel, and a bit faster than it on AMD. For now, it's disabled. | |
| const int popcntU8Table[256] = { | |
| 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5, | |
| 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, | |
| 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, | |
| 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, | |
| 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, | |
| 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, | |
| 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, | |
| 3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8 | |
| }; | |
| // Use this bytewise population count table | |
| inline static int pop64(uint64_t x) { | |
| const unsigned char * p = (const unsigned char *) &x; | |
| return popcntU8Table[p[0]] + | |
| popcntU8Table[p[1]] + | |
| popcntU8Table[p[2]] + | |
| popcntU8Table[p[3]] + | |
| popcntU8Table[p[4]] + | |
| popcntU8Table[p[5]] + | |
| popcntU8Table[p[6]] + | |
| popcntU8Table[p[7]]; | |
| } | |
| #else | |
| #ifdef POPCNT_CAPABILITY // wrapping of "struct" | |
| struct USE_POPCNT_GENERIC { | |
| #endif | |
| // Use this standard bit-bashing population count | |
| inline static int pop64(uint64_t x) { | |
| x = x - ((x >> 1) & 0x5555555555555555llu); | |
| x = (x & 0x3333333333333333llu) + ((x >> 2) & 0x3333333333333333llu); | |
| x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Fllu; | |
| x = x + (x >> 8); | |
| x = x + (x >> 16); | |
| x = x + (x >> 32); | |
| return x & 0x3F; | |
| } | |
| #ifdef POPCNT_CAPABILITY // wrapping a "struct" | |
| }; | |
| #endif | |
| #ifdef POPCNT_CAPABILITY | |
| struct USE_POPCNT_INSTRUCTION { | |
| inline static int pop64(uint64_t x) { | |
| int64_t count; | |
| asm ("popcntq %[x],%[count]\n": [count] "=&r" (count): [x] "r" (x)); | |
| return count; | |
| } | |
| }; | |
| #endif | |
| #endif | |
| /** | |
| * Tricky-bit-bashing bitpair counting for given two-bit value (0-3) | |
| * within a 64-bit argument. | |
| */ | |
| #ifdef POPCNT_CAPABILITY | |
| template<typename Operation> | |
| #endif | |
| inline static int countInU64(int c, uint64_t dw) { | |
| uint64_t c0 = c_table[c]; | |
| uint64_t x0 = dw ^ c0; | |
| uint64_t x1 = (x0 >> 1); | |
| uint64_t x2 = x1 & (0x5555555555555555llu); | |
| uint64_t x3 = x0 & x2; | |
| #ifdef POPCNT_CAPABILITY | |
| uint64_t tmp = Operation().pop64(x3); | |
| #else | |
| uint64_t tmp = pop64(x3); | |
| #endif | |
| return (int) tmp; | |
| } | |
| /** | |
| * Tricky-bit-bashing bitpair counting for given two-bit value (0-3) | |
| * within a 64-bit argument. | |
| * | |
| * Function gets 2.32% in profile | |
| */ | |
| #ifdef POPCNT_CAPABILITY | |
| template<typename Operation> | |
| #endif | |
| inline static void countInU64Ex(uint64_t dw, TIndexOffU* arrs) { | |
| uint64_t c0 = c_table[0]; | |
| uint64_t x0 = dw ^ c0; | |
| uint64_t x1 = (x0 >> 1); | |
| uint64_t x2 = x1 & (0x5555555555555555llu); | |
| uint64_t x3 = x0 & x2; | |
| #ifdef POPCNT_CAPABILITY | |
| uint64_t tmp = Operation().pop64(x3); | |
| #else | |
| uint64_t tmp = pop64(x3); | |
| #endif | |
| arrs[0] += (uint32_t) tmp; | |
| c0 = c_table[1]; | |
| x0 = dw ^ c0; | |
| x1 = (x0 >> 1); | |
| x2 = x1 & (0x5555555555555555llu); | |
| x3 = x0 & x2; | |
| #ifdef POPCNT_CAPABILITY | |
| tmp = Operation().pop64(x3); | |
| #else | |
| tmp = pop64(x3); | |
| #endif | |
| arrs[1] += (uint32_t) tmp; | |
| c0 = c_table[2]; | |
| x0 = dw ^ c0; | |
| x1 = (x0 >> 1); | |
| x2 = x1 & (0x5555555555555555llu); | |
| x3 = x0 & x2; | |
| #ifdef POPCNT_CAPABILITY | |
| tmp = Operation().pop64(x3); | |
| #else | |
| tmp = pop64(x3); | |
| #endif | |
| arrs[2] += (uint32_t) tmp; | |
| c0 = c_table[3]; | |
| x0 = dw ^ c0; | |
| x1 = (x0 >> 1); | |
| x2 = x1 & (0x5555555555555555llu); | |
| x3 = x0 & x2; | |
| #ifdef POPCNT_CAPABILITY | |
| tmp = Operation().pop64(x3); | |
| #else | |
| tmp = pop64(x3); | |
| #endif | |
| arrs[3] += (uint32_t) tmp; | |
| } | |
| /** | |
| * Counts the number of occurrences of character 'c' in the given Ebwt | |
| * side up to (but not including) the given byte/bitpair (by/bp). | |
| * | |
| * This is a performance-critical function. This is the top search- | |
| * related hit in the time profile. | |
| * | |
| * Function gets 11.09% in profile | |
| */ | |
| template<typename TStr> | |
| inline TIndexOffU Ebwt<TStr>::countUpTo(const SideLocus& l, int c) const { | |
| // Count occurrences of c in each 64-bit (using bit trickery); | |
| // Someday countInU64() and pop() functions should be | |
| // vectorized/SSE-ized in case that helps. | |
| TIndexOffU cCnt = 0; | |
| const uint8_t *side = l.side(this->_ebwt); | |
| int i = 0; | |
| #if 1 | |
| #ifdef POPCNT_CAPABILITY | |
| if ( _usePOPCNTinstruction) { | |
| for(; i + 7 < l._by; i += 8) { | |
| cCnt += countInU64<USE_POPCNT_INSTRUCTION>(c, *(uint64_t*)&side[i]); | |
| } | |
| } | |
| else { | |
| for(; i + 7 < l._by; i += 8) { | |
| cCnt += countInU64<USE_POPCNT_GENERIC>(c, *(uint64_t*)&side[i]); | |
| } | |
| } | |
| #else | |
| for(; i + 7 < l._by; i += 8) { | |
| cCnt += countInU64(c, *(uint64_t*)&side[i]); | |
| } | |
| #endif | |
| #else | |
| for(; i + 2 < l._by; i += 2) { | |
| cCnt += cCntLUT_16b_4[c][*(uint16_t*)&side[i]]; | |
| } | |
| #endif | |
| #ifdef SIXTY4_FORMAT | |
| // Calculate number of bit pairs to shift off the end | |
| const int bpShiftoff = 32 - (((l._by & 7) << 2) + l._bp); | |
| if(bpShiftoff < 32) { | |
| assert_lt(bpShiftoff, 32); | |
| const uint64_t sw = (*(uint64_t*)&side[i]) << (bpShiftoff << 1); | |
| #ifdef POPCNT_CAPABILITY | |
| if (_usePOPCNTinstruction) { | |
| cCnt += countInU64<USE_POPCNT_INSTRUCTION>(c, sw); | |
| } | |
| else { | |
| cCnt += countInU64<USE_POPCNT_GENERIC>(c, sw); | |
| } | |
| #else | |
| cCnt += countInU64(c, sw); | |
| #endif | |
| if(c == 0) cCnt -= bpShiftoff; // we turned these into As | |
| } | |
| #else | |
| // Count occurences of c in the rest of the side (using LUT) | |
| for(; i < l._by; i++) { | |
| cCnt += cCntLUT_4[0][c][side[i]]; | |
| } | |
| // Count occurences of c in the rest of the byte | |
| if(l._bp > 0) { | |
| cCnt += cCntLUT_4[(int)l._bp][c][side[i]]; | |
| } | |
| #endif | |
| return cCnt; | |
| } | |
| /** | |
| * Counts the number of occurrences of character 'c' in the given Ebwt | |
| * side up to (but not including) the given byte/bitpair (by/bp). | |
| */ | |
| template<typename TStr> | |
| inline void Ebwt<TStr>::countUpToEx(const SideLocus& l, TIndexOffU* arrs) const { | |
| int i = 0; | |
| // Count occurrences of c in each 64-bit (using bit trickery); | |
| // note: this seems does not seem to lend a significant boost to | |
| // performance. If you comment out this whole loop (which won't | |
| // affect correctness - it will just cause the following loop to | |
| // take up the slack) then runtime does not change noticeably. | |
| // Someday the countInU64() and pop() functions should be | |
| // vectorized/SSE-ized in case that helps. | |
| const uint8_t *side = l.side(this->_ebwt); | |
| #ifdef POPCNT_CAPABILITY | |
| if (_usePOPCNTinstruction) { | |
| for(; i+7 < l._by; i += 8) { | |
| countInU64Ex<USE_POPCNT_INSTRUCTION>(*(uint64_t*)&side[i], arrs); | |
| } | |
| } | |
| else { | |
| for(; i+7 < l._by; i += 8) { | |
| countInU64Ex<USE_POPCNT_GENERIC>(*(uint64_t*)&side[i], arrs); | |
| } | |
| } | |
| #else | |
| for(; i+7 < l._by; i += 8) { | |
| countInU64Ex(*(uint64_t*)&side[i], arrs); | |
| } | |
| #endif | |
| #ifdef SIXTY4_FORMAT | |
| // Calculate number of bit pairs to shift off the end | |
| const int bpShiftoff = 32 - (((l._by & 7) << 2) + l._bp); | |
| assert_leq(bpShiftoff, 32); | |
| if(bpShiftoff < 32) { | |
| const uint64_t sw = (*(uint64_t*)&l.side(this->_ebwt)[i]) << (bpShiftoff << 1); | |
| #ifdef POPCNT_CAPABILITY | |
| if (_usePOPCNTinstruction) { | |
| countInU64Ex<USE_POPCNT_INSTRUCTION>(sw, arrs); | |
| } | |
| else{ | |
| countInU64Ex<USE_POPCNT_GENERIC>(sw, arrs); | |
| } | |
| #else | |
| countInU64Ex(sw, arrs); | |
| #endif | |
| arrs[0] -= bpShiftoff; | |
| } | |
| #else | |
| // Count occurences of c in the rest of the side (using LUT) | |
| for(; i < l._by; i++) { | |
| arrs[0] += cCntLUT_4[0][0][side[i]]; | |
| arrs[1] += cCntLUT_4[0][1][side[i]]; | |
| arrs[2] += cCntLUT_4[0][2][side[i]]; | |
| arrs[3] += cCntLUT_4[0][3][side[i]]; | |
| } | |
| // Count occurences of c in the rest of the byte | |
| if(l._bp > 0) { | |
| arrs[0] += cCntLUT_4[(int)l._bp][0][side[i]]; | |
| arrs[1] += cCntLUT_4[(int)l._bp][1][side[i]]; | |
| arrs[2] += cCntLUT_4[(int)l._bp][2][side[i]]; | |
| arrs[3] += cCntLUT_4[(int)l._bp][3][side[i]]; | |
| } | |
| #endif | |
| } | |
| /** | |
| * Count all occurrences of character c from the beginning of the | |
| * forward side to <by,bp> and add in the occ[] count up to the side | |
| * break just prior to the side. | |
| */ | |
| template<typename TStr> | |
| inline TIndexOffU Ebwt<TStr>::countFwSide(const SideLocus& l, int c) const { /* check */ | |
| assert_lt(c, 4); | |
| assert_geq(c, 0); | |
| assert_lt(l._by, (int)this->_eh._sideBwtSz); | |
| assert_geq(l._by, 0); | |
| assert_lt(l._bp, 4); | |
| assert_geq(l._bp, 0); | |
| const uint8_t *side = l.side(this->_ebwt); | |
| TIndexOffU cCnt = countUpTo(l, c); | |
| assert_leq(cCnt, this->_eh._sideBwtLen); | |
| if(c == 0 && l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) { | |
| // Adjust for the fact that we represented $ with an 'A', but | |
| // shouldn't count it as an 'A' here | |
| if((l._sideByteOff + l._by > _zEbwtByteOff) || | |
| (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff)) | |
| { | |
| cCnt--; // Adjust for '$' looking like an 'A' | |
| } | |
| } | |
| TIndexOffU ret; | |
| // Now factor in the occ[] count at the side break | |
| if(c < 2) { | |
| const TIndexOffU *ac = reinterpret_cast<const TIndexOffU*>(side - 2*OFF_SIZE); | |
| assert_leq(ac[0], this->_eh._numSides * this->_eh._sideBwtLen); // b/c it's used as padding | |
| assert_leq(ac[1], this->_eh._len); | |
| ret = ac[c] + cCnt + this->_fchr[c]; | |
| } else { | |
| const TIndexOffU *gt = reinterpret_cast<const TIndexOffU*>(side + this->_eh._sideSz - 2*OFF_SIZE); // next | |
| assert_leq(gt[0], this->_eh._len); assert_leq(gt[1], this->_eh._len); | |
| ret = gt[c-2] + cCnt + this->_fchr[c]; | |
| } | |
| #ifndef NDEBUG | |
| assert_leq(ret, this->_fchr[c+1]); // can't have jumpded into next char's section | |
| if(c == 0) { | |
| assert_leq(cCnt, this->_eh._sideBwtLen); | |
| } else { | |
| assert_leq(ret, this->_eh._bwtLen); | |
| } | |
| #endif | |
| return ret; | |
| } | |
| /** | |
| * Count all occurrences of character c from the beginning of the | |
| * forward side to <by,bp> and add in the occ[] count up to the side | |
| * break just prior to the side. | |
| */ | |
| template<typename TStr> | |
| inline void Ebwt<TStr>::countFwSideEx(const SideLocus& l, TIndexOffU* arrs) const | |
| { | |
| assert_lt(l._by, (int)this->_eh._sideBwtSz); | |
| assert_geq(l._by, 0); | |
| assert_lt(l._bp, 4); | |
| assert_geq(l._bp, 0); | |
| countUpToEx(l, arrs); | |
| #ifndef NDEBUG | |
| assert_leq(arrs[0], this->_fchr[1]); // can't have jumped into next char's section | |
| assert_leq(arrs[1], this->_fchr[2]); // can't have jumped into next char's section | |
| assert_leq(arrs[2], this->_fchr[3]); // can't have jumped into next char's section | |
| assert_leq(arrs[3], this->_fchr[4]); // can't have jumped into next char's section | |
| #endif | |
| assert_leq(arrs[0], this->_eh._sideBwtLen); | |
| assert_leq(arrs[1], this->_eh._sideBwtLen); | |
| assert_leq(arrs[2], this->_eh._sideBwtLen); | |
| assert_leq(arrs[3], this->_eh._sideBwtLen); | |
| const uint8_t *side = l.side(this->_ebwt); | |
| if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) { | |
| // Adjust for the fact that we represented $ with an 'A', but | |
| // shouldn't count it as an 'A' here | |
| if((l._sideByteOff + l._by > _zEbwtByteOff) || | |
| (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff)) | |
| { | |
| arrs[0]--; // Adjust for '$' looking like an 'A' | |
| } | |
| } | |
| // Now factor in the occ[] count at the side break | |
| const TIndexOffU *ac = reinterpret_cast<const TIndexOffU*>(side - 2*OFF_SIZE); | |
| const TIndexOffU *gt = reinterpret_cast<const TIndexOffU*>(side + this->_eh._sideSz - 2*OFF_SIZE); | |
| #ifndef NDEBUG | |
| assert_leq(ac[0], this->_fchr[1] + this->_eh.sideBwtLen()); | |
| assert_leq(ac[1], this->_fchr[2]-this->_fchr[1]); | |
| assert_leq(gt[0], this->_fchr[3]-this->_fchr[2]); | |
| assert_leq(gt[1], this->_fchr[4]-this->_fchr[3]); | |
| #endif | |
| assert_leq(ac[0], this->_eh._len + this->_eh.sideBwtLen()); assert_leq(ac[1], this->_eh._len); | |
| assert_leq(gt[0], this->_eh._len); assert_leq(gt[1], this->_eh._len); | |
| arrs[0] += (ac[0] + this->_fchr[0]); | |
| arrs[1] += (ac[1] + this->_fchr[1]); | |
| arrs[2] += (gt[0] + this->_fchr[2]); | |
| arrs[3] += (gt[1] + this->_fchr[3]); | |
| #ifndef NDEBUG | |
| assert_leq(arrs[0], this->_fchr[1]); // can't have jumpded into next char's section | |
| assert_leq(arrs[1], this->_fchr[2]); // can't have jumpded into next char's section | |
| assert_leq(arrs[2], this->_fchr[3]); // can't have jumpded into next char's section | |
| assert_leq(arrs[3], this->_fchr[4]); // can't have jumpded into next char's section | |
| #endif | |
| } | |
| /** | |
| * Count all instances of character c from <by,bp> to the logical end | |
| * (actual beginning) of the backward side, and subtract that from the | |
| * occ[] count up to the side break. | |
| */ | |
| template<typename TStr> | |
| inline TIndexOffU Ebwt<TStr>::countBwSide(const SideLocus& l, int c) const { | |
| assert_lt(c, 4); | |
| assert_geq(c, 0); | |
| assert_lt(l._by, (int)this->_eh._sideBwtSz); | |
| assert_geq(l._by, 0); | |
| assert_lt(l._bp, 4); | |
| assert_geq(l._bp, 0); | |
| const uint8_t *side = l.side(this->_ebwt); | |
| TIndexOffU cCnt = countUpTo(l, c); | |
| if(rowL(l) == c) cCnt++; | |
| assert_leq(cCnt, this->_eh._sideBwtLen); | |
| if(c == 0 && l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) { | |
| // Adjust for the fact that we represented $ with an 'A', but | |
| // shouldn't count it as an 'A' here | |
| if((l._sideByteOff + l._by > _zEbwtByteOff) || | |
| (l._sideByteOff + l._by == _zEbwtByteOff && l._bp >= _zEbwtBpOff)) | |
| { | |
| cCnt--; | |
| } | |
| } | |
| TIndexOffU ret; | |
| // Now factor in the occ[] count at the side break | |
| if(c < 2) { | |
| const TIndexOffU *ac = reinterpret_cast<const TIndexOffU*>(side + this->_eh._sideSz - 2*OFF_SIZE); | |
| assert_leq(ac[0], this->_eh._numSides * this->_eh._sideBwtLen); // b/c it's used as padding | |
| assert_leq(ac[1], this->_eh._len); | |
| ret = ac[c] - cCnt + this->_fchr[c]; | |
| } else { | |
| const TIndexOffU *gt = reinterpret_cast<const TIndexOffU*>(side + (2*this->_eh._sideSz) - 2*OFF_SIZE); // next | |
| assert_leq(gt[0], this->_eh._len); assert_leq(gt[1], this->_eh._len); | |
| ret = gt[c-2] - cCnt + this->_fchr[c]; | |
| } | |
| #ifndef NDEBUG | |
| assert_leq(ret, this->_fchr[c+1]); // can't have jumped into next char's section | |
| if(c == 0) { | |
| assert_leq(cCnt, this->_eh._sideBwtLen); | |
| } else { | |
| assert_lt(ret, this->_eh._bwtLen); | |
| } | |
| #endif | |
| return ret; | |
| } | |
| /** | |
| * Count all instances of character c from <by,bp> to the logical end | |
| * (actual beginning) of the backward side, and subtract that from the | |
| * occ[] count up to the side break. | |
| */ | |
| template<typename TStr> | |
| inline void Ebwt<TStr>::countBwSideEx(const SideLocus& l, TIndexOffU* arrs) const { | |
| assert_lt(l._by, (int)this->_eh._sideBwtSz); | |
| assert_geq(l._by, 0); | |
| assert_lt(l._bp, 4); | |
| assert_geq(l._bp, 0); | |
| const uint8_t *side = l.side(this->_ebwt); | |
| countUpToEx(l, arrs); | |
| arrs[rowL(l)]++; | |
| assert_leq(arrs[0], this->_eh._sideBwtLen); | |
| assert_leq(arrs[1], this->_eh._sideBwtLen); | |
| assert_leq(arrs[2], this->_eh._sideBwtLen); | |
| assert_leq(arrs[3], this->_eh._sideBwtLen); | |
| if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) { | |
| // Adjust for the fact that we represented $ with an 'A', but | |
| // shouldn't count it as an 'A' here | |
| if((l._sideByteOff + l._by > _zEbwtByteOff) || | |
| (l._sideByteOff + l._by == _zEbwtByteOff && l._bp >= _zEbwtBpOff)) | |
| { | |
| arrs[0]--; // Adjust for '$' looking like an 'A' | |
| } | |
| } | |
| // Now factor in the occ[] count at the side break | |
| const TIndexOffU *ac = reinterpret_cast<const TIndexOffU*>(side + this->_eh._sideSz - 2*OFF_SIZE); | |
| const TIndexOffU *gt = reinterpret_cast<const TIndexOffU*>(side + (2*this->_eh._sideSz) - 2*OFF_SIZE); | |
| #ifndef NDEBUG | |
| assert_leq(ac[0], this->_fchr[1] + this->_eh.sideBwtLen()); | |
| assert_leq(ac[1], this->_fchr[2]-this->_fchr[1]); | |
| assert_leq(gt[0], this->_fchr[3]-this->_fchr[2]); | |
| assert_leq(gt[1], this->_fchr[4]-this->_fchr[3]); | |
| #endif | |
| assert_leq(ac[0], this->_eh._len + this->_eh.sideBwtLen()); assert_leq(ac[1], this->_eh._len); | |
| assert_leq(gt[0], this->_eh._len); assert_leq(gt[1], this->_eh._len); | |
| arrs[0] = (ac[0] - arrs[0] + this->_fchr[0]); | |
| arrs[1] = (ac[1] - arrs[1] + this->_fchr[1]); | |
| arrs[2] = (gt[0] - arrs[2] + this->_fchr[2]); | |
| arrs[3] = (gt[1] - arrs[3] + this->_fchr[3]); | |
| #ifndef NDEBUG | |
| assert_leq(arrs[0], this->_fchr[1]); // can't have jumped into next char's section | |
| assert_leq(arrs[1], this->_fchr[2]); // can't have jumped into next char's section | |
| assert_leq(arrs[2], this->_fchr[3]); // can't have jumped into next char's section | |
| assert_leq(arrs[3], this->_fchr[4]); // can't have jumped into next char's section | |
| #endif | |
| } | |
| /** | |
| * Given top and bot loci, calculate counts of all four DNA chars up to | |
| * those loci. Used for more advanced backtracking-search. | |
| */ | |
| template<typename TStr> | |
| inline void Ebwt<TStr>::mapLFEx(const SideLocus& ltop, | |
| const SideLocus& lbot, | |
| TIndexOffU *tops, | |
| TIndexOffU *bots | |
| ASSERT_ONLY(, bool overrideSanity) | |
| ) const | |
| { | |
| // TODO: Where there's overlap, reuse the count for the overlapping | |
| // portion | |
| #ifdef EBWT_STATS | |
| const_cast<Ebwt<TStr>*>(this)->mapLFExs_++; | |
| #endif | |
| assert_eq(0, tops[0]); assert_eq(0, bots[0]); | |
| assert_eq(0, tops[1]); assert_eq(0, bots[1]); | |
| assert_eq(0, tops[2]); assert_eq(0, bots[2]); | |
| assert_eq(0, tops[3]); assert_eq(0, bots[3]); | |
| if(ltop._fw) countFwSideEx(ltop, tops); // Forward side | |
| else countBwSideEx(ltop, tops); // Backward side | |
| if(lbot._fw) countFwSideEx(lbot, bots); // Forward side | |
| else countBwSideEx(lbot, bots); // Backward side | |
| #ifndef NDEBUG | |
| if(_sanity && !overrideSanity) { | |
| // Make sure results match up with individual calls to mapLF; | |
| // be sure to override sanity-checking in the callee, or we'll | |
| // have infinite recursion | |
| assert_eq(mapLF(ltop, 0, true), tops[0]); | |
| assert_eq(mapLF(ltop, 1, true), tops[1]); | |
| assert_eq(mapLF(ltop, 2, true), tops[2]); | |
| assert_eq(mapLF(ltop, 3, true), tops[3]); | |
| assert_eq(mapLF(lbot, 0, true), bots[0]); | |
| assert_eq(mapLF(lbot, 1, true), bots[1]); | |
| assert_eq(mapLF(lbot, 2, true), bots[2]); | |
| assert_eq(mapLF(lbot, 3, true), bots[3]); | |
| } | |
| #endif | |
| } | |
| #ifndef NDEBUG | |
| /** | |
| * Given top and bot loci, calculate counts of all four DNA chars up to | |
| * those loci. Used for more advanced backtracking-search. | |
| */ | |
| template<typename TStr> | |
| inline void Ebwt<TStr>::mapLFEx(const SideLocus& l, | |
| TIndexOffU *arrs | |
| ASSERT_ONLY(, bool overrideSanity) | |
| ) const | |
| { | |
| assert_eq(0, arrs[0]); | |
| assert_eq(0, arrs[1]); | |
| assert_eq(0, arrs[2]); | |
| assert_eq(0, arrs[3]); | |
| if(l._fw) countFwSideEx(l, arrs); // Forward side | |
| else countBwSideEx(l, arrs); // Backward side | |
| #ifndef NDEBUG | |
| if(_sanity && !overrideSanity) { | |
| // Make sure results match up with individual calls to mapLF; | |
| // be sure to override sanity-checking in the callee, or we'll | |
| // have infinite recursion | |
| assert_eq(mapLF(l, 0, true), arrs[0]); | |
| assert_eq(mapLF(l, 1, true), arrs[1]); | |
| assert_eq(mapLF(l, 2, true), arrs[2]); | |
| assert_eq(mapLF(l, 3, true), arrs[3]); | |
| } | |
| #endif | |
| } | |
| #endif | |
| /** | |
| * Given row i, return the row that the LF mapping maps i to. | |
| */ | |
| template<typename TStr> | |
| inline TIndexOffU Ebwt<TStr>::mapLF(const SideLocus& l | |
| ASSERT_ONLY(, bool overrideSanity) | |
| ) const | |
| { | |
| #ifdef EBWT_STATS | |
| const_cast<Ebwt<TStr>*>(this)->mapLFs_++; | |
| #endif | |
| TIndexOffU ret; | |
| assert(l.side(this->_ebwt) != NULL); | |
| int c = rowL(l); | |
| assert_lt(c, 4); | |
| assert_geq(c, 0); | |
| if(l._fw) ret = countFwSide(l, c); // Forward side | |
| else ret = countBwSide(l, c); // Backward side | |
| assert_lt(ret, this->_eh._bwtLen); | |
| #ifndef NDEBUG | |
| if(_sanity && !overrideSanity) { | |
| // Make sure results match up with results from mapLFEx; | |
| // be sure to override sanity-checking in the callee, or we'll | |
| // have infinite recursion | |
| TIndexOffU arrs[] = { 0, 0, 0, 0 }; | |
| mapLFEx(l, arrs, true); | |
| assert_eq(arrs[c], ret); | |
| } | |
| #endif | |
| return ret; | |
| } | |
| /** | |
| * Given row i and character c, return the row that the LF mapping maps | |
| * i to on character c. | |
| */ | |
| template<typename TStr> | |
| inline TIndexOffU Ebwt<TStr>::mapLF(const SideLocus& l, int c | |
| ASSERT_ONLY(, bool overrideSanity) | |
| ) const | |
| { | |
| #ifdef EBWT_STATS | |
| const_cast<Ebwt<TStr>*>(this)->mapLFcs_++; | |
| #endif | |
| TIndexOffU ret; | |
| assert_lt(c, 4); | |
| assert_geq(c, 0); | |
| if(l._fw) ret = countFwSide(l, c); // Forward side | |
| else ret = countBwSide(l, c); // Backward side | |
| assert_lt(ret, this->_eh._bwtLen); | |
| #ifndef NDEBUG | |
| if(_sanity && !overrideSanity) { | |
| // Make sure results match up with results from mapLFEx; | |
| // be sure to override sanity-checking in the callee, or we'll | |
| // have infinite recursion | |
| TIndexOffU arrs[] = { 0, 0, 0, 0 }; | |
| mapLFEx(l, arrs, true); | |
| assert_eq(arrs[c], ret); | |
| } | |
| #endif | |
| return ret; | |
| } | |
| /** | |
| * Given row i and character c, return the row that the LF mapping maps | |
| * i to on character c. | |
| */ | |
| template<typename TStr> | |
| inline TIndexOffU Ebwt<TStr>::mapLF1(TIndexOffU row, const SideLocus& l, int c | |
| ASSERT_ONLY(, bool overrideSanity) | |
| ) const | |
| { | |
| #ifdef EBWT_STATS | |
| const_cast<Ebwt<TStr>*>(this)->mapLF1cs_++; | |
| #endif | |
| if(rowL(l) != c || row == _zOff) return OFF_MASK; | |
| TIndexOffU ret; | |
| assert_lt(c, 4); | |
| assert_geq(c, 0); | |
| if(l._fw) ret = countFwSide(l, c); // Forward side | |
| else ret = countBwSide(l, c); // Backward side | |
| assert_lt(ret, this->_eh._bwtLen); | |
| #ifndef NDEBUG | |
| if(_sanity && !overrideSanity) { | |
| // Make sure results match up with results from mapLFEx; | |
| // be sure to override sanity-checking in the callee, or we'll | |
| // have infinite recursion | |
| TIndexOffU arrs[] = { 0, 0, 0, 0 }; | |
| mapLFEx(l, arrs, true); | |
| assert_eq(arrs[c], ret); | |
| } | |
| #endif | |
| return ret; | |
| } | |
| /** | |
| * Given row i and character c, return the row that the LF mapping maps | |
| * i to on character c. | |
| */ | |
| template<typename TStr> | |
| inline int Ebwt<TStr>::mapLF1(TIndexOffU& row, const SideLocus& l | |
| ASSERT_ONLY(, bool overrideSanity) | |
| ) const | |
| { | |
| #ifdef EBWT_STATS | |
| const_cast<Ebwt<TStr>*>(this)->mapLF1s_++; | |
| #endif | |
| if(row == _zOff) return -1; | |
| int c = rowL(l); | |
| assert_lt(c, 4); | |
| assert_geq(c, 0); | |
| if(l._fw) row = countFwSide(l, c); // Forward side | |
| else row = countBwSide(l, c); // Backward side | |
| assert_lt(row, this->_eh._bwtLen); | |
| #ifndef NDEBUG | |
| if(_sanity && !overrideSanity) { | |
| // Make sure results match up with results from mapLFEx; | |
| // be sure to override sanity-checking in the callee, or we'll | |
| // have infinite recursion | |
| TIndexOffU arrs[] = { 0, 0, 0, 0 }; | |
| mapLFEx(l, arrs, true); | |
| assert_eq(arrs[c], row); | |
| } | |
| #endif | |
| return c; | |
| } | |
| /** | |
| * Take an offset into the joined text and translate it into the | |
| * reference of the index it falls on, the offset into the reference, | |
| * and the length of the reference. Use a binary search through the | |
| * sorted list of reference fragment ranges t | |
| */ | |
| template<typename TStr> | |
| void Ebwt<TStr>::joinedToTextOff(TIndexOffU qlen, TIndexOffU off, | |
| TIndexOffU& tidx, | |
| TIndexOffU& textoff, | |
| TIndexOffU& tlen) const | |
| { | |
| TIndexOffU top = 0; | |
| TIndexOffU bot = _nFrag; // 1 greater than largest addressable element | |
| TIndexOffU elt = OFF_MASK; | |
| // Begin binary search | |
| while(true) { | |
| ASSERT_ONLY(TIndexOffU oldelt = elt); | |
| elt = top + ((bot - top) >> 1); | |
| assert_neq(oldelt, elt); // must have made progress | |
| TIndexOffU lower = _rstarts[elt*3]; | |
| TIndexOffU upper; | |
| if(elt == _nFrag-1) { | |
| upper = _eh._len; | |
| } else { | |
| upper = _rstarts[((elt+1)*3)]; | |
| } | |
| assert_gt(upper, lower); | |
| TIndexOffU fraglen = upper - lower; | |
| if(lower <= off) { | |
| if(upper > off) { // not last element, but it's within | |
| // off is in this range; check if it falls off | |
| if(off + qlen > upper) { | |
| // it falls off; signal no-go and return | |
| tidx = OFF_MASK; | |
| assert_lt(elt, _nFrag-1); | |
| return; | |
| } | |
| tidx = _rstarts[(elt*3)+1]; | |
| assert_lt(tidx, this->_nPat); | |
| assert_leq(fraglen, this->_plen[tidx]); | |
| // it doesn't fall off; now calculate textoff. | |
| // Initially it's the number of characters that precede | |
| // the alignment in the fragment | |
| uint32_t fragoff = off - _rstarts[(elt*3)]; | |
| if(!this->_fw) { | |
| fragoff = fraglen - fragoff - 1; | |
| fragoff -= (qlen-1); | |
| } | |
| // Add the alignment's offset into the fragment | |
| // ('fragoff') to the fragment's offset within the text | |
| textoff = fragoff + _rstarts[(elt*3)+2]; | |
| assert_lt(textoff, this->_plen[tidx]); | |
| break; // done with binary search | |
| } else { | |
| // 'off' belongs somewhere in the region between elt | |
| // and bot | |
| top = elt; | |
| } | |
| } else { | |
| // 'off' belongs somewhere in the region between top and | |
| // elt | |
| bot = elt; | |
| } | |
| // continue with binary search | |
| } | |
| tlen = this->_plen[tidx]; | |
| } | |
| /** | |
| * Report a potential match at offset 'off' with pattern length | |
| * 'qlen'. Filter out spurious matches that span texts. | |
| */ | |
| template<typename TStr> | |
| inline bool Ebwt<TStr>::report(const String<Dna5>& query, | |
| String<char>* quals, | |
| String<char>* name, | |
| bool color, | |
| char primer, | |
| char trimc, | |
| bool colExEnds, | |
| int snpPhred, | |
| const BitPairReference* ref, | |
| const std::vector<TIndexOffU>& mmui32, | |
| const std::vector<uint8_t>& refcs, | |
| size_t numMms, | |
| TIndexOffU off, | |
| TIndexOffU top, | |
| TIndexOffU bot, | |
| uint32_t qlen, | |
| int stratum, | |
| uint16_t cost, | |
| uint32_t patid, | |
| uint32_t seed, | |
| const EbwtSearchParams<TStr>& params) const | |
| { | |
| VMSG_NL("In report"); | |
| assert_geq(cost, (uint32_t)(stratum << 14)); | |
| assert_lt(off, this->_eh._len); | |
| TIndexOffU tidx; | |
| TIndexOffU textoff; | |
| TIndexOffU tlen; | |
| joinedToTextOff(qlen, off, tidx, textoff, tlen); | |
| if(tidx == OFF_MASK) { | |
| return false; | |
| } | |
| return params.reportHit( | |
| query, // read sequence | |
| quals, // read quality values | |
| name, // read name | |
| color, // true -> read is colorspace | |
| primer, | |
| trimc, | |
| colExEnds, // true -> exclude nucleotides on ends | |
| snpPhred, // phred probability of SNP | |
| ref, // reference sequence | |
| rmap_, // map to another reference coordinate system | |
| _fw, // true = index is forward; false = mirror | |
| mmui32, // mismatch positions | |
| refcs, // reference characters for mms | |
| numMms, // # mismatches | |
| make_pair(tidx, textoff), // position | |
| make_pair<TIndexOffU,TIndexOffU>(0, 0), // (bogus) mate position | |
| true, // (bogus) mate orientation | |
| 0, // (bogus) mate length | |
| make_pair(top, bot), // arrows | |
| tlen, // textlen | |
| qlen, // qlen | |
| stratum, // alignment stratum | |
| cost, // cost, including stratum & quality penalty | |
| bot-top-1, // # other hits | |
| patid, // pattern id | |
| seed, // pseudo-random seed | |
| 0); // mate (0 = unpaired) | |
| } | |
| #include "row_chaser.h" | |
| /** | |
| * Report a result. Involves walking backwards along the original | |
| * string by way of the LF-mapping until we reach a marked SA row or | |
| * the row corresponding to the 0th suffix. A marked row's offset | |
| * into the original string can be read directly from the this->_offs[] | |
| * array. | |
| */ | |
| template<typename TStr> | |
| inline bool Ebwt<TStr>::reportChaseOne(const String<Dna5>& query, | |
| String<char>* quals, | |
| String<char>* name, | |
| bool color, | |
| char primer, | |
| char trimc, | |
| bool colExEnds, | |
| int snpPhred, | |
| const BitPairReference* ref, | |
| const std::vector<TIndexOffU>& mmui32, | |
| const std::vector<uint8_t>& refcs, | |
| size_t numMms, | |
| TIndexOffU i, | |
| TIndexOffU top, | |
| TIndexOffU bot, | |
| uint32_t qlen, | |
| int stratum, | |
| uint16_t cost, | |
| uint32_t patid, | |
| uint32_t seed, | |
| const EbwtSearchParams<TStr>& params, | |
| SideLocus *l) const | |
| { | |
| VMSG_NL("In reportChaseOne"); | |
| TIndexOffU off; | |
| uint32_t jumps = 0; | |
| ASSERT_ONLY(uint32_t origi = i); | |
| SideLocus myl; | |
| const TIndexOffU offMask = this->_eh._offMask; | |
| const uint32_t offRate = this->_eh._offRate; | |
| const TIndexOffU* offs = this->_offs; | |
| // If the caller didn't give us a pre-calculated (and prefetched) | |
| // locus, then we have to do that now | |
| if(l == NULL) { | |
| l = &myl; | |
| l->initFromRow(i, this->_eh, this->_ebwt); | |
| } | |
| assert(l != NULL); | |
| assert(l->valid()); | |
| // Walk along until we reach the next marked row to the left | |
| while(((i & offMask) != i) && i != _zOff) { | |
| // Not a marked row; walk left one more char | |
| TIndexOffU newi = mapLF(*l); // calc next row | |
| assert_neq(newi, i); | |
| i = newi; // update row | |
| l->initFromRow(i, this->_eh, this->_ebwt); // update locus | |
| jumps++; | |
| } | |
| // This is a marked row | |
| if(i == _zOff) { | |
| // Special case: it's the row corresponding to the | |
| // lexicographically smallest suffix, which is implicitly | |
| // marked 0 | |
| off = jumps; | |
| VMSG_NL("reportChaseOne found zoff off=" << off << " (jumps=" << jumps << ")"); | |
| } else { | |
| // Normal marked row, calculate offset of row i | |
| off = offs[i >> offRate] + jumps; | |
| VMSG_NL("reportChaseOne found off=" << off << " (jumps=" << jumps << ")"); | |
| } | |
| #ifndef NDEBUG | |
| { | |
| uint32_t rcoff = RowChaser<TStr>::toFlatRefOff(this, qlen, origi); | |
| assert_eq(rcoff, off); | |
| } | |
| #endif | |
| return report(query, quals, name, color, primer, trimc, colExEnds, | |
| snpPhred, ref, mmui32, refcs, numMms, off, top, bot, | |
| qlen, stratum, cost, patid, seed, params); | |
| } | |
| /** | |
| * Report a result. Involves walking backwards along the original | |
| * string by way of the LF-mapping until we reach a marked SA row or | |
| * the row corresponding to the 0th suffix. A marked row's offset | |
| * into the original string can be read directly from the this->_offs[] | |
| * array. | |
| */ | |
| template<typename TStr> | |
| inline bool Ebwt<TStr>::reportReconstruct(const String<Dna5>& query, | |
| String<char>* quals, | |
| String<char>* name, | |
| String<Dna5>& lbuf, | |
| String<Dna5>& rbuf, | |
| const TIndexOffU *mmui32, | |
| const char* refcs, | |
| size_t numMms, | |
| TIndexOffU i, | |
| TIndexOffU top, | |
| TIndexOffU bot, | |
| uint32_t qlen, | |
| int stratum, | |
| const EbwtSearchParams<TStr>& params, | |
| SideLocus *l) const | |
| { | |
| VMSG_NL("In reportReconstruct"); | |
| assert_gt(_eh._isaLen, 0); // Must have inverse suffix array to reconstruct | |
| TIndexOffU off; | |
| TIndexOffU jumps = 0; | |
| SideLocus myl; | |
| const TIndexOffU offMask = this->_eh._offMask; | |
| const TIndexOffU offRate = this->_eh._offRate; | |
| const TIndexOffU* offs = this->_offs; | |
| const TIndexOffU* isa = this->_isa; | |
| assert(isa != NULL); | |
| if(l == NULL) { | |
| l = &myl; | |
| myl.initFromRow(i, this->_eh, this->_ebwt); | |
| } | |
| assert(l != NULL); | |
| clear(lbuf); clear(rbuf); | |
| // Walk along until we reach the next marked row to the left | |
| while(((i & offMask) != i) && i != _zOff) { | |
| // Not a marked row; walk left one more char | |
| int c = rowL(*l); | |
| appendValue(lbuf, (Dna5)c); | |
| TIndexOffU newi; | |
| assert_lt(c, 4); | |
| assert_geq(c, 0); | |
| if(l->_fw) newi = countFwSide(*l, c); // Forward side | |
| else newi = countBwSide(*l, c); // Backward side | |
| assert_lt(newi, this->_eh._bwtLen); | |
| assert_neq(newi, i); | |
| i = newi; // update row | |
| l->initFromRow(i, this->_eh, this->_ebwt); // update locus | |
| jumps++; | |
| } | |
| // This is a marked row | |
| if(i == _zOff) { | |
| // Special case: it's the row corresponding to the | |
| // lexicographically smallest suffix, which is implicitly | |
| // marked 0 | |
| off = jumps; | |
| VMSG_NL("reportChaseOne found zoff off=" << off << " (jumps=" << jumps << ")"); | |
| } else { | |
| // Normal marked row, calculate offset of row i | |
| off = offs[i >> offRate] + jumps; | |
| VMSG_NL("reportChaseOne found off=" << off << " (jumps=" << jumps << ")"); | |
| } | |
| // 'off' now holds the text offset of the first (leftmost) position | |
| // involved in the alignment. Next we call joinedToTextOff to | |
| // check whether the seed is valid (i.e., does not straddle a | |
| // boundary between two reference seuqences) and to obtain its | |
| // extents | |
| TIndexOffU tidx; // the index (id) of the reference we hit in | |
| TIndexOffU textoff; // the offset of the alignment within the reference | |
| TIndexOffU tlen; // length of reference seed hit in | |
| joinedToTextOff(qlen, off, tidx, textoff, tlen); | |
| if(tidx == OFF_MASK) { | |
| // The seed straddled a reference boundary, and so is spurious. | |
| // Return false, indicating that we shouldn't stop. | |
| return false; | |
| } | |
| if(jumps > textoff) { | |
| // In our progress toward a marked row, we passed the boundary | |
| // between the reference sequence containing the seed and the | |
| // reference sequence to the left of it. That's OK, we just | |
| // need to knock off the extra characters we added to 'lbuf'. | |
| assert_eq(jumps, length(lbuf)); | |
| _setLength(lbuf, textoff); | |
| jumps = textoff; | |
| assert_eq(textoff, length(lbuf)); | |
| } else if(jumps < textoff) { | |
| // Keep walking until we reach the end of the reference | |
| assert_neq(i, _zOff); | |
| TIndexOffU diff = textoff-jumps; | |
| for(size_t j = 0; j < diff; j++) { | |
| // Not a marked row; walk left one more char | |
| int c = rowL(*l); | |
| appendValue(lbuf, (Dna5)c); | |
| TIndexOffU newi; | |
| assert_lt(c, 4); | |
| assert_geq(c, 0); | |
| if(l->_fw) newi = countFwSide(*l, c); // Forward side | |
| else newi = countBwSide(*l, c); // Backward side | |
| assert_lt(newi, this->_eh._bwtLen); | |
| assert_neq(newi, i); | |
| i = newi; // update row | |
| assert_neq(i, _zOff); | |
| l->initFromRow(i, this->_eh, this->_ebwt); // update locus | |
| jumps++; | |
| } | |
| assert_eq(textoff, jumps); | |
| assert_eq(textoff, length(lbuf)); | |
| } | |
| assert_eq(textoff, jumps); | |
| assert_eq(textoff, length(lbuf)); | |
| // Calculate the right-hand extent of the reference | |
| TIndexOffU ref_right = off - textoff + tlen; | |
| // Round the right-hand extent to the nearest ISA element that maps | |
| // to it or a character to its right | |
| TIndexOffU ref_right_rounded = ref_right; | |
| if((ref_right_rounded & _eh._isaMask) != ref_right_rounded) { | |
| ref_right_rounded = ((ref_right_rounded >> _eh._isaRate)+1) << _eh._isaRate; | |
| } | |
| // TODO: handle case where ref_right_rounded is off the end of _isa | |
| // Let the current suffix-array elt be determined by the ISA | |
| if((ref_right_rounded >> _eh._isaRate) >= _eh._isaLen) { | |
| i = _eh._len; | |
| ref_right_rounded = _eh._len; | |
| } else { | |
| i = isa[ref_right_rounded >> _eh._isaRate]; | |
| } | |
| TIndexOffU right_steps_rounded = ref_right_rounded - (off + qlen); | |
| TIndexOffU right_steps = ref_right - (off + qlen); | |
| l->initFromRow(i, this->_eh, this->_ebwt); // update locus | |
| for(size_t j = 0; j < right_steps_rounded; j++) { | |
| // Not a marked row; walk left one more char | |
| int c = rowL(*l); | |
| appendValue(rbuf, (Dna5)c); | |
| TIndexOffU newi; | |
| assert_lt(c, 4); assert_geq(c, 0); | |
| if(l->_fw) newi = countFwSide(*l, c); // Forward side | |
| else newi = countBwSide(*l, c); // Backward side | |
| assert_lt(newi, this->_eh._bwtLen); | |
| assert_neq(newi, i); | |
| i = newi; // update row | |
| assert_neq(i, _zOff); | |
| l->initFromRow(i, this->_eh, this->_ebwt); // update locus | |
| jumps++; | |
| } | |
| if(right_steps_rounded > right_steps) { | |
| jumps -= (right_steps_rounded - right_steps); | |
| _setLength(rbuf, right_steps); | |
| } | |
| assert_eq(right_steps, length(rbuf)); | |
| assert_eq(tlen, jumps + qlen); | |
| ::reverseInPlace(lbuf); | |
| ::reverseInPlace(rbuf); | |
| { | |
| cout << "reportReconstruct:" << endl | |
| << " " << lbuf << query << rbuf << endl; | |
| cout << " "; | |
| for(size_t i = 0; i < length(lbuf); i++) cout << " "; | |
| cout << query << endl; | |
| } | |
| // Now we've reconstructed the | |
| return false; | |
| } | |
| /** | |
| * Transform this Ebwt into the original string in linear time by using | |
| * the LF mapping to walk backwards starting at the row correpsonding | |
| * to the end of the string. The result is written to s. The Ebwt | |
| * must be in memory. | |
| */ | |
| template<typename TStr> | |
| void Ebwt<TStr>::restore(TStr& s) const { | |
| assert(isInMemory()); | |
| resize(s, this->_eh._len, Exact()); | |
| TIndexOffU jumps = 0; | |
| TIndexOffU i = this->_eh._len; // should point to final SA elt (starting with '$') | |
| SideLocus l(i, this->_eh, this->_ebwt); | |
| while(i != _zOff) { | |
| assert_lt(jumps, this->_eh._len); | |
| //if(_verbose) cout << "restore: i: " << i << endl; | |
| // Not a marked row; go back a char in the original string | |
| TIndexOffU newi = mapLF(l); | |
| assert_neq(newi, i); | |
| s[this->_eh._len - jumps - 1] = rowL(l); | |
| i = newi; | |
| l.initFromRow(i, this->_eh, this->_ebwt); | |
| jumps++; | |
| } | |
| assert_eq(jumps, this->_eh._len); | |
| } | |
| /** | |
| * Check that this Ebwt, when restored via restore(), matches up with | |
| * the given array of reference sequences. For sanity checking. | |
| */ | |
| template <typename TStr> | |
| void Ebwt<TStr>::checkOrigs(const vector<String<Dna5> >& os, | |
| bool color, bool mirror) const | |
| { | |
| TStr rest; | |
| restore(rest); | |
| TIndexOffU restOff = 0; | |
| size_t i = 0, j = 0; | |
| if(mirror) { | |
| // TODO: FIXME | |
| return; | |
| } | |
| while(i < os.size()) { | |
| size_t olen = length(os[i]); | |
| int lastorig = -1; | |
| for(; j < olen; j++) { | |
| size_t joff = j; | |
| if(mirror) joff = olen - j - 1; | |
| if((int)os[i][joff] == 4) { | |
| // Skip over Ns | |
| lastorig = -1; | |
| if(!mirror) { | |
| while(j < olen && (int)os[i][j] == 4) j++; | |
| } else { | |
| while(j < olen && (int)os[i][olen-j-1] == 4) j++; | |
| } | |
| j--; | |
| continue; | |
| } | |
| if(lastorig == -1 && color) { | |
| lastorig = os[i][joff]; | |
| continue; | |
| } | |
| if(color) { | |
| assert_neq(-1, lastorig); | |
| assert_eq(dinuc2color[(int)os[i][joff]][lastorig], rest[restOff]); | |
| } else { | |
| assert_eq(os[i][joff], rest[restOff]); | |
| } | |
| lastorig = (int)os[i][joff]; | |
| restOff++; | |
| } | |
| if(j == length(os[i])) { | |
| // Moved to next sequence | |
| i++; | |
| j = 0; | |
| } else { | |
| // Just jumped over a gap | |
| } | |
| } | |
| } | |
| /////////////////////////////////////////////////////////////////////// | |
| // | |
| // Functions for reading and writing Ebwts | |
| // | |
| /////////////////////////////////////////////////////////////////////// | |
| /** | |
| * Read an Ebwt from file with given filename. | |
| */ | |
| template<typename TStr> | |
| void Ebwt<TStr>::readIntoMemory( | |
| int color, | |
| int needEntireRev, | |
| bool justHeader, | |
| EbwtParams *params, | |
| bool mmSweep, | |
| bool loadNames, | |
| bool startVerbose) | |
| { | |
| bool switchEndian; // dummy; caller doesn't care | |
| #ifdef BOWTIE_MM | |
| char *mmFile[] = { NULL, NULL }; | |
| #endif | |
| if(_in1Str.length() > 0) { | |
| if(_verbose || startVerbose) { | |
| cerr << " About to open input files: "; | |
| logTime(cerr); | |
| } | |
| // Initialize our primary and secondary input-stream fields | |
| if(_in1 != NULL) fclose(_in1); | |
| if(_verbose || startVerbose) cerr << "Opening \"" << _in1Str << "\"" << endl; | |
| if((_in1 = fopen(_in1Str.c_str(), "rb")) == NULL) { | |
| cerr << "Could not open index file " << _in1Str << endl; | |
| } | |
| if(_in2 != NULL) fclose(_in2); | |
| if(_verbose || startVerbose) cerr << "Opening \"" << _in2Str << "\"" << endl; | |
| if((_in2 = fopen(_in2Str.c_str(), "rb")) == NULL) { | |
| cerr << "Could not open index file " << _in2Str << endl; | |
| } | |
| if(_verbose || startVerbose) { | |
| cerr << " Finished opening input files: "; | |
| logTime(cerr); | |
| } | |
| #ifdef BOWTIE_MM | |
| if(_useMm /*&& !justHeader*/) { | |
| const char *names[] = {_in1Str.c_str(), _in2Str.c_str()}; | |
| int fds[] = { fileno(_in1), fileno(_in2) }; | |
| for(int i = 0; i < 2; i++) { | |
| if(_verbose || startVerbose) { | |
| cerr << " Memory-mapping input file " << (i+1) << ": "; | |
| logTime(cerr); | |
| } | |
| struct stat sbuf; | |
| if (stat(names[i], &sbuf) == -1) { | |
| perror("stat"); | |
| cerr << "Error: Could not stat index file " << names[i] << " prior to memory-mapping" << endl; | |
| throw 1; | |
| } | |
| mmFile[i] = (char*)mmap((void *)0, sbuf.st_size, | |
| PROT_READ, MAP_SHARED, fds[i], 0); | |
| if(mmFile[i] == (void *)(-1)) { | |
| perror("mmap"); | |
| cerr << "Error: Could not memory-map the index file " << names[i] << endl; | |
| throw 1; | |
| } | |
| if(mmSweep) { | |
| int sum = 0; | |
| for(off_t j = 0; j < sbuf.st_size; j += 1024) { | |
| sum += (int) mmFile[i][j]; | |
| } | |
| if(startVerbose) { | |
| cerr << " Swept the memory-mapped ebwt index file 1; checksum: " << sum << ": "; | |
| logTime(cerr); | |
| } | |
| } | |
| } | |
| mmFile1_ = mmFile[0]; | |
| mmFile2_ = mmFile[1]; | |
| } | |
| #endif | |
| } | |
| #ifdef BOWTIE_MM | |
| else if(_useMm && !justHeader) { | |
| mmFile[0] = mmFile1_; | |
| mmFile[1] = mmFile2_; | |
| } | |
| if(_useMm && !justHeader) { | |
| assert(mmFile[0] == mmFile1_); | |
| assert(mmFile[1] == mmFile2_); | |
| } | |
| #endif | |
| if(_verbose || startVerbose) { | |
| cerr << " Reading header: "; | |
| logTime(cerr); | |
| } | |
| // Read endianness hints from both streams | |
| uint64_t bytesRead = 0; | |
| switchEndian = false; | |
| uint32_t one = readU<uint32_t>(_in1, switchEndian); // 1st word of primary stream | |
| bytesRead += 4; | |
| #ifndef NDEBUG | |
| assert_eq(one, readU<uint32_t>(_in2, switchEndian)); // should match! | |
| #else | |
| readU<uint32_t>(_in2, switchEndian); | |
| #endif | |
| if(one != 1) { | |
| assert_eq((1u<<24), one); | |
| assert_eq(1, endianSwapU32(one)); | |
| switchEndian = true; | |
| } | |
| // Can't switch endianness and use memory-mapped files; in order to | |
| // support this, someone has to modify the file to switch | |
| // endiannesses appropriately, and we can't do this inside Bowtie | |
| // or we might be setting up a race condition with other processes. | |
| if(switchEndian && _useMm) { | |
| cerr << "Error: Can't use memory-mapped files when the index is the opposite endianness" << endl; | |
| throw 1; | |
| } | |
| // Reads header entries one by one from primary stream | |
| TIndexOffU len = readU<TIndexOffU>(_in1, switchEndian); | |
| bytesRead += OFF_SIZE; | |
| int32_t lineRate = readI<int32_t>(_in1, switchEndian); | |
| bytesRead += 4; | |
| int32_t linesPerSide = readI<int32_t>(_in1, switchEndian); | |
| bytesRead += 4; | |
| int32_t offRate = readI<int32_t>(_in1, switchEndian); | |
| bytesRead += 4; | |
| // TODO: add isaRate to the actual file format (right now, the | |
| // user has to tell us whether there's an ISA sample and what the | |
| // sampling rate is. | |
| int32_t isaRate = _overrideIsaRate; | |
| int32_t ftabChars = readI<int32_t>(_in1, switchEndian); | |
| bytesRead += 4; | |
| // chunkRate was deprecated in an earlier version of Bowtie; now | |
| // we use it to hold flags. | |
| int32_t flags = readI<int32_t>(_in1, switchEndian); | |
| bool entireRev = false; | |
| if(flags < 0 && (((-flags) & EBWT_COLOR) != 0)) { | |
| if(color != -1 && !color) { | |
| cerr << "Error: -C was not specified when running bowtie, but index is in colorspace. If" << endl | |
| << "your reads are in colorspace, please use the -C option. If your reads are not" << endl | |
| << "in colorspace, please use a normal index (one built without specifying -C to" << endl | |
| << "bowtie-build)." << endl; | |
| throw 1; | |
| } | |
| color = 1; | |
| } else if(flags < 0) { | |
| if(color != -1 && color) { | |
| cerr << "Error: -C was specified when running bowtie, but index is not in colorspace. If" << endl | |
| << "your reads are in colorspace, please use a colorspace index (one built using" << endl | |
| << "bowtie-build -C). If your reads are not in colorspace, don't specify -C when" << endl | |
| << "running bowtie." << endl; | |
| throw 1; | |
| } | |
| color = 0; | |
| } | |
| if(flags < 0 && (((-flags) & EBWT_ENTIRE_REV) == 0)) { | |
| if(needEntireRev != -1 && needEntireRev != 0) { | |
| cerr << "Error: This index is not compatible with this version of bowtie. Please use a" << endl | |
| << "current version of bowtie-build." << endl; | |
| throw 1; | |
| } | |
| } else entireRev = true; | |
| bytesRead += 4; | |
| // Create a new EbwtParams from the entries read from primary stream | |
| EbwtParams *eh; | |
| bool deleteEh = false; | |
| if(params != NULL) { | |
| params->init(len, lineRate, linesPerSide, offRate, isaRate, ftabChars, color, entireRev); | |
| if(_verbose || startVerbose) params->print(cerr); | |
| eh = params; | |
| } else { | |
| eh = new EbwtParams(len, lineRate, linesPerSide, offRate, isaRate, ftabChars, color, entireRev); | |
| deleteEh = true; | |
| } | |
| // Set up overridden suffix-array-sample parameters | |
| TIndexOffU offsLen = eh->_offsLen; | |
| uint64_t offsSz = eh->_offsSz; | |
| TIndexOffU offRateDiff = 0; | |
| TIndexOffU offsLenSampled = offsLen; | |
| if(_overrideOffRate > offRate) { | |
| offRateDiff = _overrideOffRate - offRate; | |
| } | |
| if(offRateDiff > 0) { | |
| offsLenSampled >>= offRateDiff; | |
| if((offsLen & ~(OFF_MASK << offRateDiff)) != 0) { | |
| offsLenSampled++; | |
| } | |
| } | |
| // Set up overridden inverted-suffix-array-sample parameters | |
| TIndexOffU isaLen = eh->_isaLen; | |
| TIndexOffU isaRateDiff = 0; | |
| TIndexOffU isaLenSampled = isaLen; | |
| if(_overrideIsaRate > isaRate) { | |
| isaRateDiff = _overrideIsaRate - isaRate; | |
| } | |
| if(isaRateDiff > 0) { | |
| isaLenSampled >>= isaRateDiff; | |
| if((isaLen & ~(OFF_MASK << isaRateDiff)) != 0) { | |
| isaLenSampled++; | |
| } | |
| } | |
| // Can't override the offrate or isarate and use memory-mapped | |
| // files; ultimately, all processes need to copy the sparser sample | |
| // into their own memory spaces. | |
| if(_useMm && (offRateDiff || isaRateDiff)) { | |
| cerr << "Error: Can't use memory-mapped files when the offrate or isarate is overridden" << endl; | |
| throw 1; | |
| } | |
| // Read nPat from primary stream | |
| this->_nPat = readI<TIndexOffU>(_in1, switchEndian); | |
| bytesRead += OFF_SIZE; | |
| if(this->_plen != NULL && !_useMm) { | |
| // Delete it so that we can re-read it | |
| delete[] this->_plen; | |
| this->_plen = NULL; | |
| } | |
| // Read plen from primary stream | |
| if(_useMm) { | |
| #ifdef BOWTIE_MM | |
| this->_plen = (TIndexOffU*)(mmFile[0] + bytesRead); | |
| bytesRead += this->_nPat*OFF_SIZE; | |
| fseeko(_in1, this->_nPat*OFF_SIZE, SEEK_CUR); | |
| #endif | |
| } else { | |
| try { | |
| if(_verbose || startVerbose) { | |
| cerr << "Reading plen (" << this->_nPat << "): "; | |
| logTime(cerr); | |
| } | |
| this->_plen = new TIndexOffU[this->_nPat]; | |
| if(switchEndian) { | |
| for(TIndexOffU i = 0; i < this->_nPat; i++) { | |
| this->_plen[i] = readU<TIndexOffU>(_in1, switchEndian); | |
| } | |
| } else { | |
| size_t r = MM_READ(_in1, (void*)this->_plen, this->_nPat*OFF_SIZE); | |
| if(r != (size_t)(this->_nPat*OFF_SIZE)) { | |
| cerr << "Error reading _plen[] array: " << r << ", " << (this->_nPat*OFF_SIZE) << endl; | |
| throw 1; | |
| } | |
| } | |
| } catch(bad_alloc& e) { | |
| cerr << "Out of memory allocating plen[] in Ebwt::read()" | |
| << " at " << __FILE__ << ":" << __LINE__ << endl; | |
| throw e; | |
| } | |
| } | |
| bool shmemLeader; | |
| // TODO: I'm not consistent on what "header" means. Here I'm using | |
| // "header" to mean everything that would exist in memory if we | |
| // started to build the Ebwt but stopped short of the build*() step | |
| // (i.e. everything up to and including join()). | |
| if(justHeader) goto done; | |
| this->_nFrag = readU<TIndexOffU>(_in1, switchEndian); | |
| bytesRead += OFF_SIZE; | |
| if(_verbose || startVerbose) { | |
| cerr << "Reading rstarts (" << this->_nFrag*3 << "): "; | |
| logTime(cerr); | |
| } | |
| assert_geq(this->_nFrag, this->_nPat); | |
| if(_useMm) { | |
| #ifdef BOWTIE_MM | |
| this->_rstarts = (TIndexOffU*)(mmFile[0] + bytesRead); | |
| bytesRead += this->_nFrag*OFF_SIZE*3; | |
| fseeko(_in1, this->_nFrag*OFF_SIZE*3, SEEK_CUR); | |
| #endif | |
| } else { | |
| this->_rstarts = new TIndexOffU[this->_nFrag*3]; | |
| if(switchEndian) { | |
| for(TIndexOffU i = 0; i < this->_nFrag*3; i += 3) { | |
| // fragment starting position in joined reference | |
| // string, text id, and fragment offset within text | |
| this->_rstarts[i] = readU<TIndexOffU>(_in1, switchEndian); | |
| this->_rstarts[i+1] = readU<TIndexOffU>(_in1, switchEndian); | |
| this->_rstarts[i+2] = readU<TIndexOffU>(_in1, switchEndian); | |
| } | |
| } else { | |
| size_t r = MM_READ(_in1, (void *)this->_rstarts, this->_nFrag*OFF_SIZE*3); | |
| if(r != (size_t)(this->_nFrag*OFF_SIZE*3)) { | |
| cerr << "Error reading _rstarts[] array: " << r << ", " << (this->_nFrag*OFF_SIZE*3) << endl; | |
| throw 1; | |
| } | |
| } | |
| } | |
| if(_useMm) { | |
| #ifdef BOWTIE_MM | |
| this->_ebwt = (uint8_t*)(mmFile[0] + bytesRead); | |
| bytesRead += eh->_ebwtTotLen; | |
| fseeko(_in1, eh->_ebwtTotLen, SEEK_CUR); | |
| #endif | |
| } else { | |
| // Allocate ebwt (big allocation) | |
| if(_verbose || startVerbose) { | |
| cerr << "Reading ebwt (" << eh->_ebwtTotLen << "): "; | |
| logTime(cerr); | |
| } | |
| bool shmemLeader = true; | |
| if(useShmem_) { | |
| shmemLeader = ALLOC_SHARED_U8( | |
| (_in1Str + "[ebwt]"), eh->_ebwtTotLen, &this->_ebwt, | |
| "ebwt[]", (_verbose || startVerbose)); | |
| if(_verbose || startVerbose) { | |
| cerr << " shared-mem " << (shmemLeader ? "leader" : "follower") << endl; | |
| } | |
| } else { | |
| try { | |
| this->_ebwt = new uint8_t[eh->_ebwtTotLen]; | |
| } catch(bad_alloc& e) { | |
| cerr << "Out of memory allocating the ebwt[] array for the Bowtie index. Please try" << endl | |
| << "again on a computer with more memory." << endl; | |
| throw 1; | |
| } | |
| } | |
| if(shmemLeader) { | |
| // Read ebwt from primary stream | |
| uint64_t bytesLeft = eh->_ebwtTotLen; | |
| char *pebwt = (char*)this->ebwt(); | |
| while (bytesLeft>0){ | |
| size_t r = MM_READ(_in1, (void *)pebwt, bytesLeft); | |
| if(MM_IS_IO_ERR(_in1,r,bytesLeft)) { | |
| cerr << "Error reading ebwt array: returned " << r << ", length was " << (eh->_ebwtTotLen) << endl | |
| << "Your index files may be corrupt; please try re-building or re-downloading." << endl | |
| << "A complete index consists of 6 files: XYZ.1.ebwt, XYZ.2.ebwt, XYZ.3.ebwt," << endl | |
| << "XYZ.4.ebwt, XYZ.rev.1.ebwt, and XYZ.rev.2.ebwt. The XYZ.1.ebwt and " << endl | |
| << "XYZ.rev.1.ebwt files should have the same size, as should the XYZ.2.ebwt and" << endl | |
| << "XYZ.rev.2.ebwt files." << endl; | |
| throw 1; | |
| } | |
| pebwt += r; | |
| bytesLeft -= r; | |
| } | |
| if(switchEndian) { | |
| uint8_t *side = this->_ebwt; | |
| for(size_t i = 0; i < eh->_numSides; i++) { | |
| TIndexOffU *cums = reinterpret_cast<TIndexOffU*>(side + eh->_sideSz - 2*OFF_SIZE); | |
| cums[0] = endianSwapU(cums[0]); | |
| cums[1] = endianSwapU(cums[1]); | |
| side += this->_eh._sideSz; | |
| } | |
| } | |
| if(useShmem_) NOTIFY_SHARED(this->_ebwt, eh->_ebwtTotLen); | |
| } else { | |
| // Seek past the data and wait until master is finished | |
| fseeko(_in1, eh->_ebwtTotLen, SEEK_CUR); | |
| if(useShmem_) WAIT_SHARED(this->_ebwt, eh->_ebwtTotLen); | |
| } | |
| } | |
| // Read zOff from primary stream | |
| _zOff = readU<TIndexOffU>(_in1, switchEndian); | |
| bytesRead += OFF_SIZE; | |
| assert_lt(_zOff, len); | |
| try { | |
| // Read fchr from primary stream | |
| if(_verbose || startVerbose) cerr << "Reading fchr (5)" << endl; | |
| if(_useMm) { | |
| #ifdef BOWTIE_MM | |
| this->_fchr = (TIndexOffU*)(mmFile[0] + bytesRead); | |
| bytesRead += 5*OFF_SIZE; | |
| fseeko(_in1, 5*OFF_SIZE, SEEK_CUR); | |
| #endif | |
| } else { | |
| this->_fchr = new TIndexOffU[5]; | |
| for(int i = 0; i < 5; i++) { | |
| this->_fchr[i] = readU<TIndexOffU>(_in1, switchEndian); | |
| assert_leq(this->_fchr[i], len); | |
| if(i > 0) assert_geq(this->_fchr[i], this->_fchr[i-1]); | |
| } | |
| } | |
| assert_gt(this->_fchr[4], this->_fchr[0]); | |
| // Read ftab from primary stream | |
| if(_verbose || startVerbose) { | |
| cerr << "Reading ftab (" << eh->_ftabLen << "): "; | |
| logTime(cerr); | |
| } | |
| if(_useMm) { | |
| #ifdef BOWTIE_MM | |
| this->_ftab = (TIndexOffU*)(mmFile[0] + bytesRead); | |
| bytesRead += eh->_ftabLen*OFF_SIZE; | |
| fseeko(_in1, eh->_ftabLen*OFF_SIZE, SEEK_CUR); | |
| #endif | |
| } else { | |
| this->_ftab = new TIndexOffU[eh->_ftabLen]; | |
| if(switchEndian) { | |
| for(TIndexOffU i = 0; i < eh->_ftabLen; i++) | |
| this->_ftab[i] = readU<TIndexOffU>(_in1, switchEndian); | |
| } else { | |
| size_t r = MM_READ(_in1, (void *)this->_ftab, eh->_ftabLen*OFF_SIZE); | |
| if(r != (size_t)(eh->_ftabLen*OFF_SIZE)) { | |
| cerr << "Error reading _ftab[] array: " << r << ", " << (eh->_ftabLen*OFF_SIZE) << endl; | |
| throw 1; | |
| } | |
| } | |
| } | |
| // Read etab from primary stream | |
| if(_verbose || startVerbose) { | |
| cerr << "Reading eftab (" << eh->_eftabLen << "): "; | |
| logTime(cerr); | |
| } | |
| if(_useMm) { | |
| #ifdef BOWTIE_MM | |
| this->_eftab = (TIndexOffU*)(mmFile[0] + bytesRead); | |
| bytesRead += eh->_eftabLen*OFF_SIZE; | |
| fseeko(_in1, eh->_eftabLen*OFF_SIZE, SEEK_CUR); | |
| #endif | |
| } else { | |
| this->_eftab = new TIndexOffU[eh->_eftabLen]; | |
| if(switchEndian) { | |
| for(TIndexOffU i = 0; i < eh->_eftabLen; i++) | |
| this->_eftab[i] = readU<TIndexOffU>(_in1, switchEndian); | |
| } else { | |
| size_t r = MM_READ(_in1, (void *)this->_eftab, eh->_eftabLen*OFF_SIZE); | |
| if(r != (size_t)(eh->_eftabLen*OFF_SIZE)) { | |
| cerr << "Error reading _eftab[] array: " << r << ", " << (eh->_eftabLen*OFF_SIZE) << endl; | |
| throw 1; | |
| } | |
| } | |
| } | |
| for(TIndexOffU i = 0; i < eh->_eftabLen; i++) { | |
| if(i > 0 && this->_eftab[i] > 0) { | |
| assert_geq(this->_eftab[i], this->_eftab[i-1]); | |
| } else if(i > 0 && this->_eftab[i-1] == 0) { | |
| assert_eq(0, this->_eftab[i]); | |
| } | |
| } | |
| } catch(bad_alloc& e) { | |
| cerr << "Out of memory allocating fchr[], ftab[] or eftab[] arrays for the Bowtie index." << endl | |
| << "Please try again on a computer with more memory." << endl; | |
| throw 1; | |
| } | |
| // Read reference sequence names from primary index file (or not, | |
| // if --refidx is specified) | |
| if(loadNames) { | |
| while(true) { | |
| char c = '\0'; | |
| if(MM_READ(_in1, (void *)(&c), (size_t)1) != (size_t)1) break; | |
| bytesRead++; | |
| if(c == '\0') break; | |
| else if(c == '\n') { | |
| this->_refnames.push_back(""); | |
| } else { | |
| if(this->_refnames.size() == 0) { | |
| this->_refnames.push_back(""); | |
| } | |
| this->_refnames.back().push_back(c); | |
| } | |
| } | |
| } | |
| bytesRead = 4; // reset for secondary index file (already read 1-sentinel) | |
| shmemLeader = true; | |
| if(_verbose || startVerbose) { | |
| cerr << "Reading offs (" << offsLenSampled << " 32-bit words): "; | |
| logTime(cerr); | |
| } | |
| if(!_useMm) { | |
| if(!useShmem_) { | |
| // Allocate offs_ | |
| try { | |
| this->_offs = new TIndexOffU[offsLenSampled]; | |
| } catch(bad_alloc& e) { | |
| cerr << "Out of memory allocating the offs[] array for the Bowtie index." << endl | |
| << "Please try again on a computer with more memory." << endl; | |
| throw 1; | |
| } | |
| } else { | |
| shmemLeader = ALLOC_SHARED_U( | |
| (_in2Str + "[offs]"), offsLenSampled*OFF_SIZE, &this->_offs, | |
| "offs", (_verbose || startVerbose)); | |
| } | |
| } | |
| if(_overrideOffRate < 32) { | |
| if(shmemLeader) { | |
| // Allocate offs (big allocation) | |
| if(switchEndian || offRateDiff > 0) { | |
| assert(!_useMm); | |
| const TIndexOffU blockMaxSz = (2 * 1024 * 1024); // 2 MB block size | |
| const TIndexOffU blockMaxSzU = (blockMaxSz >> (OFF_SIZE/4 +1)); // # U32s per block | |
| char *buf = new char[blockMaxSz]; | |
| for(TIndexOffU i = 0; i < offsLen; i += blockMaxSzU) { | |
| TIndexOffU block = min<TIndexOffU>(blockMaxSzU, offsLen - i); | |
| size_t r = MM_READ(_in2, (void *)buf, block << (OFF_SIZE/4 + 1)); | |
| if(r != (size_t)(block << (OFF_SIZE/4 + 1))) { | |
| cerr << "Error reading block of offs array: " << r << ", " << (block << (OFF_SIZE/4 + 1)) << endl | |
| << "Your index files may be corrupt; please try re-building or re-downloading." << endl | |
| << "A complete index consists of 6 files: XYZ.1.ebwt, XYZ.2.ebwt, XYZ.3.ebwt," << endl | |
| << "XYZ.4.ebwt, XYZ.rev.1.ebwt, and XYZ.rev.2.ebwt. The XYZ.1.ebwt and " << endl | |
| << "XYZ.rev.1.ebwt files should have the same size, as should the XYZ.2.ebwt and" << endl | |
| << "XYZ.rev.2.ebwt files." << endl; | |
| throw 1; | |
| } | |
| TIndexOffU idx = i >> offRateDiff; | |
| for(TIndexOffU j = 0; j < block; j += (1 << offRateDiff)) { | |
| assert_lt(idx, offsLenSampled); | |
| this->_offs[idx] = ((TIndexOffU*)buf)[j]; | |
| if(switchEndian) { | |
| this->_offs[idx] = endianSwapU(this->_offs[idx]); | |
| } | |
| idx++; | |
| } | |
| } | |
| delete[] buf; | |
| } else { | |
| if(_useMm) { | |
| #ifdef BOWTIE_MM | |
| this->_offs = (TIndexOffU*)(mmFile[1] + bytesRead); | |
| bytesRead += offsSz; | |
| // Argument to lseek can be 64 bits if compiled with | |
| // _FILE_OFFSET_BITS | |
| fseeko(_in2, offsSz, SEEK_CUR); | |
| #endif | |
| } else { | |
| // If any of the high two bits are set | |
| // Workaround for small-index mode where MM_READ may | |
| // not be able to handle read amounts greater than 2^32 | |
| // bytes. | |
| uint64_t bytesLeft = offsSz; | |
| char *offs = (char *)this->offs(); | |
| while(bytesLeft > 0) { | |
| size_t r = MM_READ(_in2, (void*)offs, bytesLeft); | |
| if(MM_IS_IO_ERR(_in2,r,bytesLeft)) { | |
| cerr << "Error reading block of _offs[] array: " | |
| << r << ", " << bytesLeft << gLastIOErrMsg << endl; | |
| throw 1; | |
| } | |
| offs += r; | |
| bytesLeft -= r; | |
| } | |
| } | |
| } | |
| { | |
| ASSERT_ONLY(Bitset offsSeen(len+1)); | |
| for(TIndexOffU i = 0; i < offsLenSampled; i++) { | |
| assert(!offsSeen.test(this->_offs[i])); | |
| ASSERT_ONLY(offsSeen.set(this->_offs[i])); | |
| assert_leq(this->_offs[i], len); | |
| } | |
| } | |
| if(useShmem_) NOTIFY_SHARED(this->_offs, offsLenSampled*OFF_SIZE); | |
| } else { | |
| // Not the shmem leader | |
| fseeko(_in2, offsLenSampled*OFF_SIZE, SEEK_CUR); | |
| if(useShmem_) WAIT_SHARED(this->_offs, offsLenSampled*OFF_SIZE); | |
| } | |
| } | |
| // Allocate _isa[] (big allocation) | |
| if(_verbose || startVerbose) { | |
| cerr << "Reading isa (" << isaLenSampled << "): "; | |
| logTime(cerr); | |
| } | |
| if(!_useMm) { | |
| try { | |
| this->_isa = new TIndexOffU[isaLenSampled]; | |
| } catch(bad_alloc& e) { | |
| cerr << "Out of memory allocating the isa[] array for the Bowtie index." << endl | |
| << "Please try again on a computer with more memory." << endl; | |
| throw 1; | |
| } | |
| } | |
| // Read _isa[] | |
| if(switchEndian || isaRateDiff > 0) { | |
| assert(!_useMm); | |
| for(TIndexOffU i = 0; i < isaLen; i++) { | |
| if((i & ~(OFF_MASK << isaRateDiff)) != 0) { | |
| char tmp[OFF_SIZE]; | |
| size_t r = MM_READ(_in2, (void *)tmp, OFF_SIZE); | |
| if(r != (size_t)OFF_SIZE) { | |
| cerr << "Error reading a word of the _isa[] array: " << r << ", 4" << endl; | |
| throw 1; | |
| } | |
| } else { | |
| TIndexOffU idx = i >> isaRateDiff; | |
| assert_lt(idx, isaLenSampled); | |
| this->_isa[idx] = readU<TIndexOffU>(_in2, switchEndian); | |
| } | |
| } | |
| } else { | |
| if(_useMm) { | |
| #ifdef BOWTIE_MM | |
| this->_isa = (TIndexOffU*)(mmFile[1] + bytesRead); | |
| bytesRead += (isaLen << 2); | |
| fseeko(_in2, (isaLen << 2), SEEK_CUR); | |
| #endif | |
| } else { | |
| size_t r = MM_READ(_in2, (void *)this->_isa, isaLen*OFF_SIZE); | |
| if(r != (size_t)(isaLen*OFF_SIZE)) { | |
| cerr << "Error reading _isa[] array: " << r << ", " << (isaLen*OFF_SIZE) << endl; | |
| throw 1; | |
| } | |
| } | |
| } | |
| { | |
| ASSERT_ONLY(Bitset isasSeen(len+1)); | |
| for(TIndexOffU i = 0; i < isaLenSampled; i++) { | |
| assert(!isasSeen.test(this->_isa[i])); | |
| ASSERT_ONLY(isasSeen.set(this->_isa[i])); | |
| assert_leq(this->_isa[i], len); | |
| } | |
| } | |
| this->postReadInit(*eh); // Initialize fields of Ebwt not read from file | |
| if(_verbose || startVerbose) print(cerr, *eh); | |
| // The fact that _ebwt and friends actually point to something | |
| // (other than NULL) now signals to other member functions that the | |
| // Ebwt is loaded into memory. | |
| done: // Exit hatch for both justHeader and !justHeader | |
| // Be kind | |
| if(deleteEh) delete eh; | |
| if (_in1 != NULL) rewind(_in1); | |
| if (_in2 != NULL) rewind(_in2); | |
| } | |
| /** | |
| * Read reference names from an input stream 'in' for an Ebwt primary | |
| * file and store them in 'refnames'. | |
| */ | |
| static inline void | |
| readEbwtRefnames(FILE* fin, vector<string>& refnames) { | |
| // _in1 must already be open with the get cursor at the | |
| // beginning and no error flags set. | |
| assert(fin != NULL); | |
| assert_eq(ftello(fin), 0); | |
| // Read endianness hints from both streams | |
| bool switchEndian = false; | |
| uint32_t one = readU<uint32_t>(fin, switchEndian); // 1st word of primary stream | |
| if(one != 1) { | |
| assert_eq((1u<<24), one); | |
| switchEndian = true; | |
| } | |
| // Reads header entries one by one from primary stream | |
| TIndexOffU len = readU<TIndexOffU>(fin, switchEndian); | |
| int32_t lineRate = readI<int32_t>(fin, switchEndian); | |
| int32_t linesPerSide = readI<int32_t>(fin, switchEndian); | |
| int32_t offRate = readI<int32_t>(fin, switchEndian); | |
| int32_t ftabChars = readI<int32_t>(fin, switchEndian); | |
| // BTL: chunkRate is now deprecated | |
| int32_t flags = readI<int32_t>(fin, switchEndian); | |
| bool color = false; | |
| bool entireReverse = false; | |
| if(flags < 0) { | |
| color = (((-flags) & EBWT_COLOR) != 0); | |
| entireReverse = (((-flags) & EBWT_ENTIRE_REV) != 0); | |
| } | |
| // Create a new EbwtParams from the entries read from primary stream | |
| EbwtParams eh(len, lineRate, linesPerSide, offRate, -1, ftabChars, color, entireReverse); | |
| TIndexOffU nPat = readI<TIndexOffU>(fin, switchEndian); // nPat | |
| fseeko(fin, nPat*OFF_SIZE, SEEK_CUR); | |
| // Skip rstarts | |
| TIndexOffU nFrag = readU<TIndexOffU>(fin, switchEndian); | |
| fseeko(fin, nFrag*OFF_SIZE*3, SEEK_CUR); | |
| // Skip ebwt | |
| fseeko(fin, eh._ebwtTotLen, SEEK_CUR); | |
| // Skip zOff from primary stream | |
| readU<TIndexOffU>(fin, switchEndian); | |
| // Skip fchr | |
| fseeko(fin, 5 * OFF_SIZE, SEEK_CUR); | |
| // Skip ftab | |
| fseeko(fin, eh._ftabLen*OFF_SIZE, SEEK_CUR); | |
| // Skip eftab | |
| fseeko(fin, eh._eftabLen*OFF_SIZE, SEEK_CUR); | |
| // Read reference sequence names from primary index file | |
| while(true) { | |
| char c = '\0'; | |
| int read_value = 0; | |
| read_value = fgetc(fin); | |
| if(read_value == EOF) break; | |
| c = read_value; | |
| if(c == '\0') break; | |
| else if(c == '\n') { | |
| refnames.push_back(""); | |
| } else { | |
| if(refnames.size() == 0) { | |
| refnames.push_back(""); | |
| } | |
| refnames.back().push_back(c); | |
| } | |
| } | |
| if(refnames.back().empty()) { | |
| refnames.pop_back(); | |
| } | |
| // Be kind | |
| fseeko(fin, 0, SEEK_SET); | |
| assert(ferror(fin) == 0); | |
| } | |
| /** | |
| * Read reference names from the index with basename 'in' and store | |
| * them in 'refnames'. | |
| */ | |
| static inline void | |
| readEbwtRefnames(const string& instr, vector<string>& refnames) { | |
| FILE* fin; | |
| // Initialize our primary and secondary input-stream fields | |
| fin = fopen((instr + ".1." + gEbwt_ext).c_str(),"rb"); | |
| if(fin == NULL) { | |
| throw EbwtFileOpenException("Cannot open file " + instr); | |
| } | |
| assert_eq(ftello(fin), 0); | |
| readEbwtRefnames(fin, refnames); | |
| fclose(fin); | |
| } | |
| /** | |
| * Read just enough of the Ebwt's header to get its flags | |
| */ | |
| static inline int32_t readFlags(const string& instr) { | |
| ifstream in; | |
| // Initialize our primary and secondary input-stream fields | |
| in.open((instr + ".1." + gEbwt_ext).c_str(), ios_base::in | ios::binary); | |
| if(!in.is_open()) { | |
| throw EbwtFileOpenException("Cannot open file " + instr); | |
| } | |
| assert(in.is_open()); | |
| assert(in.good()); | |
| bool switchEndian = false; | |
| uint32_t one = readU<uint32_t>(in, switchEndian); // 1st word of primary stream | |
| if(one != 1) { | |
| assert_eq((1u<<24), one); | |
| assert_eq(1, endianSwapU32(one)); | |
| switchEndian = true; | |
| } | |
| readU<TIndexOffU>(in, switchEndian); | |
| readI<int32_t>(in, switchEndian); | |
| readI<int32_t>(in, switchEndian); | |
| readI<int32_t>(in, switchEndian); | |
| readI<int32_t>(in, switchEndian); | |
| int32_t flags = readI<int32_t>(in, switchEndian); | |
| return flags; | |
| } | |
| /** | |
| * Read just enough of the Ebwt's header to determine whether it's | |
| * colorspace. | |
| */ | |
| static inline bool | |
| readEbwtColor(const string& instr) { | |
| int32_t flags = readFlags(instr); | |
| if(flags < 0 && (((-flags) & EBWT_COLOR) != 0)) { | |
| return true; | |
| } else { | |
| return false; | |
| } | |
| } | |
| /** | |
| * Read just enough of the Ebwt's header to determine whether it's | |
| * entirely reversed. | |
| */ | |
| static inline bool | |
| readEntireReverse(const string& instr) { | |
| int32_t flags = readFlags(instr); | |
| if(flags < 0 && (((-flags) & EBWT_ENTIRE_REV) != 0)) { | |
| return true; | |
| } else { | |
| return false; | |
| } | |
| } | |
| /** | |
| * Write an extended Burrows-Wheeler transform to a pair of output | |
| * streams. | |
| * | |
| * @param out1 output stream to primary file | |
| * @param out2 output stream to secondary file | |
| * @param be write in big endian? | |
| */ | |
| template<typename TStr> | |
| void Ebwt<TStr>::writeFromMemory(bool justHeader, | |
| ostream& out1, | |
| ostream& out2) const | |
| { | |
| const EbwtParams& eh = this->_eh; | |
| assert(eh.repOk()); | |
| uint32_t be = this->toBe(); | |
| assert(out1.good()); | |
| assert(out2.good()); | |
| // When building an Ebwt, these header parameters are known | |
| // "up-front", i.e., they can be written to disk immediately, | |
| // before we join() or buildToDisk() | |
| writeI<int32_t>(out1, 1, be); // endian hint for priamry stream | |
| writeI<int32_t>(out2, 1, be); // endian hint for secondary stream | |
| writeU<TIndexOffU>(out1, eh._len, be); // length of string (and bwt and suffix array) | |
| writeI<int32_t>(out1, eh._lineRate, be); // 2^lineRate = size in bytes of 1 line | |
| writeI<int32_t>(out1, eh._linesPerSide, be); // not used | |
| writeI<int32_t>(out1, eh._offRate, be); // every 2^offRate chars is "marked" | |
| writeI<int32_t>(out1, eh._ftabChars, be); // number of 2-bit chars used to address ftab | |
| int32_t flags = 1; | |
| if(eh._color) flags |= EBWT_COLOR; | |
| if(eh._entireReverse) flags |= EBWT_ENTIRE_REV; | |
| writeI<int32_t>(out1, -flags, be); // BTL: chunkRate is now deprecated | |
| if(!justHeader) { | |
| assert(isInMemory()); | |
| // These Ebwt parameters are known after the inputs strings have | |
| // been joined() but before they have been built(). These can | |
| // written to the disk next and then discarded from memory. | |
| writeU<TIndexOffU>(out1, this->_nPat, be); | |
| for(TIndexOffU i = 0; i < this->_nPat; i++) | |
| writeU<TIndexOffU>(out1, this->_plen[i], be); | |
| assert_geq(this->_nFrag, this->_nPat); | |
| writeU<TIndexOffU>(out1, this->_nFrag, be); | |
| for(TIndexOffU i = 0; i < this->_nFrag*3; i++) | |
| writeU<TIndexOffU>(out1, this->_rstarts[i], be); | |
| // These Ebwt parameters are discovered only as the Ebwt is being | |
| // built (in buildToDisk()). Of these, only 'offs' and 'ebwt' are | |
| // terribly large. 'ebwt' is written to the primary file and then | |
| // discarded from memory as it is built; 'offs' is similarly | |
| // written to the secondary file and discarded. | |
| out1.write((const char *)this->ebwt(), eh._ebwtTotLen); | |
| writeU<TIndexOffU>(out1, this->zOff(), be); | |
| TIndexOffU offsLen = eh._offsLen; | |
| for(TIndexOffU i = 0; i < offsLen; i++) | |
| writeU<TIndexOffU>(out2, this->_offs[i], be); | |
| uint32_t isaLen = eh._isaLen; | |
| for(TIndexOffU i = 0; i < isaLen; i++) | |
| writeU<TIndexOffU>(out2, this->_isa[i], be); | |
| // 'fchr', 'ftab' and 'eftab' are not fully determined until the | |
| // loop is finished, so they are written to the primary file after | |
| // all of 'ebwt' has already been written and only then discarded | |
| // from memory. | |
| for(int i = 0; i < 5; i++) | |
| writeU<TIndexOffU>(out1, this->_fchr[i], be); | |
| for(TIndexOffU i = 0; i < eh._ftabLen; i++) | |
| writeU<TIndexOffU>(out1, this->ftab()[i], be); | |
| for(TIndexOffU i = 0; i < eh._eftabLen; i++) | |
| writeU<TIndexOffU>(out1, this->eftab()[i], be); | |
| } | |
| } | |
| /** | |
| * Given a pair of strings representing output filenames, and assuming | |
| * this Ebwt object is currently in memory, write out this Ebwt to the | |
| * specified files. | |
| * | |
| * If sanity-checking is enabled, then once the streams have been | |
| * fully written and closed, we reopen them and read them into a | |
| * (hopefully) exact copy of this Ebwt. We then assert that the | |
| * current Ebwt and the copy match in all of their fields. | |
| */ | |
| template<typename TStr> | |
| void Ebwt<TStr>::writeFromMemory(bool justHeader, | |
| const string& out1, | |
| const string& out2) const | |
| { | |
| const EbwtParams& eh = this->_eh; | |
| assert(isInMemory()); | |
| assert(eh.repOk()); | |
| ofstream fout1(out1.c_str(), ios::binary); | |
| ofstream fout2(out2.c_str(), ios::binary); | |
| writeFromMemory(justHeader, fout1, fout2); | |
| fout1.close(); | |
| fout2.close(); | |
| // Read the file back in and assert that all components match | |
| if(_sanity) { | |
| if(_verbose) | |
| cout << "Re-reading \"" << out1 << "\"/\"" << out2 << "\" for sanity check" << endl; | |
| Ebwt copy(out1, out2, _verbose, _sanity); | |
| assert(!isInMemory()); | |
| copy.loadIntoMemory(eh._color ? 1 : 0, -1, false, false); | |
| assert(isInMemory()); | |
| assert_eq(eh._lineRate, copy.eh()._lineRate); | |
| assert_eq(eh._linesPerSide, copy.eh()._linesPerSide); | |
| assert_eq(eh._offRate, copy.eh()._offRate); | |
| assert_eq(eh._isaRate, copy.eh()._isaRate); | |
| assert_eq(eh._ftabChars, copy.eh()._ftabChars); | |
| assert_eq(eh._len, copy.eh()._len); | |
| assert_eq(_zOff, copy.zOff()); | |
| assert_eq(_zEbwtBpOff, copy.zEbwtBpOff()); | |
| assert_eq(_zEbwtByteOff, copy.zEbwtByteOff()); | |
| assert_eq(_nPat, copy.nPat()); | |
| for(TIndexOffU i = 0; i < _nPat; i++) | |
| assert_eq(this->_plen[i], copy.plen()[i]); | |
| assert_eq(this->_nFrag, copy.nFrag()); | |
| for(TIndexOffU i = 0; i < this->nFrag*3; i++) { | |
| assert_eq(this->_rstarts[i], copy.rstarts()[i]); | |
| } | |
| for(uint32_t i = 0; i < 5; i++) | |
| assert_eq(this->_fchr[i], copy.fchr()[i]); | |
| for(TIndexOffU i = 0; i < eh._ftabLen; i++) | |
| assert_eq(this->ftab()[i], copy.ftab()[i]); | |
| for(TIndexOffU i = 0; i < eh._eftabLen; i++) | |
| assert_eq(this->eftab()[i], copy.eftab()[i]); | |
| for(TIndexOffU i = 0; i < eh._offsLen; i++) | |
| assert_eq(this->_offs[i], copy.offs()[i]); | |
| for(TIndexOffU i = 0; i < eh._isaLen; i++) | |
| assert_eq(this->_isa[i], copy.isa()[i]); | |
| for(TIndexOffU i = 0; i < eh._ebwtTotLen; i++) | |
| assert_eq(this->ebwt()[i], copy.ebwt()[i]); | |
| //copy.sanityCheckAll(); | |
| if(_verbose) | |
| cout << "Read-in check passed for \"" << out1 << "\"/\"" << out2 << "\"" << endl; | |
| } | |
| } | |
| /////////////////////////////////////////////////////////////////////// | |
| // | |
| // Functions for building Ebwts | |
| // | |
| /////////////////////////////////////////////////////////////////////// | |
| /** | |
| * Join several text strings together in a way that's compatible with | |
| * the text-chunking scheme dictated by chunkRate parameter. | |
| * | |
| * The non-static member Ebwt::join additionally builds auxilliary | |
| * arrays that maintain a mapping between chunks in the joined string | |
| * and the original text strings. | |
| */ | |
| template<typename TStr> | |
| TStr Ebwt<TStr>::join(vector<TStr>& l, uint32_t seed) { | |
| RandomSource rand; // reproducible given same seed | |
| rand.init(seed); | |
| TStr ret; | |
| size_t guessLen = 0; | |
| for(size_t i = 0; i < l.size(); i++) { | |
| guessLen += length(l[i]); | |
| } | |
| reserve(ret, guessLen, Exact()); | |
| for(size_t i = 0; i < l.size(); i++) { | |
| TStr& s = l[i]; | |
| assert_gt(length(s), 0); | |
| append(ret, s); | |
| } | |
| return ret; | |
| } | |
| /** | |
| * Join several text strings together in a way that's compatible with | |
| * the text-chunking scheme dictated by chunkRate parameter. | |
| * | |
| * The non-static member Ebwt::join additionally builds auxilliary | |
| * arrays that maintain a mapping between chunks in the joined string | |
| * and the original text strings. | |
| */ | |
| template<typename TStr> | |
| TStr Ebwt<TStr>::join(vector<FileBuf*>& l, | |
| vector<RefRecord>& szs, | |
| TIndexOffU sztot, | |
| const RefReadInParams& refparams, | |
| uint32_t seed) | |
| { | |
| RandomSource rand; // reproducible given same seed | |
| rand.init(seed); | |
| RefReadInParams rpcp = refparams; | |
| TStr ret; | |
| size_t guessLen = sztot; | |
| reserve(ret, guessLen, Exact()); | |
| ASSERT_ONLY(size_t szsi = 0); | |
| for(size_t i = 0; i < l.size(); i++) { | |
| // For each sequence we can pull out of istream l[i]... | |
| assert(!l[i]->eof()); | |
| bool first = true; | |
| while(!l[i]->eof()) { | |
| RefRecord rec = fastaRefReadAppend(*l[i], first, ret, rpcp); | |
| #ifndef ACCOUNT_FOR_ALL_GAP_REFS | |
| if(rec.first && rec.len == 0) rec.first = false; | |
| #endif | |
| first = false; | |
| size_t bases = rec.len; | |
| assert_eq(rec.off, szs[szsi].off); | |
| assert_eq(rec.len, szs[szsi].len); | |
| assert_eq(rec.first, szs[szsi].first); | |
| ASSERT_ONLY(szsi++); | |
| if(bases == 0) continue; | |
| } | |
| } | |
| return ret; | |
| } | |
| /** | |
| * Join several text strings together according to the text-chunking | |
| * scheme specified in the EbwtParams. Ebwt fields calculated in this | |
| * function are written directly to disk. | |
| * | |
| * It is assumed, but not required, that the header values have already | |
| * been written to 'out1' before this function is called. | |
| * | |
| * The static member Ebwt::join just returns a joined version of a | |
| * list of strings without building any of the auxiliary arrays. | |
| * Because the pseudo-random number generator is the same, we expect | |
| * this function and the static function to give the same result given | |
| * the same seed. | |
| */ | |
| template<typename TStr> | |
| void Ebwt<TStr>::joinToDisk( | |
| vector<FileBuf*>& l, | |
| vector<RefRecord>& szs, | |
| vector<uint32_t>& plens, | |
| TIndexOffU sztot, | |
| const RefReadInParams& refparams, | |
| TStr& ret, | |
| ostream& out1, | |
| ostream& out2, | |
| uint32_t seed) | |
| { | |
| RandomSource rand; // reproducible given same seed | |
| rand.init(seed); | |
| RefReadInParams rpcp = refparams; | |
| assert_gt(szs.size(), 0); | |
| assert_gt(l.size(), 0); | |
| assert_gt(sztot, 0); | |
| // Not every fragment represents a distinct sequence - many | |
| // fragments may correspond to a single sequence. Count the | |
| // number of sequences here by counting the number of "first" | |
| // fragments. | |
| this->_nPat = 0; | |
| this->_nFrag = 0; | |
| #ifdef ACCOUNT_FOR_ALL_GAP_REFS | |
| int nGapFrag = 0; | |
| #endif | |
| for(size_t i = 0; i < szs.size(); i++) { | |
| if(szs[i].len > 0) this->_nFrag++; | |
| #ifdef ACCOUNT_FOR_ALL_GAP_REFS | |
| if(szs[i].len == 0 && szs[i].off > 0) nGapFrag++; | |
| if(szs[i].first && szs[i].len > 0) this->_nPat++; | |
| #else | |
| // For all records where len=0 and first=1, set first=0 | |
| assert(szs[i].len > 0 || !szs[i].first); | |
| if(szs[i].first) this->_nPat++; | |
| #endif | |
| } | |
| assert_gt(this->_nPat, 0); | |
| assert_geq(this->_nFrag, this->_nPat); | |
| this->_rstarts = NULL; | |
| writeU<TIndexOffU>(out1, this->_nPat, this->toBe()); | |
| assert_eq(plens.size(), this->_nPat); | |
| // Allocate plen[] | |
| try { | |
| this->_plen = new TIndexOffU[this->_nPat]; | |
| } catch(bad_alloc& e) { | |
| cerr << "Out of memory allocating plen[] in Ebwt::join()" | |
| << " at " << __FILE__ << ":" << __LINE__ << endl; | |
| throw e; | |
| } | |
| // For each pattern, set plen | |
| for(TIndexOffU i = 0; i < plens.size(); i++) { | |
| this->_plen[i] = plens[i]; | |
| writeU<TIndexOffU>(out1, this->_plen[i], this->toBe()); | |
| } | |
| // Write the number of fragments | |
| writeU<TIndexOffU>(out1, this->_nFrag, this->toBe()); | |
| TIndexOffU seqsRead = 0; | |
| ASSERT_ONLY(TIndexOffU szsi = 0); | |
| ASSERT_ONLY(TIndexOffU entsWritten = 0); | |
| // For each filebuf | |
| for(unsigned int i = 0; i < l.size(); i++) { | |
| assert(!l[i]->eof()); | |
| bool first = true; | |
| TIndexOffU patoff = 0; | |
| // For each *fragment* (not necessary an entire sequence) we | |
| // can pull out of istream l[i]... | |
| while(!l[i]->eof()) { | |
| string name; | |
| // Push a new name onto our vector | |
| _refnames.push_back(""); | |
| //uint32_t oldRetLen = length(ret); | |
| RefRecord rec = fastaRefReadAppend(*l[i], first, ret, rpcp, &_refnames.back()); | |
| #ifndef ACCOUNT_FOR_ALL_GAP_REFS | |
| if(rec.first && rec.len == 0) rec.first = false; | |
| #endif | |
| first = false; | |
| if(rec.first) { | |
| if(_refnames.back().length() == 0) { | |
| // If name was empty, replace with an index | |
| ostringstream stm; | |
| stm << (_refnames.size()-1); | |
| _refnames.back() = stm.str(); | |
| } | |
| } else { | |
| // This record didn't actually start a new sequence so | |
| // no need to add a name | |
| //assert_eq(0, _refnames.back().length()); | |
| _refnames.pop_back(); | |
| } | |
| assert_lt(szsi, szs.size()); | |
| assert(szs[szsi].first == 0 || szs[szsi].first == 1); | |
| assert_eq(rec.off, szs[szsi].off); | |
| assert_eq(rec.len, szs[szsi].len); | |
| // szs[szsi].first == 2 sometimes?!?! g++ is unable to do | |
| // the following correctly, regardless of how I write it | |
| //assert((rec.first == 0) == (szs[szsi].first == 0)); | |
| assert(rec.first || rec.off > 0); | |
| ASSERT_ONLY(szsi++); | |
| #ifdef ACCOUNT_FOR_ALL_GAP_REFS | |
| if(rec.len == 0) continue; | |
| if(rec.first && rec.len > 0) seqsRead++; | |
| assert_leq(rec.len, this->_plen[seqsRead-1]); | |
| #else | |
| if(rec.first) seqsRead++; | |
| if(rec.len == 0) continue; | |
| assert_leq(rec.len, this->_plen[seqsRead-1]); | |
| #endif | |
| // Reset the patoff if this is the first fragment | |
| if(rec.first) patoff = 0; | |
| patoff += rec.off; // add fragment's offset from end of last frag. | |
| // Adjust rpcps | |
| //uint32_t seq = seqsRead-1; | |
| ASSERT_ONLY(entsWritten++); | |
| // This is where rstarts elements are written to the output stream | |
| //writeU32(out1, oldRetLen, this->toBe()); // offset from beginning of joined string | |
| //writeU32(out1, seq, this->toBe()); // sequence id | |
| //writeU32(out1, patoff, this->toBe()); // offset into sequence | |
| patoff += rec.len; | |
| } | |
| assert_gt(szsi, 0); | |
| l[i]->reset(); | |
| assert(!l[i]->eof()); | |
| #ifndef NDEBUG | |
| int c = l[i]->get(); | |
| assert_eq('>', c); | |
| assert(!l[i]->eof()); | |
| l[i]->reset(); | |
| assert(!l[i]->eof()); | |
| #endif | |
| } | |
| assert_eq(entsWritten, this->_nFrag); | |
| } | |
| /** | |
| * Build an Ebwt from a string 's' and its suffix array 'sa' (which | |
| * might actually be a suffix array *builder* that builds blocks of the | |
| * array on demand). The bulk of the Ebwt, i.e. the ebwt and offs | |
| * arrays, is written directly to disk. This is by design: keeping | |
| * those arrays in memory needlessly increases the footprint of the | |
| * building process. Instead, we prefer to build the Ebwt directly | |
| * "to disk" and then read it back into memory later as necessary. | |
| * | |
| * It is assumed that the header values and join-related values (nPat, | |
| * plen) have already been written to 'out1' before this function | |
| * is called. When this function is finished, it will have | |
| * additionally written ebwt, zOff, fchr, ftab and eftab to the primary | |
| * file and offs to the secondary file. | |
| * | |
| * Assume DNA/RNA/any alphabet with 4 or fewer elements. | |
| * Assume occ array entries are 32 bits each. | |
| * | |
| * @param sa the suffix array to convert to a Ebwt | |
| * @param buildISA whether to output an ISA sample into out2 after | |
| * the SA sample | |
| * @param s the original string | |
| * @param out | |
| */ | |
| template<typename TStr> | |
| void Ebwt<TStr>::buildToDisk(InorderBlockwiseSA<TStr>& sa, | |
| const TStr& s, | |
| ostream& out1, | |
| ostream& out2) | |
| { | |
| const EbwtParams& eh = this->_eh; | |
| assert(eh.repOk()); | |
| assert_eq(length(s)+1, sa.size()); | |
| assert_eq(length(s), eh._len); | |
| assert_gt(eh._lineRate, 3); | |
| assert(sa.suffixItrIsReset()); | |
| assert_leq((int)ValueSize<Dna>::VALUE, 4); | |
| TIndexOffU len = eh._len; | |
| TIndexOffU ftabLen = eh._ftabLen; | |
| TIndexOffU sideSz = eh._sideSz; | |
| TIndexOffU ebwtTotSz = eh._ebwtTotSz; | |
| TIndexOffU fchr[] = {0, 0, 0, 0, 0}; | |
| TIndexOffU* ftab = NULL; | |
| TIndexOffU zOff = OFF_MASK; | |
| // Save # of occurrences of each character as we walk along the bwt | |
| TIndexOffU occ[4] = {0, 0, 0, 0}; | |
| // Save 'G' and 'T' occurrences between backward and forward buckets | |
| TIndexOffU occSave[2] = {0, 0}; | |
| // Record rows that should "absorb" adjacent rows in the ftab. | |
| // The absorbed rows represent suffixes shorter than the ftabChars | |
| // cutoff. | |
| uint8_t absorbCnt = 0; | |
| uint8_t *absorbFtab; | |
| try { | |
| VMSG_NL("Allocating ftab, absorbFtab"); | |
| ftab = new TIndexOffU[ftabLen]; | |
| memset(ftab, 0, OFF_SIZE * ftabLen); | |
| absorbFtab = new uint8_t[ftabLen]; | |
| memset(absorbFtab, 0, ftabLen); | |
| } catch(bad_alloc &e) { | |
| cerr << "Out of memory allocating ftab[] or absorbFtab[] " | |
| << "in Ebwt::buildToDisk() at " << __FILE__ << ":" | |
| << __LINE__ << endl; | |
| throw e; | |
| } | |
| assert(ftab != NULL); | |
| assert(absorbFtab != NULL); | |
| // Allocate the side buffer; holds a single side as its being | |
| // constructed and then written to disk. Reused across all sides. | |
| #ifdef SIXTY4_FORMAT | |
| uint64_t *ebwtSide = NULL; | |
| #else | |
| uint8_t *ebwtSide = NULL; | |
| #endif | |
| try { | |
| #ifdef SIXTY4_FORMAT | |
| ebwtSide = new uint64_t[sideSz >> 3]; | |
| #else | |
| ebwtSide = new uint8_t[sideSz]; | |
| #endif | |
| } catch(bad_alloc &e) { | |
| cerr << "Out of memory allocating ebwtSide[] in " | |
| << "Ebwt::buildToDisk() at " << __FILE__ << ":" | |
| << __LINE__ << endl; | |
| throw e; | |
| } | |
| assert(ebwtSide != NULL); | |
| // Allocate a buffer to hold the ISA sample, which we accumulate in | |
| // the loop and then output at the end. We can't write output the | |
| // ISA right away because the order in which we calculate its | |
| // elements is based on the suffix array, which we only see bit by | |
| // bit | |
| uint32_t *isaSample = NULL; | |
| if(eh._isaRate >= 0) { | |
| try { | |
| isaSample = new uint32_t[eh._isaLen]; | |
| } catch(bad_alloc &e) { | |
| cerr << "Out of memory allocating isaSample[] in " | |
| << "Ebwt::buildToDisk() at " << __FILE__ << ":" | |
| << __LINE__ << endl; | |
| throw e; | |
| } | |
| assert(isaSample != NULL); | |
| } | |
| // Points to the base offset within ebwt for the side currently | |
| // being written | |
| TIndexOffU side = 0; | |
| // Points to a byte offset from 'side' within ebwt[] where next | |
| // char should be written | |
| #ifdef SIXTY4_FORMAT | |
| TIndexOff sideCur = (eh._sideBwtSz >> 3) - 1; | |
| #else | |
| TIndexOff sideCur = eh._sideBwtSz - 1; | |
| #endif | |
| // Whether we're assembling a forward or a reverse bucket | |
| bool fw = false; | |
| // Did we just finish writing a forward bucket? (Must be true when | |
| // we exit the loop.) | |
| ASSERT_ONLY(bool wroteFwBucket = false); | |
| // Have we skipped the '$' in the last column yet? | |
| ASSERT_ONLY(bool dollarSkipped = false); | |
| TIndexOffU si = 0; // string offset (chars) | |
| ASSERT_ONLY(TIndexOffU lastSufInt = 0); | |
| ASSERT_ONLY(bool inSA = true); // true iff saI still points inside suffix | |
| // array (as opposed to the padding at the | |
| // end) | |
| // Iterate over packed bwt bytes | |
| VMSG_NL("Entering Ebwt loop"); | |
| ASSERT_ONLY(TIndexOffU beforeEbwtOff = (uint32_t)out1.tellp()); | |
| while(side < ebwtTotSz) { | |
| ASSERT_ONLY(wroteFwBucket = false); | |
| // Sanity-check our cursor into the side buffer | |
| assert_geq(sideCur, 0); | |
| assert_lt(sideCur, (int)eh._sideBwtSz); | |
| assert_eq(0, side % sideSz); // 'side' must be on side boundary | |
| ebwtSide[sideCur] = 0; // clear | |
| assert_lt(side + sideCur, ebwtTotSz); | |
| // Iterate over bit-pairs in the si'th character of the BWT | |
| #ifdef SIXTY4_FORMAT | |
| for(int bpi = 0; bpi < 32; bpi++, si++) | |
| #else | |
| for(int bpi = 0; bpi < 4; bpi++, si++) | |
| #endif | |
| { | |
| int bwtChar; | |
| bool count = true; | |
| if(si <= len) { | |
| // Still in the SA; extract the bwtChar | |
| TIndexOffU saElt = sa.nextSuffix(); | |
| // (that might have triggered sa to calc next suf block) | |
| if(isaSample != NULL && (saElt & eh._isaMask) == saElt) { | |
| // This element belongs in the ISA sample. Add | |
| // an entry mapping the text offset to the offset | |
| // into the suffix array that holds the suffix | |
| // beginning with the character at that text offset | |
| assert_lt((saElt >> eh._isaRate), eh._isaLen); | |
| isaSample[saElt >> eh._isaRate] = si; | |
| } | |
| if(saElt == 0) { | |
| // Don't add the '$' in the last column to the BWT | |
| // transform; we can't encode a $ (only A C T or G) | |
| // and counting it as, say, an A, will mess up the | |
| // LR mapping | |
| bwtChar = 0; count = false; | |
| ASSERT_ONLY(dollarSkipped = true); | |
| zOff = si; // remember the SA row that | |
| // corresponds to the 0th suffix | |
| } else { | |
| bwtChar = (int)(Dna)(s[saElt-1]); | |
| assert_lt(bwtChar, 4); | |
| // Update the fchr | |
| fchr[bwtChar]++; | |
| } | |
| // Update ftab | |
| if((len-saElt) >= (TIndexOffU)eh._ftabChars) { | |
| // Turn the first ftabChars characters of the | |
| // suffix into an integer index into ftab | |
| TIndexOffU sufInt = 0; | |
| for(int i = 0; i < eh._ftabChars; i++) { | |
| sufInt <<= 2; | |
| assert_lt((TIndexOffU)i, len-saElt); | |
| sufInt |= (unsigned char)(Dna)(s[saElt+i]); | |
| } | |
| // Assert that this prefix-of-suffix is greater | |
| // than or equal to the last one (true b/c the | |
| // suffix array is sorted) | |
| #ifndef NDEBUG | |
| if(lastSufInt > 0) assert_geq(sufInt, lastSufInt); | |
| lastSufInt = sufInt; | |
| #endif | |
| // Update ftab | |
| assert_lt(sufInt+1, ftabLen); | |
| ftab[sufInt+1]++; | |
| if(absorbCnt > 0) { | |
| // Absorb all short suffixes since the last | |
| // transition into this transition | |
| absorbFtab[sufInt] = absorbCnt; | |
| absorbCnt = 0; | |
| } | |
| } else { | |
| // Otherwise if suffix is fewer than ftabChars | |
| // characters long, then add it to the 'absorbCnt'; | |
| // it will be absorbed into the next transition | |
| assert_lt(absorbCnt, 255); | |
| absorbCnt++; | |
| } | |
| // Suffix array offset boundary? - update offset array | |
| if((si & eh._offMask) == si) { | |
| assert_lt((si >> eh._offRate), eh._offsLen); | |
| // Write offsets directly to the secondary output | |
| // stream, thereby avoiding keeping them in memory | |
| writeU<TIndexOffU>(out2, saElt, this->toBe()); | |
| } | |
| } else { | |
| // Strayed off the end of the SA, now we're just | |
| // padding out a bucket | |
| #ifndef NDEBUG | |
| if(inSA) { | |
| // Assert that we wrote all the characters in the | |
| // string before now | |
| assert_eq(si, len+1); | |
| inSA = false; | |
| } | |
| #endif | |
| // 'A' used for padding; important that padding be | |
| // counted in the occ[] array | |
| bwtChar = 0; | |
| } | |
| if(count) occ[bwtChar]++; | |
| // Append BWT char to bwt section of current side | |
| if(fw) { | |
| // Forward bucket: fill from least to most | |
| #ifdef SIXTY4_FORMAT | |
| ebwtSide[sideCur] |= ((uint64_t)bwtChar << (bpi << 1)); | |
| if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0); | |
| #else | |
| pack_2b_in_8b(bwtChar, ebwtSide[sideCur], bpi); | |
| assert_eq((ebwtSide[sideCur] >> (bpi*2)) & 3, bwtChar); | |
| #endif | |
| } else { | |
| // Backward bucket: fill from most to least | |
| #ifdef SIXTY4_FORMAT | |
| ebwtSide[sideCur] |= ((uint64_t)bwtChar << ((31 - bpi) << 1)); | |
| if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0); | |
| #else | |
| pack_2b_in_8b(bwtChar, ebwtSide[sideCur], 3-bpi); | |
| assert_eq((ebwtSide[sideCur] >> ((3-bpi)*2)) & 3, bwtChar); | |
| #endif | |
| } | |
| } // end loop over bit-pairs | |
| assert_eq(dollarSkipped ? 3 : 0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3); | |
| #ifdef SIXTY4_FORMAT | |
| assert_eq(0, si & 31); | |
| #else | |
| assert_eq(0, si & 3); | |
| #endif | |
| if(fw) sideCur++; | |
| else sideCur--; | |
| #ifdef SIXTY4_FORMAT | |
| if(sideCur == (int)eh._sideBwtSz >> 3) | |
| #else | |
| if(sideCur == (int)eh._sideBwtSz) | |
| #endif | |
| { | |
| // Forward side boundary | |
| assert_eq(0, si % eh._sideBwtLen); | |
| #ifdef SIXTY4_FORMAT | |
| sideCur = (eh._sideBwtSz >> 3) - 1; | |
| #else | |
| sideCur = eh._sideBwtSz - 1; | |
| #endif | |
| assert(fw); fw = false; | |
| ASSERT_ONLY(wroteFwBucket = true); | |
| // Write 'G' and 'T' | |
| assert_leq(occSave[0], occ[2]); | |
| assert_leq(occSave[1], occ[3]); | |
| TIndexOffU *u32side = reinterpret_cast<TIndexOffU*>(ebwtSide); | |
| side += sideSz; | |
| assert_leq(side, eh._ebwtTotSz); | |
| #ifdef BOWTIE_64BIT_INDEX | |
| u32side[(sideSz >> 3)-2] = endianizeU<TIndexOffU>(occSave[0], this->toBe()); | |
| u32side[(sideSz >> 3)-1] = endianizeU<TIndexOffU>(occSave[1], this->toBe()); | |
| #else | |
| u32side[(sideSz >> 2)-2] = endianizeU<TIndexOffU>(occSave[0], this->toBe()); | |
| u32side[(sideSz >> 2)-1] = endianizeU<TIndexOffU>(occSave[1], this->toBe()); | |
| #endif | |
| // Write forward side to primary file | |
| out1.write((const char *)ebwtSide, sideSz); | |
| } else if (sideCur == -1) { | |
| // Backward side boundary | |
| assert_eq(0, si % eh._sideBwtLen); | |
| sideCur = 0; | |
| assert(!fw); fw = true; | |
| // Write 'A' and 'C' | |
| TIndexOffU *u32side = reinterpret_cast<TIndexOffU*>(ebwtSide); | |
| side += sideSz; | |
| assert_leq(side, eh._ebwtTotSz); | |
| #ifdef BOWTIE_64BIT_INDEX | |
| u32side[(sideSz >> 3)-2] = endianizeU<TIndexOffU>(occ[0], this->toBe()); | |
| u32side[(sideSz >> 3)-1] = endianizeU<TIndexOffU>(occ[1], this->toBe()); | |
| #else | |
| u32side[(sideSz >> 2)-2] = endianizeU<TIndexOffU>(occ[0], this->toBe()); | |
| u32side[(sideSz >> 2)-1] = endianizeU<TIndexOffU>(occ[1], this->toBe()); | |
| #endif | |
| occSave[0] = occ[2]; // save 'G' count | |
| occSave[1] = occ[3]; // save 'T' count | |
| // Write backward side to primary file | |
| out1.write((const char *)ebwtSide, sideSz); | |
| } | |
| } | |
| VMSG_NL("Exited Ebwt loop"); | |
| assert(ftab != NULL); | |
| assert_neq(zOff, OFF_MASK); | |
| if(absorbCnt > 0) { | |
| // Absorb any trailing, as-yet-unabsorbed short suffixes into | |
| // the last element of ftab | |
| absorbFtab[ftabLen-1] = absorbCnt; | |
| } | |
| // Assert that our loop counter got incremented right to the end | |
| assert_eq(side, eh._ebwtTotSz); | |
| // Assert that we wrote the expected amount to out1 | |
| assert_eq(((TIndexOffU)out1.tellp() - beforeEbwtOff), eh._ebwtTotSz); | |
| // assert that the last thing we did was write a forward bucket | |
| assert(wroteFwBucket); | |
| // | |
| // Write zOff to primary stream | |
| // | |
| writeU<TIndexOffU>(out1, zOff, this->toBe()); | |
| // | |
| // Finish building fchr | |
| // | |
| // Exclusive prefix sum on fchr | |
| for(int i = 1; i < 4; i++) { | |
| fchr[i] += fchr[i-1]; | |
| } | |
| assert_eq(fchr[3], len); | |
| // Shift everybody up by one | |
| for(int i = 4; i >= 1; i--) { | |
| fchr[i] = fchr[i-1]; | |
| } | |
| fchr[0] = 0; | |
| if(_verbose) { | |
| for(int i = 0; i < 5; i++) | |
| cout << "fchr[" << "ACGT$"[i] << "]: " << fchr[i] << endl; | |
| } | |
| // Write fchr to primary file | |
| for(int i = 0; i < 5; i++) { | |
| writeU<TIndexOffU>(out1, fchr[i], this->toBe()); | |
| } | |
| // | |
| // Finish building ftab and build eftab | |
| // | |
| // Prefix sum on ftable | |
| TIndexOffU eftabLen = 0; | |
| assert_eq(0, absorbFtab[0]); | |
| for(TIndexOffU i = 1; i < ftabLen; i++) { | |
| if(absorbFtab[i] > 0) eftabLen += 2; | |
| } | |
| assert_leq(eftabLen, (TIndexOffU)eh._ftabChars*2); | |
| eftabLen = eh._ftabChars*2; | |
| TIndexOffU *eftab = NULL; | |
| try { | |
| eftab = new TIndexOffU[eftabLen]; | |
| memset(eftab, 0, OFF_SIZE * eftabLen); | |
| } catch(bad_alloc &e) { | |
| cerr << "Out of memory allocating eftab[] " | |
| << "in Ebwt::buildToDisk() at " << __FILE__ << ":" | |
| << __LINE__ << endl; | |
| throw e; | |
| } | |
| assert(eftab != NULL); | |
| TIndexOffU eftabCur = 0; | |
| for(TIndexOffU i = 1; i < ftabLen; i++) { | |
| TIndexOffU lo = ftab[i] + Ebwt::ftabHi(ftab, eftab, len, ftabLen, eftabLen, i-1); | |
| if(absorbFtab[i] > 0) { | |
| // Skip a number of short pattern indicated by absorbFtab[i] | |
| TIndexOffU hi = lo + absorbFtab[i]; | |
| assert_lt(eftabCur*2+1, eftabLen); | |
| eftab[eftabCur*2] = lo; | |
| eftab[eftabCur*2+1] = hi; | |
| ftab[i] = (eftabCur++) ^ OFF_MASK; // insert pointer into eftab | |
| assert_eq(lo, Ebwt::ftabLo(ftab, eftab, len, ftabLen, eftabLen, i)); | |
| assert_eq(hi, Ebwt::ftabHi(ftab, eftab, len, ftabLen, eftabLen, i)); | |
| } else { | |
| ftab[i] = lo; | |
| } | |
| } | |
| assert_eq(Ebwt::ftabHi(ftab, eftab, len, ftabLen, eftabLen, ftabLen-1), len+1); | |
| // Write ftab to primary file | |
| for(TIndexOffU i = 0; i < ftabLen; i++) { | |
| writeU<TIndexOffU>(out1, ftab[i], this->toBe()); | |
| } | |
| // Write eftab to primary file | |
| for(TIndexOffU i = 0; i < eftabLen; i++) { | |
| writeU<TIndexOffU>(out1, eftab[i], this->toBe()); | |
| } | |
| // Write isa to primary file | |
| if(isaSample != NULL) { | |
| ASSERT_ONLY(Bitset sawISA(eh._len+1)); | |
| for(TIndexOffU i = 0; i < eh._isaLen; i++) { | |
| TIndexOffU s = isaSample[i]; | |
| assert_leq(s, eh._len); | |
| assert(!sawISA.test(s)); | |
| ASSERT_ONLY(sawISA.set(s)); | |
| writeU<TIndexOffU>(out2, s, this->toBe()); | |
| } | |
| delete[] isaSample; | |
| } | |
| delete[] ftab; | |
| delete[] eftab; | |
| delete[] absorbFtab; | |
| // Note: if you'd like to sanity-check the Ebwt, you'll have to | |
| // read it back into memory first! | |
| assert(!isInMemory()); | |
| VMSG_NL("Exiting Ebwt::buildToDisk()"); | |
| } | |
| /** | |
| * Try to find the Bowtie index specified by the user. First try the | |
| * exact path given by the user. Then try the user-provided string | |
| * appended onto the path of the "indexes" subdirectory below this | |
| * executable, then try the provided string appended onto | |
| * "$BOWTIE_INDEXES/". | |
| */ | |
| string adjustEbwtBase(const string& cmdline, | |
| const string& ebwtFileBase, | |
| bool verbose = false); | |
| #endif /*EBWT_H_*/ |