Permalink
Fetching contributors…
Cannot retrieve contributors at this time
4655 lines (4451 sloc) 151 KB
#ifndef EBWT_H_
#define EBWT_H_
#include <stdint.h>
#include <string.h>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <vector>
#include <set>
#include <sstream>
#include <fcntl.h>
#include <errno.h>
#include <stdexcept>
#include <seqan/sequence.h>
#include <seqan/index.h>
#include <sys/stat.h>
#ifdef BOWTIE_MM
#include <sys/mman.h>
#include <sys/shm.h>
#endif
#include "auto_array.h"
#include "shmem.h"
#include "alphabet.h"
#include "assert_helpers.h"
#include "bitpack.h"
#include "blockwise_sa.h"
#include "endian_swap.h"
#include "word_io.h"
#include "random_source.h"
#include "hit.h"
#include "ref_read.h"
#include "threading.h"
#include "bitset.h"
#include "str_util.h"
#include "mm.h"
#include "timer.h"
#include "refmap.h"
#include "color_dec.h"
#include "reference.h"
#ifdef POPCNT_CAPABILITY
#include "processor_support.h"
#endif
using namespace std;
using namespace seqan;
#ifndef PREFETCH_LOCALITY
// No locality by default
#define PREFETCH_LOCALITY 2
#endif
// From ccnt_lut.cpp, automatically generated by gen_lookup_tables.pl
extern uint8_t cCntLUT_4[4][4][256];
static const uint64_t c_table[4] = {
0xffffffffffffffffllu,
0xaaaaaaaaaaaaaaaallu,
0x5555555555555555llu,
0x0000000000000000llu
};
#ifndef VMSG_NL
#define VMSG_NL(args...) \
if(this->verbose()) { \
stringstream tmp; \
tmp << args << endl; \
this->verbose(tmp.str()); \
}
#endif
#ifndef VMSG
#define VMSG(args...) \
if(this->verbose()) { \
stringstream tmp; \
tmp << args; \
this->verbose(tmp.str()); \
}
#endif
/**
* Flags describing type of Ebwt.
*/
enum EBWT_FLAGS {
EBWT_COLOR = 2, // true -> Ebwt is colorspace
EBWT_ENTIRE_REV = 4 // true -> reverse Ebwt is the whole
// concatenated string reversed, rather than
// each stretch reversed
};
extern string gLastIOErrMsg;
inline bool is_read_err(int fdesc, ssize_t ret, size_t count){
if (ret < 0) {
std::stringstream sstm;
sstm << "ERRNO: " << errno << " ERR Msg:" << strerror(errno) << std::endl;
gLastIOErrMsg = sstm.str();
return true;
}
return false;
}
/* Checks whether a call to fread() failed or not. */
inline bool is_fread_err(FILE* file_hd, size_t ret, size_t count){
if (ferror(file_hd)) {
gLastIOErrMsg = "Error Reading File!";
return true;
}
return false;
}
/**
* Extended Burrows-Wheeler transform header. This together with the
* actual data arrays and other text-specific parameters defined in
* class Ebwt constitute the entire Ebwt.
*/
class EbwtParams {
public:
EbwtParams() { }
EbwtParams(TIndexOffU len,
int32_t lineRate,
int32_t linesPerSide,
int32_t offRate,
int32_t isaRate,
int32_t ftabChars,
bool color,
bool entireReverse)
{
init(len, lineRate, linesPerSide, offRate, isaRate, ftabChars, color, entireReverse);
}
EbwtParams(const EbwtParams& eh) {
init(eh._len, eh._lineRate, eh._linesPerSide, eh._offRate,
eh._isaRate, eh._ftabChars, eh._color, eh._entireReverse);
}
void init(TIndexOffU len, int32_t lineRate, int32_t linesPerSide,
int32_t offRate, int32_t isaRate, int32_t ftabChars,
bool color, bool entireReverse)
{
_color = color;
_entireReverse = entireReverse;
_len = len;
_bwtLen = _len + 1;
_sz = (len+3)/4;
_bwtSz = (len/4 + 1);
_lineRate = lineRate;
_linesPerSide = linesPerSide;
_origOffRate = offRate;
_offRate = offRate;
_offMask = OFF_MASK << _offRate;
_isaRate = isaRate;
_isaMask = OFF_MASK << ((_isaRate >= 0) ? _isaRate : 0);
_ftabChars = ftabChars;
_eftabLen = _ftabChars*2;
_eftabSz = _eftabLen*OFF_SIZE;
_ftabLen = (1 << (_ftabChars*2))+1;
_ftabSz = _ftabLen*OFF_SIZE;
_offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate;
_offsSz = (uint64_t)_offsLen*OFF_SIZE;
_isaLen = (_isaRate == -1)? 0 : ((_bwtLen + (1 << _isaRate) - 1) >> _isaRate);
_isaSz = _isaLen*OFF_SIZE;
_lineSz = 1 << _lineRate;
_sideSz = _lineSz * _linesPerSide;
_sideBwtSz = _sideSz - 2*OFF_SIZE;
_sideBwtLen = _sideBwtSz*4;
_numSidePairs = (_bwtSz+(2*_sideBwtSz)-1)/(2*_sideBwtSz);
_numSides = _numSidePairs*2;
_numLines = _numSides * _linesPerSide;
_ebwtTotLen = _numSidePairs * (2*_sideSz);
_ebwtTotSz = _ebwtTotLen;
assert(repOk());
}
TIndexOffU len() const { return _len; }
TIndexOffU bwtLen() const { return _bwtLen; }
TIndexOffU sz() const { return _sz; }
TIndexOffU bwtSz() const { return _bwtSz; }
int32_t lineRate() const { return _lineRate; }
int32_t linesPerSide() const { return _linesPerSide; }
int32_t origOffRate() const { return _origOffRate; }
int32_t offRate() const { return _offRate; }
TIndexOffU offMask() const { return _offMask; }
int32_t isaRate() const { return _isaRate; }
uint32_t isaMask() const { return _isaMask; }
int32_t ftabChars() const { return _ftabChars; }
uint32_t eftabLen() const { return _eftabLen; }
uint32_t eftabSz() const { return _eftabSz; }
TIndexOffU ftabLen() const { return _ftabLen; }
TIndexOffU ftabSz() const { return _ftabSz; }
TIndexOffU offsLen() const { return _offsLen; }
uint64_t offsSz() const { return _offsSz; }
TIndexOffU isaLen() const { return _isaLen; }
uint64_t isaSz() const { return _isaSz; }
uint32_t lineSz() const { return _lineSz; }
uint32_t sideSz() const { return _sideSz; }
uint32_t sideBwtSz() const { return _sideBwtSz; }
uint32_t sideBwtLen() const { return _sideBwtLen; }
uint32_t numSidePairs() const { return _numSidePairs; } /* check */
TIndexOffU numSides() const { return _numSides; }
TIndexOffU numLines() const { return _numLines; }
TIndexOffU ebwtTotLen() const { return _ebwtTotLen; }
TIndexOffU ebwtTotSz() const { return _ebwtTotSz; }
bool color() const { return _color; }
bool entireReverse() const { return _entireReverse; }
/**
* Set a new suffix-array sampling rate, which involves updating
* rate, mask, sample length, and sample size.
*/
void setOffRate(int __offRate) {
_offRate = __offRate;
_offMask = OFF_MASK << _offRate;
_offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate;
_offsSz = (uint64_t) _offsLen*OFF_SIZE;
}
/**
* Set a new inverse suffix-array sampling rate, which involves
* updating rate, mask, sample length, and sample size.
*/
void setIsaRate(int __isaRate) {
_isaRate = __isaRate;
_isaMask = OFF_MASK << _isaRate;
_isaLen = (_bwtLen + (1 << _isaRate) - 1) >> _isaRate;
_isaSz = (uint64_t)_isaLen*OFF_SIZE;
}
/// Check that this EbwtParams is internally consistent
bool repOk() const {
assert_gt(_len, 0);
assert_gt(_lineRate, 3);
assert_geq(_offRate, 0);
assert_leq(_ftabChars, 16);
assert_geq(_ftabChars, 1);
assert_lt(_lineRate, 32);
assert_lt(_linesPerSide, 32);
assert_lt(_ftabChars, 32);
assert_eq(0, _ebwtTotSz % (2*_lineSz));
return true;
}
/**
* Pretty-print the header contents to the given output stream.
*/
void print(ostream& out) const {
out << "Headers:" << endl
<< " len: " << _len << endl
<< " bwtLen: " << _bwtLen << endl
<< " sz: " << _sz << endl
<< " bwtSz: " << _bwtSz << endl
<< " lineRate: " << _lineRate << endl
<< " linesPerSide: " << _linesPerSide << endl
<< " offRate: " << _offRate << endl
<< " offMask: 0x" << hex << _offMask << dec << endl
<< " isaRate: " << _isaRate << endl
<< " isaMask: 0x" << hex << _isaMask << dec << endl
<< " ftabChars: " << _ftabChars << endl
<< " eftabLen: " << _eftabLen << endl
<< " eftabSz: " << _eftabSz << endl
<< " ftabLen: " << _ftabLen << endl
<< " ftabSz: " << _ftabSz << endl
<< " offsLen: " << _offsLen << endl
<< " offsSz: " << _offsSz << endl
<< " isaLen: " << _isaLen << endl
<< " isaSz: " << _isaSz << endl
<< " lineSz: " << _lineSz << endl
<< " sideSz: " << _sideSz << endl
<< " sideBwtSz: " << _sideBwtSz << endl
<< " sideBwtLen: " << _sideBwtLen << endl
<< " numSidePairs: " << _numSidePairs << endl
<< " numSides: " << _numSides << endl
<< " numLines: " << _numLines << endl
<< " ebwtTotLen: " << _ebwtTotLen << endl
<< " ebwtTotSz: " << _ebwtTotSz << endl
<< " reverse: " << _entireReverse << endl;
}
TIndexOffU _len;
TIndexOffU _bwtLen;
TIndexOffU _sz;
TIndexOffU _bwtSz;
int32_t _lineRate;
int32_t _linesPerSide;
int32_t _origOffRate;
int32_t _offRate;
TIndexOffU _offMask;
int32_t _isaRate;
uint32_t _isaMask;
int32_t _ftabChars;
uint32_t _eftabLen;
uint32_t _eftabSz;
TIndexOffU _ftabLen;
TIndexOffU _ftabSz;
TIndexOffU _offsLen;
uint64_t _offsSz;
TIndexOffU _isaLen;
uint64_t _isaSz;
uint32_t _lineSz;
uint32_t _sideSz;
uint32_t _sideBwtSz;
uint32_t _sideBwtLen;
uint32_t _numSidePairs;
TIndexOffU _numSides;
TIndexOffU _numLines;
TIndexOffU _ebwtTotLen;
TIndexOffU _ebwtTotSz;
bool _color;
bool _entireReverse;
};
/**
* Exception to throw when a file-realted error occurs.
*/
class EbwtFileOpenException : public std::runtime_error {
public:
EbwtFileOpenException(const std::string& msg = "") :
std::runtime_error(msg) { }
};
/**
* Calculate size of file with given name.
*/
static inline int64_t fileSize(const char* name) {
std::ifstream f;
f.open(name, std::ios_base::binary | std::ios_base::in);
if (!f.good() || f.eof() || !f.is_open()) { return 0; }
f.seekg(0, std::ios_base::beg);
std::ifstream::pos_type begin_pos = f.tellg();
f.seekg(0, std::ios_base::end);
return static_cast<int64_t>(f.tellg() - begin_pos);
}
// Forward declarations for Ebwt class
struct SideLocus;
template<typename TStr> class EbwtSearchParams;
/**
* Extended Burrows-Wheeler transform data.
*
* An Ebwt may be transferred to and from RAM with calls to
* evictFromMemory() and loadIntoMemory(). By default, a newly-created
* Ebwt is not loaded into memory; if the user would like to use a
* newly-created Ebwt to answer queries, they must first call
* loadIntoMemory().
*/
template <typename TStr>
class Ebwt {
public:
typedef typename Value<TStr>::Type TAlphabet;
#define Ebwt_INITS \
_toBigEndian(currentlyBigEndian()), \
_overrideOffRate(__overrideOffRate), \
_overrideIsaRate(__overrideIsaRate), \
_verbose(verbose), \
_passMemExc(passMemExc), \
_sanity(sanityCheck), \
_fw(__fw), \
_in1(NULL), \
_in2(NULL), \
_zOff(OFF_MASK), \
_zEbwtByteOff(OFF_MASK), \
_zEbwtBpOff(-1), \
_nPat(0), \
_nFrag(0), \
_plen(NULL), \
_rstarts(NULL), \
_fchr(NULL), \
_ftab(NULL), \
_eftab(NULL), \
_offs(NULL), \
_isa(NULL), \
_ebwt(NULL), \
_useMm(false), \
useShmem_(false), \
_refnames(), \
rmap_(NULL), \
mmFile1_(NULL), \
mmFile2_(NULL)
#ifdef EBWT_STATS
#define Ebwt_STAT_INITS \
,mapLFExs_(0llu), \
mapLFs_(0llu), \
mapLFcs_(0llu), \
mapLF1cs_(0llu), \
mapLF1s_(0llu)
#else
#define Ebwt_STAT_INITS
#endif
/// Construct an Ebwt from the given input file
Ebwt(const string& in,
int color,
int needEntireReverse,
bool __fw,
int32_t __overrideOffRate = -1,
int32_t __overrideIsaRate = -1,
bool useMm = false,
bool useShmem = false,
bool mmSweep = false,
bool loadNames = false,
const ReferenceMap* rmap = NULL,
bool verbose = false,
bool startVerbose = false,
bool passMemExc = false,
bool sanityCheck = false) :
Ebwt_INITS
Ebwt_STAT_INITS
{
assert(!useMm || !useShmem);
#ifdef POPCNT_CAPABILITY
ProcessorSupport ps;
_usePOPCNTinstruction = ps.POPCNTenabled();
#endif
rmap_ = rmap;
_useMm = useMm;
useShmem_ = useShmem;
_in1Str = in + ".1." + gEbwt_ext;
_in2Str = in + ".2." + gEbwt_ext;
readIntoMemory(
color, // expect colorspace reference?
__fw ? -1 : needEntireReverse, // need REF_READ_REVERSE
true, // stop after loading the header portion?
&_eh, // params structure to fill in
mmSweep, // mmSweep
loadNames, // loadNames
startVerbose); // startVerbose
// If the offRate has been overridden, reflect that in the
// _eh._offRate field
if(_overrideOffRate > _eh._offRate) {
_eh.setOffRate(_overrideOffRate);
assert_eq(_overrideOffRate, _eh._offRate);
}
// Same with isaRate
if(_overrideIsaRate > _eh._isaRate) {
_eh.setIsaRate(_overrideIsaRate);
assert_eq(_overrideIsaRate, _eh._isaRate);
}
assert(repOk());
}
/// Construct an Ebwt from the given header parameters and string
/// vector, optionally using a blockwise suffix sorter with the
/// given 'bmax' and 'dcv' parameters. The string vector is
/// ultimately joined and the joined string is passed to buildToDisk().
Ebwt(int color,
int32_t lineRate,
int32_t linesPerSide,
int32_t offRate,
int32_t isaRate,
int32_t ftabChars,
int nthreads,
const string& file, // base filename for EBWT files
bool __fw,
bool useBlockwise,
TIndexOffU bmax,
TIndexOffU bmaxSqrtMult,
TIndexOffU bmaxDivN,
int dcv,
vector<FileBuf*>& is,
vector<RefRecord>& szs,
vector<uint32_t>& plens,
TIndexOffU sztot,
const RefReadInParams& refparams,
uint32_t seed,
int32_t __overrideOffRate = -1,
int32_t __overrideIsaRate = -1,
bool verbose = false,
bool passMemExc = false,
bool sanityCheck = false) :
Ebwt_INITS
Ebwt_STAT_INITS,
_eh(joinedLen(szs),
lineRate,
linesPerSide,
offRate,
isaRate,
ftabChars,
color,
refparams.reverse == REF_READ_REVERSE)
{
#ifdef POPCNT_CAPABILITY
ProcessorSupport ps;
_usePOPCNTinstruction = ps.POPCNTenabled();
#endif
_in1Str = file + ".1." + gEbwt_ext;
_in2Str = file + ".2." + gEbwt_ext;
// Open output files
ofstream fout1(_in1Str.c_str(), ios::binary);
if(!fout1.good()) {
cerr << "Could not open index file for writing: \"" << _in1Str << "\"" << endl
<< "Please make sure the directory exists and that permissions allow writing by" << endl
<< "Bowtie." << endl;
throw 1;
}
ofstream fout2(_in2Str.c_str(), ios::binary);
if(!fout2.good()) {
cerr << "Could not open index file for writing: \"" << _in2Str << "\"" << endl
<< "Please make sure the directory exists and that permissions allow writing by" << endl
<< "Bowtie." << endl;
throw 1;
}
// Build
initFromVector(
is,
szs,
plens,
sztot,
refparams,
fout1,
fout2,
file,
nthreads,
useBlockwise,
bmax,
bmaxSqrtMult,
bmaxDivN,
dcv,
seed);
// Close output files
fout1.flush();
int64_t tellpSz1 = (int64_t)fout1.tellp();
VMSG_NL("Wrote " << fout1.tellp() << " bytes to primary EBWT file: " << _in1Str);
fout1.close();
bool err = false;
if(tellpSz1 > fileSize(_in1Str.c_str())) {
err = true;
cerr << "Index is corrupt: File size for " << _in1Str << " should have been " << tellpSz1
<< " but is actually " << fileSize(_in1Str.c_str()) << "." << endl;
}
fout2.flush();
int64_t tellpSz2 = (int64_t)fout2.tellp();
VMSG_NL("Wrote " << fout2.tellp() << " bytes to secondary EBWT file: " << _in2Str);
fout2.close();
if(tellpSz2 > fileSize(_in2Str.c_str())) {
err = true;
cerr << "Index is corrupt: File size for " << _in2Str << " should have been " << tellpSz2
<< " but is actually " << fileSize(_in2Str.c_str()) << "." << endl;
}
if(err) {
cerr << "Please check if there is a problem with the disk or if disk is full." << endl;
throw 1;
}
// Reopen as input streams
VMSG_NL("Re-opening _in1 and _in2 as input streams");
if(_sanity) {
VMSG_NL("Sanity-checking Ebwt");
assert(!isInMemory());
readIntoMemory(
color,
__fw ? -1 : refparams.reverse == REF_READ_REVERSE,
false,
NULL,
false,
true,
false);
sanityCheckAll(refparams.reverse);
evictFromMemory();
assert(!isInMemory());
}
VMSG_NL("Returning from Ebwt constructor");
}
bool isPacked();
/**
* Write the rstarts array given the szs array for the reference.
*/
void szsToDisk(const vector<RefRecord>& szs, ostream& os, int reverse) {
TIndexOffU seq = 0;
TIndexOffU off = 0;
TIndexOffU totlen = 0;
for(unsigned int i = 0; i < szs.size(); i++) {
if(szs[i].len == 0) continue;
if(szs[i].first) off = 0;
off += szs[i].off;
#ifdef ACCOUNT_FOR_ALL_GAP_REFS
if(szs[i].first && szs[i].len > 0) seq++;
#else
if(szs[i].first) seq++;
#endif
TIndexOffU seqm1 = seq-1;
assert_lt(seqm1, _nPat);
TIndexOffU fwoff = off;
if(reverse == REF_READ_REVERSE) {
// Invert pattern idxs
seqm1 = _nPat - seqm1 - 1;
// Invert pattern idxs
assert_leq(off + szs[i].len, _plen[seqm1]);
fwoff = _plen[seqm1] - (off + szs[i].len);
}
writeU<TIndexOffU>(os, totlen, this->toBe()); // offset from beginning of joined string
writeU<TIndexOffU>(os, seqm1, this->toBe()); // sequence id
writeU<TIndexOffU>(os, fwoff, this->toBe()); // offset into sequence
totlen += szs[i].len;
off += szs[i].len;
}
}
/**
* Helper for the constructors above. Takes a vector of text
* strings and joins them into a single string with a call to
* joinToDisk, which does a join (with padding) and writes some of
* the resulting data directly to disk rather than keep it in
* memory. It then constructs a suffix-array producer (what kind
* depends on 'useBlockwise') for the resulting sequence. The
* suffix-array producer can then be used to obtain chunks of the
* joined string's suffix array.
*/
void initFromVector(
vector<FileBuf*>& is,
vector<RefRecord>& szs,
vector<uint32_t>& plens,
TIndexOffU sztot,
const RefReadInParams& refparams,
ofstream& out1,
ofstream& out2,
const string& outfile,
int nthreads,
bool useBlockwise,
TIndexOffU bmax,
TIndexOffU bmaxSqrtMult,
TIndexOffU bmaxDivN,
int dcv,
uint32_t seed)
{
// Compose text strings into single string
VMSG_NL("Calculating joined length");
TStr s; // holds the entire joined reference after call to joinToDisk
TIndexOffU jlen;
jlen = joinedLen(szs);
assert_geq(jlen, sztot);
VMSG_NL("Writing header");
writeFromMemory(true, out1, out2);
try {
VMSG_NL("Reserving space for joined string");
seqan::reserve(s, jlen, Exact());
VMSG_NL("Joining reference sequences");
if(refparams.reverse == REF_READ_REVERSE) {
{
Timer timer(cout, " Time to join reference sequences: ", _verbose);
joinToDisk(is, szs, plens, sztot, refparams, s, out1, out2, seed);
} {
Timer timer(cout, " Time to reverse reference sequence: ", _verbose);
vector<RefRecord> tmp;
reverseInPlace(s);
reverseRefRecords(szs, tmp, false, false);
szsToDisk(tmp, out1, refparams.reverse);
}
} else {
Timer timer(cout, " Time to join reference sequences: ", _verbose);
joinToDisk(is, szs, plens, sztot, refparams, s, out1, out2, seed);
szsToDisk(szs, out1, refparams.reverse);
}
// Joined reference sequence now in 's'
} catch(bad_alloc& e) {
// If we throw an allocation exception in the try block,
// that means that the joined version of the reference
// string itself is too larger to fit in memory. The only
// alternatives are to tell the user to give us more memory
// or to try again with a packed representation of the
// reference (if we haven't tried that already).
cerr << "Could not allocate space for a joined string of " << jlen << " elements." << endl;
if(!isPacked() && _passMemExc) {
// Pass the exception up so that we can retry using a
// packed string representation
throw e;
}
// There's no point passing this exception on. The fact
// that we couldn't allocate the joined string means that
// --bmax is irrelevant - the user should re-run with
// ebwt-build-packed
if(isPacked()) {
cerr << "Please try running bowtie-build on a computer with more memory." << endl;
} else {
cerr << "Please try running bowtie-build in packed mode (-p/--packed) or in automatic" << endl
<< "mode (-a/--auto), or try again on a computer with more memory." << endl;
}
if(sizeof(void*) == 4) {
cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl
<< "this executable is 32-bit." << endl;
}
throw 1;
}
// Succesfully obtained joined reference string
assert_geq(length(s), jlen);
if(bmax != OFF_MASK) {
VMSG_NL("bmax according to bmax setting: " << bmax);
}
else if(bmaxSqrtMult != OFF_MASK) {
bmax *= bmaxSqrtMult;
VMSG_NL("bmax according to bmaxSqrtMult setting: " << bmax);
}
else if(bmaxDivN != OFF_MASK) {
bmax = max<TIndexOffU>(jlen / bmaxDivN, 1);
VMSG_NL("bmax according to bmaxDivN setting: " << bmax);
}
else {
bmax = (TIndexOffU)sqrt(length(s));
VMSG_NL("bmax defaulted to: " << bmax);
}
int iter = 0;
bool first = true;
// Look for bmax/dcv parameters that work.
while(true) {
if(!first && bmax < 40 && _passMemExc) {
cerr << "Could not find approrpiate bmax/dcv settings for building this index." << endl;
if(!isPacked()) {
// Throw an exception exception so that we can
// retry using a packed string representation
throw bad_alloc();
} else {
cerr << "Already tried a packed string representation." << endl;
}
cerr << "Please try indexing this reference on a computer with more memory." << endl;
if(sizeof(void*) == 4) {
cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl
<< "this executable is 32-bit." << endl;
}
throw 1;
}
if(dcv > 4096) dcv = 4096;
if((iter % 6) == 5 && dcv < 4096 && dcv != 0) {
dcv <<= 1; // double difference-cover period
} else {
bmax -= (bmax >> 2); // reduce by 25%
}
VMSG("Using parameters --bmax " << bmax);
if(dcv == 0) {
VMSG_NL(" and *no difference cover*");
} else {
VMSG_NL(" --dcv " << dcv);
}
iter++;
try {
{
VMSG_NL(" Doing ahead-of-time memory usage test");
// Make a quick-and-dirty attempt to force a bad_alloc iff
// we would have thrown one eventually as part of
// constructing the DifferenceCoverSample
dcv <<= 1;
TIndexOffU sz = (TIndexOffU)DifferenceCoverSample<TStr>::simulateAllocs(s, dcv >> 1);
if(nthreads > 1) sz *= (nthreads + 1);
AutoArray<uint8_t> tmp(sz);
dcv >>= 1;
// Likewise with the KarkkainenBlockwiseSA
sz = (TIndexOffU)KarkkainenBlockwiseSA<TStr>::simulateAllocs(s, bmax);
AutoArray<uint8_t> tmp2(sz);
// Now throw in the 'ftab' and 'isaSample' structures
// that we'll eventually allocate in buildToDisk
AutoArray<TIndexOffU> ftab(_eh._ftabLen * 2);
AutoArray<uint8_t> side(_eh._sideSz);
// Grab another 20 MB out of caution
AutoArray<uint32_t> extra(20*1024*1024);
// If we made it here without throwing bad_alloc, then we
// passed the memory-usage stress test
VMSG(" Passed! Constructing with these parameters: --bmax " << bmax << " --dcv " << dcv);
if(isPacked()) {
VMSG(" --packed");
}
VMSG_NL("");
}
VMSG_NL("Constructing suffix-array element generator");
KarkkainenBlockwiseSA<TStr> bsa(s, bmax, nthreads, dcv, seed, _sanity, _passMemExc, _verbose, outfile);
assert(bsa.suffixItrIsReset());
assert_eq(bsa.size(), length(s)+1);
VMSG_NL("Converting suffix-array elements to index image");
buildToDisk(bsa, s, out1, out2);
out1.flush(); out2.flush();
if(out1.fail() || out2.fail()) {
cerr << "An error occurred writing the index to disk. Please check if the disk is full." << endl;
throw 1;
}
break;
} catch(bad_alloc& e) {
if(_passMemExc) {
VMSG_NL(" Ran out of memory; automatically trying more memory-economical parameters.");
} else {
cerr << "Out of memory while constructing suffix array. Please try using a smaller" << endl
<< "number of blocks by specifying a smaller --bmax or a larger --bmaxdivn" << endl;
throw 1;
}
}
first = false;
}
assert(repOk());
// Now write reference sequence names on the end
#ifdef ACCOUNT_FOR_ALL_GAP_REFS
assert_geq(this->_refnames.size(), this->_nPat);
#else
assert_eq(this->_refnames.size(), this->_nPat);
#endif
for(TIndexOffU i = 0; i < this->_refnames.size(); i++) {
out1 << this->_refnames[i] << endl;
}
out1 << '\0';
out1.flush(); out2.flush();
if(out1.fail() || out2.fail()) {
cerr << "An error occurred writing the index to disk. Please check if the disk is full." << endl;
throw 1;
}
VMSG_NL("Returning from initFromVector");
}
/**
* Return the length that the joined string of the given string
* list will have. Note that this is indifferent to how the text
* fragments correspond to input sequences - it just cares about
* the lengths of the fragments.
*/
TIndexOffU joinedLen(vector<RefRecord>& szs) {
TIndexOffU ret = 0;
for(unsigned int i = 0; i < szs.size(); i++) {
ret += szs[i].len;
}
return ret;
}
/// Destruct an Ebwt
~Ebwt() {
// Only free buffers if we're *not* using memory-mapped files
if(!_useMm) {
// Delete everything that was allocated in read(false, ...)
if(_fchr != NULL) delete[] _fchr; _fchr = NULL;
if(_ftab != NULL) delete[] _ftab; _ftab = NULL;
if(_eftab != NULL) delete[] _eftab; _eftab = NULL;
if(_offs != NULL && !useShmem_) {
delete[] _offs; _offs = NULL;
} else if(_offs != NULL && useShmem_) {
FREE_SHARED(_offs);
}
if(_isa != NULL) delete[] _isa; _isa = NULL;
if(_plen != NULL) delete[] _plen; _plen = NULL;
if(_rstarts != NULL) delete[] _rstarts; _rstarts = NULL;
if(_ebwt != NULL && !useShmem_) {
delete[] _ebwt; _ebwt = NULL;
} else if(_ebwt != NULL && useShmem_) {
FREE_SHARED(_ebwt);
}
}
if (_in1 != NULL) fclose(_in1);
if (_in2 != NULL) fclose(_in2);
#ifdef EBWT_STATS
cout << (_fw ? "Forward index:" : "Mirror index:") << endl;
cout << " mapLFEx: " << mapLFExs_ << endl;
cout << " mapLF: " << mapLFs_ << endl;
cout << " mapLF(c): " << mapLFcs_ << endl;
cout << " mapLF1(c): " << mapLF1cs_ << endl;
cout << " mapLF(c): " << mapLF1s_ << endl;
#endif
}
/// Accessors
const EbwtParams& eh() const { return _eh; }
TIndexOffU zOff() const { return _zOff; }
TIndexOffU zEbwtByteOff() const { return _zEbwtByteOff; }
TIndexOff zEbwtBpOff() const { return _zEbwtBpOff; }
TIndexOffU nPat() const { return _nPat; }
TIndexOffU nFrag() const { return _nFrag; }
TIndexOffU* fchr() const { return _fchr; }
TIndexOffU* ftab() const { return _ftab; }
TIndexOffU* eftab() const { return _eftab; }
TIndexOffU* offs() const { return _offs; }
uint32_t* isa() const { return _isa; } /* check */
TIndexOffU* plen() const { return _plen; }
TIndexOffU* rstarts() const { return _rstarts; }
uint8_t* ebwt() const { return _ebwt; }
const ReferenceMap* rmap() const { return rmap_; }
bool toBe() const { return _toBigEndian; }
bool verbose() const { return _verbose; }
bool sanityCheck() const { return _sanity; }
vector<string>& refnames() { return _refnames; }
bool fw() const { return _fw; }
#ifdef POPCNT_CAPABILITY
bool _usePOPCNTinstruction;
#endif
/// Return true iff the Ebwt is currently in memory
bool isInMemory() const {
if(_ebwt != NULL) {
assert(_eh.repOk());
assert(_ftab != NULL);
assert(_eftab != NULL);
assert(_fchr != NULL);
assert(_offs != NULL);
assert(_isa != NULL);
assert(_rstarts != NULL);
assert_neq(_zEbwtByteOff, OFF_MASK);
assert_neq(_zEbwtBpOff, -1);
return true;
} else {
assert(_ftab == NULL);
assert(_eftab == NULL);
assert(_fchr == NULL);
assert(_offs == NULL);
assert(_rstarts == NULL);
assert_eq(_zEbwtByteOff, OFF_MASK);
assert_eq(_zEbwtBpOff, -1);
return false;
}
}
/// Return true iff the Ebwt is currently stored on disk
bool isEvicted() const {
return !isInMemory();
}
/**
* Load this Ebwt into memory by reading it in from the _in1 and
* _in2 streams.
*/
void loadIntoMemory(
int color,
int needEntireReverse,
bool loadNames,
bool verbose)
{
readIntoMemory(
color, // expect index to be colorspace?
needEntireReverse, // require reverse index to be concatenated reference reversed
false, // stop after loading the header portion?
NULL, // params
false, // mmSweep
loadNames, // loadNames
verbose); // startVerbose
}
/**
* Frees memory associated with the Ebwt.
*/
void evictFromMemory() {
assert(isInMemory());
if(!_useMm) {
delete[] _fchr;
delete[] _ftab;
delete[] _eftab;
if(!useShmem_) delete[] _offs;
delete[] _isa;
// Keep plen; it's small and the client may want to query it
// even when the others are evicted.
//delete[] _plen;
delete[] _rstarts;
if(!useShmem_) delete[] _ebwt;
}
_fchr = NULL;
_ftab = NULL;
_eftab = NULL;
_offs = NULL;
_isa = NULL;
// Keep plen; it's small and the client may want to query it
// even when the others are evicted.
//_plen = NULL;
_rstarts = NULL;
_ebwt = NULL;
_zEbwtByteOff = OFF_MASK;
_zEbwtBpOff = -1;
}
/**
* Non-static facade for static function ftabHi.
*/
TIndexOffU ftabHi(TIndexOffU i) const {
return Ebwt::ftabHi(_ftab, _eftab, _eh._len, _eh._ftabLen,
_eh._eftabLen, i);
}
/**
* Get "high interpretation" of ftab entry at index i. The high
* interpretation of a regular ftab entry is just the entry
* itself. The high interpretation of an extended entry is the
* second correpsonding ui32 in the eftab.
*
* It's a static member because it's convenient to ask this
* question before the Ebwt is fully initialized.
*/
static TIndexOffU ftabHi(TIndexOffU *ftab,
TIndexOffU *eftab,
TIndexOffU len,
TIndexOffU ftabLen,
TIndexOffU eftabLen,
TIndexOffU i)
{
assert_lt(i, ftabLen);
if(ftab[i] <= len) {
return ftab[i];
} else {
TIndexOffU efIdx = ftab[i] ^ OFF_MASK;
assert_lt(efIdx*2+1, eftabLen);
return eftab[efIdx*2+1];
}
}
/**
* Non-static facade for static function ftabLo.
*/
TIndexOffU ftabLo(TIndexOffU i) const {
return Ebwt::ftabLo(_ftab, _eftab, _eh._len, _eh._ftabLen,
_eh._eftabLen, i);
}
/**
* Get "low interpretation" of ftab entry at index i. The low
* interpretation of a regular ftab entry is just the entry
* itself. The low interpretation of an extended entry is the
* first correpsonding ui32 in the eftab.
*
* It's a static member because it's convenient to ask this
* question before the Ebwt is fully initialized.
*/
static TIndexOffU ftabLo(TIndexOffU *ftab,
TIndexOffU *eftab,
TIndexOffU len,
TIndexOffU ftabLen,
TIndexOffU eftabLen,
TIndexOffU i)
{
assert_lt(i, ftabLen);
if(ftab[i] <= len) {
return ftab[i];
} else {
TIndexOffU efIdx = ftab[i] ^ OFF_MASK;
assert_lt(efIdx*2+1, eftabLen);
return eftab[efIdx*2];
}
}
/**
* When using read() to create an Ebwt, we have to set a couple of
* additional fields in the Ebwt object that aren't part of the
* parameter list and are not stored explicitly in the file. Right
* now, this just involves initializing _zEbwtByteOff and
* _zEbwtBpOff from _zOff.
*/
void postReadInit(EbwtParams& eh) {
TIndexOffU sideNum = _zOff / eh._sideBwtLen;
TIndexOffU sideCharOff = _zOff % eh._sideBwtLen;
TIndexOffU sideByteOff = sideNum * eh._sideSz;
_zEbwtByteOff = sideCharOff >> 2;
assert_lt(_zEbwtByteOff, eh._sideBwtSz);
_zEbwtBpOff = sideCharOff & 3;
assert_lt(_zEbwtBpOff, 4);
if((sideNum & 1) == 0) {
// This is an even (backward) side
_zEbwtByteOff = eh._sideBwtSz - _zEbwtByteOff - 1;
_zEbwtBpOff = 3 - _zEbwtBpOff;
assert_lt(_zEbwtBpOff, 4);
}
_zEbwtByteOff += sideByteOff;
assert(repOk(eh)); // Ebwt should be fully initialized now
}
/**
* Pretty-print the Ebwt to the given output stream.
*/
void print(ostream& out) const {
print(out, _eh);
}
/**
* Pretty-print the Ebwt and given EbwtParams to the given output
* stream.
*/
void print(ostream& out, const EbwtParams& eh) const {
eh.print(out); // print params
out << "Ebwt (" << (isInMemory()? "memory" : "disk") << "):" << endl
<< " zOff: " << _zOff << endl
<< " zEbwtByteOff: " << _zEbwtByteOff << endl
<< " zEbwtBpOff: " << _zEbwtBpOff << endl
<< " nPat: " << _nPat << endl
<< " plen: ";
if(_plen == NULL) {
out << "NULL" << endl;
} else {
out << "non-NULL, [0] = " << _plen[0] << endl;
}
out << " rstarts: ";
if(_rstarts == NULL) {
out << "NULL" << endl;
} else {
out << "non-NULL, [0] = " << _rstarts[0] << endl;
}
out << " ebwt: ";
if(_ebwt == NULL) {
out << "NULL" << endl;
} else {
out << "non-NULL, [0] = " << _ebwt[0] << endl;
}
out << " fchr: ";
if(_fchr == NULL) {
out << "NULL" << endl;
} else {
out << "non-NULL, [0] = " << _fchr[0] << endl;
}
out << " ftab: ";
if(_ftab == NULL) {
out << "NULL" << endl;
} else {
out << "non-NULL, [0] = " << _ftab[0] << endl;
}
out << " eftab: ";
if(_eftab == NULL) {
out << "NULL" << endl;
} else {
out << "non-NULL, [0] = " << _eftab[0] << endl;
}
out << " offs: ";
if(_offs == NULL) {
out << "NULL" << endl;
} else {
out << "non-NULL, [0] = " << _offs[0] << endl;
}
}
// Building
static TStr join(vector<TStr>& l, uint32_t seed);
static TStr join(vector<FileBuf*>& l, vector<RefRecord>& szs, TIndexOffU sztot, const RefReadInParams& refparams, uint32_t seed);
void joinToDisk(vector<FileBuf*>& l, vector<RefRecord>& szs, vector<uint32_t>& plens, TIndexOffU sztot, const RefReadInParams& refparams, TStr& ret, ostream& out1, ostream& out2, uint32_t seed = 0);
void buildToDisk(InorderBlockwiseSA<TStr>& sa, const TStr& s, ostream& out1, ostream& out2);
// I/O
void readIntoMemory(int color, int needEntireReverse, bool justHeader, EbwtParams *params, bool mmSweep, bool loadNames, bool startVerbose);
void writeFromMemory(bool justHeader, ostream& out1, ostream& out2) const;
void writeFromMemory(bool justHeader, const string& out1, const string& out2) const;
// Sanity checking
void printRangeFw(uint32_t begin, uint32_t end) const;
void printRangeBw(uint32_t begin, uint32_t end) const;
void sanityCheckUpToSide(TIndexOff upToSide) const;
void sanityCheckAll(int reverse) const;
void restore(TStr& s) const;
void checkOrigs(const vector<String<Dna5> >& os, bool color, bool mirror) const;
// Searching and reporting
void joinedToTextOff(TIndexOffU qlen, TIndexOffU off, TIndexOffU& tidx, TIndexOffU& textoff, TIndexOffU& tlen) const;
inline bool report(const String<Dna5>& query, String<char>* quals, String<char>* name, bool color, char primer, char trimc, bool colExEnds, int snpPhred, const BitPairReference* ref, const std::vector<TIndexOffU>& mmui32, const std::vector<uint8_t>& refcs, size_t numMms, TIndexOffU off, TIndexOffU top, TIndexOffU bot, uint32_t qlen, int stratum, uint16_t cost, uint32_t patid, uint32_t seed, const EbwtSearchParams<TStr>& params) const;
inline bool reportChaseOne(const String<Dna5>& query, String<char>* quals, String<char>* name, bool color, char primer, char trimc, bool colExEnds, int snpPhred, const BitPairReference* ref, const std::vector<TIndexOffU>& mmui32, const std::vector<uint8_t>& refcs, size_t numMms, TIndexOffU i, TIndexOffU top, TIndexOffU bot, uint32_t qlen, int stratum, uint16_t cost, uint32_t patid, uint32_t seed, const EbwtSearchParams<TStr>& params, SideLocus *l = NULL) const;
inline bool reportReconstruct(const String<Dna5>& query, String<char>* quals, String<char>* name, String<Dna5>& lbuf, String<Dna5>& rbuf, const TIndexOffU *mmui32, const char* refcs, size_t numMms, TIndexOffU i, TIndexOffU top, TIndexOffU bot, uint32_t qlen, int stratum, const EbwtSearchParams<TStr>& params, SideLocus *l = NULL) const;
inline int rowL(const SideLocus& l) const;
inline TIndexOffU countUpTo(const SideLocus& l, int c) const;
inline void countUpToEx(const SideLocus& l, TIndexOffU* pairs) const;
inline TIndexOffU countFwSide(const SideLocus& l, int c) const;
inline void countFwSideEx(const SideLocus& l, TIndexOffU *pairs) const;
inline TIndexOffU countBwSide(const SideLocus& l, int c) const;
inline void countBwSideEx(const SideLocus& l, TIndexOffU *pairs) const;
inline TIndexOffU mapLF(const SideLocus& l ASSERT_ONLY(, bool overrideSanity = false)) const;
inline void mapLFEx(const SideLocus& l, TIndexOffU *pairs ASSERT_ONLY(, bool overrideSanity = false)) const;
inline void mapLFEx(const SideLocus& ltop, const SideLocus& lbot, TIndexOffU *tops, TIndexOffU *bots ASSERT_ONLY(, bool overrideSanity = false)) const;
inline TIndexOffU mapLF(const SideLocus& l, int c ASSERT_ONLY(, bool overrideSanity = false)) const;
inline TIndexOffU mapLF1(TIndexOffU row, const SideLocus& l, int c ASSERT_ONLY(, bool overrideSanity = false)) const;
inline int mapLF1(TIndexOffU& row, const SideLocus& l ASSERT_ONLY(, bool overrideSanity = false)) const;
/// Check that in-memory Ebwt is internally consistent with respect
/// to given EbwtParams; assert if not
bool inMemoryRepOk(const EbwtParams& eh) const {
assert_leq(ValueSize<TAlphabet>::VALUE, 4);
assert_geq(_zEbwtBpOff, 0);
assert_lt(_zEbwtBpOff, 4);
assert_lt(_zEbwtByteOff, eh._ebwtTotSz);
assert_lt(_zOff, eh._bwtLen);
assert(_rstarts != NULL);
assert_geq(_nFrag, _nPat);
return true;
}
/// Check that in-memory Ebwt is internally consistent; assert if
/// not
bool inMemoryRepOk() const {
return repOk(_eh);
}
/// Check that Ebwt is internally consistent with respect to given
/// EbwtParams; assert if not
bool repOk(const EbwtParams& eh) const {
assert(_eh.repOk());
if(isInMemory()) {
return inMemoryRepOk(eh);
}
return true;
}
/// Check that Ebwt is internally consistent; assert if not
bool repOk() const {
return repOk(_eh);
}
bool _toBigEndian;
int32_t _overrideOffRate;
int32_t _overrideIsaRate;
bool _verbose;
bool _passMemExc;
bool _sanity;
bool _fw; // true iff this is a forward index
FILE *_in1; // input fd for primary index file
FILE *_in2; // input fd for secondary index file
string _in1Str; // filename for primary index file
string _in2Str; // filename for secondary index file
TIndexOffU _zOff;
TIndexOffU _zEbwtByteOff;
TIndexOff _zEbwtBpOff;
TIndexOffU _nPat; /// number of reference texts
TIndexOffU _nFrag; /// number of fragments
TIndexOffU* _plen;
TIndexOffU* _rstarts; // starting offset of fragments / text indexes
// _fchr, _ftab and _eftab are expected to be relatively small
// (usually < 1MB, perhaps a few MB if _fchr is particularly large
// - like, say, 11). For this reason, we don't bother with writing
// them to disk through separate output streams; we
TIndexOffU* _fchr;
TIndexOffU* _ftab;
TIndexOffU* _eftab; // "extended" entries for _ftab
// _offs may be extremely large. E.g. for DNA w/ offRate=4 (one
// offset every 16 rows), the total size of _offs is the same as
// the total size of the input sequence
TIndexOffU* _offs;
TIndexOffU* _isa;
// _ebwt is the Extended Burrows-Wheeler Transform itself, and thus
// is at least as large as the input sequence.
uint8_t* _ebwt;
bool _useMm; /// use memory-mapped files to hold the index
bool useShmem_; /// use shared memory to hold large parts of the index
vector<string> _refnames; /// names of the reference sequences
const ReferenceMap* rmap_; /// mapping into another reference coordinate space
char *mmFile1_;
char *mmFile2_;
EbwtParams _eh;
#ifdef BOWTIE_64BIT_INDEX
static const int default_lineRate = 7;
#else
static const int default_lineRate = 6;
#endif
#ifdef EBWT_STATS
uint64_t mapLFExs_;
uint64_t mapLFs_;
uint64_t mapLFcs_;
#endif
private:
ostream& log() const {
return cout; // TODO: turn this into a parameter
}
/// Print a verbose message and flush (flushing is helpful for
/// debugging)
void verbose(const string& s) const {
if(this->verbose()) {
this->log() << s;
this->log().flush();
}
}
};
/// Specialization for packed Ebwts - return true
template<>
bool Ebwt<String<Dna, Packed<> > >::isPacked() {
return true;
}
/// By default, Ebwts are not packed
template<typename TStr>
bool Ebwt<TStr>::isPacked() {
return false;
}
/**
* Structure encapsulating search parameters, such as whether and how
* to backtrack and how to deal with multiple equally-good hits.
*/
template<typename TStr>
class EbwtSearchParams {
public:
EbwtSearchParams(HitSinkPerThread& sink,
const vector<String<Dna5> >& texts,
bool fw = true,
bool ebwtFw = true) :
_sink(sink),
_texts(texts),
_patid(0xffffffff),
_fw(fw) { }
HitSinkPerThread& sink() const { return _sink; }
void setPatId(uint32_t patid) { _patid = patid; }
uint32_t patId() const { return _patid; }
void setFw(bool fw) { _fw = fw; }
bool fw() const { return _fw; }
/**
* Report a hit. Returns true iff caller can call off the search.
*/
bool reportHit(const String<Dna5>& query, // read sequence
String<char>* quals, // read quality values
String<char>* name, // read name
bool color, // true -> read is colorspace
char primer, // primer base trimmed from beginning
char trimc, // first color trimmed from beginning
bool colExEnds, // true -> exclude nucleotides at extreme ends after decoding
int snpPhred, // penalty for a SNP
const BitPairReference* ref, // reference (= NULL if not necessary)
const ReferenceMap* rmap, // map to another reference coordinate system
bool ebwtFw, // whether index is forward (true) or mirror (false)
const std::vector<TIndexOffU>& mmui32, // mismatch list
const std::vector<uint8_t>& refcs, // reference characters
size_t numMms, // # mismatches
UPair h, // ref coords
UPair mh, // mate's ref coords
bool mfw, // mate's orientation
uint16_t mlen, // mate length
UPair a, // arrow pair
uint32_t tlen, // length of text
uint32_t qlen, // length of query
int stratum, // alignment stratum
uint16_t cost, // cost of alignment
uint32_t oms, // approx. # other valid alignments
uint32_t patid,
uint32_t seed,
uint8_t mate) const
{
#ifndef NDEBUG
// Check that no two elements of the mms array are the same
for(size_t i = 0; i < numMms; i++) {
for(size_t j = i+1; j < numMms; j++) {
assert_neq(mmui32[i], mmui32[j]);
}
}
#endif
// If ebwtFw is true, then 'query' and 'quals' are reversed
// If _fw is false, then 'query' and 'quals' are reverse complemented
assert(!color || ref != NULL);
assert(quals != NULL);
assert(name != NULL);
assert_eq(mmui32.size(), refcs.size());
assert_leq(numMms, mmui32.size());
assert_gt(qlen, 0);
Hit hit;
hit.stratum = stratum;
hit.cost = cost;
hit.patSeq = query;
hit.quals = *quals;
if(!ebwtFw) {
// Re-reverse the pattern and the quals back to how they
// appeared in the read file
::reverseInPlace(hit.patSeq);
::reverseInPlace(hit.quals);
}
if(color) {
hit.colSeq = hit.patSeq;
hit.colQuals = hit.quals;
hit.crefcs.resize(qlen, 0);
// Turn the mmui32 and refcs arrays into the mm FixedBitset and
// the refc vector
for(size_t i = 0; i < numMms; i++) {
if (ebwtFw != _fw) {
// The 3' end is on the left but the mm vector encodes
// mismatches w/r/t the 5' end, so we flip
uint32_t off = qlen - mmui32[i] - 1;
hit.cmms.set(off);
hit.crefcs[off] = refcs[i];
} else {
hit.cmms.set(mmui32[i]);
hit.crefcs[i] = refcs[i];
}
}
assert(ref != NULL);
char read[1024];
uint32_t rfbuf[(1024+16)/4];
ASSERT_ONLY(char rfbuf2[1024]);
char qual[1024];
char ns[1024];
char cmm[1024];
char nmm[1024];
int cmms = 0;
int nmms = 0;
// TODO: account for indels when calculating these bounds
size_t readi = 0;
size_t readf = seqan::length(hit.patSeq);
size_t refi = 0;
size_t reff = readf + 1;
bool maqRound = false;
for(size_t i = 0; i < qlen + 1; i++) {
if(i < qlen) {
read[i] = (int)hit.patSeq[i];
qual[i] = mmPenalty(maqRound, phredCharToPhredQual(hit.quals[i]));
}
ASSERT_ONLY(rfbuf2[i] = ref->getBase(h.first, h.second + i));
}
int offset = ref->getStretch(rfbuf, h.first, h.second, qlen + 1);
char *rf = (char*)rfbuf + offset;
for(size_t i = 0; i < qlen + 1; i++) {
assert_eq(rf[i], rfbuf2[i]);
rf[i] = (1 << rf[i]);
}
decodeHit(
read, // ASCII colors, '0', '1', '2', '3', '.'
qual, // ASCII quals, Phred+33 encoded
readi, // offset of first character within 'read' to consider
readf, // offset of last char (exclusive) in 'read' to consider
rf, // reference sequence, as masks
refi, // offset of first character within 'ref' to consider
reff, // offset of last char (exclusive) in 'ref' to consider
snpPhred, // penalty incurred by a SNP
ns, // decoded nucleotides are appended here
cmm, // where the color mismatches are in the string
nmm, // where nucleotide mismatches are in the string
cmms, // number of color mismatches
nmms);// number of nucleotide mismatches
size_t nqlen = qlen + (colExEnds ? -1 : 1);
seqan::resize(hit.patSeq, nqlen);
seqan::resize(hit.quals, nqlen);
hit.refcs.resize(nqlen);
size_t lo = colExEnds ? 1 : 0;
size_t hi = colExEnds ? qlen : qlen+1;
size_t destpos = 0;
for(size_t i = lo; i < hi; i++, destpos++) {
// Set sequence character
assert_leq(ns[i], 4);
assert_geq(ns[i], 0);
hit.patSeq[destpos] = (Dna5)(int)ns[i];
// Set initial quality
hit.quals[destpos] = '!';
// Color mismatches penalize quality
if(i > 0) {
if(cmm[i-1] == 'M') {
if((int)hit.quals[destpos] + (int)qual[i-1] > 126) {
hit.quals[destpos] = 126;
} else {
hit.quals[destpos] += qual[i-1];
}
} else if((int)hit.colSeq[i-1] != 4) {
hit.quals[destpos] -= qual[i-1];
}
}
if(i < qlen) {
if(cmm[i] == 'M') {
if((int)hit.quals[destpos] + (int)qual[i] > 126) {
hit.quals[destpos] = 126;
} else {
hit.quals[destpos] += qual[i];
}
} else if((int)hit.patSeq[i] != 4) {
hit.quals[destpos] -= qual[i];
}
}
if(hit.quals[destpos] < '!') {
hit.quals[destpos] = '!';
}
if(nmm[i] != 'M') {
uint32_t off = (uint32_t)i - (colExEnds? 1:0);
if(!_fw) off = (uint32_t)nqlen - off - 1;
assert_lt(off, nqlen);
hit.mms.set(off);
hit.refcs[off] = "ACGT"[ref->getBase(h.first, h.second+i)];
}
}
if(colExEnds) {
// Extreme bases have been removed; that makes the
// nucleotide alignment one character shorter than the
// color alignment
qlen--; mlen--;
// It also shifts the alignment's offset up by 1
h.second++;
} else {
// Extreme bases are included; that makes the
// nucleotide alignment one character longer than the
// color alignment
qlen++; mlen++;
}
} else {
// Turn the mmui32 and refcs arrays into the mm FixedBitset and
// the refc vector
hit.refcs.resize(qlen, 0);
for(size_t i = 0; i < numMms; i++) {
if (ebwtFw != _fw) {
// The 3' end is on the left but the mm vector encodes
// mismatches w/r/t the 5' end, so we flip
uint32_t off = qlen - mmui32[i] - 1;
hit.mms.set(off);
hit.refcs[off] = refcs[i];
} else {
hit.mms.set(mmui32[i]);
hit.refcs[mmui32[i]] = refcs[i];
}
}
}
// Check the hit against the original text, if it's available
if(_texts.size() > 0) {
assert_lt(h.first, _texts.size());
FixedBitset<1024> diffs;
// This type of check assumes that only mismatches are
// possible. If indels are possible, then we either need
// the caller to provide information about indel locations,
// or we need to extend this to a more complicated check.
assert_leq(h.second + qlen, length(_texts[h.first]));
for(size_t i = 0; i < qlen; i++) {
assert_neq(4, (int)_texts[h.first][h.second + i]);
// Forward pattern appears at h
if((int)hit.patSeq[i] != (int)_texts[h.first][h.second + i]) {
uint32_t qoff = (uint32_t)i;
// if ebwtFw != _fw the 3' end is on on the
// left end of the pattern, but the diff vector
// should encode mismatches w/r/t the 5' end,
// so we flip
if (_fw) diffs.set(qoff);
else diffs.set(qlen - qoff - 1);
}
}
if(diffs != hit.mms) {
// Oops, mismatches were not where we expected them;
// print a diagnostic message before asserting
cerr << "Expected " << hit.mms.str() << " mismatches, got " << diffs.str() << endl;
cerr << " Pat: " << hit.patSeq << endl;
cerr << " Tseg: ";
for(size_t i = 0; i < qlen; i++) {
cerr << _texts[h.first][h.second + i];
}
cerr << endl;
cerr << " mmui32: ";
for(size_t i = 0; i < numMms; i++) {
cerr << mmui32[i] << " ";
}
cerr << endl;
cerr << " FW: " << _fw << endl;
cerr << " Ebwt FW: " << ebwtFw << endl;
}
if(diffs != hit.mms) assert(false);
}
hit.h = h;
if(rmap != NULL) rmap->map(hit.h);
hit.patId = ((patid == 0xffffffff) ? _patid : patid);
hit.patName = *name;
hit.mh = mh;
hit.fw = _fw;
hit.mfw = mfw;
hit.mlen = mlen;
hit.oms = oms;
hit.mate = mate;
hit.color = color;
hit.primer = primer;
hit.trimc = trimc;
hit.seed = seed;
assert(hit.repOk());
return sink().reportHit(hit, stratum);
}
private:
HitSinkPerThread& _sink;
const vector<String<Dna5> >& _texts; // original texts, if available (if not
// available, _texts.size() == 0)
uint32_t _patid; // id of current read
bool _fw; // current read is forward-oriented
};
/**
* Encapsulates a location in the bwt text in terms of the side it
* occurs in and its offset within the side.
*/
struct SideLocus {
SideLocus() :
_sideByteOff(0),
_sideNum(0),
_charOff(0),
_fw(true),
_by(-1),
_bp(-1) { }
/**
* Construct from row and other relevant information about the Ebwt.
*/
SideLocus(TIndexOffU row, const EbwtParams& ep, const uint8_t* ebwt) {
initFromRow(row, ep, ebwt);
}
/**
* Init two SideLocus objects from a top/bot pair, using the result
* from one call to initFromRow to possibly avoid a second call.
*/
static void initFromTopBot(TIndexOffU top,
TIndexOffU bot,
const EbwtParams& ep,
const uint8_t* ebwt,
SideLocus& ltop,
SideLocus& lbot)
{
const TIndexOffU sideBwtLen = ep._sideBwtLen;
const uint32_t sideBwtSz = ep._sideBwtSz;
assert_gt(bot, top);
ltop.initFromRow(top, ep, ebwt);
TIndexOffU spread = bot - top;
if(ltop._charOff + spread < sideBwtLen) {
lbot._charOff = (uint32_t)(ltop._charOff + spread);
lbot._sideNum = ltop._sideNum;
lbot._sideByteOff = ltop._sideByteOff;
lbot._fw = ltop._fw;
lbot._by = lbot._charOff >> 2;
assert_lt(lbot._by, (int)sideBwtSz);
if(!lbot._fw) lbot._by = sideBwtSz - lbot._by - 1;
lbot._bp = lbot._charOff & 3;
if(!lbot._fw) lbot._bp ^= 3;
} else {
lbot.initFromRow(bot, ep, ebwt);
}
}
/**
* Calculate SideLocus based on a row and other relevant
* information about the shape of the Ebwt.
*/
void initFromRow(TIndexOffU row, const EbwtParams& ep, const uint8_t* ebwt) {
const uint32_t sideSz = ep._sideSz;
// Side length is hard-coded for now; this allows the compiler
// to do clever things to accelerate / and %.
_sideNum = row / (56*OFF_SIZE);
_charOff = row % (56*OFF_SIZE);
_sideByteOff = _sideNum * sideSz;
assert_leq(row, ep._len);
assert_leq(_sideByteOff + sideSz, ep._ebwtTotSz);
#ifndef NO_PREFETCH
__builtin_prefetch((const void *)(ebwt + _sideByteOff),
0 /* prepare for read */,
PREFETCH_LOCALITY);
#endif
// prefetch this side too
_fw = (_sideNum & 1) != 0; // odd-numbered sides are forward
_by = _charOff >> 2; // byte within side
assert_lt(_by, (int)ep._sideBwtSz);
_bp = _charOff & 3; // bit-pair within byte
if(!_fw) {
_by = ep._sideBwtSz - _by - 1;
_bp ^= 3;
}
}
/// Return true iff this is an initialized SideLocus
bool valid() {
return _bp != -1;
}
/// Make this look like an invalid SideLocus
void invalidate() {
_bp = -1;
}
const uint8_t *side(const uint8_t* ebwt) const {
return ebwt + _sideByteOff;
}
const uint8_t *oside(const uint8_t* ebwt) const {
return ebwt + _sideByteOff + (_fw? (-128) : (128));
}
TIndexOffU _sideByteOff; // offset of top side within ebwt[]
TIndexOffU _sideNum; // index of side
uint16_t _charOff; // character offset within side
bool _fw; // side is forward or backward?
int16_t _by; // byte within side (not adjusted for bw sides)
int8_t _bp; // bitpair within byte (not adjusted for bw sides)
};
#include "ebwt_search_backtrack.h"
///////////////////////////////////////////////////////////////////////
//
// Functions for printing and sanity-checking Ebwts
//
///////////////////////////////////////////////////////////////////////
/**
* Given a range of positions in the EBWT array within the BWT portion
* of a forward side, print the characters at those positions along
* with a summary occ[] array.
*/
template<typename TStr>
void Ebwt<TStr>::printRangeFw(uint32_t begin, uint32_t end) const {
assert(isInMemory());
uint32_t occ[] = {0, 0, 0, 0};
assert_gt(end, begin);
for(uint32_t i = begin; i < end; i++) {
uint8_t by = this->_ebwt[i];
for(int j = 0; j < 4; j++) {
// Unpack from lowest to highest bit pair
int twoBit = unpack_2b_from_8b(by, j);
occ[twoBit]++;
cout << "ACGT"[twoBit];
}
assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3);
}
cout << ":{" << occ[0] << "," << occ[1] << "," << occ[2] << "," << occ[3] << "}" << endl;
}
/**
* Given a range of positions in the EBWT array within the BWT portion
* of a backward side, print the characters at those positions along
* with a summary occ[] array.
*/
template<typename TStr> /* check - update */
void Ebwt<TStr>::printRangeBw(uint32_t begin, uint32_t end) const {
assert(isInMemory());
uint32_t occ[] = {0, 0, 0, 0};
assert_gt(end, begin);
for(uint32_t i = end-1; i >= begin; i--) {
uint8_t by = this->_ebwt[i];
for(int j = 3; j >= 0; j--) {
// Unpack from lowest to highest bit pair
int twoBit = unpack_2b_from_8b(by, j);
occ[twoBit]++;
cout << "ACGT"[twoBit];
}
assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3);
if(i == 0) break;
}
cout << ":{" << occ[0] << "," << occ[1] << "," << occ[2] << "," << occ[3] << "}" << endl;
}
/**
* Check that the ebwt array is internally consistent up to (and not
* including) the given side index by re-counting the chars and
* comparing against the embedded occ[] arrays.
*/
template<typename TStr>
void Ebwt<TStr>::sanityCheckUpToSide(TIndexOff upToSide) const {
assert(isInMemory());
TIndexOffU occ[] = {0, 0, 0, 0};
ASSERT_ONLY(TIndexOffU occ_save[] = {0, 0});
TIndexOffU cur = 0; // byte pointer
const EbwtParams& eh = this->_eh;
bool fw = false;
while(cur < (TIndexOffU)(upToSide * eh._sideSz)) {
assert_leq(cur + eh._sideSz, eh._ebwtTotLen);
for(uint32_t i = 0; i < eh._sideBwtSz; i++) {
uint8_t by = this->_ebwt[cur + (fw ? i : eh._sideBwtSz-i-1)];
for(int j = 0; j < 4; j++) {
// Unpack from lowest to highest bit pair
int twoBit = unpack_2b_from_8b(by, fw ? j : 3-j);
occ[twoBit]++;
//if(_verbose) cout << "ACGT"[twoBit];
}
assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) % 4);
}
assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) % eh._sideBwtLen);
if(fw) {
// Finished forward bucket; check saved [G] and [T]
// against the two uint32_ts encoded here
ASSERT_ONLY(TIndexOffU *u32ebwt = reinterpret_cast<TIndexOffU*>(&this->_ebwt[cur + eh._sideBwtSz]));
ASSERT_ONLY(TIndexOffU gs = u32ebwt[0]);
ASSERT_ONLY(TIndexOffU ts = u32ebwt[1]);
assert_eq(gs, occ_save[0]);
assert_eq(ts, occ_save[1]);
fw = false;
} else {
// Finished backward bucket; check current [A] and [C]
// against the two uint32_ts encoded here
ASSERT_ONLY(TIndexOffU *u32ebwt = reinterpret_cast<TIndexOffU*>(&this->_ebwt[cur + eh._sideBwtSz]));
ASSERT_ONLY(TIndexOffU as = u32ebwt[0]);
ASSERT_ONLY(TIndexOffU cs = u32ebwt[1]);
assert(as == occ[0] || as == occ[0]-1); // one 'a' is a skipped '$' and doesn't count toward occ[]
assert_eq(cs, occ[1]);
ASSERT_ONLY(occ_save[0] = occ[2]); // save gs
ASSERT_ONLY(occ_save[1] = occ[3]); // save ts
fw = true;
}
cur += eh._sideSz;
}
}
/**
* Sanity-check various pieces of the Ebwt
*/
template<typename TStr>
void Ebwt<TStr>::sanityCheckAll(int reverse) const {
const EbwtParams& eh = this->_eh;
assert(isInMemory());
// Check ftab
for(TIndexOffU i = 1; i < eh._ftabLen; i++) {
assert_geq(this->ftabHi(i), this->ftabLo(i-1));
assert_geq(this->ftabLo(i), this->ftabHi(i-1));
assert_leq(this->ftabHi(i), eh._bwtLen+1);
}
assert_eq(this->ftabHi(eh._ftabLen-1), eh._bwtLen);
// Check offs
TIndexOff seenLen = (eh._bwtLen + 31) >> ((TIndexOffU)5);
TIndexOff *seen;
try {
seen = new TIndexOff[seenLen]; // bitvector marking seen offsets
} catch(bad_alloc& e) {
cerr << "Out of memory allocating seen[] at " << __FILE__ << ":" << __LINE__ << endl;
throw e;
}
memset(seen, 0, OFF_SIZE * seenLen);
TIndexOffU offsLen = eh._offsLen;
for(TIndexOffU i = 0; i < offsLen; i++) {
assert_lt(this->_offs[i], eh._bwtLen);
TIndexOff w = this->_offs[i] >> 5;
TIndexOff r = this->_offs[i] & 31;
assert_eq(0, (seen[w] >> r) & 1); // shouldn't have been seen before
seen[w] |= (1 << r);
}
delete[] seen;
// Check nPat
assert_gt(this->_nPat, 0);
// Check plen, flen
for(TIndexOffU i = 0; i < this->_nPat; i++) {
assert_geq(this->_plen[i], 0);
}
// Check rstarts
for(TIndexOffU i = 0; i < this->_nFrag-1; i++) {
assert_gt(this->_rstarts[(i+1)*3], this->_rstarts[i*3]);
if(reverse == REF_READ_REVERSE) {
assert(this->_rstarts[(i*3)+1] >= this->_rstarts[((i+1)*3)+1]);
} else {
assert(this->_rstarts[(i*3)+1] <= this->_rstarts[((i+1)*3)+1]);
}
}
// Check ebwt
sanityCheckUpToSide(eh._numSides);
VMSG_NL("Ebwt::sanityCheck passed");
}
///////////////////////////////////////////////////////////////////////
//
// Functions for searching Ebwts
//
///////////////////////////////////////////////////////////////////////
/**
* Return the final character in row i (i.e. the i'th character in the
* BWT transform). Note that the 'L' in the name of the function
* stands for 'last', as in the literature.
*/
template<typename TStr>
inline int Ebwt<TStr>::rowL(const SideLocus& l) const {
// Extract and return appropriate bit-pair
#ifdef SIXTY4_FORMAT
return (((uint64_t*)l.side(this->_ebwt))[l._by >> 3] >> ((((l._by & 7) << 2) + l._bp) << 1)) & 3;
#else
return unpack_2b_from_8b(l.side(this->_ebwt)[l._by], l._bp);
#endif
}
/**
* Inline-function version of the above. This does not always seem to
* be inlined
*/
#if 0
// Use gcc's intrinsic popcountll. I don't recommend it because it
// seems to be somewhat slower than the bit-bashing pop64 routine both
// on an AMD server and on an Intel workstation. On the other hand,
// perhaps when the builtin is used GCC is smart enough to insert a
// pop-count instruction on architectures that have one (e.g. Itanium).
// For now, it's disabled.
#define pop64(x) __builtin_popcountll(x)
#elif 0
__declspec naked int __stdcall pop64
(uint64_t v)
{
static const uint64_t C55 = 0x5555555555555555ll;
static const uint64_t C33 = 0x3333333333333333ll;
static const uint64_t C0F = 0x0F0F0F0F0F0F0F0Fll;
__asm {
MOVD MM0, [ESP+4] ;v_low
PUNPCKLDQ MM0, [ESP+8] ;v
MOVQ MM1, MM0 ;v
PSRLD MM0, 1 ;v >> 1
PAND MM0, [C55] ;(v >> 1) & 0x55555555
PSUBD MM1, MM0 ;w = v - ((v >> 1) & 0x55555555)
MOVQ MM0, MM1 ;w
PSRLD MM1, 2 ;w >> 2
PAND MM0, [C33] ;w & 0x33333333
PAND MM1, [C33] ;(w >> 2) & 0x33333333
PADDD MM0, MM1 ;x = (w & 0x33333333) +
; ((w >> 2) & 0x33333333)
MOVQ MM1, MM0 ;x
PSRLD MM0, 4 ;x >> 4
PADDD MM0, MM1 ;x + (x >> 4)
PAND MM0, [C0F] ;y = (x + (x >> 4) & 0x0F0F0F0F)
PXOR MM1, MM1 ;0
PSADBW (MM0, MM1) ;sum across all 8 bytes
MOVD EAX, MM0 ;result in EAX per calling
; convention
EMMS ;clear MMX state
RET 8 ;pop 8-byte argument off stack
; and return
}
}
#elif 0
// Use a bytewise LUT version of popcount. This is slower than the
// bit-bashing pop64 routine both on an AMD server and on an Intel
// workstation. It seems to be about the same speed as the GCC builtin
// on Intel, and a bit faster than it on AMD. For now, it's disabled.
const int popcntU8Table[256] = {
0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8
};
// Use this bytewise population count table
inline static int pop64(uint64_t x) {
const unsigned char * p = (const unsigned char *) &x;
return popcntU8Table[p[0]] +
popcntU8Table[p[1]] +
popcntU8Table[p[2]] +
popcntU8Table[p[3]] +
popcntU8Table[p[4]] +
popcntU8Table[p[5]] +
popcntU8Table[p[6]] +
popcntU8Table[p[7]];
}
#else
#ifdef POPCNT_CAPABILITY // wrapping of "struct"
struct USE_POPCNT_GENERIC {
#endif
// Use this standard bit-bashing population count
inline static int pop64(uint64_t x) {
x = x - ((x >> 1) & 0x5555555555555555llu);
x = (x & 0x3333333333333333llu) + ((x >> 2) & 0x3333333333333333llu);
x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Fllu;
x = x + (x >> 8);
x = x + (x >> 16);
x = x + (x >> 32);
return x & 0x3F;
}
#ifdef POPCNT_CAPABILITY // wrapping a "struct"
};
#endif
#ifdef POPCNT_CAPABILITY
struct USE_POPCNT_INSTRUCTION {
inline static int pop64(uint64_t x) {
int64_t count;
asm ("popcntq %[x],%[count]\n": [count] "=&r" (count): [x] "r" (x));
return count;
}
};
#endif
#endif
/**
* Tricky-bit-bashing bitpair counting for given two-bit value (0-3)
* within a 64-bit argument.
*/
#ifdef POPCNT_CAPABILITY
template<typename Operation>
#endif
inline static int countInU64(int c, uint64_t dw) {
uint64_t c0 = c_table[c];
uint64_t x0 = dw ^ c0;
uint64_t x1 = (x0 >> 1);
uint64_t x2 = x1 & (0x5555555555555555llu);
uint64_t x3 = x0 & x2;
#ifdef POPCNT_CAPABILITY
uint64_t tmp = Operation().pop64(x3);
#else
uint64_t tmp = pop64(x3);
#endif
return (int) tmp;
}
/**
* Tricky-bit-bashing bitpair counting for given two-bit value (0-3)
* within a 64-bit argument.
*
* Function gets 2.32% in profile
*/
#ifdef POPCNT_CAPABILITY
template<typename Operation>
#endif
inline static void countInU64Ex(uint64_t dw, TIndexOffU* arrs) {
uint64_t c0 = c_table[0];
uint64_t x0 = dw ^ c0;
uint64_t x1 = (x0 >> 1);
uint64_t x2 = x1 & (0x5555555555555555llu);
uint64_t x3 = x0 & x2;
#ifdef POPCNT_CAPABILITY
uint64_t tmp = Operation().pop64(x3);
#else
uint64_t tmp = pop64(x3);
#endif
arrs[0] += (uint32_t) tmp;
c0 = c_table[1];
x0 = dw ^ c0;
x1 = (x0 >> 1);
x2 = x1 & (0x5555555555555555llu);
x3 = x0 & x2;
#ifdef POPCNT_CAPABILITY
tmp = Operation().pop64(x3);
#else
tmp = pop64(x3);
#endif
arrs[1] += (uint32_t) tmp;
c0 = c_table[2];
x0 = dw ^ c0;
x1 = (x0 >> 1);
x2 = x1 & (0x5555555555555555llu);
x3 = x0 & x2;
#ifdef POPCNT_CAPABILITY
tmp = Operation().pop64(x3);
#else
tmp = pop64(x3);
#endif
arrs[2] += (uint32_t) tmp;
c0 = c_table[3];
x0 = dw ^ c0;
x1 = (x0 >> 1);
x2 = x1 & (0x5555555555555555llu);
x3 = x0 & x2;
#ifdef POPCNT_CAPABILITY
tmp = Operation().pop64(x3);
#else
tmp = pop64(x3);
#endif
arrs[3] += (uint32_t) tmp;
}
/**
* Counts the number of occurrences of character 'c' in the given Ebwt
* side up to (but not including) the given byte/bitpair (by/bp).
*
* This is a performance-critical function. This is the top search-
* related hit in the time profile.
*
* Function gets 11.09% in profile
*/
template<typename TStr>
inline TIndexOffU Ebwt<TStr>::countUpTo(const SideLocus& l, int c) const {
// Count occurrences of c in each 64-bit (using bit trickery);
// Someday countInU64() and pop() functions should be
// vectorized/SSE-ized in case that helps.
TIndexOffU cCnt = 0;
const uint8_t *side = l.side(this->_ebwt);
int i = 0;
#if 1
#ifdef POPCNT_CAPABILITY
if ( _usePOPCNTinstruction) {
for(; i + 7 < l._by; i += 8) {
cCnt += countInU64<USE_POPCNT_INSTRUCTION>(c, *(uint64_t*)&side[i]);
}
}
else {
for(; i + 7 < l._by; i += 8) {
cCnt += countInU64<USE_POPCNT_GENERIC>(c, *(uint64_t*)&side[i]);
}
}
#else
for(; i + 7 < l._by; i += 8) {
cCnt += countInU64(c, *(uint64_t*)&side[i]);
}
#endif
#else
for(; i + 2 < l._by; i += 2) {
cCnt += cCntLUT_16b_4[c][*(uint16_t*)&side[i]];
}
#endif
#ifdef SIXTY4_FORMAT
// Calculate number of bit pairs to shift off the end
const int bpShiftoff = 32 - (((l._by & 7) << 2) + l._bp);
if(bpShiftoff < 32) {
assert_lt(bpShiftoff, 32);
const uint64_t sw = (*(uint64_t*)&side[i]) << (bpShiftoff << 1);
#ifdef POPCNT_CAPABILITY
if (_usePOPCNTinstruction) {
cCnt += countInU64<USE_POPCNT_INSTRUCTION>(c, sw);
}
else {
cCnt += countInU64<USE_POPCNT_GENERIC>(c, sw);
}
#else
cCnt += countInU64(c, sw);
#endif
if(c == 0) cCnt -= bpShiftoff; // we turned these into As
}
#else
// Count occurences of c in the rest of the side (using LUT)
for(; i < l._by; i++) {
cCnt += cCntLUT_4[0][c][side[i]];
}
// Count occurences of c in the rest of the byte
if(l._bp > 0) {
cCnt += cCntLUT_4[(int)l._bp][c][side[i]];
}
#endif
return cCnt;
}
/**
* Counts the number of occurrences of character 'c' in the given Ebwt
* side up to (but not including) the given byte/bitpair (by/bp).
*/
template<typename TStr>
inline void Ebwt<TStr>::countUpToEx(const SideLocus& l, TIndexOffU* arrs) const {
int i = 0;
// Count occurrences of c in each 64-bit (using bit trickery);
// note: this seems does not seem to lend a significant boost to
// performance. If you comment out this whole loop (which won't
// affect correctness - it will just cause the following loop to
// take up the slack) then runtime does not change noticeably.
// Someday the countInU64() and pop() functions should be
// vectorized/SSE-ized in case that helps.
const uint8_t *side = l.side(this->_ebwt);
#ifdef POPCNT_CAPABILITY
if (_usePOPCNTinstruction) {
for(; i+7 < l._by; i += 8) {
countInU64Ex<USE_POPCNT_INSTRUCTION>(*(uint64_t*)&side[i], arrs);
}
}
else {
for(; i+7 < l._by; i += 8) {
countInU64Ex<USE_POPCNT_GENERIC>(*(uint64_t*)&side[i], arrs);
}
}
#else
for(; i+7 < l._by; i += 8) {
countInU64Ex(*(uint64_t*)&side[i], arrs);
}
#endif
#ifdef SIXTY4_FORMAT
// Calculate number of bit pairs to shift off the end
const int bpShiftoff = 32 - (((l._by & 7) << 2) + l._bp);
assert_leq(bpShiftoff, 32);
if(bpShiftoff < 32) {
const uint64_t sw = (*(uint64_t*)&l.side(this->_ebwt)[i]) << (bpShiftoff << 1);
#ifdef POPCNT_CAPABILITY
if (_usePOPCNTinstruction) {
countInU64Ex<USE_POPCNT_INSTRUCTION>(sw, arrs);
}
else{
countInU64Ex<USE_POPCNT_GENERIC>(sw, arrs);
}
#else
countInU64Ex(sw, arrs);
#endif
arrs[0] -= bpShiftoff;
}
#else
// Count occurences of c in the rest of the side (using LUT)
for(; i < l._by; i++) {
arrs[0] += cCntLUT_4[0][0][side[i]];
arrs[1] += cCntLUT_4[0][1][side[i]];
arrs[2] += cCntLUT_4[0][2][side[i]];
arrs[3] += cCntLUT_4[0][3][side[i]];
}
// Count occurences of c in the rest of the byte
if(l._bp > 0) {
arrs[0] += cCntLUT_4[(int)l._bp][0][side[i]];
arrs[1] += cCntLUT_4[(int)l._bp][1][side[i]];
arrs[2] += cCntLUT_4[(int)l._bp][2][side[i]];
arrs[3] += cCntLUT_4[(int)l._bp][3][side[i]];
}
#endif
}
/**
* Count all occurrences of character c from the beginning of the
* forward side to <by,bp> and add in the occ[] count up to the side
* break just prior to the side.
*/
template<typename TStr>
inline TIndexOffU Ebwt<TStr>::countFwSide(const SideLocus& l, int c) const { /* check */
assert_lt(c, 4);
assert_geq(c, 0);
assert_lt(l._by, (int)this->_eh._sideBwtSz);
assert_geq(l._by, 0);
assert_lt(l._bp, 4);
assert_geq(l._bp, 0);
const uint8_t *side = l.side(this->_ebwt);
TIndexOffU cCnt = countUpTo(l, c);
assert_leq(cCnt, this->_eh._sideBwtLen);
if(c == 0 && l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
// Adjust for the fact that we represented $ with an 'A', but
// shouldn't count it as an 'A' here
if((l._sideByteOff + l._by > _zEbwtByteOff) ||
(l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
{
cCnt--; // Adjust for '$' looking like an 'A'
}
}
TIndexOffU ret;
// Now factor in the occ[] count at the side break
if(c < 2) {
const TIndexOffU *ac = reinterpret_cast<const TIndexOffU*>(side - 2*OFF_SIZE);
assert_leq(ac[0], this->_eh._numSides * this->_eh._sideBwtLen); // b/c it's used as padding
assert_leq(ac[1], this->_eh._len);
ret = ac[c] + cCnt + this->_fchr[c];
} else {
const TIndexOffU *gt = reinterpret_cast<const TIndexOffU*>(side + this->_eh._sideSz - 2*OFF_SIZE); // next
assert_leq(gt[0], this->_eh._len); assert_leq(gt[1], this->_eh._len);
ret = gt[c-2] + cCnt + this->_fchr[c];
}
#ifndef NDEBUG
assert_leq(ret, this->_fchr[c+1]); // can't have jumpded into next char's section
if(c == 0) {
assert_leq(cCnt, this->_eh._sideBwtLen);
} else {
assert_leq(ret, this->_eh._bwtLen);
}
#endif
return ret;
}
/**
* Count all occurrences of character c from the beginning of the
* forward side to <by,bp> and add in the occ[] count up to the side
* break just prior to the side.
*/
template<typename TStr>
inline void Ebwt<TStr>::countFwSideEx(const SideLocus& l, TIndexOffU* arrs) const
{
assert_lt(l._by, (int)this->_eh._sideBwtSz);
assert_geq(l._by, 0);
assert_lt(l._bp, 4);
assert_geq(l._bp, 0);
countUpToEx(l, arrs);
#ifndef NDEBUG
assert_leq(arrs[0], this->_fchr[1]); // can't have jumped into next char's section
assert_leq(arrs[1], this->_fchr[2]); // can't have jumped into next char's section
assert_leq(arrs[2], this->_fchr[3]); // can't have jumped into next char's section
assert_leq(arrs[3], this->_fchr[4]); // can't have jumped into next char's section
#endif
assert_leq(arrs[0], this->_eh._sideBwtLen);
assert_leq(arrs[1], this->_eh._sideBwtLen);
assert_leq(arrs[2], this->_eh._sideBwtLen);
assert_leq(arrs[3], this->_eh._sideBwtLen);
const uint8_t *side = l.side(this->_ebwt);
if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
// Adjust for the fact that we represented $ with an 'A', but
// shouldn't count it as an 'A' here
if((l._sideByteOff + l._by > _zEbwtByteOff) ||
(l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
{
arrs[0]--; // Adjust for '$' looking like an 'A'
}
}
// Now factor in the occ[] count at the side break
const TIndexOffU *ac = reinterpret_cast<const TIndexOffU*>(side - 2*OFF_SIZE);
const TIndexOffU *gt = reinterpret_cast<const TIndexOffU*>(side + this->_eh._sideSz - 2*OFF_SIZE);
#ifndef NDEBUG
assert_leq(ac[0], this->_fchr[1] + this->_eh.sideBwtLen());
assert_leq(ac[1], this->_fchr[2]-this->_fchr[1]);
assert_leq(gt[0], this->_fchr[3]-this->_fchr[2]);
assert_leq(gt[1], this->_fchr[4]-this->_fchr[3]);
#endif
assert_leq(ac[0], this->_eh._len + this->_eh.sideBwtLen()); assert_leq(ac[1], this->_eh._len);
assert_leq(gt[0], this->_eh._len); assert_leq(gt[1], this->_eh._len);
arrs[0] += (ac[0] + this->_fchr[0]);
arrs[1] += (ac[1] + this->_fchr[1]);
arrs[2] += (gt[0] + this->_fchr[2]);
arrs[3] += (gt[1] + this->_fchr[3]);
#ifndef NDEBUG
assert_leq(arrs[0], this->_fchr[1]); // can't have jumpded into next char's section
assert_leq(arrs[1], this->_fchr[2]); // can't have jumpded into next char's section
assert_leq(arrs[2], this->_fchr[3]); // can't have jumpded into next char's section
assert_leq(arrs[3], this->_fchr[4]); // can't have jumpded into next char's section
#endif
}
/**
* Count all instances of character c from <by,bp> to the logical end
* (actual beginning) of the backward side, and subtract that from the
* occ[] count up to the side break.
*/
template<typename TStr>
inline TIndexOffU Ebwt<TStr>::countBwSide(const SideLocus& l, int c) const {
assert_lt(c, 4);
assert_geq(c, 0);
assert_lt(l._by, (int)this->_eh._sideBwtSz);
assert_geq(l._by, 0);
assert_lt(l._bp, 4);
assert_geq(l._bp, 0);
const uint8_t *side = l.side(this->_ebwt);
TIndexOffU cCnt = countUpTo(l, c);
if(rowL(l) == c) cCnt++;
assert_leq(cCnt, this->_eh._sideBwtLen);
if(c == 0 && l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
// Adjust for the fact that we represented $ with an 'A', but
// shouldn't count it as an 'A' here
if((l._sideByteOff + l._by > _zEbwtByteOff) ||
(l._sideByteOff + l._by == _zEbwtByteOff && l._bp >= _zEbwtBpOff))
{
cCnt--;
}
}
TIndexOffU ret;
// Now factor in the occ[] count at the side break
if(c < 2) {
const TIndexOffU *ac = reinterpret_cast<const TIndexOffU*>(side + this->_eh._sideSz - 2*OFF_SIZE);
assert_leq(ac[0], this->_eh._numSides * this->_eh._sideBwtLen); // b/c it's used as padding
assert_leq(ac[1], this->_eh._len);
ret = ac[c] - cCnt + this->_fchr[c];
} else {
const TIndexOffU *gt = reinterpret_cast<const TIndexOffU*>(side + (2*this->_eh._sideSz) - 2*OFF_SIZE); // next
assert_leq(gt[0], this->_eh._len); assert_leq(gt[1], this->_eh._len);
ret = gt[c-2] - cCnt + this->_fchr[c];
}
#ifndef NDEBUG
assert_leq(ret, this->_fchr[c+1]); // can't have jumped into next char's section
if(c == 0) {
assert_leq(cCnt, this->_eh._sideBwtLen);
} else {
assert_lt(ret, this->_eh._bwtLen);
}
#endif
return ret;
}
/**
* Count all instances of character c from <by,bp> to the logical end
* (actual beginning) of the backward side, and subtract that from the
* occ[] count up to the side break.
*/
template<typename TStr>
inline void Ebwt<TStr>::countBwSideEx(const SideLocus& l, TIndexOffU* arrs) const {
assert_lt(l._by, (int)this->_eh._sideBwtSz);
assert_geq(l._by, 0);
assert_lt(l._bp, 4);
assert_geq(l._bp, 0);
const uint8_t *side = l.side(this->_ebwt);
countUpToEx(l, arrs);
arrs[rowL(l)]++;
assert_leq(arrs[0], this->_eh._sideBwtLen);
assert_leq(arrs[1], this->_eh._sideBwtLen);
assert_leq(arrs[2], this->_eh._sideBwtLen);
assert_leq(arrs[3], this->_eh._sideBwtLen);
if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
// Adjust for the fact that we represented $ with an 'A', but
// shouldn't count it as an 'A' here
if((l._sideByteOff + l._by > _zEbwtByteOff) ||
(l._sideByteOff + l._by == _zEbwtByteOff && l._bp >= _zEbwtBpOff))
{
arrs[0]--; // Adjust for '$' looking like an 'A'
}
}
// Now factor in the occ[] count at the side break
const TIndexOffU *ac = reinterpret_cast<const TIndexOffU*>(side + this->_eh._sideSz - 2*OFF_SIZE);
const TIndexOffU *gt = reinterpret_cast<const TIndexOffU*>(side + (2*this->_eh._sideSz) - 2*OFF_SIZE);
#ifndef NDEBUG
assert_leq(ac[0], this->_fchr[1] + this->_eh.sideBwtLen());
assert_leq(ac[1], this->_fchr[2]-this->_fchr[1]);
assert_leq(gt[0], this->_fchr[3]-this->_fchr[2]);
assert_leq(gt[1], this->_fchr[4]-this->_fchr[3]);
#endif
assert_leq(ac[0], this->_eh._len + this->_eh.sideBwtLen()); assert_leq(ac[1], this->_eh._len);
assert_leq(gt[0], this->_eh._len); assert_leq(gt[1], this->_eh._len);
arrs[0] = (ac[0] - arrs[0] + this->_fchr[0]);
arrs[1] = (ac[1] - arrs[1] + this->_fchr[1]);
arrs[2] = (gt[0] - arrs[2] + this->_fchr[2]);
arrs[3] = (gt[1] - arrs[3] + this->_fchr[3]);
#ifndef NDEBUG
assert_leq(arrs[0], this->_fchr[1]); // can't have jumped into next char's section
assert_leq(arrs[1], this->_fchr[2]); // can't have jumped into next char's section
assert_leq(arrs[2], this->_fchr[3]); // can't have jumped into next char's section
assert_leq(arrs[3], this->_fchr[4]); // can't have jumped into next char's section
#endif
}
/**
* Given top and bot loci, calculate counts of all four DNA chars up to
* those loci. Used for more advanced backtracking-search.
*/
template<typename TStr>
inline void Ebwt<TStr>::mapLFEx(const SideLocus& ltop,
const SideLocus& lbot,
TIndexOffU *tops,
TIndexOffU *bots
ASSERT_ONLY(, bool overrideSanity)
) const
{
// TODO: Where there's overlap, reuse the count for the overlapping
// portion
#ifdef EBWT_STATS
const_cast<Ebwt<TStr>*>(this)->mapLFExs_++;
#endif
assert_eq(0, tops[0]); assert_eq(0, bots[0]);
assert_eq(0, tops[1]); assert_eq(0, bots[1]);
assert_eq(0, tops[2]); assert_eq(0, bots[2]);
assert_eq(0, tops[3]); assert_eq(0, bots[3]);
if(ltop._fw) countFwSideEx(ltop, tops); // Forward side
else countBwSideEx(ltop, tops); // Backward side
if(lbot._fw) countFwSideEx(lbot, bots); // Forward side
else countBwSideEx(lbot, bots); // Backward side
#ifndef NDEBUG
if(_sanity && !overrideSanity) {
// Make sure results match up with individual calls to mapLF;
// be sure to override sanity-checking in the callee, or we'll
// have infinite recursion
assert_eq(mapLF(ltop, 0, true), tops[0]);
assert_eq(mapLF(ltop, 1, true), tops[1]);
assert_eq(mapLF(ltop, 2, true), tops[2]);
assert_eq(mapLF(ltop, 3, true), tops[3]);
assert_eq(mapLF(lbot, 0, true), bots[0]);
assert_eq(mapLF(lbot, 1, true), bots[1]);
assert_eq(mapLF(lbot, 2, true), bots[2]);
assert_eq(mapLF(lbot, 3, true), bots[3]);
}
#endif
}
#ifndef NDEBUG
/**
* Given top and bot loci, calculate counts of all four DNA chars up to
* those loci. Used for more advanced backtracking-search.
*/
template<typename TStr>
inline void Ebwt<TStr>::mapLFEx(const SideLocus& l,
TIndexOffU *arrs
ASSERT_ONLY(, bool overrideSanity)
) const
{
assert_eq(0, arrs[0]);
assert_eq(0, arrs[1]);
assert_eq(0, arrs[2]);
assert_eq(0, arrs[3]);
if(l._fw) countFwSideEx(l, arrs); // Forward side
else countBwSideEx(l, arrs); // Backward side
#ifndef NDEBUG
if(_sanity && !overrideSanity) {
// Make sure results match up with individual calls to mapLF;
// be sure to override sanity-checking in the callee, or we'll
// have infinite recursion
assert_eq(mapLF(l, 0, true), arrs[0]);
assert_eq(mapLF(l, 1, true), arrs[1]);
assert_eq(mapLF(l, 2, true), arrs[2]);
assert_eq(mapLF(l, 3, true), arrs[3]);
}
#endif
}
#endif
/**
* Given row i, return the row that the LF mapping maps i to.
*/
template<typename TStr>
inline TIndexOffU Ebwt<TStr>::mapLF(const SideLocus& l
ASSERT_ONLY(, bool overrideSanity)
) const
{
#ifdef EBWT_STATS
const_cast<Ebwt<TStr>*>(this)->mapLFs_++;
#endif
TIndexOffU ret;
assert(l.side(this->_ebwt) != NULL);
int c = rowL(l);
assert_lt(c, 4);
assert_geq(c, 0);
if(l._fw) ret = countFwSide(l, c); // Forward side
else ret = countBwSide(l, c); // Backward side
assert_lt(ret, this->_eh._bwtLen);
#ifndef NDEBUG
if(_sanity && !overrideSanity) {
// Make sure results match up with results from mapLFEx;
// be sure to override sanity-checking in the callee, or we'll
// have infinite recursion
TIndexOffU arrs[] = { 0, 0, 0, 0 };
mapLFEx(l, arrs, true);
assert_eq(arrs[c], ret);
}
#endif
return ret;
}
/**
* Given row i and character c, return the row that the LF mapping maps
* i to on character c.
*/
template<typename TStr>
inline TIndexOffU Ebwt<TStr>::mapLF(const SideLocus& l, int c
ASSERT_ONLY(, bool overrideSanity)
) const
{
#ifdef EBWT_STATS
const_cast<Ebwt<TStr>*>(this)->mapLFcs_++;
#endif
TIndexOffU ret;
assert_lt(c, 4);
assert_geq(c, 0);
if(l._fw) ret = countFwSide(l, c); // Forward side
else ret = countBwSide(l, c); // Backward side
assert_lt(ret, this->_eh._bwtLen);
#ifndef NDEBUG
if(_sanity && !overrideSanity) {
// Make sure results match up with results from mapLFEx;
// be sure to override sanity-checking in the callee, or we'll
// have infinite recursion
TIndexOffU arrs[] = { 0, 0, 0, 0 };
mapLFEx(l, arrs, true);
assert_eq(arrs[c], ret);
}
#endif
return ret;
}
/**
* Given row i and character c, return the row that the LF mapping maps
* i to on character c.
*/
template<typename TStr>
inline TIndexOffU Ebwt<TStr>::mapLF1(TIndexOffU row, const SideLocus& l, int c
ASSERT_ONLY(, bool overrideSanity)
) const
{
#ifdef EBWT_STATS
const_cast<Ebwt<TStr>*>(this)->mapLF1cs_++;
#endif
if(rowL(l) != c || row == _zOff) return OFF_MASK;
TIndexOffU ret;
assert_lt(c, 4);
assert_geq(c, 0);
if(l._fw) ret = countFwSide(l, c); // Forward side
else ret = countBwSide(l, c); // Backward side
assert_lt(ret, this->_eh._bwtLen);
#ifndef NDEBUG
if(_sanity && !overrideSanity) {
// Make sure results match up with results from mapLFEx;
// be sure to override sanity-checking in the callee, or we'll
// have infinite recursion
TIndexOffU arrs[] = { 0, 0, 0, 0 };
mapLFEx(l, arrs, true);
assert_eq(arrs[c], ret);
}
#endif
return ret;
}
/**
* Given row i and character c, return the row that the LF mapping maps
* i to on character c.
*/
template<typename TStr>
inline int Ebwt<TStr>::mapLF1(TIndexOffU& row, const SideLocus& l
ASSERT_ONLY(, bool overrideSanity)
) const
{
#ifdef EBWT_STATS
const_cast<Ebwt<TStr>*>(this)->mapLF1s_++;
#endif
if(row == _zOff) return -1;
int c = rowL(l);
assert_lt(c, 4);
assert_geq(c, 0);
if(l._fw) row = countFwSide(l, c); // Forward side
else row = countBwSide(l, c); // Backward side
assert_lt(row, this->_eh._bwtLen);
#ifndef NDEBUG
if(_sanity && !overrideSanity) {
// Make sure results match up with results from mapLFEx;
// be sure to override sanity-checking in the callee, or we'll
// have infinite recursion
TIndexOffU arrs[] = { 0, 0, 0, 0 };
mapLFEx(l, arrs, true);
assert_eq(arrs[c], row);
}
#endif
return c;
}
/**
* Take an offset into the joined text and translate it into the
* reference of the index it falls on, the offset into the reference,
* and the length of the reference. Use a binary search through the
* sorted list of reference fragment ranges t
*/
template<typename TStr>
void Ebwt<TStr>::joinedToTextOff(TIndexOffU qlen, TIndexOffU off,
TIndexOffU& tidx,
TIndexOffU& textoff,
TIndexOffU& tlen) const
{
TIndexOffU top = 0;
TIndexOffU bot = _nFrag; // 1 greater than largest addressable element
TIndexOffU elt = OFF_MASK;
// Begin binary search
while(true) {
ASSERT_ONLY(TIndexOffU oldelt = elt);
elt = top + ((bot - top) >> 1);
assert_neq(oldelt, elt); // must have made progress
TIndexOffU lower = _rstarts[elt*3];
TIndexOffU upper;
if(elt == _nFrag-1) {
upper = _eh._len;
} else {
upper = _rstarts[((elt+1)*3)];
}
assert_gt(upper, lower);
TIndexOffU fraglen = upper - lower;
if(lower <= off) {
if(upper > off) { // not last element, but it's within
// off is in this range; check if it falls off
if(off + qlen > upper) {
// it falls off; signal no-go and return
tidx = OFF_MASK;
assert_lt(elt, _nFrag-1);
return;
}
tidx = _rstarts[(elt*3)+1];
assert_lt(tidx, this->_nPat);
assert_leq(fraglen, this->_plen[tidx]);
// it doesn't fall off; now calculate textoff.
// Initially it's the number of characters that precede
// the alignment in the fragment
uint32_t fragoff = off - _rstarts[(elt*3)];
if(!this->_fw) {
fragoff = fraglen - fragoff - 1;
fragoff -= (qlen-1);
}
// Add the alignment's offset into the fragment
// ('fragoff') to the fragment's offset within the text
textoff = fragoff + _rstarts[(elt*3)+2];
assert_lt(textoff, this->_plen[tidx]);
break; // done with binary search
} else {
// 'off' belongs somewhere in the region between elt
// and bot
top = elt;
}
} else {
// 'off' belongs somewhere in the region between top and
// elt
bot = elt;
}
// continue with binary search
}
tlen = this->_plen[tidx];
}
/**
* Report a potential match at offset 'off' with pattern length
* 'qlen'. Filter out spurious matches that span texts.
*/
template<typename TStr>
inline bool Ebwt<TStr>::report(const String<Dna5>& query,
String<char>* quals,
String<char>* name,
bool color,
char primer,
char trimc,
bool colExEnds,
int snpPhred,
const BitPairReference* ref,
const std::vector<TIndexOffU>& mmui32,
const std::vector<uint8_t>& refcs,
size_t numMms,
TIndexOffU off,
TIndexOffU top,
TIndexOffU bot,
uint32_t qlen,
int stratum,
uint16_t cost,
uint32_t patid,
uint32_t seed,
const EbwtSearchParams<TStr>& params) const
{
VMSG_NL("In report");
assert_geq(cost, (uint32_t)(stratum << 14));
assert_lt(off, this->_eh._len);
TIndexOffU tidx;
TIndexOffU textoff;
TIndexOffU tlen;
joinedToTextOff(qlen, off, tidx, textoff, tlen);
if(tidx == OFF_MASK) {
return false;
}
return params.reportHit(
query, // read sequence
quals, // read quality values
name, // read name
color, // true -> read is colorspace
primer,
trimc,
colExEnds, // true -> exclude nucleotides on ends
snpPhred, // phred probability of SNP
ref, // reference sequence
rmap_, // map to another reference coordinate system
_fw, // true = index is forward; false = mirror
mmui32, // mismatch positions
refcs, // reference characters for mms
numMms, // # mismatches
make_pair(tidx, textoff), // position
make_pair<TIndexOffU,TIndexOffU>(0, 0), // (bogus) mate position
true, // (bogus) mate orientation
0, // (bogus) mate length
make_pair(top, bot), // arrows
tlen, // textlen
qlen, // qlen
stratum, // alignment stratum
cost, // cost, including stratum & quality penalty
bot-top-1, // # other hits
patid, // pattern id
seed, // pseudo-random seed
0); // mate (0 = unpaired)
}
#include "row_chaser.h"
/**
* Report a result. Involves walking backwards along the original
* string by way of the LF-mapping until we reach a marked SA row or
* the row corresponding to the 0th suffix. A marked row's offset
* into the original string can be read directly from the this->_offs[]
* array.
*/
template<typename TStr>
inline bool Ebwt<TStr>::reportChaseOne(const String<Dna5>& query,
String<char>* quals,
String<char>* name,
bool color,
char primer,
char trimc,
bool colExEnds,
int snpPhred,
const BitPairReference* ref,
const std::vector<TIndexOffU>& mmui32,
const std::vector<uint8_t>& refcs,
size_t numMms,
TIndexOffU i,
TIndexOffU top,
TIndexOffU bot,
uint32_t qlen,
int stratum,
uint16_t cost,
uint32_t patid,
uint32_t seed,
const EbwtSearchParams<TStr>& params,
SideLocus *l) const
{
VMSG_NL("In reportChaseOne");
TIndexOffU off;
uint32_t jumps = 0;
ASSERT_ONLY(uint32_t origi = i);
SideLocus myl;
const TIndexOffU offMask = this->_eh._offMask;
const uint32_t offRate = this->_eh._offRate;
const TIndexOffU* offs = this->_offs;
// If the caller didn't give us a pre-calculated (and prefetched)
// locus, then we have to do that now
if(l == NULL) {
l = &myl;
l->initFromRow(i, this->_eh, this->_ebwt);
}
assert(l != NULL);
assert(l->valid());
// Walk along until we reach the next marked row to the left
while(((i & offMask) != i) && i != _zOff) {
// Not a marked row; walk left one more char
TIndexOffU newi = mapLF(*l); // calc next row
assert_neq(newi, i);
i = newi; // update row
l->initFromRow(i, this->_eh, this->_ebwt); // update locus
jumps++;
}
// This is a marked row
if(i == _zOff) {
// Special case: it's the row corresponding to the
// lexicographically smallest suffix, which is implicitly
// marked 0
off = jumps;
VMSG_NL("reportChaseOne found zoff off=" << off << " (jumps=" << jumps << ")");
} else {
// Normal marked row, calculate offset of row i
off = offs[i >> offRate] + jumps;
VMSG_NL("reportChaseOne found off=" << off << " (jumps=" << jumps << ")");
}
#ifndef NDEBUG
{
uint32_t rcoff = RowChaser<TStr>::toFlatRefOff(this, qlen, origi);
assert_eq(rcoff, off);
}
#endif
return report(query, quals, name, color, primer, trimc, colExEnds,
snpPhred, ref, mmui32, refcs, numMms, off, top, bot,
qlen, stratum, cost, patid, seed, params);
}
/**
* Report a result. Involves walking backwards along the original
* string by way of the LF-mapping until we reach a marked SA row or
* the row corresponding to the 0th suffix. A marked row's offset
* into the original string can be read directly from the this->_offs[]
* array.
*/
template<typename TStr>
inline bool Ebwt<TStr>::reportReconstruct(const String<Dna5>& query,
String<char>* quals,
String<char>* name,
String<Dna5>& lbuf,
String<Dna5>& rbuf,
const TIndexOffU *mmui32,
const char* refcs,
size_t numMms,
TIndexOffU i,
TIndexOffU top,
TIndexOffU bot,
uint32_t qlen,
int stratum,
const EbwtSearchParams<TStr>& params,
SideLocus *l) const
{
VMSG_NL("In reportReconstruct");
assert_gt(_eh._isaLen, 0); // Must have inverse suffix array to reconstruct
TIndexOffU off;
TIndexOffU jumps = 0;
SideLocus myl;
const TIndexOffU offMask = this->_eh._offMask;
const TIndexOffU offRate = this->_eh._offRate;
const TIndexOffU* offs = this->_offs;
const TIndexOffU* isa = this->_isa;
assert(isa != NULL);
if(l == NULL) {
l = &myl;
myl.initFromRow(i, this->_eh, this->_ebwt);
}
assert(l != NULL);
clear(lbuf); clear(rbuf);
// Walk along until we reach the next marked row to the left
while(((i & offMask) != i) && i != _zOff) {
// Not a marked row; walk left one more char
int c = rowL(*l);
appendValue(lbuf, (Dna5)c);
TIndexOffU newi;
assert_lt(c, 4);
assert_geq(c, 0);
if(l->_fw) newi = countFwSide(*l, c); // Forward side
else newi = countBwSide(*l, c); // Backward side
assert_lt(newi, this->_eh._bwtLen);
assert_neq(newi, i);
i = newi; // update row
l->initFromRow(i, this->_eh, this->_ebwt); // update locus
jumps++;
}
// This is a marked row
if(i == _zOff) {
// Special case: it's the row corresponding to the
// lexicographically smallest suffix, which is implicitly
// marked 0
off = jumps;
VMSG_NL("reportChaseOne found zoff off=" << off << " (jumps=" << jumps << ")");
} else {
// Normal marked row, calculate offset of row i
off = offs[i >> offRate] + jumps;
VMSG_NL("reportChaseOne found off=" << off << " (jumps=" << jumps << ")");
}
// 'off' now holds the text offset of the first (leftmost) position
// involved in the alignment. Next we call joinedToTextOff to
// check whether the seed is valid (i.e., does not straddle a
// boundary between two reference seuqences) and to obtain its
// extents
TIndexOffU tidx; // the index (id) of the reference we hit in
TIndexOffU textoff; // the offset of the alignment within the reference
TIndexOffU tlen; // length of reference seed hit in
joinedToTextOff(qlen, off, tidx, textoff, tlen);
if(tidx == OFF_MASK) {
// The seed straddled a reference boundary, and so is spurious.
// Return false, indicating that we shouldn't stop.
return false;
}
if(jumps > textoff) {
// In our progress toward a marked row, we passed the boundary
// between the reference sequence containing the seed and the
// reference sequence to the left of it. That's OK, we just
// need to knock off the extra characters we added to 'lbuf'.
assert_eq(jumps, length(lbuf));
_setLength(lbuf, textoff);
jumps = textoff;
assert_eq(textoff, length(lbuf));
} else if(jumps < textoff) {
// Keep walking until we reach the end of the reference
assert_neq(i, _zOff);
TIndexOffU diff = textoff-jumps;
for(size_t j = 0; j < diff; j++) {
// Not a marked row; walk left one more char
int c = rowL(*l);
appendValue(lbuf, (Dna5)c);
TIndexOffU newi;
assert_lt(c, 4);
assert_geq(c, 0);
if(l->_fw) newi = countFwSide(*l, c); // Forward side
else newi = countBwSide(*l, c); // Backward side
assert_lt(newi, this->_eh._bwtLen);
assert_neq(newi, i);
i = newi; // update row
assert_neq(i, _zOff);
l->initFromRow(i, this->_eh, this->_ebwt); // update locus
jumps++;
}
assert_eq(textoff, jumps);
assert_eq(textoff, length(lbuf));
}
assert_eq(textoff, jumps);
assert_eq(textoff, length(lbuf));
// Calculate the right-hand extent of the reference
TIndexOffU ref_right = off - textoff + tlen;
// Round the right-hand extent to the nearest ISA element that maps
// to it or a character to its right
TIndexOffU ref_right_rounded = ref_right;
if((ref_right_rounded & _eh._isaMask) != ref_right_rounded) {
ref_right_rounded = ((ref_right_rounded >> _eh._isaRate)+1) << _eh._isaRate;
}
// TODO: handle case where ref_right_rounded is off the end of _isa
// Let the current suffix-array elt be determined by the ISA
if((ref_right_rounded >> _eh._isaRate) >= _eh._isaLen) {
i = _eh._len;
ref_right_rounded = _eh._len;
} else {
i = isa[ref_right_rounded >> _eh._isaRate];
}
TIndexOffU right_steps_rounded = ref_right_rounded - (off + qlen);
TIndexOffU right_steps = ref_right - (off + qlen);
l->initFromRow(i, this->_eh, this->_ebwt); // update locus
for(size_t j = 0; j < right_steps_rounded; j++) {
// Not a marked row; walk left one more char
int c = rowL(*l);
appendValue(rbuf, (Dna5)c);
TIndexOffU newi;
assert_lt(c, 4); assert_geq(c, 0);
if(l->_fw) newi = countFwSide(*l, c); // Forward side
else newi = countBwSide(*l, c); // Backward side
assert_lt(newi, this->_eh._bwtLen);
assert_neq(newi, i);
i = newi; // update row
assert_neq(i, _zOff);
l->initFromRow(i, this->_eh, this->_ebwt); // update locus
jumps++;
}
if(right_steps_rounded > right_steps) {
jumps -= (right_steps_rounded - right_steps);
_setLength(rbuf, right_steps);
}
assert_eq(right_steps, length(rbuf));
assert_eq(tlen, jumps + qlen);
::reverseInPlace(lbuf);
::reverseInPlace(rbuf);
{
cout << "reportReconstruct:" << endl
<< " " << lbuf << query << rbuf << endl;
cout << " ";
for(size_t i = 0; i < length(lbuf); i++) cout << " ";
cout << query << endl;
}
// Now we've reconstructed the
return false;
}
/**
* Transform this Ebwt into the original string in linear time by using
* the LF mapping to walk backwards starting at the row correpsonding
* to the end of the string. The result is written to s. The Ebwt
* must be in memory.
*/
template<typename TStr>
void Ebwt<TStr>::restore(TStr& s) const {
assert(isInMemory());
resize(s, this->_eh._len, Exact());
TIndexOffU jumps = 0;
TIndexOffU i = this->_eh._len; // should point to final SA elt (starting with '$')
SideLocus l(i, this->_eh, this->_ebwt);
while(i != _zOff) {
assert_lt(jumps, this->_eh._len);
//if(_verbose) cout << "restore: i: " << i << endl;
// Not a marked row; go back a char in the original string
TIndexOffU newi = mapLF(l);
assert_neq(newi, i);
s[this->_eh._len - jumps - 1] = rowL(l);
i = newi;
l.initFromRow(i, this->_eh, this->_ebwt);
jumps++;
}
assert_eq(jumps, this->_eh._len);
}
/**
* Check that this Ebwt, when restored via restore(), matches up with
* the given array of reference sequences. For sanity checking.
*/
template <typename TStr>
void Ebwt<TStr>::checkOrigs(const vector<String<Dna5> >& os,
bool color, bool mirror) const
{
TStr rest;
restore(rest);
TIndexOffU restOff = 0;
size_t i = 0, j = 0;
if(mirror) {
// TODO: FIXME
return;
}
while(i < os.size()) {
size_t olen = length(os[i]);
int lastorig = -1;
for(; j < olen; j++) {
size_t joff = j;
if(mirror) joff = olen - j - 1;
if((int)os[i][joff] == 4) {
// Skip over Ns
lastorig = -1;
if(!mirror) {
while(j < olen && (int)os[i][j] == 4) j++;
} else {
while(j < olen && (int)os[i][olen-j-1] == 4) j++;
}
j--;
continue;
}
if(lastorig == -1 && color) {
lastorig = os[i][joff];
continue;
}
if(color) {
assert_neq(-1, lastorig);
assert_eq(dinuc2color[(int)os[i][joff]][lastorig], rest[restOff]);
} else {
assert_eq(os[i][joff], rest[restOff]);
}
lastorig = (int)os[i][joff];
restOff++;
}
if(j == length(os[i])) {
// Moved to next sequence
i++;
j = 0;
} else {
// Just jumped over a gap
}
}
}
///////////////////////////////////////////////////////////////////////
//
// Functions for reading and writing Ebwts
//
///////////////////////////////////////////////////////////////////////
/**
* Read an Ebwt from file with given filename.
*/
template<typename TStr>
void Ebwt<TStr>::readIntoMemory(
int color,
int needEntireRev,
bool justHeader,
EbwtParams *params,
bool mmSweep,
bool loadNames,
bool startVerbose)
{
bool switchEndian; // dummy; caller doesn't care
#ifdef BOWTIE_MM
char *mmFile[] = { NULL, NULL };
#endif
if(_in1Str.length() > 0) {
if(_verbose || startVerbose) {
cerr << " About to open input files: ";
logTime(cerr);
}
// Initialize our primary and secondary input-stream fields
if(_in1 != NULL) fclose(_in1);
if(_verbose || startVerbose) cerr << "Opening \"" << _in1Str << "\"" << endl;
if((_in1 = fopen(_in1Str.c_str(), "rb")) == NULL) {
cerr << "Could not open index file " << _in1Str << endl;
}
if(_in2 != NULL) fclose(_in2);
if(_verbose || startVerbose) cerr << "Opening \"" << _in2Str << "\"" << endl;
if((_in2 = fopen(_in2Str.c_str(), "rb")) == NULL) {
cerr << "Could not open index file " << _in2Str << endl;
}
if(_verbose || startVerbose) {
cerr << " Finished opening input files: ";
logTime(cerr);
}
#ifdef BOWTIE_MM
if(_useMm /*&& !justHeader*/) {
const char *names[] = {_in1Str.c_str(), _in2Str.c_str()};
int fds[] = { fileno(_in1), fileno(_in2) };
for(int i = 0; i < 2; i++) {
if(_verbose || startVerbose) {
cerr << " Memory-mapping input file " << (i+1) << ": ";
logTime(cerr);
}
struct stat sbuf;
if (stat(names[i], &sbuf) == -1) {
perror("stat");
cerr << "Error: Could not stat index file " << names[i] << " prior to memory-mapping" << endl;
throw 1;
}
mmFile[i] = (char*)mmap((void *)0, sbuf.st_size,
PROT_READ, MAP_SHARED, fds[i], 0);
if(mmFile[i] == (void *)(-1)) {
perror("mmap");
cerr << "Error: Could not memory-map the index file " << names[i] << endl;
throw 1;
}
if(mmSweep) {
int sum = 0;
for(off_t j = 0; j < sbuf.st_size; j += 1024) {
sum += (int) mmFile[i][j];
}
if(startVerbose) {
cerr << " Swept the memory-mapped ebwt index file 1; checksum: " << sum << ": ";
logTime(cerr);
}
}
}
mmFile1_ = mmFile[0];
mmFile2_ = mmFile[1];
}
#endif
}
#ifdef BOWTIE_MM
else if(_useMm && !justHeader) {
mmFile[0] = mmFile1_;
mmFile[1] = mmFile2_;
}
if(_useMm && !justHeader) {
assert(mmFile[0] == mmFile1_);
assert(mmFile[1] == mmFile2_);
}
#endif
if(_verbose || startVerbose) {
cerr << " Reading header: ";
logTime(cerr);
}
// Read endianness hints from both streams
uint64_t bytesRead = 0;
switchEndian = false;
uint32_t one = readU<uint32_t>(_in1, switchEndian); // 1st word of primary stream
bytesRead += 4;
#ifndef NDEBUG
assert_eq(one, readU<uint32_t>(_in2, switchEndian)); // should match!
#else
readU<uint32_t>(_in2, switchEndian);
#endif
if(one != 1) {
assert_eq((1u<<24), one);
assert_eq(1, endianSwapU32(one));
switchEndian = true;
}
// Can't switch endianness and use memory-mapped files; in order to
// support this, someone has to modify the file to switch
// endiannesses appropriately, and we can't do this inside Bowtie
// or we might be setting up a race condition with other processes.
if(switchEndian && _useMm) {
cerr << "Error: Can't use memory-mapped files when the index is the opposite endianness" << endl;
throw 1;
}
// Reads header entries one by one from primary stream
TIndexOffU len = readU<TIndexOffU>(_in1, switchEndian);
bytesRead += OFF_SIZE;
int32_t lineRate = readI<int32_t>(_in1, switchEndian);
bytesRead += 4;
int32_t linesPerSide = readI<int32_t>(_in1, switchEndian);
bytesRead += 4;
int32_t offRate = readI<int32_t>(_in1, switchEndian);
bytesRead += 4;
// TODO: add isaRate to the actual file format (right now, the
// user has to tell us whether there's an ISA sample and what the
// sampling rate is.
int32_t isaRate = _overrideIsaRate;
int32_t ftabChars = readI<int32_t>(_in1, switchEndian);
bytesRead += 4;
// chunkRate was deprecated in an earlier version of Bowtie; now
// we use it to hold flags.
int32_t flags = readI<int32_t>(_in1, switchEndian);
bool entireRev = false;
if(flags < 0 && (((-flags) & EBWT_COLOR) != 0)) {
if(color != -1 && !color) {
cerr << "Error: -C was not specified when running bowtie, but index is in colorspace. If" << endl
<< "your reads are in colorspace, please use the -C option. If your reads are not" << endl
<< "in colorspace, please use a normal index (one built without specifying -C to" << endl
<< "bowtie-build)." << endl;
throw 1;
}
color = 1;
} else if(flags < 0) {
if(color != -1 && color) {
cerr << "Error: -C was specified when running bowtie, but index is not in colorspace. If" << endl
<< "your reads are in colorspace, please use a colorspace index (one built using" << endl
<< "bowtie-build -C). If your reads are not in colorspace, don't specify -C when" << endl
<< "running bowtie." << endl;
throw 1;
}
color = 0;
}
if(flags < 0 && (((-flags) & EBWT_ENTIRE_REV) == 0)) {
if(needEntireRev != -1 && needEntireRev != 0) {
cerr << "Error: This index is not compatible with this version of bowtie. Please use a" << endl
<< "current version of bowtie-build." << endl;
throw 1;
}
} else entireRev = true;
bytesRead += 4;
// Create a new EbwtParams from the entries read from primary stream
EbwtParams *eh;
bool deleteEh = false;
if(params != NULL) {
params->init(len, lineRate, linesPerSide, offRate, isaRate, ftabChars, color, entireRev);
if(_verbose || startVerbose) params->print(cerr);
eh = params;
} else {
eh = new EbwtParams(len, lineRate, linesPerSide, offRate, isaRate, ftabChars, color, entireRev);
deleteEh = true;
}
// Set up overridden suffix-array-sample parameters
TIndexOffU offsLen = eh->_offsLen;
uint64_t offsSz = eh->_offsSz;
TIndexOffU offRateDiff = 0;
TIndexOffU offsLenSampled = offsLen;
if(_overrideOffRate > offRate) {
offRateDiff = _overrideOffRate - offRate;
}
if(offRateDiff > 0) {
offsLenSampled >>= offRateDiff;
if((offsLen & ~(OFF_MASK << offRateDiff)) != 0) {
offsLenSampled++;
}
}
// Set up overridden inverted-suffix-array-sample parameters
TIndexOffU isaLen = eh->_isaLen;
TIndexOffU isaRateDiff = 0;
TIndexOffU isaLenSampled = isaLen;
if(_overrideIsaRate > isaRate) {
isaRateDiff = _overrideIsaRate - isaRate;
}
if(isaRateDiff > 0) {
isaLenSampled >>= isaRateDiff;
if((isaLen & ~(OFF_MASK << isaRateDiff)) != 0) {
isaLenSampled++;
}
}
// Can't override the offrate or isarate and use memory-mapped
// files; ultimately, all processes need to copy the sparser sample
// into their own memory spaces.
if(_useMm && (offRateDiff || isaRateDiff)) {
cerr << "Error: Can't use memory-mapped files when the offrate or isarate is overridden" << endl;
throw 1;
}
// Read nPat from primary stream
this->_nPat = readI<TIndexOffU>(_in1, switchEndian);
bytesRead += OFF_SIZE;
if(this->_plen != NULL && !_useMm) {
// Delete it so that we can re-read it
delete[] this->_plen;
this->_plen = NULL;
}
// Read plen from primary stream
if(_useMm) {
#ifdef BOWTIE_MM
this->_plen = (TIndexOffU*)(mmFile[0] + bytesRead);
bytesRead += this->_nPat*OFF_SIZE;
fseeko(_in1, this->_nPat*OFF_SIZE, SEEK_CUR);
#endif
} else {
try {
if(_verbose || startVerbose) {
cerr << "Reading plen (" << this->_nPat << "): ";
logTime(cerr);
}
this->_plen = new TIndexOffU[this->_nPat];
if(switchEndian) {
for(TIndexOffU i = 0; i < this->_nPat; i++) {
this->_plen[i] = readU<TIndexOffU>(_in1, switchEndian);
}
} else {
size_t r = MM_READ(_in1, (void*)this->_plen, this->_nPat*OFF_SIZE);
if(r != (size_t)(this->_nPat*OFF_SIZE)) {
cerr << "Error reading _plen[] array: " << r << ", " << (this->_nPat*OFF_SIZE) << endl;
throw 1;
}
}
} catch(bad_alloc& e) {
cerr << "Out of memory allocating plen[] in Ebwt::read()"
<< " at " << __FILE__ << ":" << __LINE__ << endl;
throw e;
}
}
bool shmemLeader;
// TODO: I'm not consistent on what "header" means. Here I'm using
// "header" to mean everything that would exist in memory if we
// started to build the Ebwt but stopped short of the build*() step
// (i.e. everything up to and including join()).
if(justHeader) goto done;
this->_nFrag = readU<TIndexOffU>(_in1, switchEndian);
bytesRead += OFF_SIZE;
if(_verbose || startVerbose) {
cerr << "Reading rstarts (" << this->_nFrag*3 << "): ";
logTime(cerr);
}
assert_geq(this->_nFrag, this->_nPat);
if(_useMm) {
#ifdef BOWTIE_MM
this->_rstarts = (TIndexOffU*)(mmFile[0] + bytesRead);
bytesRead += this->_nFrag*OFF_SIZE*3;
fseeko(_in1, this->_nFrag*OFF_SIZE*3, SEEK_CUR);
#endif
} else {
this->_rstarts = new TIndexOffU[this->_nFrag*3];
if(switchEndian) {
for(TIndexOffU i = 0; i < this->_nFrag*3; i += 3) {
// fragment starting position in joined reference
// string, text id, and fragment offset within text
this->_rstarts[i] = readU<TIndexOffU>(_in1, switchEndian);
this->_rstarts[i+1] = readU<TIndexOffU>(_in1, switchEndian);
this->_rstarts[i+2] = readU<TIndexOffU>(_in1, switchEndian);
}
} else {
size_t r = MM_READ(_in1, (void *)this->_rstarts, this->_nFrag*OFF_SIZE*3);
if(r != (size_t)(this->_nFrag*OFF_SIZE*3)) {
cerr << "Error reading _rstarts[] array: " << r << ", " << (this->_nFrag*OFF_SIZE*3) << endl;
throw 1;
}
}
}
if(_useMm) {
#ifdef BOWTIE_MM
this->_ebwt = (uint8_t*)(mmFile[0] + bytesRead);
bytesRead += eh->_ebwtTotLen;
fseeko(_in1, eh->_ebwtTotLen, SEEK_CUR);
#endif
} else {
// Allocate ebwt (big allocation)
if(_verbose || startVerbose) {
cerr << "Reading ebwt (" << eh->_ebwtTotLen << "): ";
logTime(cerr);
}
bool shmemLeader = true;
if(useShmem_) {
shmemLeader = ALLOC_SHARED_U8(
(_in1Str + "[ebwt]"), eh->_ebwtTotLen, &this->_ebwt,
"ebwt[]", (_verbose || startVerbose));
if(_verbose || startVerbose) {
cerr << " shared-mem " << (shmemLeader ? "leader" : "follower") << endl;
}
} else {
try {
this->_ebwt = new uint8_t[eh->_ebwtTotLen];
} catch(bad_alloc& e) {
cerr << "Out of memory allocating the ebwt[] array for the Bowtie index. Please try" << endl
<< "again on a computer with more memory." << endl;
throw 1;
}
}
if(shmemLeader) {
// Read ebwt from primary stream
uint64_t bytesLeft = eh->_ebwtTotLen;
char *pebwt = (char*)this->ebwt();
while (bytesLeft>0){
size_t r = MM_READ(_in1, (void *)pebwt, bytesLeft);
if(MM_IS_IO_ERR(_in1,r,bytesLeft)) {
cerr << "Error reading ebwt array: returned " << r << ", length was " << (eh->_ebwtTotLen) << endl
<< "Your index files may be corrupt; please try re-building or re-downloading." << endl
<< "A complete index consists of 6 files: XYZ.1.ebwt, XYZ.2.ebwt, XYZ.3.ebwt," << endl
<< "XYZ.4.ebwt, XYZ.rev.1.ebwt, and XYZ.rev.2.ebwt. The XYZ.1.ebwt and " << endl
<< "XYZ.rev.1.ebwt files should have the same size, as should the XYZ.2.ebwt and" << endl
<< "XYZ.rev.2.ebwt files." << endl;
throw 1;
}
pebwt += r;
bytesLeft -= r;
}
if(switchEndian) {
uint8_t *side = this->_ebwt;
for(size_t i = 0; i < eh->_numSides; i++) {
TIndexOffU *cums = reinterpret_cast<TIndexOffU*>(side + eh->_sideSz - 2*OFF_SIZE);
cums[0] = endianSwapU(cums[0]);
cums[1] = endianSwapU(cums[1]);
side += this->_eh._sideSz;
}
}
if(useShmem_) NOTIFY_SHARED(this->_ebwt, eh->_ebwtTotLen);
} else {
// Seek past the data and wait until master is finished
fseeko(_in1, eh->_ebwtTotLen, SEEK_CUR);
if(useShmem_) WAIT_SHARED(this->_ebwt, eh->_ebwtTotLen);
}
}
// Read zOff from primary stream
_zOff = readU<TIndexOffU>(_in1, switchEndian);
bytesRead += OFF_SIZE;
assert_lt(_zOff, len);
try {
// Read fchr from primary stream
if(_verbose || startVerbose) cerr << "Reading fchr (5)" << endl;
if(_useMm) {
#ifdef BOWTIE_MM
this->_fchr = (TIndexOffU*)(mmFile[0] + bytesRead);
bytesRead += 5*OFF_SIZE;
fseeko(_in1, 5*OFF_SIZE, SEEK_CUR);
#endif
} else {
this->_fchr = new TIndexOffU[5];
for(int i = 0; i < 5; i++) {
this->_fchr[i] = readU<TIndexOffU>(_in1, switchEndian);
assert_leq(this->_fchr[i], len);
if(i > 0) assert_geq(this->_fchr[i], this->_fchr[i-1]);
}
}
assert_gt(this->_fchr[4], this->_fchr[0]);
// Read ftab from primary stream
if(_verbose || startVerbose) {
cerr << "Reading ftab (" << eh->_ftabLen << "): ";
logTime(cerr);
}
if(_useMm) {
#ifdef BOWTIE_MM
this->_ftab = (TIndexOffU*)(mmFile[0] + bytesRead);
bytesRead += eh->_ftabLen*OFF_SIZE;
fseeko(_in1, eh->_ftabLen*OFF_SIZE, SEEK_CUR);
#endif
} else {
this->_ftab = new TIndexOffU[eh->_ftabLen];
if(switchEndian) {
for(TIndexOffU i = 0; i < eh->_ftabLen; i++)
this->_ftab[i] = readU<TIndexOffU>(_in1, switchEndian);
} else {
size_t r = MM_READ(_in1, (void *)this->_ftab, eh->_ftabLen*OFF_SIZE);
if(r != (size_t)(eh->_ftabLen*OFF_SIZE)) {
cerr << "Error reading _ftab[] array: " << r << ", " << (eh->_ftabLen*OFF_SIZE) << endl;
throw 1;
}
}
}
// Read etab from primary stream
if(_verbose || startVerbose) {
cerr << "Reading eftab (" << eh->_eftabLen << "): ";
logTime(cerr);
}
if(_useMm) {
#ifdef BOWTIE_MM
this->_eftab = (TIndexOffU*)(mmFile[0] + bytesRead);
bytesRead += eh->_eftabLen*OFF_SIZE;
fseeko(_in1, eh->_eftabLen*OFF_SIZE, SEEK_CUR);
#endif
} else {
this->_eftab = new TIndexOffU[eh->_eftabLen];
if(switchEndian) {
for(TIndexOffU i = 0; i < eh->_eftabLen; i++)
this->_eftab[i] = readU<TIndexOffU>(_in1, switchEndian);
} else {
size_t r = MM_READ(_in1, (void *)this->_eftab, eh->_eftabLen*OFF_SIZE);
if(r != (size_t)(eh->_eftabLen*OFF_SIZE)) {
cerr << "Error reading _eftab[] array: " << r << ", " << (eh->_eftabLen*OFF_SIZE) << endl;
throw 1;
}
}
}
for(TIndexOffU i = 0; i < eh->_eftabLen; i++) {
if(i > 0 && this->_eftab[i] > 0) {
assert_geq(this->_eftab[i], this->_eftab[i-1]);
} else if(i > 0 && this->_eftab[i-1] == 0) {
assert_eq(0, this->_eftab[i]);
}
}
} catch(bad_alloc& e) {
cerr << "Out of memory allocating fchr[], ftab[] or eftab[] arrays for the Bowtie index." << endl
<< "Please try again on a computer with more memory." << endl;
throw 1;
}
// Read reference sequence names from primary index file (or not,
// if --refidx is specified)
if(loadNames) {
while(true) {
char c = '\0';
if(MM_READ(_in1, (void *)(&c), (size_t)1) != (size_t)1) break;
bytesRead++;
if(c == '\0') break;
else if(c == '\n') {
this->_refnames.push_back("");
} else {
if(this->_refnames.size() == 0) {
this->_refnames.push_back("");
}
this->_refnames.back().push_back(c);
}
}
}
bytesRead = 4; // reset for secondary index file (already read 1-sentinel)
shmemLeader = true;
if(_verbose || startVerbose) {
cerr << "Reading offs (" << offsLenSampled << " 32-bit words): ";
logTime(cerr);
}
if(!_useMm) {
if(!useShmem_) {
// Allocate offs_
try {
this->_offs = new TIndexOffU[offsLenSampled];
} catch(bad_alloc& e) {
cerr << "Out of memory allocating the offs[] array for the Bowtie index." << endl
<< "Please try again on a computer with more memory." << endl;
throw 1;
}
} else {
shmemLeader = ALLOC_SHARED_U(
(_in2Str + "[offs]"), offsLenSampled*OFF_SIZE, &this->_offs,
"offs", (_verbose || startVerbose));
}
}
if(_overrideOffRate < 32) {
if(shmemLeader) {
// Allocate offs (big allocation)
if(switchEndian || offRateDiff > 0) {
assert(!_useMm);
const TIndexOffU blockMaxSz = (2 * 1024 * 1024); // 2 MB block size
const TIndexOffU blockMaxSzU = (blockMaxSz >> (OFF_SIZE/4 +1)); // # U32s per block
char *buf = new char[blockMaxSz];
for(TIndexOffU i = 0; i < offsLen; i += blockMaxSzU) {
TIndexOffU block = min<TIndexOffU>(blockMaxSzU, offsLen - i);
size_t r = MM_READ(_in2, (void *)buf, block << (OFF_SIZE/4 + 1));
if(r != (size_t)(block << (OFF_SIZE/4 + 1))) {
cerr << "Error reading block of offs array: " << r << ", " << (block << (OFF_SIZE/4 + 1)) << endl
<< "Your index files may be corrupt; please try re-building or re-downloading." << endl
<< "A complete index consists of 6 files: XYZ.1.ebwt, XYZ.2.ebwt, XYZ.3.ebwt," << endl
<< "XYZ.4.ebwt, XYZ.rev.1.ebwt, and XYZ.rev.2.ebwt. The XYZ.1.ebwt and " << endl
<< "XYZ.rev.1.ebwt files should have the same size, as should the XYZ.2.ebwt and" << endl
<< "XYZ.rev.2.ebwt files." << endl;
throw 1;
}
TIndexOffU idx = i >> offRateDiff;
for(TIndexOffU j = 0; j < block; j += (1 << offRateDiff)) {
assert_lt(idx, offsLenSampled);
this->_offs[idx] = ((TIndexOffU*)buf)[j];
if(switchEndian) {
this->_offs[idx] = endianSwapU(this->_offs[idx]);
}
idx++;
}
}
delete[] buf;
} else {
if(_useMm) {
#ifdef BOWTIE_MM
this->_offs = (TIndexOffU*)(mmFile[1] + bytesRead);
bytesRead += offsSz;
// Argument to lseek can be 64 bits if compiled with
// _FILE_OFFSET_BITS
fseeko(_in2, offsSz, SEEK_CUR);
#endif
} else {
// If any of the high two bits are set
// Workaround for small-index mode where MM_READ may
// not be able to handle read amounts greater than 2^32
// bytes.
uint64_t bytesLeft = offsSz;
char *offs = (char *)this->offs();
while(bytesLeft > 0) {
size_t r = MM_READ(_in2, (void*)offs, bytesLeft);
if(MM_IS_IO_ERR(_in2,r,bytesLeft)) {
cerr << "Error reading block of _offs[] array: "
<< r << ", " << bytesLeft << gLastIOErrMsg << endl;
throw 1;
}
offs += r;
bytesLeft -= r;
}
}
}
{
ASSERT_ONLY(Bitset offsSeen(len+1));
for(TIndexOffU i = 0; i < offsLenSampled; i++) {
assert(!offsSeen.test(this->_offs[i]));
ASSERT_ONLY(offsSeen.set(this->_offs[i]));
assert_leq(this->_offs[i], len);
}
}
if(useShmem_) NOTIFY_SHARED(this->_offs, offsLenSampled*OFF_SIZE);
} else {
// Not the shmem leader
fseeko(_in2, offsLenSampled*OFF_SIZE, SEEK_CUR);
if(useShmem_) WAIT_SHARED(this->_offs, offsLenSampled*OFF_SIZE);
}
}
// Allocate _isa[] (big allocation)
if(_verbose || startVerbose) {
cerr << "Reading isa (" << isaLenSampled << "): ";
logTime(cerr);
}
if(!_useMm) {
try {
this->_isa = new TIndexOffU[isaLenSampled];
} catch(bad_alloc& e) {
cerr << "Out of memory allocating the isa[] array for the Bowtie index." << endl
<< "Please try again on a computer with more memory." << endl;
throw 1;
}
}
// Read _isa[]
if(switchEndian || isaRateDiff > 0) {
assert(!_useMm);
for(TIndexOffU i = 0; i < isaLen; i++) {
if((i & ~(OFF_MASK << isaRateDiff)) != 0) {
char tmp[OFF_SIZE];
size_t r = MM_READ(_in2, (void *)tmp, OFF_SIZE);
if(r != (size_t)OFF_SIZE) {
cerr << "Error reading a word of the _isa[] array: " << r << ", 4" << endl;
throw 1;
}
} else {
TIndexOffU idx = i >> isaRateDiff;
assert_lt(idx, isaLenSampled);
this->_isa[idx] = readU<TIndexOffU>(_in2, switchEndian);
}
}
} else {
if(_useMm) {
#ifdef BOWTIE_MM
this->_isa = (TIndexOffU*)(mmFile[1] + bytesRead);
bytesRead += (isaLen << 2);
fseeko(_in2, (isaLen << 2), SEEK_CUR);
#endif
} else {
size_t r = MM_READ(_in2, (void *)this->_isa, isaLen*OFF_SIZE);
if(r != (size_t)(isaLen*OFF_SIZE)) {
cerr << "Error reading _isa[] array: " << r << ", " << (isaLen*OFF_SIZE) << endl;
throw 1;
}
}
}
{
ASSERT_ONLY(Bitset isasSeen(len+1));
for(TIndexOffU i = 0; i < isaLenSampled; i++) {
assert(!isasSeen.test(this->_isa[i]));
ASSERT_ONLY(isasSeen.set(this->_isa[i]));
assert_leq(this->_isa[i], len);
}
}
this->postReadInit(*eh); // Initialize fields of Ebwt not read from file
if(_verbose || startVerbose) print(cerr, *eh);
// The fact that _ebwt and friends actually point to something
// (other than NULL) now signals to other member functions that the
// Ebwt is loaded into memory.
done: // Exit hatch for both justHeader and !justHeader
// Be kind
if(deleteEh) delete eh;
if (_in1 != NULL) rewind(_in1);
if (_in2 != NULL) rewind(_in2);
}
/**
* Read reference names from an input stream 'in' for an Ebwt primary
* file and store them in 'refnames'.
*/
static inline void
readEbwtRefnames(FILE* fin, vector<string>& refnames) {
// _in1 must already be open with the get cursor at the
// beginning and no error flags set.
assert(fin != NULL);
assert_eq(ftello(fin), 0);
// Read endianness hints from both streams
bool switchEndian = false;
uint32_t one = readU<uint32_t>(fin, switchEndian); // 1st word of primary stream
if(one != 1) {
assert_eq((1u<<24), one);
switchEndian = true;
}
// Reads header entries one by one from primary stream
TIndexOffU len = readU<TIndexOffU>(fin, switchEndian);
int32_t lineRate = readI<int32_t>(fin, switchEndian);
int32_t linesPerSide = readI<int32_t>(fin, switchEndian);
int32_t offRate = readI<int32_t>(fin, switchEndian);
int32_t ftabChars = readI<int32_t>(fin, switchEndian);
// BTL: chunkRate is now deprecated
int32_t flags = readI<int32_t>(fin, switchEndian);
bool color = false;
bool entireReverse = false;
if(flags < 0) {
color = (((-flags) & EBWT_COLOR) != 0);
entireReverse = (((-flags) & EBWT_ENTIRE_REV) != 0);
}
// Create a new EbwtParams from the entries read from primary stream
EbwtParams eh(len, lineRate, linesPerSide, offRate, -1, ftabChars, color, entireReverse);
TIndexOffU nPat = readI<TIndexOffU>(fin, switchEndian); // nPat
fseeko(fin, nPat*OFF_SIZE, SEEK_CUR);
// Skip rstarts
TIndexOffU nFrag = readU<TIndexOffU>(fin, switchEndian);
fseeko(fin, nFrag*OFF_SIZE*3, SEEK_CUR);
// Skip ebwt
fseeko(fin, eh._ebwtTotLen, SEEK_CUR);
// Skip zOff from primary stream
readU<TIndexOffU>(fin, switchEndian);
// Skip fchr
fseeko(fin, 5 * OFF_SIZE, SEEK_CUR);
// Skip ftab
fseeko(fin, eh._ftabLen*OFF_SIZE, SEEK_CUR);
// Skip eftab
fseeko(fin, eh._eftabLen*OFF_SIZE, SEEK_CUR);
// Read reference sequence names from primary index file
while(true) {
char c = '\0';
int read_value = 0;
read_value = fgetc(fin);
if(read_value == EOF) break;
c = read_value;
if(c == '\0') break;
else if(c == '\n') {
refnames.push_back("");
} else {
if(refnames.size() == 0) {
refnames.push_back("");
}
refnames.back().push_back(c);
}
}
if(refnames.back().empty()) {
refnames.pop_back();
}
// Be kind
fseeko(fin, 0, SEEK_SET);
assert(ferror(fin) == 0);
}
/**
* Read reference names from the index with basename 'in' and store
* them in 'refnames'.
*/
static inline void
readEbwtRefnames(const string& instr, vector<string>& refnames) {
FILE* fin;
// Initialize our primary and secondary input-stream fields
fin = fopen((instr + ".1." + gEbwt_ext).c_str(),"rb");
if(fin == NULL) {
throw EbwtFileOpenException("Cannot open file " + instr);
}
assert_eq(ftello(fin), 0);
readEbwtRefnames(fin, refnames);
fclose(fin);
}
/**
* Read just enough of the Ebwt's header to get its flags
*/
static inline int32_t readFlags(const string& instr) {
ifstream in;
// Initialize our primary and secondary input-stream fields
in.open((instr + ".1." + gEbwt_ext).c_str(), ios_base::in | ios::binary);
if(!in.is_open()) {
throw EbwtFileOpenException("Cannot open file " + instr);
}
assert(in.is_open());
assert(in.good());
bool switchEndian = false;
uint32_t one = readU<uint32_t>(in, switchEndian); // 1st word of primary stream
if(one != 1) {
assert_eq((1u<<24), one);
assert_eq(1, endianSwapU32(one));
switchEndian = true;
}
readU<TIndexOffU>(in, switchEndian);
readI<int32_t>(in, switchEndian);
readI<int32_t>(in, switchEndian);
readI<int32_t>(in, switchEndian);
readI<int32_t>(in, switchEndian);
int32_t flags = readI<int32_t>(in, switchEndian);
return flags;
}
/**
* Read just enough of the Ebwt's header to determine whether it's
* colorspace.
*/
static inline bool
readEbwtColor(const string& instr) {
int32_t flags = readFlags(instr);
if(flags < 0 && (((-flags) & EBWT_COLOR) != 0)) {
return true;
} else {
return false;
}
}
/**
* Read just enough of the Ebwt's header to determine whether it's
* entirely reversed.
*/
static inline bool
readEntireReverse(const string& instr) {
int32_t flags = readFlags(instr);
if(flags < 0 && (((-flags) & EBWT_ENTIRE_REV) != 0)) {
return true;
} else {
return false;
}
}
/**
* Write an extended Burrows-Wheeler transform to a pair of output
* streams.
*
* @param out1 output stream to primary file
* @param out2 output stream to secondary file
* @param be write in big endian?
*/
template<typename TStr>
void Ebwt<TStr>::writeFromMemory(bool justHeader,
ostream& out1,
ostream& out2) const
{
const EbwtParams& eh = this->_eh;
assert(eh.repOk());
uint32_t be = this->toBe();
assert(out1.good());
assert(out2.good());
// When building an Ebwt, these header parameters are known
// "up-front", i.e., they can be written to disk immediately,
// before we join() or buildToDisk()
writeI<int32_t>(out1, 1, be); // endian hint for priamry stream
writeI<int32_t>(out2, 1, be); // endian hint for secondary stream
writeU<TIndexOffU>(out1, eh._len, be); // length of string (and bwt and suffix array)
writeI<int32_t>(out1, eh._lineRate, be); // 2^lineRate = size in bytes of 1 line
writeI<int32_t>(out1, eh._linesPerSide, be); // not used
writeI<int32_t>(out1, eh._offRate, be); // every 2^offRate chars is "marked"
writeI<int32_t>(out1, eh._ftabChars, be); // number of 2-bit chars used to address ftab
int32_t flags = 1;
if(eh._color) flags |= EBWT_COLOR;
if(eh._entireReverse) flags |= EBWT_ENTIRE_REV;
writeI<int32_t>(out1, -flags, be); // BTL: chunkRate is now deprecated
if(!justHeader) {
assert(isInMemory());
// These Ebwt parameters are known after the inputs strings have
// been joined() but before they have been built(). These can
// written to the disk next and then discarded from memory.
writeU<TIndexOffU>(out1, this->_nPat, be);
for(TIndexOffU i = 0; i < this->_nPat; i++)
writeU<TIndexOffU>(out1, this->_plen[i], be);
assert_geq(this->_nFrag, this->_nPat);
writeU<TIndexOffU>(out1, this->_nFrag, be);
for(TIndexOffU i = 0; i < this->_nFrag*3; i++)
writeU<TIndexOffU>(out1, this->_rstarts[i], be);
// These Ebwt parameters are discovered only as the Ebwt is being
// built (in buildToDisk()). Of these, only 'offs' and 'ebwt' are
// terribly large. 'ebwt' is written to the primary file and then
// discarded from memory as it is built; 'offs' is similarly
// written to the secondary file and discarded.
out1.write((const char *)this->ebwt(), eh._ebwtTotLen);
writeU<TIndexOffU>(out1, this->zOff(), be);
TIndexOffU offsLen = eh._offsLen;
for(TIndexOffU i = 0; i < offsLen; i++)
writeU<TIndexOffU>(out2, this->_offs[i], be);
uint32_t isaLen = eh._isaLen;
for(TIndexOffU i = 0; i < isaLen; i++)
writeU<TIndexOffU>(out2, this->_isa[i], be);
// 'fchr', 'ftab' and 'eftab' are not fully determined until the
// loop is finished, so they are written to the primary file after
// all of 'ebwt' has already been written and only then discarded
// from memory.
for(int i = 0; i < 5; i++)
writeU<TIndexOffU>(out1, this->_fchr[i], be);
for(TIndexOffU i = 0; i < eh._ftabLen; i++)
writeU<TIndexOffU>(out1, this->ftab()[i], be);
for(TIndexOffU i = 0; i < eh._eftabLen; i++)
writeU<TIndexOffU>(out1, this->eftab()[i], be);
}
}
/**
* Given a pair of strings representing output filenames, and assuming
* this Ebwt object is currently in memory, write out this Ebwt to the
* specified files.
*
* If sanity-checking is enabled, then once the streams have been
* fully written and closed, we reopen them and read them into a
* (hopefully) exact copy of this Ebwt. We then assert that the
* current Ebwt and the copy match in all of their fields.
*/
template<typename TStr>
void Ebwt<TStr>::writeFromMemory(bool justHeader,
const string& out1,
const string& out2) const
{
const EbwtParams& eh = this->_eh;
assert(isInMemory());
assert(eh.repOk());
ofstream fout1(out1.c_str(), ios::binary);
ofstream fout2(out2.c_str(), ios::binary);
writeFromMemory(justHeader, fout1, fout2);
fout1.close();
fout2.close();
// Read the file back in and assert that all components match
if(_sanity) {
if(_verbose)
cout << "Re-reading \"" << out1 << "\"/\"" << out2 << "\" for sanity check" << endl;
Ebwt copy(out1, out2, _verbose, _sanity);
assert(!isInMemory());
copy.loadIntoMemory(eh._color ? 1 : 0, -1, false, false);
assert(isInMemory());
assert_eq(eh._lineRate, copy.eh()._lineRate);
assert_eq(eh._linesPerSide, copy.eh()._linesPerSide);
assert_eq(eh._offRate, copy.eh()._offRate);
assert_eq(eh._isaRate, copy.eh()._isaRate);
assert_eq(eh._ftabChars, copy.eh()._ftabChars);
assert_eq(eh._len, copy.eh()._len);
assert_eq(_zOff, copy.zOff());
assert_eq(_zEbwtBpOff, copy.zEbwtBpOff());
assert_eq(_zEbwtByteOff, copy.zEbwtByteOff());
assert_eq(_nPat, copy.nPat());
for(TIndexOffU i = 0; i < _nPat; i++)
assert_eq(this->_plen[i], copy.plen()[i]);
assert_eq(this->_nFrag, copy.nFrag());
for(TIndexOffU i = 0; i < this->nFrag*3; i++) {
assert_eq(this->_rstarts[i], copy.rstarts()[i]);
}
for(uint32_t i = 0; i < 5; i++)
assert_eq(this->_fchr[i], copy.fchr()[i]);
for(TIndexOffU i = 0; i < eh._ftabLen; i++)
assert_eq(this->ftab()[i], copy.ftab()[i]);
for(TIndexOffU i = 0; i < eh._eftabLen; i++)
assert_eq(this->eftab()[i], copy.eftab()[i]);
for(TIndexOffU i = 0; i < eh._offsLen; i++)
assert_eq(this->_offs[i], copy.offs()[i]);
for(TIndexOffU i = 0; i < eh._isaLen; i++)
assert_eq(this->_isa[i], copy.isa()[i]);
for(TIndexOffU i = 0; i < eh._ebwtTotLen; i++)
assert_eq(this->ebwt()[i], copy.ebwt()[i]);
//copy.sanityCheckAll();
if(_verbose)
cout << "Read-in check passed for \"" << out1 << "\"/\"" << out2 << "\"" << endl;
}
}
///////////////////////////////////////////////////////////////////////
//
// Functions for building Ebwts
//
///////////////////////////////////////////////////////////////////////
/**
* Join several text strings together in a way that's compatible with
* the text-chunking scheme dictated by chunkRate parameter.
*
* The non-static member Ebwt::join additionally builds auxilliary
* arrays that maintain a mapping between chunks in the joined string
* and the original text strings.
*/
template<typename TStr>
TStr Ebwt<TStr>::join(vector<TStr>& l, uint32_t seed) {
RandomSource rand; // reproducible given same seed
rand.init(seed);
TStr ret;
size_t guessLen = 0;
for(size_t i = 0; i < l.size(); i++) {
guessLen += length(l[i]);
}
reserve(ret, guessLen, Exact());
for(size_t i = 0; i < l.size(); i++) {
TStr& s = l[i];
assert_gt(length(s), 0);
append(ret, s);
}
return ret;
}
/**
* Join several text strings together in a way that's compatible with
* the text-chunking scheme dictated by chunkRate parameter.
*
* The non-static member Ebwt::join additionally builds auxilliary
* arrays that maintain a mapping between chunks in the joined string
* and the original text strings.
*/
template<typename TStr>
TStr Ebwt<TStr>::join(vector<FileBuf*>& l,
vector<RefRecord>& szs,
TIndexOffU sztot,
const RefReadInParams& refparams,
uint32_t seed)
{
RandomSource rand; // reproducible given same seed
rand.init(seed);
RefReadInParams rpcp = refparams;
TStr ret;
size_t guessLen = sztot;
reserve(ret, guessLen, Exact());
ASSERT_ONLY(size_t szsi = 0);
for(size_t i = 0; i < l.size(); i++) {
// For each sequence we can pull out of istream l[i]...
assert(!l[i]->eof());
bool first = true;
while(!l[i]->eof()) {
RefRecord rec = fastaRefReadAppend(*l[i], first, ret, rpcp);
#ifndef ACCOUNT_FOR_ALL_GAP_REFS
if(rec.first && rec.len == 0) rec.first = false;
#endif
first = false;
size_t bases = rec.len;
assert_eq(rec.off, szs[szsi].off);
assert_eq(rec.len, szs[szsi].len);
assert_eq(rec.first, szs[szsi].first);
ASSERT_ONLY(szsi++);
if(bases == 0) continue;
}
}
return ret;
}
/**
* Join several text strings together according to the text-chunking
* scheme specified in the EbwtParams. Ebwt fields calculated in this
* function are written directly to disk.
*
* It is assumed, but not required, that the header values have already
* been written to 'out1' before this function is called.
*
* The static member Ebwt::join just returns a joined version of a
* list of strings without building any of the auxiliary arrays.
* Because the pseudo-random number generator is the same, we expect
* this function and the static function to give the same result given
* the same seed.
*/
template<typename TStr>
void Ebwt<TStr>::joinToDisk(
vector<FileBuf*>& l,
vector<RefRecord>& szs,
vector<uint32_t>& plens,
TIndexOffU sztot,
const RefReadInParams& refparams,
TStr& ret,
ostream& out1,
ostream& out2,
uint32_t seed)
{
RandomSource rand; // reproducible given same seed
rand.init(seed);
RefReadInParams rpcp = refparams;
assert_gt(szs.size(), 0);
assert_gt(l.size(), 0);
assert_gt(sztot, 0);
// Not every fragment represents a distinct sequence - many
// fragments may correspond to a single sequence. Count the
// number of sequences here by counting the number of "first"
// fragments.
this->_nPat = 0;
this->_nFrag = 0;
#ifdef ACCOUNT_FOR_ALL_GAP_REFS
int nGapFrag = 0;
#endif
for(size_t i = 0; i < szs.size(); i++) {
if(szs[i].len > 0) this->_nFrag++;
#ifdef ACCOUNT_FOR_ALL_GAP_REFS
if(szs[i].len == 0 && szs[i].off > 0) nGapFrag++;
if(szs[i].first && szs[i].len > 0) this->_nPat++;
#else
// For all records where len=0 and first=1, set first=0
assert(szs[i].len > 0 || !szs[i].first);
if(szs[i].first) this->_nPat++;
#endif
}
assert_gt(this->_nPat, 0);
assert_geq(this->_nFrag, this->_nPat);
this->_rstarts = NULL;
writeU<TIndexOffU>(out1, this->_nPat, this->toBe());
assert_eq(plens.size(), this->_nPat);
// Allocate plen[]
try {
this->_plen = new TIndexOffU[this->_nPat];
} catch(bad_alloc& e) {
cerr << "Out of memory allocating plen[] in Ebwt::join()"
<< " at " << __FILE__ << ":" << __LINE__ << endl;
throw e;
}
// For each pattern, set plen
for(TIndexOffU i = 0; i < plens.size(); i++) {
this->_plen[i] = plens[i];
writeU<TIndexOffU>(out1, this->_plen[i], this->toBe());
}
// Write the number of fragments
writeU<TIndexOffU>(out1, this->_nFrag, this->toBe());
TIndexOffU seqsRead = 0;
ASSERT_ONLY(TIndexOffU szsi = 0);
ASSERT_ONLY(TIndexOffU entsWritten = 0);
// For each filebuf
for(unsigned int i = 0; i < l.size(); i++) {
assert(!l[i]->eof());
bool first = true;
TIndexOffU patoff = 0;
// For each *fragment* (not necessary an entire sequence) we
// can pull out of istream l[i]...
while(!l[i]->eof()) {
string name;
// Push a new name onto our vector
_refnames.push_back("");
//uint32_t oldRetLen = length(ret);
RefRecord rec = fastaRefReadAppend(*l[i], first, ret, rpcp, &_refnames.back());
#ifndef ACCOUNT_FOR_ALL_GAP_REFS
if(rec.first && rec.len == 0) rec.first = false;
#endif
first = false;
if(rec.first) {
if(_refnames.back().length() == 0) {
// If name was empty, replace with an index
ostringstream stm;
stm << (_refnames.size()-1);
_refnames.back() = stm.str();
}
} else {
// This record didn't actually start a new sequence so
// no need to add a name
//assert_eq(0, _refnames.back().length());
_refnames.pop_back();
}
assert_lt(szsi, szs.size());
assert(szs[szsi].first == 0 || szs[szsi].first == 1);
assert_eq(rec.off, szs[szsi].off);
assert_eq(rec.len, szs[szsi].len);
// szs[szsi].first == 2 sometimes?!?! g++ is unable to do
// the following correctly, regardless of how I write it
//assert((rec.first == 0) == (szs[szsi].first == 0));
assert(rec.first || rec.off > 0);
ASSERT_ONLY(szsi++);
#ifdef ACCOUNT_FOR_ALL_GAP_REFS
if(rec.len == 0) continue;
if(rec.first && rec.len > 0) seqsRead++;
assert_leq(rec.len, this->_plen[seqsRead-1]);
#else
if(rec.first) seqsRead++;
if(rec.len == 0) continue;
assert_leq(rec.len, this->_plen[seqsRead-1]);
#endif
// Reset the patoff if this is the first fragment
if(rec.first) patoff = 0;
patoff += rec.off; // add fragment's offset from end of last frag.
// Adjust rpcps
//uint32_t seq = seqsRead-1;
ASSERT_ONLY(entsWritten++);
// This is where rstarts elements are written to the output stream
//writeU32(out1, oldRetLen, this->toBe()); // offset from beginning of joined string
//writeU32(out1, seq, this->toBe()); // sequence id
//writeU32(out1, patoff, this->toBe()); // offset into sequence
patoff += rec.len;
}
assert_gt(szsi, 0);
l[i]->reset();
assert(!l[i]->eof());
#ifndef NDEBUG
int c = l[i]->get();
assert_eq('>', c);
assert(!l[i]->eof());
l[i]->reset();
assert(!l[i]->eof());
#endif
}
assert_eq(entsWritten, this->_nFrag);
}
/**
* Build an Ebwt from a string 's' and its suffix array 'sa' (which
* might actually be a suffix array *builder* that builds blocks of the
* array on demand). The bulk of the Ebwt, i.e. the ebwt and offs
* arrays, is written directly to disk. This is by design: keeping
* those arrays in memory needlessly increases the footprint of the
* building process. Instead, we prefer to build the Ebwt directly
* "to disk" and then read it back into memory later as necessary.
*
* It is assumed that the header values and join-related values (nPat,
* plen) have already been written to 'out1' before this function
* is called. When this function is finished, it will have
* additionally written ebwt, zOff, fchr, ftab and eftab to the primary
* file and offs to the secondary file.
*
* Assume DNA/RNA/any alphabet with 4 or fewer elements.
* Assume occ array entries are 32 bits each.
*
* @param sa the suffix array to convert to a Ebwt
* @param buildISA whether to output an ISA sample into out2 after
* the SA sample
* @param s the original string
* @param out
*/
template<typename TStr>
void Ebwt<TStr>::buildToDisk(InorderBlockwiseSA<TStr>& sa,
const TStr& s,
ostream& out1,
ostream& out2)
{
const EbwtParams& eh = this->_eh;
assert(eh.repOk());
assert_eq(length(s)+1, sa.size());
assert_eq(length(s), eh._len);
assert_gt(eh._lineRate, 3);
assert(sa.suffixItrIsReset());
assert_leq((int)ValueSize<Dna>::VALUE, 4);
TIndexOffU len = eh._len;
TIndexOffU ftabLen = eh._ftabLen;
TIndexOffU sideSz = eh._sideSz;
TIndexOffU ebwtTotSz = eh._ebwtTotSz;
TIndexOffU fchr[] = {0, 0, 0, 0, 0};
TIndexOffU* ftab = NULL;
TIndexOffU zOff = OFF_MASK;
// Save # of occurrences of each character as we walk along the bwt
TIndexOffU occ[4] = {0, 0, 0, 0};
// Save 'G' and 'T' occurrences between backward and forward buckets
TIndexOffU occSave[2] = {0, 0};
// Record rows that should "absorb" adjacent rows in the ftab.
// The absorbed rows represent suffixes shorter than the ftabChars
// cutoff.
uint8_t absorbCnt = 0;
uint8_t *absorbFtab;
try {
VMSG_NL("Allocating ftab, absorbFtab");
ftab = new TIndexOffU[ftabLen];
memset(ftab, 0, OFF_SIZE * ftabLen);
absorbFtab = new uint8_t[ftabLen];
memset(absorbFtab, 0, ftabLen);
} catch(bad_alloc &e) {
cerr << "Out of memory allocating ftab[] or absorbFtab[] "
<< "in Ebwt::buildToDisk() at " << __FILE__ << ":"
<< __LINE__ << endl;
throw e;
}
assert(ftab != NULL);
assert(absorbFtab != NULL);
// Allocate the side buffer; holds a single side as its being
// constructed and then written to disk. Reused across all sides.
#ifdef SIXTY4_FORMAT
uint64_t *ebwtSide = NULL;
#else
uint8_t *ebwtSide = NULL;
#endif
try {
#ifdef SIXTY4_FORMAT
ebwtSide = new uint64_t[sideSz >> 3];
#else
ebwtSide = new uint8_t[sideSz];
#endif
} catch(bad_alloc &e) {
cerr << "Out of memory allocating ebwtSide[] in "
<< "Ebwt::buildToDisk() at " << __FILE__ << ":"
<< __LINE__ << endl;
throw e;
}
assert(ebwtSide != NULL);
// Allocate a buffer to hold the ISA sample, which we accumulate in
// the loop and then output at the end. We can't write output the
// ISA right away because the order in which we calculate its
// elements is based on the suffix array, which we only see bit by
// bit
uint32_t *isaSample = NULL;
if(eh._isaRate >= 0) {
try {
isaSample = new uint32_t[eh._isaLen];
} catch(bad_alloc &e) {
cerr << "Out of memory allocating isaSample[] in "
<< "Ebwt::buildToDisk() at " << __FILE__ << ":"
<< __LINE__ << endl;
throw e;
}
assert(isaSample != NULL);
}
// Points to the base offset within ebwt for the side currently
// being written
TIndexOffU side = 0;
// Points to a byte offset from 'side' within ebwt[] where next
// char should be written
#ifdef SIXTY4_FORMAT
TIndexOff sideCur = (eh._sideBwtSz >> 3) - 1;
#else
TIndexOff sideCur = eh._sideBwtSz - 1;
#endif
// Whether we're assembling a forward or a reverse bucket
bool fw = false;
// Did we just finish writing a forward bucket? (Must be true when
// we exit the loop.)
ASSERT_ONLY(bool wroteFwBucket = false);
// Have we skipped the '$' in the last column yet?
ASSERT_ONLY(bool dollarSkipped = false);
TIndexOffU si = 0; // string offset (chars)
ASSERT_ONLY(TIndexOffU lastSufInt = 0);
ASSERT_ONLY(bool inSA = true); // true iff saI still points inside suffix
// array (as opposed to the padding at the
// end)
// Iterate over packed bwt bytes
VMSG_NL("Entering Ebwt loop");
ASSERT_ONLY(TIndexOffU beforeEbwtOff = (uint32_t)out1.tellp());
while(side < ebwtTotSz) {
ASSERT_ONLY(wroteFwBucket = false);
// Sanity-check our cursor into the side buffer
assert_geq(sideCur, 0);
assert_lt(sideCur, (int)eh._sideBwtSz);
assert_eq(0, side % sideSz); // 'side' must be on side boundary
ebwtSide[sideCur] = 0; // clear
assert_lt(side + sideCur, ebwtTotSz);
// Iterate over bit-pairs in the si'th character of the BWT
#ifdef SIXTY4_FORMAT
for(int bpi = 0; bpi < 32; bpi++, si++)
#else
for(int bpi = 0; bpi < 4; bpi++, si++)
#endif
{
int bwtChar;
bool count = true;
if(si <= len) {
// Still in the SA; extract the bwtChar
TIndexOffU saElt = sa.nextSuffix();
// (that might have triggered sa to calc next suf block)
if(isaSample != NULL && (saElt & eh._isaMask) == saElt) {
// This element belongs in the ISA sample. Add
// an entry mapping the text offset to the offset
// into the suffix array that holds the suffix
// beginning with the character at that text offset
assert_lt((saElt >> eh._isaRate), eh._isaLen);
isaSample[saElt >> eh._isaRate] = si;
}
if(saElt == 0) {
// Don't add the '$' in the last column to the BWT
// transform; we can't encode a $ (only A C T or G)
// and counting it as, say, an A, will mess up the
// LR mapping
bwtChar = 0; count = false;
ASSERT_ONLY(dollarSkipped = true);
zOff = si; // remember the SA row that
// corresponds to the 0th suffix
} else {
bwtChar = (int)(Dna)(s[saElt-1]);
assert_lt(bwtChar, 4);
// Update the fchr
fchr[bwtChar]++;
}
// Update ftab
if((len-saElt) >= (TIndexOffU)eh._ftabChars) {
// Turn the first ftabChars characters of the
// suffix into an integer index into ftab
TIndexOffU sufInt = 0;
for(int i = 0; i < eh._ftabChars; i++) {
sufInt <<= 2;
assert_lt((TIndexOffU)i, len-saElt);
sufInt |= (unsigned char)(Dna)(s[saElt+i]);
}
// Assert that this prefix-of-suffix is greater
// than or equal to the last one (true b/c the
// suffix array is sorted)
#ifndef NDEBUG
if(lastSufInt > 0) assert_geq(sufInt, lastSufInt);
lastSufInt = sufInt;
#endif
// Update ftab
assert_lt(sufInt+1, ftabLen);
ftab[sufInt+1]++;
if(absorbCnt > 0) {
// Absorb all short suffixes since the last
// transition into this transition
absorbFtab[sufInt] = absorbCnt;
absorbCnt = 0;
}
} else {
// Otherwise if suffix is fewer than ftabChars
// characters long, then add it to the 'absorbCnt';
// it will be absorbed into the next transition
assert_lt(absorbCnt, 255);
absorbCnt++;
}
// Suffix array offset boundary? - update offset array
if((si & eh._offMask) == si) {
assert_lt((si >> eh._offRate), eh._offsLen);
// Write offsets directly to the secondary output
// stream, thereby avoiding keeping them in memory
writeU<TIndexOffU>(out2, saElt, this->toBe());
}
} else {
// Strayed off the end of the SA, now we're just
// padding out a bucket
#ifndef NDEBUG
if(inSA) {
// Assert that we wrote all the characters in the
// string before now
assert_eq(si, len+1);
inSA = false;
}
#endif
// 'A' used for padding; important that padding be
// counted in the occ[] array
bwtChar = 0;
}
if(count) occ[bwtChar]++;
// Append BWT char to bwt section of current side
if(fw) {
// Forward bucket: fill from least to most
#ifdef SIXTY4_FORMAT
ebwtSide[sideCur] |= ((uint64_t)bwtChar << (bpi << 1));
if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0);
#else
pack_2b_in_8b(bwtChar, ebwtSide[sideCur], bpi);
assert_eq((ebwtSide[sideCur] >> (bpi*2)) & 3, bwtChar);
#endif
} else {
// Backward bucket: fill from most to least
#ifdef SIXTY4_FORMAT
ebwtSide[sideCur] |= ((uint64_t)bwtChar << ((31 - bpi) << 1));
if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0);
#else
pack_2b_in_8b(bwtChar, ebwtSide[sideCur], 3-bpi);
assert_eq((ebwtSide[sideCur] >> ((3-bpi)*2)) & 3, bwtChar);
#endif
}
} // end loop over bit-pairs
assert_eq(dollarSkipped ? 3 : 0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3);
#ifdef SIXTY4_FORMAT
assert_eq(0, si & 31);
#else
assert_eq(0, si & 3);
#endif
if(fw) sideCur++;
else sideCur--;
#ifdef SIXTY4_FORMAT
if(sideCur == (int)eh._sideBwtSz >> 3)
#else
if(sideCur == (int)eh._sideBwtSz)
#endif
{
// Forward side boundary
assert_eq(0, si % eh._sideBwtLen);
#ifdef SIXTY4_FORMAT
sideCur = (eh._sideBwtSz >> 3) - 1;
#else
sideCur = eh._sideBwtSz - 1;
#endif
assert(fw); fw = false;
ASSERT_ONLY(wroteFwBucket = true);
// Write 'G' and 'T'
assert_leq(occSave[0], occ[2]);
assert_leq(occSave[1], occ[3]);
TIndexOffU *u32side = reinterpret_cast<TIndexOffU*>(ebwtSide);
side += sideSz;
assert_leq(side, eh._ebwtTotSz);
#ifdef BOWTIE_64BIT_INDEX
u32side[(sideSz >> 3)-2] = endianizeU<TIndexOffU>(occSave[0], this->toBe());
u32side[(sideSz >> 3)-1] = endianizeU<TIndexOffU>(occSave[1], this->toBe());
#else
u32side[(sideSz >> 2)-2] = endianizeU<TIndexOffU>(occSave[0], this->toBe());
u32side[(sideSz >> 2)-1] = endianizeU<TIndexOffU>(occSave[1], this->toBe());
#endif
// Write forward side to primary file
out1.write((const char *)ebwtSide, sideSz);
} else if (sideCur == -1) {
// Backward side boundary
assert_eq(0, si % eh._sideBwtLen);
sideCur = 0;
assert(!fw); fw = true;
// Write 'A' and 'C'
TIndexOffU *u32side = reinterpret_cast<TIndexOffU*>(ebwtSide);
side += sideSz;
assert_leq(side, eh._ebwtTotSz);
#ifdef BOWTIE_64BIT_INDEX
u32side[(sideSz >> 3)-2] = endianizeU<TIndexOffU>(occ[0], this->toBe());
u32side[(sideSz >> 3)-1] = endianizeU<TIndexOffU>(occ[1], this->toBe());
#else
u32side[(sideSz >> 2)-2] = endianizeU<TIndexOffU>(occ[0], this->toBe());
u32side[(sideSz >> 2)-1] = endianizeU<TIndexOffU>(occ[1], this->toBe());
#endif
occSave[0] = occ[2]; // save 'G' count
occSave[1] = occ[3]; // save 'T' count
// Write backward side to primary file
out1.write((const char *)ebwtSide, sideSz);
}
}
VMSG_NL("Exited Ebwt loop");
assert(ftab != NULL);
assert_neq(zOff, OFF_MASK);
if(absorbCnt > 0) {
// Absorb any trailing, as-yet-unabsorbed short suffixes into
// the last element of ftab
absorbFtab[ftabLen-1] = absorbCnt;
}
// Assert that our loop counter got incremented right to the end
assert_eq(side, eh._ebwtTotSz);
// Assert that we wrote the expected amount to out1
assert_eq(((TIndexOffU)out1.tellp() - beforeEbwtOff), eh._ebwtTotSz);
// assert that the last thing we did was write a forward bucket
assert(wroteFwBucket);
//
// Write zOff to primary stream
//
writeU<TIndexOffU>(out1, zOff, this->toBe());
//
// Finish building fchr
//
// Exclusive prefix sum on fchr
for(int i = 1; i < 4; i++) {
fchr[i] += fchr[i-1];
}
assert_eq(fchr[3], len);
// Shift everybody up by one
for(int i = 4; i >= 1; i--) {
fchr[i] = fchr[i-1];
}
fchr[0] = 0;
if(_verbose) {
for(int i = 0; i < 5; i++)
cout << "fchr[" << "ACGT$"[i] << "]: " << fchr[i] << endl;
}
// Write fchr to primary file
for(int i = 0; i < 5; i++) {
writeU<TIndexOffU>(out1, fchr[i], this->toBe());
}
//
// Finish building ftab and build eftab
//
// Prefix sum on ftable
TIndexOffU eftabLen = 0;
assert_eq(0, absorbFtab[0]);
for(TIndexOffU i = 1; i < ftabLen; i++) {
if(absorbFtab[i] > 0) eftabLen += 2;
}
assert_leq(eftabLen, (TIndexOffU)eh._ftabChars*2);
eftabLen = eh._ftabChars*2;
TIndexOffU *eftab = NULL;
try {
eftab = new TIndexOffU[eftabLen];
memset(eftab, 0, OFF_SIZE * eftabLen);
} catch(bad_alloc &e) {
cerr << "Out of memory allocating eftab[] "
<< "in Ebwt::buildToDisk() at " << __FILE__ << ":"
<< __LINE__ << endl;
throw e;
}
assert(eftab != NULL);
TIndexOffU eftabCur = 0;
for(TIndexOffU i = 1; i < ftabLen; i++) {
TIndexOffU lo = ftab[i] + Ebwt::ftabHi(ftab, eftab, len, ftabLen, eftabLen, i-1);
if(absorbFtab[i] > 0) {
// Skip a number of short pattern indicated by absorbFtab[i]
TIndexOffU hi = lo + absorbFtab[i];
assert_lt(eftabCur*2+1, eftabLen);
eftab[eftabCur*2] = lo;
eftab[eftabCur*2+1] = hi;
ftab[i] = (eftabCur++) ^ OFF_MASK; // insert pointer into eftab
assert_eq(lo, Ebwt::ftabLo(ftab, eftab, len, ftabLen, eftabLen, i));
assert_eq(hi, Ebwt::ftabHi(ftab, eftab, len, ftabLen, eftabLen, i));
} else {
ftab[i] = lo;
}
}
assert_eq(Ebwt::ftabHi(ftab, eftab, len, ftabLen, eftabLen, ftabLen-1), len+1);
// Write ftab to primary file
for(TIndexOffU i = 0; i < ftabLen; i++) {
writeU<TIndexOffU>(out1, ftab[i], this->toBe());
}
// Write eftab to primary file
for(TIndexOffU i = 0; i < eftabLen; i++) {
writeU<TIndexOffU>(out1, eftab[i], this->toBe());
}
// Write isa to primary file
if(isaSample != NULL) {
ASSERT_ONLY(Bitset sawISA(eh._len+1));
for(TIndexOffU i = 0; i < eh._isaLen; i++) {
TIndexOffU s = isaSample[i];
assert_leq(s, eh._len);
assert(!sawISA.test(s));
ASSERT_ONLY(sawISA.set(s));
writeU<TIndexOffU>(out2, s, this->toBe());
}
delete[] isaSample;
}
delete[] ftab;
delete[] eftab;
delete[] absorbFtab;
// Note: if you'd like to sanity-check the Ebwt, you'll have to
// read it back into memory first!
assert(!isInMemory());
VMSG_NL("Exiting Ebwt::buildToDisk()");
}
/**
* Try to find the Bowtie index specified by the user. First try the
* exact path given by the user. Then try the user-provided string
* appended onto the path of the "indexes" subdirectory below this
* executable, then try the provided string appended onto
* "$BOWTIE_INDEXES/".
*/
string adjustEbwtBase(const string& cmdline,
const string& ebwtFileBase,
bool verbose = false);
#endif /*EBWT_H_*/