Permalink
Cannot retrieve contributors at this time
Fetching contributors…
| #ifndef HIT_H_ | |
| #define HIT_H_ | |
| #include <vector> | |
| #include <stdint.h> | |
| #include <iostream> | |
| #include <fstream> | |
| #include <string> | |
| #include <stdexcept> | |
| #include <seqan/sequence.h> | |
| #include "alphabet.h" | |
| #include "assert_helpers.h" | |
| #include "threading.h" | |
| #include "bitset.h" | |
| #include "tokenize.h" | |
| #include "pat.h" | |
| #include "formats.h" | |
| #include "filebuf.h" | |
| #include "edit.h" | |
| #include "refmap.h" | |
| #include "annot.h" | |
| #include "sstring.h" | |
| /** | |
| * Classes for dealing with reporting alignments. | |
| */ | |
| using namespace std; | |
| using namespace seqan; | |
| /// Constants for the various output modes | |
| enum output_types { | |
| OUTPUT_FULL = 1, | |
| OUTPUT_CONCISE, | |
| OUTPUT_BINARY, | |
| OUTPUT_CHAIN, | |
| OUTPUT_SAM, | |
| OUTPUT_NONE | |
| }; | |
| /// Names of the various output modes | |
| static const std::string output_type_names[] = { | |
| "Invalid!", | |
| "Full", | |
| "Concise", | |
| "Binary", | |
| "None" | |
| }; | |
| typedef pair<TIndexOffU,TIndexOffU> UPair; | |
| /** | |
| * Encapsulates a hit, including a text-id/text-offset pair, a pattern | |
| * id, and a boolean indicating whether it matched as its forward or | |
| * reverse-complement version. | |
| */ | |
| class Hit { | |
| public: | |
| Hit() : stratum(-1) { } | |
| UPair h; /// reference index & offset | |
| UPair mh; /// reference index & offset for mate | |
| uint32_t patId; /// read index | |
| String<char> patName; /// read name | |
| String<Dna5> patSeq; /// read sequence | |
| String<Dna5> colSeq; /// original color sequence, not decoded | |
| String<char> quals; /// read qualities | |
| String<char> colQuals;/// original color qualities, not decoded | |
| FixedBitset<1024> mms; /// nucleotide mismatch mask | |
| FixedBitset<1024> cmms; /// color mismatch mask (if relevant) | |
| vector<char> refcs; /// reference characters for mms | |
| vector<char> crefcs; /// reference characters for cmms | |
| uint32_t oms; /// # of other possible mappings; 0 -> this is unique | |
| bool fw; /// orientation of read in alignment | |
| bool mfw; /// orientation of mate in alignment | |
| uint16_t mlen; /// length of mate | |
| int8_t stratum; /// stratum of hit (= mismatches in seed) | |
| uint32_t cost; /// total cost, factoring in stratum and quality penalty | |
| uint8_t mate; /// matedness; 0 = not a mate | |
| /// 1 = upstream mate | |
| /// 2 = downstream mate | |
| bool color; /// read is in colorspace? | |
| char primer; /// primer base, for csfasta files | |
| char trimc; /// trimmed color, for csfasta files | |
| uint32_t seed; /// pseudo-random seed for aligned read | |
| /** | |
| * Return true if this Hit is internally consistent. Otherwise, | |
| * throw an assertion. | |
| */ | |
| bool repOk() const { | |
| assert_geq(cost, (uint32_t)(stratum << 14)); | |
| return true; | |
| } | |
| size_t length() const { return seqan::length(patSeq); } | |
| Hit& operator = (const Hit &other) { | |
| this->h = other.h; | |
| this->mh = other.mh; | |
| this->patId = other.patId; | |
| this->patName = other.patName; | |
| this->patSeq = other.patSeq; | |
| this->colSeq = other.colSeq; | |
| this->quals = other.quals; | |
| this->colQuals= other.colQuals; | |
| this->mms = other.mms; | |
| this->cmms = other.cmms; | |
| this->refcs = other.refcs; | |
| this->crefcs = other.crefcs; | |
| this->oms = other.oms; | |
| this->fw = other.fw; | |
| this->mfw = other.mfw; | |
| this->mlen = other.mlen; | |
| this->stratum = other.stratum; | |
| this->cost = other.cost; | |
| this->mate = other.mate; | |
| this->color = other.color; | |
| this->cmms = other.cmms; | |
| this->seed = other.seed; | |
| return *this; | |
| } | |
| }; | |
| /** | |
| * Compare hits a and b; a < b if its cost is less than B's. If | |
| * there's a tie, break on position and orientation. | |
| */ | |
| class HitCostCompare { | |
| public: | |
| bool operator() (const Hit& a, const Hit& b) { | |
| if(a.cost < b.cost) return true; | |
| if(a.cost > b.cost) return false; | |
| if(a.h < b.h) return true; | |
| if(a.h > b.h) return false; | |
| if(a.fw < b.fw) return true; | |
| if(a.fw > b.fw) return false; | |
| return false; | |
| } | |
| }; | |
| /// Sort by text-id then by text-offset | |
| bool operator< (const Hit& a, const Hit& b); | |
| /** | |
| * Encapsulates an object that accepts hits, optionally retains them in | |
| * a vector, and does something else with them according to | |
| * descendent's implementation of pure virtual member reportHitImpl(). | |
| */ | |
| class HitSink { | |
| public: | |
| explicit HitSink( | |
| OutFileBuf* out, | |
| const std::string& dumpAl, | |
| const std::string& dumpUnal, | |
| const std::string& dumpMax, | |
| bool onePairFile, | |
| bool sampleMax, | |
| vector<string>* refnames, | |
| size_t nthreads, | |
| int perThreadBufSize) : | |
| _outs(), | |
| _deleteOuts(false), | |
| _refnames(refnames), | |
| _numWrappers(0), | |
| _locks(), | |
| ts_wrap(NULL), | |
| dumpAlBase_(dumpAl), | |
| dumpUnalBase_(dumpUnal), | |
| dumpMaxBase_(dumpMax), | |
| onePairFile_(onePairFile), | |
| sampleMax_(sampleMax), | |
| quiet_(false), | |
| nthreads_((nthreads > 0) ? nthreads : 1), | |
| ptBufs_(), | |
| ptCounts_(nthreads_), | |
| perThreadBufSize_(perThreadBufSize), | |
| ptNumAligned_(NULL) | |
| { | |
| size_t nelt = 5 * nthreads_; | |
| ptNumAligned_ = new uint64_t[nelt]; | |
| std::memset(reinterpret_cast<void*>(const_cast<uint64_t*>(ptNumAligned_)), 0, sizeof(uint64_t) * nelt); | |
| ptNumReported_ = ptNumAligned_ + nthreads_; | |
| ptNumReportedPaired_ = ptNumReported_ + nthreads_; | |
| ptNumUnaligned_ = ptNumReportedPaired_ + nthreads_; | |
| ptNumMaxed_ = ptNumUnaligned_ + nthreads_; | |
| _outs.push_back(out); | |
| ptBufs_.resize(nthreads_); | |
| ptCounts_.resize(nthreads_, 0); | |
| //had to move this below the array inits, otherwise it get's mangled leading to segfaults on access | |
| _locks.push_back(new MUTEX_T); | |
| initDumps(); | |
| } | |
| /** | |
| * Open a number of output streams; usually one per reference | |
| * sequence. For now, we give then names refXXXXX.map where XXXXX | |
| * is the 0-padded reference index. Someday we may want to include | |
| * the name of the reference sequence in the filename somehow. | |
| */ | |
| explicit HitSink( | |
| size_t numOuts, | |
| const std::string& dumpAl, | |
| const std::string& dumpUnal, | |
| const std::string& dumpMax, | |
| bool onePairFile, | |
| bool sampleMax, | |
| vector<string>* refnames, | |
| size_t nthreads, | |
| int perThreadBufSize) : | |
| _outs(), | |
| _deleteOuts(true), | |
| _refnames(refnames), | |
| _locks(), | |
| dumpAlBase_(dumpAl), | |
| dumpUnalBase_(dumpUnal), | |
| dumpMaxBase_(dumpMax), | |
| onePairFile_(onePairFile), | |
| sampleMax_(sampleMax), | |
| quiet_(false), | |
| nthreads_(0), | |
| perThreadBufSize_(0), | |
| ptNumAligned_(NULL) | |
| { | |
| // Open all files for writing and initialize all locks | |
| for(size_t i = 0; i < numOuts; i++) { | |
| _outs.push_back(NULL); // we open output streams lazily | |
| _locks.push_back(new MUTEX_T); | |
| } | |
| initDumps(); | |
| } | |
| /** | |
| * Destroy HitSinkobject; | |
| */ | |
| virtual ~HitSink() { | |
| if(ptNumAligned_ != NULL) { | |
| delete[] ptNumAligned_; | |
| ptNumAligned_ = NULL; | |
| } | |
| closeOuts(); | |
| if(_deleteOuts) { | |
| // Delete all non-NULL output streams | |
| for(size_t i = 0; i < _outs.size(); i++) { | |
| if(_outs[i] != NULL) { | |
| delete _outs[i]; | |
| _outs[i] = NULL; | |
| } | |
| if(_locks[i] != NULL) { | |
| delete _locks[i]; | |
| _locks[i] = NULL; | |
| } | |
| } | |
| } | |
| destroyDumps(); | |
| } | |
| /** | |
| * Call this whenever this HitSink is wrapped by a new | |
| * HitSinkPerThread. This helps us keep track of whether the main | |
| * lock or any of the per-stream locks will be contended. | |
| */ | |
| void addWrapper() { | |
| ThreadSafe ts(&numWrapper_mutex_m); | |
| _numWrappers++; | |
| } | |
| /** | |
| * Called by concrete subclasses to figure out which elements of | |
| * the _outs/_locks array to use when outputting the alignment. | |
| */ | |
| size_t refIdxToStreamIdx(size_t refIdx) { | |
| if(refIdx >= _outs.size()) return 0; | |
| return refIdx; | |
| } | |
| /** | |
| * Append a single hit to the given output stream. | |
| */ | |
| virtual void append(BTString& o, const Hit& h, int mapq, int xms) = 0; | |
| /** | |
| * Add a number of alignments to the tally. Tally shouldn't | |
| * include reads that fail to align either because they had 0 | |
| * alignments or because of -m. | |
| */ | |
| void tallyAlignments(size_t threadId, size_t numAl, bool paired) { | |
| ptNumAligned_[threadId] += numAl; | |
| if(paired) { | |
| ptNumReportedPaired_[threadId] += numAl; | |
| } else { | |
| ptNumReported_[threadId] += numAl; | |
| } | |
| } | |
| /** | |
| * Report a batch of hits from a vector, perhaps subsetting it. | |
| */ | |
| virtual void reportHits( | |
| const Hit *hptr, | |
| vector<Hit> *hsptr, | |
| size_t start, | |
| size_t end, | |
| size_t threadId, | |
| int mapq, | |
| int xms, | |
| bool tally) | |
| { | |
| assert_geq(end, start); | |
| assert(nthreads_ > 1 || threadId == 0); | |
| if(end == start) { | |
| return; | |
| } | |
| const Hit& firstHit = (hptr == NULL) ? (*hsptr)[start] : *hptr; | |
| bool paired = firstHit.mate > 0; | |
| // Sort reads so that those against the same reference sequence | |
| // are consecutive. | |
| if(hsptr != NULL && _outs.size() > 1 && end - start > 2) { | |
| sort(hsptr->begin() + start, hsptr->begin() + end); | |
| } | |
| BTString& o = ptBufs_[threadId]; | |
| if(_outs.size() == 1) { | |
| // Per-thread buffering is active | |
| for(size_t i = start; i < end; i++) { | |
| const Hit& h = (hptr == NULL) ? (*hsptr)[i] : *hptr; | |
| assert(h.repOk()); | |
| if(nthreads_ > 1) { | |
| maybeFlush(threadId, 0); | |
| append(o, h, mapq, xms); | |
| ptCounts_[threadId]++; | |
| } else { | |
| append(o, h, mapq, xms); | |
| out(0).writeString(o); | |
| o.clear(); | |
| } | |
| } | |
| } else { | |
| // multiple output streams or alignments | |
| // Note: in this case we basically don't get the benefit | |
| // from per-thread buffering, becuase it is too | |
| // complicated to provide per-thread, per-output-lock | |
| // buffers. | |
| size_t i = start; | |
| while(i < end) { | |
| const Hit& h = (hptr == NULL) ? (*hsptr)[i] : *hptr; | |
| size_t strIdx = refIdxToStreamIdx(h.h.first); | |
| { | |
| assert(h.repOk()); | |
| do { | |
| append(o, h, mapq, xms); | |
| { | |
| ThreadSafe _ts(_locks[strIdx]); | |
| out(h.h.first).writeString(o); | |
| } | |
| o.clear(); | |
| i++; | |
| } while(refIdxToStreamIdx(h.h.first) == strIdx && i < end); | |
| } | |
| } | |
| } | |
| if(tally) { | |
| tallyAlignments(threadId, end - start, paired); | |
| } | |
| } | |
| /** | |
| * Called when all alignments are complete. It is assumed that no | |
| * synchronization is necessary | |
| */ | |
| void finish(bool hadoopOut) { | |
| // Flush all per-thread buffers | |
| flushAll(0); | |
| // Close all output streams | |
| closeOuts(); | |
| // Print information about how many unpaired and/or paired | |
| // reads were aligned. | |
| if(!quiet_) { | |
| uint64_t numReported = 0, numReportedPaired = 0; | |
| uint64_t numAligned = 0, numUnaligned = 0; | |
| uint64_t numMaxed = 0; | |
| for(int i = 0; i < nthreads_; i++) { | |
| numReported += ptNumReported_[i]; | |
| numReportedPaired += ptNumReportedPaired_[i]; | |
| numAligned += ptNumAligned_[i]; | |
| numUnaligned += ptNumUnaligned_[i]; | |
| numMaxed += ptNumMaxed_[i]; | |
| } | |
| uint64_t tot = numAligned + numUnaligned + numMaxed; | |
| double alPct = 0.0, unalPct = 0.0, maxPct = 0.0; | |
| if(tot > 0) { | |
| alPct = 100.0 * (double)numAligned / (double)tot; | |
| unalPct = 100.0 * (double)numUnaligned / (double)tot; | |
| maxPct = 100.0 * (double)numMaxed / (double)tot; | |
| } | |
| cerr << "# reads processed: " << tot << endl; | |
| cerr << "# reads with at least one reported alignment: " | |
| << numAligned << " (" << fixed << setprecision(2) | |
| << alPct << "%)" << endl; | |
| cerr << "# reads that failed to align: " | |
| << numUnaligned << " (" << fixed << setprecision(2) | |
| << unalPct << "%)" << endl; | |
| if(numMaxed > 0) { | |
| if(sampleMax_) { | |
| cerr << "# reads with alignments sampled due to -M: " | |
| << numMaxed << " (" << fixed << setprecision(2) | |
| << maxPct << "%)" << endl; | |
| } else { | |
| cerr << "# reads with alignments suppressed due to -m: " | |
| << numMaxed << " (" << fixed << setprecision(2) | |
| << maxPct << "%)" << endl; | |
| } | |
| } | |
| if(numReported == 0 && numReportedPaired == 0) { | |
| cerr << "No alignments" << endl; | |
| } | |
| else if(numReportedPaired > 0 && numReported == 0) { | |
| cerr << "Reported " << (numReportedPaired >> 1) | |
| << " paired-end alignments to " << _outs.size() | |
| << " output stream(s)" << endl; | |
| } | |
| else if(numReported > 0 && numReportedPaired == 0) { | |
| cerr << "Reported " << numReported | |
| << " alignments to " << _outs.size() | |
| << " output stream(s)" << endl; | |
| } | |
| else { | |
| assert_gt(numReported + numReportedPaired, 0); | |
| cerr << "Reported " << (numReportedPaired >> 1) | |
| << " paired-end alignments and " << numReported | |
| << " singleton alignments to " << _outs.size() | |
| << " output stream(s)" << endl; | |
| } | |
| if(hadoopOut) { | |
| cerr << "reporter:counter:Bowtie,Reads with reported alignments," << numAligned << endl; | |
| cerr << "reporter:counter:Bowtie,Reads with no alignments," << numUnaligned << endl; | |
| cerr << "reporter:counter:Bowtie,Reads exceeding -m limit," << numMaxed << endl; | |
| cerr << "reporter:counter:Bowtie,Unpaired alignments reported," << numReported << endl; | |
| cerr << "reporter:counter:Bowtie,Paired alignments reported," << numReportedPaired << endl; | |
| } | |
| } | |
| } | |
| /** | |
| * Returns alignment output stream, lazily created if needed. | |
| */ | |
| OutFileBuf& out(size_t refIdx) { | |
| const size_t strIdx = refIdxToStreamIdx(refIdx); | |
| if(_outs[strIdx] == NULL) { | |
| assert(_deleteOuts); | |
| { | |
| ThreadSafe _ts(&main_mutex_m); | |
| if(_outs[strIdx] == NULL) { // avoid race | |
| BTString o; | |
| o << "ref"; | |
| if (strIdx < 10) o << "0000"; | |
| else if(strIdx < 100) o << "000"; | |
| else if(strIdx < 1000) o << "00"; | |
| else if(strIdx < 10000) o << "0"; | |
| o << strIdx << ".map"; | |
| _outs[strIdx] = new OutFileBuf(o.toZBuf(), false); | |
| } | |
| } | |
| } | |
| return *(_outs[strIdx]); | |
| } | |
| /** | |
| * Return true iff this HitSink dumps aligned reads to an output | |
| * stream (i.e., iff --alfa or --alfq are specified). | |
| */ | |
| bool dumpsAlignedReads() const { return dumpAlignFlag_; } | |
| /** | |
| * Return true iff this HitSink dumps unaligned reads to an output | |
| * stream (i.e., iff --unfa or --unfq are specified). | |
| */ | |
| bool dumpsUnalignedReads() const { return dumpUnalignFlag_; } | |
| /** | |
| * Return true iff this HitSink dumps maxed-out reads to an output | |
| * stream (i.e., iff --maxfa or --maxfq are specified). | |
| */ | |
| bool dumpsMaxedReads() const { return dumpMaxedFlag_ || dumpUnalignFlag_; } | |
| /** | |
| * Return true iff this HitSink dumps either unaligned or maxed- | |
| * out reads to an output stream (i.e., iff --unfa, --maxfa, | |
| * --unfq, or --maxfq are specified). | |
| */ | |
| bool dumpsReads() const { | |
| return dumpAlignFlag_ || dumpUnalignFlag_ || dumpMaxedFlag_; | |
| } | |
| /** | |
| * Dump an aligned read to all of the appropriate output streams. | |
| * Be careful to synchronize correctly - there may be multiple | |
| * simultaneous writers. | |
| */ | |
| void dumpAlign(PatternSourcePerThread& p) { | |
| if(!dumpAlignFlag_) return; | |
| const bool paired = p.bufa().mate > 0; | |
| if(!paired || onePairFile_) { | |
| // Dump unpaired read to an aligned-read file of the same format | |
| if(!dumpAlBase_.empty()) { | |
| ThreadSafe _ts(&dumpAlignLock_); | |
| if(dumpAl_ == NULL) { | |
| assert(dumpAlQv_ == NULL); | |
| dumpAl_ = openOf(dumpAlBase_, 0, ""); | |
| assert(dumpAl_ != NULL); | |
| if(p.bufa().qualOrigBufLen > 0) { | |
| dumpAlQv_ = openOf(dumpAlBase_ + ".qual", 0, ""); | |
| assert(dumpAlQv_ != NULL); | |
| } | |
| } | |
| dumpAl_->write(p.bufa().readOrigBuf, p.bufa().readOrigBufLen); | |
| if(dumpAlQv_ != NULL) { | |
| dumpAlQv_->write(p.bufa().qualOrigBuf, p.bufa().qualOrigBufLen); | |
| } | |
| } | |
| } else { | |
| // Dump paired-end read to an aligned-read file (or pair of | |
| // files) of the same format | |
| if(!dumpAlBase_.empty()) { | |
| ThreadSafe _ts(&dumpAlignLockPE_); | |
| if(dumpAl_1_ == NULL) { | |
| assert(dumpAlQv_1_ == NULL); | |
| assert(dumpAlQv_2_ == NULL); | |
| dumpAl_1_ = openOf(dumpAlBase_, 1, ""); | |
| dumpAl_2_ = openOf(dumpAlBase_, 2, ""); | |
| assert(dumpAl_1_ != NULL); | |
| assert(dumpAl_2_ != NULL); | |
| if(p.bufa().qualOrigBufLen > 0) { | |
| dumpAlQv_1_ = openOf(dumpAlBase_ + ".qual", 1, ""); | |
| dumpAlQv_2_ = openOf(dumpAlBase_ + ".qual", 2, ""); | |
| assert(dumpAlQv_1_ != NULL); | |
| assert(dumpAlQv_2_ != NULL); | |
| } | |
| } | |
| dumpAl_1_->write(p.bufa().readOrigBuf, p.bufa().readOrigBufLen); | |
| dumpAl_2_->write(p.bufb().readOrigBuf, p.bufb().readOrigBufLen); | |
| if(dumpAlQv_1_ != NULL) { | |
| dumpAlQv_1_->write(p.bufa().qualOrigBuf, p.bufa().qualOrigBufLen); | |
| dumpAlQv_2_->write(p.bufb().qualOrigBuf, p.bufb().qualOrigBufLen); | |
| } | |
| } | |
| } | |
| } | |
| /** | |
| * Dump an unaligned read to all of the appropriate output streams. | |
| * Be careful to synchronize correctly - there may be multiple | |
| * simultaneous writers. | |
| */ | |
| void dumpUnal(PatternSourcePerThread& p) { | |
| if(!dumpUnalignFlag_) return; | |
| const bool paired = p.bufa().mate > 0; | |
| if(!paired || onePairFile_) { | |
| // Dump unpaired read to an unaligned-read file of the same format | |
| if(!dumpUnalBase_.empty()) { | |
| ThreadSafe _ts(&dumpUnalLock_); | |
| if(dumpUnal_ == NULL) { | |
| assert(dumpUnalQv_ == NULL); | |
| dumpUnal_ = openOf(dumpUnalBase_, 0, ""); | |
| assert(dumpUnal_ != NULL); | |
| if(p.bufa().qualOrigBufLen > 0) { | |
| dumpUnalQv_ = openOf(dumpUnalBase_ + ".qual", 0, ""); | |
| assert(dumpUnalQv_ != NULL); | |
| } | |
| } | |
| dumpUnal_->write(p.bufa().readOrigBuf, p.bufa().readOrigBufLen); | |
| if(dumpUnalQv_ != NULL) { | |
| dumpUnalQv_->write(p.bufa().qualOrigBuf, p.bufa().qualOrigBufLen); | |
| } | |
| } | |
| } else { | |
| // Dump paired-end read to an unaligned-read file (or pair | |
| // of files) of the same format | |
| if(!dumpUnalBase_.empty()) { | |
| ThreadSafe _ts(&dumpUnalLockPE_); | |
| if(dumpUnal_1_ == NULL) { | |
| assert(dumpUnal_1_ == NULL); | |
| assert(dumpUnal_2_ == NULL); | |
| dumpUnal_1_ = openOf(dumpUnalBase_, 1, ""); | |
| dumpUnal_2_ = openOf(dumpUnalBase_, 2, ""); | |
| assert(dumpUnal_1_ != NULL); | |
| assert(dumpUnal_2_ != NULL); | |
| if(p.bufa().qualOrigBufLen > 0) { | |
| dumpUnalQv_1_ = openOf(dumpUnalBase_ + ".qual", 1, ""); | |
| dumpUnalQv_2_ = openOf(dumpUnalBase_ + ".qual", 2, ""); | |
| } | |
| } | |
| dumpUnal_1_->write(p.bufa().readOrigBuf, p.bufa().readOrigBufLen); | |
| dumpUnal_2_->write(p.bufb().readOrigBuf, p.bufb().readOrigBufLen); | |
| if(dumpUnalQv_1_ != NULL) { | |
| dumpUnalQv_1_->write(p.bufa().qualOrigBuf, p.bufa().qualOrigBufLen); | |
| dumpUnalQv_2_->write(p.bufb().qualOrigBuf, p.bufb().qualOrigBufLen); | |
| } | |
| } | |
| } | |
| } | |
| /** | |
| * Dump a maxed-out read to all of the appropriate output streams. | |
| * Be careful to synchronize correctly - there may be multiple | |
| * simultaneous writers. | |
| */ | |
| void dumpMaxed(PatternSourcePerThread& p) { | |
| if(!dumpMaxedFlag_) { | |
| if(dumpUnalignFlag_) dumpUnal(p); | |
| return; | |
| } | |
| const bool paired = p.bufa().mate > 0; | |
| if(!paired || onePairFile_) { | |
| // Dump unpaired read to an maxed-out-read file of the same format | |
| if(!dumpMaxBase_.empty()) { | |
| ThreadSafe _ts(&dumpMaxLock_); | |
| if(dumpMax_ == NULL) { | |
| dumpMax_ = openOf(dumpMaxBase_, 0, ""); | |
| assert(dumpMax_ != NULL); | |
| if(p.bufa().qualOrigBufLen > 0) { | |
| dumpMaxQv_ = openOf(dumpMaxBase_ + ".qual", 0, ""); | |
| } | |
| } | |
| dumpMax_->write(p.bufa().readOrigBuf, p.bufa().readOrigBufLen); | |
| if(dumpMaxQv_ != NULL) { | |
| dumpMaxQv_->write(p.bufa().qualOrigBuf, p.bufa().qualOrigBufLen); | |
| } | |
| } | |
| } else { | |
| // Dump paired-end read to a maxed-out-read file (or pair | |
| // of files) of the same format | |
| if(!dumpMaxBase_.empty()) { | |
| ThreadSafe _ts(&dumpMaxLockPE_); | |
| if(dumpMax_1_ == NULL) { | |
| assert(dumpMaxQv_1_ == NULL); | |
| assert(dumpMaxQv_2_ == NULL); | |
| dumpMax_1_ = openOf(dumpMaxBase_, 1, ""); | |
| dumpMax_2_ = openOf(dumpMaxBase_, 2, ""); | |
| assert(dumpMax_1_ != NULL); | |
| assert(dumpMax_2_ != NULL); | |
| if(p.bufa().qualOrigBufLen > 0) { | |
| dumpMaxQv_1_ = openOf(dumpMaxBase_ + ".qual", 1, ""); | |
| dumpMaxQv_2_ = openOf(dumpMaxBase_ + ".qual", 2, ""); | |
| } | |
| } | |
| dumpMax_1_->write(p.bufa().readOrigBuf, p.bufa().readOrigBufLen); | |
| dumpMax_2_->write(p.bufb().readOrigBuf, p.bufb().readOrigBufLen); | |
| if(dumpMaxQv_1_ != NULL) { | |
| dumpMaxQv_1_->write(p.bufa().qualOrigBuf, p.bufa().qualOrigBufLen); | |
| dumpMaxQv_2_->write(p.bufb().qualOrigBuf, p.bufb().qualOrigBufLen); | |
| } | |
| } | |
| } | |
| } | |
| /** | |
| * Report a maxed-out read. Typically we do nothing, but we might | |
| * want to print a placeholder when output is chained. | |
| */ | |
| virtual void reportMaxed( | |
| vector<Hit>& hs, | |
| size_t threadId, | |
| PatternSourcePerThread& p) | |
| { | |
| ptNumMaxed_[threadId]++; | |
| } | |
| /** | |
| * Report an unaligned read. Typically we do nothing, but we might | |
| * want to print a placeholder when output is chained. | |
| */ | |
| virtual void reportUnaligned( | |
| size_t threadId, | |
| PatternSourcePerThread& p) | |
| { | |
| ptNumUnaligned_[threadId]++; | |
| } | |
| protected: | |
| /** | |
| * Flush thread's output buffer and reset both buffer and count. | |
| */ | |
| void flush(size_t threadId, size_t outId) { | |
| { | |
| ThreadSafe _ts(_locks[0]); // flush | |
| out(outId).writeString(ptBufs_[threadId]); | |
| } | |
| ptCounts_[threadId] = 0; | |
| ptBufs_[threadId].clear(); | |
| } | |
| /** | |
| * Flush all output buffers. | |
| */ | |
| void flushAll(size_t outId) { | |
| for(int i = 0; i < nthreads_; i++) { | |
| flush(i, outId); | |
| } | |
| } | |
| /** | |
| * If the thread's output buffer is currently full, flush it and | |
| * reset both buffer and count. | |
| */ | |
| void maybeFlush(size_t threadId, size_t outId) { | |
| if(ptCounts_[threadId] >= perThreadBufSize_) { | |
| flush(threadId, outId); | |
| } | |
| } | |
| /** | |
| * Close (and flush) all OutFileBufs. | |
| */ | |
| void closeOuts() { | |
| // Flush and close all non-NULL output streams | |
| for(size_t i = 0; i < _outs.size(); i++) { | |
| if(_outs[i] != NULL && !_outs[i]->closed()) { | |
| _outs[i]->close(); | |
| } | |
| } | |
| } | |
| vector<OutFileBuf*> _outs; /// the alignment output stream(s) | |
| bool _deleteOuts; /// Whether to delete elements of _outs upon exit | |
| vector<string>* _refnames; /// map from reference indexes to names | |
| int _numWrappers; /// # threads owning a wrapper for this HitSink | |
| vector<MUTEX_T*> _locks; /// pthreads mutexes for per-file critical sections | |
| MUTEX_T main_mutex_m; /// pthreads mutexes for fields of this object | |
| MUTEX_T numWrapper_mutex_m; | |
| MUTEX_T firstLock; | |
| ThreadSafe* ts_wrap; /// for mutual exclusion | |
| // used for output read buffer | |
| size_t nthreads_; | |
| std::vector<BTString> ptBufs_; | |
| std::vector<size_t> ptCounts_; | |
| int perThreadBufSize_; | |
| // Output filenames for dumping | |
| std::string dumpAlBase_; | |
| std::string dumpUnalBase_; | |
| std::string dumpMaxBase_; | |
| bool onePairFile_; | |
| bool sampleMax_; | |
| // Output streams for dumping sequences | |
| std::ofstream *dumpAl_; // for single-ended reads | |
| std::ofstream *dumpAl_1_; // for first mates | |
| std::ofstream *dumpAl_2_; // for second mates | |
| std::ofstream *dumpUnal_; // for single-ended reads | |
| std::ofstream *dumpUnal_1_; // for first mates | |
| std::ofstream *dumpUnal_2_; // for second mates | |
| std::ofstream *dumpMax_; // for single-ended reads | |
| std::ofstream *dumpMax_1_; // for first mates | |
| std::ofstream *dumpMax_2_; // for second mates | |
| // Output streams for dumping qualities | |
| std::ofstream *dumpAlQv_; // for single-ended reads | |
| std::ofstream *dumpAlQv_1_; // for first mates | |
| std::ofstream *dumpAlQv_2_; // for second mates | |
| std::ofstream *dumpUnalQv_; // for single-ended reads | |
| std::ofstream *dumpUnalQv_1_; // for first mates | |
| std::ofstream *dumpUnalQv_2_; // for second mates | |
| std::ofstream *dumpMaxQv_; // for single-ended reads | |
| std::ofstream *dumpMaxQv_1_; // for first mates | |
| std::ofstream *dumpMaxQv_2_; // for second mates | |
| /** | |
| * Open an ofstream with given name; output error message and quit | |
| * if it fails. | |
| */ | |
| std::ofstream* openOf(const std::string& name, | |
| int mateType, | |
| const std::string& suffix) | |
| { | |
| std::string s = name; | |
| size_t dotoff = name.find_last_of("."); | |
| if(mateType == 1) { | |
| if(dotoff == string::npos) { | |
| s += "_1"; s += suffix; | |
| } else { | |
| s = name.substr(0, dotoff) + "_1" + s.substr(dotoff); | |
| } | |
| } else if(mateType == 2) { | |
| if(dotoff == string::npos) { | |
| s += "_2"; s += suffix; | |
| } else { | |
| s = name.substr(0, dotoff) + "_2" + s.substr(dotoff); | |
| } | |
| } else if(mateType != 0) { | |
| cerr << "Bad mate type " << mateType << endl; throw 1; | |
| } | |
| std::ofstream* tmp = new ofstream(s.c_str(), ios::out); | |
| if(tmp->fail()) { | |
| if(mateType == 0) { | |
| cerr << "Could not open single-ended aligned/unaligned-read file for writing: " << name << endl; | |
| } else { | |
| cerr << "Could not open paired-end aligned/unaligned-read file for writing: " << name << endl; | |
| } | |
| throw 1; | |
| } | |
| return tmp; | |
| } | |
| /** | |
| * Initialize all the locks for dumping. | |
| */ | |
| void initDumps() { | |
| dumpAl_ = dumpAl_1_ = dumpAl_2_ = NULL; | |
| dumpUnal_ = dumpUnal_1_ = dumpUnal_2_ = NULL; | |
| dumpMax_ = dumpMax_1_ = dumpMax_2_ = NULL; | |
| dumpAlQv_ = dumpAlQv_1_ = dumpAlQv_2_ = NULL; | |
| dumpUnalQv_ = dumpUnalQv_1_ = dumpUnalQv_2_ = NULL; | |
| dumpMaxQv_ = dumpMaxQv_1_ = dumpMaxQv_2_ = NULL; | |
| dumpAlignFlag_ = !dumpAlBase_.empty(); | |
| dumpUnalignFlag_ = !dumpUnalBase_.empty(); | |
| dumpMaxedFlag_ = !dumpMaxBase_.empty(); | |
| } | |
| void destroyDumps() { | |
| if(dumpAl_ != NULL) { dumpAl_->close(); delete dumpAl_; } | |
| if(dumpAl_1_ != NULL) { dumpAl_1_->close(); delete dumpAl_1_; } | |
| if(dumpAl_2_ != NULL) { dumpAl_2_->close(); delete dumpAl_2_; } | |
| if(dumpUnal_ != NULL) { dumpUnal_->close(); delete dumpUnal_; } | |
| if(dumpUnal_1_ != NULL) { dumpUnal_1_->close(); delete dumpUnal_1_; } | |
| if(dumpUnal_2_ != NULL) { dumpUnal_2_->close(); delete dumpUnal_2_; } | |
| if(dumpMax_ != NULL) { dumpMax_->close(); delete dumpMax_; } | |
| if(dumpMax_1_ != NULL) { dumpMax_1_->close(); delete dumpMax_1_; } | |
| if(dumpMax_2_ != NULL) { dumpMax_2_->close(); delete dumpMax_2_; } | |
| if(dumpAlQv_ != NULL) { dumpAlQv_->close(); delete dumpAlQv_; } | |
| if(dumpAlQv_1_ != NULL) { dumpAlQv_1_->close(); delete dumpAlQv_1_; } | |
| if(dumpAlQv_2_ != NULL) { dumpAlQv_2_->close(); delete dumpAlQv_2_; } | |
| if(dumpUnalQv_ != NULL) { dumpUnalQv_->close(); delete dumpUnalQv_; } | |
| if(dumpUnalQv_1_ != NULL) { dumpUnalQv_1_->close(); delete dumpUnalQv_1_; } | |
| if(dumpUnalQv_2_ != NULL) { dumpUnalQv_2_->close(); delete dumpUnalQv_2_; } | |
| if(dumpMaxQv_ != NULL) { dumpMaxQv_->close(); delete dumpMaxQv_; } | |
| if(dumpMaxQv_1_ != NULL) { dumpMaxQv_1_->close(); delete dumpMaxQv_1_; } | |
| if(dumpMaxQv_2_ != NULL) { dumpMaxQv_2_->close(); delete dumpMaxQv_2_; } | |
| } | |
| // Locks for dumping | |
| MUTEX_T dumpAlignLock_; | |
| MUTEX_T dumpAlignLockPE_; // _1 and _2 | |
| MUTEX_T dumpUnalLock_; | |
| MUTEX_T dumpUnalLockPE_; // _1 and _2 | |
| MUTEX_T dumpMaxLock_; | |
| MUTEX_T dumpMaxLockPE_; // _1 and _2 | |
| // false -> no dumping | |
| bool dumpAlignFlag_; | |
| bool dumpUnalignFlag_; | |
| bool dumpMaxedFlag_; | |
| volatile bool first_; /// true -> first hit hasn't yet been reported | |
| volatile uint64_t *ptNumAligned_; | |
| volatile uint64_t *ptNumReported_; | |
| volatile uint64_t *ptNumReportedPaired_; | |
| volatile uint64_t *ptNumUnaligned_; | |
| volatile uint64_t *ptNumMaxed_; | |
| bool quiet_; /// true -> don't print alignment stats at the end | |
| }; | |
| /** | |
| * A per-thread wrapper for a HitSink. Incorporates state that a | |
| * single search thread cares about. | |
| */ | |
| class HitSinkPerThread { | |
| public: | |
| explicit HitSinkPerThread( | |
| HitSink& sink, | |
| uint32_t max, | |
| uint32_t n, | |
| int defaultMapq, | |
| size_t threadId) : | |
| _sink(sink), | |
| _bestRemainingStratum(0), | |
| _numValidHits(0llu), | |
| _hits(), | |
| _bufferedHits(), | |
| hitsForThisRead_(), | |
| _max(max), | |
| _n(n), | |
| defaultMapq_(defaultMapq), | |
| threadId_(threadId) | |
| { | |
| sink.addWrapper(); | |
| assert_gt(_n, 0); | |
| } | |
| virtual ~HitSinkPerThread() { } | |
| /// Return the vector of retained hits | |
| vector<Hit>& retainedHits() { return _hits; } | |
| /// Finalize current read | |
| virtual uint32_t finishRead(PatternSourcePerThread& p, bool report, bool dump) { | |
| uint32_t ret = finishReadImpl(); | |
| _bestRemainingStratum = 0; | |
| if(!report) { | |
| _bufferedHits.clear(); | |
| return 0; | |
| } | |
| bool maxed = (ret > _max); | |
| bool unal = (ret == 0); | |
| if(dump && (unal || maxed)) { | |
| // Either no reportable hits were found or the number of | |
| // reportable hits exceeded the -m limit specified by the | |
| // user | |
| assert(ret == 0 || ret > _max); | |
| if(maxed) _sink.dumpMaxed(p); | |
| else _sink.dumpUnal(p); | |
| } | |
| ret = 0; | |
| if(maxed) { | |
| // Report that the read maxed-out; useful for chaining output | |
| if(dump) _sink.reportMaxed(_bufferedHits, threadId_, p); | |
| _bufferedHits.clear(); | |
| } else if(unal) { | |
| // Report that the read failed to align; useful for chaining output | |
| if(dump) _sink.reportUnaligned(threadId_, p); | |
| } else { | |
| // Flush buffered hits | |
| assert_gt(_bufferedHits.size(), 0); | |
| if(_bufferedHits.size() > _n) { | |
| _bufferedHits.resize(_n); | |
| } | |
| int mapq = defaultMapq_; | |
| int xms = (int)(_bufferedHits.size()); | |
| const bool paired = p.bufa().mate > 0; | |
| if(paired) { | |
| xms /= 2; | |
| } | |
| xms++; | |
| _sink.reportHits(NULL, &_bufferedHits, 0, _bufferedHits.size(), | |
| threadId_, mapq, xms, true); | |
| _sink.dumpAlign(p); | |
| ret = (uint32_t)_bufferedHits.size(); | |
| _bufferedHits.clear(); | |
| } | |
| assert_eq(0, _bufferedHits.size()); | |
| return ret; | |
| } | |
| virtual uint32_t finishReadImpl() = 0; | |
| /** | |
| * Add a hit to the internal buffer. Not yet reporting the hits. | |
| */ | |
| virtual void bufferHit(const Hit& h, int stratum) { | |
| #ifndef NDEBUG | |
| // Ensure all buffered hits have the same patid | |
| for(size_t i = 1; i < _bufferedHits.size(); i++) { | |
| assert_eq(_bufferedHits[0].patId, _bufferedHits[i].patId); | |
| } | |
| #endif | |
| _bufferedHits.push_back(h); | |
| } | |
| /** | |
| * Concrete subclasses override this to (possibly) report a hit and | |
| * return true iff the caller should continue to report more hits. | |
| */ | |
| virtual bool reportHit(const Hit& h, int stratum) { | |
| assert(h.repOk()); | |
| _numValidHits++; | |
| return true; | |
| } | |
| /// Return the number of valid hits so far. | |
| uint64_t numValidHits() { return _numValidHits; } | |
| /** | |
| * Return true if there are no more reportable hits. | |
| */ | |
| bool finishedWithStratum(int stratum) { | |
| bool ret = finishedWithStratumImpl(stratum); | |
| _bestRemainingStratum = stratum+1; | |
| return ret; | |
| } | |
| /** | |
| * Use the given set of hits as a starting point. By default, we don't | |
| */ | |
| virtual bool setHits(HitSet& hs) { | |
| if(!hs.empty()) { | |
| cerr << "Error: default setHits() called with non-empty HitSet" << endl; | |
| throw 1; | |
| } | |
| return false; | |
| } | |
| /** | |
| * Return true if there are no reportable hits with the given cost | |
| * (or worse). | |
| */ | |
| virtual bool irrelevantCost(uint16_t cost) const { | |
| return false; | |
| } | |
| /** | |
| * Concrete subclasses override this to determine whether the | |
| * search routine should keep searching after having finished | |
| * reporting all alignments at the given stratum. | |
| */ | |
| virtual bool finishedWithStratumImpl(int stratum) = 0; | |
| /// The mhits maximum | |
| uint32_t overThresh() { return _max; } | |
| /// Whether this thread, for this read, knows that we have already | |
| /// exceeded the mhits maximum | |
| bool exceededOverThresh() { return hitsForThisRead_ > _max; } | |
| /// Return whether we span strata | |
| virtual bool spanStrata() = 0; | |
| /// Return whether we report only the best possible hits | |
| virtual bool best() = 0; | |
| /** | |
| * Return true iff the underlying HitSink dumps unaligned or | |
| * maxed-out reads. | |
| */ | |
| bool dumpsReads() const { | |
| return _sink.dumpsReads(); | |
| } | |
| /** | |
| * Return true iff there are currently no buffered hits. | |
| */ | |
| bool empty() const { | |
| return _bufferedHits.empty(); | |
| } | |
| /** | |
| * Return the number of currently buffered hits. | |
| */ | |
| bool size() const { | |
| return _bufferedHits.size(); | |
| } | |
| /** | |
| * Return max # hits to report (*2 in paired-end mode because mates | |
| * count separately) | |
| */ | |
| virtual uint32_t maxHits() const { | |
| return _n; | |
| } | |
| protected: | |
| HitSink& _sink; /// Ultimate destination of reported hits | |
| /// Least # mismatches in alignments that will be reported in the | |
| /// future. Updated by the search routine. | |
| int _bestRemainingStratum; | |
| /// # hits reported to this HitSink so far (not all of which were | |
| /// necesssary reported to _sink) | |
| uint64_t _numValidHits; | |
| vector<Hit> _hits; /// Repository for retained hits | |
| /// Buffered hits, to be reported and flushed at end of read-phase | |
| vector<Hit> _bufferedHits; | |
| // Following variables are declared in the parent but maintained in | |
| // the concrete subcalsses | |
| uint32_t hitsForThisRead_; /// # hits for this read so far | |
| uint32_t _max; /// don't report any hits if there were > _max | |
| uint32_t _n; /// report at most _n hits | |
| int defaultMapq_; | |
| size_t threadId_; | |
| }; | |
| /** | |
| * Abstract parent factory for HitSinkPerThreads. | |
| */ | |
| class HitSinkPerThreadFactory { | |
| public: | |
| virtual ~HitSinkPerThreadFactory() { } | |
| virtual HitSinkPerThread* create() const = 0; | |
| virtual HitSinkPerThread* createMult(uint32_t m) const = 0; | |
| /// Free memory associated with a per-thread hit sink | |
| virtual void destroy(HitSinkPerThread* sink) const { | |
| assert(sink != NULL); | |
| // Free the HitSinkPerThread | |
| delete sink; | |
| } | |
| }; | |
| /** | |
| * Report first N good alignments encountered; trust search routine | |
| * to try alignments in something approximating a best-first order. | |
| * Best used in combination with a stringent alignment policy. | |
| */ | |
| class NGoodHitSinkPerThread : public HitSinkPerThread { | |
| public: | |
| NGoodHitSinkPerThread( | |
| HitSink& sink, | |
| uint32_t n, | |
| uint32_t max, | |
| int defaultMapq, | |
| size_t threadId) : | |
| HitSinkPerThread(sink, max, n, defaultMapq, threadId) | |
| { } | |
| virtual bool spanStrata() { | |
| return true; // we span strata | |
| } | |
| virtual bool best() { | |
| return false; // we settle for "good" hits | |
| } | |
| /// Finalize current read | |
| virtual uint32_t finishReadImpl() { | |
| uint32_t ret = hitsForThisRead_; | |
| hitsForThisRead_ = 0; | |
| return ret; | |
| } | |
| /** | |
| * Report and then return true if we've already reported N good | |
| * hits. Ignore the stratum - it's not relevant for finding "good" | |
| * hits. | |
| */ | |
| virtual bool reportHit(const Hit& h, int stratum) { | |
| HitSinkPerThread::reportHit(h, stratum); | |
| hitsForThisRead_++; | |
| if(hitsForThisRead_ > _max) { | |
| return true; // done - report nothing | |
| } | |
| //if(hitsForThisRead_ <= _n) { | |
| // Only report hit if we haven't | |
| bufferHit(h, stratum); | |
| //} | |
| if(hitsForThisRead_ == _n && | |
| (_max == 0xffffffff || _max < _n)) | |
| { | |
| return true; // already reported N good hits and max isn't set; stop! | |
| } | |
| return false; // not at N or max yet; keep going | |
| } | |
| /** | |
| * Always return true; search routine should only stop if it's | |
| * already reported N hits. | |
| */ | |
| virtual bool finishedWithStratumImpl(int stratum) { return false; } | |
| }; | |
| /** | |
| * Concrete factory for FirstNGoodHitSinkPerThreads. | |
| */ | |
| class NGoodHitSinkPerThreadFactory : public HitSinkPerThreadFactory { | |
| public: | |
| NGoodHitSinkPerThreadFactory( | |
| HitSink& sink, | |
| uint32_t n, | |
| uint32_t max, | |
| int defaultMapq, | |
| size_t threadId) : | |
| sink_(sink), | |
| n_(n), | |
| max_(max), | |
| defaultMapq_(defaultMapq), | |
| threadId_(threadId) { } | |
| /** | |
| * Allocate a new NGoodHitSinkPerThread object on the heap, | |
| * using the parameters given in the constructor. | |
| */ | |
| virtual HitSinkPerThread* create() const { | |
| return new NGoodHitSinkPerThread(sink_, n_, max_, defaultMapq_, threadId_); | |
| } | |
| virtual HitSinkPerThread* createMult(uint32_t m) const { | |
| uint32_t max = max_ * (max_ == 0xffffffff ? 1 : m); | |
| uint32_t n = n_ * (n_ == 0xffffffff ? 1 : m); | |
| return new NGoodHitSinkPerThread(sink_, n, max, defaultMapq_, threadId_); | |
| } | |
| private: | |
| HitSink& sink_; | |
| uint32_t n_; | |
| uint32_t max_; | |
| int defaultMapq_; | |
| size_t threadId_; | |
| }; | |
| /** | |
| * Report the first N best alignments encountered in a single | |
| * alignment stratum assuming that we're receiving the alignments in | |
| * best-first order. | |
| */ | |
| class NBestFirstStratHitSinkPerThread : public HitSinkPerThread { | |
| public: | |
| NBestFirstStratHitSinkPerThread( | |
| HitSink& sink, | |
| uint32_t n, | |
| uint32_t max, | |
| uint32_t mult, | |
| int defaultMapq, | |
| size_t threadId) : | |
| HitSinkPerThread(sink, max, n, defaultMapq, threadId), | |
| bestStratum_(999), | |
| mult_(mult) { } | |
| /** | |
| * false -> we do not allow strata to be spanned | |
| */ | |
| virtual bool spanStrata() { | |
| return false; // we do not span strata | |
| } | |
| /** | |
| * true -> we report best hits | |
| */ | |
| virtual bool best() { | |
| return true; | |
| } | |
| /** | |
| * Report and then return false if we've already reported N. | |
| */ | |
| virtual bool reportHit(const Hit& h, int stratum) { | |
| HitSinkPerThread::reportHit(h, stratum); | |
| // This hit is within th best possible remaining stratum, | |
| // so it should definitely count | |
| hitsForThisRead_++; | |
| // It doesn't exceed the limit, so buffer it | |
| if(stratum < bestStratum_) { | |
| bestStratum_ = stratum; | |
| } | |
| if(hitsForThisRead_ > _max) { | |
| return true; // done - report nothing | |
| } | |
| //if(hitsForThisRead_ <= _n) { | |
| bufferHit(h, stratum); | |
| //} | |
| if(hitsForThisRead_ == _n && | |
| (_max == 0xffffffff || _max < _n)) | |
| { | |
| return true; // already reported N good hits; stop! | |
| } | |
| return false; // not at N yet; keep going | |
| } | |
| /** | |
| * Finalize current read by reporting any buffered hits from best | |
| * to worst until they're all reported or until we've reported all | |
| * N | |
| */ | |
| virtual uint32_t finishReadImpl() { | |
| uint32_t ret = hitsForThisRead_; | |
| hitsForThisRead_ = 0; | |
| bestStratum_ = 999; | |
| const size_t sz = _bufferedHits.size(); | |
| for(size_t i = 0; i < sz; i++) { | |
| // Set 'oms' according to the number of other alignments | |
| // at this stratum | |
| _bufferedHits[i].oms = ((uint32_t)sz / mult_) - 1; | |
| } | |
| return ret; | |
| } | |
| /** | |
| * If we had any alignments at all and we're now moving on to a new | |
| * stratum, then we're done. | |
| */ | |
| virtual bool finishedWithStratumImpl(int stratum) { | |
| return hitsForThisRead_ > 0; | |
| } | |
| /** | |
| * If there have been any hits reported so far, classify any | |
| * subsequent alignments with higher strata as irrelevant. | |
| */ | |
| virtual bool irrelevantCost(uint16_t cost) { | |
| if(hitsForThisRead_) { | |
| // irrelevant iff at worse stratum | |
| return ((int)cost >> 14) > bestStratum_; | |
| } | |
| return false; | |
| } | |
| private: | |
| int bestStratum_; /// best stratum observed so far | |
| uint32_t mult_; /// number of batched-up alignments | |
| }; | |
| /** | |
| * Concrete factory for NBestStratHitSinkPerThread. | |
| */ | |
| class NBestFirstStratHitSinkPerThreadFactory : public HitSinkPerThreadFactory { | |
| public: | |
| NBestFirstStratHitSinkPerThreadFactory( | |
| HitSink& sink, | |
| uint32_t n, | |
| uint32_t max, | |
| int defaultMapq, | |
| size_t threadId) : | |
| sink_(sink), | |
| n_(n), | |
| max_(max), | |
| defaultMapq_(defaultMapq), | |
| threadId_(threadId) { } | |
| /** | |
| * Allocate a new NGoodHitSinkPerThread object on the heap, | |
| * using the parameters given in the constructor. | |
| */ | |
| virtual HitSinkPerThread* create() const { | |
| return new NBestFirstStratHitSinkPerThread(sink_, n_, max_, 1, defaultMapq_, threadId_); | |
| } | |
| virtual HitSinkPerThread* createMult(uint32_t m) const { | |
| uint32_t max = max_ * (max_ == 0xffffffff ? 1 : m); | |
| uint32_t n = n_ * (n_ == 0xffffffff ? 1 : m); | |
| return new NBestFirstStratHitSinkPerThread(sink_, n, max, m, defaultMapq_, threadId_); | |
| } | |
| private: | |
| HitSink& sink_; | |
| uint32_t n_; | |
| uint32_t max_; | |
| int defaultMapq_; | |
| size_t threadId_; | |
| }; | |
| /** | |
| * Report all valid alignments. | |
| */ | |
| class AllHitSinkPerThread : public HitSinkPerThread { | |
| public: | |
| AllHitSinkPerThread( | |
| HitSink& sink, | |
| uint32_t max, | |
| int defaultMapq, | |
| size_t threadId) : | |
| HitSinkPerThread(sink, max, 0xffffffff, defaultMapq, threadId) { } | |
| virtual bool spanStrata() { | |
| return true; // we span strata | |
| } | |
| virtual bool best() { | |
| return true; // we report "best" hits | |
| } | |
| /** | |
| * Report and always return true; we're finiding all hits so that | |
| * search routine should always continue. | |
| */ | |
| virtual bool reportHit(const Hit& h, int stratum) { | |
| HitSinkPerThread::reportHit(h, stratum); | |
| hitsForThisRead_++; | |
| if(hitsForThisRead_ > _max) { | |
| return true; // done - report nothing | |
| } | |
| bufferHit(h, stratum); | |
| return false; // reporting all; always keep going | |
| } | |
| /** | |
| * Finalize; do nothing because we haven't buffered anything | |
| */ | |
| virtual uint32_t finishReadImpl() { | |
| uint32_t ret = hitsForThisRead_; | |
| hitsForThisRead_ = 0; | |
| return ret; | |
| } | |
| /** | |
| * Always return false; search routine should not stop. | |
| */ | |
| virtual bool finishedWithStratumImpl(int stratum) { return false; } | |
| }; | |
| /** | |
| * Concrete factory for AllHitSinkPerThread. | |
| */ | |
| class AllHitSinkPerThreadFactory : public HitSinkPerThreadFactory { | |
| public: | |
| AllHitSinkPerThreadFactory( | |
| HitSink& sink, | |
| uint32_t max, | |
| int defaultMapq, | |
| size_t threadId) : | |
| sink_(sink), | |
| max_(max), | |
| defaultMapq_(defaultMapq), | |
| threadId_(threadId) { } | |
| /** | |
| * Allocate a new NGoodHitSinkPerThread object on the heap, | |
| * using the parameters given in the constructor. | |
| */ | |
| virtual HitSinkPerThread* create() const { | |
| return new AllHitSinkPerThread(sink_, max_, defaultMapq_, threadId_); | |
| } | |
| virtual HitSinkPerThread* createMult(uint32_t m) const { | |
| uint32_t max = max_ * (max_ == 0xffffffff ? 1 : m); | |
| return new AllHitSinkPerThread(sink_, max, defaultMapq_, threadId_); | |
| } | |
| private: | |
| HitSink& sink_; | |
| uint32_t max_; | |
| int defaultMapq_; | |
| size_t threadId_; | |
| }; | |
| /** | |
| * Sink that prints lines like this: | |
| * (pat-id)[-|+]:<hit1-text-id,hit2-text-offset>,<hit2-text-id... | |
| * | |
| * Activated with --concise | |
| */ | |
| class ConciseHitSink : public HitSink { | |
| public: | |
| /** | |
| * Construct a single-stream ConciseHitSink (default) | |
| */ | |
| ConciseHitSink( | |
| OutFileBuf* out, | |
| int offBase, | |
| const std::string& dumpAl, | |
| const std::string& dumpUnal, | |
| const std::string& dumpMax, | |
| bool onePairFile, | |
| bool sampleMax, | |
| std::vector<std::string>* refnames, | |
| size_t nthreads, | |
| int perThreadBufSize, | |
| bool reportOpps = false) : | |
| HitSink( | |
| out, | |
| dumpAl, | |
| dumpUnal, | |
| dumpMax, | |
| onePairFile, | |
| sampleMax, | |
| refnames, | |
| nthreads, | |
| perThreadBufSize), | |
| _reportOpps(reportOpps), | |
| offBase_(offBase) { } | |
| /** | |
| * Construct a multi-stream ConciseHitSink with one stream per | |
| * reference string (see --refout) | |
| */ | |
| ConciseHitSink( | |
| size_t numOuts, | |
| int offBase, | |
| const std::string& dumpAl, | |
| const std::string& dumpUnal, | |
| const std::string& dumpMax, | |
| bool onePairFile, | |
| bool sampleMax, | |
| std::vector<std::string>* refnames, | |
| size_t nthreads, | |
| int perThreadBufSize, | |
| bool reportOpps = false) : | |
| HitSink( | |
| numOuts, | |
| dumpAl, | |
| dumpUnal, | |
| dumpMax, | |
| onePairFile, | |
| sampleMax, | |
| refnames, | |
| nthreads, | |
| perThreadBufSize), | |
| _reportOpps(reportOpps), | |
| offBase_(offBase) { } | |
| /** | |
| * Append a verbose, readable hit to the given output stream. | |
| */ | |
| static void append( | |
| BTString& o, | |
| const Hit& h, | |
| int offBase, | |
| bool reportOpps) | |
| { | |
| o << h.patId; | |
| if(h.mate > 0) { | |
| assert(h.mate == 1 || h.mate == 2); | |
| o << '/' << (int)h.mate; | |
| } | |
| o << (h.fw? '+' : '-') << ':'; | |
| // .first is text id, .second is offset | |
| o << '<' << h.h.first << ',' << (h.h.second + offBase) << ',' << h.mms.count(); | |
| if(reportOpps) { | |
| o << ',' << h.oms; | |
| } | |
| o << '>' << '\n'; | |
| } | |
| /** | |
| * Append a verbose, readable hit to the given output stream. | |
| */ | |
| virtual void append(BTString& o, const Hit& h, int mapq, int xms) { | |
| ConciseHitSink::append(o, h, this->offBase_, this->_reportOpps); | |
| } | |
| protected: | |
| private: | |
| bool _reportOpps; | |
| int offBase_; /// Add this to reference offsets before outputting. | |
| /// (An easy way to make things 1-based instead of | |
| /// 0-based) | |
| }; | |
| /** | |
| * Print the given string. If ws = true, print only up to and not | |
| * including the first space or tab. Useful for printing reference | |
| * names. | |
| */ | |
| inline void printUptoWs(BTString& o, const std::string& str, bool ws) { | |
| if(!ws) { | |
| o << str; | |
| } else { | |
| size_t pos = str.find_first_of(" \t"); | |
| if(pos != string::npos) { | |
| o << str.substr(0, pos); | |
| } else { | |
| o << str; | |
| } | |
| } | |
| } | |
| /** | |
| * Sink that prints lines like this: | |
| * pat-name \t [-|+] \t ref-name \t ref-off \t pat \t qual \t #-alt-hits \t mm-list | |
| */ | |
| class VerboseHitSink : public HitSink { | |
| public: | |
| /** | |
| * Construct a single-stream VerboseHitSink (default) | |
| */ | |
| VerboseHitSink( | |
| OutFileBuf* out, | |
| int offBase, | |
| bool colorSeq, | |
| bool colorQual, | |
| bool printCost, | |
| const Bitset& suppressOuts, | |
| ReferenceMap *rmap, | |
| AnnotationMap *amap, | |
| bool fullRef, | |
| const std::string& dumpAl, | |
| const std::string& dumpUnal, | |
| const std::string& dumpMax, | |
| bool onePairFile, | |
| bool sampleMax, | |
| std::vector<std::string>* refnames, | |
| size_t nthreads, | |
| int perThreadBufSize, | |
| int partition = 0) : | |
| HitSink( | |
| out, | |
| dumpAl, | |
| dumpUnal, | |
| dumpMax, | |
| onePairFile, | |
| sampleMax, | |
| refnames, | |
| nthreads, | |
| perThreadBufSize), | |
| partition_(partition), | |
| offBase_(offBase), | |
| colorSeq_(colorSeq), | |
| colorQual_(colorQual), | |
| cost_(printCost), | |
| suppress_(suppressOuts), | |
| fullRef_(fullRef), | |
| rmap_(rmap), amap_(amap) | |
| { } | |
| /** | |
| * Construct a multi-stream VerboseHitSink with one stream per | |
| * reference string (see --refout) | |
| */ | |
| VerboseHitSink( | |
| size_t numOuts, | |
| int offBase, | |
| bool colorSeq, | |
| bool colorQual, | |
| bool printCost, | |
| const Bitset& suppressOuts, | |
| ReferenceMap *rmap, | |
| AnnotationMap *amap, | |
| bool fullRef, | |
| const std::string& dumpAl, | |
| const std::string& dumpUnal, | |
| const std::string& dumpMax, | |
| bool onePairFile, | |
| bool sampleMax, | |
| std::vector<std::string>* refnames, | |
| size_t nthreads, | |
| int perThreadBufSize, | |
| int partition = 0) : | |
| HitSink( | |
| numOuts, | |
| dumpAl, | |
| dumpUnal, | |
| dumpMax, | |
| onePairFile, | |
| sampleMax, | |
| refnames, | |
| nthreads, | |
| perThreadBufSize), | |
| partition_(partition), | |
| offBase_(offBase), | |
| colorSeq_(colorSeq), | |
| colorQual_(colorQual), | |
| cost_(printCost), | |
| suppress_(64), | |
| fullRef_(fullRef), | |
| rmap_(rmap), | |
| amap_(amap) | |
| { } | |
| static void append( | |
| BTString& o, | |
| const Hit& h, | |
| const vector<string>* refnames, | |
| ReferenceMap *rmap, | |
| AnnotationMap *amap, | |
| bool fullRef, | |
| int partition, | |
| int offBase, | |
| bool colorSeq, | |
| bool colorQual, | |
| bool cost, | |
| const Bitset& suppress); | |
| /** | |
| * Append a verbose, readable hit to the output stream | |
| * corresponding to the hit. | |
| */ | |
| virtual void append(BTString& o, const Hit& h, int mapq, int xms) { | |
| VerboseHitSink::append(o, h, _refnames, rmap_, amap_, | |
| fullRef_, partition_, offBase_, | |
| colorSeq_, colorQual_, cost_, | |
| suppress_); | |
| } | |
| /** | |
| * See hit.cpp | |
| */ | |
| virtual void reportMaxed( | |
| vector<Hit>& hs, | |
| size_t threadId, | |
| PatternSourcePerThread& p); | |
| private: | |
| int partition_; /// partition size, or 0 if partitioning is disabled | |
| int offBase_; /// Add this to reference offsets before outputting. | |
| /// (An easy way to make things 1-based instead of | |
| /// 0-based) | |
| bool colorSeq_; /// true -> print colorspace alignment sequence in colors | |
| bool colorQual_; /// true -> print colorspace quals as originals, not decoded | |
| bool cost_; /// true -> print statum and cost | |
| Bitset suppress_; /// output fields to suppress | |
| bool fullRef_; /// print full reference name | |
| ReferenceMap *rmap_; /// mapping to reference coordinate system. | |
| AnnotationMap *amap_; /// | |
| }; | |
| /** | |
| * Sink that does nothing. | |
| */ | |
| class StubHitSink : public HitSink { | |
| public: | |
| StubHitSink() : HitSink(new OutFileBuf(".tmp"), "", "", "", false, false, NULL, 1, 1) { } | |
| virtual void append(BTString& o, const Hit& h, int mapq, int xms) { } | |
| }; | |
| #endif /*HIT_H_*/ |