Permalink
Cannot retrieve contributors at this time
Fetching contributors…
| /** | |
| * aligner.h | |
| * | |
| * A generic class providing a stateful way to find alignments. | |
| */ | |
| #ifndef ALIGNER_H_ | |
| #define ALIGNER_H_ | |
| #include <iostream> | |
| #include <set> | |
| #include <stdint.h> | |
| #include "seqan/sequence.h" | |
| #include "assert_helpers.h" | |
| #include "ebwt.h" | |
| #include "pat.h" | |
| #include "range.h" | |
| #include "range_source.h" | |
| #include "range_chaser.h" | |
| #include "ref_aligner.h" | |
| #include "reference.h" | |
| #include "aligner_metrics.h" | |
| #include "search_globals.h" | |
| #ifdef PER_THREAD_TIMING | |
| /// Based on http://stackoverflow.com/questions/16862620/numa-get-current-node-core | |
| void get_cpu_and_node(int& cpu, int& node) { | |
| unsigned long a,d,c; | |
| __asm__ volatile("rdtscp" : "=a" (a), "=d" (d), "=c" (c)); | |
| node = (c & 0xFFF000)>>12; | |
| cpu = c & 0xFFF; | |
| } | |
| #endif | |
| /** | |
| * State machine for carrying out an alignment, which usually consists | |
| * of a series of phases that conduct different alignments using | |
| * different backtracking constraints. | |
| * | |
| * Each Aligner should have a dedicated PatternSourcePerThread. | |
| */ | |
| class Aligner { | |
| public: | |
| Aligner(bool _done, bool rangeMode) : | |
| done(_done), patsrc_(NULL), bufa_(NULL), bufb_(NULL), | |
| rangeMode_(rangeMode) | |
| { } | |
| virtual ~Aligner() { } | |
| /// Advance the range search by one memory op | |
| virtual bool advance() = 0; | |
| /// Prepare Aligner for the next read | |
| virtual void setQuery(PatternSourcePerThread *patsrc) { | |
| assert(patsrc != NULL); | |
| patsrc_ = patsrc; | |
| bufa_ = &patsrc->bufa(); | |
| assert(bufa_ != NULL); | |
| bufb_ = &patsrc->bufb(); | |
| alen_ = bufa_->length(); | |
| blen_ = (bufb_ != NULL) ? bufb_->length() : 0; | |
| rand_.init(bufa_->seed); | |
| } | |
| /** | |
| * Set to true if all searching w/r/t the current query is | |
| * finished or if there is no current query. | |
| */ | |
| bool done; | |
| protected: | |
| // Current read pair | |
| PatternSourcePerThread* patsrc_; | |
| Read* bufa_; | |
| uint32_t alen_; | |
| Read* bufb_; | |
| uint32_t blen_; | |
| bool rangeMode_; | |
| RandomSource rand_; | |
| }; | |
| /** | |
| * Abstract parent factory class for constructing aligners of all kinds. | |
| */ | |
| class AlignerFactory { | |
| public: | |
| virtual ~AlignerFactory() { } | |
| virtual Aligner* create() const = 0; | |
| /** | |
| * Allocate a vector of n Aligners; use destroy(std::vector...) to | |
| * free the memory. | |
| */ | |
| virtual std::vector<Aligner*>* create(uint32_t n) const { | |
| std::vector<Aligner*>* v = new std::vector<Aligner*>; | |
| for(uint32_t i = 0; i < n; i++) { | |
| v->push_back(create()); | |
| assert(v->back() != NULL); | |
| } | |
| return v; | |
| } | |
| /// Free memory associated with the aligner | |
| virtual void destroy(Aligner* al) const { | |
| assert(al != NULL); | |
| // Free the Aligner | |
| delete al; | |
| } | |
| /// Free memory associated with an aligner list | |
| virtual void destroy(std::vector<Aligner*>* als) const { | |
| assert(als != NULL); | |
| // Free all of the Aligners | |
| for(size_t i = 0; i < als->size(); i++) { | |
| if((*als)[i] != NULL) { | |
| delete (*als)[i]; | |
| (*als)[i] = NULL; | |
| } | |
| } | |
| // Free the vector | |
| delete als; | |
| } | |
| }; | |
| /** | |
| * Coordinates multiple aligners of the same type (i.e. either all | |
| * single-end or all paired-end). | |
| */ | |
| class MultiAligner { | |
| public: | |
| MultiAligner( | |
| uint32_t n, | |
| uint32_t qUpto, | |
| const AlignerFactory& alignFact, | |
| const PatternSourcePerThreadFactory& patsrcFact) : | |
| n_(n), qUpto_(qUpto), | |
| alignFact_(alignFact), patsrcFact_(patsrcFact), | |
| aligners_(NULL), patsrcs_(NULL) | |
| { | |
| aligners_ = alignFact_.create(n_); | |
| assert(aligners_ != NULL); | |
| patsrcs_ = patsrcFact_.create(n_); | |
| assert(patsrcs_ != NULL); | |
| } | |
| /// Free memory associated with the aligners and their pattern sources. | |
| virtual ~MultiAligner() { | |
| alignFact_.destroy(aligners_); | |
| patsrcFact_.destroy(patsrcs_); | |
| } | |
| /** | |
| * Advance an array of aligners in parallel, using prefetches to | |
| * try to hide all the latency. | |
| */ | |
| void run() { | |
| bool saw_last_read = false; | |
| while(!saw_last_read) { | |
| for(uint32_t i = 0; i < n_; i++) { | |
| if(!(*aligners_)[i]->done) { | |
| // Advance an aligner already in progress | |
| (*aligners_)[i]->advance(); | |
| } else { | |
| if(saw_last_read) { | |
| break; | |
| } | |
| // Get a new read and initialize an aligner with it | |
| pair<bool, bool> ret = (*patsrcs_)[i]->nextReadPair(); | |
| saw_last_read = ret.second; | |
| if(ret.first && (*patsrcs_)[i]->rdid() < qUpto_) { | |
| (*aligners_)[i]->setQuery((*patsrcs_)[i]); | |
| } | |
| } | |
| } | |
| } | |
| } | |
| protected: | |
| uint32_t n_; /// Number of aligners | |
| uint32_t qUpto_; /// Number of reads to align before stopping | |
| const AlignerFactory& alignFact_; | |
| const PatternSourcePerThreadFactory& patsrcFact_; | |
| std::vector<Aligner *>* aligners_; | |
| std::vector<PatternSourcePerThread *>* patsrcs_; | |
| }; | |
| /** | |
| * Coordinates multiple single-end and paired-end aligners, routing | |
| * reads to one or the other type as appropriate. | |
| */ | |
| class MixedMultiAligner { | |
| public: | |
| MixedMultiAligner( | |
| uint32_t n, | |
| uint32_t qUpto, | |
| const AlignerFactory& alignSEFact, | |
| const AlignerFactory& alignPEFact, | |
| const PatternSourcePerThreadFactory& patsrcFact) : | |
| n_(n), qUpto_(qUpto), | |
| alignSEFact_(alignSEFact), | |
| alignPEFact_(alignPEFact), | |
| patsrcFact_(patsrcFact), | |
| alignersSE_(NULL), | |
| alignersPE_(NULL), | |
| seOrPe_(NULL), | |
| patsrcs_(NULL) | |
| { | |
| // Instantiate all single-end aligners | |
| alignersSE_ = alignSEFact_.create(n_); | |
| assert(alignersSE_ != NULL); | |
| // Instantiate all paired-end aligners | |
| alignersPE_ = alignPEFact_.create(n_); | |
| assert(alignersPE_ != NULL); | |
| // Allocate array of boolean flags indicating whether each of | |
| // the slots is currently using the single-end or paired-end | |
| // aligner | |
| seOrPe_ = new bool[n_]; | |
| for(uint32_t i = 0; i < n_; i++) { | |
| seOrPe_[i] = true; | |
| } | |
| // Instantiate all read sources | |
| patsrcs_ = patsrcFact_.create(n_); | |
| assert(patsrcs_ != NULL); | |
| } | |
| /// Free memory associated with the aligners and their pattern sources. | |
| virtual ~MixedMultiAligner() { | |
| alignSEFact_.destroy(alignersSE_); | |
| alignPEFact_.destroy(alignersPE_); | |
| patsrcFact_.destroy(patsrcs_); | |
| delete[] seOrPe_; | |
| } | |
| /** | |
| * Advance an array of aligners in parallel, using prefetches to | |
| * try to hide all the latency. | |
| */ | |
| void run(bool verbose = false, int tid = -1) { | |
| #ifdef PER_THREAD_TIMING | |
| uint64_t ncpu_changeovers = 0; | |
| uint64_t nnuma_changeovers = 0; | |
| int current_cpu = 0, current_node = 0; | |
| get_cpu_and_node(current_cpu, current_node); | |
| std::stringstream ss; | |
| std::string msg; | |
| ss << "thread: " << tid << " time: "; | |
| msg = ss.str(); | |
| Timer timer(std::cout, msg.c_str()); | |
| #endif | |
| bool first = true; | |
| bool saw_last_read = false; | |
| if(n_ == 1) { | |
| Aligner *al = seOrPe_[0] ? (*alignersSE_)[0] : (*alignersPE_)[0]; | |
| PatternSourcePerThread *ps = (*patsrcs_)[0]; | |
| while(true) { | |
| #ifdef PER_THREAD_TIMING | |
| int cpu = 0, node = 0; | |
| get_cpu_and_node(cpu, node); | |
| if(cpu != current_cpu) { | |
| ncpu_changeovers++; | |
| current_cpu = cpu; | |
| } | |
| if(node != current_node) { | |
| nnuma_changeovers++; | |
| current_node = node; | |
| } | |
| #endif | |
| if(!first && !al->done) { | |
| // Advance an aligner already in progress; this is | |
| // the common case | |
| al->advance(); | |
| } else { | |
| if(saw_last_read) { | |
| break; | |
| } | |
| // Get a new read | |
| pair<bool, bool> ret = ps->nextReadPair(); | |
| saw_last_read = ret.second; | |
| if(ret.first && ps->rdid() < qUpto_) { | |
| if(ps->paired()) { | |
| // Read currently in buffer is paired-end | |
| (*alignersPE_)[0]->setQuery(ps); | |
| al = (*alignersPE_)[0]; | |
| seOrPe_[0] = false; // false -> paired | |
| } else { | |
| // Read currently in buffer is single-end | |
| (*alignersSE_)[0]->setQuery(ps); | |
| al = (*alignersSE_)[0]; | |
| seOrPe_[0] = true; // true = unpaired | |
| } | |
| } | |
| } | |
| first = false; | |
| } | |
| } else { | |
| while(true) { | |
| #ifdef PER_THREAD_TIMING | |
| int cpu = 0, node = 0; | |
| get_cpu_and_node(cpu, node); | |
| if(cpu != current_cpu) { | |
| ncpu_changeovers++; | |
| current_cpu = cpu; | |
| } | |
| if(node != current_node) { | |
| nnuma_changeovers++; | |
| current_node = node; | |
| } | |
| #endif | |
| bool saw_last_read = false; | |
| for(uint32_t i = 0; i < n_; i++) { | |
| Aligner *al = seOrPe_[i] ? (*alignersSE_)[i] : | |
| (*alignersPE_)[i]; | |
| if(!first && !al->done) { | |
| // Advance an aligner already in progress; this is | |
| // the common case | |
| al->advance(); | |
| } else { | |
| if(saw_last_read) { | |
| break; | |
| } | |
| // Feed a new read to a vacant aligner | |
| PatternSourcePerThread *ps = (*patsrcs_)[i]; | |
| // Get a new read | |
| pair<bool, bool> ret = ps->nextReadPair(); | |
| saw_last_read = ret.second; | |
| if(ps->rdid() < qUpto_ && ret.first) { | |
| if(ps->paired()) { | |
| // Read currently in buffer is paired-end | |
| (*alignersPE_)[i]->setQuery(ps); | |
| seOrPe_[i] = false; // false -> paired | |
| } else { | |
| // Read currently in buffer is single-end | |
| (*alignersSE_)[i]->setQuery(ps); | |
| seOrPe_[i] = true; // true = unpaired | |
| } | |
| } | |
| } | |
| } | |
| first = false; | |
| } | |
| } | |
| #ifdef PER_THREAD_TIMING | |
| ss.str(""); | |
| ss.clear(); | |
| ss << "thread: " << tid << " cpu_changeovers: " << ncpu_changeovers << std::endl | |
| << "thread: " << tid << " node_changeovers: " << nnuma_changeovers << std::endl; | |
| std::cout << ss.str(); | |
| #endif | |
| } | |
| protected: | |
| uint32_t n_; /// Number of aligners | |
| uint32_t qUpto_; /// Number of reads to align before stopping | |
| const AlignerFactory& alignSEFact_; | |
| const AlignerFactory& alignPEFact_; | |
| const PatternSourcePerThreadFactory& patsrcFact_; | |
| std::vector<Aligner *>* alignersSE_; | |
| std::vector<Aligner *>* alignersPE_; | |
| bool * seOrPe_; | |
| std::vector<PatternSourcePerThread *>* patsrcs_; | |
| }; | |
| /** | |
| * An aligner for finding exact matches of unpaired reads. Always | |
| * tries the forward-oriented version of the read before the reverse- | |
| * oriented read. | |
| */ | |
| template<typename TRangeSource> | |
| class UnpairedAlignerV2 : public Aligner { | |
| typedef RangeSourceDriver<TRangeSource> TDriver; | |
| public: | |
| UnpairedAlignerV2( | |
| EbwtSearchParams<String<Dna> >* params, | |
| TDriver* driver, | |
| RangeChaser<String<Dna> >* rchase, | |
| HitSink& sink, | |
| const HitSinkPerThreadFactory& sinkPtFactory, | |
| HitSinkPerThread* sinkPt, | |
| vector<String<Dna5> >& os, | |
| const BitPairReference* refs, | |
| bool rangeMode, | |
| bool verbose, | |
| bool quiet, | |
| int maxBts, | |
| ChunkPool *pool, | |
| int *btCnt = NULL, | |
| AlignerMetrics *metrics = NULL) : | |
| Aligner(true, rangeMode), | |
| refs_(refs), | |
| doneFirst_(true), | |
| firstIsFw_(true), | |
| chase_(false), | |
| sinkPtFactory_(sinkPtFactory), | |
| sinkPt_(sinkPt), | |
| params_(params), | |
| rchase_(rchase), | |
| driver_(driver), | |
| verbose_(verbose), | |
| quiet_(quiet), | |
| maxBts_(maxBts), | |
| pool_(pool), | |
| btCnt_(btCnt), | |
| metrics_(metrics) | |
| { | |
| assert(pool_ != NULL); | |
| assert(sinkPt_ != NULL); | |
| assert(params_ != NULL); | |
| assert(driver_ != NULL); | |
| } | |
| virtual ~UnpairedAlignerV2() { | |
| delete driver_; driver_ = NULL; | |
| delete params_; params_ = NULL; | |
| delete rchase_; rchase_ = NULL; | |
| delete[] btCnt_; btCnt_ = NULL; | |
| sinkPtFactory_.destroy(sinkPt_); sinkPt_ = NULL; | |
| } | |
| /** | |
| * Prepare this aligner for the next read. | |
| */ | |
| virtual void setQuery(PatternSourcePerThread* patsrc) { | |
| Aligner::setQuery(patsrc); // set fields & random seed | |
| if(metrics_ != NULL) { | |
| metrics_->nextRead(patsrc->bufa().patFw); | |
| } | |
| pool_->reset(&patsrc->bufa().name, (uint32_t)patsrc->rdid()); | |
| if(patsrc->bufa().length() < 4) { | |
| if(!quiet_) { | |
| cerr << "Warning: Skipping read " << patsrc->bufa().name | |
| << " because it is less than 4 characters long" << endl; | |
| } | |
| this->done = true; | |
| sinkPt_->finishRead(*patsrc_, true, true); | |
| return; | |
| } | |
| driver_->setQuery(patsrc, NULL); | |
| this->done = driver_->done; | |
| doneFirst_ = false; | |
| // Reset #-backtrack countdown | |
| if(btCnt_ != NULL) *btCnt_ = maxBts_; | |
| if(sinkPt_->setHits(patsrc->bufa().hitset)) { | |
| this->done = true; | |
| sinkPt_->finishRead(*patsrc_, true, true); | |
| } | |
| // Grab a bit from the pseudo-random seed to determine whether | |
| // to start with forward or reverse complement | |
| firstIsFw_ = ((patsrc->bufa().seed & 0x10) == 0); | |
| chase_ = false; | |
| } | |
| /** | |
| * Helper for reporting an alignment. | |
| */ | |
| inline bool report(const Range& ra, | |
| TIndexOffU first, | |
| TIndexOffU second, | |
| uint32_t tlen) | |
| { | |
| bool ebwtFw = ra.ebwt->fw(); | |
| params_->setFw(ra.fw); | |
| assert_eq(bufa_->color, color); | |
| return params_->reportHit( | |
| ra.fw ? (ebwtFw? bufa_->patFw : bufa_->patFwRev) : | |
| (ebwtFw? bufa_->patRc : bufa_->patRcRev), | |
| ra.fw ? (ebwtFw? &bufa_->qual : &bufa_->qualRev) : | |
| (ebwtFw? &bufa_->qualRev : &bufa_->qual), | |
| &bufa_->name, | |
| bufa_->color, | |
| bufa_->primer, | |
| bufa_->trimc, | |
| colorExEnds, | |
| snpPhred, | |
| refs_, | |
| ra.ebwt->rmap(), | |
| ebwtFw, | |
| ra.mms, // mismatch positions | |
| ra.refcs, // reference characters for mms | |
| ra.numMms, // # mismatches | |
| make_pair(first, second), // position | |
| make_pair<TIndexOffU,TIndexOffU>(0, 0), // (bogus) mate position | |
| true, // (bogus) mate orientation | |
| 0, // (bogus) mate length | |
| make_pair(ra.top, ra.bot),// arrows | |
| tlen, // textlen | |
| alen_, // qlen | |
| ra.stratum, // alignment stratum | |
| ra.cost, // cost, including qual penalty | |
| ra.bot - ra.top - 1, // # other hits | |
| (uint32_t)patsrc_->rdid(),// pattern id | |
| bufa_->seed, // pseudo-random seed | |
| 0); // mate (0 = unpaired) | |
| } | |
| /** | |
| * Advance the aligner. Return true iff we're | |
| * done with this read. | |
| */ | |
| virtual bool advance() { | |
| assert(!this->done); | |
| if(chase_) { | |
| assert(!rangeMode_); | |
| assert(driver_->foundRange); | |
| assert(!sinkPt_->irrelevantCost(driver_->range().cost)); | |
| if(!rchase_->foundOff() && !rchase_->done) { | |
| rchase_->advance(); | |
| return false; | |
| } | |
| if(rchase_->foundOff()) { | |
| this->done = report(driver_->range(), rchase_->off().first, | |
| rchase_->off().second, rchase_->tlen()); | |
| rchase_->reset(); | |
| } else { | |
| assert(rchase_->done); | |
| // Forget this range; keep looking for ranges | |
| chase_ = false; | |
| driver_->foundRange = false; | |
| this->done = driver_->done; | |
| } | |
| } | |
| // Still advancing a | |
| if(!this->done && !chase_) { | |
| assert(!driver_->done || driver_->foundRange); | |
| if(driver_->foundRange) { | |
| const Range& ra = driver_->range(); | |
| assert(!sinkPt_->irrelevantCost(ra.cost)); | |
| assert(ra.repOk()); | |
| if(rangeMode_) { | |
| this->done = report(ra, ra.top, ra.bot, 0); | |
| driver_->foundRange = false; | |
| } else { | |
| rchase_->setTopBot(ra.top, ra.bot, alen_, rand_, ra.ebwt); | |
| if(rchase_->foundOff()) { | |
| this->done = report( | |
| ra, rchase_->off().first, | |
| rchase_->off().second, rchase_->tlen()); | |
| rchase_->reset(); | |
| } | |
| if(!rchase_->done && !sinkPt_->irrelevantCost(ra.cost)) { | |
| // Keep chasing this range | |
| chase_ = true; | |
| } else { | |
| driver_->foundRange = false; | |
| } | |
| } | |
| } else { | |
| this->done = sinkPt_->irrelevantCost(driver_->minCost); | |
| if(!this->done) { | |
| driver_->advance(ADV_COST_CHANGES); | |
| } else { | |
| // No longer necessarily true with chain input | |
| //assert(!sinkPt_->spanStrata()); | |
| } | |
| } | |
| if(driver_->done && !driver_->foundRange && !chase_) { | |
| this->done = true; | |
| } | |
| } | |
| if(this->done) { | |
| sinkPt_->finishRead(*patsrc_, true, true); | |
| } | |
| return this->done; | |
| } | |
| protected: | |
| // Reference sequences (needed for colorspace decoding) | |
| const BitPairReference* refs_; | |
| // Progress state | |
| bool doneFirst_; | |
| bool firstIsFw_; | |
| bool chase_; | |
| // Temporary HitSink; to be deleted | |
| const HitSinkPerThreadFactory& sinkPtFactory_; | |
| HitSinkPerThread* sinkPt_; | |
| // State for alignment | |
| EbwtSearchParams<String<Dna> >* params_; | |
| // State for getting alignments from ranges statefully | |
| RangeChaser<String<Dna> >* rchase_; | |
| // Range-finding state | |
| TDriver* driver_; | |
| bool verbose_; // be talkative | |
| bool quiet_; // don't print informational/warning info | |
| const int maxBts_; | |
| ChunkPool *pool_; | |
| int *btCnt_; | |
| AlignerMetrics *metrics_; | |
| }; | |
| /** | |
| * An aligner for finding paired alignments while operating entirely | |
| * within the Burrows-Wheeler domain. | |
| */ | |
| template<typename TRangeSource> | |
| class PairedBWAlignerV1 : public Aligner { | |
| typedef std::pair<TIndexOffU,TIndexOffU> UPair; | |
| typedef std::vector<UPair> UPairVec; | |
| typedef std::vector<Range> TRangeVec; | |
| typedef RangeSourceDriver<TRangeSource> TDriver; | |
| typedef std::pair<uint64_t, uint64_t> TU64Pair; | |
| typedef std::set<TU64Pair> TSetPairs; | |
| public: | |
| PairedBWAlignerV1( | |
| EbwtSearchParams<String<Dna> >* params, | |
| TDriver* driver1Fw, TDriver* driver1Rc, | |
| TDriver* driver2Fw, TDriver* driver2Rc, | |
| RefAligner<String<Dna5> >* refAligner, | |
| RangeChaser<String<Dna> >* rchase, | |
| HitSink& sink, | |
| const HitSinkPerThreadFactory& sinkPtFactory, | |
| HitSinkPerThread* sinkPt, | |
| bool fw1, bool fw2, | |
| uint32_t minInsert, | |
| uint32_t maxInsert, | |
| bool dontReconcile, | |
| uint32_t symCeiling, | |
| uint32_t mixedThresh, | |
| uint32_t mixedAttemptLim, | |
| const BitPairReference* refs, | |
| bool rangeMode, | |
| bool verbose, | |
| bool quiet, | |
| int maxBts, | |
| ChunkPool *pool, | |
| int *btCnt) : | |
| Aligner(true, rangeMode), | |
| refs_(refs), | |
| patsrc_(NULL), qlen1_(0), qlen2_(0), doneFw_(true), | |
| doneFwFirst_(true), | |
| chase1Fw_(false), chase1Rc_(false), | |
| chase2Fw_(false), chase2Rc_(false), | |
| delayedChase1Fw_(false), delayedChase1Rc_(false), | |
| delayedChase2Fw_(false), delayedChase2Rc_(false), | |
| refAligner_(refAligner), | |
| sinkPtFactory_(sinkPtFactory), | |
| sinkPt_(sinkPt), | |
| params_(params), | |
| minInsert_(minInsert), | |
| maxInsert_(maxInsert), | |
| dontReconcile_(dontReconcile), | |
| symCeiling_(symCeiling), | |
| mixedThresh_(mixedThresh), | |
| mixedAttemptLim_(mixedAttemptLim), | |
| mixedAttempts_(0), | |
| fw1_(fw1), fw2_(fw2), | |
| rchase_(rchase), | |
| verbose_(verbose), | |
| quiet_(quiet), | |
| maxBts_(maxBts), | |
| pool_(pool), | |
| btCnt_(btCnt), | |
| driver1Fw_(driver1Fw), driver1Rc_(driver1Rc), | |
| offs1FwSz_(0), offs1RcSz_(0), | |
| driver2Fw_(driver2Fw), driver2Rc_(driver2Rc), | |
| offs2FwSz_(0), offs2RcSz_(0), | |
| chaseL_fw_ (fw1_ ? chase1Fw_ : chase1Rc_), | |
| chaseR_fw_ (fw2_ ? chase2Fw_ : chase2Rc_), | |
| delayedchaseL_fw_(fw1_ ? delayedChase1Fw_ : delayedChase1Rc_), | |
| delayedchaseR_fw_(fw2_ ? delayedChase2Fw_ : delayedChase2Rc_), | |
| drL_fw_ (fw1_ ? *driver1Fw_ : *driver1Rc_), | |
| drR_fw_ (fw2_ ? *driver2Fw_ : *driver2Rc_), | |
| offsLarr_fw_ (fw1_ ? offs1FwArr_ : offs1RcArr_), | |
| offsRarr_fw_ (fw2_ ? offs2FwArr_ : offs2RcArr_), | |
| rangesLarr_fw_ (fw1_ ? ranges1FwArr_ : ranges1RcArr_), | |
| rangesRarr_fw_ (fw2_ ? ranges2FwArr_ : ranges2RcArr_), | |
| offsLsz_fw_ (fw1_ ? offs1FwSz_ : offs1RcSz_), | |
| offsRsz_fw_ (fw2_ ? offs2FwSz_ : offs2RcSz_), | |
| chaseL_rc_ (fw2_ ? chase2Rc_ : chase2Fw_), | |
| chaseR_rc_ (fw1_ ? chase1Rc_ : chase1Fw_), | |
| delayedchaseL_rc_(fw2_ ? delayedChase2Rc_ : delayedChase2Fw_), | |
| delayedchaseR_rc_(fw1_ ? delayedChase1Rc_ : delayedChase1Fw_), | |
| drL_rc_ (fw2_ ? *driver2Rc_ : *driver2Fw_), | |
| drR_rc_ (fw1_ ? *driver1Rc_ : *driver1Fw_), | |
| offsLarr_rc_ (fw2_ ? offs2RcArr_ : offs2FwArr_), | |
| offsRarr_rc_ (fw1_ ? offs1RcArr_ : offs1FwArr_), | |
| rangesLarr_rc_ (fw2_ ? ranges2RcArr_ : ranges2FwArr_), | |
| rangesRarr_rc_ (fw1_ ? ranges1RcArr_ : ranges1FwArr_), | |
| offsLsz_rc_ (fw2_ ? offs2RcSz_ : offs2FwSz_), | |
| offsRsz_rc_ (fw1_ ? offs1RcSz_ : offs1FwSz_), | |
| chaseL_ (&chaseL_fw_), | |
| chaseR_ (&chaseR_fw_), | |
| delayedchaseL_(&delayedchaseL_fw_), | |
| delayedchaseR_(&delayedchaseR_fw_), | |
| drL_ (&drL_fw_), | |
| drR_ (&drR_fw_), | |
| offsLarr_ (offsLarr_fw_), | |
| offsRarr_ (offsRarr_fw_), | |
| rangesLarr_ (rangesLarr_fw_), | |
| rangesRarr_ (rangesRarr_fw_), | |
| offsLsz_ (&offsLsz_fw_), | |
| offsRsz_ (&offsRsz_fw_), | |
| donePair_ (&doneFw_), | |
| fwL_(fw1), | |
| fwR_(fw2), | |
| verbose2_(false) | |
| { | |
| assert(pool_ != NULL); | |
| assert(sinkPt_ != NULL); | |
| assert(params_ != NULL); | |
| assert(driver1Fw_ != NULL); | |
| assert(driver1Rc_ != NULL); | |
| assert(driver2Fw_ != NULL); | |
| assert(driver2Rc_ != NULL); | |
| } | |
| virtual ~PairedBWAlignerV1() { | |
| delete driver1Fw_; driver1Fw_ = NULL; | |
| delete driver1Rc_; driver1Rc_ = NULL; | |
| delete driver2Fw_; driver2Fw_ = NULL; | |
| delete driver2Rc_; driver2Rc_ = NULL; | |
| delete params_; params_ = NULL; | |
| delete rchase_; rchase_ = NULL; | |
| delete[] btCnt_; btCnt_ = NULL; | |
| delete refAligner_; refAligner_ = NULL; | |
| sinkPtFactory_.destroy(sinkPt_); sinkPt_ = NULL; | |
| } | |
| /** | |
| * Prepare this aligner for the next read. | |
| */ | |
| virtual void setQuery(PatternSourcePerThread* patsrc) { | |
| Aligner::setQuery(patsrc); // set fields & random seed | |
| assert(!patsrc->bufb().empty()); | |
| // Give all of the drivers pointers to the relevant read info | |
| patsrc_ = patsrc; | |
| pool_->reset(&patsrc->bufa().name, (uint32_t)patsrc->rdid()); | |
| if(patsrc->bufa().length() < 4 || patsrc->bufb().length() < 4) { | |
| if(!quiet_) { | |
| cerr << "Warning: Skipping pair " << patsrc->bufa().name | |
| << " because a mate is less than 4 characters long" << endl; | |
| } | |
| this->done = true; | |
| sinkPt_->finishRead(*patsrc_, true, true); | |
| return; | |
| } | |
| driver1Fw_->setQuery(patsrc, NULL); | |
| driver1Rc_->setQuery(patsrc, NULL); | |
| driver2Fw_->setQuery(patsrc, NULL); | |
| driver2Rc_->setQuery(patsrc, NULL); | |
| qlen1_ = patsrc_->bufa().length(); | |
| qlen2_ = patsrc_->bufb().length(); | |
| if(btCnt_ != NULL) (*btCnt_) = maxBts_; | |
| // Neither orientation is done | |
| doneFw_ = false; | |
| doneFwFirst_ = true; | |
| this->done = false; | |
| // No ranges are being chased yet | |
| chase1Fw_ = false; | |
| chase1Rc_ = false; | |
| chase2Fw_ = false; | |
| chase2Rc_ = false; | |
| delayedChase1Fw_ = false; | |
| delayedChase1Rc_ = false; | |
| delayedChase2Fw_ = false; | |
| delayedChase2Rc_ = false; | |
| // Clear all intermediate ranges | |
| for(size_t i = 0; i < 32; i++) { | |
| offs1FwArr_[i].clear(); offs1RcArr_[i].clear(); | |
| offs2FwArr_[i].clear(); offs2RcArr_[i].clear(); | |
| ranges1FwArr_[i].clear(); ranges1RcArr_[i].clear(); | |
| ranges2FwArr_[i].clear(); ranges2RcArr_[i].clear(); | |
| } | |
| offs1FwSz_ = offs1RcSz_ = offs2FwSz_ = offs2RcSz_ = 0; | |
| chaseL_ = &chaseL_fw_; | |
| chaseR_ = &chaseR_fw_; | |
| delayedchaseL_ = &delayedchaseL_fw_; | |
| delayedchaseR_ = &delayedchaseR_fw_; | |
| drL_ = &drL_fw_; | |
| drR_ = &drR_fw_; | |
| offsLarr_ = offsLarr_fw_; | |
| offsRarr_ = offsRarr_fw_; | |
| rangesLarr_ = rangesLarr_fw_; | |
| rangesRarr_ = rangesRarr_fw_; | |
| offsLsz_ = &offsLsz_fw_; | |
| offsRsz_ = &offsRsz_fw_; | |
| donePair_ = &doneFw_; | |
| fwL_ = fw1_; | |
| fwR_ = fw2_; | |
| mixedAttempts_ = 0; | |
| pairs_fw_.clear(); | |
| pairs_rc_.clear(); | |
| #ifndef NDEBUG | |
| allTopsL_fw_.clear(); | |
| allTopsR_fw_.clear(); | |
| allTopsL_rc_.clear(); | |
| allTopsR_rc_.clear(); | |
| #endif | |
| } | |
| /** | |
| * Advance the aligner by one memory op. Return true iff we're | |
| * done with this read. | |
| * | |
| * A call to this function does one of many things: | |
| * 1. Advance a RangeSourceDriver and check if it found a new range | |
| * 2. Advance a RowChaseDriver and check if it found a reference | |
| * offset for a an alignment in a range | |
| */ | |
| virtual bool advance() { | |
| assert(!this->done); | |
| if(doneFw_ && doneFwFirst_) { | |
| if(verbose2_) cout << "--" << endl; | |
| chaseL_ = &chaseL_rc_; | |
| chaseR_ = &chaseR_rc_; | |
| delayedchaseL_ = &delayedchaseL_rc_; | |
| delayedchaseR_ = &delayedchaseR_rc_; | |
| drL_ = &drL_rc_; | |
| drR_ = &drR_rc_; | |
| offsLarr_ = offsLarr_rc_; | |
| offsRarr_ = offsRarr_rc_; | |
| rangesLarr_ = rangesLarr_rc_; | |
| rangesRarr_ = rangesRarr_rc_; | |
| offsLsz_ = &offsLsz_rc_; | |
| offsRsz_ = &offsRsz_rc_; | |
| donePair_ = &this->done; | |
| fwL_ = !fw2_; | |
| fwR_ = !fw1_; | |
| doneFwFirst_ = false; | |
| mixedAttempts_ = 0; | |
| } | |
| bool chasing = *chaseL_ || *chaseR_; | |
| if(chasing && !rchase_->foundOff() && !rchase_->done) { | |
| rchase_->advance(); | |
| return false; | |
| } | |
| advanceOrientation(!doneFw_); | |
| if(this->done) { | |
| if(verbose2_) cout << "----" << endl; | |
| sinkPt_->finishRead(*patsrc_, true, true); | |
| } | |
| return this->done; | |
| } | |
| protected: | |
| /** | |
| * Helper for reporting a pair of alignments. As of now, we report | |
| * a paired alignment by reporting two consecutive alignments, one | |
| * for each mate. | |
| */ | |
| bool report(const Range& rL, // range for upstream mate | |
| const Range& rR, // range for downstream mate | |
| TIndexOffU first, // ref idx | |
| TIndexOffU upstreamOff, // offset for upstream mate | |
| TIndexOffU dnstreamOff, // offset for downstream mate | |
| TIndexOffU tlen, // length of ref | |
| bool pairFw, // whether the pair is being mapped to fw strand | |
| bool ebwtFwL, | |
| bool ebwtFwR, | |
| const ReferenceMap* rmap) | |
| { | |
| assert(gAllowMateContainment || upstreamOff < dnstreamOff); | |
| TIndexOffU spreadL = rL.bot - rL.top; | |
| TIndexOffU spreadR = rR.bot - rR.top; | |
| TIndexOffU oms = min(spreadL, spreadR) - 1; | |
| Read* bufL = pairFw ? bufa_ : bufb_; | |
| Read* bufR = pairFw ? bufb_ : bufa_; | |
| TIndexOffU lenL = pairFw ? alen_ : blen_; | |
| TIndexOffU lenR = pairFw ? blen_ : alen_; | |
| bool ret; | |
| assert(!params_->sink().exceededOverThresh()); | |
| params_->setFw(rL.fw); | |
| assert_eq(bufL->color, color); | |
| // Print upstream mate first | |
| ret = params_->reportHit( | |
| rL.fw ? (ebwtFwL? bufL->patFw : bufL->patFwRev) : | |
| (ebwtFwL? bufL->patRc : bufL->patRcRev), | |
| rL.fw ? (ebwtFwL? &bufL->qual : &bufL->qualRev) : | |
| (ebwtFwL? &bufL->qualRev : &bufL->qual), | |
| &bufL->name, | |
| bufL->color, | |
| bufL->primer, | |
| bufL->trimc, | |
| colorExEnds, | |
| snpPhred, | |
| refs_, | |
| rmap, | |
| ebwtFwL, | |
| rL.mms, // mismatch positions | |
| rL.refcs, // reference characters for mms | |
| rL.numMms, // # mismatches | |
| make_pair(first, upstreamOff),// position | |
| make_pair(first, dnstreamOff),// mate position | |
| rR.fw, // mate orientation | |
| lenR, // mate length | |
| make_pair(rL.top, rL.bot), // arrows | |
| tlen, // textlen | |
| lenL, // qlen | |
| rL.stratum, // alignment stratum | |
| rL.cost, // cost, including quality penalty | |
| oms, // # other hits | |
| bufL->patid, | |
| bufL->seed, | |
| pairFw ? 1 : 2); | |
| if(ret) { | |
| return true; // can happen when -m is set | |
| } | |
| params_->setFw(rR.fw); | |
| assert_eq(bufR->color, color); | |
| ret = params_->reportHit( | |
| rR.fw ? (ebwtFwR? bufR->patFw : bufR->patFwRev) : | |
| (ebwtFwR? bufR->patRc : bufR->patRcRev), | |
| rR.fw ? (ebwtFwR? &bufR->qual : &bufR->qualRev) : | |
| (ebwtFwR? &bufR->qualRev : &bufR->qual), | |
| &bufR->name, | |
| bufR->color, | |
| bufR->primer, | |
| bufR->trimc, | |
| colorExEnds, | |
| snpPhred, | |
| refs_, | |
| rmap, | |
| ebwtFwR, | |
| rR.mms, // mismatch positions | |
| rR.refcs, // reference characters for mms | |
| rR.numMms, // # mismatches | |
| make_pair(first, dnstreamOff),// position | |
| make_pair(first, upstreamOff),// mate position | |
| rL.fw, // mate orientation | |
| lenL, // mate length | |
| make_pair(rR.top, rR.bot), // arrows | |
| tlen, // textlen | |
| lenR, // qlen | |
| rR.stratum, // alignment stratum | |
| rR.cost, // cost, including quality penalty | |
| oms, // # other hits | |
| bufR->patid, | |
| bufR->seed, | |
| pairFw ? 2 : 1); | |
| return ret; | |
| } | |
| bool report(const Range& rL, // range for upstream mate | |
| const Range& rR, // range for downstream mate | |
| TIndexOffU first, // ref idx | |
| TIndexOffU upstreamOff, // offset for upstream mate | |
| TIndexOffU dnstreamOff, // offset for downstream mate | |
| TIndexOffU tlen, // length of ref | |
| bool pairFw, // whether the pair is being mapped to fw strand | |
| const ReferenceMap* rmap) | |
| { | |
| return report(rL, rR, first, upstreamOff, | |
| dnstreamOff, tlen, | |
| pairFw, rL.ebwt->fw(), rR.ebwt->fw(), rmap); | |
| } | |
| /** | |
| * Given a vector of reference positions where one of the two mates | |
| * (the "anchor" mate) has aligned, look directly at the reference | |
| * sequence for instances where the other mate (the "outstanding" | |
| * mate) aligns such that mating constraint is satisfied. | |
| */ | |
| bool resolveOutstandingInRef(const bool off1, | |
| const UPair& off, | |
| const TIndexOffU tlen, | |
| const Range& range) | |
| { | |
| assert(refs_->loaded()); | |
| assert_lt(off.first, refs_->numRefs()); | |
| // If matchRight is true, then we're trying to align the other | |
| // mate to the right of the already-aligned mate. Otherwise, | |
| // to the left. | |
| bool matchRight = (off1 ? !doneFw_ : doneFw_); | |
| // Sequence and quals for mate to be matched | |
| bool fw = off1 ? fw2_ : fw1_; // whether outstanding mate is fw/rc | |
| if(doneFw_) fw = !fw; | |
| // 'seq' gets sequence of outstanding mate w/r/t the forward | |
| // reference strand | |
| const String<Dna5>& seq = fw ? (off1 ? patsrc_->bufb().patFw : | |
| patsrc_->bufa().patFw) : | |
| (off1 ? patsrc_->bufb().patRc : | |
| patsrc_->bufa().patRc); | |
| // 'seq' gets qualities of outstanding mate w/r/t the forward | |
| // reference strand | |
| const String<char>& qual = fw ? (off1 ? patsrc_->bufb().qual : | |
| patsrc_->bufa().qual) : | |
| (off1 ? patsrc_->bufb().qualRev : | |
| patsrc_->bufa().qualRev); | |
| uint32_t qlen = (uint32_t)seqan::length(seq); // length of outstanding mate | |
| uint32_t alen = (off1 ? patsrc_->bufa().length() : | |
| patsrc_->bufb().length()); | |
| int minins = minInsert_; | |
| int maxins = maxInsert_; | |
| // Let minins and maxins be the minimum and maximum insert | |
| // lengths once we account for the trimming applied to the | |
| // mates. The idea is that the insert constraint placed by | |
| // the user applies to the raw reads, not the trimmed reads. | |
| if(fw1_) { | |
| minins = max<int>(0, minins - patsrc_->bufa().trimmed5); | |
| maxins = max<int>(0, maxins - patsrc_->bufa().trimmed5); | |
| } else { | |
| minins = max<int>(0, minins - patsrc_->bufa().trimmed3); | |
| maxins = max<int>(0, maxins - patsrc_->bufa().trimmed3); | |
| } | |
| if(fw2_) { | |
| minins = max<int>(0, minins - patsrc_->bufb().trimmed3); | |
| maxins = max<int>(0, maxins - patsrc_->bufb().trimmed3); | |
| } else { | |
| minins = max<int>(0, minins - patsrc_->bufb().trimmed5); | |
| maxins = max<int>(0, maxins - patsrc_->bufb().trimmed5); | |
| } | |
| // Sanity check minins and maxins | |
| assert_geq(minins, 0); | |
| assert_geq(maxins, 0); | |
| assert_geq(maxins, minins); | |
| // Don't even try if either of the mates is longer than the | |
| // maximum insert size. | |
| if((uint32_t)maxins <= max(qlen, alen)) { | |
| return false; | |
| } | |
| const TIndexOffU tidx = off.first; | |
| const TIndexOffU toff = off.second; | |
| // Set begin/end to be a range of all reference | |
| // positions that are legally permitted to be involved in | |
| // the alignment of the outstanding mate. | |
| // | |
| // Note that one of the constraints imposed on which positions | |
| // go into this range is that the opposite mate cannot be | |
| // contained entirely within the anchor mate, or vice versa. | |
| TIndexOffU begin, end; | |
| TIndexOffU insDiff = maxins - minins; | |
| if(matchRight) { | |
| end = toff + maxins; | |
| // Adding 1 disallows the opposite from starting at the | |
| // left-hand edge of the anchor mate. | |
| begin = toff + (gAllowMateContainment ? 0 : 1); | |
| // We can also add a bit more if qlen is less than alen, | |
| // since we're requiring that opposite not be contained | |
| // within anchor. | |
| if(!gAllowMateContainment && qlen < alen) { | |
| begin += alen-qlen; | |
| } | |
| if(end > insDiff + qlen) { | |
| begin = max<TIndexOffU>(begin, end - insDiff - qlen); | |
| } | |
| end = min<TIndexOffU>(refs_->approxLen(tidx), end); | |
| begin = min<TIndexOffU>(refs_->approxLen(tidx), begin); | |
| } else { | |
| if(toff + alen < (TIndexOffU)maxins) { | |
| begin = 0; | |
| } else { | |
| begin = toff + alen - maxins; | |
| } | |
| uint32_t mi = min<uint32_t>(alen, qlen); | |
| if(gAllowMateContainment) { | |
| end = toff + alen; | |
| } else { | |
| end = toff + mi - 1; | |
| end = min<TIndexOffU>(end, toff + alen - minins + qlen - 1); | |
| if(toff + alen + qlen < (uint32_t)minins + 1) end = 0; | |
| } | |
| } | |
| // Check if there's not enough space in the range to fit an | |
| // alignment for the outstanding mate. | |
| if(end - begin < qlen) return false; | |
| std::vector<Range> ranges; | |
| std::vector<TIndexOffU> offs; | |
| refAligner_->find(1, tidx, refs_, seq, qual, begin, end, ranges, | |
| offs, doneFw_ ? &pairs_rc_ : &pairs_fw_, | |
| toff, fw); | |
| assert_eq(ranges.size(), offs.size()); | |
| for(size_t i = 0; i < ranges.size(); i++) { | |
| Range& r = ranges[i]; | |
| // Fill in fields that aren't filled in by the RefAligner | |
| r.fw = fw; | |
| r.cost |= (r.stratum << 14); | |
| r.mate1 = !off1; | |
| const TIndexOffU result = offs[i]; | |
| // NOTE: We have no idea what the BW range delimiting the | |
| // opposite hit is, because we were operating entirely in | |
| // reference space when we found it. For now, we just copy | |
| // the known range's top and bot for now. | |
| r.top = range.top; | |
| r.bot = range.bot; | |
| bool ebwtLFw = matchRight ? range.ebwt->fw() : true; | |
| bool ebwtRFw = matchRight ? true : range.ebwt->fw(); | |
| if(report( | |
| matchRight ? range : r, // range for upstream mate | |
| matchRight ? r : range, // range for downstream mate | |
| tidx, // ref idx | |
| matchRight ? toff : result, // upstream offset | |
| matchRight ? result : toff, // downstream offset | |
| tlen, // length of ref | |
| !doneFw_, // whether the pair is being mapped to fw strand | |
| ebwtLFw, | |
| ebwtRFw, | |
| range.ebwt->rmap())) return true; | |
| } | |
| return false; | |
| } | |
| /** | |
| * Advance paired-end alignment. | |
| */ | |
| void advanceOrientation(bool pairFw) { | |
| assert(!this->done); | |
| assert(!*donePair_); | |
| assert(!*chaseL_ || !*chaseR_); | |
| if(*chaseL_) { | |
| assert(!rangeMode_); | |
| assert(!*delayedchaseL_); | |
| assert(drL_->foundRange); | |
| assert(rchase_->foundOff() || rchase_->done); | |
| if(rchase_->foundOff()) { | |
| // Resolve this against the reference loci | |
| // determined for the other mate | |
| const bool overThresh = (*offsLsz_ + *offsRsz_) > mixedThresh_; | |
| if(!this->done && (overThresh || dontReconcile_)) { | |
| // Because the total size of both ranges exceeds | |
| // our threshold, we're now operating in "mixed | |
| // mode" | |
| const Range& r = drL_->range(); | |
| assert(r.repOk()); | |
| if(verbose_) cout << "Making an attempt to find the outstanding mate" << endl; | |
| this->done = resolveOutstandingInRef( | |
| pairFw, rchase_->off(), | |
| r.ebwt->_plen[rchase_->off().first], r); | |
| if(++mixedAttempts_ > mixedAttemptLim_) { | |
| // Give up on this pair | |
| *donePair_ = true; | |
| return; | |
| } | |
| } | |
| rchase_->reset(); | |
| } else { | |
| assert(rchase_->done); | |
| // Forget this range; keep looking for ranges | |
| *chaseL_ = false; | |
| drL_->foundRange = false; | |
| if(verbose_) cout << "Done with chase for first mate" << endl; | |
| if(*delayedchaseR_) { | |
| // Start chasing the delayed range | |
| if(verbose_) cout << "Resuming delayed chase for second mate" << endl; | |
| assert(drR_->foundRange); | |
| const Range& r = drR_->range(); | |
| assert(r.repOk()); | |
| TIndexOffU top = r.top; | |
| TIndexOffU bot = r.bot; | |
| uint32_t qlen = doneFw_? qlen1_ : qlen2_; | |
| rchase_->setTopBot(top, bot, qlen, rand_, r.ebwt); | |
| *chaseR_ = true; | |
| *delayedchaseR_ = false; | |
| } | |
| } | |
| } else if(*chaseR_) { | |
| assert(!rangeMode_); | |
| assert(!*delayedchaseR_); | |
| assert(drR_->foundRange); | |
| assert(rchase_->foundOff() || rchase_->done); | |
| if(rchase_->foundOff()) { | |
| // Resolve this against the reference loci | |
| // determined for the other mate | |
| const bool overThresh = (*offsLsz_ + *offsRsz_) > mixedThresh_; | |
| if(!this->done && (overThresh || dontReconcile_)) { | |
| // Because the total size of both ranges exceeds | |
| // our threshold, we're now operating in "mixed | |
| // mode" | |
| const Range& r = drR_->range(); | |
| if(verbose_) cout << "Making an attempt to find the outstanding mate" << endl; | |
| this->done = resolveOutstandingInRef( | |
| !pairFw, rchase_->off(), | |
| r.ebwt->_plen[rchase_->off().first], r); | |
| if(++mixedAttempts_ > mixedAttemptLim_) { | |
| // Give up on this pair | |
| *donePair_ = true; | |
| return; | |
| } | |
| } | |
| rchase_->reset(); | |
| } else { | |
| assert(rchase_->done); | |
| // Forget this range; keep looking for ranges | |
| *chaseR_ = false; | |
| drR_->foundRange = false; | |
| if(verbose_) cout << "Done with chase for second mate" << endl; | |
| if(*delayedchaseL_) { | |
| // Start chasing the delayed range | |
| if(verbose_) cout << "Resuming delayed chase for first mate" << endl; | |
| assert(drL_->foundRange); | |
| const Range& r = drL_->range(); | |
| assert(r.repOk()); | |
| TIndexOffU top = r.top; | |
| TIndexOffU bot = r.bot; | |
| uint32_t qlen = doneFw_? qlen2_ : qlen1_; | |
| rchase_->setTopBot(top, bot, qlen, rand_, r.ebwt); | |
| *chaseL_ = true; | |
| *delayedchaseL_ = false; | |
| } | |
| } | |
| } | |
| if(!this->done && !*donePair_ && !*chaseL_ && !*chaseR_) { | |
| // Search for more ranges for whichever mate currently has | |
| // fewer candidate alignments | |
| if((*offsLsz_ < *offsRsz_ || drR_->done) && !drL_->done) { | |
| // If there are no more ranges for the other mate and | |
| // there are no candidate alignments either, then we're | |
| // not going to find a paired alignment in this | |
| // orientation. | |
| if(drR_->done && *offsRsz_ == 0) { | |
| // Give up on this orientation | |
| if(verbose_) cout << "Giving up on paired orientation " << (pairFw? "fw" : "rc") << " in mate 1" << endl; | |
| *donePair_ = true; | |
| if(verbose2_) cout << *offsLsz_ << " " << *offsRsz_ << endl; | |
| return; | |
| } | |
| assert(!*delayedchaseL_); | |
| if(!drL_->foundRange) drL_->advance(ADV_FOUND_RANGE); | |
| if(drL_->foundRange) { | |
| #ifndef NDEBUG | |
| { | |
| std::set<int64_t>& s = (pairFw ? allTopsL_fw_ : allTopsL_rc_); | |
| int64_t t = drL_->range().top + 1; // add 1 to avoid 0 | |
| if(!drL_->range().ebwt->fw()) t = -t; // invert for bw index | |
| assert(s.find(t) == s.end()); | |
| s.insert(t); | |
| } | |
| #endif | |
| // Add the size of this range to the total for this mate | |
| *offsLsz_ += (drL_->range().bot - drL_->range().top); | |
| if(*offsRsz_ == 0 && (!dontReconcile_ || *offsLsz_ > 3)) { | |
| // Delay chasing this range; we delay to avoid | |
| // needlessly chasing rows in this range when | |
| // the other mate doesn't end up aligning | |
| // anywhere | |
| if(verbose_) cout << "Delaying a chase for first mate" << endl; | |
| *delayedchaseL_ = true; | |
| } else { | |
| // Start chasing this range | |
| if(verbose2_) cout << *offsLsz_ << " " << *offsRsz_ << " " << drL_->range().top << endl; | |
| if(verbose_) cout << "Chasing a range for first mate" << endl; | |
| if(*offsLsz_ > symCeiling_ && *offsRsz_ > symCeiling_) { | |
| // Too many candidates for both mates; abort | |
| // without any more searching | |
| *donePair_ = true; | |
| return; | |
| } | |
| // If this is the first range for both mates, | |
| // choose the smaller range to chase down first | |
| if(*delayedchaseR_ && (*offsRsz_ < *offsLsz_)) { | |
| assert(drR_->foundRange); | |
| *delayedchaseR_ = false; | |
| *delayedchaseL_ = true; | |
| *chaseR_ = true; | |
| const Range& r = drR_->range(); | |
| assert(r.repOk()); | |
| uint32_t qlen = doneFw_? qlen1_ : qlen2_; | |
| rchase_->setTopBot(r.top, r.bot, qlen, rand_, r.ebwt); | |
| } else { | |
| // Use Burrows-Wheeler for this pair (as | |
| // usual) | |
| *chaseL_ = true; | |
| const Range& r = drL_->range(); | |
| uint32_t qlen = doneFw_? qlen2_ : qlen1_; | |
| rchase_->setTopBot(r.top, r.bot, qlen, rand_, r.ebwt); | |
| } | |
| } | |
| } | |
| } else if(!drR_->done) { | |
| // If there are no more ranges for the other mate and | |
| // there are no candidate alignments either, then we're | |
| // not going to find a paired alignment in this | |
| // orientation. | |
| if(drL_->done && *offsLsz_ == 0) { | |
| // Give up on this orientation | |
| if(verbose_) cout << "Giving up on paired orientation " << (pairFw? "fw" : "rc") << " in mate 2" << endl; | |
| if(verbose2_) cout << *offsLsz_ << " " << *offsRsz_ << endl; | |
| *donePair_ = true; | |
| return; | |
| } | |
| assert(!*delayedchaseR_); | |
| if(!drR_->foundRange) drR_->advance(ADV_FOUND_RANGE); | |
| if(drR_->foundRange) { | |
| #ifndef NDEBUG | |
| { | |
| std::set<int64_t>& s = (pairFw ? allTopsR_fw_ : allTopsR_rc_); | |
| int64_t t = drR_->range().top + 1; // add 1 to avoid 0 | |
| if(!drR_->range().ebwt->fw()) t = -t; // invert for bw index | |
| assert(s.find(t) == s.end()); | |
| s.insert(t); | |
| } | |
| #endif | |
| // Add the size of this range to the total for this mate | |
| *offsRsz_ += (drR_->range().bot - drR_->range().top); | |
| if(*offsLsz_ == 0 && (!dontReconcile_ || *offsRsz_ > 3)) { | |
| // Delay chasing this range; we delay to avoid | |
| // needlessly chasing rows in this range when | |
| // the other mate doesn't end up aligning | |
| // anywhere | |
| if(verbose_) cout << "Delaying a chase for second mate" << endl; | |
| *delayedchaseR_ = true; | |
| } else { | |
| // Start chasing this range | |
| if(verbose2_) cout << *offsLsz_ << " " << *offsRsz_ << " " << drR_->range().top << endl; | |
| if(verbose_) cout << "Chasing a range for second mate" << endl; | |
| if(*offsLsz_ > symCeiling_ && *offsRsz_ > symCeiling_) { | |
| // Too many candidates for both mates; abort | |
| // without any more searching | |
| *donePair_ = true; | |
| return; | |
| } | |
| // If this is the first range for both mates, | |
| // choose the smaller range to chase down first | |
| if(*delayedchaseL_ && *offsLsz_ < *offsRsz_) { | |
| assert(drL_->foundRange); | |
| *delayedchaseL_ = false; | |
| *delayedchaseR_ = true; | |
| *chaseL_ = true; | |
| const Range& r = drL_->range(); | |
| assert(r.repOk()); | |
| uint32_t qlen = doneFw_? qlen2_ : qlen1_; | |
| rchase_->setTopBot(r.top, r.bot, qlen, rand_, r.ebwt); | |
| } else { | |
| // Use Burrows-Wheeler for this pair (as | |
| // usual) | |
| *chaseR_ = true; | |
| const Range& r = drR_->range(); | |
| assert(r.repOk()); | |
| uint32_t qlen = doneFw_? qlen1_ : qlen2_; | |
| rchase_->setTopBot(r.top, r.bot, qlen, rand_, r.ebwt); | |
| } | |
| } | |
| } | |
| } else { | |
| // Finished processing ranges for both mates | |
| assert(drL_->done && drR_->done); | |
| *donePair_ = true; | |
| } | |
| } | |
| } | |
| const BitPairReference* refs_; | |
| PatternSourcePerThread *patsrc_; | |
| uint32_t qlen1_; | |
| uint32_t qlen2_; | |
| // Progress state | |
| bool doneFw_; // finished with forward orientation of both mates? | |
| bool doneFwFirst_; | |
| bool chase1Fw_; | |
| bool chase1Rc_; | |
| bool chase2Fw_; | |
| bool chase2Rc_; | |
| bool delayedChase1Fw_; | |
| bool delayedChase1Rc_; | |
| bool delayedChase2Fw_; | |
| bool delayedChase2Rc_; | |
| // For searching for outstanding mates | |
| RefAligner<String<Dna5> >* refAligner_; | |
| // Temporary HitSink; to be deleted | |
| const HitSinkPerThreadFactory& sinkPtFactory_; | |
| HitSinkPerThread* sinkPt_; | |
| // State for alignment | |
| EbwtSearchParams<String<Dna> >* params_; | |
| // Paired-end boundaries | |
| const uint32_t minInsert_; | |
| const uint32_t maxInsert_; | |
| // Don't attempt pairwise all-versus-all style of mate | |
| // reconciliation; just rely on mixed mode | |
| const bool dontReconcile_; | |
| // If both mates in a given orientation align >= symCeiling times, | |
| // then immediately give up | |
| const uint32_t symCeiling_; | |
| // If the total number of alignments for both mates in a given | |
| // orientation exceeds mixedThresh, then switch to mixed mode | |
| const uint32_t mixedThresh_; | |
| const uint32_t mixedAttemptLim_; | |
| uint32_t mixedAttempts_; | |
| // Orientation of upstream/downstream mates when aligning to | |
| // forward strand | |
| const bool fw1_; | |
| const bool fw2_; | |
| // State for getting alignments from ranges statefully | |
| RangeChaser<String<Dna> >* rchase_; | |
| // true -> be talkative | |
| bool verbose_; | |
| // true -> suppress warnings | |
| bool quiet_; | |
| int maxBts_; | |
| ChunkPool *pool_; | |
| int *btCnt_; | |
| // Range-finding state for first mate | |
| TDriver* driver1Fw_; | |
| TDriver* driver1Rc_; | |
| UPairVec offs1FwArr_[32]; | |
| TRangeVec ranges1FwArr_[32]; | |
| uint32_t offs1FwSz_; // total size of all ranges found in this category | |
| UPairVec offs1RcArr_[32]; | |
| TRangeVec ranges1RcArr_[32]; | |
| uint32_t offs1RcSz_; // total size of all ranges found in this category | |
| // Range-finding state for second mate | |
| TDriver* driver2Fw_; | |
| TDriver* driver2Rc_; | |
| UPairVec offs2FwArr_[32]; | |
| TRangeVec ranges2FwArr_[32]; | |
| uint32_t offs2FwSz_; // total size of all ranges found in this category | |
| UPairVec offs2RcArr_[32]; | |
| TRangeVec ranges2RcArr_[32]; | |
| uint32_t offs2RcSz_; // total size of all ranges found in this category | |
| bool& chaseL_fw_; | |
| bool& chaseR_fw_; | |
| bool& delayedchaseL_fw_; | |
| bool& delayedchaseR_fw_; | |
| TDriver& drL_fw_; | |
| TDriver& drR_fw_; | |
| UPairVec* offsLarr_fw_; | |
| UPairVec* offsRarr_fw_; | |
| TRangeVec* rangesLarr_fw_; | |
| TRangeVec* rangesRarr_fw_; | |
| uint32_t& offsLsz_fw_; | |
| uint32_t& offsRsz_fw_; | |
| bool& chaseL_rc_; | |
| bool& chaseR_rc_; | |
| bool& delayedchaseL_rc_; | |
| bool& delayedchaseR_rc_; | |
| TDriver& drL_rc_; | |
| TDriver& drR_rc_; | |
| UPairVec* offsLarr_rc_; | |
| UPairVec* offsRarr_rc_; | |
| TRangeVec* rangesLarr_rc_; | |
| TRangeVec* rangesRarr_rc_; | |
| uint32_t& offsLsz_rc_; | |
| uint32_t& offsRsz_rc_; | |
| bool* chaseL_; | |
| bool* chaseR_; | |
| bool* delayedchaseL_; | |
| bool* delayedchaseR_; | |
| TDriver* drL_; | |
| TDriver* drR_; | |
| UPairVec* offsLarr_; | |
| UPairVec* offsRarr_; | |
| TRangeVec* rangesLarr_; | |
| TRangeVec* rangesRarr_; | |
| uint32_t* offsLsz_; | |
| uint32_t* offsRsz_; | |
| bool* donePair_; | |
| bool fwL_; | |
| bool fwR_; | |
| /// For keeping track of paired alignments that have already been | |
| /// found for the forward and reverse-comp pair orientations | |
| TSetPairs pairs_fw_; | |
| TSetPairs pairs_rc_; | |
| #ifndef NDEBUG | |
| std::set<int64_t> allTopsL_fw_; | |
| std::set<int64_t> allTopsR_fw_; | |
| std::set<int64_t> allTopsL_rc_; | |
| std::set<int64_t> allTopsR_rc_; | |
| #endif | |
| bool verbose2_; | |
| }; | |
| /** | |
| * Helper struct that holds a Range together with the coordinates where it al | |
| */ | |
| struct RangeWithCoords { | |
| Range r; | |
| UPair h; | |
| }; | |
| /** | |
| * An aligner for finding paired alignments while operating entirely | |
| * within the Burrows-Wheeler domain. | |
| */ | |
| template<typename TRangeSource> | |
| class PairedBWAlignerV2 : public Aligner { | |
| typedef std::pair<TIndexOffU,TIndexOffU> UPair; | |
| typedef std::vector<UPair> UPairVec; | |
| typedef std::vector<Range> TRangeVec; | |
| typedef RangeSourceDriver<TRangeSource> TDriver; | |
| typedef std::pair<uint64_t, uint64_t> TU64Pair; | |
| typedef std::set<TU64Pair> TSetPairs; | |
| public: | |
| PairedBWAlignerV2( | |
| EbwtSearchParams<String<Dna> >* params, | |
| EbwtSearchParams<String<Dna> >* paramsSe1, | |
| EbwtSearchParams<String<Dna> >* paramsSe2, | |
| TDriver* driver, | |
| RefAligner<String<Dna5> >* refAligner, | |
| RangeChaser<String<Dna> >* rchase, | |
| HitSink& sink, | |
| const HitSinkPerThreadFactory& sinkPtFactory, | |
| HitSinkPerThread* sinkPt, | |
| HitSinkPerThread* sinkPtSe1, | |
| HitSinkPerThread* sinkPtSe2, | |
| bool fw1, bool fw2, | |
| uint32_t minInsert, | |
| uint32_t maxInsert, | |
| uint32_t mixedAttemptLim, | |
| const BitPairReference* refs, | |
| bool rangeMode, | |
| bool verbose, | |
| bool quiet, | |
| int maxBts, | |
| ChunkPool *pool, | |
| int *btCnt) : | |
| Aligner(true, rangeMode), | |
| refs_(refs), | |
| patsrc_(NULL), | |
| qlen1_(0), qlen2_(0), | |
| chase_(false), | |
| donePe_(false), | |
| doneSe1_(false), | |
| doneSe2_(false), | |
| refAligner_(refAligner), | |
| sinkPtFactory_(sinkPtFactory), | |
| sinkPt_(sinkPt), | |
| sinkPtSe1_(sinkPtSe1), | |
| sinkPtSe2_(sinkPtSe2), | |
| params_(params), | |
| paramsSe1_(paramsSe1), | |
| paramsSe2_(paramsSe2), | |
| minInsert_(minInsert), | |
| maxInsert_(maxInsert), | |
| mixedAttemptLim_(mixedAttemptLim), | |
| mixedAttempts_(0), | |
| fw1_(fw1), fw2_(fw2), | |
| rchase_(rchase), | |
| driver_(driver), | |
| pool_(pool), | |
| verbose_(verbose), | |
| quiet_(quiet), | |
| maxBts_(maxBts), | |
| btCnt_(btCnt) | |
| { | |
| assert(sinkPt_ != NULL); | |
| assert(params_ != NULL); | |
| assert(driver_ != NULL); | |
| } | |
| virtual ~PairedBWAlignerV2() { | |
| delete driver_; driver_ = NULL; | |
| delete params_; params_ = NULL; | |
| if(paramsSe1_ != NULL) { | |
| delete paramsSe1_; paramsSe1_ = NULL; | |
| delete paramsSe2_; paramsSe2_ = NULL; | |
| } | |
| delete rchase_; rchase_ = NULL; | |
| delete[] btCnt_; btCnt_ = NULL; | |
| delete refAligner_; refAligner_ = NULL; | |
| sinkPtFactory_.destroy(sinkPt_); sinkPt_ = NULL; | |
| if(sinkPtSe1_ != NULL) { | |
| sinkPtFactory_.destroy(sinkPtSe1_); sinkPtSe1_ = NULL; | |
| sinkPtFactory_.destroy(sinkPtSe2_); sinkPtSe2_ = NULL; | |
| } | |
| } | |
| /** | |
| * Prepare this aligner for the next read. | |
| */ | |
| virtual void setQuery(PatternSourcePerThread* patsrc) { | |
| Aligner::setQuery(patsrc); // set fields & random seed | |
| assert(!patsrc->bufb().empty()); | |
| // Give all of the drivers pointers to the relevant read info | |
| patsrc_ = patsrc; | |
| pool_->reset(&patsrc->bufa().name, (uint32_t)patsrc->rdid()); | |
| if(patsrc->bufa().length() < 4 || patsrc->bufb().length() < 4) { | |
| if(!quiet_) { | |
| cerr << "Warning: Skipping pair " << patsrc->bufa().name | |
| << " because a mate is less than 4 characters long" << endl; | |
| } | |
| this->done = true; | |
| sinkPt_->finishRead(*patsrc_, true, true); | |
| return; | |
| } | |
| driver_->setQuery(patsrc, NULL); | |
| qlen1_ = patsrc_->bufa().length(); | |
| qlen2_ = patsrc_->bufb().length(); | |
| if(btCnt_ != NULL) (*btCnt_) = maxBts_; | |
| mixedAttempts_ = 0; | |
| // Neither orientation is done | |
| this->done = false; | |
| // No ranges are being chased yet | |
| chase_ = false; | |
| donePe_ = doneSe1_ = doneSe2_ = false; | |
| pairs_fw_.clear(); | |
| pairs_rc_.clear(); | |
| } | |
| /** | |
| * Advance the aligner by one memory op. Return true iff we're | |
| * done with this read. | |
| * | |
| * A call to this function does one of many things: | |
| * 1. Advance a RangeSourceDriver and check if it found a new range | |
| * 2. Advance a RowChaseDriver and check if it found a reference | |
| * offset for a an alignment in a range | |
| */ | |
| virtual bool advance() { | |
| assert(!this->done); | |
| if(chase_) { | |
| assert(!rangeMode_); // chasing ranges | |
| if(!rchase_->foundOff() && !rchase_->done) { | |
| rchase_->advance(); | |
| return false; | |
| } | |
| assert(rchase_->foundOff() || rchase_->done); | |
| if(rchase_->foundOff()) { | |
| const Range& r = driver_->range(); | |
| assert(r.repOk()); | |
| resolveOutstanding( | |
| rchase_->off(), | |
| r.ebwt->_plen[rchase_->off().first], r); | |
| rchase_->reset(); | |
| } else { | |
| assert(rchase_->done); | |
| // Forget this range; keep looking for ranges | |
| chase_ = false; | |
| this->done = driver_->done; | |
| } | |
| } | |
| if(!this->done && !chase_) { | |
| // Search for more ranges for whichever mate currently has | |
| // fewer candidate alignments | |
| if(!driver_->done) { | |
| if(!this->done) { | |
| // | |
| // Check whether any of the PE/SE possibilities | |
| // have become impossible due to the minCost | |
| // | |
| if(!donePe_) { | |
| assert(!this->done); | |
| donePe_ = sinkPt_->irrelevantCost(driver_->minCost); | |
| if(donePe_ && (!sinkPt_->empty() || sinkPtSe1_ == NULL)) { | |
| // Paired-end alignment(s) were found, no | |
| // more will be found, and no unpaired | |
| // alignments are requested, so stop | |
| this->done = true; | |
| } | |
| if(donePe_ && sinkPtSe1_ != NULL) { | |
| // Note: removeMate affects minCost | |
| if(doneSe1_) driver_->removeMate(1); | |
| if(doneSe2_) driver_->removeMate(2); | |
| } | |
| } | |
| if(!this->done && sinkPtSe1_ != NULL) { | |
| if(!doneSe1_) { | |
| doneSe1_ = sinkPtSe1_->irrelevantCost(driver_->minCost); | |
| if(doneSe1_ && donePe_) driver_->removeMate(1); | |
| } | |
| if(!doneSe2_) { | |
| doneSe2_ = sinkPtSe2_->irrelevantCost(driver_->minCost); | |
| if(doneSe2_ && donePe_) driver_->removeMate(2); | |
| } | |
| // Do Se1 again, because removing Se2 may have | |
| // nudged minCost over the threshold | |
| if(!doneSe1_) { | |
| doneSe1_ = sinkPtSe1_->irrelevantCost(driver_->minCost); | |
| if(doneSe1_ && donePe_) driver_->removeMate(1); | |
| } | |
| if(doneSe1_ && doneSe2_) assert(donePe_); | |
| this->done = donePe_ && doneSe1_ && doneSe2_; | |
| } | |
| if(!this->done) { | |
| if(sinkPtSe1_ != NULL) { | |
| assert(doneSe1_ || !sinkPtSe1_->irrelevantCost(driver_->minCost)); | |
| assert(doneSe2_ || !sinkPtSe2_->irrelevantCost(driver_->minCost)); | |
| } | |
| assert(donePe_ || !sinkPt_->irrelevantCost(driver_->minCost)); | |
| driver_->advance(ADV_COST_CHANGES); | |
| } | |
| } | |
| if(driver_->foundRange) { | |
| // Use Burrows-Wheeler for this pair (as usual) | |
| chase_ = true; | |
| driver_->foundRange = false; | |
| const Range& r = driver_->range(); | |
| assert(r.repOk()); | |
| rchase_->setTopBot(r.top, r.bot, | |
| r.mate1 ? qlen1_ : qlen2_, | |
| rand_, r.ebwt); | |
| } | |
| } else { | |
| this->done = true; | |
| } | |
| } | |
| if(this->done) { | |
| bool reportedPe = (sinkPt_->finishRead(*patsrc_, true, true) > 0); | |
| if(sinkPtSe1_ != NULL) { | |
| sinkPtSe1_->finishRead(*patsrc_, !reportedPe, false); | |
| sinkPtSe2_->finishRead(*patsrc_, !reportedPe, false); | |
| } | |
| } | |
| return this->done; | |
| } | |
| protected: | |
| /** | |
| * Helper for reporting a pair of alignments. As of now, we report | |
| * a paired alignment by reporting two consecutive alignments, one | |
| * for each mate. | |
| */ | |
| bool report(const Range& rL, // range for upstream mate | |
| const Range& rR, // range for downstream mate | |
| TIndexOffU first, // ref idx | |
| TIndexOffU upstreamOff, // offset for upstream mate | |
| TIndexOffU dnstreamOff, // offset for downstream mate | |
| TIndexOffU tlen, // length of ref | |
| bool pairFw, // whether the pair is being mapped to fw strand | |
| bool ebwtFwL, | |
| bool ebwtFwR, | |
| const ReferenceMap *rmap) | |
| { | |
| assert(gAllowMateContainment || upstreamOff < dnstreamOff); | |
| TIndexOffU spreadL = rL.bot - rL.top; | |
| TIndexOffU spreadR = rR.bot - rR.top; | |
| TIndexOffU oms = min(spreadL, spreadR) - 1; | |
| Read* bufL = pairFw ? bufa_ : bufb_; | |
| Read* bufR = pairFw ? bufb_ : bufa_; | |
| TIndexOffU lenL = pairFw ? alen_ : blen_; | |
| TIndexOffU lenR = pairFw ? blen_ : alen_; | |
| bool ret; | |
| assert(!params_->sink().exceededOverThresh()); | |
| params_->setFw(rL.fw); | |
| assert_eq(bufL->color, color); | |
| // Print upstream mate first | |
| ret = params_->reportHit( | |
| rL.fw ? (ebwtFwL? bufL->patFw : bufL->patFwRev) : | |
| (ebwtFwL? bufL->patRc : bufL->patRcRev), | |
| rL.fw ? (ebwtFwL? &bufL->qual : &bufL->qualRev) : | |
| (ebwtFwL? &bufL->qualRev : &bufL->qual), | |
| &bufL->name, | |
| bufL->color, | |
| bufL->primer, | |
| bufL->trimc, | |
| colorExEnds, | |
| snpPhred, | |
| refs_, | |
| rmap, | |
| ebwtFwL, | |
| rL.mms, // mismatch positions | |
| rL.refcs, // reference characters for mms | |
| rL.numMms, // # mismatches | |
| make_pair(first, upstreamOff),// position | |
| make_pair(first, dnstreamOff),// mate position | |
| rR.fw, // mate orientation | |
| lenR, // mate length | |
| make_pair(rL.top, rL.bot), // arrows | |
| tlen, // textlen | |
| lenL, // qlen | |
| rL.stratum, // alignment stratum | |
| rL.cost, // cost, including quality penalty | |
| oms, // # other hits | |
| bufL->patid, | |
| bufL->seed, | |
| pairFw ? 1 : 2); | |
| if(ret) { | |
| return true; // can happen when -m is set | |
| } | |
| params_->setFw(rR.fw); | |
| assert_eq(bufR->color, color); | |
| ret = params_->reportHit( | |
| rR.fw ? (ebwtFwR? bufR->patFw : bufR->patFwRev) : | |
| (ebwtFwR? bufR->patRc : bufR->patRcRev), | |
| rR.fw ? (ebwtFwR? &bufR->qual : &bufR->qualRev) : | |
| (ebwtFwR? &bufR->qualRev : &bufR->qual), | |
| &bufR->name, | |
| bufR->color, | |
| bufR->primer, | |
| bufR->trimc, | |
| colorExEnds, | |
| snpPhred, | |
| refs_, | |
| rmap, | |
| ebwtFwR, | |
| rR.mms, // mismatch positions | |
| rR.refcs, // reference characters for mms | |
| rR.numMms, // # mismatches | |
| make_pair(first, dnstreamOff),// position | |
| make_pair(first, upstreamOff),// mate position | |
| rL.fw, // mate orientation | |
| lenL, // mate length | |
| make_pair(rR.top, rR.bot), // arrows | |
| tlen, // textlen | |
| lenR, // qlen | |
| rR.stratum, // alignment stratum | |
| rR.cost, // cost, including quality penalty | |
| oms, // # other hits | |
| bufR->patid, | |
| bufR->seed, | |
| pairFw ? 2 : 1); | |
| return ret; | |
| } | |
| /** | |
| * Helper for reporting a pair of alignments. As of now, we report | |
| * a paired alignment by reporting two consecutive alignments, one | |
| * for each mate. | |
| */ | |
| void reportSe(const Range& r, UPair h, uint32_t tlen) { | |
| EbwtSearchParams<String<Dna> >*params = (r.mate1 ? paramsSe1_ : paramsSe2_); | |
| assert(!(r.mate1 ? doneSe1_ : doneSe2_)); | |
| params->setFw(r.fw); | |
| Read* buf = r.mate1 ? bufa_ : bufb_; | |
| bool ebwtFw = r.ebwt->fw(); | |
| uint32_t len = r.mate1 ? alen_ : blen_; | |
| assert_eq(buf->color, color); | |
| // Print upstream mate first | |
| if(params->reportHit( | |
| r.fw ? (ebwtFw? buf->patFw : buf->patFwRev) : | |
| (ebwtFw? buf->patRc : buf->patRcRev), | |
| r.fw ? (ebwtFw? &buf->qual : &buf->qualRev) : | |
| (ebwtFw? &buf->qualRev : &buf->qual), | |
| &buf->name, | |
| buf->color, | |
| buf->primer, | |
| buf->trimc, | |
| colorExEnds, | |
| snpPhred, | |
| refs_, | |
| r.ebwt->rmap(), | |
| ebwtFw, | |
| r.mms, // mismatch positions | |
| r.refcs, // reference characters for mms | |
| r.numMms, // # mismatches | |
| h, // position | |
| make_pair(0, 0), // (bogus) mate coords | |
| true, // (bogus) mate orientation | |
| 0, // (bogus) mate length | |
| make_pair(r.top, r.bot), // arrows | |
| tlen, // textlen | |
| len, // qlen | |
| r.stratum, // alignment stratum | |
| r.cost, // cost, including quality penalty | |
| r.bot - r.top - 1, // # other hits | |
| buf->patid, | |
| buf->seed, | |
| 0)) | |
| { | |
| if(r.mate1) doneSe1_ = true; | |
| else doneSe2_ = true; | |
| if(donePe_) driver_->removeMate(r.mate1 ? 1 : 2); | |
| } | |
| } | |
| void resolveOutstanding(const UPair& off, | |
| const TIndexOffU tlen, | |
| const Range& range) | |
| { | |
| assert(!this->done); | |
| if(!donePe_) { | |
| bool ret = resolveOutstandingInRef(off, tlen, range); | |
| if(++mixedAttempts_ > mixedAttemptLim_ || ret) { | |
| // Give up on this pair | |
| donePe_ = true; | |
| if(sinkPtSe1_ != NULL) { | |
| if(doneSe1_) driver_->removeMate(1); | |
| if(doneSe2_) driver_->removeMate(2); | |
| } | |
| } | |
| this->done = (donePe_ && (!sinkPt_->empty() || sinkPtSe1_ == NULL || (doneSe1_ && doneSe2_))); | |
| } | |
| if(!this->done && sinkPtSe1_ != NULL) { | |
| bool doneSe = (range.mate1 ? doneSe1_ : doneSe2_); | |
| if(!doneSe) { | |
| // Hold onto this single-end alignment in case we don't | |
| // find any paired alignments | |
| reportSe(range, off, tlen); | |
| } | |
| this->done = doneSe1_ && doneSe2_ && donePe_; | |
| } | |
| } | |
| /** | |
| * Given a vector of reference positions where one of the two mates | |
| * (the "anchor" mate) has aligned, look directly at the reference | |
| * sequence for instances where the other mate (the "outstanding" | |
| * mate) aligns such that mating constraint is satisfied. | |
| * | |
| * This function picks up to 'pick' anchors at random from the | |
| * 'offs' array. It returns the number that it actually picked. | |
| */ | |
| bool resolveOutstandingInRef(const UPair& off, | |
| const TIndexOffU tlen, | |
| const Range& range) | |
| { | |
| assert(!donePe_); | |
| assert(refs_->loaded()); | |
| assert_lt(off.first, refs_->numRefs()); | |
| // pairFw = true if the anchor indicates that the pair will | |
| // align in its forward orientation (i.e. with mate1 to the | |
| // left of mate2) | |
| bool pairFw = (range.mate1)? (range.fw == fw1_) : (range.fw == fw2_); | |
| // matchRight = true, if the opposite mate will be to the right | |
| // of the anchor mate | |
| bool matchRight = (pairFw ? range.mate1 : !range.mate1); | |
| // fw = orientation of the opposite mate | |
| bool fw = range.mate1 ? fw2_ : fw1_; // whether outstanding mate is fw/rc | |
| if(!pairFw) fw = !fw; | |
| // 'seq' = sequence for opposite mate | |
| const String<Dna5>& seq = | |
| fw ? (range.mate1 ? patsrc_->bufb().patFw : | |
| patsrc_->bufa().patFw) : | |
| (range.mate1 ? patsrc_->bufb().patRc : | |
| patsrc_->bufa().patRc); | |
| // 'qual' = qualities for opposite mate | |
| const String<char>& qual = | |
| fw ? (range.mate1 ? patsrc_->bufb().qual : | |
| patsrc_->bufa().qual) : | |
| (range.mate1 ? patsrc_->bufb().qualRev : | |
| patsrc_->bufa().qualRev); | |
| uint32_t qlen = (uint32_t)seqan::length(seq); // length of outstanding mate | |
| uint32_t alen = (range.mate1 ? patsrc_->bufa().length() : | |
| patsrc_->bufb().length()); | |
| int minins = minInsert_; | |
| int maxins = maxInsert_; | |
| if(fw1_) { | |
| minins = max<int>(0, minins - patsrc_->bufa().trimmed5); | |
| maxins = max<int>(0, maxins - patsrc_->bufa().trimmed5); | |
| } else { | |
| minins = max<int>(0, minins - patsrc_->bufa().trimmed3); | |
| maxins = max<int>(0, maxins - patsrc_->bufa().trimmed3); | |
| } | |
| if(fw2_) { | |
| minins = max<int>(0, minins - patsrc_->bufb().trimmed3); | |
| maxins = max<int>(0, maxins - patsrc_->bufb().trimmed3); | |
| } else { | |
| minins = max<int>(0, minins - patsrc_->bufb().trimmed5); | |
| maxins = max<int>(0, maxins - patsrc_->bufb().trimmed5); | |
| } | |
| assert_geq(minins, 0); | |
| assert_geq(maxins, 0); | |
| // Don't even try if either of the mates is longer than the | |
| // maximum insert size. | |
| if((uint32_t)maxins <= max(qlen, alen)) { | |
| return false; | |
| } | |
| const TIndexOffU tidx = off.first; // text id where anchor mate hit | |
| const TIndexOffU toff = off.second; // offset where anchor mate hit | |
| // Set begin/end to the range of reference positions where | |
| // outstanding mate may align while fulfilling insert-length | |
| // constraints. | |
| TIndexOffU begin, end; | |
| assert_geq(maxins, minins); | |
| TIndexOffU insDiff = maxins - minins; | |
| if(matchRight) { | |
| end = toff + maxins; | |
| // Adding 1 disallows the opposite from starting at the | |
| // left-hand edge of the anchor mate. | |
| begin = toff + (gAllowMateContainment ? 0 : 1); | |
| // We can also add a bit more if qlen is less than alen, | |
| // since we're requiring that opposite not be contained | |
| // within anchor. | |
| if(!gAllowMateContainment && qlen < alen) { | |
| begin += alen-qlen; | |
| } | |
| if(end > insDiff + qlen) { | |
| begin = max<TIndexOffU>(begin, end - insDiff - qlen); | |
| } | |
| end = min<TIndexOffU>(refs_->approxLen(tidx), end); | |
| begin = min<TIndexOffU>(refs_->approxLen(tidx), begin); | |
| } else { | |
| if(toff + alen < (TIndexOffU)maxins) { | |
| begin = 0; | |
| } else { | |
| begin = toff + alen - maxins; | |
| } | |
| uint32_t mi = min<uint32_t>(alen, qlen); | |
| if(gAllowMateContainment) { | |
| end = toff + alen - 1; | |
| } else { | |
| end = toff + mi - 1; | |
| end = min<TIndexOffU>(end, toff + alen - minins + qlen - 1); | |
| if(toff + alen + qlen < (TIndexOffU)minins + 1) end = 0; | |
| } | |
| } | |
| // Check if there's not enough space in the range to fit an | |
| // alignment for the outstanding mate. | |
| if(end - begin < qlen) return false; | |
| std::vector<Range> ranges; | |
| std::vector<TIndexOffU> offs; | |
| refAligner_->find(1, tidx, refs_, seq, qual, begin, end, ranges, | |
| offs, pairFw ? &pairs_fw_ : &pairs_rc_, | |
| toff, fw); | |
| assert_eq(ranges.size(), offs.size()); | |
| for(size_t i = 0; i < ranges.size(); i++) { | |
| Range& r = ranges[i]; | |
| r.fw = fw; | |
| r.cost |= (r.stratum << 14); | |
| r.mate1 = !range.mate1; | |
| const TIndexOffU result = offs[i]; | |
| // Just copy the known range's top and bot for now | |
| r.top = range.top; | |
| r.bot = range.bot; | |
| bool ebwtLFw = matchRight ? range.ebwt->fw() : true; | |
| bool ebwtRFw = matchRight ? true : range.ebwt->fw(); | |
| if(report( | |
| matchRight ? range : r, // range for upstream mate | |
| matchRight ? r : range, // range for downstream mate | |
| tidx, // ref idx | |
| matchRight ? toff : result, // upstream offset | |
| matchRight ? result : toff, // downstream offset | |
| tlen, // length of ref | |
| pairFw, // whether the pair is being mapped to fw strand | |
| ebwtLFw, | |
| ebwtRFw, | |
| range.ebwt->rmap())) return true; | |
| } | |
| return false; | |
| } | |
| const BitPairReference* refs_; | |
| PatternSourcePerThread *patsrc_; | |
| uint32_t qlen1_, qlen2_; | |
| bool chase_; | |
| // true -> we're no longer shooting for paired-end alignments; | |
| // just collecting single-end ones | |
| bool donePe_, doneSe1_, doneSe2_; | |
| // For searching for outstanding mates | |
| RefAligner<String<Dna5> >* refAligner_; | |
| // Temporary HitSink; to be deleted | |
| const HitSinkPerThreadFactory& sinkPtFactory_; | |
| HitSinkPerThread* sinkPt_; | |
| HitSinkPerThread* sinkPtSe1_, * sinkPtSe2_; | |
| // State for alignment | |
| EbwtSearchParams<String<Dna> >* params_; | |
| // for single-end: | |
| EbwtSearchParams<String<Dna> >* paramsSe1_, * paramsSe2_; | |
| // Paired-end boundaries | |
| const uint32_t minInsert_; | |
| const uint32_t maxInsert_; | |
| const uint32_t mixedAttemptLim_; | |
| uint32_t mixedAttempts_; | |
| // Orientation of upstream/downstream mates when aligning to | |
| // forward strand | |
| const bool fw1_, fw2_; | |
| // State for getting alignments from ranges statefully | |
| RangeChaser<String<Dna> >* rchase_; | |
| // Range-finding state for first mate | |
| TDriver* driver_; | |
| // Pool for distributing chunks of best-first path descriptor memory | |
| ChunkPool *pool_; | |
| bool verbose_; | |
| bool quiet_; | |
| int maxBts_; // maximum allowed # backtracks | |
| int *btCnt_; // current backtrack count | |
| /// For keeping track of paired alignments that have already been | |
| /// found for the forward and reverse-comp pair orientations | |
| TSetPairs pairs_fw_, pairs_rc_; | |
| }; | |
| #endif /* ALIGNER_H_ */ |