diff --git a/Makefile b/Makefile index 8ec0e2a..82b3cfd 100644 --- a/Makefile +++ b/Makefile @@ -69,7 +69,8 @@ BUILD_LIBS = OTHER_CPPS = ccnt_lut.cpp hit.cpp ref_read.cpp alphabet.c shmem.cpp \ edit.cpp ebwt.cpp SEARCH_CPPS = qual.cpp pat.cpp ebwt_search_util.cpp ref_aligner.cpp \ - log.cpp hit_set.cpp refmap.cpp annot.cpp sam.cpp + log.cpp hit_set.cpp refmap.cpp annot.cpp sam.cpp \ + color.cpp color_dec.cpp SEARCH_CPPS_MAIN = $(SEARCH_CPPS) bowtie_main.cpp BUILD_CPPS = diff --git a/VERSION b/VERSION index 2bb6a82..d33c3a2 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.11.3 \ No newline at end of file +0.12.0 \ No newline at end of file diff --git a/aligner.h b/aligner.h index e28bee6..fabc936 100644 --- a/aligner.h +++ b/aligner.h @@ -331,6 +331,8 @@ class UnpairedAlignerV2 : public Aligner { const HitSinkPerThreadFactory& sinkPtFactory, HitSinkPerThread* sinkPt, vector >& os, + const BitPairReference* refs, + int snpPhred, bool rangeMode, bool verbose, bool quiet, @@ -339,6 +341,8 @@ class UnpairedAlignerV2 : public Aligner { int *btCnt = NULL, AlignerMetrics *metrics = NULL) : Aligner(true, rangeMode), + refs_(refs), + snpPhred_(snpPhred), doneFirst_(true), firstIsFw_(true), chase_(false), @@ -412,11 +416,14 @@ class UnpairedAlignerV2 : public Aligner { bool ebwtFw = ra.ebwt->fw(); params_->setFw(ra.fw); return params_->reportHit( - ra.fw ? (ebwtFw? bufa_->patFw : bufa_->patFwRev) : - (ebwtFw? bufa_->patRc : bufa_->patRcRev), + ra.fw ? (ebwtFw? bufa_->patFw : bufa_->patFwRev) : + (ebwtFw? bufa_->patRc : bufa_->patRcRev), ra.fw ? (ebwtFw? &bufa_->qual : &bufa_->qualRev) : (ebwtFw? &bufa_->qualRev : &bufa_->qual), &bufa_->name, + bufa_->color, + snpPhred_, + refs_, ra.ebwt->rmap(), ebwtFw, ra.mms, // mismatch positions @@ -507,6 +514,12 @@ class UnpairedAlignerV2 : public Aligner { } protected: + + // Reference sequences (needed for colorspace decoding) + const BitPairReference* refs_; + + int snpPhred_; + // Progress state bool doneFirst_; bool firstIsFw_; @@ -566,6 +579,7 @@ class PairedBWAlignerV1 : public Aligner { uint32_t mixedThresh, uint32_t mixedAttemptLim, const BitPairReference* refs, + int snpPhred, bool rangeMode, bool verbose, bool quiet, @@ -573,7 +587,8 @@ class PairedBWAlignerV1 : public Aligner { ChunkPool *pool, int *btCnt) : Aligner(true, rangeMode), - refs_(refs), patsrc_(NULL), qlen1_(0), qlen2_(0), doneFw_(true), + refs_(refs), snpPhred_(snpPhred), + patsrc_(NULL), qlen1_(0), qlen2_(0), doneFw_(true), doneFwFirst_(true), chase1Fw_(false), chase1Rc_(false), chase2Fw_(false), chase2Rc_(false), @@ -819,6 +834,9 @@ class PairedBWAlignerV1 : public Aligner { rL.fw ? (ebwtFwL? &bufL->qual : &bufL->qualRev) : (ebwtFwL? &bufL->qualRev : &bufL->qual), &bufL->name, + bufL->color, + snpPhred_, + refs_, rmap, ebwtFwL, rL.mms, // mismatch positions @@ -846,6 +864,9 @@ class PairedBWAlignerV1 : public Aligner { rR.fw ? (ebwtFwR? &bufR->qual : &bufR->qualRev) : (ebwtFwR? &bufR->qualRev : &bufR->qual), &bufR->name, + bufR->color, + snpPhred_, + refs_, rmap, ebwtFwR, rR.mms, // mismatch positions @@ -1228,6 +1249,8 @@ class PairedBWAlignerV1 : public Aligner { const BitPairReference* refs_; + int snpPhred_; + PatternSourcePerThread *patsrc_; uint32_t qlen1_; uint32_t qlen2_; @@ -1408,6 +1431,7 @@ class PairedBWAlignerV2 : public Aligner { uint32_t maxInsert, uint32_t mixedAttemptLim, const BitPairReference* refs, + int snpPhred, bool rangeMode, bool verbose, bool quiet, @@ -1415,7 +1439,9 @@ class PairedBWAlignerV2 : public Aligner { ChunkPool *pool, int *btCnt) : Aligner(true, rangeMode), - refs_(refs), patsrc_(NULL), + refs_(refs), + snpPhred_(snpPhred), + patsrc_(NULL), qlen1_(0), qlen2_(0), chase_(false), donePe_(false), @@ -1643,6 +1669,9 @@ class PairedBWAlignerV2 : public Aligner { rL.fw ? (ebwtFwL? &bufL->qual : &bufL->qualRev) : (ebwtFwL? &bufL->qualRev : &bufL->qual), &bufL->name, + bufL->color, + snpPhred_, + refs_, rmap, ebwtFwL, rL.mms, // mismatch positions @@ -1670,6 +1699,9 @@ class PairedBWAlignerV2 : public Aligner { rR.fw ? (ebwtFwR? &bufR->qual : &bufR->qualRev) : (ebwtFwR? &bufR->qualRev : &bufR->qual), &bufR->name, + bufR->color, + snpPhred_, + refs_, rmap, ebwtFwR, rR.mms, // mismatch positions @@ -1709,6 +1741,9 @@ class PairedBWAlignerV2 : public Aligner { r.fw ? (ebwtFw? &buf->qual : &buf->qualRev) : (ebwtFw? &buf->qualRev : &buf->qual), &buf->name, + buf->color, + snpPhred_, + refs_, r.ebwt->rmap(), ebwtFw, r.mms, // mismatch positions @@ -1871,6 +1906,9 @@ class PairedBWAlignerV2 : public Aligner { } const BitPairReference* refs_; + + int snpPhred_; + PatternSourcePerThread *patsrc_; uint32_t qlen1_, qlen2_; bool chase_; diff --git a/aligner_0mm.h b/aligner_0mm.h index 7d068c2..312c2d3 100644 --- a/aligner_0mm.h +++ b/aligner_0mm.h @@ -25,6 +25,7 @@ class UnpairedExactAlignerV1Factory : public AlignerFactory { UnpairedExactAlignerV1Factory( Ebwt >& ebwtFw, Ebwt >* ebwtBw, + int snpPhred, bool doFw, bool doRc, HitSink& sink, @@ -33,6 +34,7 @@ class UnpairedExactAlignerV1Factory : public AlignerFactory { RangeCache* cacheBw, uint32_t cacheLimit, ChunkPool *pool, + BitPairReference* refs, vector >& os, bool maqPenalty, bool qualOrder, @@ -43,6 +45,7 @@ class UnpairedExactAlignerV1Factory : public AlignerFactory { uint32_t seed) : ebwtFw_(ebwtFw), ebwtBw_(ebwtBw), + snpPhred_(snpPhred), doFw_(doFw), doRc_(doRc), sink_(sink), sinkPtFactory_(sinkPtFactory), @@ -50,6 +53,7 @@ class UnpairedExactAlignerV1Factory : public AlignerFactory { cacheBw_(cacheBw), cacheLimit_(cacheLimit), pool_(pool), + refs_(refs), os_(os), maqPenalty_(maqPenalty), qualOrder_(qualOrder), @@ -68,7 +72,7 @@ class UnpairedExactAlignerV1Factory : public AlignerFactory { virtual Aligner* create() const { HitSinkPerThread* sinkPt = sinkPtFactory_.create(); EbwtSearchParams >* params = - new EbwtSearchParams >(*sinkPt, os_, true, true, true, rangeMode_); + new EbwtSearchParams >(*sinkPt, os_, true, true); const int halfAndHalf = 0; const bool seeded = false; @@ -108,13 +112,14 @@ class UnpairedExactAlignerV1Factory : public AlignerFactory { return new UnpairedAlignerV2( params, dr, rchase, - sink_, sinkPtFactory_, sinkPt, os_, rangeMode_, verbose_, quiet_, - INT_MAX, pool_, NULL, NULL); + sink_, sinkPtFactory_, sinkPt, os_, refs_, snpPhred_, + rangeMode_, verbose_, quiet_, INT_MAX, pool_, NULL, NULL); } private: Ebwt >& ebwtFw_; Ebwt >* ebwtBw_; + int snpPhred_; bool doFw_; bool doRc_; HitSink& sink_; @@ -123,6 +128,7 @@ class UnpairedExactAlignerV1Factory : public AlignerFactory { RangeCache *cacheBw_; const uint32_t cacheLimit_; ChunkPool *pool_; + BitPairReference* refs_; vector >& os_; bool maqPenalty_; bool qualOrder_; @@ -144,6 +150,8 @@ class PairedExactAlignerV1Factory : public AlignerFactory { PairedExactAlignerV1Factory( Ebwt >& ebwtFw, Ebwt >* ebwtBw, + bool color, + int snpPhred, bool doFw, bool doRc, bool v1, @@ -172,6 +180,8 @@ class PairedExactAlignerV1Factory : public AlignerFactory { bool quiet, uint32_t seed) : ebwtFw_(ebwtFw), + color_(color), + snpPhred_(snpPhred), doFw_(doFw), doRc_(doRc), v1_(v1), @@ -209,15 +219,15 @@ class PairedExactAlignerV1Factory : public AlignerFactory { HitSinkPerThread* sinkPt = sinkPtFactory_.createMult(2); HitSinkPerThread* sinkPtSe1 = NULL, * sinkPtSe2 = NULL; EbwtSearchParams >* params = - new EbwtSearchParams >(*sinkPt, os_, true, true, true, rangeMode_); + new EbwtSearchParams >(*sinkPt, os_, true, true); EbwtSearchParams >* paramsSe1 = NULL, * paramsSe2 = NULL; if(reportSe_) { sinkPtSe1 = sinkPtFactory_.create(); sinkPtSe2 = sinkPtFactory_.create(); paramsSe1 = - new EbwtSearchParams >(*sinkPtSe1, os_, true, true, true, rangeMode_); + new EbwtSearchParams >(*sinkPtSe1, os_, true, true); paramsSe2 = - new EbwtSearchParams >(*sinkPtSe2, os_, true, true, true, rangeMode_); + new EbwtSearchParams >(*sinkPtSe2, os_, true, true); } const int halfAndHalf = 0; @@ -301,7 +311,8 @@ class PairedExactAlignerV1Factory : public AlignerFactory { os_, verbose_, quiet_, false, pool_, NULL); } - RefAligner >* refAligner = new ExactRefAligner >(verbose_, quiet_); + RefAligner >* refAligner + = new ExactRefAligner >(color_, verbose_, quiet_); // Set up a RangeChaser RangeChaser > *rchase = @@ -317,8 +328,8 @@ class PairedExactAlignerV1Factory : public AlignerFactory { refAligner, rchase, sink_, sinkPtFactory_, sinkPt, mate1fw_, mate2fw_, peInner_, peOuter_, dontReconcile_, symCeil_, mixedThresh_, - mixedAttemptLim_, refs_, rangeMode_, verbose_, quiet_, - INT_MAX, pool_, NULL); + mixedAttemptLim_, refs_, snpPhred_, rangeMode_, verbose_, + quiet_, INT_MAX, pool_, NULL); return al; } else { TRangeSrcDrPtrVec *drVec = new TRangeSrcDrPtrVec(); @@ -333,8 +344,8 @@ class PairedExactAlignerV1Factory : public AlignerFactory { rchase, sink_, sinkPtFactory_, sinkPt, sinkPtSe1, sinkPtSe2, mate1fw_, mate2fw_, peInner_, peOuter_, - mixedAttemptLim_, refs_, rangeMode_, verbose_, quiet_, - INT_MAX, pool_, NULL); + mixedAttemptLim_, refs_, snpPhred_, rangeMode_, + verbose_, quiet_, INT_MAX, pool_, NULL); delete drVec; return al; } @@ -343,6 +354,8 @@ class PairedExactAlignerV1Factory : public AlignerFactory { private: Ebwt >& ebwtFw_; Ebwt >* ebwtBw_; + bool color_; + int snpPhred_; bool doFw_; bool doRc_; bool v1_; diff --git a/aligner_1mm.h b/aligner_1mm.h index 1a249ac..236806f 100644 --- a/aligner_1mm.h +++ b/aligner_1mm.h @@ -25,6 +25,7 @@ class Unpaired1mmAlignerV1Factory : public AlignerFactory { Unpaired1mmAlignerV1Factory( Ebwt >& ebwtFw, Ebwt >* ebwtBw, + int snpPhred, bool doFw, bool doRc, HitSink& sink, @@ -33,6 +34,7 @@ class Unpaired1mmAlignerV1Factory : public AlignerFactory { RangeCache *cacheBw, uint32_t cacheLimit, ChunkPool *pool, + BitPairReference* refs, vector >& os, bool maqPenalty, bool qualOrder, @@ -43,6 +45,7 @@ class Unpaired1mmAlignerV1Factory : public AlignerFactory { uint32_t seed) : ebwtFw_(ebwtFw), ebwtBw_(ebwtBw), + snpPhred_(snpPhred), doFw_(doFw), doRc_(doRc), sink_(sink), @@ -51,6 +54,7 @@ class Unpaired1mmAlignerV1Factory : public AlignerFactory { cacheBw_(cacheBw), cacheLimit_(cacheLimit), pool_(pool), + refs_(refs), os_(os), maqPenalty_(maqPenalty), qualOrder_(qualOrder), @@ -72,7 +76,7 @@ class Unpaired1mmAlignerV1Factory : public AlignerFactory { HitSinkPerThread* sinkPt = sinkPtFactory_.create(); EbwtSearchParams >* params = - new EbwtSearchParams >(*sinkPt, os_, true, true, true, rangeMode_); + new EbwtSearchParams >(*sinkPt, os_); const int halfAndHalf = 0; const bool seeded = false; @@ -145,13 +149,14 @@ class Unpaired1mmAlignerV1Factory : public AlignerFactory { // Set up the aligner return new UnpairedAlignerV2( params, dr, rchase, - sink_, sinkPtFactory_, sinkPt, os_, rangeMode_, verbose_, quiet_, - INT_MAX, pool_, NULL, NULL); + sink_, sinkPtFactory_, sinkPt, os_, refs_, snpPhred_, + rangeMode_, verbose_, quiet_, INT_MAX, pool_, NULL, NULL); } private: Ebwt >& ebwtFw_; Ebwt >* ebwtBw_; + const int snpPhred_; bool doFw_; bool doRc_; HitSink& sink_; @@ -160,6 +165,7 @@ class Unpaired1mmAlignerV1Factory : public AlignerFactory { RangeCache *cacheBw_; const uint32_t cacheLimit_; ChunkPool *pool_; + BitPairReference* refs_; vector >& os_; const bool maqPenalty_; const bool qualOrder_; @@ -181,6 +187,8 @@ class Paired1mmAlignerV1Factory : public AlignerFactory { Paired1mmAlignerV1Factory( Ebwt >& ebwtFw, Ebwt >* ebwtBw, + bool color, + int snpPhred, bool doFw, bool doRc, bool v1, @@ -210,6 +218,8 @@ class Paired1mmAlignerV1Factory : public AlignerFactory { uint32_t seed) : ebwtFw_(ebwtFw), ebwtBw_(ebwtBw), + color_(color), + snpPhred_(snpPhred), doFw_(doFw), doRc_(doRc), v1_(v1), @@ -249,15 +259,15 @@ class Paired1mmAlignerV1Factory : public AlignerFactory { HitSinkPerThread* sinkPt = sinkPtFactory_.createMult(2); HitSinkPerThread* sinkPtSe1 = NULL, * sinkPtSe2 = NULL; EbwtSearchParams >* params = - new EbwtSearchParams >(*sinkPt, os_, true, true, true, rangeMode_); + new EbwtSearchParams >(*sinkPt, os_); EbwtSearchParams >* paramsSe1 = NULL, * paramsSe2 = NULL; if(reportSe_) { sinkPtSe1 = sinkPtFactory_.create(); sinkPtSe2 = sinkPtFactory_.create(); paramsSe1 = - new EbwtSearchParams >(*sinkPtSe1, os_, true, true, true, rangeMode_); + new EbwtSearchParams >(*sinkPtSe1, os_); paramsSe2 = - new EbwtSearchParams >(*sinkPtSe2, os_, true, true, true, rangeMode_); + new EbwtSearchParams >(*sinkPtSe2, os_); } const int halfAndHalf = 0; @@ -413,7 +423,8 @@ class Paired1mmAlignerV1Factory : public AlignerFactory { dr2RcVec->push_back(dr2Rc_Bw); } - RefAligner >* refAligner = new OneMMRefAligner >(verbose_, quiet_); + RefAligner >* refAligner = + new OneMMRefAligner >(color_, verbose_, quiet_); // Set up a RangeChaser RangeChaser > *rchase = @@ -429,8 +440,8 @@ class Paired1mmAlignerV1Factory : public AlignerFactory { refAligner, rchase, sink_, sinkPtFactory_, sinkPt, mate1fw_, mate2fw_, peInner_, peOuter_, dontReconcile_, symCeil_, mixedThresh_, - mixedAttemptLim_, refs_, rangeMode_, verbose_, quiet_, - INT_MAX, pool_, NULL); + mixedAttemptLim_, refs_, snpPhred_, rangeMode_, verbose_, + quiet_, INT_MAX, pool_, NULL); delete dr1FwVec; delete dr1RcVec; delete dr2FwVec; @@ -445,8 +456,8 @@ class Paired1mmAlignerV1Factory : public AlignerFactory { sinkPt, sinkPtSe1, sinkPtSe2, mate1fw_, mate2fw_, peInner_, peOuter_, - mixedAttemptLim_, refs_, rangeMode_, verbose_, quiet_, - INT_MAX, pool_, NULL); + mixedAttemptLim_, refs_, snpPhred_, rangeMode_, + verbose_, quiet_, INT_MAX, pool_, NULL); delete dr1FwVec; return al; } @@ -455,6 +466,8 @@ class Paired1mmAlignerV1Factory : public AlignerFactory { private: Ebwt >& ebwtFw_; Ebwt >* ebwtBw_; + bool color_; + const int snpPhred_; bool doFw_; bool doRc_; bool v1_; diff --git a/aligner_23mm.h b/aligner_23mm.h index 2f0fba6..15fe14c 100644 --- a/aligner_23mm.h +++ b/aligner_23mm.h @@ -26,6 +26,7 @@ class Unpaired23mmAlignerV1Factory : public AlignerFactory { Ebwt >& ebwtFw, Ebwt >* ebwtBw, bool two, + int snpPhred, bool doFw, bool doRc, HitSink& sink, @@ -34,6 +35,7 @@ class Unpaired23mmAlignerV1Factory : public AlignerFactory { RangeCache *cacheBw, uint32_t cacheLimit, ChunkPool *pool, + BitPairReference* refs, vector >& os, bool maqPenalty, bool qualOrder, @@ -45,6 +47,7 @@ class Unpaired23mmAlignerV1Factory : public AlignerFactory { ebwtFw_(ebwtFw), ebwtBw_(ebwtBw), two_(two), + snpPhred_(snpPhred), doFw_(doFw), doRc_(doRc), sink_(sink), sinkPtFactory_(sinkPtFactory), @@ -52,6 +55,7 @@ class Unpaired23mmAlignerV1Factory : public AlignerFactory { cacheBw_(cacheBw), cacheLimit_(cacheLimit), pool_(pool), + refs_(refs), os_(os), maqPenalty_(maqPenalty), qualOrder_(qualOrder), @@ -73,7 +77,7 @@ class Unpaired23mmAlignerV1Factory : public AlignerFactory { HitSinkPerThread* sinkPt = sinkPtFactory_.create(); EbwtSearchParams >* params = - new EbwtSearchParams >(*sinkPt, os_, true, true, true, rangeMode_); + new EbwtSearchParams >(*sinkPt, os_); const bool seeded = false; @@ -214,14 +218,15 @@ class Unpaired23mmAlignerV1Factory : public AlignerFactory { return new UnpairedAlignerV2( params, dr, rchase, - sink_, sinkPtFactory_, sinkPt, os_, rangeMode_, verbose_, - quiet_, INT_MAX, pool_, NULL, NULL); + sink_, sinkPtFactory_, sinkPt, os_, refs_, snpPhred_, + rangeMode_, verbose_, quiet_, INT_MAX, pool_, NULL, NULL); } private: Ebwt >& ebwtFw_; Ebwt >* ebwtBw_; bool two_; + const int snpPhred_; bool doFw_; bool doRc_; HitSink& sink_; @@ -230,6 +235,7 @@ class Unpaired23mmAlignerV1Factory : public AlignerFactory { RangeCache *cacheBw_; const uint32_t cacheLimit_; ChunkPool *pool_; + BitPairReference* refs_; vector >& os_; const bool maqPenalty_; const bool qualOrder_; @@ -252,6 +258,8 @@ class Paired23mmAlignerV1Factory : public AlignerFactory { Paired23mmAlignerV1Factory( Ebwt >& ebwtFw, Ebwt >* ebwtBw, + bool color, + int snpPhred, bool doFw, bool doRc, bool v1, @@ -282,6 +290,8 @@ class Paired23mmAlignerV1Factory : public AlignerFactory { uint32_t seed) : ebwtFw_(ebwtFw), ebwtBw_(ebwtBw), + color_(color), + snpPhred_(snpPhred), doFw_(doFw), doRc_(doRc), v1_(v1), @@ -322,15 +332,15 @@ class Paired23mmAlignerV1Factory : public AlignerFactory { HitSinkPerThread* sinkPt = sinkPtFactory_.createMult(2); HitSinkPerThread* sinkPtSe1 = NULL, * sinkPtSe2 = NULL; EbwtSearchParams >* params = - new EbwtSearchParams >(*sinkPt, os_, true, true, true, rangeMode_); + new EbwtSearchParams >(*sinkPt, os_); EbwtSearchParams >* paramsSe1 = NULL, * paramsSe2 = NULL; if(reportSe_) { sinkPtSe1 = sinkPtFactory_.create(); sinkPtSe2 = sinkPtFactory_.create(); paramsSe1 = - new EbwtSearchParams >(*sinkPtSe1, os_, true, true, true, rangeMode_); + new EbwtSearchParams >(*sinkPtSe1, os_); paramsSe2 = - new EbwtSearchParams >(*sinkPtSe2, os_, true, true, true, rangeMode_); + new EbwtSearchParams >(*sinkPtSe2, os_); } const bool seeded = false; @@ -607,9 +617,9 @@ class Paired23mmAlignerV1Factory : public AlignerFactory { RefAligner >* refAligner; if(two_) { - refAligner = new TwoMMRefAligner >(verbose_, quiet_); + refAligner = new TwoMMRefAligner >(color_, verbose_, quiet_); } else { - refAligner = new ThreeMMRefAligner >(verbose_, quiet_); + refAligner = new ThreeMMRefAligner >(color_, verbose_, quiet_); } // Set up a RangeChaser @@ -626,8 +636,8 @@ class Paired23mmAlignerV1Factory : public AlignerFactory { refAligner, rchase, sink_, sinkPtFactory_, sinkPt, mate1fw_, mate2fw_, peInner_, peOuter_, dontReconcile_, symCeil_, mixedThresh_, - mixedAttemptLim_, refs_, rangeMode_, verbose_, quiet_, - INT_MAX, pool_, NULL); + mixedAttemptLim_, refs_, snpPhred_, rangeMode_, verbose_, + quiet_, INT_MAX, pool_, NULL); delete dr1FwVec; delete dr1RcVec; delete dr2FwVec; @@ -642,8 +652,8 @@ class Paired23mmAlignerV1Factory : public AlignerFactory { sinkPt, sinkPtSe1, sinkPtSe2, mate1fw_, mate2fw_, peInner_, peOuter_, - mixedAttemptLim_, refs_, rangeMode_, verbose_, quiet_, - INT_MAX, pool_, NULL); + mixedAttemptLim_, refs_, snpPhred_, rangeMode_, + verbose_, quiet_, INT_MAX, pool_, NULL); delete dr1FwVec; return al; } @@ -652,6 +662,8 @@ class Paired23mmAlignerV1Factory : public AlignerFactory { private: Ebwt >& ebwtFw_; Ebwt >* ebwtBw_; + bool color_; + int snpPhred_; bool doFw_; bool doRc_; bool v1_; diff --git a/aligner_seed_mm.h b/aligner_seed_mm.h index 46fd937..737a2a3 100644 --- a/aligner_seed_mm.h +++ b/aligner_seed_mm.h @@ -31,6 +31,7 @@ class UnpairedSeedAlignerFactory : public AlignerFactory { uint32_t seedMms, uint32_t seedLen, int qualCutoff, + int snpPhred, int maxBts, HitSink& sink, const HitSinkPerThreadFactory& sinkPtFactory, @@ -38,6 +39,7 @@ class UnpairedSeedAlignerFactory : public AlignerFactory { RangeCache* cacheBw, uint32_t cacheLimit, ChunkPool *pool, + BitPairReference* refs, vector >& os, bool maqPenalty, bool qualOrder, @@ -53,6 +55,7 @@ class UnpairedSeedAlignerFactory : public AlignerFactory { seedMms_(seedMms), seedLen_(seedLen), qualCutoff_(qualCutoff), + snpPhred_(snpPhred), maxBts_(maxBts), sink_(sink), sinkPtFactory_(sinkPtFactory), @@ -60,6 +63,7 @@ class UnpairedSeedAlignerFactory : public AlignerFactory { cacheBw_(cacheBw), cacheLimit_(cacheLimit), pool_(pool), + refs_(refs), os_(os), strandFix_(strandFix), maqPenalty_(maqPenalty), @@ -78,7 +82,7 @@ class UnpairedSeedAlignerFactory : public AlignerFactory { virtual Aligner* create() const { HitSinkPerThread* sinkPt = sinkPtFactory_.create(); EbwtSearchParams >* params = - new EbwtSearchParams >(*sinkPt, os_, true, true, true, rangeMode_); + new EbwtSearchParams >(*sinkPt, os_); int *btCnt = new int[1]; *btCnt = maxBts_; @@ -533,8 +537,9 @@ class UnpairedSeedAlignerFactory : public AlignerFactory { return new UnpairedAlignerV2( params, dr, rchase, - sink_, sinkPtFactory_, sinkPt, os_, rangeMode_, verbose_, - quiet_, maxBts_, pool_, btCnt, metrics_); + sink_, sinkPtFactory_, sinkPt, os_, refs_, snpPhred_, + rangeMode_, verbose_, quiet_, maxBts_, pool_, btCnt, + metrics_); } private: @@ -545,6 +550,7 @@ class UnpairedSeedAlignerFactory : public AlignerFactory { const uint32_t seedMms_; const uint32_t seedLen_; const int qualCutoff_; + const int snpPhred_; const int maxBts_; HitSink& sink_; const HitSinkPerThreadFactory& sinkPtFactory_; @@ -552,6 +558,7 @@ class UnpairedSeedAlignerFactory : public AlignerFactory { RangeCache *cacheBw_; const uint32_t cacheLimit_; ChunkPool *pool_; + BitPairReference* refs_; vector >& os_; bool strandFix_; bool maqPenalty_; @@ -573,12 +580,14 @@ class PairedSeedAlignerFactory : public AlignerFactory { PairedSeedAlignerFactory( Ebwt >& ebwtFw, Ebwt >* ebwtBw, + bool color, bool v1, bool doFw, bool doRc, uint32_t seedMms, uint32_t seedLen, int qualCutoff, + int snpPhred, int maxBts, HitSink& sink, const HitSinkPerThreadFactory& sinkPtFactory, @@ -606,12 +615,14 @@ class PairedSeedAlignerFactory : public AlignerFactory { uint32_t seed) : ebwtFw_(ebwtFw), ebwtBw_(ebwtBw), + color_(color), v1_(v1), doFw_(doFw), doRc_(doRc), seedMms_(seedMms), seedLen_(seedLen), qualCutoff_(qualCutoff), + snpPhred_(snpPhred), maxBts_(maxBts), sink_(sink), sinkPtFactory_(sinkPtFactory), @@ -647,27 +658,27 @@ class PairedSeedAlignerFactory : public AlignerFactory { HitSinkPerThread* sinkPt = sinkPtFactory_.createMult(2); HitSinkPerThread* sinkPtSe1 = NULL, * sinkPtSe2 = NULL; EbwtSearchParams >* params = - new EbwtSearchParams >(*sinkPt, os_, true, true, true, rangeMode_); + new EbwtSearchParams >(*sinkPt, os_); EbwtSearchParams >* paramsSe1 = NULL, * paramsSe2 = NULL; if(reportSe_) { sinkPtSe1 = sinkPtFactory_.create(); sinkPtSe2 = sinkPtFactory_.create(); paramsSe1 = - new EbwtSearchParams >(*sinkPtSe1, os_, true, true, true, rangeMode_); + new EbwtSearchParams >(*sinkPtSe1, os_); paramsSe2 = - new EbwtSearchParams >(*sinkPtSe2, os_, true, true, true, rangeMode_); + new EbwtSearchParams >(*sinkPtSe2, os_); } RefAligner >* refAligner = NULL; int *btCnt = new int[1]; *btCnt = maxBts_; if(seedMms_ == 0) { - refAligner = new Seed0RefAligner >(verbose_, quiet_, seedLen_, qualCutoff_, maqPenalty_); + refAligner = new Seed0RefAligner >(color_, verbose_, quiet_, seedLen_, qualCutoff_, maqPenalty_); } else if(seedMms_ == 1) { - refAligner = new Seed1RefAligner >(verbose_, quiet_, seedLen_, qualCutoff_, maqPenalty_); + refAligner = new Seed1RefAligner >(color_, verbose_, quiet_, seedLen_, qualCutoff_, maqPenalty_); } else if(seedMms_ == 2) { - refAligner = new Seed2RefAligner >(verbose_, quiet_, seedLen_, qualCutoff_, maqPenalty_); + refAligner = new Seed2RefAligner >(color_, verbose_, quiet_, seedLen_, qualCutoff_, maqPenalty_); } else { - refAligner = new Seed3RefAligner >(verbose_, quiet_, seedLen_, qualCutoff_, maqPenalty_); + refAligner = new Seed3RefAligner >(color_, verbose_, quiet_, seedLen_, qualCutoff_, maqPenalty_); } bool do1Fw = true; bool do1Rc = true; @@ -1324,7 +1335,8 @@ class PairedSeedAlignerFactory : public AlignerFactory { refAligner, rchase, sink_, sinkPtFactory_, sinkPt, mate1fw_, mate2fw_, peInner_, peOuter_, dontReconcile_, symCeil_, mixedThresh_, mixedAttemptLim_, refs_, - rangeMode_, verbose_, quiet_, maxBts_, pool_, btCnt); + snpPhred_, rangeMode_, verbose_, quiet_, maxBts_, pool_, + btCnt); delete dr1FwVec; delete dr1RcVec; delete dr2FwVec; @@ -1337,8 +1349,8 @@ class PairedSeedAlignerFactory : public AlignerFactory { new TCostAwareRangeSrcDr(strandFix_, dr1FwVec, verbose_, quiet_, true), refAligner, rchase, sink_, sinkPtFactory_, sinkPt, sinkPtSe1, sinkPtSe2, mate1fw_, mate2fw_, peInner_, peOuter_, - mixedAttemptLim_, refs_, rangeMode_, verbose_, quiet_, maxBts_, - pool_, btCnt); + mixedAttemptLim_, refs_, snpPhred_, rangeMode_, verbose_, + quiet_, maxBts_, pool_, btCnt); delete dr1FwVec; return al; } @@ -1347,12 +1359,14 @@ class PairedSeedAlignerFactory : public AlignerFactory { private: Ebwt >& ebwtFw_; Ebwt >* ebwtBw_; + bool color_; const bool v1_; // whether to use V1 PairedAligner const bool doFw_; const bool doRc_; const uint32_t seedMms_; const uint32_t seedLen_; const int qualCutoff_; + const int snpPhred_; const int maxBts_; HitSink& sink_; const HitSinkPerThreadFactory& sinkPtFactory_; diff --git a/alphabet.c b/alphabet.c index 1b7ae6d..4801da8 100644 --- a/alphabet.c +++ b/alphabet.c @@ -73,3 +73,168 @@ uint8_t rcCharToDna5[] = { /* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; + +/// For converting from ASCII to the Dna5 code where A=0, C=1, G=2, +/// T=3, N=4 +uint8_t asc2col[] = { + /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, + /* - . */ + /* 48 */ 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0 1 2 3 */ + /* 64 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 96 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 112 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/** + * Mapping from ASCII characters to DNA categories: + * + * 0 = invalid - error + * 1 = DNA + * 2 = IUPAC (ambiguous DNA) + * 3 = not an error, but unmatchable; alignments containing this + * character are invalid + */ +uint8_t asc2dnacat[] = { + /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, + /* - */ + /* 48 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 64 */ 0, 1, 2, 1, 2, 0, 0, 1, 2, 0, 0, 2, 0, 2, 2, 0, + /* A B C D G H K M N */ + /* 80 */ 0, 0, 2, 2, 1, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, + /* R S T V W X Y */ + /* 96 */ 0, 1, 2, 1, 2, 0, 0, 1, 2, 0, 0, 2, 0, 2, 2, 0, + /* a b c d g h k m n */ + /* 112 */ 0, 0, 2, 2, 1, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, + /* r s t v w x y */ + /* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/** + * Mapping from ASCII characters for ambiguous nucleotides into masks: + */ +uint8_t asc2dnamask[] = { + /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 48 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 64 */ 0, 1,14, 2,13, 0, 0, 4,11, 0, 0,12, 0, 3,15, 0, + /* A B C D G H K M N */ + /* 80 */ 0, 0, 5, 6, 8, 0, 7, 9, 0,10, 0, 0, 0, 0, 0, 0, + /* R S T V W Y */ + /* 96 */ 0, 1,14, 2,13, 0, 0, 4,11, 0, 0,12, 0, 3,15, 0, + /* a b c d g h k m n */ + /* 112 */ 0, 0, 5, 6, 8, 0, 7, 9, 0,10, 0, 0, 0, 0, 0, 0, + /* r s t v w y */ + /* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/** + * Mapping from ASCII characters for ambiguous nucleotides into masks: + */ +char asc2dnacomp[] = { + /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,'-', 0, 0, + /* 48 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 64 */ 0,'T','V','G','H', 0, 0,'C','D', 0, 0,'M', 0,'K','N', 0, + /* A B C D G H K M N */ + /* 80 */ 0, 0,'Y','S','A', 0,'B','W', 0,'R', 0, 0, 0, 0, 0, 0, + /* R S T V W Y */ + /* 96 */ 0,'T','V','G','H', 0, 0,'C','D', 0, 0,'M', 0,'K','N', 0, + /* a b c d g h k m n */ + /* 112 */ 0, 0,'Y','S','A', 0,'B','W', 0,'R', 0, 0, 0, 0, 0, 0, + /* r s t v w y */ + /* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/** + * Mapping from ASCII characters to color categories: + * + * 0 = invalid - error + * 1 = valid color + * 2 = IUPAC (ambiguous DNA) - there is no such thing for colors to my + * knowledge + * 3 = not an error, but unmatchable; alignments containing this + * character are invalid + */ +uint8_t asc2colcat[] = { + /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, + /* - . */ + /* 48 */ 1, 1, 1, 1, 3, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, + /* 0 1 2 3 4 5 6 7 8 9 */ + /* 64 */ 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* A B C D E F */ + /* 80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 96 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 112 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/** + * Convert a nucleotide and a color to the paired nucleotide. Indexed + * first by nucleotide then by color. Note that this is exactly the + * same as the dinuc2color array. + */ +uint8_t nuccol2nuc[5][5] = { + /* B G O R . */ + /* A */ {0, 1, 2, 3, 4}, + /* C */ {1, 0, 3, 2, 4}, + /* G */ {2, 3, 0, 1, 4}, + /* T */ {3, 2, 1, 0, 4}, + /* N */ {4, 4, 4, 4, 4} +}; + +/** + * Convert a pair of nucleotides to a color. + */ +uint8_t dinuc2color[5][5] = { + /* A */ {0, 1, 2, 3, 4}, + /* C */ {1, 0, 3, 2, 4}, + /* G */ {2, 3, 0, 1, 4}, + /* T */ {3, 2, 1, 0, 4}, + /* N */ {4, 4, 4, 4, 4} +}; diff --git a/alphabet.h b/alphabet.h index 2b143d2..b464ddf 100644 --- a/alphabet.h +++ b/alphabet.h @@ -12,37 +12,27 @@ using namespace std; using namespace seqan; /** - * Helper function to print a uint32_t as a DNA string where each 2-bit - * stretch is a character and more significiant bits appear to the left - * of less singificant bits. - */ -static inline std::string u32ToDna(uint32_t a, int len) { - char buf[17]; // TODO: return a new string; by value I guess - assert_leq(len, 16); - for(int i = 0; i < len; i++) { - buf[len-i-1] = "ACGT"[a & 3]; - a >>= 2; - } - buf[len] = '\0'; - return std::string(buf); -} - -/** * Return a new TStr containing the reverse-complement of s. Ns go to * Ns. */ template -static inline TStr reverseComplement(const TStr& s) { +static inline TStr reverseComplement(const TStr& s, bool color) { typedef typename Value::Type TVal; TStr s_rc; size_t slen = length(s); resize(s_rc, slen); - for(size_t i = 0; i < slen; i++) { - int sv = (int)s[slen-i-1]; - if(sv == 4) { - s_rc[i] = (TVal)4; - } else { - s_rc[i] = (TVal)(sv ^ 3); + if(color) { + for(size_t i = 0; i < slen; i++) { + s_rc[i] = s[slen-i-1]; + } + } else { + for(size_t i = 0; i < slen; i++) { + int sv = (int)s[slen-i-1]; + if(sv == 4) { + s_rc[i] = (TVal)4; + } else { + s_rc[i] = (TVal)(sv ^ 3); + } } } return s_rc; @@ -52,7 +42,7 @@ static inline TStr reverseComplement(const TStr& s) { * Reverse-complement s in-place. Ns go to Ns. */ template -static inline void reverseComplementInPlace(TStr& s, bool color = false) { +static inline void reverseComplementInPlace(TStr& s, bool color) { typedef typename Value::Type TVal; if(color) { reverseInPlace(s); @@ -107,25 +97,6 @@ static inline TStr reverseCopy(const TStr& s) { } /** - * Return the reverse-complement of s. - */ -static inline bool isReverseComplement(const String& s1, - const String& s2) -{ - if(length(s1) != length(s2)) return false; - size_t slen = length(s1); - for(size_t i = 0; i < slen; i++) { - int i1 = (int)s1[i]; - int i2 = (int)s2[slen - i - 1]; - if(i1 == 4) { - if(i2 != 4) return false; - } - else if(i1 != (i2 ^ 3)) return false; - } - return true; -} - -/** * Return true iff the first string is dollar-less-than the second. * This means that we pretend that a 'dollar sign' character, * lexicographically larger than all other characters, exists at the @@ -226,6 +197,56 @@ static inline char comp(char c) { extern uint8_t dna4Cat[]; extern uint8_t charToDna5[]; +extern uint8_t asc2col[]; extern uint8_t rcCharToDna5[]; +/// Convert an ascii char to a DNA category. Categories are: +/// 0 -> invalid +/// 1 -> unambiguous a, c, g or t +/// 2 -> ambiguous +/// 3 -> unmatchable +extern uint8_t asc2dnacat[]; + +/// Convert an ascii char to a color category. Categories are: +/// 0 -> invalid +/// 1 -> unambiguous 0, 1, 2 or 3 +/// 2 -> ambiguous (not applicable for colors) +/// 3 -> unmatchable +extern uint8_t asc2colcat[]; + +/// Convert a 2-bit nucleotide (and 4=N) and a color to the +/// corresponding 2-bit nucleotide +extern uint8_t nuccol2nuc[5][5]; + +/** + * Return true iff c is an unambiguous Dna character. + */ +static inline bool isUnambigDna(char c) { + return asc2dnacat[(int)c] == 1; +} + +/** + * Return true iff c is a Dna character. + */ +static inline bool isDna(char c) { + return asc2dnacat[(int)c] > 0; +} + +/** + * Return true iff c is an unambiguous color character (0,1,2,3). + */ +static inline bool isUnambigColor(char c) { + return asc2colcat[(int)c] == 1; +} + +/** + * Return true iff c is a color character. + */ +static inline bool isColor(char c) { + return asc2colcat[(int)c] > 0; +} + +/// Convert a pair of 2-bit (and 4=N) encoded DNA bases to a color +extern uint8_t dinuc2color[5][5]; + #endif /*ALPHABETS_H_*/ diff --git a/bowtie_inspect.cpp b/bowtie_inspect.cpp index 746e9d6..d491296 100644 --- a/bowtie_inspect.cpp +++ b/bowtie_inspect.cpp @@ -262,12 +262,13 @@ static void driver(const string& ebwtFileBase, const string& query) { print_index_sequence_names(adjustedEbwtFileBase, cout); } else { // Initialize Ebwt object - TPackedEbwt ebwt(adjustedEbwtFileBase, true, -1, -1, + bool color = readEbwtColor(adjustedEbwtFileBase); + TPackedEbwt ebwt(adjustedEbwtFileBase, color, true, -1, -1, false, false, false, true, // no memory-mapped io NULL, // no reference map verbose); - // Load whole index into memory - ebwt.loadIntoMemory(true, false); + // Load whole index into memory + ebwt.loadIntoMemory(-1, true, false); print_index_sequences(cout, ebwt); // Evict any loaded indexes from memory if(ebwt.isInMemory()) { diff --git a/color.cpp b/color.cpp new file mode 100644 index 0000000..ef37f26 --- /dev/null +++ b/color.cpp @@ -0,0 +1,114 @@ +/* + * color.cpp + * + * Created on: Oct 18, 2009 + * Author: Ben Langmead + */ + +#include +#include +#include +#include "color.h" + +using namespace std; + +/** + * Set the console color. + */ +void setConsoleColor(int color) { + cout << (char)0x1B << "[" << 0 << ";" << color + 30 << ";" << 0 + 40 << "m"; +} + +/** + * Set the console color. + */ +void appendConsoleColor(string& s, int color) { + s.push_back((char)0x1B); + s.append("[0;"); + ostringstream ss; + ss << (color + 30); + s.append(ss.str()); + s.append(";40m"); +} + +/** + * Print color character in the appropriate color to the console. + * Console must support color. + */ +void printColor(char color) { + char ch = ' '; + switch(color) { + case 'A': + case '0': + case 0: + setConsoleColor(COLOR_BLUE); + ch = '0'; break; + case 'C': + case '1': + case 1: + setConsoleColor(COLOR_GREEN); + ch = '1'; break; + case 'G': + case '2': + case 2: + setConsoleColor(COLOR_YELLOW); + ch = '2'; break; + case 'T': + case '3': + case 3: + setConsoleColor(COLOR_RED); + ch = '3'; break; + case 'N': + case '4': + case '.': + case 4: + setConsoleColor(COLOR_WHITE); + ch = '.'; break; + default: + setConsoleColor(COLOR_WHITE); + break; + } + cout << ch; + setConsoleColor(COLOR_WHITE); +} + +/** + * Print color character in the appropriate color to the console. + * Console must support color. + */ +void appendColor(string& s, char color) { + char ch = ' '; + switch(color) { + case 'A': + case '0': + case 0: + appendConsoleColor(s, COLOR_BLUE); + ch = '0'; break; + case 'C': + case '1': + case 1: + appendConsoleColor(s, COLOR_GREEN); + ch = '1'; break; + case 'G': + case '2': + case 2: + appendConsoleColor(s, COLOR_YELLOW); + ch = '2'; break; + case 'T': + case '3': + case 3: + appendConsoleColor(s, COLOR_RED); + ch = '3'; break; + case 'N': + case '4': + case '.': + case 4: + appendConsoleColor(s, COLOR_WHITE); + ch = '.'; break; + default: + appendConsoleColor(s, COLOR_WHITE); + break; + } + s.push_back(ch); + appendConsoleColor(s, COLOR_WHITE); +} diff --git a/color.h b/color.h new file mode 100644 index 0000000..27be4d9 --- /dev/null +++ b/color.h @@ -0,0 +1,27 @@ +/* + * color.h + * + * Created on: Oct 18, 2009 + * Author: Ben Langmead + */ + +#ifndef COLOR_H_ +#define COLOR_H_ + +#include + +enum { + COLOR_RED = 1, + COLOR_GREEN, + COLOR_YELLOW, + COLOR_BLUE, + COLOR_WHITE = 7 +}; + + +void appendConsoleColor(std::string& s, int color); +void setConsoleColor(int color); +void appendColor(std::string& s, char color); +void printColor(char color); + +#endif /* COLOR_H_ */ diff --git a/color_dec.cpp b/color_dec.cpp new file mode 100644 index 0000000..2e35e87 --- /dev/null +++ b/color_dec.cpp @@ -0,0 +1,351 @@ +/* + * color_dec.cpp + * + * Created on: October 15, 2009 + * Author: Ben Langmead + */ + +#include +#include +#include +#include "alphabet.h" +#include "color_dec.h" +#include "color.h" +#include "qual.h" + +using namespace std; + +// 4-bit pop count +static int alts[] = { + -1, 1, 1, 2, 1, 2, 2, 3, + 1, 2, 2, 3, 2, 3, 3, 4 +}; +static int firsts[] = { + -1, 0, 1, 0, 2, 0, 1, 0, + 3, 0, 1, 0, 2, 0, 1, 0 +}; + +/** + * Given a nucleotide mask, pick one matching nucleotide at random. + */ +static int randFromMask(int mask) { + assert_gt(mask, 0); + if(alts[mask] == 1) return firsts[mask]; + assert_gt(mask, 0); + assert_lt(mask, 16); + int r = rand() % alts[mask]; + assert_geq(r, 0); + assert_lt(r, alts[mask]); + for(int i = 0; i < 4; i++) { + if((mask & (1 << i)) != 0) { + if(r == 0) return i; + r--; + } + } + cerr << "Shouldn't get here" << endl; + throw 1; + return -1; +} + +/** + * Does a 2-bit-encoded base match any bit in a mask? + */ +static inline bool matches(int i, int j) { + return ((1 << i) & j) != 0; +} + +/** + * Given the dynamic programming table, trace backwards from the last + * column and populate the 's' and 'cmm' strings accordingly. Whenever + * there are multiple equally good ways of backtracking, choose one at + * random. + */ +static void backtrack(int table[4][6][1025], // filled-in table + const char *read, size_t readi, size_t readf, + const char *ref, size_t refi, size_t reff, + char *s, // final nucleotide string + char *cmm, // color mismatches + char *nmm, // nucleotide mismatches + int& cmms, // # color mismatches + int& nmms) // # nucleotide mismatches +{ + const size_t len = reff-refi; + char buf[1025]; + cmms = nmms = 0; + int min = INT_MAX; + int bests = 0; + // Determine best base in final column of table + for(int i = 0; i < 4; i++) { + // Install minimum and backtrack info + int m = table[i][4][len-1]; + if(m < min) { + min = m; + bests = (1 << i); + } else if(m == min) { + bests |= (1 << i); + } + } + // i <- position of rightmost nucleotide + int i = (int)len-1; + // to <- rightmost nucleotide + int to = randFromMask(bests); + while(true) { + bests = table[to][5][i]; // get next best mask + s[i--] = to; // install best nucleotide + if(i < 0) break; // done + assert_gt(bests, 0); + assert_lt(bests, 16); + to = randFromMask(bests); // select + } + // Determine what reference nucleotides were matched against + for(size_t i = 0; i < len; i++) { + if(matches(s[i], ref[refi+i])) { + buf[i] = s[i]; + assert_eq(1, alts[(int)ref[refi+i]]); + // Just plain matched + nmm[i] = 'M'; + } else { + // If ref is ambiguous here, does it matter which one we + // choose? I don't think so. + assert_eq(1, alts[(int)ref[refi+i]]); + //buf[i] = randFromMask(ref[refi+i]); + buf[i] = firsts[(int)ref[refi+i]]; + // SNP here + nmm[i] = 'S'; + nmms++; + } + } + for(size_t i = 0; i < len-1; i++) { + int c1 = (int)read[readi+i]; // actual + int c2 = dinuc2color[(int)buf[i]][(int)buf[i+1]]; // decoded + assert_leq(c1, 4); assert_geq(c1, 0); + if(c1 != c2 || c1 == 4) { + // Actual != decoded + assert_lt(c2, 4); assert_geq(c2, 0); + cmm[i] = "0123."[c2]; + cmms++; + } else { + cmm[i] = 'M'; + } + } + // done +} + +/** + * Decode the colorspace read 'read' as aligned against the reference + * string 'ref', assuming that it's a hit. + */ +void decodeHit( + const char *read, // ASCII colors, '0', '1', '2', '3', '.' + const char *qual, // ASCII quals, Phred+33 encoded + size_t readi, // offset of first character within 'read' to consider + size_t readf, // offset of last char (exclusive) in 'read' to consider + const char *ref, // reference sequence, as masks + size_t refi, // offset of first character within 'ref' to consider + size_t reff, // offset of last char (exclusive) in 'ref' to consider + int snpPhred, // penalty incurred by a SNP + char *ns, // decoded nucleotides are appended here + char *cmm, // where the color mismatches are in the string + char *nmm, // where nucleotide mismatches are in the string + int& cmms, // number of color mismatches + int& nmms) // number of nucleotide mismatches +{ + assert_lt(refi, reff); + assert_lt(readi, readf); + assert_eq(reff-refi-1, readf-readi); + + // + // Dynamic programming table; good for colorspace reads up to 1024 + // colors in length. + // + int table[4][6][1025]; + // 0 -> A, 1 -> C, 2 -> G, 3 -> T, 4 -> min(A, C, G, T), + // 5 -> backtrack mask, 6 -> min mismatches + + // The first column of the table just considers the first + // nucleotide and whether it matches the ref nucleotide. + for(int to = 0; to < 4; to++) { + if(matches(to, ref[refi])) { + // The assigned subject nucleotide matches the reference; + // no penalty + table[to][0][0] = 0; + table[to][1][0] = 0; + table[to][2][0] = 0; + table[to][3][0] = 0; + table[to][4][0] = 0; + table[to][5][0] = 15; + } else { + // The assigned subject nucleotide does not match the + // reference nucleotide, so we add a SNP penalty + table[to][0][0] = snpPhred; + table[to][1][0] = snpPhred; + table[to][2][0] = snpPhred; + table[to][3][0] = snpPhred; + table[to][4][0] = snpPhred; + table[to][5][0] = 15; + } + } + + // Successive columns examine successive alignment positions + int omin = INT_MAX, t = 0; + int lastOmin = INT_MAX; + for(size_t c = readi; c < readf; c++) { + const int readc = (int)read[c]; + assert_leq(readc, 4); + assert_geq(readc, 0); + lastOmin = omin; + omin = INT_MAX; + // t <- index of column in dynamic programming table + t = c - readi + 1; + const int refc = ref[refi + t]; + int from[] = { table[0][4][t-1], table[1][4][t-1], + table[2][4][t-1], table[3][4][t-1] }; + // For each downstream nucleotide + for(int to = 0; to < 4; to++) { + // For each upstream nucleotide + int min = INT_MAX; + const int goodfrom = nuccol2nuc[to][readc]; + int q = qual[c]; + // Reward the preceding position + if(goodfrom < 4) from[goodfrom] -= q; + min = from[0]; + table[to][5][t] = 1; + if(from[1] < min) { + min = from[1]; + table[to][5][t] = 2; + } else if(from[1] == min) { + table[to][5][t] |= 2; + } + if(from[2] < min) { + min = from[2]; + table[to][5][t] = 4; + } else if(from[2] == min) { + table[to][5][t] |= 4; + } + if(from[3] < min) { + min = from[3]; + table[to][5][t] = 8; + } else if(from[3] == min) { + table[to][5][t] |= 8; + } + min += q; + if(!matches(to, refc)) { + min += snpPhred; + } + table[to][4][t] = min; + if(min < omin) omin = min; + if(goodfrom < 4) from[goodfrom] += q; + } + } + + t++; + assert_eq(t, (int)(reff - refi)); + // Install the best backward path into ns, cmm, nmm + backtrack(table, + read, readi, readi + t - 1, + ref, refi, refi + t, + ns, cmm, nmm, cmms, nmms); +} + +#ifdef MAIN_COLOR_DEC + +#include +#include + +static const char *short_opts = "s:m:r:e:"; +static struct option long_opts[] = { + {(char*)"snppen", required_argument, 0, 's'}, + {(char*)"misspen", required_argument, 0, 'm'}, + {(char*)"seed", required_argument, 0, 'r'}, + {(char*)"maxpen", required_argument, 0, 'e'} +}; + +template +T parse(const char *s) { + T tmp; + stringstream ss(s); + ss >> tmp; + return tmp; +} + +int main(int argc, char **argv) { + int option_index = 0; + int next_option; + int snppen = 30; + int misspen = 20; + int maxPenalty = 70; + unsigned seed = 0; + do { + next_option = getopt_long(argc, argv, short_opts, long_opts, &option_index); + switch (next_option) { + case 's': snppen = parse(optarg); break; + case 'm': misspen = parse(optarg); break; + case 'r': seed = parse(optarg); break; + case 'e': maxPenalty = parse(optarg); break; + case -1: break; + default: { + cerr << "Unknown option: " << (char)next_option << endl; + exit(1); + } + } + } while(next_option != -1); + srand(seed); + if(argc - optind < 2) { + cerr << "Not enough options" << endl; + exit(1); + } + string read, ref; + read = argv[optind]; + for(size_t i = 0; i < read.length(); i++) { + read[i] = asc2col[(int)read[i]]; + assert_leq(read[i], 4); + assert_geq(read[i], 0); + } + ref = argv[optind+1]; + for(size_t i = 0; i < ref.length(); i++) { + int num = 0; + int alts[] = {4, 4, 4, 4}; + decodeNuc(toupper(ref[i]), num, alts); + assert_leq(num, 4); + assert_gt(num, 0); + ref[i] = 0; + for(int j = 0; j < num; j++) { + ref[i] |= (1 << alts[j]); + } + } + string ns; + string quals; + quals.resize(read.length(), misspen); + string cmm, nmm; + int score = decode(read, quals, 0, read.length(), + ref, 0, ref.length(), maxPenalty, + snppen, ns, cmm, nmm); + cout << " Score: " << score << " (max: " << maxPenalty << ")" << endl; + cout << " MMs: "; + for(size_t i = 0; i < cmm.length(); i++) { + cout << cmm[i] << " "; + } + cout << endl; + cout << "Colors: "; + for(size_t i = 0; i < read.length(); i++) { + printColor((int)read[i]); + cout << " "; + } + cout << endl; + cout << " Bases: "; + for(size_t i = 0; i < ns.length(); i++) { + cout << "ACGTN"[(int)ns[i]] << " "; + } + cout << endl; + cout << " Ref: "; + for(size_t i = 0; i < ref.length(); i++) { + cout << mask2iupac[(int)ref[i]] << " "; + } + cout << endl; + cout << " MMs: "; + for(size_t i = 0; i < ref.length(); i++) { + cout << nmm[i] << " "; + } + cout << endl; +} +#endif diff --git a/color_dec.h b/color_dec.h new file mode 100644 index 0000000..f9f1222 --- /dev/null +++ b/color_dec.h @@ -0,0 +1,31 @@ +/* + * color_dec.h + * + * Created on: Oct 14, 2009 + * Author: Ben Langmead + */ + +#ifndef COLOR_DEC_H_ +#define COLOR_DEC_H_ + +#include +#include +#include +#include "alphabet.h" + +void decodeHit( + const char *read, // ASCII colors, '0', '1', '2', '3', '.' + const char *qual, // ASCII quals, Phred+33 encoded + size_t readi, // offset of first character within 'read' to consider + size_t readf, // offset of last char (exclusive) in 'read' to consider + const char *ref, // reference sequence, as masks + size_t refi, // offset of first character within 'ref' to consider + size_t reff, // offset of last char (exclusive) in 'ref' to consider + int snpPhred, // penalty incurred by a SNP + char *ns, // decoded nucleotides are appended here + char *cmm, // where the color mismatches are in the string + char *nmm, // where nucleotide mismatches are in the string + int& cmms, // number of color mismatches + int& nmms);// number of nucleotide mismatches + +#endif /* COLOR_DEC_H_ */ diff --git a/ebwt.h b/ebwt.h index 55509a6..7c2f1ec 100644 --- a/ebwt.h +++ b/ebwt.h @@ -36,6 +36,8 @@ #include "mm.h" #include "timer.h" #include "refmap.h" +#include "color_dec.h" +#include "reference.h" using namespace std; using namespace seqan; @@ -67,6 +69,13 @@ if(this->verbose()) { \ #endif /** + * Flags describing type of Ebwt. + */ +enum EBWT_FLAGS { + EBWT_COLOR = 2 // true -> Ebwt is colorspace +}; + +/** * Extended Burrows-Wheeler transform header. This together with the * actual data arrays and other text-specific parameters defined in * class Ebwt constitute the entire Ebwt. @@ -81,19 +90,22 @@ class EbwtParams { int32_t linesPerSide, int32_t offRate, int32_t isaRate, - int32_t ftabChars) + int32_t ftabChars, + bool color) { - init(len, lineRate, linesPerSide, offRate, isaRate, ftabChars); + init(len, lineRate, linesPerSide, offRate, isaRate, ftabChars, color); } EbwtParams(const EbwtParams& eh) { init(eh._len, eh._lineRate, eh._linesPerSide, eh._offRate, - eh._isaRate, eh._ftabChars); + eh._isaRate, eh._ftabChars, eh._color); } void init(uint32_t len, int32_t lineRate, int32_t linesPerSide, - int32_t offRate, int32_t isaRate, int32_t ftabChars) + int32_t offRate, int32_t isaRate, int32_t ftabChars, + bool color) { + _color = color; _len = len; _bwtLen = _len + 1; _sz = (len+3)/4; @@ -155,6 +167,7 @@ class EbwtParams { uint32_t numLines() const { return _numLines; } uint32_t ebwtTotLen() const { return _ebwtTotLen; } uint32_t ebwtTotSz() const { return _ebwtTotSz; } + bool color() const { return _color; } /** * Set a new suffix-array sampling rate, which involves updating @@ -162,9 +175,9 @@ class EbwtParams { */ void setOffRate(int __offRate) { _offRate = __offRate; - _offMask = 0xffffffff << _offRate; - _offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate; - _offsSz = _offsLen*4; + _offMask = 0xffffffff << _offRate; + _offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate; + _offsSz = _offsLen*4; } /** @@ -173,9 +186,9 @@ class EbwtParams { */ void setIsaRate(int __isaRate) { _isaRate = __isaRate; - _isaMask = 0xffffffff << _isaRate; - _isaLen = (_bwtLen + (1 << _isaRate) - 1) >> _isaRate; - _isaSz = _isaLen*4; + _isaMask = 0xffffffff << _isaRate; + _isaLen = (_bwtLen + (1 << _isaRate) - 1) >> _isaRate; + _isaSz = _isaLen*4; } /// Check that this EbwtParams is internally consistent @@ -227,22 +240,22 @@ class EbwtParams { << " ebwtTotSz: " << _ebwtTotSz << endl; } - uint32_t _len; - uint32_t _bwtLen; - uint32_t _sz; - uint32_t _bwtSz; - int32_t _lineRate; - int32_t _linesPerSide; - int32_t _origOffRate; - int32_t _offRate; - uint32_t _offMask; - int32_t _isaRate; - uint32_t _isaMask; - int32_t _ftabChars; - uint32_t _eftabLen; - uint32_t _eftabSz; - uint32_t _ftabLen; - uint32_t _ftabSz; + uint32_t _len; + uint32_t _bwtLen; + uint32_t _sz; + uint32_t _bwtSz; + int32_t _lineRate; + int32_t _linesPerSide; + int32_t _origOffRate; + int32_t _offRate; + uint32_t _offMask; + int32_t _isaRate; + uint32_t _isaMask; + int32_t _ftabChars; + uint32_t _eftabLen; + uint32_t _eftabSz; + uint32_t _ftabLen; + uint32_t _ftabSz; uint32_t _offsLen; uint32_t _offsSz; uint32_t _isaLen; @@ -256,6 +269,7 @@ class EbwtParams { uint32_t _numLines; uint32_t _ebwtTotLen; uint32_t _ebwtTotSz; + bool _color; }; /** @@ -341,6 +355,7 @@ class Ebwt { /// Construct an Ebwt from the given input file Ebwt(const string& in, + int color, bool __fw, int32_t __overrideOffRate = -1, int32_t __overrideIsaRate = -1, @@ -362,7 +377,7 @@ class Ebwt { useShmem_ = useShmem; _in1Str = in + ".1.ebwt"; _in2Str = in + ".2.ebwt"; - readIntoMemory(true, &_eh, mmSweep, loadNames, startVerbose); + readIntoMemory(color, true, &_eh, mmSweep, loadNames, startVerbose); // If the offRate has been overridden, reflect that in the // _eh._offRate field if(_overrideOffRate > _eh._offRate) { @@ -381,7 +396,8 @@ class Ebwt { /// vector, optionally using a blockwise suffix sorter with the /// given 'bmax' and 'dcv' parameters. The string vector is /// ultimately joined and the joined string is passed to buildToDisk(). - Ebwt(int32_t lineRate, + Ebwt(int color, + int32_t lineRate, int32_t linesPerSide, int32_t offRate, int32_t isaRate, @@ -405,7 +421,7 @@ class Ebwt { bool sanityCheck = false) : Ebwt_INITS Ebwt_STAT_INITS, - _eh(joinedLen(szs), lineRate, linesPerSide, offRate, isaRate, ftabChars) + _eh(joinedLen(szs), lineRate, linesPerSide, offRate, isaRate, ftabChars, color) { _in1Str = file + ".1.ebwt"; _in2Str = file + ".2.ebwt"; @@ -466,7 +482,7 @@ class Ebwt { if(_sanity) { VMSG_NL("Sanity-checking Ebwt"); assert(!isInMemory()); - readIntoMemory(false, NULL, false, true, false); + readIntoMemory(color, false, NULL, false, true, false); sanityCheckAll(); evictFromMemory(); assert(!isInMemory()); @@ -763,8 +779,8 @@ class Ebwt { * Load this Ebwt into memory by reading it in from the _in1 and * _in2 streams. */ - void loadIntoMemory(bool loadNames, bool verbose) { - readIntoMemory(false, NULL, false, loadNames, verbose); + void loadIntoMemory(int color, bool loadNames, bool verbose) { + readIntoMemory(color, false, NULL, false, loadNames, verbose); } /** @@ -960,7 +976,7 @@ class Ebwt { void buildToDisk(InorderBlockwiseSA& sa, const TStr& s, ostream& out1, ostream& out2); // I/O - void readIntoMemory(bool justHeader, EbwtParams *params, bool mmSweep, bool loadNames, bool startVerbose); + void readIntoMemory(int color, bool justHeader, EbwtParams *params, bool mmSweep, bool loadNames, bool startVerbose); void writeFromMemory(bool justHeader, ostream& out1, ostream& out2) const; void writeFromMemory(bool justHeader, const string& out1, const string& out2) const; @@ -970,12 +986,12 @@ class Ebwt { void sanityCheckUpToSide(int upToSide) const; void sanityCheckAll() const; void restore(TStr& s) const; - void checkOrigs(const vector >& os, bool mirror) const; + void checkOrigs(const vector >& os, bool color, bool mirror) const; // Searching and reporting void joinedToTextOff(uint32_t qlen, uint32_t off, uint32_t& tidx, uint32_t& textoff, uint32_t& tlen) const; - inline bool report(const String& query, String* quals, String* name, const std::vector& mmui32, const std::vector& refcs, size_t numMms, uint32_t off, uint32_t top, uint32_t bot, uint32_t qlen, int stratum, uint16_t cost, const EbwtSearchParams& params) const; - inline bool reportChaseOne(const String& query, String* quals, String* name, const std::vector& mmui32, const std::vector& refcs, size_t numMms, uint32_t i, uint32_t top, uint32_t bot, uint32_t qlen, int stratum, uint16_t cost, const EbwtSearchParams& params, SideLocus *l = NULL) const; + inline bool report(const String& query, String* quals, String* name, bool color, int snpPhred, const BitPairReference* ref, const std::vector& mmui32, const std::vector& refcs, size_t numMms, uint32_t off, uint32_t top, uint32_t bot, uint32_t qlen, int stratum, uint16_t cost, const EbwtSearchParams& params) const; + inline bool reportChaseOne(const String& query, String* quals, String* name, bool color, int snpPhred, const BitPairReference* ref, const std::vector& mmui32, const std::vector& refcs, size_t numMms, uint32_t i, uint32_t top, uint32_t bot, uint32_t qlen, int stratum, uint16_t cost, const EbwtSearchParams& params, SideLocus *l = NULL) const; inline bool reportReconstruct(const String& query, String* quals, String* name, String& lbuf, String& rbuf, const uint32_t *mmui32, const char* refcs, size_t numMms, uint32_t i, uint32_t top, uint32_t bot, uint32_t qlen, int stratum, const EbwtSearchParams& params, SideLocus *l = NULL) const; inline int rowL(const SideLocus& l) const; inline uint32_t countUpTo(const SideLocus& l, int c) const; @@ -1106,29 +1122,29 @@ bool Ebwt::isPacked() { template class EbwtSearchParams { public: - EbwtSearchParams(HitSinkPerThread& __sink, - const vector >& __texts, - bool __revcomp = true, - bool __fw = true, - bool __ebwtFw = true, - bool __arrowMode = false) : - _sink(__sink), - _texts(__texts), + EbwtSearchParams(HitSinkPerThread& sink, + const vector >& texts, + bool fw = true, + bool ebwtFw = true) : + _sink(sink), + _texts(texts), _patid(0xffffffff), - _revcomp(__revcomp), - _fw(__fw), - _arrowMode(__arrowMode) { } - HitSinkPerThread& sink() const { return _sink; } - void setPatId(uint32_t __patid) { _patid = __patid; } - uint32_t patId() const { return _patid; } - void setFw(bool __fw) { _fw = __fw; } - bool fw() const { return _fw; } + _fw(fw) { } + + HitSinkPerThread& sink() const { return _sink; } + void setPatId(uint32_t patid) { _patid = patid; } + uint32_t patId() const { return _patid; } + void setFw(bool fw) { _fw = fw; } + bool fw() const { return _fw; } /** * Report a hit. Returns true iff caller can call off the search. */ bool reportHit(const String& query, // read sequence String* quals, // read quality values String* name, // read name + bool color, // true -> read is colorspace + int snpPhred, // penalty for a SNP + const BitPairReference* ref, // reference (= NULL if not necessary) const ReferenceMap* rmap, // map to another reference coordinate system bool ebwtFw, // whether index is forward (true) or mirror (false) const std::vector& mmui32, // mismatch list @@ -1140,115 +1156,176 @@ class EbwtSearchParams { uint16_t mlen, // mate length U32Pair a, // arrow pair uint32_t tlen, // length of text - uint32_t len, // length of query + uint32_t qlen, // length of query int stratum, // alignment stratum uint16_t cost, // cost of alignment uint32_t oms, // approx. # other valid alignments uint32_t patid, uint8_t mate) const { - // The search functions should not have allowed us to get here +#ifndef NDEBUG + // Check that no two elements of the mms array are the same + for(size_t i = 0; i < numMms; i++) { + for(size_t j = i+1; j < numMms; j++) { + assert_neq(mmui32[i], mmui32[j]); + } + } +#endif + // If ebwtFw is true, then 'query' and 'quals' are reversed + // If _fw is false, then 'query' and 'quals' are reverse complemented + assert(!color || ref != NULL); + assert(quals != NULL); + assert(name != NULL); assert_eq(mmui32.size(), refcs.size()); assert_leq(numMms, mmui32.size()); - String pat; - uint32_t qlen = length(query); assert_gt(qlen, 0); - reserve(pat, qlen); - // Report it using the HitSinkPerThread Hit hit; hit.stratum = stratum; hit.cost = cost; - if(patid == 0xffffffff) patid = _patid; - // Make a copy of the read name - if(name != NULL) assign(hit.patName, *name); hit.patSeq = query; - if(quals != NULL) { - hit.quals = *quals; - } + hit.quals = *quals; if(!ebwtFw) { - // Note: this is re-reversing the pattern and the quals - // string back to their normal orientation; the pattern - // source reversed it initially + // Re-reverse the pattern and the quals back to how they + // appeared in the read file ::reverseInPlace(hit.patSeq); ::reverseInPlace(hit.quals); } -#ifndef NDEBUG - // Check that no two elements of the mms array are the same - for(size_t i = 0; i < numMms; i++) { - for(size_t j = i+1; j < numMms; j++) { - assert_neq(mmui32[i], mmui32[j]); + if(color) { + hit.colSeq = hit.patSeq; + hit.crefcs.resize(qlen, 0); + // Turn the mmui32 and refcs arrays into the mm FixedBitset and + // the refc vector + for(size_t i = 0; i < numMms; i++) { + if (ebwtFw != _fw) { + // The 3' end is on the left but the mm vector encodes + // mismatches w/r/t the 5' end, so we flip + uint32_t off = qlen - mmui32[i] - 1; + hit.cmms.set(off); + hit.crefcs[off] = refcs[i]; + } else { + hit.cmms.set(mmui32[i]); + hit.crefcs[i] = refcs[i]; + } } - } -#endif - // Turn the mmui32 and refcs arrays into the mm FixedBitset and - // the refc vector - hit.refcs.resize(qlen, 0); - for(size_t i = 0; i < numMms; i++) { - if (ebwtFw != _fw) { - // The 3' end is on the left but the mm vector encodes - // mismatches w/r/t the 5' end, so we flip - uint32_t off = len - mmui32[i] - 1; - hit.mms.set(off); - hit.refcs[off] = refcs[i]; + assert(ref != NULL); + char read[1024]; + char rf[1024]; + char qual[1024]; + char ns[1024]; + char cmm[1024]; + char nmm[1024]; + int cmms = 0; + int nmms = 0; + // TODO: account for indels when calculating these bounds + size_t readi = 0; + size_t readf = seqan::length(hit.patSeq); + size_t refi = 0; + size_t reff = readf + 1; + // The "Phred-scaled probability of a mutation" a la Maq/BWA + bool maqRound = false; + for(size_t i = 0; i < qlen + 1; i++) { + if(i < qlen) { + read[i] = (int)hit.patSeq[i]; + qual[i] = mmPenalty(maqRound, phredCharToPhredQual(hit.quals[i])); + } + int rc = ref->getBase(h.first, h.second + i); + assert_geq(rc, 0); + assert_lt(rc, 4); + rf[i] = (1 << rc); } - else { - hit.mms.set(mmui32[i]); - hit.refcs[mmui32[i]] = refcs[i]; + decodeHit( + read, // ASCII colors, '0', '1', '2', '3', '.' + qual, // ASCII quals, Phred+33 encoded + readi, // offset of first character within 'read' to consider + readf, // offset of last char (exclusive) in 'read' to consider + rf, // reference sequence, as masks + refi, // offset of first character within 'ref' to consider + reff, // offset of last char (exclusive) in 'ref' to consider + snpPhred, // penalty incurred by a SNP + ns, // decoded nucleotides are appended here + cmm, // where the color mismatches are in the string + nmm, // where nucleotide mismatches are in the string + cmms, // number of color mismatches + nmms);// number of nucleotide mismatches + seqan::resize(hit.patSeq, qlen+1); + seqan::resize(hit.quals, qlen+1); + hit.refcs.resize(qlen+1, 0); + for(size_t i = 0; i < qlen + 1; i++) { + // Set sequence character + assert_leq(ns[i], 4); + assert_geq(ns[i], 0); + hit.patSeq[i] = (Dna5)(int)ns[i]; + // Set initial quality + hit.quals[i] = '!'; + // Color mismatches penalize quality + if(i > 0) { + if(cmm[i-1] == 'M') hit.quals[i] += qual[i-1]; + else hit.quals[i] -= qual[i-1]; + } + if(i < qlen) { + if(cmm[i] == 'M') hit.quals[i] += qual[i]; + else hit.quals[i] -= qual[i]; + } + if(hit.quals[i] < '!') hit.quals[i] = '!'; + if(nmm[i] != 'M') { + uint32_t off = i; + if(!_fw) off = (qlen+1) - i - 1; + assert_lt(off, qlen+1); + hit.mms.set(off); + hit.refcs[off] = "ACGT"[ref->getBase(h.first, h.second+i)]; + } + } + qlen++; + mlen++; + } else { + // Turn the mmui32 and refcs arrays into the mm FixedBitset and + // the refc vector + hit.refcs.resize(qlen, 0); + for(size_t i = 0; i < numMms; i++) { + if (ebwtFw != _fw) { + // The 3' end is on the left but the mm vector encodes + // mismatches w/r/t the 5' end, so we flip + uint32_t off = qlen - mmui32[i] - 1; + hit.mms.set(off); + hit.refcs[off] = refcs[i]; + } else { + hit.mms.set(mmui32[i]); + hit.refcs[mmui32[i]] = refcs[i]; + } } } // Check the hit against the original text, if it's available - if(_texts.size() > 0 && !_arrowMode) { + if(_texts.size() > 0) { assert_lt(h.first, _texts.size()); - FixedBitset diffs; + FixedBitset<1024> diffs; // This type of check assumes that only mismatches are // possible. If indels are possible, then we either need // the caller to provide information about indel locations, // or we need to extend this to a more complicated check. - assert_leq(h.second + len, length(_texts[h.first])); - for(size_t i = 0; i < len; i++) { + assert_leq(h.second + qlen, length(_texts[h.first])); + for(size_t i = 0; i < qlen; i++) { assert_neq(4, (int)_texts[h.first][h.second + i]); - if(ebwtFw) { - // Forward pattern appears at h - if((int)query[i] != (int)_texts[h.first][h.second + i]) { - uint32_t qoff = i; - // if ebwtFw != _fw the 3' end is on on the - // left end of the pattern, but the diff vector - // should encode mismatches w/r/t the 5' end, - // so we flip - if (ebwtFw != _fw) diffs.set(len - qoff - 1); - else diffs.set(qoff); - } - } else { - // Reverse of pattern appears at h - if((int)query[len-i-1] != (int)_texts[h.first][h.second + i]) { - uint32_t qoff = len-i-1; - // if ebwtFw != _fw the 3' end is on on the - // left end of the pattern, but the diff vector - // should encode mismatches w/r/t the 5' end, - // so we flip - if (ebwtFw != _fw) diffs.set(len - qoff - 1); - else diffs.set(qoff); - } + // Forward pattern appears at h + if((int)hit.patSeq[i] != (int)_texts[h.first][h.second + i]) { + uint32_t qoff = i; + // if ebwtFw != _fw the 3' end is on on the + // left end of the pattern, but the diff vector + // should encode mismatches w/r/t the 5' end, + // so we flip + if (_fw) diffs.set(qoff); + else diffs.set(qlen - qoff - 1); } } if(diffs != hit.mms) { // Oops, mismatches were not where we expected them; // print a diagnostic message before asserting cerr << "Expected " << hit.mms.str() << " mismatches, got " << diffs.str() << endl; - cerr << " Pat: "; - for(size_t i = 0; i < len; i++) { - if(ebwtFw) cerr << query[i]; - else cerr << query[len-i-1]; - } - cerr << endl; + cerr << " Pat: " << hit.patSeq << endl; cerr << " Tseg: "; - for(size_t i = 0; i < len; i++) { + for(size_t i = 0; i < qlen; i++) { cerr << _texts[h.first][h.second + i]; } cerr << endl; - if(length(_texts[h.first]) < 80) { - cerr << " Text: " << _texts[h.first] << endl; - } cerr << " mmui32: "; for(size_t i = 0; i < numMms; i++) { cerr << mmui32[i] << " "; @@ -1259,35 +1336,25 @@ class EbwtSearchParams { } if(diffs != hit.mms) assert(false); } - hit.h = _arrowMode? a : h; - if(!_arrowMode && rmap != NULL) { - rmap->map(hit.h); - } - hit.patId = patid; - if(name != NULL) hit.patName = *name; + hit.h = h; + if(rmap != NULL) rmap->map(hit.h); + hit.patId = ((patid == 0xffffffff) ? _patid : patid); + hit.patName = *name; hit.mh = mh; hit.fw = _fw; hit.mfw = mfw; hit.mlen = mlen; hit.oms = oms; hit.mate = mate; + hit.color = color; return sink().reportHit(hit, stratum); } - bool arrowMode() const { - return _arrowMode; - } - void setArrowMode(bool a) { - _arrowMode = a; - } - const vector >& texts() const { return _texts; } private: HitSinkPerThread& _sink; - const vector >& _texts; // original texts, if available (if not - // available, _texts.size() == 0) + const vector >& _texts; // original texts, if available (if not + // available, _texts.size() == 0) uint32_t _patid; // id of current read - bool _revcomp; // whether reverse complements are enabled bool _fw; // current read is forward-oriented - bool _arrowMode; // report arrows }; /** @@ -2285,6 +2352,9 @@ template inline bool Ebwt::report(const String& query, String* quals, String* name, + bool color, + int snpPhred, + const BitPairReference* ref, const std::vector& mmui32, const std::vector& refcs, size_t numMms, @@ -2298,32 +2368,6 @@ inline bool Ebwt::report(const String& query, { VMSG_NL("In report"); assert_lt(off, this->_eh._len); - if(params.arrowMode()) { - // Call reportHit with a bogus genome position; in this mode, - // all we care about are the top and bottom arrows - return params.reportHit( - query, // read sequence - quals, // read quality values - name, // read name - rmap_, // map to another reference coordinate system - _fw, // true = index is forward; false = mirror - mmui32, // mismatch positions - refcs, // reference characters for mms - numMms, // # mismatches - make_pair(0, 0), // (bogus) position - make_pair(0, 0), // (bogus) mate position - true, // (bogus) mate orientation - 0, // (bogus) mate length - make_pair(top, bot), // arrows - 0, // (bogus) textlen - qlen, // qlen - stratum, // alignment stratum - cost, // cost, including stratum & quality penalty - bot-top-1, // # other hits - 0xffffffff, // pattern id - 0); // mate (0 = unpaired) - } - uint32_t tidx; uint32_t textoff; uint32_t tlen; @@ -2335,6 +2379,9 @@ inline bool Ebwt::report(const String& query, query, // read sequence quals, // read quality values name, // read name + color, // true -> read is colorspace + snpPhred, // phred probability of SNP + ref, // reference sequence rmap_, // map to another reference coordinate system _fw, // true = index is forward; false = mirror mmui32, // mismatch positions @@ -2367,6 +2414,9 @@ template inline bool Ebwt::reportChaseOne(const String& query, String* quals, String* name, + bool color, + int snpPhred, + const BitPairReference* ref, const std::vector& mmui32, const std::vector& refcs, size_t numMms, @@ -2380,7 +2430,6 @@ inline bool Ebwt::reportChaseOne(const String& query, SideLocus *l) const { VMSG_NL("In reportChaseOne"); - assert(!params.arrowMode()); uint32_t off; uint32_t jumps = 0; ASSERT_ONLY(uint32_t origi = i); @@ -2423,7 +2472,9 @@ inline bool Ebwt::reportChaseOne(const String& query, assert_eq(rcoff, off); } #endif - return report(query, quals, name, mmui32, refcs, numMms, off, top, bot, qlen, stratum, cost, params); + return report(query, quals, name, color, snpPhred, ref, mmui32, + refcs, numMms, off, top, bot, qlen, stratum, cost, + params); } /** @@ -2452,7 +2503,6 @@ inline bool Ebwt::reportReconstruct(const String& query, { VMSG_NL("In reportReconstruct"); assert_gt(_eh._isaLen, 0); // Must have inverse suffix array to reconstruct - assert(params.arrowMode()); uint32_t off; uint32_t jumps = 0; SideLocus myl; @@ -2628,7 +2678,8 @@ void Ebwt::restore(TStr& s) const { * the given array of reference sequences. For sanity checking. */ template -void Ebwt::checkOrigs(const vector >& os, bool mirror) const +void Ebwt::checkOrigs(const vector >& os, + bool color, bool mirror) const { TStr rest; restore(rest); @@ -2640,19 +2691,32 @@ void Ebwt::checkOrigs(const vector >& os, bool mirror) const } while(i < os.size()) { size_t olen = length(os[i]); + int lastorig = -1; for(; j < olen; j++) { size_t joff = j; if(mirror) joff = olen - j - 1; if((int)os[i][joff] == 4) { // Skip over Ns + lastorig = -1; if(!mirror) { - while(j < length(os[i]) && (int)os[i][j] == 4) j++; + while(j < olen && (int)os[i][j] == 4) j++; } else { - while(j < length(os[i]) && (int)os[i][olen-j-1] == 4) j++; + while(j < olen && (int)os[i][olen-j-1] == 4) j++; } - break; + j--; + continue; } - assert_eq(os[i][joff], rest[restOff]); + if(lastorig == -1 && color) { + lastorig = os[i][joff]; + continue; + } + if(color) { + assert_neq(-1, lastorig); + assert_eq(dinuc2color[(int)os[i][joff]][lastorig], rest[restOff]); + } else { + assert_eq(os[i][joff], rest[restOff]); + } + lastorig = (int)os[i][joff]; restOff++; } if(j == length(os[i])) { @@ -2675,7 +2739,8 @@ void Ebwt::checkOrigs(const vector >& os, bool mirror) const * Read an Ebwt from file with given filename. */ template -void Ebwt::readIntoMemory(bool justHeader, +void Ebwt::readIntoMemory(int color, + bool justHeader, EbwtParams *params, bool mmSweep, bool loadNames, @@ -2818,19 +2883,39 @@ void Ebwt::readIntoMemory(bool justHeader, int32_t isaRate = _overrideIsaRate; int32_t ftabChars = readI32(_in1, switchEndian); bytesRead += 4; - // BTL: chunkRate is now deprecated - /*int32_t chunkRate =*/ readI32(_in1, switchEndian); + // chunkRate was deprecated in an earlier version of Bowtie; now + // we use it to hold flags. + int32_t flags = readI32(_in1, switchEndian); + if(flags < 0 && (((-flags) & EBWT_COLOR) != 0)) { + if(color != -1 && !color) { + cerr << "Error: -C was not specified when running bowtie, but index is in colorspace. If" << endl + << "your reads are in colorspace, please use the -C option. If your reads are not" << endl + << "in colorspace, please use a normal index (one built without specifying -C to" << endl + << "bowtie-build)." << endl; + throw 1; + } + color = 1; + } else if(flags < 0) { + if(color != -1 && color) { + cerr << "Error: -C was specified when running bowtie, but index is not in colorspace. If" << endl + << "your reads are in colorspace, please use a colorspace index (one built using" << endl + << "bowtie-build -C). If your reads are not in colorspace, don't specify -C when" << endl + << "running bowtie." << endl; + throw 1; + } + color = 0; + } bytesRead += 4; // Create a new EbwtParams from the entries read from primary stream EbwtParams *eh; bool deleteEh = false; if(params != NULL) { - params->init(len, lineRate, linesPerSide, offRate, isaRate, ftabChars); + params->init(len, lineRate, linesPerSide, offRate, isaRate, ftabChars, color); if(_verbose || startVerbose) params->print(cerr); eh = params; } else { - eh = new EbwtParams(len, lineRate, linesPerSide, offRate, isaRate, ftabChars); + eh = new EbwtParams(len, lineRate, linesPerSide, offRate, isaRate, ftabChars, color); deleteEh = true; } @@ -3318,10 +3403,14 @@ readEbwtRefnames(istream& in, vector& refnames) { int32_t offRate = readI32(in, switchEndian); int32_t ftabChars = readI32(in, switchEndian); // BTL: chunkRate is now deprecated - /*int32_t chunkRate =*/ readI32(in, switchEndian); + int32_t flags = readI32(in, switchEndian); + bool color = false; + if(flags < 0) { + color = (((-flags) & EBWT_COLOR) != 0); + } // Create a new EbwtParams from the entries read from primary stream - EbwtParams eh(len, lineRate, linesPerSide, offRate, -1, ftabChars); + EbwtParams eh(len, lineRate, linesPerSide, offRate, -1, ftabChars, color); uint32_t nPat = readI32(in, switchEndian); // nPat in.seekg(nPat*4, ios_base::cur); // skip plen @@ -3388,6 +3477,41 @@ readEbwtRefnames(const string& instr, vector& refnames) { } /** + * Read just enough of the Ebwt's header to determine whether it's + * colorspace. + */ +static inline bool +readEbwtColor(const string& instr) { + ifstream in; + // Initialize our primary and secondary input-stream fields + in.open((instr + ".1.ebwt").c_str(), ios_base::in | ios::binary); + if(!in.is_open()) { + throw EbwtFileOpenException("Cannot open file " + instr); + } + assert(in.is_open()); + assert(in.good()); + bool switchEndian = false; + uint32_t one = readU32(in, switchEndian); // 1st word of primary stream + if(one != 1) { + assert_eq((1u<<24), one); + assert_eq(1, endianSwapU32(one)); + switchEndian = true; + } + readU32(in, switchEndian); + readI32(in, switchEndian); + readI32(in, switchEndian); + readI32(in, switchEndian); + readI32(in, switchEndian); + int32_t flags = readI32(in, switchEndian); + if(flags < 0 && (((-flags) & EBWT_COLOR) != 0)) { + return true; + } else { + return false; + } + +} + +/** * Write an extended Burrows-Wheeler transform to a pair of output * streams. * @@ -3416,7 +3540,9 @@ void Ebwt::writeFromMemory(bool justHeader, writeI32(out1, eh._linesPerSide, be); // not used writeI32(out1, eh._offRate, be); // every 2^offRate chars is "marked" writeI32(out1, eh._ftabChars, be); // number of 2-bit chars used to address ftab - writeI32(out1, 0xffffffff, be); // BTL: chunkRate is now deprecated + int32_t flags = 1; + if(eh._color) flags |= EBWT_COLOR; + writeI32(out1, -flags, be); // BTL: chunkRate is now deprecated if(!justHeader) { assert(isInMemory()); @@ -3489,7 +3615,7 @@ void Ebwt::writeFromMemory(bool justHeader, cout << "Re-reading \"" << out1 << "\"/\"" << out2 << "\" for sanity check" << endl; Ebwt copy(out1, out2, _verbose, _sanity); assert(!isInMemory()); - copy.loadIntoMemory(false, false); + copy.loadIntoMemory(eh._color ? 1 : 0, false, false); assert(isInMemory()); assert_eq(eh._lineRate, copy.eh()._lineRate); assert_eq(eh._linesPerSide, copy.eh()._linesPerSide); diff --git a/ebwt_build.cpp b/ebwt_build.cpp index f8eab0a..d3e5152 100644 --- a/ebwt_build.cpp +++ b/ebwt_build.cpp @@ -47,6 +47,7 @@ static bool autoMem; static bool packed; static bool writeRef; static bool justRef; +static bool color; static void resetOptions() { verbose = true; // be talkative (default) @@ -74,6 +75,7 @@ static void resetOptions() { packed = false; // writeRef = true; // write compact reference to .3.ebwt/.4.ebwt justRef = false; // *just* write compact reference, don't index + color = false; } // Argument constants for getopts @@ -102,6 +104,7 @@ static void printUsage(ostream& out) { << " -c reference sequences given on cmd line (as )" << endl << " -a/--noauto disable automatic -p/--bmax/--dcv memory-fitting" << endl << " -p/--packed use packed strings internally; slower, uses less mem" << endl + << " -C build colorspace index" << endl << " --bmax max bucket sz for blockwise suffix-array builder" << endl //<< " --bmaxmultsqrt max bucket sz as multiple of sqrt(ref len)" << endl << " --bmaxdivn max bucket sz as divisor of ref len (default: 4)" << endl @@ -343,7 +346,7 @@ static void printLongUsage(ostream& out) { ; } -static const char *short_options = "qraph?nscfl:i:o:t:h:3"; +static const char *short_options = "qraph?nscfl:i:o:t:h:3C"; static struct option long_options[] = { {(char*)"quiet", no_argument, 0, 'q'}, @@ -371,6 +374,7 @@ static struct option long_options[] = { {(char*)"ntoa", no_argument, 0, ARG_NTOA}, {(char*)"justref", no_argument, 0, '3'}, {(char*)"noref", no_argument, 0, 'r'}, + {(char*)"color", no_argument, 0, 'C'}, {(char*)"usage", no_argument, 0, ARG_USAGE}, {(char*)0, 0, 0, 0} // terminator }; @@ -412,6 +416,7 @@ static void parseOptions(int argc, const char **argv) { case 'f': format = FASTA; break; case 'c': format = CMDLINE; break; case 'p': packed = true; break; + case 'C': color = true; break; case 'l': lineRate = parseNumber(3, "-l/--lineRate arg must be at least 3"); break; @@ -500,7 +505,8 @@ static void driver(const string& infile, bool reverse = false) { vector is; - RefReadInParams refparams(cutoff, -1, reverse, nsToAs); + bool bisulfite = false; + RefReadInParams refparams(cutoff, -1, color, reverse, nsToAs, bisulfite); assert_gt(infiles.size(), 0); if(format == CMDLINE) { // Adapt sequence strings to stringstreams open for input @@ -536,18 +542,17 @@ static void driver(const string& infile, // sequences. A record represents a stretch of unambiguous // characters in one of the input sequences. vector szs; - uint32_t sztot; + std::pair sztot; { if(verbose) cout << "Reading reference sizes" << endl; Timer _t(cout, " Time reading reference sizes: ", verbose); if(!reverse && (writeRef || justRef)) { + // For forward reference, dump it to .3.ebwt and .4.ebwt + // files string file3 = outfile + ".3.ebwt"; string file4 = outfile + ".4.ebwt"; - BitpairOutFileBuf bpout(file4.c_str()); - // Read in the sizes of all the unambiguous stretches of the - // genome into a vector of RefRecords - sztot = fastaRefReadSizes(is, szs, refparams, &bpout); - // Write the records back out to a '.3.ebwt' file. + // Open output stream for the '.3.ebwt' file which will + // hold the size records. ofstream fout3(file3.c_str(), ios::binary); if(!fout3.good()) { cerr << "Could not open index file for writing: \"" << file3 << "\"" << endl @@ -555,31 +560,64 @@ static void driver(const string& infile, << "Bowtie." << endl; throw 1; } + BitpairOutFileBuf bpout(file4.c_str()); + // Read in the sizes of all the unambiguous stretches of + // the genome into a vector of RefRecords. The input + // streams are reset once it's done. bool be = currentlyBigEndian(); writeU32(fout3, 1, be); // endianness sentinel - writeU32(fout3, szs.size(), be); // write # records - // Write the records themselves - for(size_t i = 0; i < szs.size(); i++) { - szs[i].write(fout3, be); + if(color) { + refparams.color = false; + // Make sure the .3.ebwt and .4.ebwt files contain + // nucleotides; not colors + std::pair sztot2 = + fastaRefReadSizes(is, szs, refparams, &bpout); + refparams.color = true; + writeU32(fout3, szs.size(), be); // write # records + for(size_t i = 0; i < szs.size(); i++) szs[i].write(fout3, be); + szs.clear(); + // Now read in the colorspace size records; these are + // the ones that were indexed + sztot = fastaRefReadSizes(is, szs, refparams, NULL); + assert_eq(sztot2.second, sztot.second + 1); + } else { + sztot = fastaRefReadSizes(is, szs, refparams, &bpout); + writeU32(fout3, szs.size(), be); // write # records + for(size_t i = 0; i < szs.size(); i++) szs[i].write(fout3, be); } + assert_gt(sztot.first, 0); + assert_gt(sztot.second, 0); bpout.close(); fout3.close(); #ifndef NDEBUG if(sanityCheck) { - BitPairReference bpr(outfile, true, &infiles, NULL, format == CMDLINE); + BitPairReference bpr(outfile, color, true, &infiles, NULL, format == CMDLINE); } #endif } else { // Read in the sizes of all the unambiguous stretches of the // genome into a vector of RefRecords sztot = fastaRefReadSizes(is, szs, refparams); +#ifndef NDEBUG + if(refparams.color) { + refparams.color = false; + vector szs2; + std::pair sztot2 = + fastaRefReadSizes(is, szs2, refparams, NULL); + // One less color than base + assert_eq(sztot2.second, sztot.second + 1); + refparams.color = true; + } +#endif } } if(justRef) return; - assert_gt(sztot, 0); + assert_gt(sztot.first, 0); + assert_gt(sztot.second, 0); assert_gt(szs.size(), 0); // Construct Ebwt from input strings and parameters - Ebwt ebwt(lineRate, + Ebwt ebwt(refparams.color ? 1 : 0, + lineRate, linesPerSide, offRate, // suffix-array sampling rate isaRate, // ISA sampling rate @@ -593,7 +631,7 @@ static void driver(const string& infile, noDc? 0 : dcv,// difference-cover period is, // list of input streams szs, // list of reference sizes - sztot, // total size of all references + sztot.first, // total size of all unambiguous ref chars refparams, // reference read-in parameters seed, // pseudo-random number generator seed -1, // override offRate @@ -611,11 +649,11 @@ static void driver(const string& infile, // Try restoring the original string (if there were // multiple texts, what we'll get back is the joined, // padded string, not a list) - ebwt.loadIntoMemory(false, false); + ebwt.loadIntoMemory(refparams.color ? 1 : 0, false, false); TStr s2; ebwt.restore(s2); ebwt.evictFromMemory(); { - TStr joinedss = Ebwt::join(is, szs, sztot, refparams, seed); + TStr joinedss = Ebwt::join(is, szs, sztot.first, refparams, seed); assert_eq(length(joinedss), length(s2)); assert_eq(joinedss, s2); } diff --git a/ebwt_search.cpp b/ebwt_search.cpp index 25214bf..41782a6 100644 --- a/ebwt_search.cpp +++ b/ebwt_search.cpp @@ -47,7 +47,6 @@ static bool quiet; // print nothing but the alignments static int sanityCheck; // enable expensive sanity checks static int format; // default read format is FASTQ static string origString; // reference text, or filename(s) -static int revcomp; // search for reverse complements? static int seed; // srandom() seed static int timing; // whether to report basic timing data static bool allHits; // for multihits, report just one @@ -95,7 +94,6 @@ static bool strata; // true -> don't stop at stratum boundaries static bool refOut; // if true, alignments go to per-ref files static int partitionSz; // output a partitioning key in first field static bool noMaqRound; // true -> don't round quals to nearest 10 like maq -static bool forgiveInput; // let read input be a little wrong w/o complaining or dying static bool useSpinlock; // false -> don't use of spinlocks even if they're #defines static bool fileParallel; // separate threads read separate input files in parallel static bool useShmem; // use shared memory to hold the index @@ -138,6 +136,9 @@ static bool fuzzy; static bool fullRef; static bool samNoHead; // don't print any header lines in SAM output static bool samNoSQ; // don't print @SQ header lines +static bool color; +static string rgs; // SAM outputs for @RG header line +static int snpPhred; // probability of SNP, for scoring colorspace alignments static void resetOptions() { mates1.clear(); @@ -150,7 +151,6 @@ static void resetOptions() { sanityCheck = 0; // enable expensive sanity checks format = FASTQ; // default read format is FASTQ origString = ""; // reference text, or filename(s) - revcomp = 1; // search for reverse complements? seed = 0; // srandom() seed timing = 0; // whether to report basic timing data allHits = false; // for multihits, report just one @@ -198,7 +198,6 @@ static void resetOptions() { refOut = false; // if true, alignments go to per-ref files partitionSz = 0; // output a partitioning key in first field noMaqRound = false; // true -> don't round quals to nearest 10 like maq - forgiveInput = false; // let read input be a little wrong w/o complaining or dying useSpinlock = true; // false -> don't use of spinlocks even if they're #defines fileParallel = false; // separate threads read separate input files in parallel useShmem = false; // use shared memory to hold the index @@ -241,11 +240,14 @@ static void resetOptions() { fullRef = false; // print entire reference name instead of just up to 1st space samNoHead = false; // don't print any header lines in SAM output samNoSQ = false; // don't print @SQ header lines + color = false; // don't align in colorspace by default + rgs = ""; // SAM outputs for @RG header line + snpPhred = 30; // probability of SNP, for scoring colorspace alignments } // mating constraints -static const char *short_options = "fF:qbzhcu:rv:s:at3:5:o:e:n:l:w:p:k:m:1:2:I:X:x:B:yS"; +static const char *short_options = "fF:qbzhcu:rv:s:at3:5:o:e:n:l:w:p:k:m:1:2:I:X:x:B:ySC"; enum { ARG_ORIG = 256, @@ -277,7 +279,6 @@ enum { ARG_SEED_EXTEND, ARG_PARTITION, ARG_integerQuals, - ARG_FORGIVE_INPUT, ARG_NOMAQROUND, ARG_USE_SPINLOCK, ARG_FILEPAR, @@ -317,8 +318,10 @@ enum { ARG_FUZZY, ARG_FULLREF, ARG_USAGE, + ARG_SNPPHRED, ARG_SAM_NOHEAD, - ARG_SAM_NOSQ + ARG_SAM_NOSQ, + ARG_SAM_RG }; static struct option long_options[] = { @@ -350,7 +353,6 @@ static struct option long_options[] = { {(char*)"reportopps", no_argument, &reportOpps, 1}, {(char*)"version", no_argument, &showVersion, 1}, {(char*)"dumppats", required_argument, 0, ARG_DUMP_PATS}, - {(char*)"revcomp", no_argument, 0, 'r'}, {(char*)"maqerr", required_argument, 0, 'e'}, {(char*)"seedlen", required_argument, 0, 'l'}, {(char*)"seedmms", required_argument, 0, 'n'}, @@ -377,7 +379,6 @@ static struct option long_options[] = { {(char*)"refout", no_argument, 0, ARG_REFOUT}, {(char*)"seedextend", no_argument, 0, ARG_SEED_EXTEND}, {(char*)"partition", required_argument, 0, ARG_PARTITION}, - {(char*)"forgive", no_argument, 0, ARG_FORGIVE_INPUT}, {(char*)"nospin", no_argument, 0, ARG_USE_SPINLOCK}, {(char*)"stateful", no_argument, 0, ARG_STATEFUL}, {(char*)"prewidth", required_argument, 0, ARG_PREFETCH_WIDTH}, @@ -422,6 +423,9 @@ static struct option long_options[] = { {(char*)"sam-nohead", no_argument, 0, ARG_SAM_NOHEAD}, {(char*)"sam-nosq", no_argument, 0, ARG_SAM_NOSQ}, {(char*)"sam-noSQ", no_argument, 0, ARG_SAM_NOSQ}, + {(char*)"color", no_argument, 0, 'C'}, + {(char*)"sam-RG", required_argument, 0, ARG_SAM_RG}, + {(char*)"snpphred", required_argument, 0, ARG_SNPPHRED}, {(char*)0, 0, 0, 0} // terminator }; @@ -446,6 +450,7 @@ static void printUsage(ostream& out) { << " -f query input files are (multi-)FASTA .fa/.mfa" << endl << " -r query input files are raw one-sequence-per-line" << endl << " -c query sequences given on cmd line (as , )" << endl + << " -C reads and index are in colorspace (incompatible with -q)" << endl << " -s/--skip skip the first reads/pairs in the input" << endl << " -u/--qupto stop after first reads/pairs (excl. skipped reads)" << endl << " -5/--trim5 trim bases from 5' (left) end of reads" << endl @@ -476,12 +481,13 @@ static void printUsage(ostream& out) { //<< " --better alignments guaranteed best possible stratum (old --best)" << endl << " --best hits guaranteed best stratum; ties broken by quality" << endl << " --strata hits in sub-optimal strata aren't reported (requires --best)" << endl - << " --strandfix attempt to fix strand biases" << endl + << " --strandfix attempt to fix strand biases (def: on w/ --best, off w/o)" << endl << "Output:" << endl << " -S/--sam write hits in SAM format" << endl - << " --concise write hits in concise format" << endl + //<< " --concise write hits in concise format" << endl << " -t/--time print wall-clock time taken by search phases" << endl << " -B/--offbase leftmost ref offset = in bowtie output (default: 0)" << endl + << " --snphred penalty for a SNP when decoding colorspace (default: 30)" << endl << " --quiet print nothing but the alignments" << endl << " --refout write alignments to files refXXXXX.map, 1 map per reference" << endl << " --refidx refer to ref. seqs by 0-based index rather than name" << endl @@ -1604,6 +1610,7 @@ static void parseOptions(int argc, const char **argv) { case 'q': format = FASTQ; break; case 'r': format = RAW; break; case 'c': format = CMDLINE; break; + case 'C': color = true; break; case ARG_CHAININ: format = INPUT_CHAIN; break; case 'I': minInsert = (uint32_t)parseInt(0, "-I arg must be positive"); @@ -1654,8 +1661,8 @@ static void parseOptions(int argc, const char **argv) { case ARG_integerQuals: integerQuals = true; break; case ARG_PHRED64: phred64Quals = true; break; case ARG_PHRED33: solexaQuals = false; phred64Quals = false; break; - case ARG_FORGIVE_INPUT: forgiveInput = true; break; case ARG_NOMAQROUND: noMaqRound = true; break; + case ARG_SNPPHRED: snpPhred = parseInt(0, "--snpphred must be at least 0"); break; case 'z': fullIndex = false; break; case ARG_REFIDX: noRefNames = true; break; case ARG_STATEFUL: stateful = true; break; @@ -1748,6 +1755,11 @@ static void parseOptions(int argc, const char **argv) { case ARG_PEV2: useV1 = false; break; case ARG_SAM_NOHEAD: samNoHead = true; break; case ARG_SAM_NOSQ: samNoSQ = true; break; + case ARG_SAM_RG: { + if(!rgs.empty()) rgs += '\t'; + rgs += optarg; + break; + } case ARG_MAXBTS: { maxBts = parseInt(0, "--maxbts must be positive"); maxBtsBetter = maxBts; @@ -1782,9 +1794,7 @@ static void parseOptions(int argc, const char **argv) { // ranges). offRate = 32; } - if(maqLike) { - revcomp = true; - } else if(mismatches == 3 && fullIndex) { + if(!maqLike && mismatches == 3 && fullIndex) { // Much faster than normal 3-mismatch mode stateful = true; } @@ -2143,6 +2153,7 @@ static void *exactSearchWorker(void *vp) { HitSink& _sink = *exactSearch_sink; Ebwt >& ebwt = *exactSearch_ebwt; vector >& os = *exactSearch_os; + const BitPairReference* refs = exactSearch_refs; // Global initialization bool sanity = sanityCheck && !os.empty(); @@ -2154,14 +2165,14 @@ static void *exactSearchWorker(void *vp) { EbwtSearchParams > params( *sink, // HitSink os, // reference sequences - revcomp, // forward AND reverse complement? true, // read is forward - true, // index is forward - rangeMode); // range mode + true); // index is forward GreedyDFSRangeSource bt( &ebwt, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) 0xffffffff, // qualThresh - 0xffffffff, // max backtracks (no max) + 0xffffffff, // max backtracks (no max) 0, // reportPartials (don't) true, // reportExacts rangeMode, // reportRanges @@ -2200,6 +2211,7 @@ static void *exactSearchWorkerStateful(void *vp) { UnpairedExactAlignerV1Factory alSEfact( ebwt, NULL, + snpPhred, !nofw, !norc, _sink, @@ -2208,6 +2220,7 @@ static void *exactSearchWorkerStateful(void *vp) { NULL, //&cacheBw, cacheLimit, pool, + refs, os, !noMaqRound, !better, @@ -2219,6 +2232,8 @@ static void *exactSearchWorkerStateful(void *vp) { PairedExactAlignerV1Factory alPEfact( ebwt, NULL, + snpPhred, + color, !nofw, !norc, useV1, @@ -2290,8 +2305,7 @@ static void *exactSearchWorkerStateful(void *vp) { static void exactSearch(PairedPatternSource& _patsrc, HitSink& _sink, Ebwt >& ebwt, - vector >& os, - bool paired = false) + vector >& os) { exactSearch_patsrc = &_patsrc; exactSearch_sink = &_sink; @@ -2303,13 +2317,14 @@ static void exactSearch(PairedPatternSource& _patsrc, // Load the rest of (vast majority of) the backward Ebwt into // memory Timer _t(cerr, "Time loading forward index: ", timing); - ebwt.loadIntoMemory(!noRefNames, startVerbose); + ebwt.loadIntoMemory(color ? 1 : 0, !noRefNames, startVerbose); } BitPairReference *refs = NULL; - if((mates1.size() > 0 || mates12.size() > 0) && mixedThresh < 0xffffffff) { + bool pair = mates1.size() > 0 || mates12.size() > 0; + if(color || (pair && mixedThresh < 0xffffffff)) { Timer _t(cerr, "Time loading reference: ", timing); - refs = new BitPairReference(adjustedEbwtFileBase, sanityCheck, NULL, &os, false, useMm, useShmem, mmSweep, verbose, startVerbose); + refs = new BitPairReference(adjustedEbwtFileBase, color, sanityCheck, NULL, &os, false, useMm, useShmem, mmSweep, verbose, startVerbose); if(!refs->loaded()) throw 1; } exactSearch_refs = refs; @@ -2371,20 +2386,22 @@ static void* mismatchSearchWorkerPhase1(void *vp){ vector >& os = *mismatchSearch_os; SyncBitset& doneMask = *mismatchSearch_doneMask; SyncBitset& hitMask = *mismatchSearch_hitMask; - bool sanity = sanityCheck && !os.empty() && !rangeMode; - PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid); + const BitPairReference* refs = mismatchSearch_refs; + + bool sanity = sanityCheck && !os.empty() && !rangeMode; + PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid); PatternSourcePerThread* patsrc = patsrcFact->create(); HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink, sanity); HitSinkPerThread* sink = sinkFact->create(); EbwtSearchParams > params( - *sink, // HitSinkPerThread + *sink, // HitSinkPerThread os, // reference sequences - revcomp, // forward AND reverse complement? false, // read is forward - true, // index is forward - rangeMode); // range mode + true); // index is forward GreedyDFSRangeSource bt( - &ebwtFw, params, + &ebwtFw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) 0xffffffff, // qualThresh 0xffffffff, // max backtracks (no max) 0, // reportPartials (don't) @@ -2418,23 +2435,25 @@ static void* mismatchSearchWorkerPhase2(void *vp){ vector >& os = *mismatchSearch_os; SyncBitset& doneMask = *mismatchSearch_doneMask; SyncBitset& hitMask = *mismatchSearch_hitMask; - // Per-thread initialization - bool sanity = sanityCheck && !os.empty() && !rangeMode; - PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid); + const BitPairReference* refs = mismatchSearch_refs; + + // Per-thread initialization + bool sanity = sanityCheck && !os.empty() && !rangeMode; + PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid); PatternSourcePerThread* patsrc = patsrcFact->create(); HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink, sanity); HitSinkPerThread* sink = sinkFact->create(); EbwtSearchParams > params( - *sink, // HitSinkPerThread + *sink, // HitSinkPerThread os, // reference sequences - revcomp, // forward AND reverse complement? true, // read is forward - false, // index is mirror index - rangeMode); // range mode + false); // index is mirror index GreedyDFSRangeSource bt( - &ebwtBw, params, + &ebwtBw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) 0xffffffff, // qualThresh - 0xffffffff, // max backtracks (no max) + 0xffffffff, // max backtracks (no max) 0, // reportPartials (don't) true, // reportExacts rangeMode, // reportRanges @@ -2496,7 +2515,7 @@ static void mismatchSearch(PairedPatternSource& _patsrc, // Load the rest of (vast majority of) the backward Ebwt into // memory Timer _t(cerr, "Time loading forward index: ", timing); - ebwtFw.loadIntoMemory(!noRefNames, startVerbose); + ebwtFw.loadIntoMemory(color ? 1 : 0, !noRefNames, startVerbose); } #ifdef BOWTIE_PTHREADS @@ -2528,12 +2547,12 @@ static void mismatchSearch(PairedPatternSource& _patsrc, // Load the rest of (vast majority of) the backward Ebwt into // memory Timer _t(cerr, "Time loading mirror index: ", timing); - ebwtBw.loadIntoMemory(!noRefNames, startVerbose); + ebwtBw.loadIntoMemory(color ? 1 : 0, !noRefNames, startVerbose); } - _patsrc.reset(); // reset pattern source to 1st pattern + _patsrc.reset(); // reset pattern source to 1st pattern // Sanity-check the restored version of the Ebwt if(sanityCheck && !os.empty()) { - ebwtBw.checkOrigs(os, true); + ebwtBw.checkOrigs(os, color, true); } // Phase 2 @@ -2568,13 +2587,14 @@ static void *mismatchSearchWorkerFullStateful(void *vp) { // Global initialization bool sanity = sanityCheck && !os.empty(); - PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid); + PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid); HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink, sanity); ChunkPool *pool = new ChunkPool(chunkSz * 1024, chunkPoolMegabytes * 1024 * 1024, chunkVerbose); Unpaired1mmAlignerV1Factory alSEfact( ebwtFw, &ebwtBw, + snpPhred, !nofw, !norc, _sink, @@ -2583,6 +2603,7 @@ static void *mismatchSearchWorkerFullStateful(void *vp) { NULL, //&cacheBw, cacheLimit, pool, + refs, os, !noMaqRound, !better, @@ -2594,6 +2615,8 @@ static void *mismatchSearchWorkerFullStateful(void *vp) { Paired1mmAlignerV1Factory alPEfact( ebwtFw, &ebwtBw, + color, + snpPhred, !nofw, !norc, useV1, @@ -2643,26 +2666,28 @@ static void *mismatchSearchWorkerFullStateful(void *vp) { static void* mismatchSearchWorkerFull(void *vp){ int tid = *((int*)vp); - PairedPatternSource& _patsrc = *mismatchSearch_patsrc; - HitSink& _sink = *mismatchSearch_sink; - Ebwt >& ebwtFw = *mismatchSearch_ebwtFw; - Ebwt >& ebwtBw = *mismatchSearch_ebwtBw; - vector >& os = *mismatchSearch_os; - // Per-thread initialization - bool sanity = sanityCheck && !os.empty() && !rangeMode; - PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid); + PairedPatternSource& _patsrc = *mismatchSearch_patsrc; + HitSink& _sink = *mismatchSearch_sink; + Ebwt >& ebwtFw = *mismatchSearch_ebwtFw; + Ebwt >& ebwtBw = *mismatchSearch_ebwtBw; + vector >& os = *mismatchSearch_os; + const BitPairReference* refs = mismatchSearch_refs; + + // Per-thread initialization + bool sanity = sanityCheck && !os.empty() && !rangeMode; + PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid); PatternSourcePerThread* patsrc = patsrcFact->create(); HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink, sanity); HitSinkPerThread* sink = sinkFact->create(); EbwtSearchParams > params( - *sink, // HitSinkPerThread + *sink, // HitSinkPerThread os, // reference sequences - revcomp, // forward AND reverse complement? true, // read is forward - false, // index is mirror index - rangeMode); // range mode + false); // index is mirror index GreedyDFSRangeSource bt( - &ebwtFw, params, + &ebwtFw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) 0xffffffff, // qualThresh 0xffffffff, // max backtracks (no max) 0, // reportPartials (don't) @@ -2713,18 +2738,19 @@ static void mismatchSearchFull(PairedPatternSource& _patsrc, { // Load the other half of the index into memory Timer _t(cerr, "Time loading forward index: ", timing); - ebwtFw.loadIntoMemory(!noRefNames, startVerbose); + ebwtFw.loadIntoMemory(color ? 1 : 0, !noRefNames, startVerbose); } { // Load the other half of the index into memory Timer _t(cerr, "Time loading mirror index: ", timing); - ebwtBw.loadIntoMemory(!noRefNames, startVerbose); + ebwtBw.loadIntoMemory(color ? 1 : 0, !noRefNames, startVerbose); } // Create range caches, which are shared among all aligners BitPairReference *refs = NULL; - if((mates1.size() > 0 || mates12.size() > 0) && mixedThresh < 0xffffffff) { + bool pair = mates1.size() > 0 || mates12.size() > 0; + if(color || (pair && mixedThresh < 0xffffffff)) { Timer _t(cerr, "Time loading reference: ", timing); - refs = new BitPairReference(adjustedEbwtFileBase, sanityCheck, NULL, &os, false, useMm, useShmem, mmSweep, verbose, startVerbose); + refs = new BitPairReference(adjustedEbwtFileBase, color, sanityCheck, NULL, &os, false, useMm, useShmem, mmSweep, verbose, startVerbose); if(!refs->loaded()) throw 1; } mismatchSearch_refs = refs; @@ -2764,7 +2790,7 @@ static void mismatchSearchFull(PairedPatternSource& _patsrc, /* Load the forward index into memory if necessary */ \ if(!ebwtFw.isInMemory()) { \ Timer _t(cerr, "Time loading forward index: ", timing); \ - ebwtFw.loadIntoMemory(!noRefNames, startVerbose); \ + ebwtFw.loadIntoMemory(color ? 1 : 0, !noRefNames, startVerbose); \ } \ assert(ebwtFw.isInMemory()); \ _patsrc.reset(); /* rewind pattern source to first pattern */ \ @@ -2777,7 +2803,7 @@ static void mismatchSearchFull(PairedPatternSource& _patsrc, /* Load the forward index into memory if necessary */ \ if(!ebwtBw.isInMemory()) { \ Timer _t(cerr, "Time loading mirror index: ", timing); \ - ebwtBw.loadIntoMemory(!noRefNames, startVerbose); \ + ebwtBw.loadIntoMemory(color ? 1 : 0, !noRefNames, startVerbose); \ } \ assert(ebwtBw.isInMemory()); \ _patsrc.reset(); /* rewind pattern source to first pattern */ \ @@ -2790,27 +2816,6 @@ static void mismatchSearchFull(PairedPatternSource& _patsrc, uint32_t twoRevOff = (seedMms <= 2) ? s : 0; \ uint32_t oneRevOff = (seedMms <= 1) ? s : 0; \ uint32_t unrevOff = (seedMms == 0) ? s : 0; \ - ::naiveOracle( \ - os, \ - patFw, \ - plen, \ - qual, \ - name, \ - patid, \ - hits, \ - qualCutoff, \ - unrevOff, \ - oneRevOff, \ - twoRevOff, \ - threeRevOff, \ - true, /* fw */ \ - ebwtfw, /* ebwtFw */ \ - 0, /* iham */ \ - NULL, /* muts */ \ - !noMaqRound, /* maqRound */ \ - false, /* halfAndHalf */ \ - true, /* reportExacts */ \ - ebwtfw); /* invert */ \ if(hits.size() > 0) { \ /* Print offending hit obtained by oracle */ \ ::printHit( \ @@ -2818,11 +2823,11 @@ static void mismatchSearchFull(PairedPatternSource& _patsrc, hits[0], \ patFw, \ plen, \ - unrevOff, \ - oneRevOff, \ - twoRevOff, \ - threeRevOff, \ - ebwtfw); /* ebwtFw */ \ + unrevOff, \ + oneRevOff, \ + twoRevOff, \ + threeRevOff, \ + ebwtfw); /* ebwtFw */ \ } \ assert_eq(0, hits.size()); \ } @@ -2834,27 +2839,6 @@ static void mismatchSearchFull(PairedPatternSource& _patsrc, uint32_t twoRevOff = (seedMms <= 2) ? s : 0; \ uint32_t oneRevOff = (seedMms <= 1) ? s : 0; \ uint32_t unrevOff = (seedMms == 0) ? s : 0; \ - ::naiveOracle( \ - os, \ - patRc, \ - plen, \ - qualRev, \ - name, \ - patid, \ - hits, \ - qualCutoff, \ - unrevOff, \ - oneRevOff, \ - twoRevOff, \ - threeRevOff, \ - false, /* fw */ \ - ebwtfw, /* ebwtFw */ \ - 0, /* iham */ \ - NULL, /* muts */ \ - !noMaqRound, /* maqRound */ \ - false, /* halfAndHalf */ \ - true, /* reportExacts */ \ - !ebwtfw); /* invert */ \ if(hits.size() > 0) { \ /* Print offending hit obtained by oracle */ \ ::printHit( \ @@ -2862,11 +2846,11 @@ static void mismatchSearchFull(PairedPatternSource& _patsrc, hits[0], \ patRc, \ plen, \ - unrevOff, \ - oneRevOff, \ - twoRevOff, \ - threeRevOff, \ - ebwtfw); /* ebwtFw */ \ + unrevOff, \ + oneRevOff, \ + twoRevOff, \ + threeRevOff, \ + ebwtfw); /* ebwtFw */ \ } \ assert_eq(0, hits.size()); \ } @@ -2895,18 +2879,20 @@ static BitPairReference* twoOrThreeMismatchSearch_refs; EbwtSearchParams > params( \ *sink, /* HitSink */ \ os, /* reference sequences */ \ - revcomp, /* forward AND reverse complement? */ \ true, /* read is forward */ \ - true, /* index is forward */ \ - rangeMode); /* range mode (irrelevant here) */ + true); /* index is forward */ static void* twoOrThreeMismatchSearchWorkerPhase1(void *vp) { TWOTHREE_WORKER_SETUP(); SyncBitset& doneMask = *twoOrThreeMismatchSearch_doneMask; SyncBitset& hitMask = *twoOrThreeMismatchSearch_hitMask; Ebwt >& ebwtFw = *twoOrThreeMismatchSearch_ebwtFw; + const BitPairReference* refs = twoOrThreeMismatchSearch_refs; + GreedyDFSRangeSource btr1( - &ebwtFw, params, + &ebwtFw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) 0xffffffff, // qualThresh // Do not impose maximums in 2/3-mismatch mode 0xffffffff, // max backtracks (no limit) @@ -2941,8 +2927,12 @@ static void* twoOrThreeMismatchSearchWorkerPhase2(void *vp) { SyncBitset& doneMask = *twoOrThreeMismatchSearch_doneMask; SyncBitset& hitMask = *twoOrThreeMismatchSearch_hitMask; Ebwt >& ebwtBw = *twoOrThreeMismatchSearch_ebwtBw; + const BitPairReference* refs = twoOrThreeMismatchSearch_refs; + GreedyDFSRangeSource bt2( &ebwtBw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) 0xffffffff, // qualThresh // Do not impose maximums in 2/3-mismatch mode 0xffffffff, // max backtracks (no limit) @@ -2976,9 +2966,13 @@ static void* twoOrThreeMismatchSearchWorkerPhase3(void *vp) { SyncBitset& doneMask = *twoOrThreeMismatchSearch_doneMask; SyncBitset& hitMask = *twoOrThreeMismatchSearch_hitMask; Ebwt >& ebwtFw = *twoOrThreeMismatchSearch_ebwtFw; + const BitPairReference* refs = twoOrThreeMismatchSearch_refs; + // GreedyDFSRangeSource to search for seedlings for case 4F GreedyDFSRangeSource bt3( &ebwtFw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) 0xffffffff, // qualThresh (none) // Do not impose maximums in 2/3-mismatch mode 0xffffffff, // max backtracks (no limit) @@ -2992,6 +2986,8 @@ static void* twoOrThreeMismatchSearchWorkerPhase3(void *vp) { false); // considerQuals GreedyDFSRangeSource bthh3( &ebwtFw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) 0xffffffff, // qualThresh // Do not impose maximums in 2/3-mismatch mode 0xffffffff, // max backtracks (no limit) @@ -3031,7 +3027,6 @@ static void twoOrThreeMismatchSearch( bool two = true) /// true -> 2, false -> 3 { // Global initialization - assert(revcomp); assert(!fullIndex); assert(!ebwtFw.isInMemory()); assert(!ebwtBw.isInMemory()); @@ -3134,6 +3129,7 @@ static void *twoOrThreeMismatchSearchWorkerStateful(void *vp) { ebwtFw, &ebwtBw, two, + snpPhred, !nofw, !norc, _sink, @@ -3142,6 +3138,7 @@ static void *twoOrThreeMismatchSearchWorkerStateful(void *vp) { NULL, //&cacheBw, cacheLimit, pool, + refs, os, !noMaqRound, !better, @@ -3153,6 +3150,8 @@ static void *twoOrThreeMismatchSearchWorkerStateful(void *vp) { Paired23mmAlignerV1Factory alPEfact( ebwtFw, &ebwtBw, + color, + snpPhred, !nofw, !norc, useV1, @@ -3205,8 +3204,11 @@ static void* twoOrThreeMismatchSearchWorkerFull(void *vp) { TWOTHREE_WORKER_SETUP(); Ebwt >& ebwtFw = *twoOrThreeMismatchSearch_ebwtFw; Ebwt >& ebwtBw = *twoOrThreeMismatchSearch_ebwtBw; + const BitPairReference* refs = twoOrThreeMismatchSearch_refs; GreedyDFSRangeSource btr1( &ebwtFw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) 0xffffffff, // qualThresh // Do not impose maximums in 2/3-mismatch mode 0xffffffff, // max backtracks (no limit) @@ -3220,6 +3222,8 @@ static void* twoOrThreeMismatchSearchWorkerFull(void *vp) { false); // considerQuals GreedyDFSRangeSource bt2( &ebwtBw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) 0xffffffff, // qualThresh // Do not impose maximums in 2/3-mismatch mode 0xffffffff, // max backtracks (no limit) @@ -3233,6 +3237,8 @@ static void* twoOrThreeMismatchSearchWorkerFull(void *vp) { false); // considerQuals GreedyDFSRangeSource bt3( &ebwtFw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) 0xffffffff, // qualThresh (none) // Do not impose maximums in 2/3-mismatch mode 0xffffffff, // max backtracks (no limit) @@ -3246,6 +3252,8 @@ static void* twoOrThreeMismatchSearchWorkerFull(void *vp) { false); // considerQuals GreedyDFSRangeSource bthh3( &ebwtFw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) 0xffffffff, // qualThresh // Do not impose maximums in 2/3-mismatch mode 0xffffffff, // max backtracks (no limit) @@ -3281,32 +3289,32 @@ static void* twoOrThreeMismatchSearchWorkerFull(void *vp) { template static void twoOrThreeMismatchSearchFull( PairedPatternSource& _patsrc, /// pattern source - HitSink& _sink, /// hit sink - Ebwt& ebwtFw, /// index of original text - Ebwt& ebwtBw, /// index of mirror text - vector >& os, /// text strings, if available (empty otherwise) - bool two = true) /// true -> 2, false -> 3 + HitSink& _sink, /// hit sink + Ebwt& ebwtFw, /// index of original text + Ebwt& ebwtBw, /// index of mirror text + vector >& os, /// text strings, if available (empty otherwise) + bool two = true) /// true -> 2, false -> 3 { // Global initialization - assert(revcomp); assert(fullIndex); assert(!ebwtFw.isInMemory()); assert(!ebwtBw.isInMemory()); { // Load the other half of the index into memory Timer _t(cerr, "Time loading forward index: ", timing); - ebwtFw.loadIntoMemory(!noRefNames, startVerbose); + ebwtFw.loadIntoMemory(color ? 1 : 0, !noRefNames, startVerbose); } { // Load the other half of the index into memory Timer _t(cerr, "Time loading mirror index: ", timing); - ebwtBw.loadIntoMemory(!noRefNames, startVerbose); + ebwtBw.loadIntoMemory(color ? 1 : 0, !noRefNames, startVerbose); } // Create range caches, which are shared among all aligners BitPairReference *refs = NULL; - if((mates1.size() > 0 || mates12.size() > 0) && mixedThresh < 0xffffffff) { + bool pair = mates1.size() > 0 || mates12.size() > 0; + if(color || (pair && mixedThresh < 0xffffffff)) { Timer _t(cerr, "Time loading reference: ", timing); - refs = new BitPairReference(adjustedEbwtFileBase, sanityCheck, NULL, &os, false, useMm, useShmem, mmSweep, verbose, startVerbose); + refs = new BitPairReference(adjustedEbwtFileBase, color, sanityCheck, NULL, &os, false, useMm, useShmem, mmSweep, verbose, startVerbose); if(!refs->loaded()) throw 1; } twoOrThreeMismatchSearch_refs = refs; @@ -3372,22 +3380,23 @@ static BitPairReference* seededQualSearch_refs; EbwtSearchParams > params( \ *sink, /* HitSink */ \ os, /* reference sequences */ \ - revcomp, /* forward AND reverse complement? */ \ true, /* read is forward */ \ - true, /* index is forward */ \ - rangeMode); /* range mode (irrelevant here) */ + true); /* index is forward */ static void* seededQualSearchWorkerPhase1(void *vp) { SEEDEDQUAL_WORKER_SETUP(); SyncBitset& doneMask = *seededQualSearch_doneMask; SyncBitset& hitMask = *seededQualSearch_hitMask; Ebwt >& ebwtFw = *seededQualSearch_ebwtFw; + const BitPairReference* refs = seededQualSearch_refs; uint32_t s = seedLen; uint32_t s5 = (s >> 1) + (s & 1); /* length of 5' half of seed */ // GreedyDFSRangeSource for finding exact hits for the forward- // oriented read GreedyDFSRangeSource btf1( &ebwtFw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) qualCutoff, // qualThresh maxBtsBetter, // max backtracks 0, // reportPartials (don't) @@ -3400,6 +3409,8 @@ static void* seededQualSearchWorkerPhase1(void *vp) { false); // considerQuals GreedyDFSRangeSource bt1( &ebwtFw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) qualCutoff, // qualThresh maxBtsBetter, // max backtracks 0, // reportPartials (don't) @@ -3435,9 +3446,12 @@ static void* seededQualSearchWorkerPhase2(void *vp) { uint32_t s5 = (s >> 1) + (s & 1); /* length of 5' half of seed */ Ebwt >& ebwtBw = *seededQualSearch_ebwtBw; PartialAlignmentManager* pamRc = seededQualSearch_pamRc; + const BitPairReference* refs = seededQualSearch_refs; // GreedyDFSRangeSource to search for hits for cases 1F, 2F, 3F GreedyDFSRangeSource btf2( &ebwtBw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) qualCutoff, // qualThresh maxBtsBetter, // max backtracks 0, // reportPartials (no) @@ -3452,6 +3466,8 @@ static void* seededQualSearchWorkerPhase2(void *vp) { // GreedyDFSRangeSource to search for partial alignments for case 4R GreedyDFSRangeSource btr2( &ebwtBw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) qualCutoff, // qualThresh (none) maxBtsBetter, // max backtracks seedMms, // report partials (up to seedMms mms) @@ -3490,9 +3506,12 @@ static void* seededQualSearchWorkerPhase3(void *vp) { Ebwt >& ebwtFw = *seededQualSearch_ebwtFw; PartialAlignmentManager* pamFw = seededQualSearch_pamFw; PartialAlignmentManager* pamRc = seededQualSearch_pamRc; + const BitPairReference* refs = seededQualSearch_refs; // GreedyDFSRangeSource to search for seedlings for case 4F GreedyDFSRangeSource btf3( &ebwtFw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) qualCutoff, // qualThresh (none) maxBtsBetter, // max backtracks seedMms, // reportPartials (do) @@ -3508,6 +3527,8 @@ static void* seededQualSearchWorkerPhase3(void *vp) { // the partial alignments found in Phase 2 GreedyDFSRangeSource btr3( &ebwtFw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) qualCutoff, // qualThresh maxBtsBetter, // max backtracks 0, // reportPartials (don't) @@ -3522,6 +3543,8 @@ static void* seededQualSearchWorkerPhase3(void *vp) { // The half-and-half GreedyDFSRangeSource GreedyDFSRangeSource btr23( &ebwtFw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) qualCutoff, // qualThresh maxBtsBetter, // max backtracks 0, // reportPartials (don't) @@ -3561,10 +3584,13 @@ static void* seededQualSearchWorkerPhase4(void *vp) { uint32_t s5 = (s >> 1) + (s & 1); /* length of 5' half of seed */ Ebwt >& ebwtBw = *seededQualSearch_ebwtBw; PartialAlignmentManager* pamFw = seededQualSearch_pamFw; + const BitPairReference* refs = seededQualSearch_refs; // GreedyDFSRangeSource to search for hits for case 4F by extending // the partial alignments found in Phase 3 GreedyDFSRangeSource btf4( &ebwtBw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) qualCutoff, // qualThresh maxBtsBetter, // max backtracks 0, // reportPartials (don't) @@ -3579,6 +3605,8 @@ static void* seededQualSearchWorkerPhase4(void *vp) { // Half-and-half GreedyDFSRangeSource for forward read GreedyDFSRangeSource btf24( &ebwtBw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) qualCutoff, // qualThresh maxBtsBetter, // max backtracks 0, // reportPartials (don't) @@ -3620,10 +3648,13 @@ static void* seededQualSearchWorkerFull(void *vp) { pamFw = new PartialAlignmentManager(64); } vector pals; + const BitPairReference* refs = seededQualSearch_refs; // GreedyDFSRangeSource for finding exact hits for the forward- // oriented read GreedyDFSRangeSource btf1( &ebwtFw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) qualCutoff, // qualThresh maxBtsBetter, // max backtracks 0, // reportPartials (don't) @@ -3636,6 +3667,8 @@ static void* seededQualSearchWorkerFull(void *vp) { false); // considerQuals GreedyDFSRangeSource bt1( &ebwtFw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) qualCutoff, // qualThresh maxBtsBetter, // max backtracks 0, // reportPartials (don't) @@ -3650,6 +3683,8 @@ static void* seededQualSearchWorkerFull(void *vp) { // GreedyDFSRangeSource to search for hits for cases 1F, 2F, 3F GreedyDFSRangeSource btf2( &ebwtBw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) qualCutoff, // qualThresh maxBtsBetter, // max backtracks 0, // reportPartials (no) @@ -3664,6 +3699,8 @@ static void* seededQualSearchWorkerFull(void *vp) { // GreedyDFSRangeSource to search for partial alignments for case 4R GreedyDFSRangeSource btr2( &ebwtBw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) qualCutoff, // qualThresh (none) maxBtsBetter, // max backtracks seedMms, // report partials (up to seedMms mms) @@ -3678,6 +3715,8 @@ static void* seededQualSearchWorkerFull(void *vp) { // GreedyDFSRangeSource to search for seedlings for case 4F GreedyDFSRangeSource btf3( &ebwtFw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) qualCutoff, // qualThresh (none) maxBtsBetter, // max backtracks seedMms, // reportPartials (do) @@ -3693,6 +3732,8 @@ static void* seededQualSearchWorkerFull(void *vp) { // the partial alignments found in Phase 2 GreedyDFSRangeSource btr3( &ebwtFw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) qualCutoff, // qualThresh maxBtsBetter, // max backtracks 0, // reportPartials (don't) @@ -3707,6 +3748,8 @@ static void* seededQualSearchWorkerFull(void *vp) { // The half-and-half GreedyDFSRangeSource GreedyDFSRangeSource btr23( &ebwtFw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) qualCutoff, // qualThresh maxBtsBetter, // max backtracks 0, // reportPartials (don't) @@ -3723,6 +3766,8 @@ static void* seededQualSearchWorkerFull(void *vp) { // the partial alignments found in Phase 3 GreedyDFSRangeSource btf4( &ebwtBw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) qualCutoff, // qualThresh maxBtsBetter, // max backtracks 0, // reportPartials (don't) @@ -3737,6 +3782,8 @@ static void* seededQualSearchWorkerFull(void *vp) { // Half-and-half GreedyDFSRangeSource for forward read GreedyDFSRangeSource btf24( &ebwtBw, params, + refs, // reference sequence (for colorspace) + snpPhred, // phred probability of SNP (for colorspace) qualCutoff, // qualThresh maxBtsBetter, // max backtracks 0, // reportPartials (don't) @@ -3804,6 +3851,7 @@ static void* seededQualSearchWorkerFullStateful(void *vp) { seedMms, seedLen, qualCutoff, + snpPhred, maxBts, _sink, *sinkFact, @@ -3811,6 +3859,7 @@ static void* seededQualSearchWorkerFullStateful(void *vp) { NULL, //&cacheBw, cacheLimit, pool, + refs, os, !noMaqRound, !better, @@ -3823,12 +3872,14 @@ static void* seededQualSearchWorkerFullStateful(void *vp) { PairedSeedAlignerFactory alPEfact( ebwtFw, &ebwtBw, + color, useV1, !nofw, !norc, seedMms, seedLen, qualCutoff, + snpPhred, maxBts, _sink, *sinkFact, @@ -3908,7 +3959,6 @@ static void seededQualCutoffSearch( vector >& os) /// text strings, if available (empty otherwise) { // Global intialization - assert(revcomp); assert(!fullIndex); assert_leq(seedMms, 3); uint32_t numQs = ((qUpto == 0xffffffff) ? 16 * 1024 * 1024 : qUpto); @@ -4079,7 +4129,6 @@ static void seededQualCutoffSearchFull( { // Global intialization assert(fullIndex); - assert(revcomp); assert_leq(seedMms, 3); seededQualSearch_patsrc = &_patsrc; @@ -4095,9 +4144,10 @@ static void seededQualCutoffSearchFull( // Create range caches, which are shared among all aligners BitPairReference *refs = NULL; - if((mates1.size() > 0 || mates12.size() > 0) && mixedThresh < 0xffffffff) { + bool pair = mates1.size() > 0 || mates12.size() > 0; + if(color || (pair && mixedThresh < 0xffffffff)) { Timer _t(cerr, "Time loading reference: ", timing); - refs = new BitPairReference(adjustedEbwtFileBase, sanityCheck, NULL, &os, false, useMm, useShmem, mmSweep, verbose, startVerbose); + refs = new BitPairReference(adjustedEbwtFileBase, color, sanityCheck, NULL, &os, false, useMm, useShmem, mmSweep, verbose, startVerbose); if(!refs->loaded()) throw 1; } seededQualSearch_refs = refs; @@ -4112,7 +4162,7 @@ static void seededQualCutoffSearchFull( { // Load the other half of the index into memory Timer _t(cerr, "Time loading mirror index: ", timing); - ebwtBw.loadIntoMemory(!noRefNames, startVerbose); + ebwtBw.loadIntoMemory(color ? 1 : 0, !noRefNames, startVerbose); } CHUD_START(); { @@ -4151,11 +4201,11 @@ static PatternSource* patsrcFromStrings(int format, const vector& qs) { switch(format) { case FASTA: - return new FastaPatternSource (qs, randomizeQuals, + return new FastaPatternSource (qs, color, + randomizeQuals, useSpinlock, patDumpfile, verbose, trim3, trim5, - forgiveInput, skipReads); case FASTA_CONT: return new FastaContinuousPatternSource ( @@ -4165,17 +4215,18 @@ patsrcFromStrings(int format, const vector& qs) { patDumpfile, verbose, skipReads); case RAW: - return new RawPatternSource (qs, randomizeQuals, + return new RawPatternSource (qs, color, + randomizeQuals, useSpinlock, patDumpfile, verbose, trim3, trim5, skipReads); case FASTQ: - return new FastqPatternSource (qs, randomizeQuals, + return new FastqPatternSource (qs, color, + randomizeQuals, useSpinlock, patDumpfile, verbose, trim3, trim5, - forgiveInput, solexaQuals, phred64Quals, integerQuals, fuzzy, skipReads); @@ -4184,7 +4235,6 @@ patsrcFromStrings(int format, const vector& qs) { useSpinlock, patDumpfile, verbose, trim3, trim5, - forgiveInput, solexaQuals, phred64Quals, integerQuals, skipReads); case CMDLINE: @@ -4407,6 +4457,7 @@ static void driver(const char * type, cerr << "About to initialize fw Ebwt: "; logTime(cerr, true); } Ebwt ebwt(adjustedEbwtFileBase, + color, // index is colorspace true, // index is for the forward direction /* overriding: */ offRate, /* overriding: */ isaRate, @@ -4426,6 +4477,7 @@ static void driver(const char * type, cerr << "About to initialize rev Ebwt: "; logTime(cerr, true); } ebwtBw = new Ebwt(adjustedEbwtFileBase + ".rev", + color, // index is colorspace false, // index is for the reverse direction /* overriding: */ offRate, /* overriding: */ isaRate, @@ -4444,13 +4496,13 @@ static void driver(const char * type, // against original strings assert_eq(os.size(), ebwt.nPat()); for(size_t i = 0; i < os.size(); i++) { - assert_eq(length(os[i]), ebwt.plen()[i]); + assert_eq(length(os[i]), ebwt.plen()[i] + (color ? 1 : 0)); } } // Sanity-check the restored version of the Ebwt if(sanityCheck && !os.empty()) { - ebwt.loadIntoMemory(!noRefNames, startVerbose); - ebwt.checkOrigs(os, false); + ebwt.loadIntoMemory(color ? 1 : 0, !noRefNames, startVerbose); + ebwt.checkOrigs(os, color, false); ebwt.evictFromMemory(); } { @@ -4500,9 +4552,10 @@ static void driver(const char * type, } sam->appendHeaders( sam->out(0), ebwt.nPat(), - refnames, samNoSQ, rmap, + refnames, color, samNoSQ, rmap, ebwt.plen(), fullRef, - argstr.c_str()); + argstr.c_str(), + rgs.empty() ? NULL : rgs.c_str()); } sink = sam; } @@ -4598,7 +4651,7 @@ static void driver(const char * type, // Search without mismatches // Note that --fast doesn't make a difference here because // we're only loading half of the index anyway - exactSearch(*patsrc, *sink, ebwt, os, paired); + exactSearch(*patsrc, *sink, ebwt, os); } // Evict any loaded indexes from memory if(ebwt.isInMemory()) { diff --git a/ebwt_search_backtrack.h b/ebwt_search_backtrack.h index 8dd79db..0c59f09 100644 --- a/ebwt_search_backtrack.h +++ b/ebwt_search_backtrack.h @@ -24,46 +24,50 @@ class GreedyDFSRangeSource { public: GreedyDFSRangeSource( - const Ebwt* __ebwt, - const EbwtSearchParams& __params, - uint32_t __qualThresh, /// max acceptable q-distance + const Ebwt* ebwt, + const EbwtSearchParams& params, + const BitPairReference* refs, + int snpPhred, + uint32_t qualThresh, /// max acceptable q-distance const int maxBts, /// maximum # backtracks allowed - uint32_t __reportPartials = 0, - bool __reportExacts = true, - bool __reportRanges = false, - PartialAlignmentManager* __partials = NULL, - String* __muts = NULL, - bool __verbose = true, - vector >* __os = NULL, - bool __considerQuals = true, // whether to consider quality values when making backtracking decisions - bool __halfAndHalf = false, // hacky way of supporting separate revisitable regions - bool __maqPenalty = true) : + uint32_t reportPartials = 0, + bool reportExacts = true, + bool reportRanges = false, + PartialAlignmentManager* partials = NULL, + String* muts = NULL, + bool verbose = true, + vector >* os = NULL, + bool considerQuals = true, // whether to consider quality values when making backtracking decisions + bool halfAndHalf = false, // hacky way of supporting separate revisitable regions + bool maqPenalty = true) : + _refs(refs), + _snpPhred(snpPhred), _qry(NULL), _qlen(0), _qual(NULL), _name(NULL), - _ebwt(__ebwt), - _params(__params), + _ebwt(ebwt), + _params(params), _unrevOff(0), _1revOff(0), _2revOff(0), _3revOff(0), - _maqPenalty(__maqPenalty), - _qualThresh(__qualThresh), + _maqPenalty(maqPenalty), + _qualThresh(qualThresh), _pairs(NULL), _elims(NULL), _mms(), _refcs(), _chars(NULL), - _reportPartials(__reportPartials), - _reportExacts(__reportExacts), - _reportRanges(__reportRanges), - _partials(__partials), - _muts(__muts), - _os(__os), + _reportPartials(reportPartials), + _reportExacts(reportExacts), + _reportRanges(reportRanges), + _partials(partials), + _muts(muts), + _os(os), _sanity(_os != NULL && _os->size() > 0), - _considerQuals(__considerQuals), - _halfAndHalf(__halfAndHalf), + _considerQuals(considerQuals), + _halfAndHalf(halfAndHalf), _5depth(0), _3depth(0), _numBts(0), @@ -72,14 +76,14 @@ class GreedyDFSRangeSource { _precalcedSideLocus(false), _preLtop(), _preLbot(), - _verbose(__verbose), + _verbose(verbose), _ihits(0llu) { } ~GreedyDFSRangeSource() { - if(_pairs != NULL) delete[] _pairs; - if(_elims != NULL) delete[] _elims; - if(_chars != NULL) delete[] _chars; + if(_pairs != NULL) delete[] _pairs; + if(_elims != NULL) delete[] _elims; + if(_chars != NULL) delete[] _chars; } /** @@ -131,6 +135,7 @@ class GreedyDFSRangeSource { } // Initialize the random source using new read as part of the // seed. + _color = r.color; _rand.init(r.seed); } @@ -347,12 +352,6 @@ class GreedyDFSRangeSource { // Remainder of this function is sanity checking sink.setRetainHits(oldRetain); // restore old value - if(_sanity) { - // Check that the alignments produced for the read and - // backtracking constraints are consistent with results - // obtained by a naive oracle - sanityCheckHits(sink.retainedHits(), _ihits, iham); - } _totNumBts += _numBts; _numBts = 0; _precalcedSideLocus = false; @@ -684,9 +683,6 @@ class GreedyDFSRangeSource { backtrackDespiteMatch = true; mustBacktrack = true; } else if(stackDepth == 0) { - if(sink.numValidHits() == prehits) { - confirmNoHit(iham); - } // We're returning from the bottommost frame // without having found any hits; let's // sanity-check that there really aren't any @@ -716,12 +712,6 @@ class GreedyDFSRangeSource { mustBacktrack = true; backtrackDespiteMatch = true; } else if(stackDepth < 2) { - if(stackDepth == 0 && sink.numValidHits() == prehits) { - // We're returning from the bottommost frame - // without having found any hits; let's - // sanity-check that there really aren't any - confirmNoHit(iham); - } return false; } } @@ -744,11 +734,7 @@ class GreedyDFSRangeSource { !invalidExact && // alignment isn't disqualified by no-exact-hits setting !reportedPartial) // for when it's a partial alignment we've already reported { - uint64_t rhits = sink.numReportedHits(); bool ret = reportAlignment(stackDepth, top, bot, ham); - if(_sanity && sink.numReportedHits() > rhits) { - confirmHit(iham); - } if(!ret) { // reportAlignment returned false, so enter the // backtrack loop and keep going @@ -992,11 +978,6 @@ class GreedyDFSRangeSource { } if(ret) { assert_gt(sink.numValidHits(), prehits); - // Not necessarily true that there is a hit to - // confirm; it could be that true was returned - // because we just ran up against the max, where - // max > k - //if(_sanity) confirmHit(iham); return true; // return, signaling that we're done } if(_bailedOnBacktracks || @@ -1025,12 +1006,6 @@ class GreedyDFSRangeSource { assert_eq(0, altNum); assert_eq(0, eligibleSz); assert_eq(0, eligibleNum); - if(stackDepth == 0 && sink.numValidHits() == prehits) { - // We're returning from the bottommost frame - // without having found any hits; let's - // sanity-check that there really aren't any - confirmNoHit(iham); - } return false; } else if(eligibleNum == 0 && _considerQuals) { @@ -1096,15 +1071,6 @@ class GreedyDFSRangeSource { // Try again } // while(top == bot && altNum > 0) if(mustBacktrack || invalidHalfAndHalf || invalidExact) { - // This full alignment was rejected for one of the above - // reasons - return false to indicate we should keep - // searching - if(stackDepth == 0 && sink.numValidHits() == prehits) { - // We're returning from the bottommost frame - // without having found any hits; let's - // sanity-check that there really aren't any - confirmNoHit(iham); - } return false; } // Mismatch with no alternatives @@ -1112,12 +1078,6 @@ class GreedyDFSRangeSource { assert_eq(0, altNum); assert_eq(0, eligibleSz); assert_eq(0, eligibleNum); - if(stackDepth == 0 && sink.numValidHits() == prehits) { - // We're returning from the bottommost frame - // without having found any hits; let's - // sanity-check that there really aren't any - confirmNoHit(iham); - } return false; } // Match! @@ -1134,9 +1094,6 @@ class GreedyDFSRangeSource { if(stackDepth >= _reportPartials) { ret = reportAlignment(stackDepth, top, bot, ham); } - if(!ret && stackDepth == 0 && sink.numValidHits() == prehits) { - confirmNoHit(iham); - } return ret; } @@ -1253,7 +1210,6 @@ class GreedyDFSRangeSource { const std::vector& mms, uint64_t prehits = 0xffffffffffffffffllu) { - HitSinkPerThread& sink = _params.sink(); assert_eq(0, _reportPartials); // Crossing from the hi-half into the lo-half if(d == _5depth) { @@ -1264,14 +1220,6 @@ class GreedyDFSRangeSource { assert_leq(stackDepth, 1); // Reject if we haven't encountered mismatch by this point if(stackDepth == 0) { - if(prehits != 0xffffffffffffffffllu && - sink.numValidHits() == prehits) - { - // We're returning from the bottommost frame - // without having reported any hits; let's - // sanity-check that there really aren't any - confirmNoHit(iham); - } return false; } } else { // if(_3revOff != _2revOff) @@ -1281,14 +1229,6 @@ class GreedyDFSRangeSource { assert_leq(stackDepth, 2); // Reject if we haven't encountered mismatch by this point if(stackDepth < 1) { - if(prehits != 0xffffffffffffffffllu && - sink.numValidHits() == prehits) - { - // We're returning from the bottommost frame - // without having found any hits; let's - // sanity-check that there really aren't any - confirmNoHit(iham); - } return false; } } @@ -1301,12 +1241,6 @@ class GreedyDFSRangeSource { assert_leq(stackDepth, 2); // Must have encountered two mismatches by this point if(stackDepth < 2) { - if(prehits != 0xffffffffffffffffllu && - sink.numValidHits() == prehits && - stackDepth == 0) - { - confirmNoHit(iham); - } // We're returning from the bottommost frame // without having found any hits; let's // sanity-check that there really aren't any @@ -1326,13 +1260,6 @@ class GreedyDFSRangeSource { assert_leq(loHalfMms + hiHalfMms, 3); assert_gt(hiHalfMms, 0); if(loHalfMms == 0) { - // Didn't encounter any mismatches in the lo-half - if(prehits != 0xffffffffffffffffllu && - sink.numValidHits() == prehits && - stackDepth == 0) - { - confirmNoHit(iham); - } // We're returning from the bottommost frame // without having found any hits; let's // sanity-check that there really aren't any @@ -1609,12 +1536,7 @@ class GreedyDFSRangeSource { // of the backtracker) return false; } - if(_reportRanges) { - assert(_params.arrowMode()); - return _ebwt->report((*_qry), _qual, _name, - _mms, _refcs, stackDepth, 0, - top, bot, _qlen, stratum, cost, _params); - } + assert(!_reportRanges); uint32_t spread = bot - top; // Pick a random spot in the range to begin report uint32_t r = top + (_rand.nextU32() % spread); @@ -1625,9 +1547,9 @@ class GreedyDFSRangeSource { // their indices into the query string; not in terms // of their offset from the 3' or 5' end. if(_ebwt->reportChaseOne((*_qry), _qual, _name, - _mms, _refcs, stackDepth, ri, - top, bot, _qlen, stratum, cost, - _params)) + _color, _snpPhred, _refs, _mms, + _refcs, stackDepth, ri, top, bot, + _qlen, stratum, cost, _params)) { // Return value of true means that we can stop return true; @@ -1775,258 +1697,14 @@ class GreedyDFSRangeSource { return true; } - /** - * Confirm that no - */ - void confirmNoHit(uint32_t iham) { - // Not smart enough to deal with seedling hits yet - if(!_sanity || _reportPartials > 0) return; - vector oracleHits; - // Invoke the naive oracle, which will place all qualifying - // hits in the 'oracleHits' vector - naiveOracle(oracleHits, iham); - if(oracleHits.size() > 0) { - // Oops, the oracle found at least one hit; print - // detailed info about the first oracle hit (for - // debugging) - cout << "Oracle hit " << oracleHits.size() - << " times, but backtracker did not hit" << endl; - for(size_t i = 0; i < oracleHits.size() && i < 3; i++) { - const Hit& h = oracleHits[i]; - cout << " Oracle hit " << i << ": " << endl; - if(_muts != NULL) { - undoPartialMutations(); - cout << " Unmutated Pat: " << prefix(*_qry, _qlen) << endl; - applyPartialMutations(); - } - cout << " Pat: " << prefix(*_qry, _qlen) << endl; - cout << " Tseg: "; - bool ebwtFw = _ebwt->fw(); - if(ebwtFw) { - for(size_t i = 0; i < _qlen; i++) { - cout << (*_os)[h.h.first][h.h.second + i]; - } - } else { - for(int i = (int)_qlen-1; i >= 0; i--) { - cout << (*_os)[h.h.first][h.h.second + i]; - } - } - cout << endl; - cout << " Quals: " << prefix(*_qual, _qlen) << endl; - cout << " Bt: "; - for(int i = (int)_qlen-1; i >= 0; i--) { - if (i < (int)_unrevOff) cout << "0"; - else if(i < (int)_1revOff) cout << "1"; - else if(i < (int)_2revOff) cout << "2"; - else if(i < (int)_3revOff) cout << "3"; - else cout << "X"; - } - cout << endl; - } - } - assert_eq(0, oracleHits.size()); - } - - /** - * - */ - void confirmHit(uint32_t iham) { - // Not smart enough to deal with seedling hits yet - if(!_sanity || _reportPartials > 0) return; - vector oracleHits; - // Invoke the naive oracle, which will place all qualifying - // hits in the 'oracleHits' vector - naiveOracle(oracleHits, iham); - vector& retainedHits = _params.sink().retainedHits(); - assert_gt(retainedHits.size(), 0); - if(oracleHits.size() == 0) { - ::printHit( - (*_os), retainedHits.back(), (*_qry), _qlen, _unrevOff, - _1revOff, _2revOff, _3revOff, _ebwt->fw()); - } - // If we found a hit, it had better be one of the ones - // that the oracle found - assert_gt(oracleHits.size(), 0); - // Get the hit reported by the backtracker - Hit& rhit = retainedHits.back(); - // Go through oracleHits and look for a match - size_t i; - for(i = 0; i < oracleHits.size(); i++) { - const Hit& h = oracleHits[i]; - if(h.h.first == rhit.h.first && - h.h.second == rhit.h.second && - h.fw == rhit.fw) - { - assert(h.mms == rhit.mms); - // It's a match - hit confirmed - break; - } - } - assert_lt(i, oracleHits.size()); // assert we found a matchup - } - - /** - * Sanity-check the final set of alignments generated for this read - * with respect to the current backtracking constraints. - */ - void sanityCheckHits(const vector& hits, - size_t first, uint32_t iham) - { - if(!_sanity || _reportPartials > 0 || _bailedOnBacktracks || first == hits.size()) { - return; - } - HitSinkPerThread& sink = _params.sink(); - uint32_t maxHitsAllowed = sink.maxHits(); - vector oracleHits; - // Invoke the naive oracle, which will place all qualifying - // hits and their strata in the 'oracleHits' vector - naiveOracle(oracleHits, iham); - assert_gt(oracleHits.size(), 0); - int lastStratum = -1; - int bestStratum = -1; - int worstStratum = -1; - // For each hit generated for this read - for(size_t i = first; i < hits.size(); i++) { - const Hit& rhit = hits[i]; - int stratum = rhit.stratum; - // Keep a running measure of the "worst" stratum - // observed in the reported hits - if(worstStratum == -1) { - worstStratum = stratum; - } else if(stratum > worstStratum) { - worstStratum = stratum; - } - // Keep a running measure of the "best" stratum - // observed in the reported hits - if(bestStratum == -1) { - bestStratum = stratum; - } else if(stratum < bestStratum) { - bestStratum = stratum; - } - if(lastStratum == -1) { - lastStratum = stratum; - } else if(!sink.spanStrata()) { - // Retained hits are not allowed to "span" strata, - // so this hit's stratum had better be the same as - // the last hit's - assert_eq(lastStratum, stratum); - } - // Go through oracleHits and look for a match - size_t i; - bool found = false; - for(i = 0; i < oracleHits.size(); i++) { - const Hit& h = oracleHits[i]; - if(h.h.first == rhit.h.first && - h.h.second == rhit.h.second && - h.fw == rhit.fw) - { - // Oracle hit i and reported hit j refer to the - // same alignment - assert(h.mms == rhit.mms); - assert_eq(stratum, h.stratum); - // Erase the elements in the oracle vectors - oracleHits.erase(oracleHits.begin() + i); - found = true; - break; - } - } - // If the backtracker found a hit, it had better be one - // of the ones that the oracle found - assert(found); - } - // All of the oracle hits that corresponded to a reported - // hit have now been eliminated - assert_neq(-1, lastStratum); - assert_neq(-1, bestStratum); - assert_neq(-1, worstStratum); - if(maxHitsAllowed == 0xffffffff && !sink.exceededOverThresh()) { - if(sink.spanStrata()) { - // All hits remaining must occur more times than the max - map readToCnt; - for(size_t i = 0; i < oracleHits.size(); i++) { - readToCnt[oracleHits[i].patId]++; - } - map::iterator it; - for(it = readToCnt.begin(); it != readToCnt.end(); it++) { - assert_gt(it->second, sink.overThresh()); - } - } else { - // Must have matched all oracle hits at the best - // stratum - size_t numLeftovers = 0; - for(size_t i = 0; i < oracleHits.size(); i++) { - if(oracleHits[i].stratum == bestStratum) { - numLeftovers++; - } - } - // Must have matched every oracle hit - if(numLeftovers > 0 && numLeftovers <= sink.overThresh()) { - for(size_t i = 0; i < oracleHits.size(); i++) { - if(oracleHits[i].stratum == bestStratum) { - printHit(oracleHits[i]); - } - } - assert(0); - } - } - } - if(sink.best() && sink.overThresh() == 0xffffffff && !sink.exceededOverThresh()) { - // Ensure that all oracle hits have a stratum at least - // as bad as the worst stratum observed in the reported - // hits - for(size_t i = 0; i < oracleHits.size(); i++) { - assert_leq(bestStratum, oracleHits[i].stratum); - } - } - } - - /** - * Naively search for hits for the current pattern under the - * current backtracking strategy and store hits in hits vector. - */ - void naiveOracle(vector& hits, - uint32_t iham = 0, /// initial weighted hamming distance - uint32_t unrevOff = 0xffffffff, - uint32_t oneRevOff = 0xffffffff, - uint32_t twoRevOff = 0xffffffff, - uint32_t threeRevOff = 0xffffffff) - { - if(unrevOff == 0xffffffff) unrevOff = _unrevOff; - if(oneRevOff == 0xffffffff) oneRevOff = _1revOff; - if(twoRevOff == 0xffffffff) twoRevOff = _2revOff; - if(threeRevOff == 0xffffffff) threeRevOff = _3revOff; - bool ebwtFw = _ebwt->fw(); - bool fw = _params.fw(); - uint32_t patid = _params.patId(); - - ::naiveOracle( - (*_os), - (*_qry), - _qlen, - (*_qual), - (*_name), - patid, - hits, - _qualThresh, - unrevOff, - oneRevOff, - twoRevOff, - threeRevOff, - fw, // transpose - ebwtFw, - iham, - _muts, - _maqPenalty, - _halfAndHalf, - _reportExacts, - false); - } - + const BitPairReference* _refs; // reference sequences (or NULL if not colorspace) + int _snpPhred; // phred probability of SNP String* _qry; // query (read) sequence size_t _qlen; // length of _qry String* _qual; // quality values for _qry - String* _name; // name of _qry - const Ebwt >* _ebwt; // Ebwt to search in + String* _name; // name of _qry + bool _color; // whether read is colorspace + const Ebwt >* _ebwt; // Ebwt to search in const EbwtSearchParams >& _params; // Ebwt to search in uint32_t _unrevOff; // unrevisitable chunk uint32_t _1revOff; // 1-revisitable chunk @@ -2126,7 +1804,7 @@ class EbwtRangeSource : public RangeSource { bool maqPenalty, bool qualOrder, AlignerMetrics *metrics = NULL) : - RangeSource(), + RangeSource(), qry_(NULL), qlen_(0), qual_(NULL), @@ -2205,6 +1883,7 @@ class EbwtRangeSource : public RangeSource { assert_geq(length(*qual_), qlen_); this->done = false; this->foundRange = false; + color_ = r.color; rand_.init(r.seed); } @@ -2895,6 +2574,7 @@ class EbwtRangeSource : public RangeSource { size_t qlen_; // length of _qry String* qual_; // quality values for _qry String* name_; // name of _qry + bool color_; // true -> read is colorspace String* altQry_; // alternate basecalls String* altQual_; // quality values for alternate basecalls int alts_; // max # alternatives diff --git a/ebwt_search_util.cpp b/ebwt_search_util.cpp index 6b7ff59..30fe5b9 100644 --- a/ebwt_search_util.cpp +++ b/ebwt_search_util.cpp @@ -9,14 +9,14 @@ using namespace seqan; * regions constraining the hit. */ void printHit(const vector >& os, - const Hit& h, - const String& qry, - size_t qlen, - uint32_t unrevOff, - uint32_t oneRevOff, - uint32_t twoRevOff, - uint32_t threeRevOff, - bool ebwtFw) + const Hit& h, + const String& qry, + size_t qlen, + uint32_t unrevOff, + uint32_t oneRevOff, + uint32_t twoRevOff, + uint32_t threeRevOff, + bool ebwtFw) { // Print pattern sequence cout << " Pat: " << qry << endl; @@ -42,214 +42,3 @@ void printHit(const vector >& os, } cout << endl; } - -/** - * Naively search for the same hits that should be found by the - * backtracking search. This is used only if --orig and -s have - * been specified for bowtie-debug. - */ -void naiveOracle(const vector >& os, - const String& qry, - uint32_t qlen, - const String& qual, - const String& name, - uint32_t patid, - vector& hits, - uint32_t qualThresh, - uint32_t unrevOff, - uint32_t oneRevOff, - uint32_t twoRevOff, - uint32_t threeRevOff, - bool fw, - bool ebwtFw, - uint32_t iham = 0, - String* muts = NULL, - bool maqPenalty = true, - bool halfAndHalf = false, - bool reportExacts = true, - bool invert = false) -{ - bool fivePrimeOnLeft = (ebwtFw == fw); - uint32_t plen = qlen; - uint8_t *pstr = (uint8_t *)begin(qry, Standard()); - // For each text... - for(size_t i = 0; i < os.size(); i++) { - // For each text position... - if(length(os[i]) < plen) continue; - uint32_t olen = length(os[i]); - uint8_t *ostr = (uint8_t *)begin(os[i], Standard()); - // For each possible alignment of pattern against text - for(size_t j = 0; j <= olen - plen; j++) { - size_t mms = 0; // mismatches observed over the whole thing - size_t rev1mm = 0; // mismatches observed in the 1-revisitable region - size_t rev2mm = 0; // mismatches observed in the 2-revisitable region - size_t rev3mm = 0; // mismatches observed in the 3-revisitable region - uint32_t ham = iham; // weighted hamming distance so far - FixedBitset diffs; // mismatch bitvector - vector refcs; // reference characters for mms - refcs.resize(qlen, 0); - // For each alignment column, from right to left - bool success = true; - int ok, okInc; - if(ebwtFw) { - ok = j+(int)plen-1; - okInc = -1; - } else { - ok = olen-(j+((int)plen-1))-1; - okInc = 1; - } - bool rejectN = false; - for(int k = (int)plen-1; k >= 0; k--) { - size_t kr = plen-1-k; - if((int)ostr[ok] == 4) { - rejectN = true; - break; - } - if(pstr[k] != ostr[ok]) { - mms++; - ham += mmPenalty(maqPenalty, phredCharToPhredQual(qual[k])); - if(ham > qualThresh) { - // Alignment is invalid because it exceeds - // our target weighted hamming distance - // threshold - success = false; - break; - } - size_t koff = kr; - if(invert) koff = (size_t)k; - // What region does the mm fall into? - if(koff < unrevOff) { - // Alignment is invalid because it contains - // a mismatch in the unrevisitable region - success = false; - break; - } else if(koff < oneRevOff) { - rev1mm++; - if(rev1mm > 1) { - // Alignment is invalid because it - // contains more than 1 mismatch in the - // 1-revisitable region - success = false; - break; - } - } else if(koff < twoRevOff) { - rev2mm++; - if(rev2mm > 2) { - // Alignment is invalid because it - // contains more than 2 mismatches in the - // 2-revisitable region - success = false; - break; - } - } else if(koff < threeRevOff) { - rev3mm++; - if(rev3mm > 3) { - // Alignment is invalid because it - // contains more than 3 mismatches in the - // 3-revisitable region - success = false; - break; - } - } - if(halfAndHalf) { - if(twoRevOff == threeRevOff) { - // 1 and 1 - assert_eq(0, rev3mm); - if(rev1mm > 1 || rev2mm > 1) { - // Half-and-half alignment is invalid - // because it contains more than 1 mismatch - // in either one or the other half - success = false; - break; - } - } else { - // 1 and 1,2 - assert_eq(unrevOff, oneRevOff); - assert_eq(0, rev1mm); - if(rev2mm > 2 || rev3mm > 2) { - success = false; - break; - } - } - } - // Update 'diffs' and 'refcs' to reflect this - // mismatch - if(fivePrimeOnLeft) { - diffs.set(k); - refcs[k] = (char)ostr[ok]; - } else { - // The 3' end is on on the left end of the - // pattern, but the diffs vector should - // encode mismatches w/r/t the 5' end, so - // we flip - diffs.set(plen-k-1); - refcs[plen-k-1] = (char)ostr[ok]; - } - } - ok += okInc; - } - if(rejectN) { - // Rejected because the reference half of the - // alignment contained one or more Ns - continue; - } - if(halfAndHalf) { - if(twoRevOff == threeRevOff) { - if(rev1mm != 1 || rev2mm != 1) { - success = false; - } - } else { - if(rev2mm == 0 || rev3mm == 0 || - rev2mm + rev3mm < 2 || - rev2mm + rev3mm > 3) - { - success = false; - } - } - } - if(!reportExacts && mms == 0) { - // Reject this exact alignment because we've been - // instructed to ignore them (usually because a - // previous invocation already reported them) - success = false; - } - if(success) { - // It's a hit - uint32_t off = j; - int stratum = (int)(rev1mm + rev2mm + rev3mm); - if(!ebwtFw) { - off = olen - off; - off -= plen; - } - // Add in mismatches from _muts - if(muts != NULL) { - for(size_t i = 0; i < length(*muts); i++) { - // Entries in _mms[] are in terms of offset into - // _qry - not in terms of offset from 3' or 5' end - if(fivePrimeOnLeft) { - diffs.set((*muts)[i].pos); - refcs[(*muts)[i].pos] = (*muts)[i].newBase; - } else { - diffs.set(plen - (*muts)[i].pos - 1); - refcs[plen - (*muts)[i].pos - 1] = (*muts)[i].newBase; - } - stratum++; - } - } - Hit h(make_pair(i, off), // ref coords - make_pair(0, 0), // (bogus) mate ref coords - patid, // read id - name, // read name - qry, // read sequence - qual, // read qualities - fw, // forward/reverse-comp - true, // (bogus) mate forward/reverse-comp - 0, // (bogus) mate length - diffs, // mismatch bitvector - refcs, - 0, stratum, ham | (stratum << 14), 0); - hits.push_back(h); - } // For each pattern character - } // For each alignment over current text - } // For each text -} diff --git a/ebwt_search_util.h b/ebwt_search_util.h index b70ba14..e54efd5 100644 --- a/ebwt_search_util.h +++ b/ebwt_search_util.h @@ -160,28 +160,6 @@ void printHit(const vector >& os, uint32_t threeRevOff, bool ebwtFw); -extern -void naiveOracle(const vector >& os, - const String& qry, - uint32_t qlen, - const String& qual, - const String& name, - uint32_t patid, - vector& hits, - uint32_t qualThresh, - uint32_t unrevOff, - uint32_t oneRevOff, - uint32_t twoRevOff, - uint32_t threeRevOff, - bool fw, - bool ebwtFw, - uint32_t iham, - String* muts, - bool maqPenalty, - bool halfAndHalf, - bool reportExacts, - bool invert); - /** * A synchronized data structure for storing partial alignments * associated with patids, with particular attention to compactness. diff --git a/filebuf.h b/filebuf.h index 1c5fc2a..2c0a0ab 100644 --- a/filebuf.h +++ b/filebuf.h @@ -226,6 +226,44 @@ class FileBuf { static const size_t LASTN_BUF_SZ = 8 * 1024; /** + * Keep get()ing characters until a non-whitespace character (or + * -1) is reached, and return it. + */ + int getPastWhitespace() { + int c; + while(isspace(c = get()) && c != -1); + return c; + } + + /** + * Keep get()ing characters until a we've passed over the next + * string of newline characters (\r's and \n's) or -1 is reached, + * and return it. + */ + int getPastNewline() { + int c = get(); + while(c != '\r' && c != '\n' && c != -1) c = get(); + while(c == '\r' || c == '\n') c = get(); + assert_neq(c, '\r'); + assert_neq(c, '\n'); + return c; + } + + /** + * Keep get()ing characters until a we've passed over the next + * string of newline characters (\r's and \n's) or -1 is reached, + * and return it. + */ + int peekPastNewline() { + int c = peek(); + while(c != '\r' && c != '\n' && c != -1) c = get(); + while(c == '\r' || c == '\n') c = get(); + assert_neq(c, '\r'); + assert_neq(c, '\n'); + return c; + } + + /** * Reset to the beginning of the last-N-chars buffer. */ void resetLastN() { diff --git a/hit.h b/hit.h index 4899cc6..dca7249 100644 --- a/hit.h +++ b/hit.h @@ -50,9 +50,6 @@ static const std::string output_type_names[] = { typedef pair U32Pair; -// Support reads of up to 1024 characters for now -static const int max_read_bp = 1024; - /** * Encapsulates a hit, including a text-id/text-offset pair, a pattern * id, and a boolean indicating whether it matched as its forward or @@ -62,58 +59,6 @@ class Hit { public: Hit() : stratum(-1) { } - Hit(const Hit& other) { this->operator=(other); } - - Hit(U32Pair _h, - U32Pair _mh, - uint32_t id, - const String& name, - const String& seq, - const String& q, - bool f, bool mf, - uint16_t ml, - const FixedBitset& mm, - const vector& rc, - uint32_t os, - int8_t st, - uint16_t co, - uint8_t m) : - h(_h), mh(_mh), patId(id), oms(os), fw(f), mfw(mf), mlen(ml), - stratum(st), cost(co), mate(m) - { - patName = name; - patSeq = seq; - quals = q; - mms = mm; - refcs = rc; - // Enforce the constraints imposed by the binary output format - if(seqan::length(patName) > 0xffff) { - cerr << "Error: One or more read names are 2^16 characters or longer. Please truncate" << endl - << "read names and re-run bowtie." << endl; - throw 1; - } - if(mms.count() > 0xff) { - cerr << "Error: The alignment contains 256 or more mismatches. bowtie cannot handle" << endl - << "alignments with this many alignments. Please provide smaller reads or consider" << endl - << "using a different tool." << endl; - throw 1; - } - if(seqan::length(quals) > 0xffff) { - cerr << "Error: One or more quality strings are 2^16 characters or longer. Please" << endl - << "truncate reads and re-run bowtie." << endl; - throw 1; - } - if(seqan::length(patSeq) > 0xffff) { - cerr << "Error: One or more read sequences are 2^16 characters or longer. Please" << endl - << "truncate reads and re-run bowtie." << endl; - throw 1; - } - if(stratum < 0 || stratum >= 4) { - cerr << "Error: Stratum is out of bounds: " << stratum << endl; - throw 1; - } - } - /** * Initialize this Ent to represent the given hit. */ @@ -146,9 +91,10 @@ class Hit { hs.name = hits.front().patName; hs.seq = hits.front().patSeq; hs.qual = hits.front().quals; + hs.color = hits.front().color; if(!hits.front().fw) { // Re-reverse - reverseComplementInPlace(hs.seq, false); + reverseComplementInPlace(hs.seq, hs.color); reverseInPlace(hs.qual); } // Convert hits to entries @@ -159,7 +105,7 @@ class Hit { } /** - * Populate a HitSet + * Populate a vector of Hits with hits from the given HitSet. */ static void fromHitSet(std::vector& hits, const HitSet& hs) { assert(hs.sorted()); @@ -174,9 +120,10 @@ class Hit { hits[i].patName = hs.name; hits[i].patSeq = hs.seq; hits[i].quals = hs.qual; + hits[i].color = hs.color; size_t len = seqan::length(hs.seq); if(!hs[i].fw) { - reverseComplementInPlace(hits[i].patSeq, false); + reverseComplementInPlace(hits[i].patSeq, hs.color); reverseInPlace(hits[i].quals); } hits[i].refcs.resize(len); @@ -193,9 +140,12 @@ class Hit { uint32_t patId; /// read index String patName; /// read name String patSeq; /// read sequence + String colSeq; /// read sequence String quals; /// read qualities - FixedBitset mms;/// mismatch mask + FixedBitset<1024> mms; /// nucleotide mismatch mask + FixedBitset<1024> cmms; /// color mismatch mask (if relevant) vector refcs; /// reference characters for mms + vector crefcs; /// reference characters for cmms uint32_t oms; /// # of other possible mappings; 0 -> this is unique bool fw; /// orientation of read in alignment bool mfw; /// orientation of mate in alignment @@ -205,6 +155,7 @@ class Hit { uint8_t mate; /// matedness; 0 = not a mate /// 1 = upstream mate /// 2 = downstream mate + bool color; /// read is in colorspace? size_t length() const { return seqan::length(patSeq); } @@ -214,9 +165,12 @@ class Hit { this->patId = other.patId; this->patName = other.patName; this->patSeq = other.patSeq; + this->colSeq = other.colSeq; this->quals = other.quals; this->mms = other.mms; + this->cmms = other.cmms; this->refcs = other.refcs; + this->crefcs = other.crefcs; this->oms = other.oms; this->fw = other.fw; this->mfw = other.mfw; @@ -224,6 +178,8 @@ class Hit { this->stratum = other.stratum; this->cost = other.cost; this->mate = other.mate; + this->color = other.color; + this->cmms = other.cmms; return *this; } }; @@ -456,7 +412,7 @@ class HitSink { DECL_HIT_DUMPS, bool onePairFile, RecalTable *table, - vector* refnames = NULL) : + vector* refnames = NULL) : _outs(), _deleteOuts(true), recalTable_(table), @@ -774,7 +730,7 @@ class HitSink { MUTEX_LOCK(dumpAlignLockPE_); if(dumpAl_1_ == NULL) { dumpAl_1_ = openOf(dumpAlBase_, 1, ""); - dumpAl_2_ = openOf(dumpAlBase_, 2, ""); + dumpAl_2_ = openOf(dumpAlBase_, 2, ""); assert(dumpAl_1_ != NULL); assert(dumpAl_2_ != NULL); } @@ -852,7 +808,7 @@ class HitSink { MUTEX_LOCK(dumpUnalLockPE_); if(dumpUnal_1_ == NULL) { dumpUnal_1_ = openOf(dumpUnalBase_, 1, ""); - dumpUnal_2_ = openOf(dumpUnalBase_, 2, ""); + dumpUnal_2_ = openOf(dumpUnalBase_, 2, ""); assert(dumpUnal_1_ != NULL); assert(dumpUnal_2_ != NULL); } @@ -878,7 +834,7 @@ class HitSink { MUTEX_LOCK(dumpMaxFaLock_); if(dumpMaxFa_ == NULL) { dumpMaxFa_ = openOf(dumpMaxFaBase_, 0, ".fa"); - assert(dumpMaxFa_ != NULL); + assert(dumpMaxFa_ != NULL); } printFastaRecord(*dumpMaxFa_, p.bufa().name, p.bufa().patFw); MUTEX_UNLOCK(dumpMaxFaLock_); @@ -907,9 +863,9 @@ class HitSink { MUTEX_LOCK(dumpMaxFaLockPE_); if(dumpMaxFa_1_ == NULL) { assert(dumpMaxFa_2_ == NULL); - dumpMaxFa_1_ = openOf(dumpMaxFaBase_, 1, ".fa"); - dumpMaxFa_2_ = openOf(dumpMaxFaBase_, 2, ".fa"); - assert(dumpMaxFa_1_ != NULL && dumpMaxFa_2_ != NULL); + dumpMaxFa_1_ = openOf(dumpMaxFaBase_, 1, ".fa"); + dumpMaxFa_2_ = openOf(dumpMaxFaBase_, 2, ".fa"); + assert(dumpMaxFa_1_ != NULL && dumpMaxFa_2_ != NULL); } printFastaRecord(*dumpMaxFa_1_, p.bufa().name, p.bufa().patFw); printFastaRecord(*dumpMaxFa_2_, p.bufb().name, p.bufb().patFw); @@ -919,9 +875,9 @@ class HitSink { MUTEX_LOCK(dumpMaxFqLockPE_); if(dumpMaxFq_1_ == NULL) { assert(dumpMaxFq_2_ == NULL); - dumpMaxFq_1_ = openOf(dumpMaxFqBase_, 1, ".fq"); - dumpMaxFq_2_ = openOf(dumpMaxFqBase_, 2, ".fq"); - assert(dumpMaxFq_1_ != NULL && dumpMaxFq_2_ != NULL); + dumpMaxFq_1_ = openOf(dumpMaxFqBase_, 1, ".fq"); + dumpMaxFq_2_ = openOf(dumpMaxFqBase_, 2, ".fq"); + assert(dumpMaxFq_1_ != NULL && dumpMaxFq_2_ != NULL); } printFastqRecord(*dumpMaxFq_1_, p.bufa().name, p.bufa().patFw, p.bufa().qual); printFastqRecord(*dumpMaxFq_2_, p.bufb().name, p.bufb().patFw, p.bufb().qual); @@ -1033,169 +989,169 @@ class HitSink { bool onePairFile_; // Output streams for dumping - std::ofstream *dumpAlFa_; // for single-ended reads - std::ofstream *dumpAlFa_1_; // for first mates - std::ofstream *dumpAlFa_2_; // for second mates - std::ofstream *dumpAlFq_; // for single-ended reads - std::ofstream *dumpAlFq_1_; // for first mates - std::ofstream *dumpAlFq_2_; // for second mates - std::ofstream *dumpAl_; // for single-ended reads - std::ofstream *dumpAl_1_; // for first mates - std::ofstream *dumpAl_2_; // for second mates - std::ofstream *dumpUnalFa_; // for single-ended reads - std::ofstream *dumpUnalFa_1_; // for first mates - std::ofstream *dumpUnalFa_2_; // for second mates - std::ofstream *dumpUnalFq_; // for single-ended reads - std::ofstream *dumpUnalFq_1_; // for first mates - std::ofstream *dumpUnalFq_2_; // for second mates - std::ofstream *dumpUnal_; // for single-ended reads - std::ofstream *dumpUnal_1_; // for first mates - std::ofstream *dumpUnal_2_; // for second mates - std::ofstream *dumpMaxFa_; // for single-ended reads - std::ofstream *dumpMaxFa_1_; // for first mates - std::ofstream *dumpMaxFa_2_; // for second mates - std::ofstream *dumpMaxFq_; // for single-ended reads - std::ofstream *dumpMaxFq_1_; // for first mates - std::ofstream *dumpMaxFq_2_; // for second mates - std::ofstream *dumpMax_; // for single-ended reads - std::ofstream *dumpMax_1_; // for first mates - std::ofstream *dumpMax_2_; // for second mates - - /** - * Open an ofstream with given name; output error message and quit - * if it fails. - */ - std::ofstream* openOf(const std::string& name, - int mateType, - const std::string& suffix) + std::ofstream *dumpAlFa_; // for single-ended reads + std::ofstream *dumpAlFa_1_; // for first mates + std::ofstream *dumpAlFa_2_; // for second mates + std::ofstream *dumpAlFq_; // for single-ended reads + std::ofstream *dumpAlFq_1_; // for first mates + std::ofstream *dumpAlFq_2_; // for second mates + std::ofstream *dumpAl_; // for single-ended reads + std::ofstream *dumpAl_1_; // for first mates + std::ofstream *dumpAl_2_; // for second mates + std::ofstream *dumpUnalFa_; // for single-ended reads + std::ofstream *dumpUnalFa_1_; // for first mates + std::ofstream *dumpUnalFa_2_; // for second mates + std::ofstream *dumpUnalFq_; // for single-ended reads + std::ofstream *dumpUnalFq_1_; // for first mates + std::ofstream *dumpUnalFq_2_; // for second mates + std::ofstream *dumpUnal_; // for single-ended reads + std::ofstream *dumpUnal_1_; // for first mates + std::ofstream *dumpUnal_2_; // for second mates + std::ofstream *dumpMaxFa_; // for single-ended reads + std::ofstream *dumpMaxFa_1_; // for first mates + std::ofstream *dumpMaxFa_2_; // for second mates + std::ofstream *dumpMaxFq_; // for single-ended reads + std::ofstream *dumpMaxFq_1_; // for first mates + std::ofstream *dumpMaxFq_2_; // for second mates + std::ofstream *dumpMax_; // for single-ended reads + std::ofstream *dumpMax_1_; // for first mates + std::ofstream *dumpMax_2_; // for second mates + + /** + * Open an ofstream with given name; output error message and quit + * if it fails. + */ + std::ofstream* openOf(const std::string& name, + int mateType, + const std::string& suffix) { - std::string s = name; + std::string s = name; size_t dotoff = name.find_last_of("."); - if(mateType == 1) { - if(dotoff == string::npos) { - s += "_1"; s += suffix; - } else { - s = name.substr(0, dotoff) + "_1" + s.substr(dotoff); - } - } else if(mateType == 2) { - if(dotoff == string::npos) { - s += "_2"; s += suffix; - } else { - s = name.substr(0, dotoff) + "_2" + s.substr(dotoff); - } - } else if(mateType != 0) { - cerr << "Bad mate type " << mateType << endl; throw 1; - } - std::ofstream* tmp = new ofstream(s.c_str(), ios::out); - if(tmp->fail()) { - if(mateType == 0) { - cerr << "Could not open single-ended aligned/unaligned-read file for writing: " << name << endl; - } else { - cerr << "Could not open paired-end aligned/unaligned-read file for writing: " << name << endl; - } - throw 1; - } - return tmp; - } - - /** - * Initialize all the locks for dumping. - */ - void initDumps() { - dumpAlFa_ = dumpAlFa_1_ = dumpAlFa_2_ = NULL; - dumpAlFq_ = dumpAlFq_1_ = dumpAlFq_2_ = NULL; - dumpAl_ = dumpAl_1_ = dumpAl_2_ = NULL; - dumpUnalFa_ = dumpUnalFa_1_ = dumpUnalFa_2_ = NULL; - dumpUnalFq_ = dumpUnalFq_1_ = dumpUnalFq_2_ = NULL; - dumpUnal_ = dumpUnal_1_ = dumpUnal_2_ = NULL; - dumpMaxFa_ = dumpMaxFa_1_ = dumpMaxFa_2_ = NULL; - dumpMaxFq_ = dumpMaxFq_1_ = dumpMaxFq_2_ = NULL; - dumpMax_ = dumpMax_1_ = dumpMax_2_ = NULL; - dumpAlignFlag_ = !dumpAlFaBase_.empty() || - !dumpAlFqBase_.empty() || - !dumpAlBase_.empty(); - dumpUnalignFlag_ = !dumpUnalFaBase_.empty() || - !dumpUnalFqBase_.empty() || - !dumpUnalBase_.empty(); - dumpMaxedFlag_ = !dumpMaxFaBase_.empty() || - !dumpMaxFqBase_.empty() || - !dumpMaxBase_.empty(); - MUTEX_INIT(dumpAlignFaLock_); - MUTEX_INIT(dumpAlignFaLockPE_); - MUTEX_INIT(dumpAlignFqLock_); - MUTEX_INIT(dumpAlignFqLockPE_); - MUTEX_INIT(dumpAlignLock_); - MUTEX_INIT(dumpAlignLockPE_); - MUTEX_INIT(dumpUnalFaLock_); - MUTEX_INIT(dumpUnalFaLockPE_); - MUTEX_INIT(dumpUnalFqLock_); - MUTEX_INIT(dumpUnalFqLockPE_); - MUTEX_INIT(dumpUnalLock_); - MUTEX_INIT(dumpUnalLockPE_); - MUTEX_INIT(dumpMaxFaLock_); - MUTEX_INIT(dumpMaxFaLockPE_); - MUTEX_INIT(dumpMaxFqLock_); - MUTEX_INIT(dumpMaxFqLockPE_); - MUTEX_INIT(dumpMaxLock_); - MUTEX_INIT(dumpMaxLockPE_); - } - - void destroyDumps() { - if(dumpAlFa_ != NULL) { dumpAlFa_->close(); delete dumpAlFa_; } - if(dumpAlFa_1_ != NULL) { dumpAlFa_1_->close(); delete dumpAlFa_1_; } - if(dumpAlFa_2_ != NULL) { dumpAlFa_2_->close(); delete dumpAlFa_2_; } - if(dumpAlFq_ != NULL) { dumpAlFq_->close(); delete dumpAlFq_; } - if(dumpAlFq_1_ != NULL) { dumpAlFq_1_->close(); delete dumpAlFq_1_; } - if(dumpAlFq_2_ != NULL) { dumpAlFq_2_->close(); delete dumpAlFq_2_; } - if(dumpAl_ != NULL) { dumpAl_->close(); delete dumpAl_; } - if(dumpAl_1_ != NULL) { dumpAl_1_->close(); delete dumpAl_1_; } - if(dumpAl_2_ != NULL) { dumpAl_2_->close(); delete dumpAl_2_; } - - if(dumpUnalFa_ != NULL) { dumpUnalFa_->close(); delete dumpUnalFa_; } - if(dumpUnalFa_1_ != NULL) { dumpUnalFa_1_->close(); delete dumpUnalFa_1_; } - if(dumpUnalFa_2_ != NULL) { dumpUnalFa_2_->close(); delete dumpUnalFa_2_; } - if(dumpUnalFq_ != NULL) { dumpUnalFq_->close(); delete dumpUnalFq_; } - if(dumpUnalFq_1_ != NULL) { dumpUnalFq_1_->close(); delete dumpUnalFq_1_; } - if(dumpUnalFq_2_ != NULL) { dumpUnalFq_2_->close(); delete dumpUnalFq_2_; } - if(dumpUnal_ != NULL) { dumpUnal_->close(); delete dumpUnal_; } - if(dumpUnal_1_ != NULL) { dumpUnal_1_->close(); delete dumpUnal_1_; } - if(dumpUnal_2_ != NULL) { dumpUnal_2_->close(); delete dumpUnal_2_; } - - if(dumpMaxFa_ != NULL) { dumpMaxFa_->close(); delete dumpMaxFa_; } - if(dumpMaxFa_1_ != NULL) { dumpMaxFa_1_->close(); delete dumpMaxFa_1_; } - if(dumpMaxFa_2_ != NULL) { dumpMaxFa_2_->close(); delete dumpMaxFa_2_; } - if(dumpMaxFq_ != NULL) { dumpMaxFq_->close(); delete dumpMaxFq_; } - if(dumpMaxFq_1_ != NULL) { dumpMaxFq_1_->close(); delete dumpMaxFq_1_; } - if(dumpMaxFq_2_ != NULL) { dumpMaxFq_2_->close(); delete dumpMaxFq_2_; } - if(dumpMax_ != NULL) { dumpMax_->close(); delete dumpMax_; } - if(dumpMax_1_ != NULL) { dumpMax_1_->close(); delete dumpMax_1_; } - if(dumpMax_2_ != NULL) { dumpMax_2_->close(); delete dumpMax_2_; } - } - - // Locks for dumping - MUTEX_T dumpAlignFaLock_; - MUTEX_T dumpAlignFaLockPE_; // _1 and _2 - MUTEX_T dumpAlignFqLock_; - MUTEX_T dumpAlignFqLockPE_; // _1 and _2 - MUTEX_T dumpAlignLock_; - MUTEX_T dumpAlignLockPE_; // _1 and _2 - MUTEX_T dumpUnalFaLock_; - MUTEX_T dumpUnalFaLockPE_; // _1 and _2 - MUTEX_T dumpUnalFqLock_; - MUTEX_T dumpUnalFqLockPE_; // _1 and _2 - MUTEX_T dumpUnalLock_; - MUTEX_T dumpUnalLockPE_; // _1 and _2 - MUTEX_T dumpMaxFaLock_; - MUTEX_T dumpMaxFaLockPE_; // _1 and _2 - MUTEX_T dumpMaxFqLock_; - MUTEX_T dumpMaxFqLockPE_; // _1 and _2 - MUTEX_T dumpMaxLock_; - MUTEX_T dumpMaxLockPE_; // _1 and _2 - - // false -> no dumping - bool dumpAlignFlag_; - bool dumpUnalignFlag_; - bool dumpMaxedFlag_; + if(mateType == 1) { + if(dotoff == string::npos) { + s += "_1"; s += suffix; + } else { + s = name.substr(0, dotoff) + "_1" + s.substr(dotoff); + } + } else if(mateType == 2) { + if(dotoff == string::npos) { + s += "_2"; s += suffix; + } else { + s = name.substr(0, dotoff) + "_2" + s.substr(dotoff); + } + } else if(mateType != 0) { + cerr << "Bad mate type " << mateType << endl; throw 1; + } + std::ofstream* tmp = new ofstream(s.c_str(), ios::out); + if(tmp->fail()) { + if(mateType == 0) { + cerr << "Could not open single-ended aligned/unaligned-read file for writing: " << name << endl; + } else { + cerr << "Could not open paired-end aligned/unaligned-read file for writing: " << name << endl; + } + throw 1; + } + return tmp; + } + + /** + * Initialize all the locks for dumping. + */ + void initDumps() { + dumpAlFa_ = dumpAlFa_1_ = dumpAlFa_2_ = NULL; + dumpAlFq_ = dumpAlFq_1_ = dumpAlFq_2_ = NULL; + dumpAl_ = dumpAl_1_ = dumpAl_2_ = NULL; + dumpUnalFa_ = dumpUnalFa_1_ = dumpUnalFa_2_ = NULL; + dumpUnalFq_ = dumpUnalFq_1_ = dumpUnalFq_2_ = NULL; + dumpUnal_ = dumpUnal_1_ = dumpUnal_2_ = NULL; + dumpMaxFa_ = dumpMaxFa_1_ = dumpMaxFa_2_ = NULL; + dumpMaxFq_ = dumpMaxFq_1_ = dumpMaxFq_2_ = NULL; + dumpMax_ = dumpMax_1_ = dumpMax_2_ = NULL; + dumpAlignFlag_ = !dumpAlFaBase_.empty() || + !dumpAlFqBase_.empty() || + !dumpAlBase_.empty(); + dumpUnalignFlag_ = !dumpUnalFaBase_.empty() || + !dumpUnalFqBase_.empty() || + !dumpUnalBase_.empty(); + dumpMaxedFlag_ = !dumpMaxFaBase_.empty() || + !dumpMaxFqBase_.empty() || + !dumpMaxBase_.empty(); + MUTEX_INIT(dumpAlignFaLock_); + MUTEX_INIT(dumpAlignFaLockPE_); + MUTEX_INIT(dumpAlignFqLock_); + MUTEX_INIT(dumpAlignFqLockPE_); + MUTEX_INIT(dumpAlignLock_); + MUTEX_INIT(dumpAlignLockPE_); + MUTEX_INIT(dumpUnalFaLock_); + MUTEX_INIT(dumpUnalFaLockPE_); + MUTEX_INIT(dumpUnalFqLock_); + MUTEX_INIT(dumpUnalFqLockPE_); + MUTEX_INIT(dumpUnalLock_); + MUTEX_INIT(dumpUnalLockPE_); + MUTEX_INIT(dumpMaxFaLock_); + MUTEX_INIT(dumpMaxFaLockPE_); + MUTEX_INIT(dumpMaxFqLock_); + MUTEX_INIT(dumpMaxFqLockPE_); + MUTEX_INIT(dumpMaxLock_); + MUTEX_INIT(dumpMaxLockPE_); + } + + void destroyDumps() { + if(dumpAlFa_ != NULL) { dumpAlFa_->close(); delete dumpAlFa_; } + if(dumpAlFa_1_ != NULL) { dumpAlFa_1_->close(); delete dumpAlFa_1_; } + if(dumpAlFa_2_ != NULL) { dumpAlFa_2_->close(); delete dumpAlFa_2_; } + if(dumpAlFq_ != NULL) { dumpAlFq_->close(); delete dumpAlFq_; } + if(dumpAlFq_1_ != NULL) { dumpAlFq_1_->close(); delete dumpAlFq_1_; } + if(dumpAlFq_2_ != NULL) { dumpAlFq_2_->close(); delete dumpAlFq_2_; } + if(dumpAl_ != NULL) { dumpAl_->close(); delete dumpAl_; } + if(dumpAl_1_ != NULL) { dumpAl_1_->close(); delete dumpAl_1_; } + if(dumpAl_2_ != NULL) { dumpAl_2_->close(); delete dumpAl_2_; } + + if(dumpUnalFa_ != NULL) { dumpUnalFa_->close(); delete dumpUnalFa_; } + if(dumpUnalFa_1_ != NULL) { dumpUnalFa_1_->close(); delete dumpUnalFa_1_; } + if(dumpUnalFa_2_ != NULL) { dumpUnalFa_2_->close(); delete dumpUnalFa_2_; } + if(dumpUnalFq_ != NULL) { dumpUnalFq_->close(); delete dumpUnalFq_; } + if(dumpUnalFq_1_ != NULL) { dumpUnalFq_1_->close(); delete dumpUnalFq_1_; } + if(dumpUnalFq_2_ != NULL) { dumpUnalFq_2_->close(); delete dumpUnalFq_2_; } + if(dumpUnal_ != NULL) { dumpUnal_->close(); delete dumpUnal_; } + if(dumpUnal_1_ != NULL) { dumpUnal_1_->close(); delete dumpUnal_1_; } + if(dumpUnal_2_ != NULL) { dumpUnal_2_->close(); delete dumpUnal_2_; } + + if(dumpMaxFa_ != NULL) { dumpMaxFa_->close(); delete dumpMaxFa_; } + if(dumpMaxFa_1_ != NULL) { dumpMaxFa_1_->close(); delete dumpMaxFa_1_; } + if(dumpMaxFa_2_ != NULL) { dumpMaxFa_2_->close(); delete dumpMaxFa_2_; } + if(dumpMaxFq_ != NULL) { dumpMaxFq_->close(); delete dumpMaxFq_; } + if(dumpMaxFq_1_ != NULL) { dumpMaxFq_1_->close(); delete dumpMaxFq_1_; } + if(dumpMaxFq_2_ != NULL) { dumpMaxFq_2_->close(); delete dumpMaxFq_2_; } + if(dumpMax_ != NULL) { dumpMax_->close(); delete dumpMax_; } + if(dumpMax_1_ != NULL) { dumpMax_1_->close(); delete dumpMax_1_; } + if(dumpMax_2_ != NULL) { dumpMax_2_->close(); delete dumpMax_2_; } + } + + // Locks for dumping + MUTEX_T dumpAlignFaLock_; + MUTEX_T dumpAlignFaLockPE_; // _1 and _2 + MUTEX_T dumpAlignFqLock_; + MUTEX_T dumpAlignFqLockPE_; // _1 and _2 + MUTEX_T dumpAlignLock_; + MUTEX_T dumpAlignLockPE_; // _1 and _2 + MUTEX_T dumpUnalFaLock_; + MUTEX_T dumpUnalFaLockPE_; // _1 and _2 + MUTEX_T dumpUnalFqLock_; + MUTEX_T dumpUnalFqLockPE_; // _1 and _2 + MUTEX_T dumpUnalLock_; + MUTEX_T dumpUnalLockPE_; // _1 and _2 + MUTEX_T dumpMaxFaLock_; + MUTEX_T dumpMaxFaLockPE_; // _1 and _2 + MUTEX_T dumpMaxFqLock_; + MUTEX_T dumpMaxFqLockPE_; // _1 and _2 + MUTEX_T dumpMaxLock_; + MUTEX_T dumpMaxLockPE_; // _1 and _2 + + // false -> no dumping + bool dumpAlignFlag_; + bool dumpUnalignFlag_; + bool dumpMaxedFlag_; volatile bool first_; /// true -> first hit hasn't yet been reported volatile uint64_t numAligned_; /// # reads with >= 1 alignment @@ -1541,9 +1497,9 @@ class NGoodHitSinkPerThreadFactory : public HitSinkPerThreadFactory { private: HitSink& sink_; - uint32_t n_; - uint32_t max_; - bool keep_; + uint32_t n_; + uint32_t max_; + bool keep_; }; @@ -1726,9 +1682,9 @@ class NBestHitSinkPerThreadFactory : public HitSinkPerThreadFactory { private: HitSink& sink_; - uint32_t n_; - uint32_t max_; - bool keep_; + uint32_t n_; + uint32_t max_; + bool keep_; }; /** @@ -1743,12 +1699,12 @@ class NBestStratHitSinkPerThread : public HitSinkPerThread { public: NBestStratHitSinkPerThread( HitSink& sink, - uint32_t __n, - uint32_t __max = 0xffffffff, - bool __keep = false) : - HitSinkPerThread(sink, __max, __keep), - _n(__n), - bestStratumReported_(999) + uint32_t __n, + uint32_t __max = 0xffffffff, + bool __keep = false) : + HitSinkPerThread(sink, __max, __keep), + _n(__n), + bestStratumReported_(999) { assert_gt(_n, 0); } @@ -1928,9 +1884,9 @@ class NBestStratHitSinkPerThreadFactory : public HitSinkPerThreadFactory { private: HitSink& sink_; - uint32_t n_; - uint32_t max_; - bool keep_; + uint32_t n_; + uint32_t max_; + bool keep_; }; /** @@ -2069,9 +2025,9 @@ class NBestFirstStratHitSinkPerThreadFactory : public HitSinkPerThreadFactory { private: HitSink& sink_; - uint32_t n_; - uint32_t max_; - bool keep_; + uint32_t n_; + uint32_t max_; + bool keep_; }; /** @@ -2081,18 +2037,18 @@ class ChainingHitSinkPerThread : public HitSinkPerThread { public: ChainingHitSinkPerThread(HitSink& sink, - uint32_t n, - uint32_t max, - bool keep, - bool strata, - uint32_t mult) : - HitSinkPerThread(sink, max * mult, keep), - n_(n * mult), mult_(mult), max_(max), strata_(strata), cutoff_(0xffff) - { + uint32_t n, + uint32_t max, + bool keep, + bool strata, + uint32_t mult) : + HitSinkPerThread(sink, max * mult, keep), + n_(n * mult), mult_(mult), max_(max), strata_(strata), cutoff_(0xffff) + { hs_ = NULL; hsISz_ = 0; assert_gt(n_, 0); - } + } virtual uint32_t maxHits() { return n_; } @@ -2293,11 +2249,11 @@ class ChainingHitSinkPerThread : public HitSinkPerThread { HitSet *hs_; size_t hsISz_; - uint32_t n_; + uint32_t n_; uint32_t mult_; - uint32_t max_; - bool strata_; /// true -> reporting is stratified - uint16_t cutoff_; /// the smallest irrelevant cost + uint32_t max_; + bool strata_; /// true -> reporting is stratified + uint16_t cutoff_; /// the smallest irrelevant cost }; /** @@ -2331,10 +2287,10 @@ class ChainingHitSinkPerThreadFactory : public HitSinkPerThreadFactory { private: HitSink& sink_; - uint32_t n_; - uint32_t max_; - bool keep_; - bool strata_; + uint32_t n_; + uint32_t max_; + bool keep_; + bool strata_; }; /** @@ -2415,8 +2371,8 @@ class AllHitSinkPerThreadFactory : public HitSinkPerThreadFactory { private: HitSink& sink_; - uint32_t max_; - bool keep_; + uint32_t max_; + bool keep_; }; /** @@ -2583,8 +2539,8 @@ class AllStratHitSinkPerThreadFactory : public HitSinkPerThreadFactory { private: HitSink& sink_; - uint32_t max_; - bool keep_; + uint32_t max_; + bool keep_; }; /** @@ -2628,7 +2584,7 @@ class ConciseHitSink : public HitSink { ss << '/' << (int)h.mate; } ss << (h.fw? "+" : "-") << ":"; - // .first is text id, .second is offset + // .first is text id, .second is offset ss << "<" << h.h.first << "," << (h.h.second + offBase) << "," << h.mms.count(); if(reportOpps) ss << "," << h.oms; ss << ">" << endl; @@ -3048,11 +3004,6 @@ class VerboseHitSink : public HitSink { } ss << s3; ss << "\t" << (h.fw? "+":"-"); - ss << "\t" << h.patSeq; - ss << "\t" << h.quals; - ss << "\t" << h.oms; - ss << "\t" << (int)h.mate; - ss << "\t" << h.patName; // end if(partition != 0) } else { assert(!dospill); @@ -3066,11 +3017,11 @@ class VerboseHitSink : public HitSink { ss << h.h.first; } ss << "\t" << (h.h.second + offBase); - ss << "\t" << h.patSeq; - ss << "\t" << h.quals; - ss << "\t" << h.oms; // end else clause of if(partition != 0) } + ss << '\t' << h.patSeq; + ss << '\t' << h.quals; + ss << '\t' << h.oms; ss << '\t'; // Look for SNP annotations falling within the alignment map snpAnnots; @@ -3117,6 +3068,11 @@ class VerboseHitSink : public HitSink { first = false; } } + if(partition != 0) { + if(first) ss << '-'; + ss << "\t" << (int)h.mate; + ss << "\t" << h.patName; + } ss << endl; } while(spill); } @@ -3437,8 +3393,8 @@ class BinaryHitSink : public HitSink { private: int offBase_; /// Add this to reference offsets before outputting. - /// (An easy way to make things 1-based instead of - /// 0-based) + /// (An easy way to make things 1-based instead of + /// 0-based) }; /** diff --git a/hit_set.cpp b/hit_set.cpp index 054f73a..a1da35b 100644 --- a/hit_set.cpp +++ b/hit_set.cpp @@ -26,7 +26,7 @@ void HitSet::reportUpTo(ostream& os, int khits) { if(!h.fw && seqan::empty(seqrc)) { // Lazily initialize seqrc and qualr seqrc = seq; - reverseComplementInPlace(seqrc, false); + reverseComplementInPlace(seqrc, color); assert_eq(seqan::length(seqrc), seqan::length(seq)); qualr = qual; reverseInPlace(qualr); diff --git a/map_tool.cpp b/map_tool.cpp index 4b299c8..c98df90 100644 --- a/map_tool.cpp +++ b/map_tool.cpp @@ -201,7 +201,7 @@ static void parseOptions(int argc, char **argv) { */ static void fastqAppend(ostream& out, Hit& h) { if(!h.fw) { - ::reverseComplementInPlace(h.patSeq); + ::reverseComplementInPlace(h.patSeq, h.color); ::reverseInPlace(h.quals); } out << "@" << h.patName << endl @@ -214,7 +214,7 @@ static void fastqAppend(ostream& out, Hit& h) { * Print the read involved in an alignment as a FASTA record. */ static void fastaAppend(ostream& out, Hit& h) { - if(!h.fw) ::reverseComplementInPlace(h.patSeq); + if(!h.fw) ::reverseComplementInPlace(h.patSeq, h.color); out << ">" << h.patName << endl << h.patSeq << endl; } diff --git a/pat.cpp b/pat.cpp index e57138a..8054ad0 100644 --- a/pat.cpp +++ b/pat.cpp @@ -30,7 +30,8 @@ void tooManyQualities(const String& read_name) { void tooManySeqChars(const String& read_name) { cerr << "Reads file contained a pattern with more than 1024 sequence characters." << endl - << "Please truncate reads and quality values and and re-run Bowtie" << endl; + << "Please truncate reads and quality values and and re-run Bowtie." << endl + << "Offending read: " << read_name << endl; throw 1; } diff --git a/pat.h b/pat.h index 0c994fe..f5ebc7a 100644 --- a/pat.h +++ b/pat.h @@ -108,6 +108,8 @@ struct ReadBuf { readOrigBufLen = 0; alts = 0; fuzzy = false; + color = false; + primer = '?'; RESET_BUF(patFw, patBufFw, Dna5); RESET_BUF(patRc, patBufRc, Dna5); RESET_BUF(qual, qualBuf, char); @@ -142,6 +144,8 @@ struct ReadBuf { seqan::clear(altQualRev[j]); } readOrigBufLen = 0; + color = fuzzy = false; + primer = '?'; } /// Return true iff the read (pair) is empty @@ -155,7 +159,9 @@ struct ReadBuf { } /** - * Construct patRc in place. + * Construct reverse complement of the pattern and the fuzzy + * alternative patters. If read is in colorspace, just reverse + * them. */ void constructRevComps() { uint32_t len = length(); @@ -164,11 +170,23 @@ struct ReadBuf { for(int j = 0; j < alts; j++) { RESET_BUF_LEN(altPatRc[j], altPatBufRc[j], len, Dna5); } - for(uint32_t i = 0; i < len; i++) { - // Reverse-complement the sequence - patBufRc[i] = (patBufFw[len-i-1] == 4) ? 4 : (patBufFw[len-i-1] ^ 3); - for(int j = 0; j < alts; j++) { - altPatBufRc[j][i] = (altPatBufFw[j][len-i-1] == 4) ? 4 : (altPatBufFw[j][len-i-1] ^ 3); + if(color) { + for(uint32_t i = 0; i < len; i++) { + // Reverse the sequence + patBufRc[i] = patBufFw[len-i-1]; + for(int j = 0; j < alts; j++) { + // Reverse the fuzzy alternatives + altPatBufRc[j][i] = altPatBufFw[j][len-i-1]; + } + } + } else { + for(uint32_t i = 0; i < len; i++) { + // Reverse-complement the sequence + patBufRc[i] = (patBufFw[len-i-1] == 4) ? 4 : (patBufFw[len-i-1] ^ 3); + for(int j = 0; j < alts; j++) { + // Reverse-complement the fuzzy alternatives + altPatBufRc[j][i] = (altPatBufFw[j][len-i-1] == 4) ? 4 : (altPatBufFw[j][len-i-1] ^ 3); + } } } } @@ -232,9 +250,20 @@ struct ReadBuf { } } + /** + * Dump basic information about this read to the given ostream. + */ void dump(std::ostream& os) const { - os << name << " " << patFw << " "; - // Print out the sequences + os << name << ' '; + if(color) { + for(size_t i = 0; i < seqan::length(patFw); i++) { + os << "0123."[(int)patFw[i]]; + } + } else { + os << patFw; + } + os << ' '; + // Print out the fuzzy alternative sequences for(int j = 0; j < 3; j++) { bool started = false; if(seqan::length(altQual[j]) > 0) { @@ -246,7 +275,11 @@ struct ReadBuf { if(altQual[j][i] == '!') { os << '-'; } else { - os << altPatFw[j][i]; + if(color) { + os << "0123."[(int)altPatFw[j][i]]; + } else { + os << altPatFw[j][i]; + } } } } @@ -254,7 +287,7 @@ struct ReadBuf { cout << " "; } os << qual << " "; - // Print out the quality strings + // Print out the fuzzy alternative quality strings for(int j = 0; j < 3; j++) { bool started = false; if(seqan::length(altQual[j]) > 0) { @@ -283,6 +316,7 @@ struct ReadBuf { hs.name = name; hs.seq = patFw; hs.qual = qual; + hs.color = color; } static const int BUF_SIZE = 1024; @@ -325,6 +359,8 @@ struct ReadBuf { uint32_t seed; // random seed int alts; // number of alternatives bool fuzzy; // whether to employ fuzziness + bool color; // whether read is in color space + char primer; // primer base, for csfasta files HitSet hitset; // holds previously-found hits; for chaining }; @@ -1285,6 +1321,11 @@ static inline int peekToEndOfLine(FileBuf& in) { } } +extern void wrongQualityFormat(const String& read_name); +extern void tooFewQualities(const String& read_name); +extern void tooManyQualities(const String& read_name); +extern void tooManySeqChars(const String& read_name); + /** * Encapsualtes a source of patterns which is an in-memory vector. */ @@ -1439,7 +1480,6 @@ class BufferedFilePatternSource : public TrimmingPatternSource { BufferedFilePatternSource(const vector& infiles, bool randomizeQuals = false, bool useSpinlock = true, - bool __forgiveInput = false, const char *dumpfile = NULL, bool verbose = false, int trim3 = 0, @@ -1450,7 +1490,6 @@ class BufferedFilePatternSource : public TrimmingPatternSource { infiles_(infiles), filecur_(0), filebuf_(), - forgiveInput_(__forgiveInput), skip_(skip), first_(true) { @@ -1487,7 +1526,7 @@ class BufferedFilePatternSource : public TrimmingPatternSource { assert(seqan::empty(r.patFw)); return; } - if(first_ && seqan::empty(r.patFw) && !forgiveInput_) { + if(first_ && seqan::empty(r.patFw) /* && !quiet_ */) { // No reads could be extracted from the first _infile cerr << "Warning: Could not find any reads in \"" << infiles_[0] << "\"" << endl; } @@ -1500,7 +1539,7 @@ class BufferedFilePatternSource : public TrimmingPatternSource { read(r, patid); } while((seqan::empty(r.patFw) && !filebuf_.eof())); assert_geq(patid, skip_); - if(seqan::empty(r.patFw) && !forgiveInput_) { + if(seqan::empty(r.patFw) /*&& !quiet_ */) { // No reads could be extracted from this _infile cerr << "Warning: Could not find any reads in \"" << infiles_[filecur_] << "\"" << endl; } @@ -1533,7 +1572,7 @@ class BufferedFilePatternSource : public TrimmingPatternSource { assert(seqan::empty(ra.patFw)); return; } - if(first_ && seqan::empty(ra.patFw) && !forgiveInput_) { + if(first_ && seqan::empty(ra.patFw) /*&& !quiet_*/) { // No reads could be extracted from the first _infile cerr << "Warning: Could not find any read pairs in \"" << infiles_[0] << "\"" << endl; } @@ -1546,7 +1585,7 @@ class BufferedFilePatternSource : public TrimmingPatternSource { readPair(ra, rb, patid); } while((seqan::empty(ra.patFw) && !filebuf_.eof())); assert_geq(patid, skip_); - if(seqan::empty(ra.patFw) && !forgiveInput_) { + if(seqan::empty(ra.patFw) /*&& !quiet_*/) { // No reads could be extracted from this _infile cerr << "Warning: Could not find any reads in \"" << infiles_[filecur_] << "\"" << endl; } @@ -1601,29 +1640,28 @@ class BufferedFilePatternSource : public TrimmingPatternSource { vector errs_; /// whether we've already printed an error for each file size_t filecur_; /// index into infiles_ of next file to read FileBuf filebuf_; /// read file currently being read from - bool forgiveInput_; /// try hard to parse input even if it's malformed uint32_t skip_; /// number of reads to skip bool first_; }; /** - * Synchronized concrete pattern source for a list of FASTA files. + * Synchronized concrete pattern source for a list of FASTA or CSFASTA + * (if color = true) files. */ class FastaPatternSource : public BufferedFilePatternSource { public: FastaPatternSource(const vector& infiles, - bool randomizeQuals = false, + bool color, + bool randomizeQuals, bool useSpinlock = true, const char *dumpfile = NULL, bool verbose = false, int trim3 = 0, int trim5 = 0, - bool __forgiveInput = false, uint32_t skip = 0) : BufferedFilePatternSource(infiles, randomizeQuals, useSpinlock, - __forgiveInput, dumpfile, verbose, - trim3, trim5, skip), - first_(true) + dumpfile, verbose, trim3, trim5, skip), + first_(true), color_(color) { } virtual void reset() { first_ = true; @@ -1641,6 +1679,13 @@ class FastaPatternSource : public BufferedFilePatternSource { } return c; } + + /// Called when we have to bail without having parsed a read. + void bail(ReadBuf& r) { + r.clearAll(); + filebuf_.resetLastN(); + } + /// Read another pattern from a FASTA input file virtual void read(ReadBuf& r, uint32_t& patid) { int c; @@ -1648,28 +1693,12 @@ class FastaPatternSource : public BufferedFilePatternSource { int nameLen = 0; // Pick off the first carat c = filebuf_.get(); - if(c == -1) { - r.clearAll(); - filebuf_.resetLastN(); - return; - } + if(c < 0) { bail(r); return; } assert_eq('>', c); if(first_) { if(c != '>') { - if(forgiveInput_) { - c = FastaPatternSource::skipToNextFastaRecord(filebuf_); - if(c < 0) { - r.clearAll(); - filebuf_.resetLastN(); - return; - } - } else { - c = getOverNewline(filebuf_); if(c < 0) { - r.clearAll(); - filebuf_.resetLastN(); - return; - } - } + c = getOverNewline(filebuf_); + if(c < 0) { bail(r); return; } } if(c != '>') { cerr << "Error: reads file does not look like a FASTA file" << endl; @@ -1682,19 +1711,13 @@ class FastaPatternSource : public BufferedFilePatternSource { // Read to the end of the id line, sticking everything after the '>' // into *name while(true) { - c = filebuf_.get(); if(c < 0) { - r.clearAll(); - filebuf_.resetLastN(); - return; - } + c = filebuf_.get(); + if(c < 0) { bail(r); return; } if(c == '\n' || c == '\r') { // Break at end of line, after consuming all \r's, \n's while(c == '\n' || c == '\r') { - c = filebuf_.get(); if(c < 0) { - r.clearAll(); - filebuf_.resetLastN(); - return; - } + c = filebuf_.get(); + if(c < 0) { bail(r); return; } } break; } @@ -1706,19 +1729,28 @@ class FastaPatternSource : public BufferedFilePatternSource { // _in now points just past the first character of a sequence // line, and c holds the first character int begin = 0; + if(color_) { + // This is the primer character, keep it in the + // 'primer' field of the read buf and keep parsing + c = toupper(c); + if(c != 'A' && c != 'C' && c != 'G' && c != 'T') { + cerr << "Warning: Primer character for read " << r.name << " is not DNA: " << c << endl; + } + r.primer = c; + r.color = true; + c = filebuf_.get(); + if(c < 0) { bail(r); return; } + } while(c != '>') { - if(isalpha(c) && begin++ >= this->trim5_) { - if(dstLen + 1 > 1024) { - cerr << "Input file contained a pattern more than 1024 characters long. Please truncate" << endl - << "reads and re-run Bowtie" << endl; - throw 1; - } - r.patBufFw [dstLen] = charToDna5[c]; - r.qualBuf[dstLen] = 'I'; + bool ischar = (color_ ? isColor(c) : isDna(c)); + if(ischar && begin++ >= this->trim5_) { + if(dstLen + 1 > 1024) tooManySeqChars(r.name); + r.patBufFw[dstLen] = (color_ ? asc2col[c] : charToDna5[c]); + r.qualBuf[dstLen] = 'I'; dstLen++; } c = filebuf_.get(); - if(c == '\r' || c == '\n' || c == -1) { + if(c == '\r' || c == '\n') { // Either we hit EOL or EOF; time to copy the buffered // read data into the 'orig' buffer. c = peekOverNewline(filebuf_); @@ -1732,7 +1764,6 @@ class FastaPatternSource : public BufferedFilePatternSource { _setLength(r.patFw, dstLen); _setBegin (r.qual, r.qualBuf); _setLength(r.qual, dstLen); - // Set up a default name if one hasn't been set if(nameLen == 0) { itoa10(readCnt_, r.nameBuf); @@ -1765,6 +1796,7 @@ class FastaPatternSource : public BufferedFilePatternSource { } private: bool first_; + bool color_; }; @@ -1779,11 +1811,6 @@ static inline bool tokenizeQualLine(FileBuf& filebuf, char *buf, size_t buflen, return true; } -extern void wrongQualityFormat(const String& read_name); -extern void tooFewQualities(const String& read_name); -extern void tooManyQualities(const String& read_name); -extern void tooManySeqChars(const String& read_name); - /** * Synchronized concrete pattern source for a list of files with tab- * delimited name, seq, qual fields (or, for paired-end reads, @@ -1798,13 +1825,12 @@ class TabbedPatternSource : public BufferedFilePatternSource { bool verbose = false, int trim3 = 0, int trim5 = 0, - bool forgiveInput = false, bool solQuals = false, bool phred64Quals = false, bool intQuals = false, uint32_t skip = 0) : BufferedFilePatternSource(infiles, randomizeQuals, useSpinlock, - forgiveInput, dumpfile, verbose, + dumpfile, verbose, trim3, trim5, skip), solQuals_(solQuals), phred64Quals_(phred64Quals), @@ -1926,9 +1952,7 @@ class TabbedPatternSource : public BufferedFilePatternSource { } assert_eq('\n', ct); if(filebuf_.peek() == '\n') { - if(!forgiveInput_) { - assert(false); - } + assert(false); } peekOverNewline(filebuf_); ra.readOrigBufLen = filebuf_.copyLastN(ra.readOrigBuf); @@ -2017,11 +2041,7 @@ class TabbedPatternSource : public BufferedFilePatternSource { if(isalpha(c)) { if(begin++ >= this->trim5_) { if(dna4Cat[c] == 0) { - if(forgiveInput_) { - return -1; - } else { - assert(false); - } + assert(false); } if(dstLen + 1 > 1024) { cerr << "Input file contained a pattern more than 1024 characters long. Please truncate" << endl @@ -2095,11 +2115,7 @@ class TabbedPatternSource : public BufferedFilePatternSource { } } if(qualsRead != dstLen + this->trim5_) { - if(forgiveInput_) { - return -1; - } else { - assert(false); - } + assert(false); } } _setBegin (r.qual, (char*)r.qualBuf); @@ -2132,7 +2148,7 @@ class FastaContinuousPatternSource : public BufferedFilePatternSource { uint32_t skip = 0, uint32_t seed = 0) : BufferedFilePatternSource(infiles, false, useSpinlock, - false, dumpfile, verbose, 0, 0, skip), + dumpfile, verbose, 0, 0, skip), length_(length), freq_(freq), eat_(length_-1), beginning_(true), nameChars_(0), bufCur_(0), subReadCnt_(0llu) @@ -2266,26 +2282,27 @@ class FastaContinuousPatternSource : public BufferedFilePatternSource { class FastqPatternSource : public BufferedFilePatternSource { public: FastqPatternSource(const vector& infiles, + bool color, bool randomizeQuals = false, bool useSpinlock = true, const char *dumpfile = NULL, bool verbose = false, int trim3 = 0, int trim5 = 0, - bool __forgiveInput = false, bool solexa_quals = false, bool phred64Quals = false, bool integer_quals = false, bool fuzzy = false, uint32_t skip = 0) : BufferedFilePatternSource(infiles, randomizeQuals, useSpinlock, - __forgiveInput, dumpfile, verbose, + dumpfile, verbose, trim3, trim5, skip), first_(true), solQuals_(solexa_quals), phred64Quals_(phred64Quals), intQuals_(integer_quals), - fuzzy_(fuzzy) + fuzzy_(fuzzy), + color_(color) { } virtual void reset() { first_ = true; @@ -2344,26 +2361,14 @@ class FastqPatternSource : public BufferedFilePatternSource { int dstLen = 0; int nameLen = 0; r.fuzzy = fuzzy_; + r.color = color_; r.alts = 0; // Pick off the first at if(first_) { c = filebuf_.get(); if(c != '@') { - if(forgiveInput_) { - c = FastqPatternSource::skipToNextFastqRecord(filebuf_, c == '+'); - if(c < 0) { - filebuf_.resetLastN(); - seqan::clear(r.patFw); - return; - } - } else { - c = getOverNewline(filebuf_); - if(c < 0) { - filebuf_.resetLastN(); - seqan::clear(r.patFw); - return; - } - } + c = getOverNewline(filebuf_); + if(c < 0) { bail(r); return; } } if(c != '@') { cerr << "Error: reads file does not look like a FASTQ file" << endl; @@ -2377,20 +2382,12 @@ class FastqPatternSource : public BufferedFilePatternSource { // into *name while(true) { c = filebuf_.get(); - if(c < 0) { - seqan::clear(r.patFw); - filebuf_.resetLastN(); - return; - } + if(c < 0) { bail(r); return; } if(c == '\n' || c == '\r') { // Break at end of line, after consuming all \r's, \n's while(c == '\n' || c == '\r') { c = filebuf_.get(); - if(c < 0) { - seqan::clear(r.patFw); - filebuf_.resetLastN(); - return; - } + if(c < 0) { bail(r); return; } } break; } @@ -2419,6 +2416,11 @@ class FastqPatternSource : public BufferedFilePatternSource { int trim5 = this->trim5_; int altBufIdx = 0; while(c != '+') { + // Convert color numbers to letters if necessary + if(color_) { + if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0']; + if(c == '.') c = 'N'; + } if(fuzzy_ && c == '-') c = 'A'; if(isalpha(c)) { // If it's past the 5'-end trim point @@ -2432,7 +2434,7 @@ class FastqPatternSource : public BufferedFilePatternSource { if(charsRead == 0) { c = filebuf_.get(); continue; - } + } charsRead = 0; if(altBufIdx >= 3) { cerr << "At most 3 alternate sequence strings permitted; offending read: " << r.name << endl; @@ -2443,12 +2445,7 @@ class FastqPatternSource : public BufferedFilePatternSource { dstLenCur = &dstLens[altBufIdx]; } c = filebuf_.get(); - if(c < 0) { - // EOF occurred in the middle of a read - abort - seqan::clear(r.patFw); - filebuf_.resetLastN(); - return; - } + if(c < 0) { bail(r); return; } } // Trim from 3' end dstLen = dstLens[0]; @@ -2507,12 +2504,7 @@ class FastqPatternSource : public BufferedFilePatternSource { qualsReadCur = &qualsRead[altBufIdx]; continue; } - if(c < 0) { - // EOF occurred in the middle of a read - abort - seqan::clear(r.patFw); - filebuf_.resetLastN(); - return; - } + if(c < 0) { bail(r); return; } if (c != '\r' && c != '\n') { if (*qualsReadCur >= trim5) { size_t off = (*qualsReadCur) - trim5; @@ -2614,11 +2606,23 @@ class FastqPatternSource : public BufferedFilePatternSource { out << "@" << name << endl << seq << endl << "+" << endl << qual << endl; } private: + + /** + * Do things we need to do if we have to bail in the middle of a + * read, usually because we reached the end of the input without + * finishing. + */ + void bail(ReadBuf& r) { + seqan::clear(r.patFw); + filebuf_.resetLastN(); + } + bool first_; bool solQuals_; bool phred64Quals_; bool intQuals_; bool fuzzy_; + bool color_; }; /** @@ -2629,6 +2633,7 @@ class FastqPatternSource : public BufferedFilePatternSource { class RawPatternSource : public BufferedFilePatternSource { public: RawPatternSource(const vector& infiles, + bool color, bool randomizeQuals = false, bool useSpinlock = true, const char *dumpfile = NULL, @@ -2636,9 +2641,9 @@ class RawPatternSource : public BufferedFilePatternSource { int trim3 = 0, int trim5 = 0, uint32_t skip = 0) : - BufferedFilePatternSource(infiles, randomizeQuals, false, useSpinlock, + BufferedFilePatternSource(infiles, randomizeQuals, useSpinlock, dumpfile, verbose, trim3, trim5, skip), - first_(true) + first_(true), color_(color) { } virtual void reset() { first_ = true; @@ -2657,6 +2662,10 @@ class RawPatternSource : public BufferedFilePatternSource { } assert(!isspace(c)); if(first_) { + if(color_) { + if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0']; + if(c == '.') c = 'N'; + } if(dna4Cat[c] == 0) { cerr << "Error: reads file does not look like a Raw file" << endl; if(c == '>') { @@ -2674,6 +2683,10 @@ class RawPatternSource : public BufferedFilePatternSource { // line, and c holds the first character while(!isspace(c) && c >= 0) { c = filebuf_.get(); + if(color_) { + if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0']; + if(c == '.') c = 'N'; + } if(isalpha(c) && dstLen >= this->trim5_) { size_t len = dstLen - this->trim5_; if(len >= 1024) tooManyQualities(String("(no name)")); @@ -2727,6 +2740,7 @@ class RawPatternSource : public BufferedFilePatternSource { } private: bool first_; + bool color_; }; /** @@ -2742,7 +2756,7 @@ class ChainPatternSource : public BufferedFilePatternSource { bool verbose, uint32_t skip) : BufferedFilePatternSource( - infiles, false, false, useSpinlock, dumpfile, verbose, 0, 0, skip) { } + infiles, false, useSpinlock, dumpfile, verbose, 0, 0, skip) { } protected: diff --git a/range_source.h b/range_source.h index 7516820..1317e60 100644 --- a/range_source.h +++ b/range_source.h @@ -20,9 +20,7 @@ enum AdvanceUntil { }; /** - * Expandable list of Edits. One can: - * - Add an Edit, which might trigger an expansion - * - + * List of Edits that automatically expands as edits are added. */ struct EditList { @@ -32,7 +30,7 @@ struct EditList { * Add an edit to the edit list. */ bool add(const Edit& e, AllocOnlyPool& pool, size_t qlen) { - assert_lt(sz_, qlen+3); + assert_lt(sz_, qlen + 10); if(sz_ < numEdits) { assert(moreEdits_ == NULL); assert(yetMoreEdits_ == NULL); @@ -55,7 +53,7 @@ struct EditList { } else if(sz_ == (numEdits + numMoreEdits)) { assert(moreEdits_ != NULL); assert(yetMoreEdits_ == NULL); - yetMoreEdits_ = pool.alloc(qlen+3 - numMoreEdits - numEdits); + yetMoreEdits_ = pool.alloc(qlen + 10 - numMoreEdits - numEdits); if(yetMoreEdits_ == NULL) { return false; } @@ -99,9 +97,7 @@ struct EditList { /** * Return true iff no Edits have been added. */ - bool empty() const { - return size() == 0; - } + bool empty() const { return size() == 0; } /** * Set a particular element of the EditList. @@ -132,22 +128,20 @@ struct EditList { /** * Return number of Edits in the List. */ - size_t size() const { - return sz_; - } + size_t size() const { return sz_; } /** - * + * Free all the heap-allocated edit lists */ void free(AllocOnlyPool& epool, size_t qlen) { if(yetMoreEdits_ != NULL) - epool.free(yetMoreEdits_, qlen+3 - numMoreEdits - numEdits); + epool.free(yetMoreEdits_, qlen + 10 - numMoreEdits - numEdits); if(moreEdits_ != NULL) epool.free(moreEdits_, numMoreEdits); } - const static size_t numEdits = 6; - const static size_t numMoreEdits = 16; + const static size_t numEdits = 6; // part of object allocation + const static size_t numMoreEdits = 16; // first extra allocation size_t sz_; // number of Edits stored in the EditList Edit edits_[numEdits]; // first 4 edits; typically, no more are needed Edit *moreEdits_; // if used, size is dictated by numMoreEdits @@ -156,28 +150,32 @@ struct EditList { /** * Holds per-position information about what outgoing paths have been - * eliminated and what the quality value is at the position. + * eliminated and what the quality value(s) is (are) at the position. */ union ElimsAndQual { /** - * Assuming qualA/C/G/T are already set, set quallo and quallo2 - * accordingly. + * Assuming qual A/C/G/T are already set, set quallo and quallo2 + * to the additional cost incurred by the least and second-least + * costly paths. */ void updateLo() { flags.quallo = 127; flags.quallo2 = 127; if(!flags.mmA) { + // A mismatch to an A in the genome has not been ruled out if(flags.qualA < flags.quallo) { - flags.quallo2 = flags.quallo; + //flags.quallo2 = flags.quallo; flags.quallo = flags.qualA; - } else if(flags.qualA == flags.quallo) { - flags.quallo2 = flags.quallo; - } else if(flags.qualA < flags.quallo2) { - flags.quallo2 = flags.qualA; } + //else if(flags.qualA == flags.quallo) { + // flags.quallo2 = flags.quallo; + //} else if(flags.qualA < flags.quallo2) { + // flags.quallo2 = flags.qualA; + //} } if(!flags.mmC) { + // A mismatch to a C in the genome has not been ruled out if(flags.qualC < flags.quallo) { flags.quallo2 = flags.quallo; flags.quallo = flags.qualC; @@ -188,6 +186,7 @@ union ElimsAndQual { } } if(!flags.mmG) { + // A mismatch to a G in the genome has not been ruled out if(flags.qualG < flags.quallo) { flags.quallo2 = flags.quallo; flags.quallo = flags.qualG; @@ -198,6 +197,7 @@ union ElimsAndQual { } } if(!flags.mmT) { + // A mismatch to a T in the genome has not been ruled out if(flags.qualT < flags.quallo) { flags.quallo2 = flags.quallo; flags.quallo = flags.qualT; @@ -211,23 +211,31 @@ union ElimsAndQual { } /** - * Set all the non-qual bits of the flags field to 1, indicating + * Set all 13 elimination bits of the flags field to 1, indicating * that all outgoing paths are eliminated. */ inline void eliminate() { join.elims = ((1 << 13) - 1); } + /** + * Internal consistency check. Basically just checks that lo and + * lo2 are set correctly. + */ bool repOk() const { uint8_t lo = 127; uint8_t lo2 = 127; + assert_lt(flags.qualA, 127); + assert_lt(flags.qualC, 127); + assert_lt(flags.qualG, 127); + assert_lt(flags.qualT, 127); if(!flags.mmA) { if(flags.qualA < lo) { - lo2 = lo; lo = flags.qualA; - } else if(flags.qualA == lo || flags.qualA < lo2) { - lo2 = flags.qualA; } + //else if(flags.qualA == lo || flags.qualA < lo2) { + // lo2 = flags.qualA; + //} } if(!flags.mmC) { if(flags.qualC < lo) { @@ -309,7 +317,8 @@ struct RangeState { * now. */ Edit pickEdit(int pos, RandomSource& rand, bool fuzzy, - uint32_t& top, uint32_t& bot, bool& last) + uint32_t& top, uint32_t& bot, bool indels, + bool& last) { bool color = false; Edit ret; @@ -331,11 +340,24 @@ struct RangeState { last = false; // not the last at this pos // Sum up range sizes and do a random weighted pick uint32_t tot = 0; - bool candA = !eq.flags.mmA; - bool candC = !eq.flags.mmC; - bool candG = !eq.flags.mmG; - bool candT = !eq.flags.mmT; + bool candA = !eq.flags.mmA; bool candC = !eq.flags.mmC; + bool candG = !eq.flags.mmG; bool candT = !eq.flags.mmT; + bool candInsA = false, candInsC = false; + bool candInsG = false, candInsT = false; + bool candDel = false; + if(indels) { + // Insertions and deletions can only be candidates + // if the user asked for indels + candInsA = !eq.flags.insA; + candInsC = !eq.flags.insC; + candInsG = !eq.flags.insG; + candInsT = !eq.flags.insT; + candDel = !eq.flags.del; + } if(fuzzy) { + // To be a candidate in fuzzy mode, you have to both + // (a) not have been eliminated, and (b) be tied for + // lowest penalty. candA = (candA && eq.flags.qualA == eq.flags.quallo); candC = (candC && eq.flags.qualC == eq.flags.quallo); candG = (candG && eq.flags.qualG == eq.flags.quallo); @@ -366,6 +388,38 @@ struct RangeState { assert_gt(num, 0); ASSERT_ONLY(num--); } + if(indels) { + if(candInsA) { + assert_gt(bots[0], tops[0]); + tot += (bots[0] - tops[0]); + assert_gt(num, 0); + ASSERT_ONLY(num--); + } + if(candInsC) { + assert_gt(bots[1], tops[1]); + tot += (bots[1] - tops[1]); + assert_gt(num, 0); + ASSERT_ONLY(num--); + } + if(candInsG) { + assert_gt(bots[2], tops[2]); + tot += (bots[2] - tops[2]); + assert_gt(num, 0); + ASSERT_ONLY(num--); + } + if(candInsT) { + assert_gt(bots[3], tops[3]); + tot += (bots[3] - tops[3]); + assert_gt(num, 0); + ASSERT_ONLY(num--); + } + if(candDel) { + // Always a candidate? + // Always a candidate just within the window? + // We can eliminate some indels based on the content downstream, but not many + } + } + assert_geq(num, 0); assert_lt(num, origNum); // Throw a dart randomly that hits one of the possible @@ -707,7 +761,8 @@ class Branch { // the last remaining one at this position, 'last' is set to // true. uint32_t top = 0, bot = 0; - Edit e = ranges_[pos].pickEdit(pos + rdepth_, rand, fuzzy, top, bot, last); + Edit e = ranges_[pos].pickEdit(pos + rdepth_, rand, fuzzy, top, + bot, false, last); assert_gt(bot, top); // Create and initialize a new Branch uint16_t newRdepth = rdepth_ + pos + 1; diff --git a/ref_aligner.h b/ref_aligner.h index 2161e3e..48396bc 100644 --- a/ref_aligner.h +++ b/ref_aligner.h @@ -10,6 +10,7 @@ #include #include #include "seqan/sequence.h" +#include "alphabet.h" #include "range.h" #include "reference.h" @@ -35,13 +36,14 @@ class RefAligner { typedef std::set TSetPairs; public: - RefAligner(bool verbose = false, + RefAligner(bool color, + bool verbose = false, bool quiet = false, uint32_t seedLen = 0, uint32_t qualMax = 0xffffffff, bool maqPenalty = false) : - verbose_(verbose), seedLen_(seedLen), qualMax_(qualMax), - maqPenalty_(maqPenalty), refbuf_(buf_), + color_(color), verbose_(verbose), seedLen_(seedLen), + qualMax_(qualMax), maqPenalty_(maqPenalty), refbuf_(buf_), refbufSz_(REF_ALIGNER_BUFSZ), freeRefbuf_(false) { } @@ -74,7 +76,7 @@ class RefAligner { { assert_gt(numToFind, 0); assert_gt(end, begin); - uint32_t spread = end - begin; + uint32_t spread = end - begin + (color_ ? 1 : 0); uint32_t spreadPlus = spread + 12; // Make sure the buffer is large enough to accommodate the spread if(spreadPlus > this->refbufSz_) { @@ -83,6 +85,13 @@ class RefAligner { // Read in the relevant stretch of the reference string int offset = refs->getStretch(this->refbuf_, tidx, begin, spread); uint8_t *buf = ((uint8_t*)this->refbuf_) + offset; + if(color_) { + // Colorize buffer + for(size_t i = 0; i < (end-begin); i++) { + assert_lt(buf[i], 4); + buf[i] = dinuc2color[(int)buf[i]][(int)buf[i+1]]; + } + } // Look for alignments ASSERT_ONLY(uint32_t irsz = ranges.size()); anchor64Find(numToFind, tidx, buf, qry, quals, begin, @@ -145,6 +154,7 @@ class RefAligner { } protected: + bool color_; /// whether to colorize reference buffers before handing off bool verbose_; /// be talkative uint32_t seedLen_; /// length of seed region for read uint32_t qualMax_; /// maximum sum of quality penalties @@ -171,8 +181,8 @@ class ExactRefAligner : public RefAligner { public: - ExactRefAligner(bool verbose, bool quiet) : - RefAligner(verbose, quiet) { } + ExactRefAligner(bool color, bool verbose, bool quiet) : + RefAligner(color, verbose, quiet) { } virtual ~ExactRefAligner() { } @@ -505,8 +515,8 @@ class OneMMRefAligner : public RefAligner { public: - OneMMRefAligner(bool verbose, bool quiet) : - RefAligner(verbose, quiet) { } + OneMMRefAligner(bool color, bool verbose, bool quiet) : + RefAligner(color, verbose, quiet) { } virtual ~OneMMRefAligner() { } @@ -909,8 +919,8 @@ class TwoMMRefAligner : public RefAligner { public: - TwoMMRefAligner(bool verbose, bool quiet) : - RefAligner(verbose, quiet) { } + TwoMMRefAligner(bool color, bool verbose, bool quiet) : + RefAligner(color, verbose, quiet) { } virtual ~TwoMMRefAligner() { } @@ -1387,8 +1397,8 @@ class ThreeMMRefAligner : public RefAligner { public: - ThreeMMRefAligner(bool verbose, bool quiet) : - RefAligner(verbose, quiet) { } + ThreeMMRefAligner(bool color, bool verbose, bool quiet) : + RefAligner(color, verbose, quiet) { } virtual ~ThreeMMRefAligner() { } @@ -1940,8 +1950,8 @@ class Seed0RefAligner : public RefAligner { public: - Seed0RefAligner(bool verbose, bool quiet, uint32_t seedLen, uint32_t qualMax, bool maqPenalty) : - RefAligner(verbose, quiet, seedLen, qualMax, maqPenalty) { } + Seed0RefAligner(bool color, bool verbose, bool quiet, uint32_t seedLen, uint32_t qualMax, bool maqPenalty) : + RefAligner(color, verbose, quiet, seedLen, qualMax, maqPenalty) { } virtual ~Seed0RefAligner() { } @@ -2543,8 +2553,8 @@ class Seed1RefAligner : public RefAligner { public: - Seed1RefAligner(bool verbose, bool quiet, uint32_t seedLen, uint32_t qualMax, bool maqPenalty) : - RefAligner(verbose, quiet, seedLen, qualMax, maqPenalty) { } + Seed1RefAligner(bool color, bool verbose, bool quiet, uint32_t seedLen, uint32_t qualMax, bool maqPenalty) : + RefAligner(color, verbose, quiet, seedLen, qualMax, maqPenalty) { } virtual ~Seed1RefAligner() { } @@ -3282,8 +3292,8 @@ class Seed2RefAligner : public RefAligner { public: - Seed2RefAligner(bool verbose, bool quiet, uint32_t seedLen, uint32_t qualMax, bool maqPenalty) : - RefAligner(verbose, quiet, seedLen, qualMax, maqPenalty) { } + Seed2RefAligner(bool color, bool verbose, bool quiet, uint32_t seedLen, uint32_t qualMax, bool maqPenalty) : + RefAligner(color, verbose, quiet, seedLen, qualMax, maqPenalty) { } virtual ~Seed2RefAligner() { } @@ -4142,8 +4152,8 @@ class Seed3RefAligner : public RefAligner { public: - Seed3RefAligner(bool verbose, bool quiet, uint32_t seedLen, uint32_t qualMax, bool maqPenalty) : - RefAligner(verbose, quiet, seedLen, qualMax, maqPenalty) { } + Seed3RefAligner(bool color, bool verbose, bool quiet, uint32_t seedLen, uint32_t qualMax, bool maqPenalty) : + RefAligner(color, verbose, quiet, seedLen, qualMax, maqPenalty) { } virtual ~Seed3RefAligner() { } diff --git a/ref_read.cpp b/ref_read.cpp index 57c9eae..086db63 100644 --- a/ref_read.cpp +++ b/ref_read.cpp @@ -1,132 +1,165 @@ #include "ref_read.h" /** - * Reads past the next sequence from the given FASTA file and returns - * the size of the passed sequence. Does not do anything with the - * sequence characters themselves; this is purely for counting lengths. + * Reads past the next ambiguous or unambiguous stretch of sequence + * from the given FASTA file and returns its length. Does not do + * anything with the sequence characters themselves; this is purely for + * measuring lengths. */ RefRecord fastaRefReadSize(FileBuf& in, - const RefReadInParams& refparams, + const RefReadInParams& rparms, bool first, BitpairOutFileBuf* bpout) { int c; - static int lastc = '>'; - assert_neq(refparams.baseCutoff, 0); - assert_neq(refparams.numSeqCutoff, 0); + static int lastc = '>'; // last character seen + assert_neq(rparms.baseCutoff, 0); // should have stopped + assert_neq(rparms.numSeqCutoff, 0); // should have stopped // RefRecord params - size_t seqCharsRead = 0; - size_t seqOff = 0; - bool seqFirst = true; + size_t len = 0; // 'len' counts toward total length + // 'off' counts number of ambiguous characters before first + // unambiguous character + size_t off = 0; // Pick off the first carat and any preceding whitespace if(first) { assert(!in.eof()); lastc = '>'; - c = skipWhitespace(in); + c = in.getPastWhitespace(); if(in.eof()) { // Got eof right away; emit warning cerr << "Warning: Empty input file" << endl; lastc = -1; - return RefRecord(seqOff, seqCharsRead, seqFirst); + return RefRecord(0, 0, true); } + // TODO: having # could yield spurious warnings (below) assert(c == '>' || c == '#'); } + + first = true; // Skip to the end of the id line; if the next line is either // another id line or a comment line, keep skipping if(lastc == '>') { // Skip to the end of the name line do { - if((c = skipLine(in)) == -1) { + if((c = in.getPastNewline()) == -1) { + // No more input cerr << "Warning: Encountered empty reference sequence" << endl; lastc = -1; - return RefRecord(seqOff, seqCharsRead, seqFirst); + return RefRecord(0, 0, true); } if(c == '>') { cerr << "Warning: Encountered empty reference sequence" << endl; } + // continue until a non-name, non-comment line } while (c == '>' || c == '#'); } else { - // Skip until we arrive at a '>', in which case let lastc = '>' - // and return, or until we arrive at a DNA char - seqFirst = false; - seqOff = 1; // The gap has already been consumed, so count it - c = in.get(); // Get next char - if(c == -1) { + first = false; // not the first in a sequence + off = 1; // The gap has already been consumed, so count it + if((c = in.get()) == -1) { // Don't emit a warning, since this might legitimately be // a gap on the end of the final sequence in the file lastc = -1; - return RefRecord(seqOff, seqCharsRead, seqFirst); + return RefRecord(off, len, first); } } // Now skip to the first DNA character, counting gap characters // as we go + int lc = -1; // last-DNA char variable for color conversion while(true) { - if(refparams.nsToAs && dna4Cat[c] == 2) { + int cat = dna4Cat[c]; + if(rparms.nsToAs && cat == 2) { // Turn this 'N' (or other ambiguous char) into an 'A' c = 'A'; } - int cat = dna4Cat[c]; - ASSERT_ONLY(int cc = toupper(c)); if(cat == 1) { // This is a DNA character - assert(cc == 'A' || cc == 'C' || cc == 'G' || cc == 'T'); - break; // to read-in loop + if(rparms.color) { + if(lc != -1) { + // Got two consecutive unambiguous DNAs + break; // to read-in loop + } + // Keep going; we need two consecutive unambiguous DNAs + lc = charToDna5[(int)c]; + // The 'if(off > 0)' takes care of the case where + // the reference is entirely unambiguous and we don't + // want to incorrectly increment off. + if(off > 0) off++; + } else { + break; // to read-in loop + } } else if(cat == 2) { - assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T'); - seqOff++; // skip over gap character and increment + lc = -1; + off++; // skip over gap character and increment } else if(c == '>') { - if(seqOff > 0 && lastc == '>') { + if(off > 0 && lastc == '>') { cerr << "Warning: Encountered reference sequence with only gaps" << endl; } else if(lastc == '>') { cerr << "Warning: Encountered empty reference sequence" << endl; } lastc = '>'; - return RefRecord(seqOff, seqCharsRead, seqFirst); + return RefRecord(off, len, first); } c = in.get(); if(c == -1) { // End-of-file - if(seqOff > 0 && lastc == '>') { + if(off > 0 && lastc == '>') { cerr << "Warning: Encountered reference sequence with only gaps" << endl; } else if(lastc == '>') { cerr << "Warning: Encountered empty reference sequence" << endl; } lastc = -1; - return RefRecord(seqOff, seqCharsRead, seqFirst); + return RefRecord(off, len, first); } } - assert_eq(1, dna4Cat[c]); + assert(!rparms.color || (lc != -1)); + assert_eq(1, dna4Cat[c]); // C must be unambiguous base + if(off > 0 && rparms.color && first) { + // Handle the case where the first record has ambiguous + // characters but we're in color space; one of those counts is + // spurious + off--; + } // in now points just past the first character of a sequence // line, and c holds the first character while(c != -1 && c != '>') { - if(refparams.nsToAs && dna4Cat[c] == 2) { + if(rparms.nsToAs && dna4Cat[c] == 2) { // Turn this 'N' (or other ambiguous char) into an 'A' c = 'A'; } uint8_t cat = dna4Cat[c]; - ASSERT_ONLY(int cc = toupper(c)); + int cc = toupper(c); + if(rparms.bisulfite && cc == 'C') c = cc = 'T'; if(cat == 1) { // It's a DNA character assert(cc == 'A' || cc == 'C' || cc == 'G' || cc == 'T'); // Consume it - seqCharsRead++; + len++; // Output it - if(bpout) bpout->write(charToDna5[c]); - if(refparams.baseCutoff != -1 && - (int64_t)seqCharsRead >= refparams.baseCutoff) + if(bpout != NULL) { + if(rparms.color) { + // output color + bpout->write(dinuc2color[charToDna5[(int)c]][lc]); + } else if(!rparms.color) { + // output nucleotide + bpout->write(charToDna5[c]); + } + } + if(rparms.baseCutoff != -1 && + (int64_t)len >= rparms.baseCutoff) { lastc = -1; - return RefRecord(seqOff, seqCharsRead, seqFirst); + return RefRecord(off, len, first); } + lc = charToDna5[(int)c]; } else if(cat == 2) { // It's an N or a gap lastc = c; assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T'); - return RefRecord(seqOff, seqCharsRead, seqFirst); + return RefRecord(off, len, first); } else { // Not DNA and not a gap, ignore it #ifndef NDEBUG @@ -143,7 +176,7 @@ RefRecord fastaRefReadSize(FileBuf& in, c = in.get(); } lastc = c; - return RefRecord(seqOff, seqCharsRead, seqFirst); + return RefRecord(off, len, first); } /** @@ -151,13 +184,15 @@ RefRecord fastaRefReadSize(FileBuf& in, * all of the given input files, in order. Returns the total size of * all references combined. Rewinds each istream before returning. */ -size_t fastaRefReadSizes(vector& in, - vector& recs, - const RefReadInParams& refparams, - BitpairOutFileBuf* bpout) +std::pair +fastaRefReadSizes(vector& in, + vector& recs, + const RefReadInParams& rparms, + BitpairOutFileBuf* bpout) { - uint32_t tot = 0; - RefReadInParams rpcp = refparams; + uint32_t unambigTot = 0; + uint32_t bothTot = 0; + RefReadInParams rpcp = rparms; assert_gt(in.size(), 0); // For each input istream for(size_t i = 0; i < in.size(); i++) { @@ -169,32 +204,40 @@ size_t fastaRefReadSizes(vector& in, assert_neq(rpcp.numSeqCutoff, 0); // For each pattern in this istream while(!in[i]->eof() && rpcp.baseCutoff != 0 && rpcp.numSeqCutoff != 0) { - RefRecord rec = fastaRefReadSize(*in[i], refparams, first, bpout); + RefRecord rec = fastaRefReadSize(*in[i], rparms, first, bpout); if(rpcp.baseCutoff > 0) assert_leq((int64_t)rec.len, rpcp.baseCutoff); if(rpcp.baseCutoff != -1) rpcp.baseCutoff -= rec.len; if(rpcp.numSeqCutoff != -1) rpcp.numSeqCutoff--; assert_geq(rpcp.baseCutoff, -1); assert_geq(rpcp.numSeqCutoff, -1); - if((tot + rec.len) < tot) { + if((unambigTot + rec.len) < unambigTot) { cerr << "Error: Reference sequence has more than 2^32-1 characters! Please divide the" << endl << "reference into batches or chunks of about 3.6 billion characters or less each" << endl << "and index each independently." << endl; throw 1; } - tot += rec.len; + // Add the length of this record. + unambigTot += rec.len; + bothTot += rec.len; + bothTot += rec.off; first = false; if(rec.len == 0 && rec.off == 0 && !rec.first) continue; recs.push_back(rec); } + // Reset the input stream in[i]->reset(); assert(!in[i]->eof()); - #ifndef NDEBUG +#ifndef NDEBUG + // Check that it's really reset int c = in[i]->get(); assert_eq('>', c); in[i]->reset(); assert(!in[i]->eof()); - #endif +#endif } - assert_gt(tot, 0); - return tot; + assert_gt(bothTot, 0); + assert_gt(unambigTot, 0); + return make_pair( + unambigTot, // total number of unambiguous DNA characters read + bothTot); // total number of DNA characters read, incl. ambiguous ones } diff --git a/ref_read.h b/ref_read.h index e1d0862..5c0653a 100644 --- a/ref_read.h +++ b/ref_read.h @@ -17,16 +17,6 @@ using namespace std; using namespace seqan; -/// Skip to the end of the current line; return the first character -/// of the next line -static inline int skipWhitespace(FileBuf& in) { - int c; - while(isspace(c = in.get())) { - if(in.eof()) return -1; - } - return c; -} - /** * Encapsulates a stretch of the reference containing only unambiguous * characters. From an ordered list of RefRecords, one can (almost) @@ -80,46 +70,37 @@ struct RefRecord { * Parameters governing treatment of references as they're read in. */ struct RefReadInParams { - RefReadInParams(int64_t bc, int64_t sc, bool r, bool nsToA) : - baseCutoff(bc), numSeqCutoff(sc), reverse(r), nsToAs(nsToA) { } + RefReadInParams(int64_t bc, int64_t sc, bool col, bool r, + bool nsToA, bool bisulf) : + baseCutoff(bc), numSeqCutoff(sc), color(col), reverse(r), + nsToAs(nsToA), bisulfite(bisulf) { } // stop reading references once we've finished reading this many // total reference bases int64_t baseCutoff; // stop reading references once we've finished reading this many // distinct sequences int64_t numSeqCutoff; + // extract colors from reference + bool color; // reverse each reference sequence before passing it along bool reverse; // convert ambiguous characters to As bool nsToAs; + // bisulfite-convert the reference + bool bisulfite; }; -extern RefRecord fastaRefReadSize(FileBuf& in, - const RefReadInParams& refparams, - bool first, - BitpairOutFileBuf* bpout = NULL); -extern size_t fastaRefReadSizes(vector& in, - vector& recs, - const RefReadInParams& refparams, - BitpairOutFileBuf* bpout = NULL); +extern RefRecord +fastaRefReadSize(FileBuf& in, + const RefReadInParams& rparms, + bool first, + BitpairOutFileBuf* bpout = NULL); -/** - * For given filehandle, read to the end of the current line and return - * the first character on the next line, or -1 if eof is encountered at - * any time. - */ -static inline int skipLine(FileBuf& in) { - while(true) { - int c = in.get(); if(in.eof()) return -1; - if(c == '\n' || c == '\r') { - while(c == '\n' || c == '\r') { - c = in.get(); if(in.eof()) return -1; - } - // c now holds first character of next line - return c; - } - } -} +extern std::pair +fastaRefReadSizes(vector& in, + vector& recs, + const RefReadInParams& rparms, + BitpairOutFileBuf* bpout = NULL); /** * Reads the next sequence from the given FASTA file and appends it to @@ -129,14 +110,14 @@ template static RefRecord fastaRefReadAppend(FileBuf& in, bool first, TStr& dst, - RefReadInParams& refparams, + RefReadInParams& rparms, string* name = NULL) { typedef typename Value::Type TVal; int c; static int lastc = '>'; if(first) { - c = skipWhitespace(in); + c = in.getPastWhitespace(); if(c != '>') { cerr << "Reference file does not seem to be a FASTA file" << endl; throw 1; @@ -145,23 +126,24 @@ static RefRecord fastaRefReadAppend(FileBuf& in, } assert_neq(-1, lastc); - assert_neq(refparams.baseCutoff, 0); - assert_neq(refparams.numSeqCutoff, 0); + assert_neq(rparms.baseCutoff, 0); + assert_neq(rparms.numSeqCutoff, 0); // RefRecord params - size_t seqCharsRead = 0; - size_t seqOff = 0; - bool seqFirst = true; + size_t len = 0; + size_t off = 0; + first = true; size_t ilen = length(dst); // Chew up the id line; if the next line is either // another id line or a comment line, keep chewing + int lc = -1; // last-DNA char variable for color conversion c = lastc; if(c == '>' || c == '#') { do { while (c == '#') { - if((c = skipLine(in)) == -1) { + if((c = in.getPastNewline()) == -1) { lastc = -1; goto bail; } @@ -173,40 +155,52 @@ static RefRecord fastaRefReadAppend(FileBuf& in, lastc = -1; goto bail; } - if(c == '\n' || c == '\r') { - while(c == '\n' || c == '\r') { - c = in.get(); - if(c == -1) { - lastc = -1; - goto bail; - } + while(c == '\r' || c == '\n') c = in.get(); + if(c == -1) { + lastc = -1; + goto bail; } - // c now holds first character of next line break; } if (name) name->push_back(c); } + // c holds the first character on the line after the name + // line } while (c == '>' || c == '#'); } else { ASSERT_ONLY(int cc = toupper(c)); assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T'); - seqFirst = false; + first = false; } // Skip over gaps while(true) { - if(refparams.nsToAs && dna4Cat[c] == 2) { + int cat = dna4Cat[c]; + if(rparms.nsToAs && cat == 2) { c = 'A'; } - int cat = dna4Cat[c]; + int cc = toupper(c); + if(rparms.bisulfite && cc == 'C') c = cc = 'T'; if(cat == 1) { // This is a DNA character - break; // to read-in loop + if(rparms.color) { + if(lc != -1) { + // Got two consecutive unambiguous DNAs + break; // to read-in loop + } + // Keep going; we need two consecutive unambiguous DNAs + lc = charToDna5[(int)c]; + // The 'if(off > 0)' takes care of the case where + // the reference is entirely unambiguous and we don't + // want to incorrectly increment off. + if(off > 0) off++; + } else { + break; // to read-in loop + } } else if(cat == 2) { - ASSERT_ONLY(int cc = toupper(c)); - assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T'); - seqOff++; // skip it + lc = -1; + off++; // skip it } else if(c == '>') { lastc = '>'; goto bail; @@ -217,6 +211,13 @@ static RefRecord fastaRefReadAppend(FileBuf& in, goto bail; } } + if(first && rparms.color && off > 0) { + // Handle the case where the first record has ambiguous + // characters but we're in color space; one of those counts is + // spurious + off--; + } + assert(!rparms.color || lc != -1); assert_eq(1, dna4Cat[c]); // in now points just past the first character of a sequence @@ -227,34 +228,42 @@ static RefRecord fastaRefReadAppend(FileBuf& in, int cat = dna4Cat[c]; assert_neq(2, cat); if(cat == 1) { - appendValue(dst, (Dna)(char)c); + // Consume it + if(!rparms.color || lc != -1) len++; + // Add it to referenece buffer + if(rparms.color) { + appendValue(dst, (Dna)dinuc2color[charToDna5[(int)c]][lc]); + } else if(!rparms.color) { + appendValue(dst, (Dna)(char)c); + } assert_lt((uint8_t)(Dna)dst[length(dst)-1], 4); - seqCharsRead++; - if(refparams.baseCutoff > 0 && - (int64_t)seqCharsRead >= refparams.baseCutoff) + if(rparms.baseCutoff > 0 && + (int64_t)len >= rparms.baseCutoff) { lastc = -1; goto bail; } + lc = charToDna5[(int)c]; } c = in.get(); - if(refparams.nsToAs && dna4Cat[c] == 2) { + if(rparms.nsToAs && dna4Cat[c] == 2) { c = 'A'; } if (c == -1 || c == '>' || c == '#' || dna4Cat[c] == 2) { lastc = c; break; } + if(rparms.bisulfite && toupper(c) == 'C') c = 'T'; } bail: // Optionally reverse the portion that we just appended - if(refparams.reverse) { + if(rparms.reverse) { // Find limits of the portion we just appended size_t nlen = length(dst); - assert_eq(nlen - ilen, seqCharsRead); - if(seqCharsRead > 0) { - size_t halfway = ilen + (seqCharsRead>>1); + assert_eq(nlen - ilen, len); + if(len > 0) { + size_t halfway = ilen + (len>>1); // Reverse it in-place for(size_t i = ilen; i < halfway; i++) { size_t diff = i-ilen; @@ -265,7 +274,7 @@ static RefRecord fastaRefReadAppend(FileBuf& in, } } } - return RefRecord(seqOff, seqCharsRead, seqFirst); + return RefRecord(off, len, first); } #endif /*ndef REF_READ_H_*/ diff --git a/reference.h b/reference.h index a2069e7..88828fc 100644 --- a/reference.h +++ b/reference.h @@ -32,6 +32,7 @@ class BitPairReference { * Load from .3.ebwt/.4.ebwt Bowtie index files. */ BitPairReference(const string& in, + bool color, bool sanity = false, std::vector* infiles = NULL, std::vector >* origs = NULL, @@ -293,6 +294,29 @@ class BitPairReference { assert(origs != NULL); os = origs; } + + // Never mind; reference is always letters, even if index + // and alignment run are colorspace + + // If we're building a colorspace index, we need to convert + // osv to colors first +// if(color) { +// for(size_t i = 0; i < os->size(); i++) { +// size_t olen = seqan::length((*os)[i]); +// for(size_t j = 0; j < olen-1; j++) { +// int b1 = (int)(*os)[i][j]; +// assert_geq(b1, 0); assert_leq(b1, 4); +// int b2 = (int)(*os)[i][j+1]; +// assert_geq(b2, 0); assert_leq(b2, 4); +// (*os)[i][j] = (Dna5)dinuc2color[b1][b2]; +// assert((b1 != 4 && b2 != 4) || (int)(*os)[i][j] == 4); +// } +// seqan::resize((*os)[i], olen-1); +// } +// } + // Go through the loaded reference files base-by-base and + // sanity check against what we get by calling getBase and + // getStretch for(size_t i = 0; i < os->size(); i++) { size_t olen = seqan::length((*os)[i]); size_t olenU32 = (olen + 12) / 4; @@ -300,8 +324,8 @@ class BitPairReference { uint8_t *bufadj = (uint8_t*)buf; bufadj += getStretch(buf, i, 0, olen); for(size_t j = 0; j < olen; j++) { - assert_eq((*os)[i][j], bufadj[j]); - assert_eq((int)(*os)[i][j], getBase(i, j)); + assert_eq((int)(*os)[i][j], (int)bufadj[j]); + assert_eq((int)(*os)[i][j], (int)getBase(i, j)); } delete[] buf; } diff --git a/sam.cpp b/sam.cpp index 39fd0a0..2caa3b4 100644 --- a/sam.cpp +++ b/sam.cpp @@ -20,11 +20,13 @@ using namespace std; void SAMHitSink::appendHeaders(OutFileBuf& os, size_t numRefs, const vector& refnames, + bool color, bool nosq, ReferenceMap *rmap, const uint32_t* plen, bool fullRef, - const char *cmdline) + const char *cmdline, + const char *rgline) { ostringstream ss; ss << "@HD\tVN:1.0\tSO:unsorted" << endl; @@ -39,9 +41,12 @@ void SAMHitSink::appendHeaders(OutFileBuf& os, } else { ss << i; } - ss << "\tLN:" << plen[i] << endl; + ss << "\tLN:" << (plen[i] + (color ? 1 : 0)) << endl; } } + if(rgline != NULL) { + ss << "@RG\t" << rgline << endl; + } ss << "@PG\tID:Bowtie\tVN:" << BOWTIE_VERSION << "\tCL:\"" << cmdline << "\"" << endl; os.writeString(ss.str()); } @@ -58,7 +63,15 @@ void SAMHitSink::appendAligned(ostream& ss, int offBase) { // QNAME - ss << h.patName << "\t"; + if(h.mate > 0) { + // truncate final 2 chars + for(int i = 0; i < (int)seqan::length(h.patName)-2; i++) { + ss << h.patName[i]; + } + } else { + ss << h.patName; + } + ss << '\t'; // FLAG int flags = 0; if(h.mate == 1) { @@ -119,20 +132,33 @@ void SAMHitSink::appendAligned(ostream& ss, // // Always output stratum ss << "\tXA:i:" << (int)h.stratum; + // Always output cost + //ss << "\tXC:i:" << (int)h.cost; // Look for SNP annotations falling within the alignment // Output MD field - const size_t len = length(h.patSeq); + size_t len = length(h.patSeq); int nm = 0; int run = 0; ss << "\tMD:Z:"; + const FixedBitset<1024> *mms = &h.mms; + const String* pat = &h.patSeq; + const vector* refcs = &h.refcs; + if(h.color && false) { + // Disabled: print MD:Z string w/r/t to colors, not letters + mms = &h.cmms; + pat = &h.colSeq; + assert_eq(length(h.colSeq), len+1); + len = length(h.colSeq); + refcs = &h.crefcs; + } if(h.fw) { for (int i = 0; i < (int)len; ++ i) { - if(h.mms.test(i)) { + if(mms->test(i)) { nm++; // There's a mismatch at this position - assert_gt((int)h.refcs.size(), i); - char refChar = toupper(h.refcs[i]); - ASSERT_ONLY(char qryChar = (h.fw ? h.patSeq[i] : h.patSeq[length(h.patSeq)-i-1])); + assert_gt((int)refcs->size(), i); + char refChar = toupper((*refcs)[i]); + ASSERT_ONLY(char qryChar = (h.fw ? (*pat)[i] : (*pat)[len-i-1])); assert_neq(refChar, qryChar); ss << run << refChar; run = 0; @@ -142,12 +168,12 @@ void SAMHitSink::appendAligned(ostream& ss, } } else { for (int i = len-1; i >= 0; -- i) { - if(h.mms.test(i)) { + if(mms->test(i)) { nm++; // There's a mismatch at this position - assert_gt((int)h.refcs.size(), i); - char refChar = toupper(h.refcs[i]); - ASSERT_ONLY(char qryChar = (h.fw ? h.patSeq[i] : h.patSeq[length(h.patSeq)-i-1])); + assert_gt((int)refcs->size(), i); + char refChar = toupper((*refcs)[i]); + ASSERT_ONLY(char qryChar = (h.fw ? (*pat)[i] : (*pat)[len-i-1])); assert_neq(refChar, qryChar); ss << run << refChar; run = 0; @@ -159,6 +185,7 @@ void SAMHitSink::appendAligned(ostream& ss, ss << run; // Add optional edit distance field ss << "\tNM:i:" << nm; + if(h.color) ss << "\tCM:i:" << h.cmms.count(); ss << endl; } diff --git a/sam.h b/sam.h index 4c8fb48..5e9d3a8 100644 --- a/sam.h +++ b/sam.h @@ -95,11 +95,13 @@ class SAMHitSink : public HitSink { void appendHeaders(OutFileBuf& os, size_t numRefs, const vector& refnames, + bool color, bool nosq, ReferenceMap *rmap, const uint32_t* plen, bool fullRef, - const char *cmdline); + const char *cmdline, + const char *rgline); protected: