diff --git a/aligner.h b/aligner.h index ea75e4c..9f96205 100644 --- a/aligner.h +++ b/aligner.h @@ -72,9 +72,9 @@ class Aligner { // Current read pair PatternSourcePerThread* patsrc_; - ReadBuf* bufa_; + Read* bufa_; uint32_t alen_; - ReadBuf* bufb_; + Read* bufb_; uint32_t blen_; bool rangeMode_; RandomSource rand_; @@ -155,24 +155,21 @@ class MultiAligner { * try to hide all the latency. */ void run() { - bool done = false; - while(!done) { - done = true; + bool saw_last_read = false; + while(!saw_last_read) { for(uint32_t i = 0; i < n_; i++) { if(!(*aligners_)[i]->done) { // Advance an aligner already in progress - done = false; (*aligners_)[i]->advance(); } else { + if(saw_last_read) { + break; + } // Get a new read and initialize an aligner with it - (*patsrcs_)[i]->nextReadPair(); - if(!(*patsrcs_)[i]->empty() && (*patsrcs_)[i]->patid() < qUpto_) { + pair ret = (*patsrcs_)[i]->nextReadPair(); + saw_last_read = ret.second; + if(ret.first && (*patsrcs_)[i]->rdid() < qUpto_) { (*aligners_)[i]->setQuery((*patsrcs_)[i]); - assert(!(*aligners_)[i]->done); - done = false; - } else { - // No more reads; if done == true, it remains - // true } } } @@ -253,12 +250,12 @@ class MixedMultiAligner { msg = ss.str(); Timer timer(std::cout, msg.c_str()); #endif - bool done = false; bool first = true; + bool saw_last_read = false; if(n_ == 1) { Aligner *al = seOrPe_[0] ? (*alignersSE_)[0] : (*alignersPE_)[0]; PatternSourcePerThread *ps = (*patsrcs_)[0]; - while(!done) { + while(true) { #ifdef PER_THREAD_TIMING int cpu = 0, node = 0; get_cpu_and_node(cpu, node); @@ -271,16 +268,18 @@ class MixedMultiAligner { current_node = node; } #endif - done = true; if(!first && !al->done) { // Advance an aligner already in progress; this is // the common case - done = false; al->advance(); } else { + if(saw_last_read) { + break; + } // Get a new read - ps->nextReadPair(); - if(ps->patid() < qUpto_ && !ps->empty()) { + pair ret = ps->nextReadPair(); + saw_last_read = ret.second; + if(ps->rdid() < qUpto_ && ret.first) { if(ps->paired()) { // Read currently in buffer is paired-end (*alignersPE_)[0]->setQuery(ps); @@ -292,16 +291,12 @@ class MixedMultiAligner { al = (*alignersSE_)[0]; seOrPe_[0] = true; // true = unpaired } - done = false; - } else { - // No more reads; if done == true, it remains - // true } } first = false; } } else { - while(!done) { + while(true) { #ifdef PER_THREAD_TIMING int cpu = 0, node = 0; get_cpu_and_node(cpu, node); @@ -314,21 +309,24 @@ class MixedMultiAligner { current_node = node; } #endif - done = true; + bool saw_last_read = false; for(uint32_t i = 0; i < n_; i++) { Aligner *al = seOrPe_[i] ? (*alignersSE_)[i] : (*alignersPE_)[i]; if(!first && !al->done) { // Advance an aligner already in progress; this is // the common case - done = false; al->advance(); } else { + if(saw_last_read) { + break; + } // Feed a new read to a vacant aligner PatternSourcePerThread *ps = (*patsrcs_)[i]; // Get a new read - ps->nextReadPair(); - if(ps->patid() < qUpto_ && !ps->empty()) { + pair ret = ps->nextReadPair(); + saw_last_read = ret.second; + if(ps->rdid() < qUpto_ && ret.first) { if(ps->paired()) { // Read currently in buffer is paired-end (*alignersPE_)[i]->setQuery(ps); @@ -338,10 +336,6 @@ class MixedMultiAligner { (*alignersSE_)[i]->setQuery(ps); seOrPe_[i] = true; // true = unpaired } - done = false; - } else { - // No more reads; if done == true, it remains - // true } } } @@ -433,7 +427,7 @@ class UnpairedAlignerV2 : public Aligner { if(metrics_ != NULL) { metrics_->nextRead(patsrc->bufa().patFw); } - pool_->reset(&patsrc->bufa().name, patsrc->patid()); + pool_->reset(&patsrc->bufa().name, (uint32_t)patsrc->rdid()); if(patsrc->bufa().length() < 4) { if(!quiet_) { cerr << "Warning: Skipping read " << patsrc->bufa().name @@ -496,7 +490,7 @@ class UnpairedAlignerV2 : public Aligner { ra.stratum, // alignment stratum ra.cost, // cost, including qual penalty ra.bot - ra.top - 1, // # other hits - patsrc_->patid(), // pattern id + (uint32_t)patsrc_->rdid(),// pattern id bufa_->seed, // pseudo-random seed 0); // mate (0 = unpaired) } @@ -745,7 +739,7 @@ class PairedBWAlignerV1 : public Aligner { assert(!patsrc->bufb().empty()); // Give all of the drivers pointers to the relevant read info patsrc_ = patsrc; - pool_->reset(&patsrc->bufa().name, patsrc->patid()); + pool_->reset(&patsrc->bufa().name, (uint32_t)patsrc->rdid()); if(patsrc->bufa().length() < 4 || patsrc->bufb().length() < 4) { if(!quiet_) { cerr << "Warning: Skipping pair " << patsrc->bufa().name @@ -875,8 +869,8 @@ class PairedBWAlignerV1 : public Aligner { TIndexOffU spreadL = rL.bot - rL.top; TIndexOffU spreadR = rR.bot - rR.top; TIndexOffU oms = min(spreadL, spreadR) - 1; - ReadBuf* bufL = pairFw ? bufa_ : bufb_; - ReadBuf* bufR = pairFw ? bufb_ : bufa_; + Read* bufL = pairFw ? bufa_ : bufb_; + Read* bufR = pairFw ? bufb_ : bufa_; TIndexOffU lenL = pairFw ? alen_ : blen_; TIndexOffU lenR = pairFw ? blen_ : alen_; bool ret; @@ -1598,7 +1592,7 @@ class PairedBWAlignerV2 : public Aligner { assert(!patsrc->bufb().empty()); // Give all of the drivers pointers to the relevant read info patsrc_ = patsrc; - pool_->reset(&patsrc->bufa().name, patsrc->patid()); + pool_->reset(&patsrc->bufa().name, (uint32_t)patsrc->rdid()); if(patsrc->bufa().length() < 4 || patsrc->bufb().length() < 4) { if(!quiet_) { cerr << "Warning: Skipping pair " << patsrc->bufa().name @@ -1754,8 +1748,8 @@ class PairedBWAlignerV2 : public Aligner { TIndexOffU spreadL = rL.bot - rL.top; TIndexOffU spreadR = rR.bot - rR.top; TIndexOffU oms = min(spreadL, spreadR) - 1; - ReadBuf* bufL = pairFw ? bufa_ : bufb_; - ReadBuf* bufR = pairFw ? bufb_ : bufa_; + Read* bufL = pairFw ? bufa_ : bufb_; + Read* bufR = pairFw ? bufb_ : bufa_; TIndexOffU lenL = pairFw ? alen_ : blen_; TIndexOffU lenR = pairFw ? blen_ : alen_; bool ret; @@ -1840,7 +1834,7 @@ class PairedBWAlignerV2 : public Aligner { EbwtSearchParams >*params = (r.mate1 ? paramsSe1_ : paramsSe2_); assert(!(r.mate1 ? doneSe1_ : doneSe2_)); params->setFw(r.fw); - ReadBuf* buf = r.mate1 ? bufa_ : bufb_; + Read* buf = r.mate1 ? bufa_ : bufb_; bool ebwtFw = r.ebwt->fw(); uint32_t len = r.mate1 ? alen_ : blen_; assert_eq(buf->color, color); diff --git a/ebwt_search.cpp b/ebwt_search.cpp index 0e183ec..4cd2d42 100644 --- a/ebwt_search.cpp +++ b/ebwt_search.cpp @@ -109,7 +109,6 @@ static uint32_t skipReads; // # reads/read pairs to skip static bool nofw; // don't align fw orientation of read static bool norc; // don't align rc orientation of read static bool strandFix; // attempt to fix strand bias -static bool randomizeQuals; // randomize quality values static bool stats; // print performance stats static int chunkPoolMegabytes; // max MB to dedicate to best-first search frames per thread static int chunkSz; // size of single chunk disbursed by ChunkPool @@ -221,7 +220,6 @@ static void resetOptions() { nofw = false; // don't align fw orientation of read norc = false; // don't align rc orientation of read strandFix = true; // attempt to fix strand bias - randomizeQuals = false; // randomize quality values stats = false; // print performance stats chunkPoolMegabytes = 64; // max MB to dedicate to best-first search frames per thread chunkSz = 256; // size of single chunk disbursed by ChunkPool (in KB) @@ -308,7 +306,6 @@ enum { ARG_NO_RC, ARG_SKIP, ARG_STRAND_FIX, - ARG_RANDOMIZE_QUALS, ARG_STATS, ARG_ONETWO, ARG_PHRED64, @@ -412,7 +409,6 @@ static struct option long_options[] = { {(char*)"tryhard", no_argument, 0, 'y'}, {(char*)"skip", required_argument, 0, 's'}, {(char*)"strandfix", no_argument, 0, ARG_STRAND_FIX}, - {(char*)"randquals", no_argument, 0, ARG_RANDOMIZE_QUALS}, {(char*)"stats", no_argument, 0, ARG_STATS}, {(char*)"12", required_argument, 0, ARG_ONETWO}, {(char*)"phred33-quals", no_argument, 0, ARG_PHRED33}, @@ -847,7 +843,6 @@ static void parseOptions(int argc, const char **argv) { } case ARG_DUMP_PATS: patDumpfile = optarg; break; case ARG_STRAND_FIX: strandFix = true; break; - case ARG_RANDOMIZE_QUALS: randomizeQuals = true; break; case ARG_PARTITION: partitionSz = parse(optarg); break; case ARG_READS_PER_BATCH: { if(optarg == NULL || parse(optarg) < 1) { @@ -1004,7 +999,7 @@ static const char *argv0 = NULL; #define FINISH_READ(p) \ /* Don't do finishRead if the read isn't legit or if the read was skipped by the doneMask */ \ - if(!p->empty()) { \ + if(get_read_ret.first) { \ sink->finishRead(*p, true, !skipped); \ } \ skipped = false; @@ -1013,8 +1008,9 @@ static const char *argv0 = NULL; /// whether the result is empty or the patid exceeds the limit, and /// marshaling the read into convenient variables. #define GET_READ(p) \ - p->nextReadPair(); \ - if(p->empty() || p->patid() >= qUpto) { \ + if(get_read_ret.second) break; \ + get_read_ret = p->nextReadPair(); \ + if(!get_read_ret.first || p->rdid() >= qUpto) { \ p->bufa().clearAll(); \ break; \ } \ @@ -1033,33 +1029,9 @@ static const char *argv0 = NULL; patRcRev.data_begin += 0; /* suppress "unused" compiler warning */ \ String& name = p->bufa().name; \ name.data_begin += 0; /* suppress "unused" compiler warning */ \ - uint32_t patid = p->patid(); \ + uint32_t patid = (uint32_t)p->rdid(); \ params.setPatId(patid); -/// Macro for getting the forward oriented version of next read, -/// possibly aborting depending on whether the result is empty or the -/// patid exceeds the limit, and marshaling the read into convenient -/// variables. -#define GET_READ_FW(p) \ - p->nextReadPair(); \ - if(p->empty() || p->patid() >= qUpto) { \ - p->bufa().clearAll(); \ - break; \ - } \ - params.setPatId(p->patid()); \ - assert(!empty(p->bufa().patFw)); \ - String& patFw = p->bufa().patFw; \ - patFw.data_begin += 0; /* suppress "unused" compiler warning */ \ - String& qual = p->bufa().qual; \ - qual.data_begin += 0; /* suppress "unused" compiler warning */ \ - String& patFwRev = p->bufa().patFwRev; \ - patFwRev.data_begin += 0; /* suppress "unused" compiler warning */ \ - String& qualRev = p->bufa().qualRev; \ - qualRev.data_begin += 0; /* suppress "unused" compiler warning */ \ - String& name = p->bufa().name; \ - name.data_begin += 0; /* suppress "unused" compiler warning */ \ - uint32_t patid = p->patid(); - #define WORKER_EXIT() \ patsrcFact->destroy(patsrc); \ delete patsrcFact; \ @@ -1164,6 +1136,7 @@ static void exactSearchWorker(void *vp) { verbose, // verbose &os, false); // considerQuals + pair get_read_ret = make_pair(false, false); bool skipped = false; #ifdef PER_THREAD_TIMING uint64_t ncpu_changeovers = 0; @@ -1515,6 +1488,7 @@ static void mismatchSearchWorkerFull(void *vp){ &os, false); // considerQuals bool skipped = false; + pair get_read_ret = make_pair(false, false); #ifdef PER_THREAD_TIMING uint64_t ncpu_changeovers = 0; uint64_t nnuma_changeovers = 0; @@ -1543,7 +1517,7 @@ static void mismatchSearchWorkerFull(void *vp){ #endif FINISH_READ(patsrc); GET_READ(patsrc); - uint32_t plen = length(patFw); + uint32_t plen = (uint32_t)length(patFw); uint32_t s = plen; uint32_t s3 = s >> 1; // length of 3' half of seed uint32_t s5 = (s >> 1) + (s & 1); // length of 5' half of seed @@ -1894,6 +1868,7 @@ static void twoOrThreeMismatchSearchWorkerFull(void *vp) { false, // considerQuals true); // halfAndHalf bool skipped = false; + pair get_read_ret = make_pair(false, false); #ifdef PER_THREAD_TIMING uint64_t ncpu_changeovers = 0; uint64_t nnuma_changeovers = 0; @@ -1923,7 +1898,7 @@ static void twoOrThreeMismatchSearchWorkerFull(void *vp) { FINISH_READ(patsrc); GET_READ(patsrc); patid += 0; // kill unused variable warning - uint32_t plen = length(patFw); + uint32_t plen = (uint32_t)length(patFw); uint32_t s = plen; uint32_t s3 = s >> 1; // length of 3' half of seed uint32_t s5 = (s >> 1) + (s & 1); // length of 5' half of seed @@ -2203,6 +2178,7 @@ static void seededQualSearchWorkerFull(void *vp) { !noMaqRound); String muts; bool skipped = false; + pair get_read_ret = make_pair(false, false); #ifdef PER_THREAD_TIMING uint64_t ncpu_changeovers = 0; uint64_t nnuma_changeovers = 0; @@ -2473,43 +2449,38 @@ patsrcFromStrings(int format, { switch(format) { case FASTA: - return new FastaPatternSource (seed, reads, quals, color, - randomizeQuals, - patDumpfile, verbose, + return new FastaPatternSource (reads, quals, color, + patDumpfile, trim3, trim5, solexaQuals, phred64Quals, integerQuals, skipReads); case FASTA_CONT: return new FastaContinuousPatternSource ( - seed, reads, fastaContLen, + reads, fastaContLen, fastaContFreq, - patDumpfile, verbose, + patDumpfile, skipReads); case RAW: - return new RawPatternSource (seed, reads, color, - randomizeQuals, - patDumpfile, verbose, + return new RawPatternSource (reads, color, + patDumpfile, trim3, trim5, skipReads); case FASTQ: - return new FastqPatternSource (seed, reads, color, - randomizeQuals, - patDumpfile, verbose, + return new FastqPatternSource (reads, color, + patDumpfile, trim3, trim5, solexaQuals, phred64Quals, integerQuals, skipReads); case TAB_MATE: - return new TabbedPatternSource(seed, reads, color, - randomizeQuals, - patDumpfile, verbose, + return new TabbedPatternSource(reads, false, color, + patDumpfile, trim3, trim5, skipReads); case CMDLINE: - return new VectorPatternSource(seed, reads, color, - randomizeQuals, - patDumpfile, verbose, + return new VectorPatternSource(reads, color, + patDumpfile, trim3, trim5, skipReads); default: { @@ -2665,9 +2636,9 @@ static void driver(const char * type, } PatternComposer *patsrc = NULL; if(mates12.size() > 0) { - patsrc = new SoloPatternComposer(patsrcs_ab, seed); + patsrc = new SoloPatternComposer(patsrcs_ab); } else { - patsrc = new DualPatternComposer(patsrcs_a, patsrcs_b, seed); + patsrc = new DualPatternComposer(patsrcs_a, patsrcs_b); } // Open hit output file diff --git a/ebwt_search_backtrack.h b/ebwt_search_backtrack.h index 54d0959..0956ff7 100644 --- a/ebwt_search_backtrack.h +++ b/ebwt_search_backtrack.h @@ -88,7 +88,7 @@ class GreedyDFSRangeSource { /** * Set a new query read. */ - void setQuery(ReadBuf& r) { + void setQuery(Read& r) { const bool fw = _params.fw(); const bool ebwtFw = _ebwt->fw(); if(ebwtFw) { @@ -1838,7 +1838,7 @@ class EbwtRangeSource : public RangeSource { /** * Set a new query read. */ - virtual void setQuery(ReadBuf& r, Range *seedRange) { + virtual void setQuery(Read& r, Range *seedRange) { const bool ebwtFw = ebwt_->fw(); if(ebwtFw) { qry_ = fw_ ? &r.patFw : &r.patRc; diff --git a/hit.h b/hit.h index 766e68c..59603d9 100644 --- a/hit.h +++ b/hit.h @@ -592,7 +592,8 @@ class HitSink { */ void dumpAlign(PatternSourcePerThread& p) { if(!dumpAlignFlag_) return; - if(!p.paired() || onePairFile_) { + const bool paired = p.bufa().mate > 0; + if(!paired || onePairFile_) { // Dump unpaired read to an aligned-read file of the same format if(!dumpAlBase_.empty()) { ThreadSafe _ts(&dumpAlignLock_); @@ -646,7 +647,8 @@ class HitSink { */ void dumpUnal(PatternSourcePerThread& p) { if(!dumpUnalignFlag_) return; - if(!p.paired() || onePairFile_) { + const bool paired = p.bufa().mate > 0; + if(!paired || onePairFile_) { // Dump unpaired read to an unaligned-read file of the same format if(!dumpUnalBase_.empty()) { ThreadSafe _ts(&dumpUnalLock_); @@ -701,7 +703,8 @@ class HitSink { if(dumpUnalignFlag_) dumpUnal(p); return; } - if(!p.paired() || onePairFile_) { + const bool paired = p.bufa().mate > 0; + if(!paired || onePairFile_) { // Dump unpaired read to an maxed-out-read file of the same format if(!dumpMaxBase_.empty()) { ThreadSafe _ts(&dumpMaxLock_); diff --git a/pat.cpp b/pat.cpp index a597e7e..acba1e7 100644 --- a/pat.cpp +++ b/pat.cpp @@ -12,90 +12,794 @@ using namespace std; using namespace seqan; /** - * Parse a single quality string from fb and store qualities in r. - * Assume the next character obtained via fb.get() is the first - * character of the quality string. When returning, the next - * character returned by fb.peek() or fb.get() should be the first - * character of the following line. + * Calculate a per-read random seed based on a combination of + * the read data (incl. sequence, name, quals) and the global + * seed in '_randSeed'. */ -int parseQuals(ReadBuf& r, - FileBuf& fb, - int readLen, - int trim3, - int trim5, - bool intQuals, - bool phred64, - bool solexa64) +static uint32_t genRandSeed( + const String& qry, + const String& qual, + const String& name, + uint32_t seed) { - int qualsRead = 0; - int c = 0; - assert(fb.peek() != '\n' && fb.peek() != '\r'); - _setBegin (r.qual, (char*)r.qualBuf); - _setLength(r.qual, 0); - if (intQuals) { - while (c != '\r' && c != '\n' && c != -1) { - bool neg = false; - int num = 0; - while(!isspace(c = fb.peek()) && !fb.eof()) { - if(c == '-') { - neg = true; - assert_eq(num, 0); - } else { - if(!isdigit(c)) { - char buf[2048]; - cerr << "Warning: could not parse quality line:" << endl; - fb.getPastNewline(); - cerr << fb.copyLastN(buf); - buf[2047] = '\0'; - cerr << buf; - throw 1; + // Calculate a per-read random seed based on a combination of + // the read data (incl. sequence, name, quals) and the global + // seed + uint32_t rseed = (seed + 101) * 59 * 61 * 67 * 71 * 73 * 79 * 83; + size_t qlen = seqan::length(qry); + // Throw all the characters of the read into the random seed + for(size_t i = 0; i < qlen; i++) { + int p = (int)qry[i]; + assert_leq(p, 4); + size_t off = ((i & 15) << 1); + rseed ^= (p << off); + } + // Throw all the quality values for the read into the random + // seed + for(size_t i = 0; i < qlen; i++) { + int p = (int)qual[i]; + assert_leq(p, 255); + size_t off = ((i & 3) << 3); + rseed ^= (p << off); + } + // Throw all the characters in the read name into the random + // seed + size_t namelen = seqan::length(name); + for(size_t i = 0; i < namelen; i++) { + int p = (int)name[i]; + assert_leq(p, 255); + size_t off = ((i & 3) << 3); + rseed ^= (p << off); + } + return rseed; +} + +/** + * Once name/sequence/qualities have been parsed for an + * unpaired read, set all the other key fields of the Read + * struct. + */ +void PatternSourcePerThread::finalize(Read& ra) { + ra.mate = 0; + ra.constructRevComps(); + ra.constructReverses(); + ra.seed = genRandSeed(ra.patFw, ra.qual, ra.name, seed_); +} + +/** + * Once name/sequence/qualities have been parsed for a + * paired-end read, set all the other key fields of the Read + * structs. + */ +void PatternSourcePerThread::finalizePair(Read& ra, Read& rb) { + ra.mate = 1; + ra.constructRevComps(); + ra.constructReverses(); + ra.fixMateName(1); + ra.seed = genRandSeed(ra.patFw, ra.qual, ra.name, seed_); + + rb.mate = 2; + rb.constructRevComps(); + rb.constructReverses(); + rb.fixMateName(2); + rb.seed = genRandSeed(rb.patFw, rb.qual, rb.name, seed_); +} + +/** + * Get the next paired or unpaired read from the wrapped + * PatternComposer. Returns a pair of bools; first indicates + * whether we were successful, second indicates whether we're + * done. + */ +pair PatternSourcePerThread::nextReadPair() { + // Prepare batch + if(buf_.exhausted()) { + pair res = nextBatch(); + if(res.first && res.second == 0) { + return make_pair(false, true); + } + last_batch_ = res.first; + last_batch_size_ = res.second; + assert_eq(0, buf_.cur_buf_); + } else { + buf_.next(); // advance cursor + assert_gt(buf_.cur_buf_, 0); + } + // Parse read/pair + assert(strlen(buf_.read_a().readOrigBuf) != 0); + assert(buf_.read_a().empty()); + if(!parse(buf_.read_a(), buf_.read_b())) { + return make_pair(false, false); + } + // Finalize read/pair + if(paired()) { + finalizePair(buf_.read_a(), buf_.read_b()); + } else { + finalize(buf_.read_a()); + } + bool this_is_last = buf_.cur_buf_ == last_batch_size_-1; + return make_pair(true, this_is_last ? last_batch_ : false); +} + +/** + * The main member function for dispensing pairs of reads or + * singleton reads. Returns true iff ra and rb contain a new + * pair; returns false if ra contains a new unpaired read. + */ +pair SoloPatternComposer::nextBatch(PerThreadReadBuf& pt) { + uint32_t cur = cur_; + while(cur < src_.size()) { + // Patterns from srca_[cur_] are unpaired + pair res; + do { + res = src_[cur]->nextBatch( + pt, + true, // batch A (or pairs) + true); // grab lock below + } while(!res.first && res.second == 0); + if(res.second == 0) { + ThreadSafe ts(&mutex_m); + if(cur + 1 > cur_) { + cur_++; + } + cur = cur_; + continue; // on to next pair of PatternSources + } + return res; + } + assert_leq(cur, src_.size()); + return make_pair(true, 0); +} + +/** + * The main member function for dispensing pairs of reads or + * singleton reads. Returns true iff ra and rb contain a new + * pair; returns false if ra contains a new unpaired read. + */ +pair DualPatternComposer::nextBatch(PerThreadReadBuf& pt) { + // 'cur' indexes the current pair of PatternSources + uint32_t cur = cur_; + while(cur < srca_.size()) { + if(srcb_[cur] == NULL) { + pair res = srca_[cur]->nextBatch( + pt, + true, // batch A (or pairs) + true); // grab lock below + bool done = res.first; + if(!done && res.second == 0) { + ThreadSafe ts(&mutex_m); + if(cur + 1 > cur_) cur_++; + cur = cur_; // Move on to next PatternSource + continue; // on to next pair of PatternSources + } + return make_pair(done, res.second); + } else { + pair resa, resb; + // Lock to ensure that this thread gets parallel reads + // in the two mate files + { + ThreadSafe ts(&mutex_m); + resa = srca_[cur]->nextBatch( + pt, + true, // batch A + false); // don't grab lock below + resb = srcb_[cur]->nextBatch( + pt, + false, // batch B + false); // don't grab lock below + assert_eq(srca_[cur]->readCount(), + srcb_[cur]->readCount()); + } + if(resa.second < resb.second) { + cerr << "Error, fewer reads in file specified with -1 " + << "than in file specified with -2" << endl; + throw 1; + } else if(resa.second == 0 && resb.second == 0) { + ThreadSafe ts(&mutex_m); + if(cur + 1 > cur_) { + cur_++; + } + cur = cur_; // Move on to next PatternSource + continue; // on to next pair of PatternSources + } else if(resb.second < resa.second) { + cerr << "Error, fewer reads in file specified with -2 " + << "than in file specified with -1" << endl; + throw 1; + } + assert_eq(resa.first, resb.first); + assert_eq(resa.second, resb.second); + return make_pair(resa.first, resa.second); + } + } + assert_leq(cur, srca_.size()); + return make_pair(true, 0); +} + +/** + * Fill Read with the sequence, quality and name for the next + * read in the list of read files. This function gets called by + * all the search threads, so we must handle synchronization. + * + * Returns pair where bool indicates whether we're + * completely done, and int indicates how many reads were read. + */ +pair CFilePatternSource::nextBatch( + PerThreadReadBuf& pt, + bool batch_a, + bool lock) +{ + bool done = false; + int nread = 0; + + // synchronization at this level because both reading and manipulation of + // current file pointer have to be protected + ThreadSafe ts(&mutex, lock); + pt.setReadId(readCnt_); + while(true) { // loop that moves on to next file when needed + do { + pair ret = nextBatchFromFile(pt, batch_a); + done = ret.first; + nread = ret.second; + } while(!done && nread == 0); // not sure why this would happen + if(done && filecur_ < infiles_.size()) { // finished with this file + open(); + resetForNextFile(); // reset state to handle a fresh file + filecur_++; + if(nread == 0) { + continue; + } + } + break; + } + assert_geq(nread, 0); + readCnt_ += nread; + return make_pair(done, nread); +} + +/** + * Open the next file in the list of input files. + */ +void CFilePatternSource::open() { + if(is_open_) { + is_open_ = false; + fclose(fp_); + fp_ = NULL; + if(qfp_ != NULL) { + fclose(qfp_); + qfp_ = NULL; + } + } + while(filecur_ < infiles_.size()) { + // Open read + if(infiles_[filecur_] == "-") { + fp_ = stdin; + } else if((fp_ = fopen(infiles_[filecur_].c_str(), "rb")) == NULL) { + if(!errs_[filecur_]) { + cerr << "Warning: Could not open read file \"" + << infiles_[filecur_] << "\" for reading; skipping..." + << endl; + errs_[filecur_] = true; + } + filecur_++; + continue; + } + is_open_ = true; + setvbuf(fp_, buf_, _IOFBF, 64*1024); + if(!qinfiles_.empty()) { + if(qinfiles_[filecur_] == "-") { + qfp_ = stdin; + } else if((qfp_ = fopen(qinfiles_[filecur_].c_str(), "rb")) == NULL) { + if(!errs_[filecur_]) { + cerr << "Warning: Could not open quality file \"" + << qinfiles_[filecur_] << "\" for reading; skipping..." + << endl; + errs_[filecur_] = true; + } + filecur_++; + continue; + } + assert(qfp_ != NULL); + setvbuf(qfp_, qbuf_, _IOFBF, 64*1024); + } + return; + } + throw 1; +} + +/** + * Constructor for vector pattern source, used when the user has + * specified the input strings on the command line using the -c + * option. + */ +VectorPatternSource::VectorPatternSource( + const vector& seqs, + bool color, + const char *dumpfile, + int trim3, + int trim5, + uint32_t skip) : + TrimmingPatternSource(dumpfile, trim3, trim5), + color_(color), + cur_(skip), + skip_(skip), + paired_(false), + tokbuf_(), + bufs_() +{ + // Install sequences in buffers, ready for immediate copying in + // nextBatch(). Formatting of the buffer is just like + // TabbedPatternSource. + const size_t seqslen = seqs.size(); + for(size_t i = 0; i < seqslen; i++) { + tokbuf_.clear(); + tokenize(seqs[i], ":", tokbuf_, 2); + assert_gt(tokbuf_.size(), 0); + assert_leq(tokbuf_.size(), 2); + // Get another buffer ready + bufs_.resize(bufs_.size()+1); + bufs_.back().clear(); + // Install name + itoa10(static_cast(i), nametmp_); + bufs_.back() = nametmp_; + bufs_.back().push_back('\t'); + // Install sequence + bufs_.back().append(tokbuf_[0].c_str()); + bufs_.back().push_back('\t'); + // Install qualities + if(tokbuf_.size() > 1) { + bufs_.back().append(tokbuf_[1].c_str()); + } else { + const size_t len = tokbuf_[0].length(); + for(size_t i = 0; i < len; i++) { + bufs_.back().push_back('I'); + } + } + } +} + +/** + * Read next batch. However, batch concept is not very applicable for this + * PatternSource where all the info has already been parsed into the fields + * in the contsructor. This essentially modifies the pt as though we read + * in some number of patterns. + */ +pair VectorPatternSource::nextBatch( + PerThreadReadBuf& pt, + bool batch_a, + bool lock) +{ + ThreadSafe ts(&mutex, lock); + pt.setReadId(cur_); + vector& readbuf = batch_a ? pt.bufa_ : pt.bufb_; + size_t readi = 0; + for(; readi < pt.max_buf_ && cur_ < bufs_.size(); readi++, cur_++) { + const size_t len = bufs_[cur_].length(); + memcpy(readbuf[readi].readOrigBuf, bufs_[cur_].c_str(), len); + readbuf[readi].readOrigBufLen = len; + } + readCnt_ += readi; + return make_pair(cur_ == bufs_.size(), readi); +} + +/** + * Finishes parsing outside the critical section. + */ +bool VectorPatternSource::parse(Read& ra, Read& rb, TReadId rdid) const { + // Very similar to TabbedPatternSource + + // Light parser (nextBatchFromFile) puts unparsed data + // into Read& r, even when the read is paired. + assert(ra.empty()); + assert_gt(ra.readOrigBufLen, 0); // raw data for read/pair is here + int c = '\t'; + size_t cur = 0; + const size_t buflen = ra.readOrigBufLen; + + // Loop over the two ends + for(int endi = 0; endi < 2 && c == '\t'; endi++) { + Read& r = ((endi == 0) ? ra : rb); + assert_eq(0, seqan::length(r.nameBuf)); + // Parse name if (a) this is the first end, or + // (b) this is tab6 + size_t nameoff = 0; + if(endi < 1 || paired_) { + // Parse read name + c = ra.readOrigBuf[cur++]; + while(c != '\t' && cur < buflen) { + r.nameBuf[nameoff++] = c; + c = ra.readOrigBuf[cur++]; + } + assert_eq('\t', c); + if(cur >= buflen) { + return false; // record ended prematurely + } + } else if(endi > 0) { + // if this is the second end and we're parsing + // tab5, copy name from first end + rb.name = ra.name; + } + r.nameBuf[nameoff] = '\0'; + _setBegin(r.name, r.nameBuf); + _setLength(r.name, nameoff); + + // Parse sequence + assert_eq(0, seqan::length(r.patFw)); + c = ra.readOrigBuf[cur++]; + int nchar = 0, seqoff = 0; + if(color_) { + if(asc2dnacat[c] > 0) { + // First char is a DNA char (primer) + ra.primer = c; + int c2 = toupper(ra.readOrigBuf[cur++]); + // Is second char a color? + if(asc2colcat[c2] <= 0) { + // No -- so we don't assue this is primer + cur -= 2; + c = ra.readOrigBuf[cur++]; + } + } + } + if(color_) { + while(c != '\t' && cur < buflen) { + if(c >= '0' && c < '4') { + c = "ACGTN"[(int)c - '0']; + } + if(c == '.') { + c = 'N'; + } + if(isalpha(c)) { + assert_in(toupper(c), "ACGTN"); + if(nchar++ >= this->trim5_) { + assert_neq(0, asc2dnacat[c]); + r.patBufFw[seqoff++] = charToDna5[c]; // ascii to int + } + } + c = ra.readOrigBuf[cur++]; + } + ra.color = true; + } else { + while(c != '\t' && cur < buflen) { + if(isalpha(c)) { + assert_in(toupper(c), "ACGTN"); + if(nchar++ >= this->trim5_) { + assert_neq(0, asc2dnacat[c]); + r.patBufFw[seqoff++] = charToDna5[c]; // ascii to int } - assert(isdigit(c)); - num *= 10; - num += (c - '0'); } - fb.get(); + c = ra.readOrigBuf[cur++]; + } + } + assert_eq('\t', c); + if(cur >= buflen) { + return false; // record ended prematurely + } + r.patBufFw[seqoff] = '\0'; + _setBegin(r.patFw, (Dna5*)r.patBufFw); + // record amt trimmed from 5' end due to --trim5 + r.trimmed5 = (int)(nchar - seqoff); + // record amt trimmed from 3' end due to --trim3 + int trim3 = (seqoff < this->trim3_) ? seqoff : this->trim3_; + _setLength(r.patFw, seqoff - trim3); + r.trimmed3 = trim3; + + // Parse qualities + assert_eq(0, seqan::length(r.qual)); + c = ra.readOrigBuf[cur++]; + int nqual = 0; + size_t qualoff = 0; + while(c != '\t' && c != '\n' && c != '\r') { + if(c == ' ') { + wrongQualityFormat(r.name); + return false; } - if(neg) num = 0; - // Phred-33 ASCII encode it and add it to the back of the - // quality string - r.qualBuf[qualsRead++] = ('!' + num); - // Skip over next stretch of whitespace - c = fb.peek(); - while(c != '\r' && c != '\n' && isspace(c) && !fb.eof()) { - fb.get(); - c = fb.peek(); + char cadd = charToPhred33(c, false, false); + if(++nqual > this->trim5_) { + r.qualBuf[qualoff++] = cadd; } + if(cur >= buflen) break; + c = ra.readOrigBuf[cur++]; } - } else { - while (c != '\r' && c != '\n' && c != -1) { - c = fb.get(); - r.qualBuf[qualsRead++] = charToPhred33(c, solexa64, phred64); - c = fb.peek(); - while(c != '\r' && c != '\n' && isspace(c) && !fb.eof()) { - fb.get(); - c = fb.peek(); - } - } - } - if (qualsRead < readLen-1 || - (qualsRead < readLen && !r.color)) - { - tooFewQualities(r.name); - } - qualsRead -= trim3; - if(qualsRead <= 0) return 0; - int trimmedReadLen = readLen-trim3-trim5; - if(trimmedReadLen < 0) trimmedReadLen = 0; - if(qualsRead > trimmedReadLen) { - // Shift everybody left - for(int i = 0; i < readLen; i++) { - r.qualBuf[i] = r.qualBuf[i+qualsRead-trimmedReadLen]; - } - } - _setLength(r.qual, trimmedReadLen); - while(fb.peek() == '\n' || fb.peek() == '\r') fb.get(); - return qualsRead; + if(nchar > nqual) { + tooFewQualities(r.name); + return false; + } else if(nqual > nchar) { + tooManyQualities(r.name); + return false; + } + r.qualBuf[seqoff] = '\0'; + _setBegin(r.qual, r.qualBuf); + _setLength(r.qual, seqan::length(r.patFw)); + assert(c == '\t' || c == '\n' || c == '\r' || cur >= buflen); + } + ra.parsed = true; + if(!rb.parsed && rb.readOrigBufLen > 0) { + return parse(rb, ra, rdid); + } + return true; +} + +/** + * Light-parse a FASTA batch into the given buffer. + */ +pair FastaPatternSource::nextBatchFromFile( + PerThreadReadBuf& pt, + bool batch_a) +{ + int c; + vector& readbuf = batch_a ? pt.bufa_ : pt.bufb_; + if(first_) { + c = getc_unlocked(fp_); + while(c == '\r' || c == '\n') { + c = getc_unlocked(fp_); + } + if(c != '>') { + cerr << "Error: reads file does not look like a FASTA file" << endl; + throw 1; + } + first_ = false; + } + bool done = false; + size_t readi = 0; + // Read until we run out of input or until we've filled the buffer + for(; readi < pt.max_buf_ && !done; readi++) { + readbuf[readi].readOrigBuf[0] = '>'; + size_t bufoff = 1; + while(true) { + c = getc_unlocked(fp_); + if(c < 0 || c == '>') { + done = c < 0; + break; + } + readbuf[readi].readOrigBuf[bufoff++] = c; + } + readbuf[readi].readOrigBufLen = bufoff; + } + return make_pair(done, readi); +} + +/** + * Finalize FASTA parsing outside critical section. + */ +bool FastaPatternSource::parse(Read& r, Read& rb, TReadId rdid) const { + // We assume the light parser has put the raw data for the separate ends + // into separate Read objects. That doesn't have to be the case, but + // that's how we've chosen to do it for FastqPatternSource + assert_gt(r.readOrigBufLen, 0); + assert(r.empty()); + int c = -1; + size_t cur = 1; + const size_t buflen = r.readOrigBufLen; + + // Parse read name + assert_eq(0, seqan::length(r.name)); + int nameoff = 0; + while(cur < buflen) { + c = r.readOrigBuf[cur++]; + if(c == '\n' || c == '\r') { + do { + c = r.readOrigBuf[cur++]; + } while((c == '\n' || c == '\r') && cur < buflen); + break; + } + r.nameBuf[nameoff++] = c; + } + if(cur >= buflen) { + return false; // FASTA ended prematurely + } + if(nameoff > 0) { + r.nameBuf[nameoff] = '\0'; + _setBegin(r.name, r.nameBuf); + _setLength(r.name, nameoff); + } + + // Parse sequence + int nchar = 0, seqoff = 0; + assert_eq(0, seqan::length(r.patFw)); + assert(c != '\n' && c != '\r'); + assert_lt(cur, buflen); + while(c != '\n' && cur < buflen) { + if(c == '.') { + c = 'N'; + } + if(isalpha(c)) { + // If it's past the 5'-end trim point + if(nchar++ >= this->trim5_) { + r.patBufFw[seqoff++] = charToDna5[c]; + } + } + assert_lt(cur, buflen); + c = r.readOrigBuf[cur++]; + } + r.patBufFw[seqoff] = '\0'; + _setBegin(r.patFw, (Dna5*)r.patBufFw); + // record amt trimmed from 5' end due to --trim5 + r.trimmed5 = (int)(nchar - seqoff); + // record amt trimmed from 3' end due to --trim3 + int trim3 = (seqoff < this->trim3_) ? seqoff : this->trim3_; + _setLength(r.patFw, seqoff - trim3); + r.trimmed3 = trim3; + + for(size_t i = 0; i < seqoff - trim3; i++) { + r.qualBuf[i] = 'I'; + } + r.qualBuf[seqoff - trim3] = '\0'; + _setBegin(r.qual, r.qualBuf); + _setLength(r.qual, nameoff); + + // Set up a default name if one hasn't been set + if(nameoff == 0) { + itoa10(static_cast(rdid), r.nameBuf); + _setBegin(r.name, r.nameBuf); + _setLength(r.name, strlen(r.nameBuf)); + } + r.parsed = true; + if(!rb.parsed && rb.readOrigBufLen > 0) { + return parse(rb, r, rdid); + } + return true; +} + +/** + * Light-parse a FASTA-continuous batch into the given buffer. + * This is trickier for FASTA-continuous than for other formats, + * for several reasons: + * + * 1. Reads are substrings of a longer FASTA input string + * 2. Reads may overlap w/r/t the longer FASTA string + * 3. Read names depend on the most recently observed FASTA + * record name + */ +pair FastaContinuousPatternSource::nextBatchFromFile( + PerThreadReadBuf& pt, + bool batch_a) +{ + int c = -1; + vector& readbuf = batch_a ? pt.bufa_ : pt.bufb_; + size_t readi = 0; + int nameoff = 0; + while(readi < pt.max_buf_) { + c = getc_unlocked(fp_); + if(c < 0) { + break; + } + if(c == '>') { + resetForNextFile(); + c = getc_unlocked(fp_); + bool sawSpace = false; + while(c != '\n' && c != '\r') { + if(!sawSpace) { + sawSpace = isspace(c); + } + if(!sawSpace) { + name_prefix_buf_[nameoff++] = c; + } + c = getc_unlocked(fp_); + } + while(c == '\n' || c == '\r') { + c = getc_unlocked(fp_); + } + if(c < 0) { + break; + } + name_prefix_buf_[nameoff++] = '_'; + } + int cat = asc2dnacat[c]; + if(cat >= 2) c = 'N'; + if(cat == 0) { + // Non-DNA, non-IUPAC char; skip + continue; + } else { + // DNA char + buf_[bufCur_++] = c; + if(bufCur_ == 1024) { + bufCur_ = 0; // wrap around circular buf + } + if(eat_ > 0) { + eat_--; + // Try to keep readCnt_ aligned with the offset + // into the reference; that lets us see where + // the sampling gaps are by looking at the read + // name + if(!beginning_) { + readCnt_++; + } + continue; + } + // install name + memcpy(readbuf[readi].readOrigBuf, name_prefix_buf_, nameoff); + itoa10(readCnt_ - subReadCnt_, name_int_buf_); + size_t bufoff = nameoff; + memcpy(readbuf[readi].readOrigBuf + bufoff, name_int_buf_, strlen(name_int_buf_)); + bufoff += strlen(name_int_buf_); + readbuf[readi].readOrigBuf[bufoff++] = '\t'; + // install sequence + for(size_t i = 0; i < length_; i++) { + if(length_ - i <= bufCur_) { + c = buf_[bufCur_ - (length_ - i)]; + } else { + // Rotate + c = buf_[bufCur_ - (length_ - i) + 1024]; + } + readbuf[readi].readOrigBuf[bufoff++] = 'c'; + } + readbuf[readi].readOrigBufLen = bufoff; + eat_ = freq_-1; + readCnt_++; + beginning_ = false; + readi++; + } + } + return make_pair(c < 0, readi); +} + +/** + * Finalize FASTA-continuous parsing outside critical section. + */ +bool FastaContinuousPatternSource::parse( + Read& ra, + Read& rb, + TReadId rdid) const +{ + // Light parser (nextBatchFromFile) puts unparsed data + // into Read& r, even when the read is paired. + assert(ra.empty()); + assert(rb.empty()); + assert_gt(ra.readOrigBufLen, 0); // raw data for read/pair is here + assert_eq(0, rb.readOrigBufLen); + int c = '\t'; + size_t cur = 0; + const size_t buflen = ra.readOrigBufLen; + + // Parse read name + assert_eq(0, seqan::length(ra.name)); + int nameoff = 0; + c = ra.readOrigBuf[cur++]; + while(c != '\t' && cur < buflen) { + ra.nameBuf[nameoff++] = c; + c = ra.readOrigBuf[cur++]; + } + assert_eq('\t', c); + if(cur >= buflen) { + return false; // record ended prematurely + } + ra.nameBuf[nameoff] = '\0'; + _setBegin(ra.name, ra.nameBuf); + _setLength(ra.name, nameoff); + + // Parse sequence + assert_eq(0, seqan::length(ra.patFw)); + c = ra.readOrigBuf[cur++]; + int nchar = 0, seqoff = 0; + while(cur < buflen) { + if(isalpha(c)) { + assert_in(toupper(c), "ACGTN"); + if(nchar++ >= this->trim5_) { + assert_neq(0, asc2dnacat[c]); + ra.patBufFw[seqoff++] = charToDna5[c]; // ascii to int + } + } + c = ra.readOrigBuf[cur++]; + } + ra.patBufFw[seqoff] = '\0'; + _setBegin(ra.patFw, (Dna5*)ra.patBufFw); + // record amt trimmed from 5' end due to --trim5 + ra.trimmed5 = (int)(nchar - seqoff); + // record amt trimmed from 3' end due to --trim3 + int trim3 = (seqoff < this->trim3_) ? seqoff : this->trim3_; + _setLength(ra.patFw, seqoff - trim3); + ra.trimmed3 = trim3; + + // Make fake qualities + assert_eq(0, seqan::length(ra.qual)); + int qualoff = 0; + for(size_t i = 0; i < seqoff; i++) { + ra.qualBuf[qualoff++] = 'I'; + } + ra.qualBuf[qualoff] = '\0'; + _setBegin(ra.qual, ra.qualBuf); + _setLength(ra.qual, qualoff); + ra.parsed = true; + return true; } /** @@ -105,9 +809,13 @@ int parseQuals(ReadBuf& r, * for lines worth of the input file into a buffer (r.readOrigBuf). finalize() * then parses the contents of r.readOrigBuf later. */ -pair FastqPatternSource::readLight(ReadBuf& r) { +pair FastqPatternSource::nextBatchFromFile( + PerThreadReadBuf& pt, + bool batch_a) +{ int c = 0; - size_t& len = r.readOrigBufLen; + vector& readBuf = batch_a ? pt.bufa_ : pt.bufb_; + size_t& len = readBuf[0].readOrigBufLen; len = 0; if(first_) { c = getc_unlocked(fp_); @@ -118,40 +826,66 @@ pair FastqPatternSource::readLight(ReadBuf& r) { cerr << "Error: reads file does not look like a FASTQ file" << endl; throw 1; } - r.readOrigBuf[len++] = c; + readBuf[0].readOrigBuf[len++] = c; assert_eq('@', c); first_ = false; } // Note: to reduce the number of times we have to enter the critical // section (each entrance has some assocaited overhead), we could populate // the buffer with several reads worth of data here, instead of just one. - int newlines = 4; - while(newlines) { - c = getc_unlocked(fp_); - if(c == '\n' || (c < 0 && newlines == 1)) { - newlines--; - c = '\n'; - } else if(c < 0) { - return make_pair(false, true); + bool done = false, aborted = false; + size_t readi = 0; + // Read until we run out of input or until we've filled the buffer + for(; readi < pt.max_buf_ && !done; readi++) { + char* buf = readBuf[readi].readOrigBuf; + assert(readi == 0 || strlen(buf) == 0); + if(readi > 0) + { + len = readBuf[readi].readOrigBufLen; + len = 0; } - r.readOrigBuf[len++] = c; + int newlines = 4; + while(newlines) { + c = getc_unlocked(fp_); + done = c < 0; + if(c == '\n' || (done && newlines == 1)) { + newlines--; + c = '\n'; + } else if(done) { + //return make_pair(false, true); + aborted = true; //Unexpected EOF + break; + } + buf[len++] = c; + } + readCnt_++; + readBuf[readi].patid = (uint32_t)(readCnt_-1); + } + if(aborted) { + readi--; + readCnt_--; } - readCnt_++; - r.patid = (uint32_t)(readCnt_-1); - return make_pair(true, c < 0); + return make_pair(done, readi); } -/// Read another pattern from a FASTQ input file -void FastqPatternSource::finalize(ReadBuf &r) const { +/** + * Finalize FASTQ parsing outside critical section. + */ +bool FastqPatternSource::parse(Read &r, Read& rb, TReadId rdid) const { + // We assume the light parser has put the raw data for the separate ends + // into separate Read objects. That doesn't have to be the case, but + // that's how we've chosen to do it for FastqPatternSource + assert_gt(r.readOrigBufLen, 0); + assert(r.empty()); int c; - size_t name_len = 0; size_t cur = 1; - r.color = color_; - r.primer = -1; + const size_t buflen = r.readOrigBufLen; // Parse read name + assert_eq(0, seqan::length(r.name)); + int nameoff = 0; while(true) { - assert(cur < r.readOrigBufLen); + assert_lt(cur, buflen); c = r.readOrigBuf[cur++]; if(c == '\n' || c == '\r') { do { @@ -159,103 +893,339 @@ void FastqPatternSource::finalize(ReadBuf &r) const { } while(c == '\n' || c == '\r'); break; } - r.nameBuf[name_len++] = c; + r.nameBuf[nameoff++] = c; } + r.nameBuf[nameoff] = '\0'; _setBegin(r.name, r.nameBuf); - _setLength(r.name, name_len); - - if(color_) { - // May be a primer character. If so, keep it 'primer' field - // of read buf and parse the rest of the read without it. -// c = toupper(c); -// if(asc2dnacat[c] > 0) { -// // First char is a DNA char -// int c2 = toupper(fb_.peek()); -// // Second char is a color char -// if(asc2colcat[c2] > 0) { -// r.primer = c; -// r.trimc = c2; -// mytrim5 += 2; // trim primer and first color -// } -// } -// if(c < 0) { bail(r); return; } - throw 1; - } + _setLength(r.name, nameoff); // Parse sequence - int nchar = 0; - uint8_t *seqbuf = r.patBufFw; + int nchar = 0, seqoff = 0; + assert_eq(0, seqan::length(r.patFw)); while(c != '+') { if(c == '.') { c = 'N'; } - if(color_ && c >= '0' && c <= '4') { - c = "ACGTN"[(int)c - '0']; - } if(isalpha(c)) { // If it's past the 5'-end trim point - if(nchar++ >= trim5_) { - *seqbuf++ = charToDna5[c]; + if(nchar++ >= this->trim5_) { + r.patBufFw[seqoff++] = charToDna5[c]; } } - assert(cur < r.readOrigBufLen); + assert_lt(cur, buflen); c = r.readOrigBuf[cur++]; } - int seq_len = (int)(seqbuf - r.patBufFw); - r.trimmed5 = (int)(nchar - seq_len); - r.trimmed3 = min(seq_len, trim3_); - seq_len = max(seq_len - trim3_, 0); _setBegin(r.patFw, (Dna5*)r.patBufFw); - _setLength(r.patFw, seq_len); - + // record amt trimmed from 5' end due to --trim5 + r.trimmed5 = (int)(nchar - seqoff); + // record amt trimmed from 3' end due to --trim3 + int trim3 = (seqoff < this->trim3_) ? seqoff : this->trim3_; + _setLength(r.patFw, seqoff - trim3); + r.patBufFw[seqan::length(r.patFw)] = '\0'; + r.trimmed3 = trim3; + assert_eq('+', c); do { - assert(cur < r.readOrigBufLen); + assert_lt(cur, buflen); c = r.readOrigBuf[cur++]; } while(c != '\n' && c != '\r'); - do { - assert(cur < r.readOrigBufLen); + while(cur < buflen && (c == '\n' || c == '\r')) { c = r.readOrigBuf[cur++]; - } while(c == '\n' || c == '\r'); + } // Now we're on the next non-blank line after the + line - if(seq_len == 0) { - return; // done parsing empty read + if(seqan::length(r.patFw) == 0) { + return true; // done parsing empty read } - int nqual = 0; - char *qualbuf = r.qualBuf; + assert_eq(0, seqan::length(r.qual)); + int nqual = 0, qualoff = 0; if (intQuals_) { - // TODO: must implement this for compatibility with other Bowtie - throw 1; // not yet implemented + int cur_int = 0; + while(c != '\t' && c != '\n' && c != '\r') { + cur_int *= 10; + cur_int += (int)(c - '0'); + c = r.readOrigBuf[cur++]; + if(c == ' ' || c == '\t' || c == '\n' || c == '\r') { + char cadd = intToPhred33(cur_int, solQuals_); + cur_int = 0; + assert_geq(cadd, 33); + if(++nqual > this->trim5_) { + r.qualBuf[qualoff++] = cadd; + } + } + } } else { - while(c != '\r' && c != '\n') { - c = charToPhred33(c, solQuals_, phred64Quals_); - if(c == ' ') { + c = charToPhred33(c, solQuals_, phred64Quals_); + if(nqual++ >= r.trimmed5) { + r.qualBuf[qualoff++] = c; + } + while(cur < buflen) { + c = r.readOrigBuf[cur++]; + if (c == ' ') { wrongQualityFormat(r.name); + return false; + } + if(c == '\r' || c == '\n') { + break; } + c = charToPhred33(c, solQuals_, phred64Quals_); if(nqual++ >= r.trimmed5) { - *qualbuf++ = c; + r.qualBuf[qualoff++] = c; } - c = r.readOrigBuf[cur++]; } - int qual_len = (int)(qualbuf - r.qualBuf); - qual_len = max(0, qual_len - r.trimmed3); - if(qual_len < seq_len) { + if(qualoff < seqoff) { tooFewQualities(r.name); - } else if(qual_len > seq_len+1) { + return false; + } else if(qualoff > seqoff) { tooManyQualities(r.name); + return false; } - _setBegin(r.qual, (char*)r.qualBuf); - _setLength(r.qual, seq_len); } + r.qualBuf[seqan::length(r.patFw)] = '\0'; + _setBegin(r.qual, r.qualBuf); + _setLength(r.qual, seqan::length(r.patFw)); + // Set up a default name if one hasn't been set - if(name_len == 0) { - itoa10((int)readCnt_, r.nameBuf); + if(seqan::length(r.name) == 0) { + itoa10(static_cast(readCnt_), r.nameBuf); + _setBegin(r.name, r.nameBuf); + _setLength(r.name, nameoff); + } + r.parsed = true; + if(!rb.parsed && rb.readOrigBufLen > 0) { + return parse(rb, r, rdid); + } + return true; +} + +/** + * Light-parse a batch of tabbed-format reads into given buffer. + */ +pair TabbedPatternSource::nextBatchFromFile( + PerThreadReadBuf& pt, + bool batch_a) +{ + int c = getc_unlocked(fp_); + while(c >= 0 && (c == '\n' || c == '\r')) { + c = getc_unlocked(fp_); + } + vector& readbuf = batch_a ? pt.bufa_ : pt.bufb_; + size_t readi = 0; + // Read until we run out of input or until we've filled the buffer + for(; readi < pt.max_buf_ && c >= 0; readi++) { + readbuf[readi].readOrigBufLen = 0; + while(c >= 0 && c != '\n' && c != '\r') { + readbuf[readi].readOrigBuf[readbuf[readi].readOrigBufLen++] = c; + c = getc_unlocked(fp_); + } + while(c >= 0 && (c == '\n' || c == '\r')) { + c = getc_unlocked(fp_); + } + } + return make_pair(c < 0, readi); +} + +/** + * Finalize tabbed parsing outside critical section. + */ +bool TabbedPatternSource::parse(Read& ra, Read& rb, TReadId rdid) const { + // Light parser (nextBatchFromFile) puts unparsed data + // into Read& r, even when the read is paired. + assert(ra.empty()); + assert(rb.empty()); + assert_gt(ra.readOrigBufLen, 0); // raw data for read/pair is here + assert_eq(0, rb.readOrigBufLen); + int c = '\t'; + size_t cur = 0; + const size_t buflen = ra.readOrigBufLen; + bool paired = false; + + // Loop over the two ends + for(int endi = 0; endi < 2 && c == '\t'; endi++) { + Read& r = ((endi == 0) ? ra : rb); + assert_eq(0, seqan::length(r.name)); + // Parse name if (a) this is the first end, or + // (b) this is tab6 + int nameoff = 0; + if(endi < 1 || secondName_) { + // Parse read name + c = ra.readOrigBuf[cur++]; + while(c != '\t' && cur < buflen) { + r.name[nameoff++] = c; + c = ra.readOrigBuf[cur++]; + } + assert_eq('\t', c); + if(cur >= buflen) { + return false; // record ended prematurely + } + } else if(endi > 0) { + // if this is the second end and we're parsing + // tab5, copy name from first end + rb.name = ra.name; + } + r.nameBuf[nameoff] = '\0'; _setBegin(r.name, r.nameBuf); - name_len = (int)strlen(r.nameBuf); - _setLength(r.name, name_len); + _setLength(r.name, nameoff); + + paired = endi > 0; + + // Parse sequence + assert_eq(0, seqan::length(r.patFw)); + c = ra.readOrigBuf[cur++]; + int nchar = 0, seqoff = 0; + while(c != '\t' && cur < buflen) { + if(isalpha(c)) { + assert_in(toupper(c), "ACGTN"); + if(nchar++ >= this->trim5_) { + assert_neq(0, asc2dnacat[c]); + r.patBufFw[seqoff++] = charToDna5[c]; + } + } + c = ra.readOrigBuf[cur++]; + } + assert_eq('\t', c); + if(cur >= buflen) { + return false; // record ended prematurely + } + _setBegin(r.patFw, (Dna5*)r.patBufFw); + // record amt trimmed from 5' end due to --trim5 + r.trimmed5 = (int)(nchar - seqoff); + // record amt trimmed from 3' end due to --trim3 + int trim3 = (seqoff < this->trim3_) ? seqoff : this->trim3_; + _setLength(r.patFw, seqoff - trim3); + r.patBufFw[seqan::length(r.patFw)] = '\0'; + r.trimmed3 = trim3; + + // Parse qualities + assert_eq(0, seqan::length(r.qual)); + c = ra.readOrigBuf[cur++]; + int nqual = 0, qualoff = 0; + if (intQuals_) { + int cur_int = 0; + while(c != '\t' && c != '\n' && c != '\r' && cur < buflen) { + cur_int *= 10; + cur_int += (int)(c - '0'); + c = ra.readOrigBuf[cur++]; + if(c == ' ' || c == '\t' || c == '\n' || c == '\r') { + char cadd = intToPhred33(cur_int, solQuals_); + cur_int = 0; + assert_geq(cadd, 33); + if(++nqual > this->trim5_) { + r.qualBuf[qualoff++] = cadd; + } + } + } + } else { + while(c != '\t' && c != '\n' && c != '\r') { + if(c == ' ') { + wrongQualityFormat(r.name); + return false; + } + char cadd = charToPhred33(c, solQuals_, phred64Quals_); + if(++nqual > this->trim5_) { + r.qualBuf[qualoff++] = cadd; + } + if(cur >= buflen) break; + c = ra.readOrigBuf[cur++]; + } + } + if(nchar > nqual) { + tooFewQualities(r.name); + return false; + } else if(nqual > nchar) { + tooManyQualities(r.name); + return false; + } + r.qualBuf[seqan::length(r.patFw)] = '\0'; + _setBegin(r.qual, r.qualBuf); + _setLength(r.qual, seqan::length(r.patFw)); + assert(c == '\t' || c == '\n' || c == '\r' || cur >= buflen); + } + ra.parsed = true; + rb.parsed = paired; + return true; +} + +/** + * Light-parse a batch of raw-format reads into given buffer. + */ +pair RawPatternSource::nextBatchFromFile( + PerThreadReadBuf& pt, + bool batch_a) +{ + int c = getc_unlocked(fp_); + while(c >= 0 && (c == '\n' || c == '\r')) { + c = getc_unlocked(fp_); + } + vector& readbuf = batch_a ? pt.bufa_ : pt.bufb_; + size_t readi = 0; + // Read until we run out of input or until we've filled the buffer + for(; readi < pt.max_buf_ && c >= 0; readi++) { + readbuf[readi].readOrigBufLen = 0; + while(c >= 0 && c != '\n' && c != '\r') { + readbuf[readi].readOrigBuf[readbuf[readi].readOrigBufLen++] = c; + c = getc_unlocked(fp_); + } + while(c >= 0 && (c == '\n' || c == '\r')) { + c = getc_unlocked(fp_); + } + } + return make_pair(c < 0, readi); +} + +/** + * Finalize raw parsing outside critical section. + */ +bool RawPatternSource::parse(Read& r, Read& rb, TReadId rdid) const { + assert(r.empty()); + assert_gt(r.readOrigBufLen, 0); + int c = '\n'; + size_t cur = 0; + const size_t buflen = r.readOrigBufLen; + + // Parse sequence + assert_eq(0, seqan::length(r.patFw)); + int nchar = 0, seqoff = 0; + while(cur < buflen) { + c = r.readOrigBuf[cur++]; + assert(c != '\r' && c != '\n'); + if(isalpha(c)) { + assert_in(toupper(c), "ACGTN"); + if(nchar++ >= this->trim5_) { + assert_neq(0, asc2dnacat[c]); + r.patBufFw[seqoff++] = charToDna5[c]; + } + } + } + _setBegin(r.patFw, (Dna5*)r.patBufFw); + // record amt trimmed from 5' end due to --trim5 + r.trimmed5 = (int)(nchar - seqoff); + // record amt trimmed from 3' end due to --trim3 + int trim3 = (seqoff < this->trim3_) ? seqoff : this->trim3_; + _setLength(r.patFw, seqoff - trim3); + r.patBufFw[seqan::length(r.patFw)] = '\0'; + r.trimmed3 = trim3; + + // Give the name field a dummy value + itoa10(rdid, r.nameBuf); + _setBegin(r.name, r.nameBuf); + _setLength(r.name, strlen(r.nameBuf)); + + // Give the base qualities dummy values + assert_eq(0, seqan::length(r.qual)); + const size_t len = seqan::length(r.patFw); + for(size_t i = 0; i < len; i++) { + r.qualBuf[i] = 'I'; } + _setBegin(r.qual, r.qualBuf); + _setLength(r.qual, seqan::length(r.patFw)); + + r.parsed = true; + if(!rb.parsed && rb.readOrigBufLen > 0) { + return parse(rb, r, rdid); + } + return true; } void wrongQualityFormat(const String& read_name) { @@ -283,24 +1253,3 @@ void tooManySeqChars(const String& read_name) { << "Offending read: " << read_name << endl; throw 1; } - -/** - * C++ version char* style "itoa": - */ -char* itoa10(int value, char* result) { - // Check that base is valid - char* out = result; - int quotient = value; - do { - *out = "0123456789"[ std::abs( quotient % 10 ) ]; - ++out; - quotient /= 10; - } while ( quotient ); - - // Only apply negative sign for base 10 - if (value < 0) *out++ = '-'; - std::reverse( result, out ); - - *out = 0; // terminator - return out; -} diff --git a/pat.h b/pat.h index 8548077..c3baa70 100644 --- a/pat.h +++ b/pat.h @@ -27,61 +27,91 @@ using namespace std; using namespace seqan; -/// Constructs string base-10 representation of integer 'value' -extern char* itoa10(int value, char* result); +/** + * C++ version char* style "itoa": + */ +template +char* itoa10(const T& value, char* result) { + // Check that base is valid + char* out = result; + T quotient = value; + if(std::numeric_limits::is_signed) { + if(quotient <= 0) quotient = -quotient; + } + // Now write each digit from most to least significant + do { + *out = "0123456789"[quotient % 10]; + ++out; + quotient /= 10; + } while (quotient > 0); + // Only apply negative sign for base 10 + if(std::numeric_limits::is_signed) { + // Avoid compiler warning in cases where T is unsigned + if (value <= 0 && value != 0) *out++ = '-'; + } + reverse( result, out ); + *out = 0; // terminator + return out; +} typedef uint64_t TReadId; /** - * Calculate a per-read random seed based on a combination of - * the read data (incl. sequence, name, quals) and the global - * seed in '_randSeed'. + * Parameters affecting how reads and read in. + * Note: Bowtie 2 uses this but Bowtie doesn't yet. */ -static inline uint32_t genRandSeed(const String& qry, - const String& qual, - const String& name, - uint32_t seed) -{ - // Calculate a per-read random seed based on a combination of - // the read data (incl. sequence, name, quals) and the global - // seed - uint32_t rseed = (seed + 101) * 59 * 61 * 67 * 71 * 73 * 79 * 83; - size_t qlen = seqan::length(qry); - // Throw all the characters of the read into the random seed - for(size_t i = 0; i < qlen; i++) { - int p = (int)qry[i]; - assert_leq(p, 4); - size_t off = ((i & 15) << 1); - rseed ^= (p << off); - } - // Throw all the quality values for the read into the random - // seed - for(size_t i = 0; i < qlen; i++) { - int p = (int)qual[i]; - assert_leq(p, 255); - size_t off = ((i & 3) << 3); - rseed ^= (p << off); - } - // Throw all the characters in the read name into the random - // seed - size_t namelen = seqan::length(name); - for(size_t i = 0; i < namelen; i++) { - int p = (int)name[i]; - assert_leq(p, 255); - size_t off = ((i & 3) << 3); - rseed ^= (p << off); - } - return rseed; -} +struct PatternParams { + + PatternParams() { } + + PatternParams( + int format_, + bool fileParallel_, + uint32_t seed_, + size_t max_buf_, + bool solexa64_, + bool phred64_, + bool intQuals_, + int sampleLen_, + int sampleFreq_, + size_t skip_, + int nthreads_, + bool fixName_) : + format(format_), + fileParallel(fileParallel_), + seed(seed_), + max_buf(max_buf_), + solexa64(solexa64_), + phred64(phred64_), + intQuals(intQuals_), + sampleLen(sampleLen_), + sampleFreq(sampleFreq_), + skip(skip_), + nthreads(nthreads_), + fixName(fixName_) { } + + int format; // file format + bool fileParallel; // true -> wrap files with separate PatternComposers + uint32_t seed; // pseudo-random seed + size_t max_buf; // number of reads to buffer in one read + bool solexa64; // true -> qualities are on solexa64 scale + bool phred64; // true -> qualities are on phred64 scale + bool intQuals; // true -> qualities are space-separated numbers + int sampleLen; // length of sampled reads for FastaContinuous... + int sampleFreq; // frequency of sampled reads for FastaContinuous... + size_t skip; // skip the first 'skip' patterns + int nthreads; // number of threads for locking + bool fixName; // +}; /** * A buffer for keeping all relevant information about a single read. * Each search thread has one. */ -struct ReadBuf { - ReadBuf() { reset(); } +struct Read { + Read() { reset(); } - ~ReadBuf() { + ~Read() { clearAll(); reset(); // Prevent seqan from trying to free buffers _setBegin(patFw, NULL); @@ -107,6 +137,7 @@ struct ReadBuf { primer = '?'; trimc = '?'; seed = 0; + parsed = false; RESET_BUF(patFw, patBufFw, Dna5); RESET_BUF(patRc, patBufRc, Dna5); RESET_BUF(qual, qualBuf, char); @@ -124,6 +155,7 @@ struct ReadBuf { seqan::clear(patRcRev); seqan::clear(qualRev); seqan::clear(name); + parsed = false; trimmed5 = trimmed3 = 0; readOrigBufLen = 0; qualOrigBufLen = 0; @@ -254,6 +286,7 @@ struct ReadBuf { String name; // read name char nameBuf[BUF_SIZE]; // read name buffer + bool parsed; // whether read has been fully parsed uint32_t patid; // unique 0-based id based on order in read file(s) int mate; // 0 = single-end, 1 = mate1, 2 = mate2 uint32_t seed; // random seed @@ -265,17 +298,6 @@ struct ReadBuf { HitSet hitset; // holds previously-found hits; for chaining }; -struct BoolTriple { - - BoolTriple() : a(false), b(false), c(false) { } - - BoolTriple(bool a_, bool b_, bool c_) : a(a_), b(b_), c(c_) { } - - bool a; - bool b; - bool c; -}; - /** * All per-thread storage for input read data. */ @@ -292,11 +314,11 @@ struct PerThreadReadBuf { reset(); } - ReadBuf& read_a() { return bufa_[cur_buf_]; } - ReadBuf& read_b() { return bufb_[cur_buf_]; } + Read& read_a() { return bufa_[cur_buf_]; } + Read& read_b() { return bufb_[cur_buf_]; } - const ReadBuf& read_a() const { return bufa_[cur_buf_]; } - const ReadBuf& read_b() const { return bufb_[cur_buf_]; } + const Read& read_a() const { return bufa_[cur_buf_]; } + const Read& read_b() const { return bufb_[cur_buf_]; } /** * Return read id for read/pair currently in the buffer. @@ -350,8 +372,8 @@ struct PerThreadReadBuf { } const size_t max_buf_; // max # reads to read into buffer at once - vector bufa_; // Read buffer for mate as - vector bufb_; // Read buffer for mate bs + vector bufa_; // Read buffer for mate as + vector bufb_; // Read buffer for mate bs size_t cur_buf_; // Read buffer currently active TReadId rdid_; // index of read at offset 0 of bufa_/bufb_ }; @@ -367,18 +389,11 @@ struct PerThreadReadBuf { */ class PatternSource { public: - PatternSource(uint32_t seed, - bool randomizeQuals = false, - const char *dumpfile = NULL, - bool verbose = false) : - seed_(seed), + PatternSource( + const char *dumpfile = NULL) : readCnt_(0), dumpfile_(dumpfile), - numWrappers_(0), - doLocking_(true), - randomizeQuals_(randomizeQuals), - mutex_m(), - verbose_(verbose) + mutex() { // Open dumpfile, if specified if(dumpfile_ != NULL) { @@ -393,40 +408,6 @@ class PatternSource { virtual ~PatternSource() { } /** - * Call this whenever this PatternSource is wrapped by a new - * WrappedPatternSourcePerThread. This helps us keep track of - * whether locks will be contended. - */ - //BP: remove? - void addWrapper() { - ThreadSafe ts(&mutex_m); - numWrappers_++; - } - - /** - * Implementation to be provided by concrete subclasses. An - * implementation for this member is only relevant for formats that - * can read in a pair of reads in a single transaction with a - * single input source. If paired-end input is given as a pair of - * parallel files, this member should throw an error and exit. - */ - virtual pair nextReadPair(ReadBuf& ra, ReadBuf& rb) = 0; - - /** - * Implementation to be provided by concrete subclasses. An - * implementation for this member is only relevant for formats - * where individual input sources look like single-end-read - * sources, e.g., formats where paired-end reads are specified in - * parallel read files. - */ - virtual bool nextRead(ReadBuf& r) = 0; - - /** - * Finishes parsing outside the critical section - */ - virtual void finalize(ReadBuf& r) const = 0; - - /** * Implementation to be provided by concrete subclasses. An * implementation for this member is only relevant for formats * where individual input sources look like single-end-read @@ -441,7 +422,7 @@ class PatternSource { /** * Finishes parsing a given read. Happens outside the critical section. */ - virtual bool parse(ReadBuf& ra, ReadBuf& rb) const = 0; + virtual bool parse(Read& ra, Read& rb, TReadId rdid) const = 0; /// Reset state to start over again with the first read virtual void reset() { readCnt_ = 0; } @@ -449,40 +430,14 @@ class PatternSource { /** * Return the number of reads attempted. */ - //BP: changed, but should I have? - uint64_t readCount() const { return readCnt_; } + TReadId readCount() const { return readCnt_; } protected: /** - * Mix up the quality values for ReadBuf r. There's probably a - * more (pseudo-)randomly rigorous way to do this; the output looks - * pretty cyclic. - */ - void randomizeQuals(ReadBuf& r) { - const size_t rlen = r.length(); - for(size_t i = 0; i < rlen; i++) { - if(i < rlen-1) { - r.qual[i] *= (r.qual[i+1] + 7); - } - if(i > 0) { - r.qual[i] *= (r.qual[i-1] + 11); - } - // A user says that g++ complained here about "comparison - // is always false due to limited range of data type", but - // I can't see why. I added the (int) cast to try to avoid - // the warning. - if((int)r.qual[i] < 0) r.qual[i] = -(r.qual[i]+1); - r.qual[i] %= 41; - assert_leq(r.qual[i], 40); - r.qual[i] += 33; - } - } - - /** * Dump the contents of the ReadBuf to the dump file. */ - void dumpBuf(const ReadBuf& r) { + void dumpBuf(const Read& r) { assert(dumpfile_ != NULL); dump(out_, r.patFw, empty(r.qual) ? String("(empty)") : r.qual, @@ -504,1318 +459,253 @@ class PatternSource { out << name << ": " << seq << " " << qual << endl; } - uint32_t seed_; - /// The number of reads read by this PatternSource volatile uint64_t readCnt_; const char *dumpfile_; /// dump patterns to this file before returning them ofstream out_; /// output stream for dumpfile - int numWrappers_; /// # threads that own a wrapper for this PatternSource - bool doLocking_; /// override whether to lock (true = don't override) - bool randomizeQuals_; /// true -> mess up qualities in a random way - MUTEX_T mutex_m; /// mutex for locking critical regions - bool verbose_; + + /// Lock enforcing mutual exclusion for (a) file I/O, (b) writing fields + /// of this or another other shared object. + MUTEX_T mutex; }; /** - * Abstract parent class for synhconized sources of paired-end reads - * (and possibly also single-end reads). + * Encapsualtes a source of patterns where each raw pattern is trimmed + * by some user-defined amount on the 3' and 5' ends. Doesn't + * implement the actual trimming - that's up to the concrete + * descendants. */ -class PatternComposer { +class TrimmingPatternSource : public PatternSource { public: - PatternComposer(uint32_t seed) { - seed_ = seed; - } - virtual ~PatternComposer() { } - - virtual void reset() = 0; - - //virtual pair nextReadPair(ReadBuf& ra, ReadBuf& rb) = 0; - - /** - * Member function override by concrete, format-specific classes. - */ - virtual std::pair nextBatch(PerThreadReadBuf& pt) = 0; - - /** - * Make appropriate call into the format layer to parse individual read. - */ - virtual bool parse(ReadBuf& ra, ReadBuf& rb) = 0; - - virtual void free_pmembers( const vector &elist) { - for (size_t i = 0; i < elist.size(); i++) { - if (elist[i] != NULL) - delete elist[i]; - } - } - + TrimmingPatternSource(const char *dumpfile = NULL, + int trim3 = 0, + int trim5 = 0) : + PatternSource(dumpfile), + trim3_(trim3), trim5_(trim5) { } protected: - - MUTEX_T mutex_m; /// mutex for locking critical regions - uint32_t seed_; + int trim3_; + int trim5_; }; +extern void wrongQualityFormat(const String& read_name); +extern void tooFewQualities(const String& read_name); +extern void tooManyQualities(const String& read_name); +extern void tooManySeqChars(const String& read_name); + /** - * Encapsulates a synchronized source of both paired-end reads and - * unpaired reads, where the paired-end must come from parallel files. + * Encapsulates a source of patterns which is an in-memory vector. */ -class SoloPatternComposer : public PatternComposer { - +class VectorPatternSource : public TrimmingPatternSource { public: - - SoloPatternComposer(const vector& src, - uint32_t seed) : - PatternComposer(seed), cur_(0), src_(src) - { - for(size_t i = 0; i < src_.size(); i++) { - assert(src_[i] != NULL); - } - } - + VectorPatternSource( + const vector& v, + bool color, + const char *dumpfile = NULL, + int trim3 = 0, + int trim5 = 0, + uint32_t skip = 0); + + virtual ~VectorPatternSource() { } + /** - * Reset this object and all the PatternSources under it so that - * the next call to nextReadPair gets the very first read pair. + * Read next batch. However, batch concept is not very applicable for this + * PatternSource where all the info has already been parsed into the fields + * in the contsructor. This essentially modifies the pt as though we read + * in some number of patterns. */ - virtual void reset() { - for(size_t i = 0; i < src_.size(); i++) { - src_[i]->reset(); - } - cur_ = 0; - } + virtual pair nextBatch( + PerThreadReadBuf& pt, + bool batch_a, + bool lock = true); /** - * The main member function for dispensing pairs of reads or - * singleton reads. Returns true iff ra and rb contain a new - * pair; returns false if ra contains a new unpaired read. + * Reset so that next call to nextBatch* gets the first batch. */ - pair nextBatch(PerThreadReadBuf& pt) { - uint32_t cur = cur_; - while(cur < src_.size()) { - // Patterns from srca_[cur_] are unpaired - pair res; - do { - res = src_[cur]->nextBatch( - pt, - true, // batch A (or pairs) - true); // grab lock below - } while(!res.first && res.second == 0); - if(res.second == 0) { - ThreadSafe ts(&mutex_m); - if(cur + 1 > cur_) { - cur_++; - } - cur = cur_; - continue; // on to next pair of PatternSources - } - return res; - } - assert_leq(cur, src_.size()); - return make_pair(true, 0); + virtual void reset() { + TrimmingPatternSource::reset(); + cur_ = skip_; + paired_ = false; } /** - * Make appropriate call into the format layer to parse individual read. + * Finishes parsing outside the critical section */ - virtual bool parse(ReadBuf& ra, ReadBuf& rb) { - return src_[0]->parse(ra, rb); - } + virtual bool parse(Read& ra, Read& rb, TReadId rdid) const; -protected: +private: - volatile uint32_t cur_; // current element in parallel srca_, srcb_ vectors - vector src_; /// PatternSources for paired-end reads + bool color_; // colorspace? + size_t cur_; // index for first read of next batch + uint32_t skip_; // # reads to skip + bool paired_; // whether reads are paired + std::vector tokbuf_; // buffer for storing parsed tokens + std::vector bufs_; // per-read buffers + char nametmp_[20]; // temp buffer for constructing name }; /** - * Encapsulates a synchronized source of both paired-end reads and - * unpaired reads, where the paired-end must come from parallel files. + * Parent class for PatternSources that read from a file. + * Uses unlocked C I/O, on the assumption that all reading + * from the file will take place in an otherwise-protected + * critical section. */ -class DualPatternComposer : public PatternComposer { - +class CFilePatternSource : public TrimmingPatternSource { public: - - DualPatternComposer(const vector& srca, - const vector& srcb, - uint32_t seed) : - PatternComposer(seed), cur_(0), srca_(srca), srcb_(srcb) + CFilePatternSource( + const vector& infiles, + const vector* qinfiles, + const char *dumpfile = NULL, + int trim3 = 0, + int trim5 = 0, + uint32_t skip = 0) : + TrimmingPatternSource(dumpfile, trim3, trim5), + infiles_(infiles), + filecur_(0), + fp_(NULL), + qfp_(NULL), + is_open_(false), + skip_(skip), + first_(true) { - // srca_ and srcb_ must be parallel - assert_eq(srca_.size(), srcb_.size()); - for(size_t i = 0; i < srca_.size(); i++) { - // Can't have NULL first-mate sources. Second-mate sources - // can be NULL, in the case when the corresponding first- - // mate source is unpaired. - assert(srca_[i] != NULL); - for(size_t j = 0; j < srcb_.size(); j++) { - assert_neq(srca_[i], srcb_[j]); - } + qinfiles_.clear(); + if(qinfiles != NULL) qinfiles_ = *qinfiles; + assert_gt(infiles.size(), 0); + errs_.resize(infiles_.size(), false); + if(qinfiles_.size() > 0 && + qinfiles_.size() != infiles_.size()) + { + cerr << "Error: Different numbers of input FASTA/quality files (" + << infiles_.size() << "/" << qinfiles_.size() << ")" << endl; + throw 1; } + open(); // open first file in the list + filecur_++; } - /** - * Reset this object and all the PatternSources under it so that - * the next call to nextReadPair gets the very first read pair. - */ - virtual void reset() { - for(size_t i = 0; i < srca_.size(); i++) { - srca_[i]->reset(); - if(srcb_[i] != NULL) { - srcb_[i]->reset(); + virtual ~CFilePatternSource() { + if(is_open_) { + assert(fp_ != NULL); + fclose(fp_); + fp_ = NULL; + if(qfp_ != NULL) { + fclose(qfp_); + qfp_ = NULL; } } - cur_ = 0; } /** - * The main member function for dispensing pairs of reads or - * singleton reads. Returns true iff ra and rb contain a new - * pair; returns false if ra contains a new unpaired read. + * Fill Read with the sequence, quality and name for the next + * read in the list of read files. This function gets called by + * all the search threads, so we must handle synchronization. + * + * Returns pair where bool indicates whether we're + * completely done, and int indicates how many reads were read. */ - pair nextBatch(PerThreadReadBuf& pt) { - // 'cur' indexes the current pair of PatternSources - uint32_t cur = cur_; - while(cur < srca_.size()) { - if(srcb_[cur] == NULL) { - pair res = srca_[cur]->nextBatch( - pt, - true, // batch A (or pairs) - true); // grab lock below - bool done = res.first; - if(!done && res.second == 0) { - ThreadSafe ts(&mutex_m); - if(cur + 1 > cur_) cur_++; - cur = cur_; // Move on to next PatternSource - continue; // on to next pair of PatternSources - } - return make_pair(done, res.second); - } else { - pair resa, resb; - // Lock to ensure that this thread gets parallel reads - // in the two mate files - { - ThreadSafe ts(&mutex_m); - resa = srca_[cur]->nextBatch( - pt, - true, // batch A - false); // don't grab lock below - resb = srcb_[cur]->nextBatch( - pt, - false, // batch B - false); // don't grab lock below - assert_eq(srca_[cur]->readCount(), - srcb_[cur]->readCount()); - } - if(resa.second < resb.second) { - cerr << "Error, fewer reads in file specified with -1 " - << "than in file specified with -2" << endl; - throw 1; - } else if(resa.second == 0 && resb.second == 0) { - ThreadSafe ts(&mutex_m); - if(cur + 1 > cur_) { - cur_++; - } - cur = cur_; // Move on to next PatternSource - continue; // on to next pair of PatternSources - } else if(resb.second < resa.second) { - cerr << "Error, fewer reads in file specified with -2 " - << "than in file specified with -1" << endl; - throw 1; - } - assert_eq(resa.first, resb.first); - assert_eq(resa.second, resb.second); - return make_pair(resa.first, resa.second); - } - } - assert_leq(cur, srca_.size()); - return make_pair(true, 0); - } + virtual pair nextBatch( + PerThreadReadBuf& pt, + bool batch_a, + bool lock); /** - * Make appropriate call into the format layer to parse individual read. + * Reset so that next call to nextBatch* gets the first batch. + * Should only be called by the master thread. */ - virtual bool parse(ReadBuf& ra, ReadBuf& rb) { - return srca_[0]->parse(ra, rb); - } - - -protected: - - volatile uint32_t cur_; // current element in parallel srca_, srcb_ vectors - vector srca_; /// PatternSources for 1st mates and/or unpaired reads - vector srcb_; /// PatternSources for 2nd mates -}; - -/** - * Encapsulates a single thread's interaction with the PatternSource. - * Most notably, this class holds the buffers into which the - * PatterSource will write sequences. This class is *not* threadsafe - * - it doesn't need to be since there's one per thread. PatternSource - * is thread-safe. - */ -class PatternSourcePerThread { - -public: - - PatternSourcePerThread(PatternComposer& patsrc, uint32_t max_buf, uint32_t seed) : - patsrc_(patsrc), buf_(max_buf), - last_batch_(false), last_batch_size_(0), seed_(seed) { } - - uint32_t patid() const { - return buf_.read_a().patid; - } - virtual void reset() { - buf_.reset(); - } - - bool empty() const { - return buf_.read_a().empty(); + TrimmingPatternSource::reset(); + filecur_ = 0, + open(); + filecur_++; } - uint32_t length(int mate) const { - return (mate == 1)? buf_.read_a().length() : buf_.read_b().length(); - } +protected: /** - * Return true iff the buffers jointly contain a paired-end read. + * Light-parse a batch of unpaired reads from current file into the given + * buffer. Called from CFilePatternSource.nextBatch(). */ - bool paired() { - bool ret = !(buf_.read_b().empty()); - assert(!ret || !empty()); - return ret; - } + virtual std::pair nextBatchFromFile( + PerThreadReadBuf& pt, + bool batch_a) = 0; /** - * Get the next paired or unpaired read from the wrapped - * PatternComposer. Returns a pair of bools; first indicates - * whether we were successful, second indicates whether we're - * done. - */ - pair nextReadPair() { - // Prepare batch - if(buf_.exhausted()) { - pair res = nextBatch(); - if(res.first && res.second == 0) { - return make_pair(false, true); - } - last_batch_ = res.first; - last_batch_size_ = res.second; - assert_eq(0, buf_.cur_buf_); - } else { - buf_.next(); // advance cursor - assert_gt(buf_.cur_buf_, 0); - } - // Parse read/pair - assert(strlen(buf_.read_a().readOrigBuf) != 0); - assert(buf_.read_a().empty()); - if(!parse(buf_.read_a(), buf_.read_b())) { - return make_pair(false, false); - } - // Finalize read/pair - if(strlen(buf_.read_b().readOrigBuf) != 0) { - finalizePair(buf_.read_a(), buf_.read_b()); - } else { - finalize(buf_.read_a()); - } - bool this_is_last = buf_.cur_buf_ == last_batch_size_-1; - return make_pair(true, this_is_last ? last_batch_ : false); - } - virtual void finalize(ReadBuf& ra) { - ra.mate = 1; - //ra.patid = buf_.rdid(); - ra.constructRevComps(); - ra.constructReverses(); - ra.seed = genRandSeed(ra.patFw, ra.qual, ra.name, seed_); - //ra.fixMateName(1); - } - - - virtual void finalizePair(ReadBuf& ra, ReadBuf& rb) { - //ra.patid = rb.patid = buf_.rdid(); - - ra.mate = 1; - ra.constructRevComps(); - ra.constructReverses(); - ra.fixMateName(1); - ra.seed = genRandSeed(ra.patFw, ra.qual, ra.name, seed_); - - rb.mate = 2; - rb.constructRevComps(); - rb.constructReverses(); - rb.fixMateName(2); - rb.seed = genRandSeed(rb.patFw, rb.qual, rb.name, seed_); - } - - ReadBuf& bufa() { return buf_.read_a(); } - ReadBuf& bufb() { return buf_.read_b(); } - - const ReadBuf& bufa() const { return buf_.read_a(); } - const ReadBuf& bufb() const { return buf_.read_b(); } - - -private: - - /** - * When we've finished fully parsing and dishing out reads in - * the current batch, we go get the next one by calling into - * the composition layer. - */ - std::pair nextBatch() { - buf_.reset(); - std::pair res = patsrc_.nextBatch(buf_); - buf_.init(); - return res; - } - - /** - * Call into composition layer (which in turn calls into - * format layer) to parse the read. - */ - bool parse(ReadBuf& ra, ReadBuf& rb) { - return patsrc_.parse(ra, rb); - } - - PatternComposer& patsrc_; // pattern composer - PerThreadReadBuf buf_; // read data buffer - bool last_batch_; // true if this is final batch - int last_batch_size_; // # reads read in previous batch - uint32_t seed_; -}; - -/** - * Abstract parent factory for PatternSourcePerThreads. - */ -class PatternSourcePerThreadFactory { -public: - PatternSourcePerThreadFactory( - PatternComposer& patsrc, uint32_t max_buf, uint32_t seed): - patsrc_(patsrc), max_buf_(max_buf), seed_(seed) {} - - virtual PatternSourcePerThread* create() const { - return new PatternSourcePerThread(patsrc_, max_buf_, seed_); - } - - /** - * Create a new heap-allocated vector of heap-allocated - * WrappedPatternSourcePerThreads. - */ - virtual std::vector* create(uint32_t n) const { - std::vector* v = new std::vector; - for(size_t i = 0; i < n; i++) { - v->push_back(new PatternSourcePerThread(patsrc_, max_buf_, seed_)); - assert(v->back() != NULL); - } - return v; - } - - /// Free memory associated with a pattern source - virtual void destroy(PatternSourcePerThread* patsrc) const { - assert(patsrc != NULL); - // Free the PatternSourcePerThread - delete patsrc; - } - - /// Free memory associated with a pattern source list - virtual void destroy(std::vector* patsrcs) const { - assert(patsrcs != NULL); - // Free all of the PatternSourcePerThreads - for(size_t i = 0; i < patsrcs->size(); i++) { - if((*patsrcs)[i] != NULL) { - delete (*patsrcs)[i]; - (*patsrcs)[i] = NULL; - } - } - // Free the vector - delete patsrcs; - } -private: - /// Container for obtaining paired reads from PatternSources - PatternComposer& patsrc_; - /// Maximum size of batch to read in - uint32_t max_buf_; - uint32_t seed_; -}; - -/** - * Encapsualtes a source of patterns where each raw pattern is trimmed - * by some user-defined amount on the 3' and 5' ends. Doesn't - * implement the actual trimming - that's up to the concrete - * descendants. - */ -class TrimmingPatternSource : public PatternSource { -public: - TrimmingPatternSource(uint32_t seed, - bool randomizeQuals = false, - const char *dumpfile = NULL, - bool verbose = false, - int trim3 = 0, - int trim5 = 0) : - PatternSource(seed, randomizeQuals, dumpfile, verbose), - trim3_(trim3), trim5_(trim5) { } -protected: - int trim3_; - int trim5_; -}; - -/// Skip to the end of the current string of newline chars and return -/// the first character after the newline chars, or -1 for EOF -static inline int getOverNewline(FileBuf& in) { - int c; - while(isspace(c = in.get())); - return c; -} - -/// Skip to the end of the current string of newline chars such that -/// the next call to get() returns the first character after the -/// whitespace -static inline int peekOverNewline(FileBuf& in) { - while(true) { - int c = in.peek(); - if(c != '\r' && c != '\n') { - return c; - } - in.get(); - } -} - -/// Skip to the end of the current line; return the first character -/// of the next line or -1 for EOF -static inline int getToEndOfLine(FileBuf& in) { - while(true) { - int c = in.get(); if(c < 0) return -1; - if(c == '\n' || c == '\r') { - while(c == '\n' || c == '\r') { - c = in.get(); if(c < 0) return -1; - } - // c now holds first character of next line - return c; - } - } -} - -/// Skip to the end of the current line such that the next call to -/// get() returns the first character on the next line -static inline int peekToEndOfLine(FileBuf& in) { - while(true) { - int c = in.get(); if(c < 0) return c; - if(c == '\n' || c == '\r') { - c = in.peek(); - while(c == '\n' || c == '\r') { - in.get(); if(c < 0) return c; // consume \r or \n - c = in.peek(); - } - // next get() gets first character of next line - return c; - } - } -} - -extern void wrongQualityFormat(const String& read_name); -extern void tooFewQualities(const String& read_name); -extern void tooManyQualities(const String& read_name); -extern void tooManySeqChars(const String& read_name); - -/** - * Encapsulates a source of patterns which is an in-memory vector. - */ -class VectorPatternSource : public TrimmingPatternSource { -public: - VectorPatternSource(uint32_t seed, - const vector& v, - bool color, - bool randomizeQuals = false, - const char *dumpfile = NULL, - bool verbose = false, - int trim3 = 0, - int trim5 = 0, - uint32_t skip = 0) : - TrimmingPatternSource(seed, randomizeQuals, - dumpfile, verbose, trim3, trim5), - color_(color), cur_(skip), skip_(skip), paired_(false), v_(), - quals_() - { - for(size_t i = 0; i < v.size(); i++) { - vector ss; - tokenize(v[i], ":", ss, 2); - assert_gt(ss.size(), 0); - assert_leq(ss.size(), 2); - // Initialize s - string s = ss[0]; - int mytrim5 = this->trim5_; - if(color_ && s.length() > 1) { - // This may be a primer character. If so, keep it in the - // 'primer' field of the read buf and parse the rest of the - // read without it. - int c = toupper(s[0]); - if(asc2dnacat[c] > 0) { - // First char is a DNA char - int c2 = toupper(s[1]); - // Second char is a color char - if(asc2colcat[c2] > 0) { - mytrim5 += 2; // trim primer and first color - } - } - } - if(color_) { - // Convert '0'-'3' to 'A'-'T' - for(size_t i = 0; i < s.length(); i++) { - if(s[i] >= '0' && s[i] <= '4') { - s[i] = "ACGTN"[(int)s[i] - '0']; - } - if(s[i] == '.') s[i] = 'N'; - } - } - if(s.length() <= (size_t)(trim3_ + mytrim5)) { - // Entire read is trimmed away - continue; - } else { - // Trim on 5' (high-quality) end - if(mytrim5 > 0) { - s.erase(0, mytrim5); - } - // Trim on 3' (low-quality) end - if(trim3_ > 0) { - s.erase(s.length()-trim3_); - } - } - // Initialize vq - string vq; - if(ss.size() == 2) { - vq = ss[1]; - } - // Trim qualities - if(vq.length() > (size_t)(trim3_ + mytrim5)) { - // Trim on 5' (high-quality) end - if(mytrim5 > 0) { - vq.erase(0, mytrim5); - } - // Trim on 3' (low-quality) end - if(trim3_ > 0) { - vq.erase(vq.length()-trim3_); - } - } - // Pad quals with Is if necessary; this shouldn't happen - while(vq.length() < length(s)) { - vq.push_back('I'); - } - // Truncate quals to match length of read if necessary; - // this shouldn't happen - if(vq.length() > length(s)) { - vq.erase(length(s)); - } - assert_eq(vq.length(), length(s)); - v_.push_back(s); - quals_.push_back(vq); - trimmed3_.push_back(trim3_); - trimmed5_.push_back(mytrim5); - ostringstream os; - os << (names_.size()); - names_.push_back(os.str()); - } - assert_eq(v_.size(), quals_.size()); - } - - virtual ~VectorPatternSource() { } - - virtual bool nextRead(ReadBuf& r) { - // Let Strings begin at the beginning of the respective bufs - r.reset(); - ThreadSafe ts(&mutex_m); - if(cur_ >= v_.size()) { - // Clear all the Strings, as a signal to the caller that - // we're out of reads - r.clearAll(); - assert(r.empty()); - return false; - } - // Copy v_*, quals_* strings into the respective Strings - r.color = color_; - r.patFw = v_[cur_]; - r.qual = quals_[cur_]; - r.trimmed3 = trimmed3_[cur_]; - r.trimmed5 = trimmed5_[cur_]; - ostringstream os; - os << cur_; - r.name = os.str(); - cur_++; - readCnt_++; - r.patid = (uint32_t)readCnt_; - return true; - } - - /** - * This is unused, but implementation is given for completeness. - */ - virtual pair nextReadPair(ReadBuf& ra, ReadBuf& rb) { - // Let Strings begin at the beginning of the respective bufs - throw 1; - return make_pair(false, false); -// ra.reset(); -// rb.reset(); -// if(!paired_) { -// paired_ = true; -// cur_ <<= 1; -// } -// ThreadSafe ts(&mutex_m); -// if(cur_ >= v_.size()-1) { -// // Clear all the Strings, as a signal to the caller that -// // we're out of reads -// ra.clearAll(); -// rb.clearAll(); -// assert(ra.empty()); -// assert(rb.empty()); -// return; -// } -// // Copy v_*, quals_* strings into the respective Strings -// ra.patFw = v_[cur_]; -// ra.qual = quals_[cur_]; -// ra.trimmed3 = trimmed3_[cur_]; -// ra.trimmed5 = trimmed5_[cur_]; -// cur_++; -// rb.patFw = v_[cur_]; -// rb.qual = quals_[cur_]; -// rb.trimmed3 = trimmed3_[cur_]; -// rb.trimmed5 = trimmed5_[cur_]; -// ostringstream os; -// os << readCnt_; -// ra.name = os.str(); -// rb.name = os.str(); -// ra.color = rb.color = color_; -// cur_++; -// readCnt_++; -// paired = true; -// ra.patid = rb.patid = (uint32_t)readCnt_; - } - - virtual void reset() { - TrimmingPatternSource::reset(); - cur_ = skip_; - paired_ = false; - } - - virtual void finalize(ReadBuf& r) const { } - - /** - * Read next batch. However, batch concept is not very applicable for this - * PatternSource where all the info has already been parsed into the fields - * in the contsructor. This essentially modifies the pt as though we read - * in some number of patterns. - */ - pair nextBatch( - PerThreadReadBuf& pt, - bool batch_a, - bool lock) - { - bool success = true; - int nread = 0; - pt.reset(); - ThreadSafe ts(&mutex_m, lock); - pt.setReadId(readCnt_); -#if 0 - // TODO: set nread to min of pt.size() and total - cur_ - // TODO: implement something like following function - pt.install_dummies(nread); -#endif - readCnt_ += nread; - return make_pair(success, nread); - } - - /** - * Finishes parsing outside the critical section - */ - virtual bool parse(ReadBuf& ra, ReadBuf& rb) const { - cerr << "In VectorPatternSource.parse()" << endl; - throw 1; - return false; - } - -private: - bool color_; - size_t cur_; - uint32_t skip_; - bool paired_; - vector > v_; /// forward sequences - vector > quals_; /// quality values parallel to v_ - vector > names_; /// names - vector trimmed3_; /// # bases trimmed from 3' end - vector trimmed5_; /// # bases trimmed from 5' end -}; - -/** - * - */ -class BufferedFilePatternSource : public TrimmingPatternSource { -public: - BufferedFilePatternSource(uint32_t seed, - const vector& infiles, - const vector* qinfiles, - bool randomizeQuals = false, - const char *dumpfile = NULL, - bool verbose = false, - int trim3 = 0, - int trim5 = 0, - uint32_t skip = 0) : - TrimmingPatternSource(seed, randomizeQuals, - dumpfile, verbose, trim3, trim5), - infiles_(infiles), - filecur_(0), - fb_(), - qfb_(), - skip_(skip), - first_(true) - { - qinfiles_.clear(); - if(qinfiles != NULL) qinfiles_ = *qinfiles; - assert_gt(infiles.size(), 0); - errs_.resize(infiles_.size(), false); - if(qinfiles_.size() > 0 && - qinfiles_.size() != infiles_.size()) - { - cerr << "Error: Different numbers of input FASTA/quality files (" - << infiles_.size() << "/" << qinfiles_.size() << ")" << endl; - throw 1; - } - assert(!fb_.isOpen()); - assert(!qfb_.isOpen()); - open(); // open first file in the list - filecur_++; - } - - virtual ~BufferedFilePatternSource() { - if(fb_.isOpen()) fb_.close(); - if(qfb_.isOpen()) { - assert_gt(qinfiles_.size(), 0); - qfb_.close(); - } - } - - /** - * Fill ReadBuf with the sequence, quality and name for the next - * read in the list of read files. This function gets called by - * all the search threads, so we must handle synchronization. - */ - virtual bool nextRead(ReadBuf& r) { - // We are entering a critical region, because we're - // manipulating our file handle and filecur_ state - ThreadSafe ts(&mutex_m); - pair flags = make_pair(false, false); - while(!flags.first && !flags.second) { - flags = readLight(r); - } - if(first_ && !flags.first) { - cerr << "Warning: Could not find any reads in \"" - << infiles_[0] << "\"" << endl; - } - first_ = false; - while(!flags.first && filecur_ < infiles_.size()) { - assert(flags.second); - // Open next file - open(); - resetForNextFile(); // reset state to handle a fresh file - flags.first = flags.second = false; - while(!flags.first && !flags.second) { - flags = readLight(r); - } - assert_geq(r.patid, skip_); - if(!flags.first) { - // No reads could be extracted from this _infile - cerr << "Warning: Could not find any reads in \"" - << infiles_[filecur_] << "\"" << endl; - } - filecur_++; - } - return flags.first; - } - - /** - * - */ - virtual pair nextReadPair(ReadBuf& ra, ReadBuf& rb) { - // We are entering a critical region, because we're - // manipulating our file handle and filecur_ state - ThreadSafe ts(&mutex_m); - bool success, eof, paired; - do { - BoolTriple result = readPairLight(ra, rb); - success = result.a; - eof = result.b; - paired = result.c; - } while(!success && !eof); - if(first_ && !success) { - cerr << "Warning: Could not find any read pairs in \"" - << infiles_[0] << "\"" << endl; - } - first_ = false; - while(!success && filecur_ < infiles_.size()) { - // Open next file - open(); - resetForNextFile(); // reset state to handle a fresh file - do { - BoolTriple result = readPairLight(ra, rb); - success = result.a; - eof = result.b; - paired = result.c; - } while(!success && !eof); - if(!success && eof) { - cerr << "Warning: Could not find any reads in \"" - << infiles_[filecur_] << "\"" << endl; - } - filecur_++; - } - return make_pair(success, paired); - } - - /** - * Reset state so that we read start reading again from the - * beginning of the first file. Should only be called by the - * master thread. + * Reset state to handle a fresh file */ - virtual void reset() { - TrimmingPatternSource::reset(); - filecur_ = 0, - open(); - filecur_++; - } - - /** - * Fill Read with the sequence, quality and name for the next - * read in the list of read files. This function gets called by - * all the search threads, so we must handle synchronization. - * - * What does the return value signal? - * In the event that we read more data, it should at least signal how many - * reads were read, and whether we're totally done. It's debatable whether - * it should convey anything about the individual read files, like whether - * we finished one of them. - */ - pair nextBatch( - PerThreadReadBuf& pt, - bool batch_a, - bool lock) - { - bool done = false; - int nread = 0; - - // synchronization at this level because both reading and manipulation of - // current file pointer have to be protected - ThreadSafe ts(&mutex_m, lock); - pt.setReadId(readCnt_); - while(true) { // loop that moves on to next file when needed - do { - pair ret = nextBatchFromFile(pt, batch_a); - done = ret.first; - nread = ret.second; - } while(!done && nread == 0); // not sure why this would happen - if(done && filecur_ < infiles_.size()) { // finished with this file - open(); - resetForNextFile(); // reset state to handle a fresh file - filecur_++; - if(nread == 0) { - continue; - } - } - break; - } - assert_geq(nread, 0); - readCnt_ += nread; - return make_pair(done, nread); - } - -protected: - - /// Read another pattern from the input file; this is overridden - /// to deal with specific file formats - virtual pair readLight(ReadBuf& r) = 0; + virtual void resetForNextFile() { } /** - * Light-parse a batch of unpaired reads from current file into the given - * buffer. Called from CFilePatternSource.nextBatch(). + * Open the next file in the list of input files. */ - virtual std::pair nextBatchFromFile( - PerThreadReadBuf& pt, - bool batch_a) = 0; - - /// Read another pattern pair from the input file; this is - /// overridden to deal with specific file formats - virtual BoolTriple readPairLight( - ReadBuf& ra, - ReadBuf& rb) = 0; - - /// Reset state to handle a fresh file - virtual void resetForNextFile() { } - - void open() { - if(fb_.isOpen()) fb_.close(); - if(qfb_.isOpen()) qfb_.close(); - while(filecur_ < infiles_.size()) { - // Open read - FILE *in; - if(infiles_[filecur_] == "-") { - in = stdin; - } else if((in = fopen(infiles_[filecur_].c_str(), "rb")) == NULL) { - if(!errs_[filecur_]) { - cerr << "Warning: Could not open read file \"" << infiles_[filecur_] << "\" for reading; skipping..." << endl; - errs_[filecur_] = true; - } - filecur_++; - continue; - } - fb_.newFile(in); - // Open quality - if(!qinfiles_.empty()) { - FILE *in; - if(qinfiles_[filecur_] == "-") { - in = stdin; - } else if((in = fopen(qinfiles_[filecur_].c_str(), "rb")) == NULL) { - if(!errs_[filecur_]) { - cerr << "Warning: Could not open quality file \"" << qinfiles_[filecur_] << "\" for reading; skipping..." << endl; - errs_[filecur_] = true; - } - filecur_++; - continue; - } - qfb_.newFile(in); - } - return; - } - throw 1; - } + void open(); vector infiles_; /// filenames for read files vector qinfiles_; /// filenames for quality files vector errs_; /// whether we've already printed an error for each file size_t filecur_; /// index into infiles_ of next file to read - FileBuf fb_; /// read file currently being read from - FileBuf qfb_; /// quality file currently being read from + FILE *fp_; /// read file currently being read from + FILE *qfp_; /// quality file currently being read from + bool is_open_; /// whether fp_ is currently open uint32_t skip_; /// number of reads to skip bool first_; + char buf_[64*1024]; /// file buffer for sequences + char qbuf_[64*1024]; /// file buffer for qualities }; /** - * + * Synchronized concrete pattern source for a list of FASTA or CSFASTA + * (if color = true) files. */ -class CFilePatternSource : public TrimmingPatternSource { +class FastaPatternSource : public CFilePatternSource { + public: - CFilePatternSource( - uint32_t seed, - const vector& infiles, + + FastaPatternSource( + const vector& infiles, const vector* qinfiles, - bool randomizeQuals = false, + bool color, const char *dumpfile = NULL, - bool verbose = false, int trim3 = 0, int trim5 = 0, + bool solexa64 = false, + bool phred64 = false, + bool intQuals = false, uint32_t skip = 0) : - TrimmingPatternSource(seed, randomizeQuals, - dumpfile, verbose, trim3, trim5), - infiles_(infiles), - filecur_(0), - fp_(NULL), - qfp_(NULL), - is_open_(false), - skip_(skip), - first_(true) - { - qinfiles_.clear(); - if(qinfiles != NULL) qinfiles_ = *qinfiles; - assert_gt(infiles.size(), 0); - errs_.resize(infiles_.size(), false); - if(qinfiles_.size() > 0 && - qinfiles_.size() != infiles_.size()) - { - cerr << "Error: Different numbers of input FASTA/quality files (" - << infiles_.size() << "/" << qinfiles_.size() << ")" << endl; - throw 1; - } - open(); // open first file in the list - filecur_++; - } - - virtual ~CFilePatternSource() { - if(is_open_) { - assert(fp_ != NULL); - fclose(fp_); - fp_ = NULL; - if(qfp_ != NULL) { - fclose(qfp_); - qfp_ = NULL; - } - } - } - - /** - * Fill ReadBuf with the sequence, quality and name for the next - * read in the list of read files. This function gets called by - * all the search threads, so we must handle synchronization. - */ - virtual bool nextRead(ReadBuf& r) { - // We are entering a critical region, because we're - // manipulating our file handle and filecur_ state - ThreadSafe ts(&mutex_m); - pair flags = make_pair(false, false); - while(!flags.first && !flags.second) { - flags = readLight(r); - } - if(first_ && !flags.first) { - cerr << "Warning: Could not find any reads in \"" - << infiles_[0] << "\"" << endl; - } - first_ = false; - while(!flags.first && filecur_ < infiles_.size()) { - assert(flags.second); - // Open next file - open(); - resetForNextFile(); // reset state to handle a fresh file - flags.first = flags.second = false; - while(!flags.first && !flags.second) { - flags = readLight(r); - } - assert_geq(r.patid, skip_); - if(!flags.first) { - cerr << "Warning: Could not find any reads in \"" - << infiles_[filecur_] << "\"" << endl; - } - filecur_++; - } - return flags.first; - } + CFilePatternSource( + infiles, + qinfiles, + dumpfile, + trim3, + trim5, + skip), + first_(true), + color_(color), + solexa64_(solexa64), + phred64_(phred64), + intQuals_(intQuals) { } /** - * + * Reset so that next call to nextBatch* gets the first batch. + * Should only be called by the master thread. */ - virtual pair nextReadPair(ReadBuf& ra, ReadBuf& rb) { - // We are entering a critical region, because we're - // manipulating our file handle and filecur_ state - ThreadSafe ts(&mutex_m); - bool success, eof, paired; - do { - BoolTriple result = readPairLight(ra, rb); - success = result.a; - eof = result.b; - paired = result.c; - } while(!success && !eof); - if(first_ && !success) { - cerr << "Warning: Could not find any read pairs in \"" - << infiles_[0] << "\"" << endl; - } - first_ = false; - while(!success && filecur_ < infiles_.size()) { - // Open next file - open(); - resetForNextFile(); // reset state to handle a fresh file - do { - BoolTriple result = readPairLight(ra, rb); - success = result.a; - eof = result.b; - paired = result.c; - } while(!success && !eof); - if(!success && eof) { - cerr << "Warning: Could not find any reads in \"" - << infiles_[filecur_] << "\"" << endl; - } - filecur_++; - } - return make_pair(success, paired); + virtual void reset() { + first_ = true; + CFilePatternSource::reset(); } /** - * Fill Read with the sequence, quality and name for the next - * read in the list of read files. This function gets called by - * all the search threads, so we must handle synchronization. - * - * Returns pair where bool indicates whether we're - * completely done, and int indicates how many reads were read. - */ - pair nextBatch( - PerThreadReadBuf& pt, - bool batch_a, - bool lock) - { - bool done = false; - int nread = 0; - - // synchronization at this level because both reading and manipulation of - // current file pointer have to be protected - ThreadSafe ts(&mutex_m, lock); - pt.setReadId(readCnt_); - while(true) { // loop that moves on to next file when needed - do { - pair ret = nextBatchFromFile(pt, batch_a); - done = ret.first; - nread = ret.second; - } while(!done && nread == 0); // not sure why this would happen - if(done && filecur_ < infiles_.size()) { // finished with this file - open(); - resetForNextFile(); // reset state to handle a fresh file - filecur_++; - if(nread == 0) { - continue; - } - } - break; - } - assert_geq(nread, 0); - //updating readCnt_ in nextBatchFromFile - //readCnt_ += nread; - return make_pair(done, nread); - } - - /** - * Reset state so that we read start reading again from the - * beginning of the first file. Should only be called by the - * master thread. + * Finalize FASTA parsing outside critical section. */ - virtual void reset() { - TrimmingPatternSource::reset(); - filecur_ = 0, - open(); - filecur_++; - } - + virtual bool parse(Read& ra, Read& rb, TReadId rdid) const; + protected: /** - * Light-parse a batch of unpaired reads from current file into the given - * buffer. Called from CFilePatternSource.nextBatch(). + * Light-parse a FASTA batch into the given buffer. */ virtual std::pair nextBatchFromFile( PerThreadReadBuf& pt, - bool batch_a) = 0; - - /// Read another pattern from the input file; this is overridden - /// to deal with specific file formats - virtual pair readLight(ReadBuf& r) = 0; - - - /// Read another pattern pair from the input file; this is - /// overridden to deal with specific file formats - virtual BoolTriple readPairLight(ReadBuf& ra, ReadBuf& rb) = 0; - - /// Reset state to handle a fresh file - virtual void resetForNextFile() { } - - void open() { - if(is_open_) { - is_open_ = false; - fclose(fp_); - fp_ = NULL; - if(qfp_ != NULL) { - fclose(qfp_); - qfp_ = NULL; - } - } - while(filecur_ < infiles_.size()) { - // Open read - if(infiles_[filecur_] == "-") { - fp_ = stdin; - } else if((fp_ = fopen(infiles_[filecur_].c_str(), "rb")) == NULL) { - if(!errs_[filecur_]) { - cerr << "Warning: Could not open read file \"" << infiles_[filecur_] << "\" for reading; skipping..." << endl; - errs_[filecur_] = true; - } - filecur_++; - continue; - } - is_open_ = true; - setvbuf(fp_, buf_, _IOFBF, 64*1024); - if(!qinfiles_.empty()) { - if(qinfiles_[filecur_] == "-") { - qfp_ = stdin; - } else if((qfp_ = fopen(qinfiles_[filecur_].c_str(), "rb")) == NULL) { - if(!errs_[filecur_]) { - cerr << "Warning: Could not open quality file \"" << qinfiles_[filecur_] << "\" for reading; skipping..." << endl; - errs_[filecur_] = true; - } - filecur_++; - continue; - } - assert(qfp_ != NULL); - setvbuf(qfp_, qbuf_, _IOFBF, 64*1024); - } - return; - } - throw 1; - } - - vector infiles_; /// filenames for read files - vector qinfiles_; /// filenames for quality files - vector errs_; /// whether we've already printed an error for each file - size_t filecur_; /// index into infiles_ of next file to read - FILE *fp_; /// read file currently being read from - FILE *qfp_; /// quality file currently being read from - bool is_open_; /// whether fp_ is currently open - uint32_t skip_; /// number of reads to skip - bool first_; - char buf_[64*1024]; /// file buffer for sequences - char qbuf_[64*1024]; /// file buffer for qualities -}; - -/** - * Parse a single quality string from fb and store qualities in r. - * Assume the next character obtained via fb.get() is the first - * character of the quality string. When returning, the next - * character returned by fb.peek() or fb.get() should be the first - * character of the following line. - */ -int parseQuals(ReadBuf& r, - FileBuf& fb, - int readLen, - int trim3, - int trim5, - bool intQuals, - bool phred64, - bool solexa64); - -/** - * Synchronized concrete pattern source for a list of FASTA or CSFASTA - * (if color = true) files. - */ -class FastaPatternSource : public BufferedFilePatternSource { -public: - FastaPatternSource(uint32_t seed, - const vector& infiles, - const vector* qinfiles, - bool color, - bool randomizeQuals, - const char *dumpfile = NULL, - bool verbose = false, - int trim3 = 0, - int trim5 = 0, - bool solexa64 = false, - bool phred64 = false, - bool intQuals = false, - uint32_t skip = 0) : - BufferedFilePatternSource(seed, infiles, qinfiles, randomizeQuals, - dumpfile, verbose, trim3, - trim5, skip), - first_(true), color_(color), solexa64_(solexa64), - phred64_(phred64), intQuals_(intQuals) - { } - virtual void reset() { - first_ = true; - BufferedFilePatternSource::reset(); - } - - -protected: - - /// Read another pattern from a FASTA input file - pair nextBatchFromFile( - PerThreadReadBuf& pt, - bool batch_a) - { - throw 1; - return make_pair(false, 0); - } - - /// Read another pattern from a FASTA input file - bool parse(ReadBuf& r, ReadBuf& rb) const { - r.reset(); - cerr << "In FastaPatternSource.parse()" << endl; - throw 1; - return false; - } + bool batch_a); /** * Scan to the next FASTA record (starting with >) and return the first @@ -1824,211 +714,18 @@ class FastaPatternSource : public BufferedFilePatternSource { static int skipToNextFastaRecord(FileBuf& in) { int c; while((c = in.get()) != '>') { - if(in.eof()) return -1; - } - return c; - } - - /// Called when we have to bail without having parsed a read. - void bail(ReadBuf& r) { - r.clearAll(); - fb_.resetLastN(); - qfb_.resetLastN(); - } - - /// Read another pattern from a FASTA input file - virtual pair readLight(ReadBuf& r) { - int c, qc = 0; - int dstLen = 0; - int nameLen = 0; - bool doquals = qinfiles_.size() > 0; - assert(!doquals || qfb_.isOpen()); - assert(fb_.isOpen()); - r.color = color_; - - // Skip over between-read comments. Note that SOLiD's comments use #s - c = fb_.get(); - if(c < 0) { - bail(r); - return make_pair(false, true); - } - while(c == '#' || c == ';') { - c = fb_.peekUptoNewline(); - fb_.resetLastN(); - c = fb_.get(); - } - assert_eq(1, fb_.lastNCur()); - if(doquals) { - qc = qfb_.get(); - if(qc < 0) { - bail(r); - return make_pair(false, true); - } - while(qc == '#' || qc == ';') { - qc = qfb_.peekUptoNewline(); - qfb_.resetLastN(); - qc = qfb_.get(); - } - assert_eq(1, qfb_.lastNCur()); - } - - // Pick off the first carat - if(first_) { - if(c != '>') { - cerr << "Error: reads file does not look like a FASTA file" << endl; - throw 1; - } - if(doquals && qc != '>') { - cerr << "Error: quality file does not look like a FASTA quality file" << endl; - throw 1; - } - first_ = false; - } - assert_eq('>', c); - if(doquals) assert_eq('>', qc); - c = fb_.get(); // get next char after '>' - if(doquals) qc = qfb_.get(); - - // Read to the end of the id line, sticking everything after the '>' - // into *name - bool warning = false; - while(true) { - if(c < 0 || qc < 0) { - bail(r); - return make_pair(false, true); - } - if(c == '\n' || c == '\r') { - // Break at end of line, after consuming all \r's, \n's - while(c == '\n' || c == '\r') { - c = fb_.get(); - if(doquals) qc = qfb_.get(); - if(c < 0 || qc < 0) { bail(r); return make_pair(false, true); } - } - break; - } - if(doquals && c != qc) { - cerr << "Warning: one or more mismatched read names between FASTA and quality files" << endl; - warning = true; - } - r.nameBuf[nameLen++] = c; - c = fb_.get(); - if(doquals) qc = qfb_.get(); - } - _setBegin(r.name, r.nameBuf); - _setLength(r.name, nameLen); - if(warning) { - cerr << " Offending read name: \"" << r.name << "\"" << endl; - } - - // _in now points just past the first character of a sequence - // line, and c holds the first character - int begin = 0; - int mytrim5 = this->trim5_; - if(color_) { - // This is the primer character, keep it in the - // 'primer' field of the read buf and keep parsing - c = toupper(c); - if(asc2dnacat[c] > 0) { - // First char is a DNA char - int c2 = toupper(fb_.peek()); - if(asc2colcat[c2] > 0) { - // Second char is a color char - r.primer = c; - r.trimc = c2; - mytrim5 += 2; - } - } - if(c < 0) { - bail(r); - return make_pair(false, true); - } - } - while(c != '>' && c >= 0) { - if(color_) { - if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0']; - if(c == '.') c = 'N'; - } - if(asc2dnacat[c] > 0 && begin++ >= mytrim5) { - if(dstLen + 1 > 1024) tooManySeqChars(r.name); - r.patBufFw[dstLen] = charToDna5[c]; - if(!doquals) r.qualBuf[dstLen] = 'I'; - dstLen++; - } - if(fb_.peek() == '>') { - break; - } - c = fb_.get(); - } - dstLen -= this->trim3_; - if(dstLen < 0) dstLen = 0; - _setBegin (r.patFw, (Dna5*)r.patBufFw); - _setLength(r.patFw, dstLen); - r.trimmed3 = this->trim3_; - r.trimmed5 = mytrim5; - if(doquals) { - if(dstLen > 0) { - parseQuals(r, qfb_, dstLen + r.trimmed3 + r.trimmed5, - r.trimmed3, r.trimmed5, intQuals_, phred64_, - solexa64_); - } else { - // Bail - qfb_.peekUptoNewline(); - qfb_.get(); - qfb_.resetLastN(); - fb_.resetLastN(); - // Count the read - readCnt_++; - r.patid = (uint32_t)readCnt_-1; - return make_pair(true, false); - } - } - _setBegin (r.qual, r.qualBuf); - _setLength(r.qual, dstLen); - // Set up a default name if one hasn't been set - if(nameLen == 0) { - itoa10((int)readCnt_, r.nameBuf); - _setBegin(r.name, r.nameBuf); - nameLen = (int)strlen(r.nameBuf); - _setLength(r.name, nameLen); - } - assert_gt(nameLen, 0); - readCnt_++; - r.patid = (uint32_t)(readCnt_-1); - r.readOrigBufLen = fb_.copyLastN(r.readOrigBuf); - fb_.resetLastN(); - if(doquals) { - r.qualOrigBufLen = qfb_.copyLastN(r.qualOrigBuf); - qfb_.resetLastN(); - if(false) { - cout << "Name: " << r.name << endl - << " Seq: " << r.patFw << " (" << seqan::length(r.patFw) << ")" << endl - << "Qual: " << r.qual << " (" << seqan::length(r.qual) << ")" << endl - << "Orig seq:" << endl; - for(size_t i = 0; i < r.readOrigBufLen; i++) cout << r.readOrigBuf[i]; - cout << "Orig qual:" << endl; - for(size_t i = 0; i < r.qualOrigBufLen; i++) cout << r.qualOrigBuf[i]; - cout << endl; - } + if(in.eof()) return -1; } - return make_pair(true, false); - } - - /// Read another pair of patterns from a FASTA input file - virtual BoolTriple readPairLight(ReadBuf& ra, ReadBuf& rb) { - // (For now, we shouldn't ever be here) - cerr << "In FastaPatternSource.readPair()" << endl; - throw 1; - return BoolTriple(); + return c; } /** - * Finishes parsing outside the critical section + * Reset state to handle a fresh file */ - virtual void finalize(ReadBuf& r) const { } - virtual void resetForNextFile() { first_ = true; } + virtual void dump(ostream& out, const String& seq, const String& qual, @@ -2036,7 +733,9 @@ class FastaPatternSource : public BufferedFilePatternSource { { out << ">" << name << endl << seq << endl; } + private: + bool first_; bool color_; bool solexa64_; @@ -2061,28 +760,183 @@ static inline bool tokenizeQualLine(FileBuf& filebuf, char *buf, size_t buflen, * delimited name, seq, qual fields (or, for paired-end reads, * basename, seq1, qual1, seq2, qual2). */ -class TabbedPatternSource : public BufferedFilePatternSource { +class TabbedPatternSource : public CFilePatternSource { public: - TabbedPatternSource(uint32_t seed, - const vector& infiles, - bool color, - bool randomizeQuals = false, - const char *dumpfile = NULL, - bool verbose = false, - int trim3 = 0, - int trim5 = 0, - bool solQuals = false, - bool phred64Quals = false, - bool intQuals = false, - uint32_t skip = 0) : - BufferedFilePatternSource(seed, infiles, NULL, randomizeQuals, - dumpfile, verbose, - trim3, trim5, skip), + TabbedPatternSource( + const vector& infiles, + bool secondName, // whether it's --12/--tab5 or --tab6 + bool color, + const char *dumpfile = NULL, + int trim3 = 0, + int trim5 = 0, + bool solQuals = false, + bool phred64Quals = false, + bool intQuals = false, + uint32_t skip = 0) : + CFilePatternSource( + infiles, + NULL, + dumpfile, + trim3, + trim5, + skip), color_(color), solQuals_(solQuals), phred64Quals_(phred64Quals), - intQuals_(intQuals) - { } + intQuals_(intQuals) { } + + /** + * Finalize tabbed parsing outside critical section. + */ + virtual bool parse(Read& ra, Read& rb, TReadId rdid) const; + +protected: + + /** + * Light-parse a batch of tabbed-format reads into given buffer. + */ + virtual std::pair nextBatchFromFile( + PerThreadReadBuf& pt, + bool batch_a); + + /** + * Dump a FASTQ-style record for the read. + */ + virtual void dump(ostream& out, + const String& seq, + const String& qual, + const String& name) + { + out << "@" << name << endl << seq << endl + << "+" << endl << qual << endl; + } + +protected: + + bool color_; // colorspace reads? + bool solQuals_; // base qualities are log odds + bool phred64Quals_; // base qualities are on -64 scale + bool intQuals_; // base qualities are space-separated strings + bool secondName_; // true if --tab6, false if --tab5 +}; + +/** + * Synchronized concrete pattern source for a list of FASTA files where + * reads need to be extracted from long continuous sequences. + */ +class FastaContinuousPatternSource : public CFilePatternSource { +public: + FastaContinuousPatternSource( + const vector& infiles, + size_t length, + size_t freq, + const char *dumpfile = NULL, + uint32_t skip = 0) : + CFilePatternSource( + infiles, + NULL, + dumpfile, + 0, + 0, + skip), + length_(length), + freq_(freq), + eat_(length_-1), + beginning_(true), + bufCur_(0), + subReadCnt_(0llu) + { + assert_gt(freq_, 0); + resetForNextFile(); + assert_lt(length_, (size_t)Read::BUF_SIZE); + } + + virtual void reset() { + CFilePatternSource::reset(); + resetForNextFile(); + } + + /** + * Finalize FASTA parsing outside critical section. + */ + virtual bool parse(Read& ra, Read& rb, TReadId rdid) const; + +protected: + + /** + * Light-parse a batch into the given buffer. + */ + virtual std::pair nextBatchFromFile( + PerThreadReadBuf& pt, + bool batch_a); + + /** + * Reset state to be read for the next file. + */ + virtual void resetForNextFile() { + eat_ = length_-1; + //name_prefix_buf_.clear(); + beginning_ = true; + bufCur_ = 0; + subReadCnt_ = readCnt_; + } + +private: + const size_t length_; /// length of reads to generate + const size_t freq_; /// frequency to sample reads + size_t eat_; /// number of characters we need to skip before + /// we have flushed all of the ambiguous or + /// non-existent characters out of our read + /// window + bool beginning_; /// skipping over the first read length? + char buf_[1024]; /// read buffer + char name_prefix_buf_[1024]; /// FASTA sequence name buffer + char name_int_buf_[20]; /// for composing offsets for names + size_t bufCur_; /// buffer cursor; points to where we should + /// insert the next character + uint64_t subReadCnt_;/// number to subtract from readCnt_ to get + /// the pat id to output (so it resets to 0 for + /// each new sequence) +}; + +/** + * Read a FASTQ-format file. + * See: http://maq.sourceforge.net/fastq.shtml + */ +class FastqPatternSource : public CFilePatternSource { +public: + FastqPatternSource( + const vector& infiles, + bool color, + const char *dumpfile = NULL, + int trim3 = 0, + int trim5 = 0, + bool solexa_quals = false, + bool phred64Quals = false, + bool integer_quals = false, + uint32_t skip = 0) : + CFilePatternSource( + infiles, + NULL, + dumpfile, + trim3, + trim5, + skip), + first_(true), + solQuals_(solexa_quals), + phred64Quals_(phred64Quals), + intQuals_(integer_quals), + color_(color) { } + + virtual void reset() { + first_ = true; + CFilePatternSource::reset(); + } + + /** + * Finalize FASTQ parsing outside critical section. + */ + virtual bool parse(Read& ra, Read& rb, TReadId rdid) const; protected: @@ -2093,1208 +947,393 @@ class TabbedPatternSource : public BufferedFilePatternSource { * for lines worth of the input file into a buffer (r.readOrigBuf). finalize() * then parses the contents of r.readOrigBuf later. */ - pair nextBatchFromFile( + virtual pair nextBatchFromFile( PerThreadReadBuf& pt, - bool batch_a) - { - throw 1; - return make_pair(false, 0); + bool batch_a); + + virtual void resetForNextFile() { + first_ = true; } - /// Read another pattern from a FASTA input file - bool parse(ReadBuf& r, ReadBuf& rb) const { - r.reset(); - cerr << "In TabbedPatternSource.parse()" << endl; - throw 1; - return false; + virtual void dump(ostream& out, + const String& seq, + const String& qual, + const String& name) + { + out << "@" << name << endl << seq << endl << "+" << endl << qual << endl; } - /// Read another pattern from a FASTA input file - virtual pair readLight(ReadBuf& r) { - r.color = color_; - int trim5 = this->trim5_; - // fb_ is about to dish out the first character of the - // name field - if(parseName(r, NULL, '\t') == -1) { - peekOverNewline(fb_); // skip rest of line - r.clearAll(); - return make_pair(false, true); - } - assert_neq('\t', fb_.peek()); - - // fb_ is about to dish out the first character of the - // sequence field - int charsRead = 0; - int dstLen = parseSeq(r, charsRead, trim5, '\t'); - assert_neq('\t', fb_.peek()); - if(dstLen <= 0) { - peekOverNewline(fb_); // skip rest of line - r.clearAll(); - return make_pair(false, true); - } +private: - // fb_ is about to dish out the first character of the - // quality-string field - char ct = 0; - if(parseQuals(r, charsRead, dstLen, trim5, ct, '\n') <= 0) { - peekOverNewline(fb_); // skip rest of line - r.clearAll(); - return make_pair(false, true); - } - r.trimmed3 = this->trim3_; - r.trimmed5 = trim5; - assert_eq(ct, '\n'); - assert_neq('\n', fb_.peek()); - r.readOrigBufLen = fb_.copyLastN(r.readOrigBuf); - fb_.resetLastN(); - // The last character read in parseQuals should have been a - // '\n' - - readCnt_++; - r.patid = (uint32_t)(readCnt_-1); - return make_pair(true, false); - } + bool first_; + bool solQuals_; + bool phred64Quals_; + bool intQuals_; + bool color_; +}; - /// Read another pair of patterns from a FASTA input file - virtual BoolTriple readPairLight(ReadBuf& ra, ReadBuf& rb) { - // fb_ is about to dish out the first character of the - // name field - int mytrim5_1 = this->trim5_; - if(parseName(ra, &rb, '\t') == -1) { - peekOverNewline(fb_); // skip rest of line - ra.clearAll(); - rb.clearAll(); - fb_.resetLastN(); - // !success, eof, !paired - return BoolTriple(false, true, false); - } - assert_neq('\t', fb_.peek()); - - // fb_ is about to dish out the first character of the - // sequence field for the first mate - int charsRead1 = 0; - int dstLen1 = parseSeq(ra, charsRead1, mytrim5_1, '\t'); - if(dstLen1 <= -1) { - peekOverNewline(fb_); // skip rest of line - ra.clearAll(); - rb.clearAll(); - fb_.resetLastN(); - // !success, eof, !paired - return BoolTriple(false, true, false); - } - assert_neq('\t', fb_.peek()); - - // fb_ is about to dish out the first character of the - // quality-string field - char ct = 0; - if(parseQuals(ra, charsRead1, dstLen1, mytrim5_1, ct, '\t', '\n') <= 0) { - peekOverNewline(fb_); // skip rest of line - ra.clearAll(); - rb.clearAll(); - fb_.resetLastN(); - // !success, eof, !paired - return BoolTriple(false, true, false); - } - ra.trimmed3 = this->trim3_; - ra.trimmed5 = mytrim5_1; - assert(ct == '\t' || ct == '\n'); - if(ct == '\n') { - // Unpaired record; return. - rb.clearAll(); - peekOverNewline(fb_); - ra.readOrigBufLen = fb_.copyLastN(ra.readOrigBuf); - fb_.resetLastN(); - readCnt_++; - ra.patid = (uint32_t)(readCnt_-1); - // success, !eof, !paired - return BoolTriple(true, false, false); - } - assert_neq('\t', fb_.peek()); - - // fb_ is about to dish out the first character of the - // sequence field for the second mate - int charsRead2 = 0; - int mytrim5_2 = this->trim5_; - int dstLen2 = parseSeq(rb, charsRead2, mytrim5_2, '\t'); - if(dstLen2 <= 0) { - peekOverNewline(fb_); // skip rest of line - ra.clearAll(); - rb.clearAll(); - fb_.resetLastN(); - // !success, !eof, !paired - return BoolTriple(false, false, false); - } - assert_neq('\t', fb_.peek()); - - // fb_ is about to dish out the first character of the - // quality-string field - if(parseQuals(rb, charsRead2, dstLen2, mytrim5_2, ct, '\n') <= 0) { - peekOverNewline(fb_); // skip rest of line - ra.clearAll(); - rb.clearAll(); - fb_.resetLastN(); - // !success, !eof, !paired - return BoolTriple(false, false, false); - } - assert_eq('\n', ct); - if(fb_.peek() == '\n') { - assert(false); - } - peekOverNewline(fb_); - ra.readOrigBufLen = fb_.copyLastN(ra.readOrigBuf); - fb_.resetLastN(); +/** + * Read a Raw-format file (one sequence per line). No quality strings + * allowed. All qualities are assumed to be 'I' (40 on the Phred-33 + * scale). + */ +class RawPatternSource : public CFilePatternSource { - rb.trimmed3 = this->trim3_; - rb.trimmed5 = mytrim5_2; +public: - // The last character read in parseQuals should have been a - // '\n' + RawPatternSource( + const vector& infiles, + bool color, + const char *dumpfile = NULL, + int trim3 = 0, + int trim5 = 0, + uint32_t skip = 0) : + CFilePatternSource( + infiles, + NULL, + dumpfile, + trim3, + trim5, + skip), + first_(true), + color_(color) { } - readCnt_++; - ra.patid = rb.patid = (uint32_t)(readCnt_-1); - // success, !eof, paired - return BoolTriple(true, false, true); + virtual void reset() { + first_ = true; + CFilePatternSource::reset(); } /** - * Finishes parsing outside the critical section + * Finalize raw parsing outside critical section. */ - virtual void finalize(ReadBuf& r) const { } - + virtual bool parse(Read& ra, Read& rb, TReadId rdid) const; + +protected: /** - * Dump a FASTQ-style record for the read. + * Light-parse a batch into the given buffer. */ + virtual std::pair nextBatchFromFile( + PerThreadReadBuf& pt, + bool batch_a); + + virtual void resetForNextFile() { + first_ = true; + } + virtual void dump(ostream& out, const String& seq, const String& qual, const String& name) { - out << "@" << name << endl << seq << endl - << "+" << endl << qual << endl; + out << seq << endl; } + + private: + bool first_; + bool color_; +}; + +/** + * Abstract parent class for synhconized sources of paired-end reads + * (and possibly also single-end reads). + */ +class PatternComposer { +public: + PatternComposer() { } + + virtual ~PatternComposer() { } + + virtual void reset() = 0; + /** - * Parse a name from fb_ and store in r. Assume that the next - * character obtained via fb_.get() is the first character of - * the sequence and the string stops at the next char upto (could - * be tab, newline, etc.). + * Member function override by concrete, format-specific classes. */ - int parseName(ReadBuf& r, ReadBuf* r2, char upto = '\t') { - // Read the name out of the first field - int c = 0; - int nameLen = 0; - while(true) { - if((c = fb_.get()) < 0) { - return -1; - } - if(c == upto) { - // Finished with first field - break; - } - if(c == '\n' || c == '\r') { - return -1; - } - if(r2 != NULL) (*r2).nameBuf[nameLen] = c; - r.nameBuf[nameLen++] = c; - } - _setBegin(r.name, r.nameBuf); - _setLength(r.name, nameLen); - if(r2 != NULL) { - _setBegin((*r2).name, (*r2).nameBuf); - _setLength((*r2).name, nameLen); - } - // Set up a default name if one hasn't been set - if(nameLen == 0) { - itoa10((int)readCnt_, r.nameBuf); - _setBegin(r.name, r.nameBuf); - nameLen = (int)strlen(r.nameBuf); - _setLength(r.name, nameLen); - if(r2 != NULL) { - itoa10((int)readCnt_, (*r2).nameBuf); - _setBegin((*r2).name, (*r2).nameBuf); - _setLength((*r2).name, nameLen); - } + virtual std::pair nextBatch(PerThreadReadBuf& pt) = 0; + + /** + * Make appropriate call into the format layer to parse individual read. + */ + virtual bool parse(Read& ra, Read& rb, TReadId rdid) = 0; + + virtual void free_pmembers( const vector &elist) { + for (size_t i = 0; i < elist.size(); i++) { + if (elist[i] != NULL) + delete elist[i]; } - assert_gt(nameLen, 0); - return nameLen; + } + +protected: + + MUTEX_T mutex_m; /// mutex for locking critical regions +}; + +/** + * Encapsulates a synchronized source of both paired-end reads and + * unpaired reads, where the paired-end must come from parallel files. + */ +class SoloPatternComposer : public PatternComposer { + +public: + + SoloPatternComposer(const vector& src) : + PatternComposer(), + cur_(0), + src_(src) + { + for(size_t i = 0; i < src_.size(); i++) { + assert(src_[i] != NULL); + } } /** - * Parse a single sequence from fb_ and store in r. Assume - * that the next character obtained via fb_.get() is the first - * character of the sequence and the sequence stops at the next - * char upto (could be tab, newline, etc.). + * Reset this object and all the PatternSources under it so that + * the next call to nextReadPair gets the very first read pair. */ - int parseSeq(ReadBuf& r, int& charsRead, int& trim5, char upto = '\t') { - int begin = 0; - int dstLen = 0; - int c = fb_.get(); - assert(c != upto); - r.color = color_; - if(color_) { - // This may be a primer character. If so, keep it in the - // 'primer' field of the read buf and parse the rest of the - // read without it. - c = toupper(c); - if(asc2dnacat[c] > 0) { - // First char is a DNA char - int c2 = toupper(fb_.peek()); - // Second char is a color char - if(asc2colcat[c2] > 0) { - r.primer = c; - r.trimc = c2; - trim5 += 2; // trim primer and first color - } - } - if(c < 0) { return -1; } - } - while(c != upto) { - if(color_) { - if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0']; - } - if(c == '.') c = 'N'; - if(isalpha(c)) { - assert_in(toupper(c), "ACGTN"); - if(begin++ >= trim5) { - assert_neq(0, dna4Cat[c]); - if(dstLen + 1 > 1024) { - cerr << "Input file contained a pattern more than 1024 characters long. Please truncate" << endl - << "reads and re-run Bowtie" << endl; - throw 1; - } - r.patBufFw[dstLen] = charToDna5[c]; - dstLen++; - } - charsRead++; - } - if((c = fb_.get()) < 0) { - return -1; - } + virtual void reset() { + for(size_t i = 0; i < src_.size(); i++) { + src_[i]->reset(); } - dstLen -= this->trim3_; - _setBegin (r.patFw, (Dna5*)r.patBufFw); - _setLength(r.patFw, dstLen); - return dstLen; + cur_ = 0; } /** - * Parse a single quality string from fb_ and store in r. - * Assume that the next character obtained via fb_.get() is - * the first character of the quality string and the string stops - * at the next char upto (could be tab, newline, etc.). + * The main member function for dispensing pairs of reads or + * singleton reads. Returns true iff ra and rb contain a new + * pair; returns false if ra contains a new unpaired read. */ - int parseQuals(ReadBuf& r, int charsRead, int dstLen, int trim5, - char& c2, char upto = '\t', char upto2 = -1) - { - int qualsRead = 0; - int c = 0; - if (intQuals_) { - char buf[4096]; - while (qualsRead < charsRead) { - vector s_quals; - if(!tokenizeQualLine(fb_, buf, 4096, s_quals)) break; - for (unsigned int j = 0; j < s_quals.size(); ++j) { - char c = intToPhred33(atoi(s_quals[j].c_str()), solQuals_); - assert_geq(c, 33); - if (qualsRead >= trim5) { - size_t off = qualsRead - trim5; - if(off >= 1024) tooManyQualities(r.name); - r.qualBuf[off] = c; - } - ++qualsRead; - } - } // done reading integer quality lines - if (charsRead > qualsRead) tooFewQualities(r.name); - } else { - // Non-integer qualities - while((qualsRead < dstLen + trim5) && c >= 0) { - c = fb_.get(); - c2 = c; - if (c == ' ') wrongQualityFormat(r.name); - if(c < 0) { - // EOF occurred in the middle of a read - abort - return -1; - } - if(!isspace(c) && c != upto && (upto2 == -1 || c != upto2)) { - if (qualsRead >= trim5) { - size_t off = qualsRead - trim5; - if(off >= 1024) tooManyQualities(r.name); - c = charToPhred33(c, solQuals_, phred64Quals_); - assert_geq(c, 33); - r.qualBuf[off] = c; - } - qualsRead++; - } else { - break; - } - } - if(qualsRead != dstLen + trim5) { - assert(false); - } - } - _setBegin (r.qual, (char*)r.qualBuf); - _setLength(r.qual, dstLen); - while(c != upto && (upto2 == -1 || c != upto2)) { - c = fb_.get(); - c2 = c; - } - return qualsRead; + pair nextBatch(PerThreadReadBuf& pt); + + /** + * Make appropriate call into the format layer to parse individual read. + */ + virtual bool parse(Read& ra, Read& rb, TReadId rdid) { + return src_[0]->parse(ra, rb, rdid); } - bool color_; - bool solQuals_; - bool phred64Quals_; - bool intQuals_; +protected: + + volatile uint32_t cur_; // current element in parallel srca_, srcb_ vectors + vector src_; /// PatternSources for paired-end reads }; /** - * Synchronized concrete pattern source for a list of FASTA files where - * reads need to be extracted from long continuous sequences. + * Encapsulates a synchronized source of both paired-end reads and + * unpaired reads, where the paired-end must come from parallel files. */ -class FastaContinuousPatternSource : public BufferedFilePatternSource { +class DualPatternComposer : public PatternComposer { + public: - FastaContinuousPatternSource( - uint32_t seed, - const vector& infiles, - size_t length, - size_t freq, - const char *dumpfile = NULL, - bool verbose = false, - uint32_t skip = 0) : - BufferedFilePatternSource(seed, infiles, NULL, false, - dumpfile, verbose, 0, 0, skip), - length_(length), freq_(freq), - eat_(length_-1), beginning_(true), - nameChars_(0), bufCur_(0), subReadCnt_(0llu) - { - resetForNextFile(); - assert_lt(length_, (size_t)ReadBuf::BUF_SIZE); - } - virtual void reset() { - BufferedFilePatternSource::reset(); - resetForNextFile(); + DualPatternComposer(const vector& srca, + const vector& srcb) : + PatternComposer(), + cur_(0), + srca_(srca), + srcb_(srcb) + { + // srca_ and srcb_ must be parallel + assert_eq(srca_.size(), srcb_.size()); + for(size_t i = 0; i < srca_.size(); i++) { + // Can't have NULL first-mate sources. Second-mate sources + // can be NULL, in the case when the corresponding first- + // mate source is unpaired. + assert(srca_[i] != NULL); + for(size_t j = 0; j < srcb_.size(); j++) { + assert_neq(srca_[i], srcb_[j]); + } + } } -protected: - /** - * "Light" parser. This is inside the critical section, so the key is to do - * just enough parsing so that another function downstream (finalize()) can do - * the rest of the parsing. Really this function's only job is to stick every - * for lines worth of the input file into a buffer (r.readOrigBuf). finalize() - * then parses the contents of r.readOrigBuf later. + * Reset this object and all the PatternSources under it so that + * the next call to nextReadPair gets the very first read pair. */ - pair nextBatchFromFile( - PerThreadReadBuf& pt, - bool batch_a) - { - throw 1; - return make_pair(false, 0); - } - - /// Read another pattern from a FASTA input file - bool parse(ReadBuf& r, ReadBuf& rb) const { - assert(r.empty()); - assert(rb.empty()); - cerr << "In FastaContinuousPatternSource.parse()" << endl; - throw 1; - } - - /// Read another pattern from a FASTA input file - virtual pair readLight(ReadBuf& r) { - while(true) { - int c = fb_.get(); - if(c < 0) { - seqan::clear(r.patFw); - return make_pair(false, true); - } - if(c == '>') { - resetForNextFile(); - c = fb_.peek(); - bool sawSpace = false; - while(c != '\n' && c != '\r') { - if(!sawSpace) { - sawSpace = isspace(c); - } - if(!sawSpace) { - nameBuf_[nameChars_++] = c; - } - fb_.get(); - c = fb_.peek(); - } - while(c == '\n' || c == '\r') { - fb_.get(); - c = fb_.peek(); - } - nameBuf_[nameChars_++] = '_'; - } else { - int cat = dna4Cat[c]; - if(cat == 2) c = 'N'; - if(cat == 0) { - // Encountered non-DNA, non-IUPAC char; skip it - continue; - } else { - // DNA char - buf_[bufCur_++] = c; - if(bufCur_ == 1024) bufCur_ = 0; - if(eat_ > 0) { - eat_--; - // Try to keep readCnt_ aligned with the offset - // into the reference; that let's us see where - // the sampling gaps are by looking at the read - // name - if(!beginning_) readCnt_++; - continue; - } - for(size_t i = 0; i < length_; i++) { - if(length_ - i <= bufCur_) { - c = buf_[bufCur_ - (length_ - i)]; - } else { - // Rotate - c = buf_[bufCur_ - (length_ - i) + 1024]; - } - r.patBufFw [i] = charToDna5[c]; - r.qualBuf[i] = 'I'; - } - _setBegin (r.patFw, (Dna5*)r.patBufFw); - _setLength(r.patFw, length_); - _setBegin (r.qual, r.qualBuf); - _setLength(r.qual, length_); - // Set up a default name if one hasn't been set - for(size_t i = 0; i < nameChars_; i++) { - r.nameBuf[i] = nameBuf_[i]; - } - itoa10((int)(readCnt_ - subReadCnt_), &r.nameBuf[nameChars_]); - _setBegin(r.name, r.nameBuf); - _setLength(r.name, strlen(r.nameBuf)); - eat_ = freq_-1; - readCnt_++; - beginning_ = false; - r.patid = (uint32_t)(readCnt_-1); - break; - } + virtual void reset() { + for(size_t i = 0; i < srca_.size(); i++) { + srca_[i]->reset(); + if(srcb_[i] != NULL) { + srcb_[i]->reset(); } } - return make_pair(true, false); - } - /// Shouldn't ever be here; it's not sensible to obtain read pairs - // from a continuous input. - virtual BoolTriple readPairLight(ReadBuf& ra, ReadBuf& rb) { - cerr << "In FastaContinuousPatternSource.readPair()" << endl; - throw 1; - return BoolTriple(); + cur_ = 0; } /** - * Finishes parsing outside the critical section + * The main member function for dispensing pairs of reads or + * singleton reads. Returns true iff ra and rb contain a new + * pair; returns false if ra contains a new unpaired read. */ - virtual void finalize(ReadBuf& r) const { } + pair nextBatch(PerThreadReadBuf& pt); - /** - * Reset state to be read for the next file. + * Make appropriate call into the format layer to parse individual read. */ - virtual void resetForNextFile() { - eat_ = length_-1; - beginning_ = true; - bufCur_ = nameChars_ = 0; - subReadCnt_ = readCnt_; + virtual bool parse(Read& ra, Read& rb, TReadId rdid) { + return srca_[0]->parse(ra, rb, rdid); } -private: - size_t length_; /// length of reads to generate - size_t freq_; /// frequency to sample reads - size_t eat_; /// number of characters we need to skip before - /// we have flushed all of the ambiguous or - /// non-existent characters out of our read - /// window - bool beginning_; /// skipping over the first read length? - char buf_[1024]; /// read buffer - char nameBuf_[1024];/// read buffer for name of fasta record being - /// split into mers - size_t nameChars_; /// number of name characters copied into buf - size_t bufCur_; /// buffer cursor; points to where we should - /// insert the next character - uint64_t subReadCnt_;/// number to subtract from readCnt_ to get - /// the pat id to output (so it resets to 0 for - /// each new sequence) + +protected: + + volatile uint32_t cur_; // current element in parallel srca_, srcb_ vectors + vector srca_; /// PatternSources for 1st mates and/or unpaired reads + vector srcb_; /// PatternSources for 2nd mates }; /** - * Read a FASTQ-format file. - * See: http://maq.sourceforge.net/fastq.shtml + * Encapsulates a single thread's interaction with the PatternSource. + * Most notably, this class holds the buffers into which the + * PatterSource will write sequences. This class is *not* threadsafe + * - it doesn't need to be since there's one per thread. PatternSource + * is thread-safe. */ -class FastqPatternSource : public CFilePatternSource { +class PatternSourcePerThread { + public: - FastqPatternSource(uint32_t seed, - const vector& infiles, - bool color, - bool randomizeQuals = false, - const char *dumpfile = NULL, - bool verbose = false, - int trim3 = 0, - int trim5 = 0, - bool solexa_quals = false, - bool phred64Quals = false, - bool integer_quals = false, - uint32_t skip = 0) : - CFilePatternSource(seed, infiles, NULL, randomizeQuals, - dumpfile, verbose, - trim3, trim5, skip), - first_(true), - solQuals_(solexa_quals), - phred64Quals_(phred64Quals), - intQuals_(integer_quals), - color_(color) - { } - virtual void reset() { - first_ = true; - CFilePatternSource::reset(); - } -protected: - + PatternSourcePerThread( + PatternComposer& composer, + uint32_t max_buf, + uint32_t seed) : + composer_(composer), + buf_(max_buf), + last_batch_(false), + last_batch_size_(0), + seed_(seed) { } + /** - * "Light" parser. This is inside the critical section, so the key is to do - * just enough parsing so that another function downstream (finalize()) can do - * the rest of the parsing. Really this function's only job is to stick every - * for lines worth of the input file into a buffer (r.readOrigBuf). finalize() - * then parses the contents of r.readOrigBuf later. + * Get the next paired or unpaired read from the wrapped + * PatternComposer. Returns a pair of bools; first indicates + * whether we were successful, second indicates whether we're + * done. */ - pair nextBatchFromFile( - PerThreadReadBuf& pt, - bool batch_a) - { - int c = 0; - vector& readBuf = batch_a ? pt.bufa_ : pt.bufb_; - size_t& len = readBuf[0].readOrigBufLen; - len = 0; - if(first_) { - c = getc_unlocked(fp_); - while(c == '\r' || c == '\n') { - c = getc_unlocked(fp_); - } - if(c != '@') { - cerr << "Error: reads file does not look like a FASTQ file" << endl; - throw 1; - } - readBuf[0].readOrigBuf[len++] = c; - assert_eq('@', c); - first_ = false; - } - // Note: to reduce the number of times we have to enter the critical - // section (each entrance has some assocaited overhead), we could populate - // the buffer with several reads worth of data here, instead of just one. - bool done = false, aborted = false; - size_t readi = 0; - // Read until we run out of input or until we've filled the buffer - for(; readi < pt.max_buf_ && !done; readi++) { - char* buf = readBuf[readi].readOrigBuf; - assert(readi == 0 || strlen(buf) == 0); - if(readi > 0) - { - len = readBuf[readi].readOrigBufLen; - len = 0; - } - int newlines = 4; - while(newlines) { - c = getc_unlocked(fp_); - done = c < 0; - if(c == '\n' || (done && newlines == 1)) { - newlines--; - c = '\n'; - } else if(done) { - //return make_pair(false, true); - aborted = true; //Unexpected EOF - break; - } - buf[len++] = c; - } - readCnt_++; - readBuf[readi].patid = readCnt_-1; - } - if(aborted) { - readi--; - readCnt_--; - } - return make_pair(done, readi); - //return make_pair(false, 0); - } - - /// Read another pattern from a FASTQ input file - bool parse(ReadBuf& r, ReadBuf& rb) const { - assert(strlen(r.readOrigBuf) != 0); - assert(r.empty()); - int c; - size_t name_len = 0; - size_t cur = 1; - r.color = color_; - r.primer = -1; - - // Parse read name - while(true) { - assert(cur < r.readOrigBufLen); - c = r.readOrigBuf[cur++]; - if(c == '\n' || c == '\r') { - do { - c = r.readOrigBuf[cur++]; - } while(c == '\n' || c == '\r'); - break; - } - r.nameBuf[name_len++] = c; - } - _setBegin(r.name, r.nameBuf); - _setLength(r.name, name_len); - - if(color_) { - // May be a primer character. If so, keep it 'primer' field - // of read buf and parse the rest of the read without it. - // c = toupper(c); - // if(asc2dnacat[c] > 0) { - // // First char is a DNA char - // int c2 = toupper(fb_.peek()); - // // Second char is a color char - // if(asc2colcat[c2] > 0) { - // r.primer = c; - // r.trimc = c2; - // mytrim5 += 2; // trim primer and first color - // } - // } - // if(c < 0) { bail(r); return; } - throw 1; - } - - // Parse sequence - int nchar = 0; - uint8_t *seqbuf = r.patBufFw; - while(c != '+') { - if(c == '.') { - c = 'N'; - } - if(color_ && c >= '0' && c <= '4') { - c = "ACGTN"[(int)c - '0']; - } - if(isalpha(c)) { - // If it's past the 5'-end trim point - if(nchar++ >= trim5_) { - *seqbuf++ = charToDna5[c]; - } - } - assert(cur < r.readOrigBufLen); - c = r.readOrigBuf[cur++]; - } - int seq_len = (int)(seqbuf - r.patBufFw); - r.trimmed5 = (int)(nchar - seq_len); - r.trimmed3 = min(seq_len, trim3_); - seq_len = max(seq_len - trim3_, 0); - _setBegin(r.patFw, (Dna5*)r.patBufFw); - _setLength(r.patFw, seq_len); - - assert_eq('+', c); - do { - assert(cur < r.readOrigBufLen); - c = r.readOrigBuf[cur++]; - } while(c != '\n' && c != '\r'); - do { - assert(cur < r.readOrigBufLen); - c = r.readOrigBuf[cur++]; - } while(c == '\n' || c == '\r'); - - // Now we're on the next non-blank line after the + line - if(seq_len == 0) { - return true; // done parsing empty read - } - - int nqual = 0; - char *qualbuf = r.qualBuf; - if (intQuals_) { - // TODO: must implement this for compatibility with other Bowtie - throw 1; // not yet implemented - } else { - while(c != '\r' && c != '\n') { - c = charToPhred33(c, solQuals_, phred64Quals_); - if(c == ' ') { - wrongQualityFormat(r.name); - return false; - } - if(nqual++ >= r.trimmed5) { - *qualbuf++ = c; - } - c = r.readOrigBuf[cur++]; - } - int qual_len = (int)(qualbuf - r.qualBuf); - qual_len = max(0, qual_len - r.trimmed3); - if(qual_len < seq_len) { - tooFewQualities(r.name); - return false; - } else if(qual_len > seq_len+1) { - tooManyQualities(r.name); - return false; - } - _setBegin(r.qual, (char*)r.qualBuf); - _setLength(r.qual, seq_len); - } - // Set up a default name if one hasn't been set - if(name_len == 0) { - itoa10(static_cast(readCnt_), r.nameBuf); - _setBegin(r.name, r.nameBuf); - name_len = (int)strlen(r.nameBuf); - _setLength(r.name, name_len); - } - if(strlen(rb.readOrigBuf) != 0 && empty(rb.patFw)) { - return parse(rb, r); - } - return true; - } + std::pair nextReadPair(); - pair readLight(ReadBuf& r); - - void finalize(ReadBuf &r) const; - - /// Read another pattern from a FASTQ input file -// virtual void readHeavy(ReadBuf& r, uint32_t& patid) { -// const int bufSz = ReadBuf::BUF_SIZE; -// while(true) { -// int c; -// int dstLen = 0; -// int nameLen = 0; -// r.color = color_; -// r.primer = -1; -// // Pick off the first at -// if(first_) { -// c = fb_.get(); -// if(c != '@') { -// c = getOverNewline(fb_); -// if(c < 0) { bail(r); return; } -// } -// if(c != '@') { -// cerr << "Error: reads file does not look like a FASTQ file" << endl; -// throw 1; -// } -// assert_eq('@', c); -// first_ = false; -// } -// -// // Read to the end of the id line, sticking everything after the '@' -// // into *name -// while(true) { -// c = fb_.get(); -// if(c < 0) { bail(r); return; } -// if(c == '\n' || c == '\r') { -// // Break at end of line, after consuming all \r's, \n's -// while(c == '\n' || c == '\r') { -// c = fb_.get(); -// if(c < 0) { bail(r); return; } -// } -// break; -// } -// r.nameBuf[nameLen++] = c; -// if(nameLen > bufSz-2) { -// // Too many chars in read name; print friendly error message -// _setBegin(r.name, r.nameBuf); -// _setLength(r.name, nameLen); -// cerr << "FASTQ read name is too long; read names must be " << (bufSz-2) << " characters or fewer." << endl; -// cerr << "Beginning of bad read name: " << r.name << endl; -// throw 1; -// } -// } -// _setBegin(r.name, r.nameBuf); -// assert_leq(nameLen, bufSz-2); -// _setLength(r.name, nameLen); -// // c now holds the first character on the line after the -// // @name line -// -// // fb_ now points just past the first character of a -// // sequence line, and c holds the first character -// int charsRead = 0; -// uint8_t *sbuf = r.patBufFw; -// int dstLens[] = {0, 0, 0, 0}; -// int *dstLenCur = &dstLens[0]; -// int mytrim5 = this->trim5_; -// if(color_) { -// // This may be a primer character. If so, keep it in the -// // 'primer' field of the read buf and parse the rest of the -// // read without it. -// c = toupper(c); -// if(asc2dnacat[c] > 0) { -// // First char is a DNA char -// int c2 = toupper(fb_.peek()); -// // Second char is a color char -// if(asc2colcat[c2] > 0) { -// r.primer = c; -// r.trimc = c2; -// mytrim5 += 2; // trim primer and first color -// } -// } -// if(c < 0) { bail(r); return; } -// } -// int trim5 = mytrim5; -// if(c == '+') { -// // Read had length 0; print warning (if not quiet) and quit -// if(!quiet) { -// cerr << "Warning: Skipping read (" << r.name << ") because it had length 0" << endl; -// } -// peekToEndOfLine(fb_); -// fb_.get(); -// continue; -// } -// while(c != '+') { -// // Convert color numbers to letters if necessary -// if(color_) { -// if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0']; -// } -// if(c == '.') c = 'N'; -// if(isalpha(c)) { -// // If it's past the 5'-end trim point -// assert_in(toupper(c), "ACGTN"); -// if(charsRead >= trim5) { -// if((*dstLenCur) >= 1024) tooManySeqChars(r.name); -// sbuf[(*dstLenCur)++] = charToDna5[c]; -// } -// charsRead++; -// } -// c = fb_.get(); -// if(c < 0) { bail(r); return; } -// } -// // Trim from 3' end -// dstLen = dstLens[0]; -// charsRead = dstLen + mytrim5; -// dstLen -= this->trim3_; -// // Set trimmed bounds of buffers -// _setBegin(r.patFw, (Dna5*)r.patBufFw); -// _setLength(r.patFw, dstLen); -// assert_eq('+', c); -// -// // Chew up the optional name on the '+' line -// peekToEndOfLine(fb_); -// -// // Now read the qualities -// if (intQuals_) { -// int qualsRead = 0; -// char buf[4096]; -// if(color_ && r.primer != -1) mytrim5--; -// while (qualsRead < charsRead) { -// vector s_quals; -// if(!tokenizeQualLine(fb_, buf, 4096, s_quals)) break; -// for (unsigned int j = 0; j < s_quals.size(); ++j) { -// char c = intToPhred33(atoi(s_quals[j].c_str()), solQuals_); -// assert_geq(c, 33); -// if (qualsRead >= mytrim5) { -// size_t off = qualsRead - mytrim5; -// if(off >= 1024) tooManyQualities(r.name); -// r.qualBuf[off] = c; -// } -// ++qualsRead; -// } -// } // done reading integer quality lines -// if(color_ && r.primer != -1) mytrim5++; -// if(qualsRead < charsRead-mytrim5) { -// tooFewQualities(r.name); -// } else if(qualsRead > charsRead-mytrim5+1) { -// tooManyQualities(r.name); -// } -// if(qualsRead == charsRead-mytrim5+1 && color_ && r.primer != -1) { -// for(int i = 0; i < qualsRead-1; i++) { -// r.qualBuf[i] = r.qualBuf[i+1]; -// } -// } -// _setBegin(r.qual, (char*)r.qualBuf); -// _setLength(r.qual, dstLen); -// peekOverNewline(fb_); -// } else { -// // Non-integer qualities -// char *qbuf = r.qualBuf; -// trim5 = mytrim5; -// if(color_ && r.primer != -1) trim5--; -// int itrim5 = trim5; -// int qualsRead[4] = {0, 0, 0, 0}; -// int *qualsReadCur = &qualsRead[0]; -// while(true) { -// c = fb_.get(); -// if(c == ' ') { -// wrongQualityFormat(r.name); -// } -// if(c < 0) { bail(r); return; } -// if (c != '\r' && c != '\n') { -// if (*qualsReadCur >= trim5) { -// size_t off = (*qualsReadCur) - trim5; -// if(off >= 1024) tooManyQualities(r.name); -// c = charToPhred33(c, solQuals_, phred64Quals_); -// assert_geq(c, 33); -// qbuf[off] = c; -// } -// (*qualsReadCur)++; -// } else { -// break; -// } -// } -// qualsRead[0] -= this->trim3_; -// int qRead = 0; -// if (qualsRead[0] > itrim5) -// qRead = (int)(qualsRead[0] - itrim5); -// if(qRead < dstLen) { -// tooFewQualities(r.name); -// } else if(qRead > dstLen+1) { -// tooManyQualities(r.name); -// } -// if(qRead == dstLen+1 && color_ && r.primer != -1) { -// for(int i = 0; i < dstLen; i++) { -// r.qualBuf[i] = r.qualBuf[i+1]; -// } -// } -// _setBegin (r.qual, (char*)r.qualBuf); -// _setLength(r.qual, dstLen); -// -// if(c == '\r' || c == '\n') { -// c = peekOverNewline(fb_); -// } else { -// c = peekToEndOfLine(fb_); -// } -// } -// r.readOrigBufLen = fb_.copyLastN(r.readOrigBuf); -// fb_.resetLastN(); -// -// c = fb_.get(); -// assert(c == -1 || c == '@'); -// -// // Set up a default name if one hasn't been set -// if(nameLen == 0) { -// itoa10((int)readCnt_, r.nameBuf); -// _setBegin(r.name, r.nameBuf); -// nameLen = (int)strlen(r.nameBuf); -// _setLength(r.name, nameLen); -// } -// r.trimmed3 = this->trim3_; -// r.trimmed5 = mytrim5; -// assert_gt(nameLen, 0); -// readCnt_++; -// patid = (uint32_t)(readCnt_-1); -// return; -// } -// } - /// Read another read pair from a FASTQ input file - virtual BoolTriple readPairLight(ReadBuf& ra, ReadBuf& rb) { - // (For now, we shouldn't ever be here) - cerr << "In FastqPatternSource.readPair()" << endl; - throw 1; - return BoolTriple(); + Read& bufa() { return buf_.read_a(); } + Read& bufb() { return buf_.read_b(); } + + const Read& bufa() const { return buf_.read_a(); } + const Read& bufb() const { return buf_.read_b(); } + + TReadId rdid() const { return buf_.rdid(); } + + /** + * Return true iff the read currently in the buffer is a + * paired-end read. + */ + bool paired() const { + // can also do buf_.read_b().mate > 0, but the mate + // field isn't set until finalize is called, whereas + // parsed is set by the time parse() is finished. + return buf_.read_b().parsed; } - virtual void resetForNextFile() { - first_ = true; - } +private: - virtual void dump(ostream& out, - const String& seq, - const String& qual, - const String& name) - { - out << "@" << name << endl << seq << endl << "+" << endl << qual << endl; + /** + * When we've finished fully parsing and dishing out reads in + * the current batch, we go get the next one by calling into + * the composition layer. + */ + std::pair nextBatch() { + buf_.reset(); + std::pair res = composer_.nextBatch(buf_); + buf_.init(); + return res; } -private: + /** + * Once name/sequence/qualities have been parsed for an + * unpaired read, set all the other key fields of the Read + * struct. + */ + void finalize(Read& ra); /** - * Do things we need to do if we have to bail in the middle of a - * read, usually because we reached the end of the input without - * finishing. + * Once name/sequence/qualities have been parsed for a + * paired-end read, set all the other key fields of the Read + * structs. + */ + void finalizePair(Read& ra, Read& rb); + + /** + * Call into composition layer (which in turn calls into + * format layer) to parse the read. */ - void bail(ReadBuf& r) { - seqan::clear(r.patFw); - r.readOrigBufLen = 0; + bool parse(Read& ra, Read& rb) { + return composer_.parse(ra, rb, buf_.rdid()); } - bool first_; - bool solQuals_; - bool phred64Quals_; - bool intQuals_; - bool color_; + PatternComposer& composer_; // pattern composer + PerThreadReadBuf buf_; // read data buffer + bool last_batch_; // true if this is final batch + int last_batch_size_; // # reads read in previous batch + uint32_t seed_; // pseudo-random seed based on read content }; /** - * Read a Raw-format file (one sequence per line). No quality strings - * allowed. All qualities are assumed to be 'I' (40 on the Phred-33 - * scale). + * Abstract parent factory for PatternSourcePerThreads. */ -class RawPatternSource : public BufferedFilePatternSource { - +class PatternSourcePerThreadFactory { public: + PatternSourcePerThreadFactory( + PatternComposer& composer, + uint32_t max_buf, + uint32_t seed): + composer_(composer), + max_buf_(max_buf), + seed_(seed) {} - RawPatternSource(uint32_t seed, - const vector& infiles, - bool color, - bool randomizeQuals = false, - const char *dumpfile = NULL, - bool verbose = false, - int trim3 = 0, - int trim5 = 0, - uint32_t skip = 0) : - BufferedFilePatternSource(seed, infiles, NULL, randomizeQuals, - dumpfile, verbose, trim3, trim5, skip), - first_(true), color_(color) - { } - - virtual void reset() { - first_ = true; - BufferedFilePatternSource::reset(); + /** + * Create a new heap-allocated PatternSourcePerThreads. + */ + virtual PatternSourcePerThread* create() const { + return new PatternSourcePerThread(composer_, max_buf_, seed_); } -protected: - /** - * "Light" parser. This is inside the critical section, so the key is to do - * just enough parsing so that another function downstream (finalize()) can do - * the rest of the parsing. Really this function's only job is to stick every - * for lines worth of the input file into a buffer (r.readOrigBuf). finalize() - * then parses the contents of r.readOrigBuf later. + * Create a new heap-allocated vector of heap-allocated + * PatternSourcePerThreads. */ - pair nextBatchFromFile( - PerThreadReadBuf& pt, - bool batch_a) - { - throw 1; - return make_pair(false, 0); + virtual std::vector* create(uint32_t n) const { + std::vector* v = new std::vector; + for(size_t i = 0; i < n; i++) { + v->push_back(new PatternSourcePerThread(composer_, max_buf_, seed_)); + assert(v->back() != NULL); + } + return v; } - /// Read another pattern from a FASTA input file - bool parse(ReadBuf& r, ReadBuf& rb) const { - cerr << "In RawPatternSource.parse()" << endl; - throw 1; - return false; + /// Free memory associated with a pattern source + virtual void destroy(PatternSourcePerThread* composer) const { + assert(composer != NULL); + // Free the PatternSourcePerThread + delete composer; } - /// Read another pattern from a Raw input file - virtual pair readLight(ReadBuf& r) { - int c; - int dstLen = 0; - int nameLen = 0; - c = getOverNewline(this->fb_); - if(c < 0) { bail(r); return make_pair(false, false); } - assert(!isspace(c)); - r.color = color_; - int mytrim5 = this->trim5_; - if(first_) { - // Check that the first character is sane for a raw file - int cc = c; - if(color_) { - if(cc >= '0' && cc <= '4') cc = "ACGTN"[(int)cc - '0']; - if(cc == '.') cc = 'N'; - } - if(dna4Cat[cc] == 0) { - cerr << "Error: reads file does not look like a Raw file" << endl; - if(c == '>') { - cerr << "Reads file looks like a FASTA file; please use -f" << endl; - } - if(c == '@') { - cerr << "Reads file looks like a FASTQ file; please use -q" << endl; - } - throw 1; - } - first_ = false; - } - if(color_) { - // This may be a primer character. If so, keep it in the - // 'primer' field of the read buf and parse the rest of the - // read without it. - c = toupper(c); - if(asc2dnacat[c] > 0) { - // First char is a DNA char - int c2 = toupper(fb_.peek()); - // Second char is a color char - if(asc2colcat[c2] > 0) { - r.primer = c; - r.trimc = c2; - mytrim5 += 2; // trim primer and first color - } - } - if(c < 0) { bail(r); return make_pair(false, false); } - } - // _in now points just past the first character of a sequence - // line, and c holds the first character - while(!isspace(c) && c >= 0) { - if(color_) { - if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0']; + /// Free memory associated with a pattern source list + virtual void destroy(std::vector* composers) const { + assert(composers != NULL); + // Free all of the PatternSourcePerThreads + for(size_t i = 0; i < composers->size(); i++) { + if((*composers)[i] != NULL) { + delete (*composers)[i]; + (*composers)[i] = NULL; } - if(c == '.') c = 'N'; - if(isalpha(c) && dstLen >= mytrim5) { - size_t len = dstLen - mytrim5; - if(len >= 1024) tooManyQualities(String("(no name)")); - r.patBufFw [len] = charToDna5[c]; - r.qualBuf[len] = 'I'; - dstLen++; - } else if(isalpha(c)) dstLen++; - if(isspace(fb_.peek())) break; - c = fb_.get(); } - if(dstLen >= (this->trim3_ + mytrim5)) { - dstLen -= (this->trim3_ + mytrim5); - } else { - dstLen = 0; - } - _setBegin (r.patFw, (Dna5*)r.patBufFw); - _setLength(r.patFw, dstLen); - _setBegin (r.qual, r.qualBuf); - _setLength(r.qual, dstLen); - - c = peekToEndOfLine(fb_); - r.trimmed3 = this->trim3_; - r.trimmed5 = mytrim5; - r.readOrigBufLen = fb_.copyLastN(r.readOrigBuf); - fb_.resetLastN(); - - // Set up name - itoa10((int)readCnt_, r.nameBuf); - _setBegin(r.name, r.nameBuf); - nameLen = (int)strlen(r.nameBuf); - _setLength(r.name, nameLen); - readCnt_++; - - r.patid = (uint32_t)(readCnt_-1); - return make_pair(true, false); - } - - /// Read another read pair from a FASTQ input file - virtual BoolTriple readPairLight(ReadBuf& ra, ReadBuf& rb) { - // (For now, we shouldn't ever be here) - cerr << "In RawPatternSource.readPair()" << endl; - throw 1; - return BoolTriple(); - } - - /** - * Finishes parsing outside the critical section - */ - virtual void finalize(ReadBuf& r) const { } - - virtual void resetForNextFile() { - first_ = true; - } - - virtual void dump(ostream& out, - const String& seq, - const String& qual, - const String& name) - { - out << seq << endl; + // Free the vector + delete composers; } - private: - /** - * Do things we need to do if we have to bail in the middle of a - * read, usually because we reached the end of the input without - * finishing. - */ - void bail(ReadBuf& r) { - seqan::clear(r.patFw); - fb_.resetLastN(); - } - - bool first_; - bool color_; + /// Container for obtaining paired reads from PatternSources + PatternComposer& composer_; + /// Maximum size of batch to read in + uint32_t max_buf_; + uint32_t seed_; }; #endif /*PAT_H_*/ diff --git a/range_cache.h b/range_cache.h index 8b662cc..ad7421f 100644 --- a/range_cache.h +++ b/range_cache.h @@ -504,7 +504,7 @@ class RangeCache { TIndexOffU jumps = 0; if(tops.size() > 0) { entTop = tops.back(); - jumps = tops.size(); + jumps = (TIndexOff)tops.size(); } // Cache the entry for the end of the tunnel assert(map_.find(entTop) == map_.end()); diff --git a/range_source.h b/range_source.h index 1b2c3de..c19644f 100644 --- a/range_source.h +++ b/range_source.h @@ -1587,7 +1587,7 @@ class RangeSource { virtual ~RangeSource() { } /// Set query to find ranges for - virtual void setQuery(ReadBuf& r, Range *partial) = 0; + virtual void setQuery(Read& r, Range *partial) = 0; /// Set up the range search. virtual void initBranch(PathManager& pm) = 0; /// Advance the range search by one memory op @@ -1752,8 +1752,8 @@ class SingleRangeSourceDriver : public RangeSourceDriver { */ virtual void setQueryImpl(PatternSourcePerThread* patsrc, Range *r) { this->done = false; - pm_.reset(patsrc->patid()); - ReadBuf* buf = mate1_ ? &patsrc->bufa() : &patsrc->bufb(); + pm_.reset((uint32_t)patsrc->rdid()); + Read* buf = mate1_ ? &patsrc->bufa() : &patsrc->bufb(); len_ = buf->length(); rs_->setQuery(*buf, r); initRangeSource((fw_ == ebwtFw_) ? buf->qual : buf->qualRev);