Permalink
Browse files

Add support for interleaved FASTQ input (--interleaved option)

  • Loading branch information...
1 parent fcb1579 commit 97dd7107f4539de9895f5f0866c83aa40f3888c8 @ch4rr0 ch4rr0 committed Jun 5, 2017
Showing with 8,053 additions and 10 deletions.
  1. +12 −2 ebwt_search.cpp
  2. +1 −0 formats.h
  3. +1 −0 hit.h
  4. +24 −8 pat.cpp
  5. +3 −0 pat.h
  6. +8,000 −0 reads/e_coli_1000_interleaved.fq
  7. +12 −0 scripts/test/simple_tests.pl
View
@@ -338,7 +338,8 @@ enum {
ARG_QUALS2,
ARG_ALLOW_CONTAIN,
ARG_COLOR_PRIMER,
- ARG_WRAPPER
+ ARG_WRAPPER,
+ ARG_INTERLEAVED_FASTQ,
};
static struct option long_options[] = {
@@ -444,6 +445,7 @@ static struct option long_options[] = {
{(char*)"allow-contain",no_argument, 0, ARG_ALLOW_CONTAIN},
{(char*)"col-primer", no_argument, 0, ARG_COLOR_PRIMER},
{(char*)"wrapper", required_argument, 0, ARG_WRAPPER},
+ {(char*)"interleaved", required_argument, 0, ARG_INTERLEAVED_FASTQ},
{(char*)0, 0, 0, 0} // terminator
};
@@ -461,14 +463,15 @@ static void printUsage(ostream& out) {
}
out << "Usage: " << endl
- << tool_name << " [options]* <ebwt> {-1 <m1> -2 <m2> | --12 <r> | <s>} [<hit>]" << endl
+ << tool_name << " [options]* <ebwt> {-1 <m1> -2 <m2> | --12 <r> | --interleaved <i> | <s>} [<hit>]" << endl
<< endl
<< " <m1> Comma-separated list of files containing upstream mates (or the" << endl
<< " sequences themselves, if -c is set) paired with mates in <m2>" << endl
<< " <m2> Comma-separated list of files containing downstream mates (or the" << endl
<< " sequences themselves if -c is set) paired with mates in <m1>" << endl
<< " <r> Comma-separated list of files containing Crossbow-style reads. Can be" << endl
<< " a mixture of paired and unpaired. Specify \"-\" for stdin." << endl
+ << " <i> Files with interleaved paired-end FASTQ reads." << endl
<< " <s> Comma-separated list of files containing unpaired reads, or the" << endl
<< " sequences themselves, if -c is set. Specify \"-\" for stdin." << endl
<< " <hit> File to write hits to (default: stdout)" << endl
@@ -647,6 +650,7 @@ static void parseOptions(int argc, const char **argv) {
case '1': tokenize(optarg, ",", mates1); break;
case '2': tokenize(optarg, ",", mates2); break;
case ARG_ONETWO: tokenize(optarg, ",", mates12); format = TAB_MATE; break;
+ case ARG_INTERLEAVED_FASTQ: tokenize(optarg, ",", mates12); format = INTERLEAVED; break;
case 'f': format = FASTA; break;
case 'F': {
format = FASTA_CONT;
@@ -2592,6 +2596,12 @@ patsrcFromStrings(int format,
trim3, trim5,
solexaQuals, phred64Quals,
integerQuals);
+ case INTERLEAVED:
+ return new FastqPatternSource (reads, color,
+ patDumpfile,
+ trim3, trim5,
+ solexaQuals, phred64Quals,
+ integerQuals, true /* is interleaved */);
case TAB_MATE:
return new TabbedPatternSource(reads, false, color,
patDumpfile,
View
@@ -12,6 +12,7 @@ enum file_format {
FASTA = 1,
FASTA_CONT,
FASTQ,
+ INTERLEAVED,
TAB_MATE,
RAW,
CMDLINE,
View
1 hit.h
@@ -882,6 +882,7 @@ class HitSinkPerThread {
hitsForThisRead_(),
_max(max),
_n(n),
+ defaultMapq_(defaultMapq),
threadId_(threadId)
{
sink.addWrapper();
View
32 pat.cpp
@@ -906,7 +906,7 @@ pair<bool, int> FastqPatternSource::nextBatchFromFile(
bool batch_a)
{
int c = 0;
- vector<Read>& readBuf = batch_a ? pt.bufa_ : pt.bufb_;
+ vector<Read>* readBuf = batch_a ? &pt.bufa_ : &pt.bufb_;
if(first_) {
c = getc_wrapper();
while(c == '\r' || c == '\n') {
@@ -917,15 +917,15 @@ pair<bool, int> FastqPatternSource::nextBatchFromFile(
throw 1;
}
first_ = false;
- readBuf[0].readOrigBuf[0] = c;
- readBuf[0].readOrigBufLen = 1;
+ (*readBuf)[0].readOrigBuf[0] = c;
+ (*readBuf)[0].readOrigBufLen = 1;
}
bool done = false, aborted = false;
size_t readi = 0;
// Read until we run out of input or until we've filled the buffer
- for(; readi < pt.max_buf_ && !done; readi++) {
- char* buf = readBuf[readi].readOrigBuf;
- assert(readi == 0 || readBuf[readi].readOrigBufLen == 0);
+ while (readi < pt.max_buf_ && !done) {
+ char* buf = (*readBuf)[readi].readOrigBuf;
+ assert(readi == 0 || (*readBuf)[readi].readOrigBufLen == 0);
int newlines = 4;
while(newlines) {
c = getc_wrapper();
@@ -936,10 +936,26 @@ pair<bool, int> FastqPatternSource::nextBatchFromFile(
newlines--;
c = '\n';
} else if(done) {
- aborted = true; // Unexpected EOF
+ if (newlines == 4) {
+ newlines = 0;
+ } else {
+ aborted = true; // Unexpected EOF
+ }
break;
}
- buf[readBuf[readi].readOrigBufLen++] = c;
+ buf[(*readBuf)[readi].readOrigBufLen++] = c;
+ }
+ if (c > 0) {
+ if (interleaved_) {
+ // alternate between read buffers
+ batch_a = !batch_a;
+ readBuf = batch_a ? &pt.bufa_ : &pt.bufb_;
+ // increment read counter after each pair gets read
+ readi = batch_a ? readi + 1 : readi;
+ }
+ else {
+ readi++;
+ }
}
}
if(aborted) {
View
3 pat.h
@@ -934,6 +934,7 @@ class FastqPatternSource : public CFilePatternSource {
bool solexa_quals = false,
bool phred64Quals = false,
bool integer_quals = false,
+ bool interleaved = false,
uint32_t skip = 0) :
CFilePatternSource(
infiles,
@@ -945,6 +946,7 @@ class FastqPatternSource : public CFilePatternSource {
solQuals_(solexa_quals),
phred64Quals_(phred64Quals),
intQuals_(integer_quals),
+ interleaved_(interleaved),
color_(color) { }
virtual void reset() {
@@ -988,6 +990,7 @@ class FastqPatternSource : public CFilePatternSource {
bool solQuals_;
bool phred64Quals_;
bool intQuals_;
+ bool interleaved_;
bool color_;
};
Oops, something went wrong.

0 comments on commit 97dd710

Please sign in to comment.