Permalink
Fetching contributors…
Cannot retrieve contributors at this time
1445 lines (1388 sloc) 37.2 KB
#include <cmath>
#include <iostream>
#include <string>
#include <stdexcept>
#include <seqan/sequence.h>
#include <seqan/file.h>
#include "pat.h"
#include "filebuf.h"
using namespace std;
using namespace seqan;
/**
* Calculate a per-read random seed based on a combination of
* the read data (incl. sequence, name, quals) and the global
* seed in '_randSeed'.
*/
static uint32_t genRandSeed(
const String<Dna5>& qry,
const String<char>& qual,
const String<char>& name,
uint32_t seed)
{
// Calculate a per-read random seed based on a combination of
// the read data (incl. sequence, name, quals) and the global
// seed
uint32_t rseed = (seed + 101) * 59 * 61 * 67 * 71 * 73 * 79 * 83;
size_t qlen = seqan::length(qry);
// Throw all the characters of the read into the random seed
for(size_t i = 0; i < qlen; i++) {
int p = (int)qry[i];
assert_leq(p, 4);
size_t off = ((i & 15) << 1);
rseed ^= (p << off);
}
// Throw all the quality values for the read into the random
// seed
for(size_t i = 0; i < qlen; i++) {
int p = (int)qual[i];
assert_leq(p, 255);
size_t off = ((i & 3) << 3);
rseed ^= (p << off);
}
// Throw all the characters in the read name into the random
// seed
size_t namelen = seqan::length(name);
for(size_t i = 0; i < namelen; i++) {
int p = (int)name[i];
assert_leq(p, 255);
size_t off = ((i & 3) << 3);
rseed ^= (p << off);
}
return rseed;
}
/**
* Once name/sequence/qualities have been parsed for an
* unpaired read, set all the other key fields of the Read
* struct.
*/
void PatternSourcePerThread::finalize(Read& ra) {
ra.mate = 0;
ra.constructRevComps();
ra.constructReverses();
ra.seed = genRandSeed(ra.patFw, ra.qual, ra.name, seed_);
}
/**
* Once name/sequence/qualities have been parsed for a
* paired-end read, set all the other key fields of the Read
* structs.
*/
void PatternSourcePerThread::finalizePair(Read& ra, Read& rb) {
ra.mate = 1;
ra.constructRevComps();
ra.constructReverses();
ra.fixMateName(1);
ra.seed = genRandSeed(ra.patFw, ra.qual, ra.name, seed_);
rb.mate = 2;
rb.constructRevComps();
rb.constructReverses();
rb.fixMateName(2);
rb.seed = genRandSeed(rb.patFw, rb.qual, rb.name, seed_);
}
/**
* Get the next paired or unpaired read from the wrapped
* PatternComposer. Returns a pair of bools; first indicates
* whether we were successful, second indicates whether we're
* done.
*/
pair<bool, bool> PatternSourcePerThread::nextReadPair() {
// Prepare batch
if(buf_.exhausted()) {
pair<bool, int> res = nextBatch();
if(res.first && res.second == 0) {
return make_pair(false, true);
}
last_batch_ = res.first;
last_batch_size_ = res.second;
assert_eq(0, buf_.cur_buf_);
} else {
buf_.next(); // advance cursor
assert_gt(buf_.cur_buf_, 0);
}
bool this_is_last = buf_.cur_buf_ == last_batch_size_-1;
if(buf_.rdid() < skip_) {
return make_pair(false, this_is_last ? last_batch_ : false);
}
// Parse read/pair
assert(strlen(buf_.read_a().readOrigBuf) != 0);
assert(buf_.read_a().empty());
if(!parse(buf_.read_a(), buf_.read_b())) {
return make_pair(false, false);
}
// Finalize read/pair
if(paired()) {
finalizePair(buf_.read_a(), buf_.read_b());
} else {
finalize(buf_.read_a());
}
return make_pair(true, this_is_last ? last_batch_ : false);
}
/**
* The main member function for dispensing pairs of reads or
* singleton reads. Returns true iff ra and rb contain a new
* pair; returns false if ra contains a new unpaired read.
*/
pair<bool, int> SoloPatternComposer::nextBatch(PerThreadReadBuf& pt) {
uint32_t cur = cur_;
while(cur < src_.size()) {
// Patterns from srca_[cur_] are unpaired
pair<bool, int> res;
do {
res = src_[cur]->nextBatch(
pt,
true, // batch A (or pairs)
true); // grab lock below
} while(!res.first && res.second == 0);
if(res.second == 0) {
ThreadSafe ts(&mutex_m);
if(cur + 1 > cur_) {
cur_++;
}
cur = cur_;
continue; // on to next pair of PatternSources
}
return res;
}
assert_leq(cur, src_.size());
return make_pair(true, 0);
}
/**
* The main member function for dispensing pairs of reads or
* singleton reads. Returns true iff ra and rb contain a new
* pair; returns false if ra contains a new unpaired read.
*/
pair<bool, int> DualPatternComposer::nextBatch(PerThreadReadBuf& pt) {
// 'cur' indexes the current pair of PatternSources
uint32_t cur = cur_;
while(cur < srca_.size()) {
if(srcb_[cur] == NULL) {
pair<bool, int> res = srca_[cur]->nextBatch(
pt,
true, // batch A (or pairs)
true); // grab lock below
bool done = res.first;
if(!done && res.second == 0) {
ThreadSafe ts(&mutex_m);
if(cur + 1 > cur_) cur_++;
cur = cur_; // Move on to next PatternSource
continue; // on to next pair of PatternSources
}
return make_pair(done, res.second);
} else {
pair<bool, int> resa, resb;
// Lock to ensure that this thread gets parallel reads
// in the two mate files
{
ThreadSafe ts(&mutex_m);
resa = srca_[cur]->nextBatch(
pt,
true, // batch A
false); // don't grab lock below
resb = srcb_[cur]->nextBatch(
pt,
false, // batch B
false); // don't grab lock below
assert_eq(srca_[cur]->readCount(),
srcb_[cur]->readCount());
}
if(resa.second < resb.second) {
cerr << "Error, fewer reads in file specified with -1 "
<< "than in file specified with -2" << endl;
throw 1;
} else if(resa.second == 0 && resb.second == 0) {
ThreadSafe ts(&mutex_m);
if(cur + 1 > cur_) {
cur_++;
}
cur = cur_; // Move on to next PatternSource
continue; // on to next pair of PatternSources
} else if(resb.second < resa.second) {
cerr << "Error, fewer reads in file specified with -2 "
<< "than in file specified with -1" << endl;
throw 1;
}
assert_eq(resa.first, resb.first);
assert_eq(resa.second, resb.second);
return make_pair(resa.first, resa.second);
}
}
assert_leq(cur, srca_.size());
return make_pair(true, 0);
}
/**
* Fill Read with the sequence, quality and name for the next
* read in the list of read files. This function gets called by
* all the search threads, so we must handle synchronization.
*
* Returns pair<bool, int> where bool indicates whether we're
* completely done, and int indicates how many reads were read.
*/
pair<bool, int> CFilePatternSource::nextBatchImpl(
PerThreadReadBuf& pt,
bool batch_a)
{
bool done = false;
int nread = 0;
pt.setReadId(readCnt_);
while(true) { // loop that moves on to next file when needed
do {
pair<bool, int> ret = nextBatchFromFile(pt, batch_a);
done = ret.first;
nread = ret.second;
} while(!done && nread == 0); // not sure why this would happen
if(done && filecur_ < infiles_.size()) { // finished with this file
open();
resetForNextFile(); // reset state to handle a fresh file
filecur_++;
if(nread == 0) {
continue;
}
}
break;
}
assert_geq(nread, 0);
readCnt_ += nread;
return make_pair(done, nread);
}
pair<bool, int> CFilePatternSource::nextBatch(
PerThreadReadBuf& pt,
bool batch_a,
bool lock)
{
if(lock) {
// synchronization at this level because both reading and manipulation of
// current file pointer have to be protected
ThreadSafe ts(&mutex);
return nextBatchImpl(pt, batch_a);
} else {
return nextBatchImpl(pt, batch_a);
}
}
/**
* Open the next file in the list of input files.
*/
void CFilePatternSource::open() {
if(is_open_) {
is_open_ = false;
if (compressed_) {
gzclose(zfp_);
zfp_ = NULL;
}
else if (fp_ != stdin) {
fclose(fp_);
fp_ = NULL;
}
if(qfp_ != NULL && qfp_ != stdin) {
fclose(qfp_);
qfp_ = NULL;
}
}
while(filecur_ < infiles_.size()) {
// Open read
if(infiles_[filecur_] == "-") {
compressed_ = true;
int fn = dup(fileno(stdin));
zfp_ = gzdopen(fn, "rb");
}
else {
compressed_ = false;
if (is_gzipped_file(infiles_[filecur_])) {
compressed_ = true;
zfp_ = gzopen(infiles_[filecur_].c_str(), "rb");
}
else {
fp_ = fopen(infiles_[filecur_].c_str(), "rb");
}
if ((compressed_ && zfp_ == NULL) || (!compressed_ && fp_ == NULL)) {
if(!errs_[filecur_]) {
cerr << "Warning: Could not open read file \""
<< infiles_[filecur_] << "\" for reading; skipping..."
<< endl;
errs_[filecur_] = true;
}
filecur_++;
continue;
}
}
is_open_ = true;
if (compressed_) {
#if ZLIB_VERNUM < 0x1235
cerr << "Warning: gzbuffer added in zlib v1.2.3.5. Unable to change "
"buffer size from default of 8192." << endl;
#else
gzbuffer(zfp_, 64*1024);
#endif
}
else {
setvbuf(fp_, buf_, _IOFBF, 64*1024);
}
if(!qinfiles_.empty()) {
if(qinfiles_[filecur_] == "-") {
qfp_ = stdin;
} else if((qfp_ = fopen(qinfiles_[filecur_].c_str(), "rb")) == NULL) {
if(!errs_[filecur_]) {
cerr << "Warning: Could not open quality file \""
<< qinfiles_[filecur_] << "\" for reading; skipping..."
<< endl;
errs_[filecur_] = true;
}
filecur_++;
continue;
}
assert(qfp_ != NULL);
setvbuf(qfp_, qbuf_, _IOFBF, 64*1024);
}
return;
}
throw 1;
}
/**
* Constructor for vector pattern source, used when the user has
* specified the input strings on the command line using the -c
* option.
*/
VectorPatternSource::VectorPatternSource(
const vector<string>& seqs,
bool color,
const char *dumpfile,
int trim3,
int trim5) :
TrimmingPatternSource(dumpfile, trim3, trim5),
color_(color),
cur_(0),
paired_(false),
tokbuf_(),
bufs_()
{
// Install sequences in buffers, ready for immediate copying in
// nextBatch(). Formatting of the buffer is just like
// TabbedPatternSource.
const size_t seqslen = seqs.size();
for(size_t i = 0; i < seqslen; i++) {
tokbuf_.clear();
tokenize(seqs[i], ":", tokbuf_, 2);
assert_gt(tokbuf_.size(), 0);
assert_leq(tokbuf_.size(), 2);
// Get another buffer ready
bufs_.resize(bufs_.size()+1);
bufs_.back().clear();
// Install name
itoa10<TReadId>(static_cast<TReadId>(i), nametmp_);
bufs_.back() = nametmp_;
bufs_.back().push_back('\t');
// Install sequence
bufs_.back().append(tokbuf_[0].c_str());
bufs_.back().push_back('\t');
// Install qualities
if(tokbuf_.size() > 1) {
bufs_.back().append(tokbuf_[1].c_str());
} else {
const size_t len = tokbuf_[0].length();
for(size_t i = 0; i < len; i++) {
bufs_.back().push_back('I');
}
}
}
}
/**
* Read next batch. However, batch concept is not very applicable for this
* PatternSource where all the info has already been parsed into the fields
* in the contsructor. This essentially modifies the pt as though we read
* in some number of patterns.
*/
pair<bool, int> VectorPatternSource::nextBatchImpl(
PerThreadReadBuf& pt,
bool batch_a)
{
pt.setReadId(cur_);
vector<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
size_t readi = 0;
for(; readi < pt.max_buf_ && cur_ < bufs_.size(); readi++, cur_++) {
const size_t len = bufs_[cur_].length();
memcpy(readbuf[readi].readOrigBuf, bufs_[cur_].c_str(), len);
readbuf[readi].readOrigBufLen = len;
}
readCnt_ += readi;
return make_pair(cur_ == bufs_.size(), readi);
}
pair<bool, int> VectorPatternSource::nextBatch(
PerThreadReadBuf& pt,
bool batch_a,
bool lock)
{
if(lock) {
ThreadSafe ts(&mutex);
return nextBatchImpl(pt, batch_a);
} else {
return nextBatchImpl(pt, batch_a);
}
}
/**
* Finishes parsing outside the critical section.
*/
bool VectorPatternSource::parse(Read& ra, Read& rb, TReadId rdid) const {
// Very similar to TabbedPatternSource
// Light parser (nextBatchFromFile) puts unparsed data
// into Read& r, even when the read is paired.
assert(ra.empty());
assert_gt(ra.readOrigBufLen, 0); // raw data for read/pair is here
int c = '\t';
size_t cur = 0;
const size_t buflen = ra.readOrigBufLen;
// Loop over the two ends
for(int endi = 0; endi < 2 && c == '\t'; endi++) {
Read& r = ((endi == 0) ? ra : rb);
assert_eq(0, seqan::length(r.name));
// Parse name if (a) this is the first end, or
// (b) this is tab6
size_t nameoff = 0;
if(endi < 1 || paired_) {
// Parse read name
c = ra.readOrigBuf[cur++];
while(c != '\t' && cur < buflen) {
r.nameBuf[nameoff++] = c;
c = ra.readOrigBuf[cur++];
}
assert_eq('\t', c);
if(cur >= buflen) {
return false; // record ended prematurely
}
} else if(endi > 0) {
// if this is the second end and we're parsing
// tab5, copy name from first end
rb.name = ra.name;
}
r.nameBuf[nameoff] = '\0';
_setBegin(r.name, r.nameBuf);
_setLength(r.name, nameoff);
// Parse sequence
assert_eq(0, seqan::length(r.patFw));
c = ra.readOrigBuf[cur++];
int nchar = 0, seqoff = 0;
if(color_ && asc2dnacat[c] > 0) {
// First char is a DNA char (primer)
if(asc2colcat[toupper(r.readOrigBuf[cur++])] <= 0) {
// 2nd char isn't a color, so don't assume 'c' is primer
cur -= 2;
} else {
// 'c' is primer
r.primer = c;
}
c = r.readOrigBuf[cur++];
}
if(color_) {
while(c != '\t' && cur < buflen) {
if(c >= '0' && c < '4') {
c = "ACGTN"[(int)c - '0'];
}
if(c == '.') {
c = 'N';
}
if(isalpha(c)) {
assert_in(toupper(c), "ACGTN");
if(nchar++ >= this->trim5_) {
assert_neq(0, asc2dnacat[c]);
r.patBufFw[seqoff++] = charToDna5[c]; // ascii to int
}
}
c = ra.readOrigBuf[cur++];
}
ra.color = true;
} else {
while(c != '\t' && cur < buflen) {
if(isalpha(c)) {
assert_in(toupper(c), "ACGTN");
if(nchar++ >= this->trim5_) {
assert_neq(0, asc2dnacat[c]);
r.patBufFw[seqoff++] = charToDna5[c]; // ascii to int
}
}
c = ra.readOrigBuf[cur++];
}
}
assert_eq('\t', c);
if(cur >= buflen) {
return false; // record ended prematurely
}
r.patBufFw[seqoff] = '\0';
_setBegin(r.patFw, (Dna5*)r.patBufFw);
// record amt trimmed from 5' end due to --trim5
r.trimmed5 = (int)(nchar - seqoff);
// record amt trimmed from 3' end due to --trim3
int trim3 = (seqoff < this->trim3_) ? seqoff : this->trim3_;
_setLength(r.patFw, seqoff - trim3);
r.trimmed3 = trim3;
// Parse qualities
assert_eq(0, seqan::length(r.qual));
c = ra.readOrigBuf[cur++];
int nqual = 0;
size_t qualoff = 0;
while(c != '\t' && c != '\n' && c != '\r') {
if(c == ' ') {
wrongQualityFormat(r.name);
return false;
}
char cadd = charToPhred33(c, false, false);
if(++nqual > this->trim5_) {
r.qualBuf[qualoff++] = cadd;
}
if(cur >= buflen) break;
c = ra.readOrigBuf[cur++];
}
if(nchar > nqual) {
tooFewQualities(r.name);
return false;
} else if(nqual > nchar) {
tooManyQualities(r.name);
return false;
}
r.qualBuf[seqoff] = '\0';
_setBegin(r.qual, r.qualBuf);
_setLength(r.qual, seqan::length(r.patFw));
assert(c == '\t' || c == '\n' || c == '\r' || cur >= buflen);
}
ra.parsed = true;
if(!rb.parsed && rb.readOrigBufLen > 0) {
return parse(rb, ra, rdid);
}
return true;
}
/**
* Light-parse a FASTA batch into the given buffer.
*/
pair<bool, int> FastaPatternSource::nextBatchFromFile(
PerThreadReadBuf& pt,
bool batch_a)
{
int c;
vector<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
if(first_) {
c = getc_wrapper();
if(c == EOF) {
return make_pair(true, 0);
}
while(c == '\r' || c == '\n') {
c = getc_wrapper();
}
if(c != '>') {
cerr << "Error: reads file does not look like a FASTA file" << endl;
throw 1;
}
first_ = false;
}
bool done = false;
size_t readi = 0;
// Read until we run out of input or until we've filled the buffer
for(; readi < pt.max_buf_ && !done; readi++) {
readbuf[readi].readOrigBuf[0] = '>';
size_t bufoff = 1;
while(true) {
c = getc_wrapper();
if(c < 0 || c == '>') {
done = c < 0;
break;
}
readbuf[readi].readOrigBuf[bufoff++] = c;
}
readbuf[readi].readOrigBufLen = bufoff;
}
// Immediate EOF case
if(done && readbuf[readi-1].readOrigBufLen == 1) {
readi--;
}
return make_pair(done, readi);
}
/**
* Finalize FASTA parsing outside critical section.
*/
bool FastaPatternSource::parse(Read& r, Read& rb, TReadId rdid) const {
// We assume the light parser has put the raw data for the separate ends
// into separate Read objects. That doesn't have to be the case, but
// that's how we've chosen to do it for FastqPatternSource
assert_gt(r.readOrigBufLen, 0);
assert(r.empty());
int c = -1;
size_t cur = 1;
const size_t buflen = r.readOrigBufLen;
// Parse read name
assert_eq(0, seqan::length(r.name));
int nameoff = 0;
while(cur < buflen) {
c = r.readOrigBuf[cur++];
if(c == '\n' || c == '\r') {
do {
c = r.readOrigBuf[cur++];
} while((c == '\n' || c == '\r') && cur < buflen);
break;
}
r.nameBuf[nameoff++] = c;
}
if(cur >= buflen) {
return false; // FASTA ended prematurely
}
if(nameoff > 0) {
r.nameBuf[nameoff] = '\0';
_setBegin(r.name, r.nameBuf);
_setLength(r.name, nameoff);
}
// Parse sequence
int nchar = 0, seqoff = 0;
assert_eq(0, seqan::length(r.patFw));
assert(c != '\n' && c != '\r');
assert_lt(cur, buflen);
if(color_ && asc2dnacat[c] > 0) {
// First char is a DNA char (primer)
if(asc2colcat[toupper(r.readOrigBuf[cur++])] <= 0) {
// 2nd char isn't a color, so don't assume 'c' is primer
cur -= 2;
} else {
// 'c' is primer
r.primer = c;
}
c = r.readOrigBuf[cur++];
}
if(color_) {
while(c != '\n' && cur < buflen) {
if(c >= '0' && c < '4') {
c = "ACGTN"[(int)c - '0'];
}
if(c == '.') {
c = 'N';
}
if(isalpha(c)) {
assert_in(toupper(c), "ACGTN");
if(nchar++ >= this->trim5_) {
assert_neq(0, asc2dnacat[c]);
r.patBufFw[seqoff++] = charToDna5[c]; // ascii to int
}
}
c = r.readOrigBuf[cur++];
}
r.color = true;
} else {
while(c != '\n' && cur < buflen) {
if(c == '.') {
c = 'N';
}
if(isalpha(c)) {
// If it's past the 5'-end trim point
if(nchar++ >= this->trim5_) {
r.patBufFw[seqoff++] = charToDna5[c];
}
}
assert_lt(cur, buflen);
c = r.readOrigBuf[cur++];
}
}
r.patBufFw[seqoff] = '\0';
_setBegin(r.patFw, (Dna5*)r.patBufFw);
// record amt trimmed from 5' end due to --trim5
r.trimmed5 = (int)(nchar - seqoff);
// record amt trimmed from 3' end due to --trim3
int trim3 = (seqoff < this->trim3_) ? seqoff : this->trim3_;
_setLength(r.patFw, seqoff - trim3);
r.trimmed3 = trim3;
for(size_t i = 0; i < seqoff - trim3; i++) {
r.qualBuf[i] = 'I';
}
r.qualBuf[seqoff - trim3] = '\0';
_setBegin(r.qual, r.qualBuf);
_setLength(r.qual, seqoff - trim3);
// Set up a default name if one hasn't been set
if(nameoff == 0) {
itoa10<TReadId>(static_cast<TReadId>(rdid), r.nameBuf);
_setBegin(r.name, r.nameBuf);
_setLength(r.name, strlen(r.nameBuf));
}
r.parsed = true;
if(!rb.parsed && rb.readOrigBufLen > 0) {
return parse(rb, r, rdid);
}
return true;
}
/**
* Light-parse a FASTA-continuous batch into the given buffer.
* This is trickier for FASTA-continuous than for other formats,
* for several reasons:
*
* 1. Reads are substrings of a longer FASTA input string
* 2. Reads may overlap w/r/t the longer FASTA string
* 3. Read names depend on the most recently observed FASTA
* record name
*/
pair<bool, int> FastaContinuousPatternSource::nextBatchFromFile(
PerThreadReadBuf& pt,
bool batch_a)
{
int c = -1;
vector<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
size_t readi = 0;
int nameoff = 0;
while(readi < pt.max_buf_) {
c = getc_wrapper();
if(c < 0) {
break;
}
if(c == '>') {
resetForNextFile();
nameoff = 0;
c = getc_wrapper();
bool sawSpace = false;
while(c != '\n' && c != '\r') {
if(!sawSpace) {
sawSpace = isspace(c);
}
if(!sawSpace) {
// Put it in the name prefix buffer so we
// can re-use this prefix for all the reads
// that are substrings of this FASTA sequence
name_prefix_buf_[nameoff++] = c;
}
c = getc_wrapper();
}
while(c == '\n' || c == '\r') {
c = getc_wrapper();
}
if(c < 0) {
break;
}
name_prefix_buf_[nameoff++] = '_';
}
int cat = asc2dnacat[c];
if(cat >= 2) c = 'N';
if(cat == 0) {
// Non-DNA, non-IUPAC char; skip
continue;
} else {
// DNA char
buf_[bufCur_++] = c;
if(bufCur_ == 1024) {
bufCur_ = 0; // wrap around circular buf
}
if(eat_ > 0) {
eat_--;
// Try to keep readCnt_ aligned with the offset
// into the reference; that lets us see where
// the sampling gaps are by looking at the read
// name
if(!beginning_) {
readCnt_++;
}
continue;
}
// install name
memcpy(readbuf[readi].readOrigBuf, name_prefix_buf_, nameoff);
itoa10<TReadId>(readCnt_ - subReadCnt_, name_int_buf_);
size_t bufoff = nameoff;
memcpy(readbuf[readi].readOrigBuf + bufoff, name_int_buf_, strlen(name_int_buf_));
bufoff += strlen(name_int_buf_);
readbuf[readi].readOrigBuf[bufoff++] = '\t';
// install sequence
for(size_t i = 0; i < length_; i++) {
if(length_ - i <= bufCur_) {
c = buf_[bufCur_ - (length_ - i)];
} else {
// Rotate
c = buf_[bufCur_ - (length_ - i) + 1024];
}
readbuf[readi].readOrigBuf[bufoff++] = c;
}
readbuf[readi].readOrigBufLen = bufoff;
eat_ = freq_-1;
readCnt_++;
beginning_ = false;
readi++;
}
}
return make_pair(c < 0, readi);
}
/**
* Finalize FASTA-continuous parsing outside critical section.
*/
bool FastaContinuousPatternSource::parse(
Read& ra,
Read& rb,
TReadId rdid) const
{
// Light parser (nextBatchFromFile) puts unparsed data
// into Read& r, even when the read is paired.
assert(ra.empty());
assert(rb.empty());
assert_gt(ra.readOrigBufLen, 0); // raw data for read/pair is here
assert_eq(0, rb.readOrigBufLen);
int c = '\t';
size_t cur = 0;
const size_t buflen = ra.readOrigBufLen;
// Parse read name
assert_eq(0, seqan::length(ra.name));
int nameoff = 0;
c = ra.readOrigBuf[cur++];
while(c != '\t' && cur < buflen) {
ra.nameBuf[nameoff++] = c;
c = ra.readOrigBuf[cur++];
}
assert_eq('\t', c);
if(cur >= buflen) {
return false; // record ended prematurely
}
ra.nameBuf[nameoff] = '\0';
_setBegin(ra.name, ra.nameBuf);
_setLength(ra.name, nameoff);
// Parse sequence
assert_eq(0, seqan::length(ra.patFw));
c = ra.readOrigBuf[cur++];
int nchar = 0, seqoff = 0;
while(cur < buflen) {
if(isalpha(c)) {
assert_in(toupper(c), "ACGTN");
if(nchar++ >= this->trim5_) {
assert_neq(0, asc2dnacat[c]);
ra.patBufFw[seqoff++] = charToDna5[c]; // ascii to int
}
}
c = ra.readOrigBuf[cur++];
}
ra.patBufFw[seqoff] = '\0';
_setBegin(ra.patFw, (Dna5*)ra.patBufFw);
// record amt trimmed from 5' end due to --trim5
ra.trimmed5 = (int)(nchar - seqoff);
// record amt trimmed from 3' end due to --trim3
int trim3 = (seqoff < this->trim3_) ? seqoff : this->trim3_;
_setLength(ra.patFw, seqoff - trim3);
ra.trimmed3 = trim3;
// Make fake qualities
assert_eq(0, seqan::length(ra.qual));
int qualoff = 0;
for(size_t i = 0; i < seqoff; i++) {
ra.qualBuf[qualoff++] = 'I';
}
ra.qualBuf[qualoff] = '\0';
_setBegin(ra.qual, ra.qualBuf);
_setLength(ra.qual, qualoff);
ra.parsed = true;
return true;
}
/**
* "Light" parser. This is inside the critical section, so the key is to do
* just enough parsing so that another function downstream (finalize()) can do
* the rest of the parsing. Really this function's only job is to stick every
* for lines worth of the input file into a buffer (r.readOrigBuf). finalize()
* then parses the contents of r.readOrigBuf later.
*/
pair<bool, int> FastqPatternSource::nextBatchFromFile(
PerThreadReadBuf& pt,
bool batch_a)
{
int c = 0;
vector<Read>* readBuf = batch_a ? &pt.bufa_ : &pt.bufb_;
if(first_) {
c = getc_wrapper();
while(c == '\r' || c == '\n') {
c = getc_wrapper();
}
if(c != '@') {
cerr << "Error: reads file does not look like a FASTQ file" << endl;
throw 1;
}
first_ = false;
(*readBuf)[0].readOrigBuf[0] = c;
(*readBuf)[0].readOrigBufLen = 1;
}
bool done = false, aborted = false;
size_t readi = 0;
// Read until we run out of input or until we've filled the buffer
while (readi < pt.max_buf_ && !done) {
char* buf = (*readBuf)[readi].readOrigBuf;
assert(readi == 0 || (*readBuf)[readi].readOrigBufLen == 0);
int newlines = 4;
while(newlines) {
c = getc_wrapper();
done = c < 0;
if(c == '\n' || (done && newlines == 1)) {
// Saw newline, or EOF that we're
// interpreting as final newline
newlines--;
c = '\n';
} else if(done) {
if (newlines == 4) {
newlines = 0;
} else {
aborted = true; // Unexpected EOF
}
break;
}
buf[(*readBuf)[readi].readOrigBufLen++] = c;
}
if (c > 0) {
if (interleaved_) {
// alternate between read buffers
batch_a = !batch_a;
readBuf = batch_a ? &pt.bufa_ : &pt.bufb_;
// increment read counter after each pair gets read
readi = batch_a ? readi + 1 : readi;
}
else {
readi++;
}
}
}
if(aborted) {
readi--;
}
return make_pair(done, readi);
}
/**
* Finalize FASTQ parsing outside critical section.
*/
bool FastqPatternSource::parse(Read &r, Read& rb, TReadId rdid) const {
// We assume the light parser has put the raw data for the separate ends
// into separate Read objects. That doesn't have to be the case, but
// that's how we've chosen to do it for FastqPatternSource
assert_gt(r.readOrigBufLen, 0);
assert(r.empty());
int c;
size_t cur = 1;
const size_t buflen = r.readOrigBufLen;
// Parse read name
assert_eq(0, seqan::length(r.name));
int nameoff = 0;
while(true) {
assert_lt(cur, buflen);
c = r.readOrigBuf[cur++];
if(c == '\n' || c == '\r') {
do {
c = r.readOrigBuf[cur++];
} while(c == '\n' || c == '\r');
break;
}
r.nameBuf[nameoff++] = c;
}
r.nameBuf[nameoff] = '\0';
_setBegin(r.name, r.nameBuf);
_setLength(r.name, nameoff);
// Parse sequence
int nchar = 0, seqoff = 0;
assert_eq(0, seqan::length(r.patFw));
if(color_ && asc2dnacat[c] > 0) {
// First char is a DNA char (primer)
if(asc2colcat[toupper(r.readOrigBuf[cur++])] <= 0) {
// 2nd char isn't a color, so don't assume 'c' is primer
cur -= 2;
} else {
// 'c' is primer
r.primer = c;
}
c = r.readOrigBuf[cur++];
}
if(color_) {
while(c != '+' && cur < buflen) {
if(c >= '0' && c < '4') {
c = "ACGTN"[(int)c - '0'];
}
if(c == '.') {
c = 'N';
}
if(isalpha(c)) {
assert_in(toupper(c), "ACGTN");
if(nchar++ >= this->trim5_) {
assert_neq(0, asc2dnacat[c]);
r.patBufFw[seqoff++] = charToDna5[c]; // ascii to int
}
}
c = r.readOrigBuf[cur++];
}
r.color = true;
} else {
while(c != '+' && cur < buflen) {
if(c == '.') {
c = 'N';
}
if(isalpha(c)) {
// If it's past the 5'-end trim point
if(nchar++ >= this->trim5_) {
r.patBufFw[seqoff++] = charToDna5[c];
}
}
assert_lt(cur, buflen);
c = r.readOrigBuf[cur++];
}
}
_setBegin(r.patFw, (Dna5*)r.patBufFw);
// record amt trimmed from 5' end due to --trim5
r.trimmed5 = (int)(nchar - seqoff);
// record amt trimmed from 3' end due to --trim3
int trim3 = (seqoff < this->trim3_) ? seqoff : this->trim3_;
_setLength(r.patFw, seqoff - trim3);
r.patBufFw[seqan::length(r.patFw)] = '\0';
r.trimmed3 = trim3;
assert_eq('+', c);
do {
assert_lt(cur, buflen);
c = r.readOrigBuf[cur++];
} while(c != '\n' && c != '\r');
while(cur < buflen && (c == '\n' || c == '\r')) {
c = r.readOrigBuf[cur++];
}
assert_eq(0, seqan::length(r.qual));
int nqual = 0, qualoff = 0;
if (intQuals_) {
int cur_int = 0;
while(c != '\t' && c != '\n' && c != '\r') {
cur_int *= 10;
cur_int += (int)(c - '0');
c = r.readOrigBuf[cur++];
if(c == ' ' || c == '\t' || c == '\n' || c == '\r') {
char cadd = intToPhred33(cur_int, solQuals_);
cur_int = 0;
if (c == ' ')
c = r.readOrigBuf[cur++];
assert_geq(cadd, 33);
if(++nqual > this->trim5_) {
r.qualBuf[qualoff++] = cadd;
}
}
}
} else {
c = charToPhred33(c, solQuals_, phred64Quals_);
if(nqual++ >= r.trimmed5) {
r.qualBuf[qualoff++] = c;
}
while(cur < buflen) {
c = r.readOrigBuf[cur++];
if (c == ' ') {
wrongQualityFormat(r.name);
return false;
}
if(c == '\r' || c == '\n') {
break;
}
c = charToPhred33(c, solQuals_, phred64Quals_);
if(nqual++ >= r.trimmed5) {
r.qualBuf[qualoff++] = c;
}
}
if(qualoff < seqoff) {
tooFewQualities(r.name);
return false;
} else if(qualoff > seqoff) {
tooManyQualities(r.name);
return false;
}
}
r.qualBuf[seqan::length(r.patFw)] = '\0';
_setBegin(r.qual, r.qualBuf);
_setLength(r.qual, seqan::length(r.patFw));
// Set up a default name if one hasn't been set
if(seqan::length(r.name) == 0) {
itoa10<TReadId>(static_cast<TReadId>(readCnt_), r.nameBuf);
_setBegin(r.name, r.nameBuf);
_setLength(r.name, nameoff);
}
r.parsed = true;
if(!rb.parsed && rb.readOrigBufLen > 0) {
return parse(rb, r, rdid);
}
return true;
}
/**
* Light-parse a batch of tabbed-format reads into given buffer.
*/
pair<bool, int> TabbedPatternSource::nextBatchFromFile(
PerThreadReadBuf& pt,
bool batch_a)
{
int c = getc_wrapper();
while(c >= 0 && (c == '\n' || c == '\r')) {
c = getc_wrapper();
}
vector<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
size_t readi = 0;
// Read until we run out of input or until we've filled the buffer
for(; readi < pt.max_buf_ && c >= 0; readi++) {
readbuf[readi].readOrigBufLen = 0;
while(c >= 0 && c != '\n' && c != '\r') {
readbuf[readi].readOrigBuf[readbuf[readi].readOrigBufLen++] = c;
c = getc_wrapper();
}
while(c >= 0 && (c == '\n' || c == '\r') && readi < pt.max_buf_ - 1) {
c = getc_wrapper();
}
}
return make_pair(c < 0, readi);
}
/**
* Finalize tabbed parsing outside critical section.
*/
bool TabbedPatternSource::parse(Read& ra, Read& rb, TReadId rdid) const {
// Light parser (nextBatchFromFile) puts unparsed data
// into Read& r, even when the read is paired.
assert(ra.empty());
assert(rb.empty());
assert_gt(ra.readOrigBufLen, 0); // raw data for read/pair is here
assert_eq(0, rb.readOrigBufLen);
int c = '\t';
size_t cur = 0;
const size_t buflen = ra.readOrigBufLen;
bool paired = false;
// Loop over the two ends
for(int endi = 0; endi < 2 && c == '\t'; endi++) {
Read& r = ((endi == 0) ? ra : rb);
assert_eq(0, seqan::length(r.name));
// Parse name if (a) this is the first end, or
// (b) this is tab6
int nameoff = 0;
if(endi < 1 || secondName_) {
// Parse read name
c = ra.readOrigBuf[cur++];
while(c != '\t' && cur < buflen) {
r.name[nameoff++] = c;
c = ra.readOrigBuf[cur++];
}
assert_eq('\t', c);
if(cur >= buflen) {
return false; // record ended prematurely
}
r.nameBuf[nameoff] = '\0';
_setBegin(r.name, r.nameBuf);
_setLength(r.name, nameoff);
} else if(endi > 0) {
// if this is the second end and we're parsing
// tab5, copy name from first end
rb.name = ra.name; // not a deep copy
}
paired = endi > 0;
// Parse sequence
assert_eq(0, seqan::length(r.patFw));
c = ra.readOrigBuf[cur++];
int nchar = 0, seqoff = 0;
if(color_ && asc2dnacat[c] > 0) {
// First char is a DNA char (primer)
if(asc2colcat[toupper(r.readOrigBuf[cur++])] <= 0) {
// 2nd char isn't a color, so don't assume 'c' is primer
cur -= 2;
} else {
// 'c' is primer
r.primer = c;
}
c = r.readOrigBuf[cur++];
}
if(color_) {
while(c != '\t' && cur < buflen) {
if(c >= '0' && c < '4') {
c = "ACGTN"[(int)c - '0'];
}
if(c == '.') {
c = 'N';
}
if(isalpha(c)) {
assert_in(toupper(c), "ACGTN");
if(nchar++ >= this->trim5_) {
assert_neq(0, asc2dnacat[c]);
r.patBufFw[seqoff++] = charToDna5[c]; // ascii to int
}
}
c = r.readOrigBuf[cur++];
}
r.color = true;
} else {
while(c != '\t' && cur < buflen) {
if(isalpha(c)) {
assert_in(toupper(c), "ACGTN");
if(nchar++ >= this->trim5_) {
assert_neq(0, asc2dnacat[c]);
r.patBufFw[seqoff++] = charToDna5[c];
}
}
c = ra.readOrigBuf[cur++];
}
}
assert_eq('\t', c);
if(cur >= buflen) {
return false; // record ended prematurely
}
_setBegin(r.patFw, (Dna5*)r.patBufFw);
// record amt trimmed from 5' end due to --trim5
r.trimmed5 = (int)(nchar - seqoff);
// record amt trimmed from 3' end due to --trim3
int trim3 = (seqoff < this->trim3_) ? seqoff : this->trim3_;
_setLength(r.patFw, seqoff - trim3);
r.patBufFw[seqan::length(r.patFw)] = '\0';
r.trimmed3 = trim3;
// Parse qualities
assert_eq(0, seqan::length(r.qual));
c = ra.readOrigBuf[cur++];
int nqual = 0, qualoff = 0;
if (intQuals_) {
int cur_int = 0;
while(c != '\t' && c != '\n' && c != '\r' && cur < buflen) {
cur_int *= 10;
cur_int += (int)(c - '0');
c = ra.readOrigBuf[cur++];
if(c == ' ' || c == '\t' || c == '\n' || c == '\r') {
char cadd = intToPhred33(cur_int, solQuals_);
cur_int = 0;
assert_geq(cadd, 33);
if(++nqual > this->trim5_) {
r.qualBuf[qualoff++] = cadd;
}
}
}
} else {
while(c != '\t' && c != '\n' && c != '\r') {
if(c == ' ') {
wrongQualityFormat(r.name);
return false;
}
char cadd = charToPhred33(c, solQuals_, phred64Quals_);
if(++nqual > this->trim5_) {
r.qualBuf[qualoff++] = cadd;
}
if(cur >= buflen) break;
c = ra.readOrigBuf[cur++];
}
}
if(nchar > nqual) {
tooFewQualities(r.name);
return false;
} else if(nqual > nchar) {
tooManyQualities(r.name);
return false;
}
r.qualBuf[seqan::length(r.patFw)] = '\0';
_setBegin(r.qual, r.qualBuf);
_setLength(r.qual, seqan::length(r.patFw));
assert(c == '\t' || c == '\n' || c == '\r' || cur >= buflen);
}
ra.parsed = true;
rb.parsed = paired;
return true;
}
/**
* Light-parse a batch of raw-format reads into given buffer.
*/
pair<bool, int> RawPatternSource::nextBatchFromFile(
PerThreadReadBuf& pt,
bool batch_a)
{
int c = getc_wrapper();
while(c >= 0 && (c == '\n' || c == '\r')) {
c = getc_wrapper();
}
vector<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
size_t readi = 0;
// Read until we run out of input or until we've filled the buffer
for(; readi < pt.max_buf_ && c >= 0; readi++) {
readbuf[readi].readOrigBufLen = 0;
while(c >= 0 && c != '\n' && c != '\r') {
readbuf[readi].readOrigBuf[readbuf[readi].readOrigBufLen++] = c;
c = getc_wrapper();
}
while(c >= 0 && (c == '\n' || c == '\r')) {
c = getc_wrapper();
}
}
return make_pair(c < 0, readi);
}
/**
* Finalize raw parsing outside critical section.
*/
bool RawPatternSource::parse(Read& r, Read& rb, TReadId rdid) const {
assert(r.empty());
assert_gt(r.readOrigBufLen, 0);
size_t cur = 0;
const size_t buflen = r.readOrigBufLen;
// Parse sequence
assert_eq(0, seqan::length(r.patFw));
int nchar = 0, seqoff = 0;
int c = r.readOrigBuf[cur++];
if(color_ && asc2dnacat[c] > 0) {
// First char is a DNA char (primer)
if(asc2colcat[toupper(r.readOrigBuf[cur++])] <= 0) {
// 2nd char isn't a color, so don't assume 'c' is primer
cur -= 2;
} else {
// 'c' is primer
r.primer = c;
}
c = r.readOrigBuf[cur++];
}
if(color_) {
while(c != '\0') {
assert(c != '\r' && c != '\n');
if(c >= '0' && c < '4') {
c = "ACGTN"[(int)c - '0'];
}
if(c == '.') {
c = 'N';
}
if(isalpha(c)) {
assert_in(toupper(c), "ACGTN");
if(nchar++ >= this->trim5_) {
assert_neq(0, asc2dnacat[c]);
r.patBufFw[seqoff++] = charToDna5[c]; // ascii to int
}
}
c = r.readOrigBuf[cur++];
}
r.color = true;
} else {
cur--;
while(cur < buflen) {
c = r.readOrigBuf[cur++];
assert(c != '\r' && c != '\n');
if(isalpha(c)) {
assert_in(toupper(c), "ACGTN");
if(nchar++ >= this->trim5_) {
assert_neq(0, asc2dnacat[c]);
r.patBufFw[seqoff++] = charToDna5[c];
}
}
}
}
_setBegin(r.patFw, (Dna5*)r.patBufFw);
// record amt trimmed from 5' end due to --trim5
r.trimmed5 = (int)(nchar - seqoff);
// record amt trimmed from 3' end due to --trim3
int trim3 = (seqoff < this->trim3_) ? seqoff : this->trim3_;
_setLength(r.patFw, seqoff - trim3);
r.patBufFw[seqan::length(r.patFw)] = '\0';
r.trimmed3 = trim3;
// Give the name field a dummy value
itoa10<TReadId>(rdid, r.nameBuf);
_setBegin(r.name, r.nameBuf);
_setLength(r.name, strlen(r.nameBuf));
// Give the base qualities dummy values
assert_eq(0, seqan::length(r.qual));
const size_t len = seqan::length(r.patFw);
for(size_t i = 0; i < len; i++) {
r.qualBuf[i] = 'I';
}
_setBegin(r.qual, r.qualBuf);
_setLength(r.qual, seqan::length(r.patFw));
r.parsed = true;
if(!rb.parsed && rb.readOrigBufLen > 0) {
return parse(rb, r, rdid);
}
return true;
}
void wrongQualityFormat(const String<char>& read_name) {
cerr << "Encountered a space parsing the quality string for read " << read_name << endl
<< "If this is a FASTQ file with integer (non-ASCII-encoded) qualities, please" << endl
<< "re-run Bowtie with the --integer-quals option." << endl;
throw 1;
}
void tooFewQualities(const String<char>& read_name) {
cerr << "Too few quality values for read: " << read_name << endl
<< "\tare you sure this is a FASTQ-int file?" << endl;
throw 1;
}
void tooManyQualities(const String<char>& read_name) {
cerr << "Reads file contained a pattern with more than 1024 quality values." << endl
<< "Please truncate reads and quality values and and re-run Bowtie" << endl;
throw 1;
}
void tooManySeqChars(const String<char>& read_name) {
cerr << "Reads file contained a pattern with more than 1024 sequence characters." << endl
<< "Please truncate reads and quality values and and re-run Bowtie." << endl
<< "Offending read: " << read_name << endl;
throw 1;
}