Permalink
299 lines (289 sloc) 8.46 KB
#include "ref_read.h"
/**
* Reads past the next ambiguous or unambiguous stretch of sequence
* from the given FASTA file and returns its length. Does not do
* anything with the sequence characters themselves; this is purely for
* measuring lengths.
*/
RefRecord fastaRefReadSize(FileBuf& in,
const RefReadInParams& rparms,
bool first,
BitpairOutFileBuf* bpout)
{
int c;
static int lastc = '>'; // last character seen
// RefRecord params
TIndexOffU len = 0; // 'len' counts toward total length
// 'off' counts number of ambiguous characters before first
// unambiguous character
size_t off = 0;
// Pick off the first carat and any preceding whitespace
if(first) {
assert(!in.eof());
lastc = '>';
c = in.getPastWhitespace();
if(in.eof()) {
// Got eof right away; emit warning
cerr << "Warning: Empty input file" << endl;
lastc = -1;
return RefRecord(0, 0, true);
}
assert(c == '>');
}
first = true;
// Skip to the end of the id line; if the next line is either
// another id line or a comment line, keep skipping
if(lastc == '>') {
// Skip to the end of the name line
do {
if((c = in.getPastNewline()) == -1) {
// No more input
cerr << "Warning: Encountered empty reference sequence" << endl;
lastc = -1;
return RefRecord(0, 0, true);
}
if(c == '>') {
cerr << "Warning: Encountered empty reference sequence" << endl;
}
// continue until a non-name, non-comment line
} while (c == '>');
} else {
first = false; // not the first in a sequence
off = 1; // The gap has already been consumed, so count it
if((c = in.get()) == -1) {
// Don't emit a warning, since this might legitimately be
// a gap on the end of the final sequence in the file
lastc = -1;
return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
}
}
// Now skip to the first DNA character, counting gap characters
// as we go
int lc = -1; // last-DNA char variable for color conversion
while(true) {
int cat = dna4Cat[c];
if(rparms.nsToAs && cat == 2) c = 'A';
if(cat == 1) {
// This is a DNA character
if(rparms.color) {
if(lc != -1) {
// Got two consecutive unambiguous DNAs
break; // to read-in loop
}
// Keep going; we need two consecutive unambiguous DNAs
lc = charToDna5[(int)c];
// The 'if(off > 0)' takes care of the case where
// the reference is entirely unambiguous and we don't
// want to incorrectly increment off.
if(off > 0) off++;
} else {
break; // to read-in loop
}
} else if(cat == 2) {
if(lc != -1 && off == 0) off++;
lc = -1;
off++; // skip over gap character and increment
} else if(c == '>') {
if(off > 0 && lastc == '>') {
cerr << "Warning: Encountered reference sequence with only gaps" << endl;
} else if(lastc == '>') {
cerr << "Warning: Encountered empty reference sequence" << endl;
}
lastc = '>';
return RefRecord((TIndexOffU)off, 0, first);
}
c = in.get();
if(c == -1) {
// End-of-file
if(off > 0 && lastc == '>') {
cerr << "Warning: Encountered reference sequence with only gaps" << endl;
} else if(lastc == '>') {
cerr << "Warning: Encountered empty reference sequence" << endl;
}
lastc = -1;
return RefRecord((TIndexOffU)off, 0, first);
}
}
assert(!rparms.color || (lc != -1));
assert_eq(1, dna4Cat[c]); // C must be unambiguous base
if(off > 0 && rparms.color && first) {
// Handle the case where the first record has ambiguous
// characters but we're in color space; one of those counts is
// spurious
off--;
}
// in now points just past the first character of a sequence
// line, and c holds the first character
while(c != -1 && c != '>') {
if(rparms.nsToAs && dna4Cat[c] == 2) c = 'A';
uint8_t cat = dna4Cat[c];
int cc = toupper(c);
if(rparms.bisulfite && cc == 'C') c = cc = 'T';
if(cat == 1) {
// It's a DNA character
assert(cc == 'A' || cc == 'C' || cc == 'G' || cc == 'T');
// Consume it
len++;
// Output it
if(bpout != NULL) {
if(rparms.color) {
// output color
bpout->write(dinuc2color[charToDna5[(int)c]][lc]);
} else if(!rparms.color) {
// output nucleotide
bpout->write(charToDna5[c]);
}
}
lc = charToDna5[(int)c];
} else if(cat == 2) {
// It's an N or a gap
lastc = c;
assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T');
return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
} else {
// Not DNA and not a gap, ignore it
#ifndef NDEBUG
if(!isspace(c)) {
cerr << "Unexpected character in sequence: ";
if(isprint(c)) {
cerr << ((char)c) << endl;
} else {
cerr << "(" << c << ")" << endl;
}
}
#endif
}
c = in.get();
}
lastc = c;
return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
}
static void
printRecords(ostream& os, const vector<RefRecord>& l) {
for(size_t i = 0; i < l.size(); i++) {
os << l[i].first << ", " << l[i].off << ", " << l[i].len << endl;
}
}
/**
* Reverse the 'src' list of RefRecords into the 'dst' list. Don't
* modify 'src'.
*/
void reverseRefRecords(const vector<RefRecord>& src,
vector<RefRecord>& dst,
bool recursive,
bool verbose)
{
dst.clear();
{
vector<RefRecord> cur;
for(int64_t i = (int64_t)src.size()-1; i >= 0; i--) {
bool first = (i == (int)src.size()-1 || src[i+1].first);
if(src[i].len) {
cur.push_back(RefRecord(0, src[i].len, first));
first = false;
}
if(src[i].off) cur.push_back(RefRecord(src[i].off, 0, first));
}
for(int64_t i = 0; i < (int64_t)cur.size(); i++) {
assert(cur[i].off == 0 || cur[i].len == 0);
if(i < (int64_t)cur.size()-1 && cur[i].off != 0 && !cur[i+1].first) {
dst.push_back(RefRecord(cur[i].off, cur[i+1].len, cur[i].first));
i++;
} else {
dst.push_back(cur[i]);
}
}
}
if(verbose) {
cout << "Source: " << endl;
printRecords(cout, src);
cout << "Dest: " << endl;
printRecords(cout, dst);
}
#ifndef NDEBUG
if(!recursive) {
vector<RefRecord> tmp;
reverseRefRecords(dst, tmp, true);
assert_eq(tmp.size(), src.size());
for(size_t i = 0; i < src.size(); i++) {
assert_eq(src[i].len, tmp[i].len);
assert_eq(src[i].off, tmp[i].off);
assert_eq(src[i].first, tmp[i].first);
}
}
#endif
}
/**
* Calculate a vector containing the sizes of all of the patterns in
* all of the given input files, in order. Returns the total size of
* all references combined. Rewinds each istream before returning.
*/
std::pair<size_t, size_t>
fastaRefReadSizes(vector<FileBuf*>& in,
vector<RefRecord>& recs,
vector<uint32_t>& plens,
const RefReadInParams& rparms,
BitpairOutFileBuf* bpout,
TIndexOff& numSeqs)
{
TIndexOffU unambigTot = 0;
size_t bothTot = 0;
assert_gt(in.size(), 0);
uint32_t both = 0, unambig = 0;
// For each input istream
for(size_t i = 0; i < in.size(); i++) {
bool first = true;
assert(!in[i]->eof());
// For each pattern in this istream
while(!in[i]->eof()) {
RefRecord rec = fastaRefReadSize(*in[i], rparms, first, bpout);
// Update plens
if(rec.first) {
if(unambig > 0) {
plens.push_back(both);
}
both = 0;
unambig = 0;
}
#ifndef ACCOUNT_FOR_ALL_GAP_REFS
if(rec.len == 0) rec.first = false;
#endif
if((unambigTot + rec.len) < unambigTot) {
#ifdef BOWTIE_64BIT_INDEX
cerr << "Error: Reference sequence has more than 2^32-1 characters! Please divide the" << endl
<< "reference into smaller chunks and index each independently." << endl;
#else
cerr << "Error: Reference sequence has more than 2^32-1 characters! Please try to" << endl
<< "build a large index instead using the appropiate options." << endl;
#endif
throw 1;
}
// Add the length of this record.
if(rec.first) numSeqs++;
unambigTot += rec.len; unambig += rec.len;
bothTot += rec.len; both += rec.len;
bothTot += rec.off; both += rec.off;
first = false;
if(rec.len == 0 && rec.off == 0 && !rec.first) continue;
recs.push_back(rec);
}
// Reset the input stream
in[i]->reset();
assert(!in[i]->eof());
#ifndef NDEBUG
// Check that it's really reset
int c = in[i]->get();
assert_eq('>', c);
in[i]->reset();
assert(!in[i]->eof());
#endif
}
assert_geq(bothTot, 0);
assert_geq(unambigTot, 0);
if(unambig > 0) {
plens.push_back(both);
}
return make_pair(
unambigTot, // total number of unambiguous DNA characters read
bothTot); // total number of DNA characters read, incl. ambiguous ones
}