Permalink
Fetching contributors…
Cannot retrieve contributors at this time
373 lines (352 sloc) 11.9 KB
#ifndef EBWT_SEARCH_UTIL_H_
#define EBWT_SEARCH_UTIL_H_
#include <iostream>
#include <vector>
#include <map>
#include <stdint.h>
#include <seqan/sequence.h>
#include "hit.h"
#include "qual.h"
/// Encapsulates a change made to a query base, i.e. "the 3rd base from
/// the 5' end was changed from an A to a T". Useful when using
/// for matching seeded by "seedlings".
struct QueryMutation {
QueryMutation() : pos(0), oldBase(0), newBase(0) { }
QueryMutation(uint16_t _pos, uint8_t _oldBase, uint8_t _newBase) :
pos(_pos), oldBase(_oldBase), newBase(_newBase)
{
assert_neq(oldBase, newBase);
assert_leq(oldBase, 4);
assert_lt(newBase, 4);
}
uint16_t pos;
uint8_t oldBase; // original base from the read
uint8_t newBase; // mutated to fit the reference in at least one place
};
/**
* Encapsulates a partial alignment. Supports up to 256 positions and
* up to 3 substitutions. The 'type' field of all the alternative
* structs tells us whether this entry is a singleton entry, an offset
* into the spillover list, a non-tail entry in the spillover list, or
* a tail entry in the spillover list.
*/
typedef union {
struct {
uint64_t pos0 : 16; // mismatched pos 1
uint64_t pos1 : 16; // mismatched pos 2
uint64_t pos2 : 16; // mismatched pos 3
uint64_t char0 : 2; // substituted char for pos 1
uint64_t char1 : 2; // substituted char for pos 2
uint64_t char2 : 2; // substituted char for pos 3
uint64_t reserved : 8;
uint64_t type : 2; // type of entry; 0=singleton_entry,
// 1=list_offset, 2=list_entry,
// 3=list_tail
} entry;
struct {
uint64_t off : 62;// offset into list
uint64_t type : 2; // type of entry; 0=singleton,
// 1=list_offset, 2=list_entry,
// 3=list_tail
} off; // offset into list
struct {
uint64_t unk : 62;// padding
uint64_t type : 2; // type of entry; 0=singleton,
// 1=list_offset, 2=list_entry,
// 3=list_tail
} unk; // unknown
struct {
uint64_t u64 : 64;
} u64;
/**
*
*/
bool repOk(uint32_t qualMax, uint32_t slen, const String<char>& quals, bool maqPenalty) {
uint32_t qual = 0;
assert_leq(slen, seqan::length(quals));
if(entry.pos0 != 0xffff) {
assert_lt(entry.pos0, slen);
qual += mmPenalty(maqPenalty, phredCharToPhredQual(quals[entry.pos0]));
}
if(entry.pos1 != 0xffff) {
assert_lt(entry.pos1, slen);
qual += mmPenalty(maqPenalty, phredCharToPhredQual(quals[entry.pos1]));
}
if(entry.pos2 != 0xffff) {
assert_lt(entry.pos2, slen);
qual += mmPenalty(maqPenalty, phredCharToPhredQual(quals[entry.pos2]));
}
assert_leq(qual, qualMax);
return true;
}
} PartialAlignment;
#ifndef NDEBUG
static bool sameHalfPartialAlignment(PartialAlignment pa1, PartialAlignment pa2) {
if(pa1.unk.type == 1 || pa2.unk.type == 1) return false;
assert_neq(0xffff, pa1.entry.pos0);
assert_neq(0xffff, pa2.entry.pos0);
// Make sure pa1's pos0 is represented in pa1
if(pa1.entry.pos0 == pa2.entry.pos0) {
if(pa1.entry.char0 != pa2.entry.char0) return false;
} else if(pa1.entry.pos0 == pa2.entry.pos1) {
if(pa1.entry.char0 != pa2.entry.char1) return false;
} else if(pa1.entry.pos0 == pa2.entry.pos2) {
if(pa1.entry.char0 != pa2.entry.char2) return false;
} else {
return false;
}
if(pa1.entry.pos1 != 0xffff) {
if (pa1.entry.pos1 == pa2.entry.pos0) {
if(pa1.entry.char1 != pa2.entry.char0) return false;
} else if(pa1.entry.pos1 == pa2.entry.pos1) {
if(pa1.entry.char1 != pa2.entry.char1) return false;
} else if(pa1.entry.pos1 == pa2.entry.pos2) {
if(pa1.entry.char1 != pa2.entry.char2) return false;
} else {
return false;
}
}
if(pa1.entry.pos2 != 0xffff) {
if (pa1.entry.pos2 == pa2.entry.pos0) {
if(pa1.entry.char2 != pa2.entry.char0) return false;
} else if(pa1.entry.pos2 == pa2.entry.pos1) {
if(pa1.entry.char2 != pa2.entry.char1) return false;
} else if(pa1.entry.pos2 == pa2.entry.pos2) {
if(pa1.entry.char2 != pa2.entry.char2) return false;
} else {
return false;
}
}
return true;
}
static bool samePartialAlignment(PartialAlignment pa1, PartialAlignment pa2) {
return sameHalfPartialAlignment(pa1, pa2) && sameHalfPartialAlignment(pa2, pa1);
}
static bool validPartialAlignment(PartialAlignment pa) {
if(pa.entry.pos0 != 0xffff) {
if(pa.entry.pos0 == pa.entry.pos1) return false;
if(pa.entry.pos0 == pa.entry.pos2) return false;
} else {
if(pa.entry.pos1 != 0xffff) return false;
if(pa.entry.pos2 != 0xffff) return false;
}
if(pa.entry.pos1 != 0xffff) {
if(pa.entry.pos1 == pa.entry.pos2) return false;
} else {
if(pa.entry.pos2 != 0xffff) return false;
}
return true;
}
#endif
extern
void printHit(const vector<String<Dna5> >& os,
const Hit& h,
const String<Dna5>& qry,
size_t qlen,
uint32_t unrevOff,
uint32_t oneRevOff,
uint32_t twoRevOff,
uint32_t threeRevOff,
bool ebwtFw);
/**
* A synchronized data structure for storing partial alignments
* associated with patids, with particular attention to compactness.
*/
class PartialAlignmentManager {
public:
PartialAlignmentManager(size_t listSz = 10 * 1024 * 1024) {
// Reserve space for 10M partialsList entries = 40 MB
_partialsList.reserve(listSz);
}
~PartialAlignmentManager() { }
/**
* Add a set of partial alignments for a particular patid into the
* partial-alignment database. This version locks the database,
* and so is safe to call if there are potential readers or
* writers currently running.
*/
void addPartials(uint32_t patid, const vector<PartialAlignment>& ps) {
if(ps.size() == 0) return;
ThreadSafe _ts(&mutex_m);
size_t origPlSz = _partialsList.size();
// Assert that the entry doesn't exist yet
assert(_partialsMap.find(patid) == _partialsMap.end());
if(ps.size() == 1) {
_partialsMap[patid] = ps[0];
_partialsMap[patid].entry.type = 0; // singleton
} else {
#ifndef NDEBUG
// Make sure there are not duplicate entries
for(size_t i = 0; i < ps.size()-1; i++)
for(size_t j = i+1; j < ps.size(); j++)
assert(!samePartialAlignment(ps[i], ps[j]));
#endif
// Insert a "pointer" record into the map that refers to
// the stretch of the _partialsList vector that contains
// the partial alignments.
PartialAlignment al;
al.u64.u64 = 0xffffffffffffffffllu;
al.off.off = origPlSz;
al.off.type = 1; // list offset
_partialsMap[patid] = al; // install pointer
assert_gt(ps.size(), 1);
// Now add all the non-tail partial alignments (all but the
// last) to the _partialsList
for(size_t i = 0; i < ps.size()-1; i++) {
assert(validPartialAlignment(ps[i]));
_partialsList.push_back(ps[i]);
// list entry (non-tail)
_partialsList.back().entry.type = 2;
}
// Now add the tail (last) partial alignment and mark it as
// such
assert(validPartialAlignment(ps.back()));
_partialsList.push_back(ps.back());
// list tail
_partialsList.back().entry.type = 3;
#ifndef NDEBUG
// Make sure there are not duplicate entries
assert_eq(_partialsList.size(), origPlSz + ps.size());
for(size_t i = origPlSz; i < _partialsList.size()-1; i++) {
for(size_t j = i+1; j < _partialsList.size(); j++) {
assert(!samePartialAlignment(_partialsList[i], _partialsList[j]));
}
}
#endif
}
// Assert that we added an entry
assert(_partialsMap.find(patid) != _partialsMap.end());
}
/**
* Get a set of partial alignments for a particular patid out of
* the partial-alignment database.
*/
void getPartials(uint32_t patid, vector<PartialAlignment>& ps) {
assert_eq(0, ps.size());
ThreadSafe _ts(&mutex_m);
getPartialsUnsync(patid, ps);
}
/**
* Get a set of partial alignments for a particular patid out of
* the partial-alignment database. This version does not attempt to
* lock the database. This is more efficient than the synchronized
* version, but is unsafe if there are other threads that may be
* writing to the database.
*/
void getPartialsUnsync(uint32_t patid, vector<PartialAlignment>& ps) {
assert_eq(0, ps.size());
if(_partialsMap.find(patid) == _partialsMap.end()) {
return;
}
PartialAlignment al;
al.u64.u64 = _partialsMap[patid].u64.u64;
uint32_t type = al.unk.type;
if(type == 0) {
// singleton
ps.push_back(al);
} else {
// list
assert_eq(1, type);
uint32_t off = (uint32_t)al.off.off;
do {
assert_lt(off, _partialsList.size());
ASSERT_ONLY(type = _partialsList[off].entry.type);
assert(type == 2 || type == 3);
#ifndef NDEBUG
// Make sure this entry isn't equal to any other entry
for(size_t i = 0; i < ps.size(); i++) {
assert(validPartialAlignment(ps[i]));
assert(!samePartialAlignment(ps[i], _partialsList[off]));
}
#endif
assert(validPartialAlignment(_partialsList[off]));
ps.push_back(_partialsList[off]);
ASSERT_ONLY(uint32_t pos0 = ps.back().entry.pos0);
ASSERT_ONLY(uint32_t pos1 = ps.back().entry.pos1);
ASSERT_ONLY(uint32_t pos2 = ps.back().entry.pos2);
assert(pos1 == 0xffff || pos0 != pos1);
assert(pos2 == 0xffff || pos0 != pos2);
assert(pos2 == 0xffff || pos1 != pos2);
} while(_partialsList[off++].entry.type == 2);
assert_eq(3, _partialsList[off-1].entry.type);
}
assert_gt(ps.size(), 0);
}
/// Call to clear the database when there is only one element in it
void clear(uint32_t patid) {
assert_eq(1, _partialsMap.count(patid));
assert_eq(1, _partialsMap.size());
_partialsMap.erase(patid);
assert_eq(0, _partialsMap.size());
_partialsList.clear();
assert_eq(0, _partialsList.size());
}
size_t size() {
return _partialsMap.size();
}
/**
* Convert a partial alignment into a QueryMutation string.
*/
static uint8_t toMutsString(const PartialAlignment& pal,
const String<Dna5>& seq,
const String<char>& quals,
String<QueryMutation>& muts,
bool maqPenalty = true)
{
reserve(muts, 4);
assert_eq(0, length(muts));
uint32_t plen = (uint32_t)length(seq);
assert_gt(plen, 0);
assert_neq(1, pal.unk.type);
// Do first mutation
uint8_t oldQuals = 0;
uint32_t pos0 = pal.entry.pos0;
assert_lt(pos0, plen);
uint16_t tpos0 = plen-1-pos0;
uint32_t chr0 = pal.entry.char0;
uint8_t oldChar = (uint8_t)seq[tpos0];
uint8_t oldQual0 = mmPenalty(maqPenalty, phredCharToPhredQual(quals[tpos0]));
assert_leq(oldQual0, 99);
oldQuals += oldQual0; // take quality hit
appendValue(muts, QueryMutation(tpos0, oldChar, chr0)); // apply mutation
if(pal.entry.pos1 != 0xffff) {
// Do second mutation
uint32_t pos1 = pal.entry.pos1;
assert_lt(pos1, plen);
uint16_t tpos1 = plen-1-pos1;
uint32_t chr1 = pal.entry.char1;
oldChar = (uint8_t)seq[tpos1];
uint8_t oldQual1 = mmPenalty(maqPenalty, phredCharToPhredQual(quals[tpos1]));
assert_leq(oldQual1, 99);
oldQuals += oldQual1; // take quality hit
assert_neq(tpos1, tpos0);
appendValue(muts, QueryMutation(tpos1, oldChar, chr1)); // apply mutation
if(pal.entry.pos2 != 0xffff) {
// Do second mutation
uint32_t pos2 = pal.entry.pos2;
assert_lt(pos2, plen);
uint16_t tpos2 = plen-1-pos2;
uint32_t chr2 = pal.entry.char2;
oldChar = (uint8_t)seq[tpos2];
uint8_t oldQual2 = mmPenalty(maqPenalty, phredCharToPhredQual(quals[tpos2]));
assert_leq(oldQual2, 99);
oldQuals += oldQual2; // take quality hit
assert_neq(tpos2, tpos0);
assert_neq(tpos2, tpos1);
append(muts, QueryMutation(tpos2, oldChar, chr2)); // apply mutation
}
}
assert_gt(length(muts), 0);
assert_leq(length(muts), 3);
return oldQuals;
}
private:
/// Maps patids to partial alignments for that patid
map<uint32_t, PartialAlignment> _partialsMap;
/// Overflow for when a patid has more than 1 partial alignment
vector<PartialAlignment> _partialsList;
/// Lock for 'partialsMap' and 'partialsList'; necessary because
/// search threads will be reading and writing them
MUTEX_T mutex_m;
};
#endif /* EBWT_SEARCH_UTIL_H_ */