Skip to content

Commit

Permalink
custom read geometry allowed as well
Browse files Browse the repository at this point in the history
  • Loading branch information
Rob Patro committed Oct 10, 2020
1 parent cb7d573 commit 2dfca74
Show file tree
Hide file tree
Showing 7 changed files with 437 additions and 218 deletions.
6 changes: 4 additions & 2 deletions include/AlevinUtils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,16 +79,18 @@ namespace alevin{

template <typename ProtocolT>
bool extractUMI(std::string& read,
std::string& read2,
ProtocolT& pt,
std::string& umi);

template <typename ProtocolT>
void getReadSequence(ProtocolT& pt,
std::string* getReadSequence(ProtocolT& pt,
std::string& seq,
std::string& seq2,
std::string& subseq);

template <typename ProtocolT>
bool extractBarcode(std::string& read, ProtocolT& pt, std::string& bc);
bool extractBarcode(std::string& read, std::string& read2, ProtocolT& pt, std::string& bc);

template <typename OrderedOptionsT>
bool writeCmdInfo(boost::filesystem::path cmdInfoPath,
Expand Down
91 changes: 77 additions & 14 deletions include/SingleCellProtocols.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,78 @@ namespace alevin{

static constexpr size_t num_tag_pieces{16};
struct TagGeometry {
uint32_t read_num{0};
chobo::static_vector<std::pair<size_t, size_t>, num_tag_pieces> substr_locs{};
// chobo::static_vector<std::pair<uint32_t,uint8_t>, 16> bc_locs;
// std::vector<std::pair<uint32_t, uint32_t>> substr_locs{};
size_t length{0};
size_t largest_index{0};

inline bool unbounded() const { return length == std::string::npos; }
// uint32_t read_num{0};
// tuples are read_num, start_pos, length
chobo::static_vector<std::pair<uint32_t, size_t>, num_tag_pieces> substr_locs1{};
chobo::static_vector<std::pair<uint32_t, size_t>, num_tag_pieces> substr_locs2{};
// the total length of the tag on read 1
size_t length1{0};
// the total length of the tag on read 2
size_t length2{0};
// the largest index on read 1
size_t largest_index1{0};
// the largest index on read 2
size_t largest_index2{0};

inline bool unbounded1() const { return length1 == std::string::npos; }
inline bool unbounded2() const { return length2 == std::string::npos; }

inline bool uses_r1() const { return !substr_locs1.empty(); }
inline bool uses_r2() const { return !substr_locs2.empty(); }

size_t length() const { return length1 + length2; }

// Given the geometry of the tag, fill in the tag from
// read 1 (`from1`) and read 2 (`from2`), placing the constructed
// tag in `to`.
//
// *assumption*: `to` is large enough to hold the tag
// *returns*: true if the tag was written completely, and false otherwise
inline bool extract_tag(std::string& from1, std::string& from2, std::string&to) {
// if anything is too short, just ignore the whole thing
if (uses_r1() and (from1.length() < largest_index1)) { return false; }
if (uses_r2() and (from2.length() < largest_index2)) { return false; }

// will point to the next place to
// begin filling the output string
auto fill_it = to.begin();

// grab anything from read 1
auto f1b = from1.begin();
for (auto& st_len : substr_locs1) {
auto f1 = f1b + st_len.first;
fill_it = std::copy(f1, f1 + st_len.second, fill_it);
}

// grab anything from read 2
auto f2b = from2.begin();
for (auto& st_len : substr_locs2) {
auto f2 = f2b + st_len.first;
fill_it = std::copy(f2, f2 + st_len.second, fill_it);
}
return true;
}

bool extract(std::string& from, std::string&to) {
if (!unbounded() and (from.length() < largest_index)) { return false; }
inline bool extract_read(std::string& from1, std::string& from2, std::string&to) {
// if anything is too short, just ignore the whole thing
if (uses_r1() and !unbounded1() and (from1.length() < largest_index1)) { return false; }
if (uses_r2() and !unbounded2() and (from2.length() < largest_index2)) { return false; }

// since the read extraction doesn't have a
// fixed size, we'll append rather than
// overwrite.
to.clear();
for (auto& st_len : substr_locs) {
to += from.substr(st_len.first, st_len.second);
// grab anything from read 1
for (auto& st_len : substr_locs1) {
to.append(from1, st_len.first, st_len.second);
}
// grab anything from read 2
for (auto& st_len : substr_locs2) {
to.append(from2, st_len.first, st_len.second);
}
return true;
}

};

std::ostream& operator<<(std::ostream& os, const TagGeometry& tg);
Expand All @@ -50,6 +105,10 @@ namespace alevin{
// template requirements right now
void set_umi_geo(TagGeometry& g) { (void)g; };
void set_bc_geo(TagGeometry& g) { (void)g; };
void set_read_geo(TagGeometry& g) { (void)g; };
uint32_t barcode_length() const { return barcodeLength; }
uint32_t umi_length() const { return umiLength; }

uint32_t barcodeLength, umiLength, maxValue;
BarcodeEnd end;
};
Expand Down Expand Up @@ -124,9 +183,13 @@ namespace alevin{
// vector of offset, length pairs
TagGeometry umi_geo;
TagGeometry bc_geo;
TagGeometry read_geo;

void set_umi_geo(TagGeometry& g) { umi_geo = g; umiLength = umi_geo.length; };
void set_bc_geo(TagGeometry& g) { bc_geo = g; barcodeLength = bc_geo.length; };
void set_umi_geo(TagGeometry& g) { umi_geo = g; umiLength = umi_geo.length1 + umi_geo.length2; }
void set_bc_geo(TagGeometry& g) { bc_geo = g; barcodeLength = bc_geo.length1 + bc_geo.length2; }
void set_read_geo(TagGeometry& g) { read_geo = g; }
uint32_t barcode_length() const { return barcodeLength; }
uint32_t umi_length() const { return umiLength; }

// These values do nothing in this class except
// maintain template compat ... fix this design later.
Expand Down
19 changes: 11 additions & 8 deletions src/Alevin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ void densityCalculator(single_parser* parser,

uint64_t usedNumBarcodesLocal{0};
uint64_t totNumBarcodesLocal{0};
std::string extractedBarcode;
std::string extractedBarcode( aopt.protocol.barcodeLength, 'N' );
while (parser->refill(rg)) {
rangeSize = rg.size();
for (size_t i = 0; i < rangeSize; ++i) { // For all the read in this batch
Expand All @@ -132,8 +132,11 @@ void densityCalculator(single_parser* parser,
//if (aopt.protocol.end == bcEnd::THREE) {
// std::reverse(seq.begin(), seq.end());
//}
extractedBarcode.clear();
bool extraction_ok = aut::extractBarcode(seq, aopt.protocol, extractedBarcode);
//extractedBarcode.clear();

// NOTE: This means custom barcode extraction can't be on
// on more than one read in this mode! Come back and fix this later
bool extraction_ok = aut::extractBarcode(seq, seq, aopt.protocol, extractedBarcode);
bool seqOk = (extraction_ok) ? aut::sequenceCheck(extractedBarcode) : false;
if (not seqOk){
bool recovered = aut::recoverBarcode(extractedBarcode);
Expand Down Expand Up @@ -541,8 +544,8 @@ bool writeFastq(AlevinOpts<ProtocolT>& aopt,
TrueBcsT& trueBarcodes){
size_t rangeSize{0};
uint32_t totNumBarcodes{0};
std::string barcode;
std::string umi;
std::string barcode( aopt.protocol.barcodeLength, 'N');
std::string umi( aopt.protocol.umiLength, 'N');

#if defined(__linux) && defined(__GLIBCXX__) && __GLIBCXX__ >= 20200128
std::random_device rd("/dev/urandom");
Expand All @@ -562,10 +565,10 @@ bool writeFastq(AlevinOpts<ProtocolT>& aopt,
rangeSize = rg.size();
for (size_t i = 0; i < rangeSize; ++i) { // For all the read in this batch
auto& rp = rg[i];
barcode.clear();
// barcode.clear();
if(aopt.protocol.end == bcEnd::FIVE){
{
bool extracted_barcode = aut::extractBarcode(rp.first.seq, aopt.protocol, barcode);
bool extracted_barcode = aut::extractBarcode(rp.first.seq, rp.second.seq, aopt.protocol, barcode);
bool seqOk = (extracted_barcode) ? aut::sequenceCheck(barcode) : false;
if (not seqOk){
bool recovered = aut::recoverBarcode(barcode);
Expand All @@ -574,7 +577,7 @@ bool writeFastq(AlevinOpts<ProtocolT>& aopt,
}

{
bool seqOk = aut::extractUMI(rp.first.seq, aopt.protocol, umi);
bool seqOk = aut::extractUMI(rp.first.seq, rp.second.seq, aopt.protocol, umi);
if (not seqOk){
std::cerr<<"Can't extract UMI";
return false;
Expand Down

0 comments on commit 2dfca74

Please sign in to comment.