Skip to content

Commit

Permalink
Merge pull request #703 from COMBINE-lab/indrop
Browse files Browse the repository at this point in the history
Add inDrop v2 to alevin

Fix corner case logic bug where barcode recover attempted even if extracted_barcode was `false`.
  • Loading branch information
rob-p committed Nov 21, 2021
2 parents 5bdbf3c + b7fcc53 commit f95567a
Show file tree
Hide file tree
Showing 10 changed files with 106 additions and 65 deletions.
3 changes: 3 additions & 0 deletions include/AlevinUtils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <algorithm>
#include <limits>
#include <string>
#include <numeric>

#include "spdlog/spdlog.h"

Expand Down Expand Up @@ -72,6 +73,8 @@ namespace alevin{
void readWhitelist(bfs::path& filePath,
TrueBcsT& trueBarcodes);

unsigned int hammingDistance(const std::string s1, const std::string s2);

template <typename ProtocolT>
bool processAlevinOpts(AlevinOpts<ProtocolT>& aopt,
SalmonOpts& sopt, bool noTgMap,
Expand Down
14 changes: 8 additions & 6 deletions include/SingleCellProtocols.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,17 +124,19 @@ namespace alevin{
DropSeq(): Rule(12, 8, BarcodeEnd::FIVE, 16777216){}
};

struct InDrop : Rule{
//InDrop starts from 5end with variable
//length barcodes so provide the full
// length of the barcode including w1.
// UMI length is 6
InDrop(): Rule(42, 6, BarcodeEnd::FIVE, 22347776){}
struct InDropV2 : Rule{
//InDropV2 starts from 5end with variable
//length barcodes where barcode1 varies from 8 to 11 bp
// followed by w1 sequence, 8 bp barcode2 and 6bp UMI
InDropV2(): Rule(20, 6, BarcodeEnd::FIVE, 22347776){}

std::string w1;
std::size_t w1Length, maxHammingDist = 2, bc2Len = 8;
void setW1(std::string& w1_){
w1 = w1_;
w1Length = w1.length();
}
std::size_t w1Pos = 0, bc2EndPos;
};

struct CITESeq : Rule{
Expand Down
16 changes: 7 additions & 9 deletions src/Alevin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1022,7 +1022,7 @@ salmon-based processing of single-cell RNA-seq data.

bool noTgMap {false};
bool dropseq = vm["dropseq"].as<bool>();
bool indrop = vm["indrop"].as<bool>();
bool indropV2 = vm["indropV2"].as<bool>();
bool citeseq = vm["citeseq"].as<bool>();
bool chromV3 = vm["chromiumV3"].as<bool>();
bool chrom = vm["chromium"].as<bool>();
Expand All @@ -1040,7 +1040,7 @@ salmon-based processing of single-cell RNA-seq data.

uint8_t validate_num_protocols {0};
if (dropseq) validate_num_protocols += 1;
if (indrop) validate_num_protocols += 1;
if (indropV2) validate_num_protocols += 1;
if (citeseq) { validate_num_protocols += 1; noTgMap = true;}
if (chromV3) validate_num_protocols += 1;
if (chrom) validate_num_protocols += 1;
Expand Down Expand Up @@ -1081,20 +1081,18 @@ salmon-based processing of single-cell RNA-seq data.
vm, commentString, noTgMap,
barcodeFiles, readFiles, salmonIndex);
}
else if(indrop){
std::cout<<"Indrop get neighbors removed, please use other protocols";
exit(1);
else if(indropV2){
if(vm.count("w1") != 0){
std::string w1 = vm["w1"].as<std::string>();
AlevinOpts<apt::InDrop> aopt;
AlevinOpts<apt::InDropV2> aopt;
aopt.protocol.setW1(w1);
//aopt.jointLog->warn("Using InDrop Setting for Alevin");
//aopt.jointLog->warn("Using InDropV2 Setting for Alevin");
initiatePipeline(aopt, sopt, orderedOptions,
vm, commentString, noTgMap,
barcodeFiles, readFiles, salmonIndex);
}
else{
fmt::print(stderr, "ERROR: indrop needs w1 flag too.\n Exiting Now");
fmt::print(stderr, "ERROR: indropV2 needs w1 flag too.\n Exiting Now");
exit(1);
}
}
Expand All @@ -1104,7 +1102,7 @@ salmon-based processing of single-cell RNA-seq data.
aopt.protocol.setFeatureLength(vm["featureLength"].as<size_t>());
aopt.protocol.setFeatureStart(vm["featureStart"].as<size_t>());

//aopt.jointLog->warn("Using InDrop Setting for Alevin");
//aopt.jointLog->warn("Using InDropV2 Setting for Alevin");
initiatePipeline(aopt, sopt, orderedOptions,
vm, commentString, noTgMap,
barcodeFiles, readFiles, salmonIndex);
Expand Down
2 changes: 1 addition & 1 deletion src/AlevinHash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ int salmonHashQuantify(AlevinOpts<apt::CITESeq>& aopt,
bfs::path& outputDirectory,
CFreqMapT& freqCounter);
template
int salmonHashQuantify(AlevinOpts<apt::InDrop>& aopt,
int salmonHashQuantify(AlevinOpts<apt::InDropV2>& aopt,
bfs::path& outputDirectory,
CFreqMapT& freqCounter);
template
Expand Down
74 changes: 56 additions & 18 deletions src/AlevinUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,12 +134,12 @@ namespace alevin {
return &seq2;
}
template <>
std::string* getReadSequence(apt::InDrop& protocol,
std::string* getReadSequence(apt::InDropV2& protocol,
std::string& seq,
std::string& seq2,
std::string& subseq){
(void)seq;
return &seq2;
(void)seq2;
return &seq;
}
// end of read extraction

Expand Down Expand Up @@ -251,14 +251,14 @@ namespace alevin {
return true;
}
template <>
bool extractUMI<apt::InDrop>(std::string& read,
bool extractUMI<apt::InDropV2>(std::string& read,
std::string& read2,
apt::InDrop& pt,
apt::InDropV2& pt,
std::string& umi){
(void)read;
(void)read2;
std::cout<<"Incorrect call for umi extract";
exit(1);
return (read2.length() >= pt.w1Length + pt.barcodeLength + pt.umiLength) ?
(umi.assign(read2, pt.bc2EndPos, pt.umiLength), true) : false;
return true;
}

template <>
Expand Down Expand Up @@ -378,21 +378,49 @@ namespace alevin {
(bc.assign(read, pt.umiLength, pt.barcodeLength), true) : false;
}
template <>
bool extractBarcode<apt::InDrop>(std::string& read,
bool extractBarcode<apt::InDropV2>(std::string& read,
std::string& read2,
apt::InDrop& pt, std::string& bc){
(void)read2;
std::string::size_type index = read.find(pt.w1);
if (index == std::string::npos){
return false;
apt::InDropV2& pt, std::string& bc){
(void)read;
if(read2.length() >= (pt.w1Length + pt.barcodeLength + pt.umiLength)) {
pt.w1Pos = read2.find(pt.w1);
if (pt.w1Pos == std::string::npos){
bool found = false;
for( int i = 8; i <= 11; i++){
if (hammingDistance(pt.w1, read2.substr(i,pt.w1Length)) <= pt.maxHammingDist) {
pt.w1Pos = i;
found = true;
break;
}
}
if (!found) {return false;}
}
bc = read.substr(0, index);
if(bc.size()<8 or bc.size()>12){
if(pt.w1Pos < 8 or pt.w1Pos > 11){
return false;
}
bc = read2.substr(0, pt.w1Pos);
uint32_t offset = bc.size()+pt.w1.size();
bc += read.substr(offset, offset+8);
bc += read2.substr(offset, pt.bc2Len);
switch (pt.barcodeLength - bc.size())
{
case 1:
bc += "A";
break;
case 2:
bc += "AT";
break;
case 3:
bc += "AAG";
break;
case 4:
bc += "AAAC";
break;
}
pt.bc2EndPos = offset+pt.bc2Len;
return true;
} else {
return false;
}
}

void getIndelNeighbors(
Expand Down Expand Up @@ -476,6 +504,16 @@ namespace alevin {
neighbors);
}

unsigned int hammingDistance(const std::string s1, const std::string s2){
if(s1.size() != s2.size()){
throw std::invalid_argument("Strings have different lengths, can't compute hamming distance");
}

// compute dot product for all postisions, start with 0 and add if the values are not equal
return std::inner_product(s1.begin(),s1.end(),s2.begin(), 0, std::plus<unsigned int>(),
std::not2(std::equal_to<std::string::value_type>()));
}

void getTxpToGeneMap(spp::sparse_hash_map<uint32_t, uint32_t>& txpToGeneMap,
spp::sparse_hash_map<std::string, uint32_t>& geneIdxMap,
const std::string& t2gFileName,
Expand Down Expand Up @@ -1313,7 +1351,7 @@ namespace alevin {
SalmonOpts& sopt, bool noTgMap,
boost::program_options::variables_map& vm);
template
bool processAlevinOpts(AlevinOpts<apt::InDrop>& aopt,
bool processAlevinOpts(AlevinOpts<apt::InDropV2>& aopt,
SalmonOpts& sopt, bool noTgMap,
boost::program_options::variables_map& vm);
template
Expand Down
2 changes: 1 addition & 1 deletion src/CollapsedCellOptimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1438,7 +1438,7 @@ template
bool CollapsedCellOptimizer::optimize(EqMapT& fullEqMap,
spp::sparse_hash_map<uint32_t, uint32_t>& txpToGeneMap,
spp::sparse_hash_map<std::string, uint32_t>& geneIdxMap,
AlevinOpts<apt::InDrop>& aopt,
AlevinOpts<apt::InDropV2>& aopt,
GZipWriter& gzw,
std::vector<std::string>& trueBarcodes,
std::vector<uint32_t>& umiCount,
Expand Down
6 changes: 3 additions & 3 deletions src/GZipWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1865,8 +1865,8 @@ bool GZipWriter::writeEquivCounts<SCExpT, apt::CITESeq>(
const AlevinOpts<apt::CITESeq>& aopts,
SCExpT& readExp);
template
bool GZipWriter::writeEquivCounts<SCExpT, apt::InDrop>(
const AlevinOpts<apt::InDrop>& aopts,
bool GZipWriter::writeEquivCounts<SCExpT, apt::InDropV2>(
const AlevinOpts<apt::InDropV2>& aopts,
SCExpT& readExp);
template
bool GZipWriter::writeEquivCounts<SCExpT, apt::ChromiumV3>(
Expand Down Expand Up @@ -1911,7 +1911,7 @@ template bool
GZipWriter::writeMetaAlevin<apt::CITESeq>(const AlevinOpts<apt::CITESeq>& opts,
boost::filesystem::path aux_dir);
template bool
GZipWriter::writeMetaAlevin<apt::InDrop>(const AlevinOpts<apt::InDrop>& opts,
GZipWriter::writeMetaAlevin<apt::InDropV2>(const AlevinOpts<apt::InDropV2>& opts,
boost::filesystem::path aux_dir);
template bool
GZipWriter::writeMetaAlevin<apt::Chromium>(const AlevinOpts<apt::Chromium>& opts,
Expand Down
4 changes: 2 additions & 2 deletions src/ProgramOptionsGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -335,8 +335,8 @@ namespace salmon {
"alevin-developer Options");
alevindevs.add_options()
(
"indrop", po::bool_switch()->default_value(alevin::defaults::isInDrop),
"Use inDrop (not extensively tested) Single Cell protocol for the library. must specify w1 too.")
"indropV2", po::bool_switch()->default_value(alevin::defaults::isInDrop),
"Use inDropV2 Single Cell protocol for the library. Must specify w1 too.")
(
"w1", po::value<std::string>(),
"Must be used in conjunction with inDrop;")
Expand Down
48 changes: 24 additions & 24 deletions src/SalmonAlevin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -663,7 +663,7 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
//barcode.clear();
nonstd::optional<uint32_t> barcodeIdx;
extraBAMtags.clear();
bool seqOk;
bool seqOk = false;

// keep track of the *least* freqeuntly
// occurring hit in this fragment to consider
Expand All @@ -673,13 +673,13 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re

if (alevinOpts.protocol.end == bcEnd::FIVE ||
alevinOpts.protocol.end == bcEnd::THREE){
bool extracted_bc = aut::extractBarcode(rp.first.seq, rp.second.seq, localProtocol, barcode);
seqOk = (extracted_bc) ?
aut::sequenceCheck(barcode, Sequence::BARCODE) : false;

if (not seqOk){
bool recovered = aut::recoverBarcode(barcode);
if (recovered) { seqOk = true; }
bool extracted_barcode = aut::extractBarcode(rp.first.seq, rp.second.seq, localProtocol, barcode);
if (extracted_barcode) {
seqOk = aut::sequenceCheck(barcode, Sequence::BARCODE);
if (not seqOk){
bool recovered = aut::recoverBarcode(barcode);
if (recovered) { seqOk = true; }
}
}

// If we have a valid barcode
Expand Down Expand Up @@ -1190,17 +1190,17 @@ void process_reads_sc_align(paired_parser* parser, ReadExperimentT& readExp, Rea
//barcode.clear();
nonstd::optional<uint32_t> barcodeIdx;
extraBAMtags.clear();
bool seqOk;
bool seqOk = false;

if (alevinOpts.protocol.end == bcEnd::FIVE ||
alevinOpts.protocol.end == bcEnd::THREE){
bool extracted_barcode = aut::extractBarcode(rp.first.seq, rp.second.seq, localProtocol, barcode);
seqOk = (extracted_barcode) ?
aut::sequenceCheck(barcode, Sequence::BARCODE) : false;

if (not seqOk){
bool recovered = aut::recoverBarcode(barcode);
if (recovered) { seqOk = true; }
if (extracted_barcode) {
seqOk = aut::sequenceCheck(barcode, Sequence::BARCODE);
if (not seqOk){
bool recovered = aut::recoverBarcode(barcode);
if (recovered) { seqOk = true; }
}
}

// If we have a valid barcode
Expand Down Expand Up @@ -1648,17 +1648,17 @@ void processReadsQuasi(
//barcode.clear();
nonstd::optional<uint32_t> barcodeIdx;
extraBAMtags.clear();
bool seqOk;
bool seqOk = false;

if (alevinOpts.protocol.end == bcEnd::FIVE ||
alevinOpts.protocol.end == bcEnd::THREE){
bool extracted_barcode = aut::extractBarcode(rp.first.seq, rp.second.seq, localProtocol, barcode);
seqOk = (extracted_barcode) ?
aut::sequenceCheck(barcode, Sequence::BARCODE) : false;

if (not seqOk){
bool recovered = aut::recoverBarcode(barcode);
if (recovered) { seqOk = true; }
if (extracted_barcode) {
seqOk = aut::sequenceCheck(barcode, Sequence::BARCODE);
if (not seqOk){
bool recovered = aut::recoverBarcode(barcode);
if (recovered) { seqOk = true; }
}
}

// If we have a barcode sequence, but not yet an index
Expand Down Expand Up @@ -3025,11 +3025,11 @@ alevinQuant(AlevinOpts<apt::CITESeq>& aopt, SalmonOpts& sopt,
std::unique_ptr<SalmonIndex>& salmonIndex);

template int
alevin_sc_align(AlevinOpts<apt::InDrop>& aopt, SalmonOpts& sopt,
alevin_sc_align(AlevinOpts<apt::InDropV2>& aopt, SalmonOpts& sopt,
boost::program_options::parsed_options& orderedOptions,
std::unique_ptr<SalmonIndex>& salmonIndex);
template int
alevinQuant(AlevinOpts<apt::InDrop>& aopt, SalmonOpts& sopt,
alevinQuant(AlevinOpts<apt::InDropV2>& aopt, SalmonOpts& sopt,
SoftMapT& barcodeMap, TrueBcsT& trueBarcodes,
spp::sparse_hash_map<uint32_t, uint32_t>& txpToGeneMap,
spp::sparse_hash_map<std::string, uint32_t>& geneIdxMap,
Expand Down
2 changes: 1 addition & 1 deletion src/WhiteList.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ namespace alevin {
std::vector<std::string>& trueBarcodes,
bool useRibo, bool useMito,
size_t numLowConfidentBarcode);
template bool performWhitelisting(AlevinOpts<alevin::protocols::InDrop>& aopt,
template bool performWhitelisting(AlevinOpts<alevin::protocols::InDropV2>& aopt,
std::vector<std::string>& trueBarcodes,
bool useRibo, bool useMito,
size_t numLowConfidentBarcode);
Expand Down

0 comments on commit f95567a

Please sign in to comment.