From 795987cd981169dadb6ee7c98dd907aa183dd265 Mon Sep 17 00:00:00 2001 From: Changqing Wang Date: Thu, 14 Sep 2023 17:44:37 +1000 Subject: [PATCH] custom barcode patterns; IUPAC ambiguous codes --- flexiplex.c++ | 172 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 112 insertions(+), 60 deletions(-) diff --git a/flexiplex.c++ b/flexiplex.c++ index 9a854c5..813f156 100644 --- a/flexiplex.c++ +++ b/flexiplex.c++ @@ -69,14 +69,6 @@ void reverse_compliment(string & seq){ transform(seq.begin(),seq.end(),seq.begin(),compliment); } -//Holds the search string patterns -struct SearchSeq { - string primer; - string polyA; - string umi_seq; - string temp_barcode; -} search_pattern ; - //Holds the found barcode and associated information struct Barcode { string barcode; @@ -149,8 +141,8 @@ unsigned int edit_distance(const std::string& s1, const std::string& s2, unsigne Barcode get_barcode(string & seq, unordered_set *known_barcodes, int flank_max_editd, - int barcode_max_editd){//, - // SearchSeq & ss){ + int barcode_max_editd, + const std::vector> &search_pattern) { const int OFFSET=5; //wiggle room in bases of the expected barcode start site to search. @@ -159,15 +151,27 @@ Barcode get_barcode(string & seq, barcode.editd=100; barcode.flank_editd=100; barcode.unambiguous = false; //initialise edlib configuration - EdlibEqualityPair additionalEqualities[5] = {{'?','A'},{'?','C'},{'?','G'},{'?','T'},{'?','N'}}; - EdlibAlignConfig edlibConf = {flank_max_editd, EDLIB_MODE_HW, EDLIB_TASK_PATH, additionalEqualities, 5}; + EdlibEqualityPair additionalEqualities[28] = { + {'R', 'A'}, {'R', 'G'}, + {'K', 'G'}, {'K', 'T'}, + {'S', 'G'}, {'S', 'C'}, + {'Y', 'C'}, {'Y', 'T'}, + {'M', 'A'}, {'M', 'C'}, + {'W', 'A'}, {'W', 'T'}, + {'B', 'C'}, {'B', 'G'}, {'B', 'T'}, + {'H', 'A'}, {'H', 'C'}, {'H', 'T'}, + {'N', 'A'}, {'N', 'C'}, {'N', 'G'}, {'N', 'T'}, + {'D', 'A'}, {'D', 'G'}, {'D', 'T'}, + {'V', 'A'}, {'V', 'C'}, {'V', 'G'} + }; + EdlibAlignConfig edlibConf = {flank_max_editd, EDLIB_MODE_HW, EDLIB_TASK_PATH, additionalEqualities, 28}; //search for primer and ployT (barcode and umi as wildcards) - string search_string= - search_pattern.primer+ - search_pattern.temp_barcode+ - search_pattern.umi_seq+ - search_pattern.polyA; + std::string search_string; + for (const auto &pair : search_pattern) { + search_string += pair.second; + } + EdlibAlignResult result = edlibAlign(search_string.c_str(), search_string.length(), seq.c_str(), seq.length(), edlibConf); if(result.status != EDLIB_STATUS_OK | result.numLocations==0 ){ edlibFreeAlignResult(result); @@ -178,14 +182,12 @@ Barcode get_barcode(string & seq, barcode.flank_end=result.endLocations[0]; // Extract sub-patterns from aligment directly - vector subpattern_lengths = { - search_pattern.primer.length(), - search_pattern.temp_barcode.length(), - search_pattern.umi_seq.length(), - search_pattern.polyA.length() - }; + std::vector subpattern_lengths; + for (const auto &pair : search_pattern) { + subpattern_lengths.push_back(pair.second.length()); + } - std::vector subpattern_ends; +std::vector subpattern_ends; subpattern_ends.resize(subpattern_lengths.size()); std::partial_sum(subpattern_lengths.begin(), subpattern_lengths.end(), subpattern_ends.begin()); @@ -220,18 +222,56 @@ Barcode get_barcode(string & seq, //if not checking against known list of barcodes, return sequence after the primer //also check for a perfect match straight up as this will save computer later. - string exact_bc=seq.substr(read_to_subpatterns[0], read_to_subpatterns[1] - read_to_subpatterns[0]); - if(known_barcodes->size()==0 || (known_barcodes->find(exact_bc) != known_barcodes->end())){ + + int bc_index = -1, umi_index = -1; + auto it_pattern = + std::find_if(search_pattern.begin(), search_pattern.end(), + [](const std::pair &pair) { + return pair.first == "BC"; + }); + if (it_pattern != search_pattern.end()) { + bc_index = std::distance(search_pattern.begin(), it_pattern); + } else { + // error + } + it_pattern = + std::find_if(search_pattern.begin(), search_pattern.end(), + [](const std::pair &pair) { + return pair.first == "UMI"; + }); + if (it_pattern != search_pattern.end()) { + umi_index = std::distance(search_pattern.begin(), it_pattern); + } else { + // error + } + + std::string exact_bc = seq.substr( + read_to_subpatterns[bc_index == 0 ? barcode.flank_start : (bc_index - 1)], + read_to_subpatterns[bc_index] - + read_to_subpatterns[bc_index == 0 ? barcode.flank_start + : (bc_index - 1)]); +if(known_barcodes->size()==0 || (known_barcodes->find(exact_bc) != known_barcodes->end())){ barcode.barcode=exact_bc; barcode.editd=0; barcode.unambiguous=true; - barcode.umi=seq.substr(read_to_subpatterns[1], search_pattern.umi_seq.length()); - return(barcode); + if (umi_index == -1) { + barcode.umi = ""; + } else { + barcode.umi = + seq.substr(read_to_subpatterns[umi_index == 0 ? barcode.flank_start + : (umi_index - 1)], + search_pattern[umi_index].second.length()); + } +return(barcode); } // otherwise widen our search space and the look for matches with errors - string barcode_seq=seq.substr(read_to_subpatterns[0]-OFFSET,search_pattern.temp_barcode.length()+2*OFFSET); - + std::string barcode_seq = + seq.substr(read_to_subpatterns[bc_index == 0 ? barcode.flank_start + : (bc_index - 1)] - + OFFSET, + search_pattern[bc_index].second.length() + 2 * OFFSET); + //iterate over all the known barcodes, checking each sequentially unordered_set::iterator known_barcodes_itr=known_barcodes->begin(); unsigned int editDistance; unsigned int endDistance; @@ -244,8 +284,14 @@ Barcode get_barcode(string & seq, barcode.unambiguous=true; barcode.editd=editDistance; barcode.barcode=*known_barcodes_itr; - barcode.umi=seq.substr(read_to_subpatterns[0]-OFFSET+endDistance,search_pattern.umi_seq.length());//assumes no error in UMI seq. - if(editDistance==0){ //if perfect match is found we're done. + if (umi_index == -1) { + barcode.umi = ""; + } else { + barcode.umi = + seq.substr(read_to_subpatterns[0] - OFFSET + endDistance, + search_pattern[umi_index].second.length()); // assumes no error in UMI seq. + } +if(editDistance==0){ //if perfect match is found we're done. return(barcode); } } @@ -254,12 +300,11 @@ Barcode get_barcode(string & seq, } //search a read for one or more barcodes (parent function that calls get_barcode) -vector big_barcode_search(string & sequence, unordered_set & known_barcodes, - int max_flank_editd, int max_editd){ //, SearchSeq & ss){ +vector big_barcode_search(string & sequence, unordered_set & known_barcodes, int max_flank_editd, int max_editd, const std::vector> &search_pattern) { vector return_vec; //vector of all the barcodes found //search for barcode - Barcode result=get_barcode(sequence,&known_barcodes,max_flank_editd,max_editd); //,ss); + Barcode result=get_barcode(sequence,&known_barcodes,max_flank_editd,max_editd, search_pattern); //,ss); if(result.editd<=max_editd && result.unambiguous) //add to return vector if edit distance small enough return_vec.push_back(result); @@ -271,7 +316,7 @@ vector big_barcode_search(string & sequence, unordered_set & kn masked_sequence.replace(return_vec.at(i).flank_start,flank_length,string(flank_length,'X')); } //recursively call this function until no more barcodes are found vector masked_res; - masked_res=big_barcode_search(masked_sequence,known_barcodes,max_flank_editd,max_editd); //,ss); + masked_res=big_barcode_search(masked_sequence,known_barcodes,max_flank_editd,max_editd, search_pattern); //,ss); return_vec.insert(return_vec.end(),masked_res.begin(),masked_res.end()); //add to result } @@ -371,7 +416,8 @@ void print_read(string read_id,string read, string qual, // separated out from main so that this can be run with threads void search_read(vector & reads, unordered_set & known_barcodes, - int flank_edit_distance, int edit_distance){ + int flank_edit_distance, int edit_distance, + const std::vector> &search_pattern) { for(int r=0; r & reads, unordered_set & known_bar reads[r].vec_bc_for=big_barcode_search(reads[r].line, known_barcodes, flank_edit_distance, - edit_distance); + edit_distance, + search_pattern); reads[r].rev_line=reads[r].line; reverse_compliment(reads[r].rev_line); //Check the reverse compliment of the read reads[r].vec_bc_rev=big_barcode_search(reads[r].rev_line, known_barcodes, flank_edit_distance, - edit_distance); + edit_distance, + search_pattern); } } @@ -410,11 +458,8 @@ int main(int argc, char **argv){ bool split_file_by_barcode=false; //(s) bool remove_barcodes=true; //(r) - search_pattern.primer = "CTACACGACGCTCTTCCGATCT"; //(p) - search_pattern.polyA = string(9,'T'); //(T) - search_pattern.umi_seq = string(12,'?'); //(length u) - search_pattern.temp_barcode = string(16,'?'); //(length b) - + std::vector> search_pattern; + //Set of known barcodes unordered_set known_barcodes; unordered_set found_barcodes; @@ -454,10 +499,6 @@ int main(int argc, char **argv){ print_usage(); exit(1); //case barcode file is empty } - //set barcode length automatically from known barcodes.. - int bl=(known_barcodes.begin())->length(); - search_pattern.temp_barcode=string(bl,'?'); - cerr << "Setting barcode length automatically to " << bl << "\n"; params+=2; break; } @@ -480,28 +521,26 @@ int main(int argc, char **argv){ break; } case 'l':{ - search_pattern.primer=optarg; - cerr << "Setting primer to search for: " << search_pattern.primer << "\n"; + search_pattern.push_back(std::make_pair("Primer", optarg)); + cerr << "Setting primer to search for: " << optarg << "\n"; params+=2; break; } case 'r':{ - search_pattern.polyA=optarg; - cerr << "Setting polyT to search for: " << search_pattern.polyA << "\n"; + search_pattern.push_back(std::make_pair("PolyT", optarg)); + cerr << "Setting polyT to search for: " << optarg << "\n"; params+=2; break; } case 'u':{ - int ul=atoi(optarg); - search_pattern.umi_seq=string(ul,'?'); - cerr << "Setting UMI length to " << ul << "\n"; + search_pattern.push_back(std::make_pair("UMI", optarg)); + cerr << "Setting UMI to search for: " << optarg << "\n"; params+=2; break; } case 'b':{ - int bl=atoi(optarg); - search_pattern.temp_barcode=string(bl,'?'); - cerr << "Setting barcode length to " << bl << "\n"; + search_pattern.push_back(std::make_pair("BC", optarg)); + cerr << "Setting barcode to search for: " << optarg << "\n"; params+=2; break; } @@ -540,6 +579,19 @@ int main(int argc, char **argv){ } } + if (search_pattern.empty()) { + search_pattern = { + {"primer", "CTACACGACGCTCTTCCGATCT"}, + {"BC", std::string(16, 'N')}, + {"UMI", std::string(12, 'N')}, + {"polyA", std::string(9, 'T')} + }; + } else { + for (auto i : search_pattern) { + std::cerr << i.first << ": " << i.second << "\n"; + } + } + cerr << "For usage information type: flexiplex -h" << endl; istream * in; @@ -635,13 +687,13 @@ int main(int argc, char **argv){ cout << "Read="<