Skip to content

Commit

Permalink
custom barcode patterns; IUPAC ambiguous codes
Browse files Browse the repository at this point in the history
  • Loading branch information
ChangqingW committed Sep 14, 2023
1 parent f343077 commit 795987c
Showing 1 changed file with 112 additions and 60 deletions.
172 changes: 112 additions & 60 deletions flexiplex.c++
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,6 @@ void reverse_compliment(string & seq){
transform(seq.begin(),seq.end(),seq.begin(),compliment);
}

//Holds the search string patterns
struct SearchSeq {
string primer;
string polyA;
string umi_seq;
string temp_barcode;
} search_pattern ;

//Holds the found barcode and associated information
struct Barcode {
string barcode;
Expand Down Expand Up @@ -149,8 +141,8 @@ unsigned int edit_distance(const std::string& s1, const std::string& s2, unsigne
Barcode get_barcode(string & seq,
unordered_set<string> *known_barcodes,
int flank_max_editd,
int barcode_max_editd){//,
// SearchSeq & ss){
int barcode_max_editd,
const std::vector<std::pair<std::string, std::string>> &search_pattern) {

const int OFFSET=5; //wiggle room in bases of the expected barcode start site to search.

Expand All @@ -159,15 +151,27 @@ Barcode get_barcode(string & seq,
barcode.editd=100; barcode.flank_editd=100; barcode.unambiguous = false;

//initialise edlib configuration
EdlibEqualityPair additionalEqualities[5] = {{'?','A'},{'?','C'},{'?','G'},{'?','T'},{'?','N'}};
EdlibAlignConfig edlibConf = {flank_max_editd, EDLIB_MODE_HW, EDLIB_TASK_PATH, additionalEqualities, 5};
EdlibEqualityPair additionalEqualities[28] = {
{'R', 'A'}, {'R', 'G'},
{'K', 'G'}, {'K', 'T'},
{'S', 'G'}, {'S', 'C'},
{'Y', 'C'}, {'Y', 'T'},
{'M', 'A'}, {'M', 'C'},
{'W', 'A'}, {'W', 'T'},
{'B', 'C'}, {'B', 'G'}, {'B', 'T'},
{'H', 'A'}, {'H', 'C'}, {'H', 'T'},
{'N', 'A'}, {'N', 'C'}, {'N', 'G'}, {'N', 'T'},
{'D', 'A'}, {'D', 'G'}, {'D', 'T'},
{'V', 'A'}, {'V', 'C'}, {'V', 'G'}
};
EdlibAlignConfig edlibConf = {flank_max_editd, EDLIB_MODE_HW, EDLIB_TASK_PATH, additionalEqualities, 28};

//search for primer and ployT (barcode and umi as wildcards)
string search_string=
search_pattern.primer+
search_pattern.temp_barcode+
search_pattern.umi_seq+
search_pattern.polyA;
std::string search_string;
for (const auto &pair : search_pattern) {
search_string += pair.second;
}

EdlibAlignResult result = edlibAlign(search_string.c_str(), search_string.length(), seq.c_str(), seq.length(), edlibConf);
if(result.status != EDLIB_STATUS_OK | result.numLocations==0 ){
edlibFreeAlignResult(result);
Expand All @@ -178,14 +182,12 @@ Barcode get_barcode(string & seq,
barcode.flank_end=result.endLocations[0];

// Extract sub-patterns from aligment directly
vector<long unsigned int> subpattern_lengths = {
search_pattern.primer.length(),
search_pattern.temp_barcode.length(),
search_pattern.umi_seq.length(),
search_pattern.polyA.length()
};
std::vector<long unsigned int> subpattern_lengths;
for (const auto &pair : search_pattern) {
subpattern_lengths.push_back(pair.second.length());
}

std::vector<long unsigned int> subpattern_ends;
std::vector<long unsigned int> subpattern_ends;
subpattern_ends.resize(subpattern_lengths.size());
std::partial_sum(subpattern_lengths.begin(), subpattern_lengths.end(), subpattern_ends.begin());

Expand Down Expand Up @@ -220,18 +222,56 @@ Barcode get_barcode(string & seq,

//if not checking against known list of barcodes, return sequence after the primer
//also check for a perfect match straight up as this will save computer later.
string exact_bc=seq.substr(read_to_subpatterns[0], read_to_subpatterns[1] - read_to_subpatterns[0]);
if(known_barcodes->size()==0 || (known_barcodes->find(exact_bc) != known_barcodes->end())){

int bc_index = -1, umi_index = -1;
auto it_pattern =
std::find_if(search_pattern.begin(), search_pattern.end(),
[](const std::pair<std::string, std::string> &pair) {
return pair.first == "BC";
});
if (it_pattern != search_pattern.end()) {
bc_index = std::distance(search_pattern.begin(), it_pattern);
} else {
// error
}
it_pattern =
std::find_if(search_pattern.begin(), search_pattern.end(),
[](const std::pair<std::string, std::string> &pair) {
return pair.first == "UMI";
});
if (it_pattern != search_pattern.end()) {
umi_index = std::distance(search_pattern.begin(), it_pattern);
} else {
// error
}

std::string exact_bc = seq.substr(
read_to_subpatterns[bc_index == 0 ? barcode.flank_start : (bc_index - 1)],
read_to_subpatterns[bc_index] -
read_to_subpatterns[bc_index == 0 ? barcode.flank_start
: (bc_index - 1)]);
if(known_barcodes->size()==0 || (known_barcodes->find(exact_bc) != known_barcodes->end())){
barcode.barcode=exact_bc;
barcode.editd=0;
barcode.unambiguous=true;
barcode.umi=seq.substr(read_to_subpatterns[1], search_pattern.umi_seq.length());
return(barcode);
if (umi_index == -1) {
barcode.umi = "";
} else {
barcode.umi =
seq.substr(read_to_subpatterns[umi_index == 0 ? barcode.flank_start
: (umi_index - 1)],
search_pattern[umi_index].second.length());
}
return(barcode);
}

// otherwise widen our search space and the look for matches with errors
string barcode_seq=seq.substr(read_to_subpatterns[0]-OFFSET,search_pattern.temp_barcode.length()+2*OFFSET);

std::string barcode_seq =
seq.substr(read_to_subpatterns[bc_index == 0 ? barcode.flank_start
: (bc_index - 1)] -
OFFSET,
search_pattern[bc_index].second.length() + 2 * OFFSET);

//iterate over all the known barcodes, checking each sequentially
unordered_set<string>::iterator known_barcodes_itr=known_barcodes->begin();
unsigned int editDistance; unsigned int endDistance;
Expand All @@ -244,8 +284,14 @@ Barcode get_barcode(string & seq,
barcode.unambiguous=true;
barcode.editd=editDistance;
barcode.barcode=*known_barcodes_itr;
barcode.umi=seq.substr(read_to_subpatterns[0]-OFFSET+endDistance,search_pattern.umi_seq.length());//assumes no error in UMI seq.
if(editDistance==0){ //if perfect match is found we're done.
if (umi_index == -1) {
barcode.umi = "";
} else {
barcode.umi =
seq.substr(read_to_subpatterns[0] - OFFSET + endDistance,
search_pattern[umi_index].second.length()); // assumes no error in UMI seq.
}
if(editDistance==0){ //if perfect match is found we're done.
return(barcode);
}
}
Expand All @@ -254,12 +300,11 @@ Barcode get_barcode(string & seq,
}

//search a read for one or more barcodes (parent function that calls get_barcode)
vector<Barcode> big_barcode_search(string & sequence, unordered_set<string> & known_barcodes,
int max_flank_editd, int max_editd){ //, SearchSeq & ss){
vector<Barcode> big_barcode_search(string & sequence, unordered_set<string> & known_barcodes, int max_flank_editd, int max_editd, const std::vector<std::pair<std::string, std::string>> &search_pattern) {
vector<Barcode> return_vec; //vector of all the barcodes found

//search for barcode
Barcode result=get_barcode(sequence,&known_barcodes,max_flank_editd,max_editd); //,ss);
Barcode result=get_barcode(sequence,&known_barcodes,max_flank_editd,max_editd, search_pattern); //,ss);
if(result.editd<=max_editd && result.unambiguous) //add to return vector if edit distance small enough
return_vec.push_back(result);

Expand All @@ -271,7 +316,7 @@ vector<Barcode> big_barcode_search(string & sequence, unordered_set<string> & kn
masked_sequence.replace(return_vec.at(i).flank_start,flank_length,string(flank_length,'X'));
} //recursively call this function until no more barcodes are found
vector<Barcode> masked_res;
masked_res=big_barcode_search(masked_sequence,known_barcodes,max_flank_editd,max_editd); //,ss);
masked_res=big_barcode_search(masked_sequence,known_barcodes,max_flank_editd,max_editd, search_pattern); //,ss);
return_vec.insert(return_vec.end(),masked_res.begin(),masked_res.end()); //add to result
}

Expand Down Expand Up @@ -371,22 +416,25 @@ void print_read(string read_id,string read, string qual,

// separated out from main so that this can be run with threads
void search_read(vector<SearchResult> & reads, unordered_set<string> & known_barcodes,
int flank_edit_distance, int edit_distance){
int flank_edit_distance, int edit_distance,
const std::vector<std::pair<std::string, std::string>> &search_pattern) {

for(int r=0; r<reads.size(); r++){

//forward search
reads[r].vec_bc_for=big_barcode_search(reads[r].line,
known_barcodes,
flank_edit_distance,
edit_distance);
edit_distance,
search_pattern);
reads[r].rev_line=reads[r].line;
reverse_compliment(reads[r].rev_line);
//Check the reverse compliment of the read
reads[r].vec_bc_rev=big_barcode_search(reads[r].rev_line,
known_barcodes,
flank_edit_distance,
edit_distance);
edit_distance,
search_pattern);
}
}

Expand All @@ -410,11 +458,8 @@ int main(int argc, char **argv){
bool split_file_by_barcode=false; //(s)
bool remove_barcodes=true; //(r)

search_pattern.primer = "CTACACGACGCTCTTCCGATCT"; //(p)
search_pattern.polyA = string(9,'T'); //(T)
search_pattern.umi_seq = string(12,'?'); //(length u)
search_pattern.temp_barcode = string(16,'?'); //(length b)

std::vector<std::pair<std::string, std::string>> search_pattern;

//Set of known barcodes
unordered_set<string> known_barcodes;
unordered_set<string> found_barcodes;
Expand Down Expand Up @@ -454,10 +499,6 @@ int main(int argc, char **argv){
print_usage();
exit(1); //case barcode file is empty
}
//set barcode length automatically from known barcodes..
int bl=(known_barcodes.begin())->length();
search_pattern.temp_barcode=string(bl,'?');
cerr << "Setting barcode length automatically to " << bl << "\n";
params+=2;
break;
}
Expand All @@ -480,28 +521,26 @@ int main(int argc, char **argv){
break;
}
case 'l':{
search_pattern.primer=optarg;
cerr << "Setting primer to search for: " << search_pattern.primer << "\n";
search_pattern.push_back(std::make_pair("Primer", optarg));
cerr << "Setting primer to search for: " << optarg << "\n";
params+=2;
break;
}
case 'r':{
search_pattern.polyA=optarg;
cerr << "Setting polyT to search for: " << search_pattern.polyA << "\n";
search_pattern.push_back(std::make_pair("PolyT", optarg));
cerr << "Setting polyT to search for: " << optarg << "\n";
params+=2;
break;
}
case 'u':{
int ul=atoi(optarg);
search_pattern.umi_seq=string(ul,'?');
cerr << "Setting UMI length to " << ul << "\n";
search_pattern.push_back(std::make_pair("UMI", optarg));
cerr << "Setting UMI to search for: " << optarg << "\n";
params+=2;
break;
}
case 'b':{
int bl=atoi(optarg);
search_pattern.temp_barcode=string(bl,'?');
cerr << "Setting barcode length to " << bl << "\n";
search_pattern.push_back(std::make_pair("BC", optarg));
cerr << "Setting barcode to search for: " << optarg << "\n";
params+=2;
break;
}
Expand Down Expand Up @@ -540,6 +579,19 @@ int main(int argc, char **argv){
}
}

if (search_pattern.empty()) {
search_pattern = {
{"primer", "CTACACGACGCTCTTCCGATCT"},
{"BC", std::string(16, 'N')},
{"UMI", std::string(12, 'N')},
{"polyA", std::string(9, 'T')}
};
} else {
for (auto i : search_pattern) {
std::cerr << i.first << ": " << i.second << "\n";
}
}

cerr << "For usage information type: flexiplex -h" << endl;

istream * in;
Expand Down Expand Up @@ -635,13 +687,13 @@ int main(int argc, char **argv){
cout << "Read="<<t<< "\n";
cout << "2. Added read for thread" << (t / buffer) << " at " << asctime(now) << "\n"; **/
sr_v[t].resize(b+1);
threads[t]=std::thread(search_read,ref(sr_v[t]),ref(known_barcodes),flank_edit_distance,edit_distance);
threads[t]=std::thread(search_read,ref(sr_v[t]),ref(known_barcodes),flank_edit_distance,edit_distance, ref(search_pattern));
for(int t2=t+1; t2 < n_threads ; t2++) sr_v[t2].resize(0);
goto print_result; //advance the line
}
}
// send reads to the thread
threads[t]=std::thread(search_read,ref(sr_v[t]),ref(known_barcodes),flank_edit_distance,edit_distance);
threads[t]=std::thread(search_read,ref(sr_v[t]),ref(known_barcodes),flank_edit_distance,edit_distance, ref(search_pattern));
}
/** t_ = std::time(0); // get time now
now = std::localtime(&t_);
Expand Down

0 comments on commit 795987c

Please sign in to comment.