Skip to content

Commit

Permalink
better separate overhang and general cases; handle complex general an…
Browse files Browse the repository at this point in the history
…d overhang cases happening together
  • Loading branch information
HosseinAsghari committed Sep 18, 2022
1 parent 9c3e422 commit cf54093
Showing 1 changed file with 45 additions and 17 deletions.
62 changes: 45 additions & 17 deletions src/PuffAligner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -281,8 +281,8 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
bool invalidStart = (signedRefStartPos < 0);
bool invalidEnd = (signedRefEndPos > refTotalLength);

uint32_t maxSoftclipLenGeneral = mopts.maxSoftclipFractionGeneral * readLen;
uint32_t maxSoftclipLenOverhang = mopts.maxSoftclipFractionOverhang * readLen;
int32_t maxSoftclipLenGeneral = mopts.maxSoftclipFractionGeneral * readLen;
int32_t maxSoftclipLenOverhang = mopts.maxSoftclipFractionOverhang * readLen;

if (mopts.end2end) {
maxSoftclipLenGeneral = 0;
Expand Down Expand Up @@ -484,11 +484,14 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
logger_->debug("\t\t\tread: [{}]", readWindow);
logger_->debug("\t\t\tref: [{}]", refSeqBuffer_);

bool isOverhang = (readStartPosOnRef < 0);
auto remainedSoftClipLen = isOverhang ? remainedSoftClipLenOverhang : remainedSoftClipLenGeneral;

bandwidth = maxGaps;
logger_->debug("\t\t\tksw2_parameters: bandwidth={}, end_bonus={}, zdrop={}", bandwidth, aligner_config.end_bonus, aligner_config.dropoff);
auto cutoff = minAcceptedScore - mopts.matchScore * read.length();
aligner(readWindow.data(), readWindow.length(), refSeqBuffer_.data(),
refSeqBuffer_.length(), &ez, cutoff, remainedSoftClipLenGeneral,
refSeqBuffer_.length(), &ez, cutoff, remainedSoftClipLen,
ksw2pp::EnumToType<ksw2pp::KSW2AlignmentType::EXTENSION>());
logger_->debug("\t\t\tksw2_results:");
logger_->debug("\t\t\t\tmax={}, max_q={}, max_t={}", ez.max, ez.max_q, ez.max_t);
Expand Down Expand Up @@ -521,8 +524,10 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
// https://github.com/COMBINE-lab/salmon/issues/475).

decltype(alignmentScore) part_score = ez.max;

numSoftClipped = readWindow.length() - (ez.max_q + 1);
if(remainedSoftClipLenGeneral < numSoftClipped || ez.mqe + aligner_config.end_bonus > ez.max)

if(remainedSoftClipLen < numSoftClipped || ez.mqe + aligner_config.end_bonus > ez.max)
{
part_score = ez.mqe;
openGapLen = ez.mqe_t + 1;
Expand All @@ -534,9 +539,6 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
openGapLen = ez.max_t + 1;
cigarGen.add_item(numSoftClipped, 'S');
}
remainedSoftClipLenGeneral -= numSoftClipped;
remainedSoftClipLenOverhang -= numSoftClipped;
logger_->debug("\t\t\t\tremainedSoftClipLen={}->{};", remainedSoftClipLenGeneral + numSoftClipped, remainedSoftClipLenGeneral);

alignmentScore += part_score;
addCigar(cigarGen, ez, true);
Expand Down Expand Up @@ -567,7 +569,16 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
cigarGen.add_item(firstMemStart_read, cigar_char);
}
}

// both of the thresholds are updated in case the 5'-end was overhang and later the 3'-end is going to be
// generally soft-clipped. this would be a complex scenario and we check the general threshold for both sides
// in this case to avoid having soft-clipped lengths that are no allowed
remainedSoftClipLenGeneral -= numSoftClipped;
remainedSoftClipLenOverhang -= numSoftClipped;
logger_->debug("\t\t\t#soft-clipped={}", numSoftClipped);

arOut.softclip_start = numSoftClipped;

logger_->debug("\t\t\tscore_sofar: {}", alignmentScore);
logger_->debug("\t\t\tcigar_sofar: {}", cigarGen.get_cigar());
}
Expand Down Expand Up @@ -729,11 +740,21 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
logger_->debug("\t\t\tCASE 1: some reference bases left to align");
logger_->debug("\t\t\tread: [{}]", readWindow);
logger_->debug("\t\t\tref: [{}]", refSeqBuffer_);

bool isOverhang = (gapRead > refLen);

// if the 5'-end was soft-clip generally (not in overhanging), we will use the general threshold regardless
// that's because in complex General-Overhang or Overhang-General scenarios, we do not want to allow soft-cliping
// above the specified limit so all the soft-clips in these scenarios are considered general
// the code will automatically consider aligning to the end of 3'-end here as well if the limit is reached
auto remainedSoftClipLen = (isOverhang && (remainedSoftClipLenGeneral == maxSoftclipLenGeneral))
? remainedSoftClipLenOverhang
: remainedSoftClipLenGeneral;

logger_->debug("\t\t\tksw2_parameters: bandwidth={}, end_bonus={}, zdrop={}", bandwidth, aligner_config.end_bonus, aligner_config.dropoff);
auto cutoff = minAcceptedScore - alignmentScore - mopts.matchScore * readWindow.length();
aligner(readWindow.data(), readWindow.length(), refSeqBuffer_.data(),
refLen, &ez, cutoff, remainedSoftClipLenGeneral,
refLen, &ez, cutoff, remainedSoftClipLen,
ksw2pp::EnumToType<ksw2pp::KSW2AlignmentType::EXTENSION>());
logger_->debug("\t\t\tksw2_results:");
logger_->debug("\t\t\t\tmax={}, max_q={}, max_t={}", ez.max, ez.max_q, ez.max_t);
Expand All @@ -750,11 +771,11 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
if (ez.mqe != KSW_NEG_INF) ez.stopped = 0;

decltype(alignmentScore) part_score = ez.max;

numSoftClipped = readWindow.length() - (ez.max_q + 1);
addCigar(cigarGen, ez, false);

if(remainedSoftClipLenGeneral < numSoftClipped || ez.mqe + aligner_config.end_bonus > ez.max)
numSoftClipped = readWindow.length() - (ez.max_q + 1);

if(remainedSoftClipLen < numSoftClipped || ez.mqe + aligner_config.end_bonus > ez.max)
{
part_score = ez.mqe;
numSoftClipped = 0;
Expand All @@ -764,10 +785,7 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
part_score = ez.max;
cigarGen.add_item(numSoftClipped, 'S');
}
remainedSoftClipLenGeneral -= numSoftClipped;
remainedSoftClipLenOverhang -= numSoftClipped;
logger_->debug("\t\t\t\tremainedSoftClipLen={}->{};", remainedSoftClipLenGeneral + numSoftClipped, remainedSoftClipLenGeneral);


alignmentScore += part_score;

// NOTE: pre soft-clip code for adjusting the alignment score.
Expand All @@ -783,14 +801,20 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
mopts.allowSoftclip
? readWindow.length()
: 0;

// again, general threshold will be used if 5'-end was softclip in general case and not overhang
auto remainedSoftClipLen = (remainedSoftClipLenGeneral == maxSoftclipLenGeneral)
? remainedSoftClipLenOverhang
: remainedSoftClipLenGeneral;

alignmentScore +=
mopts.allowSoftclip && remainedSoftClipLenOverhang >= numSoftClipped
mopts.allowSoftclip && remainedSoftClipLen >= numSoftClipped
? 0
: (-1 * mopts.gapOpenPenalty +
-1 * mopts.gapExtendPenalty * readWindow.length());

char cigar_char;
if (mopts.allowSoftclip && remainedSoftClipLenOverhang >= numSoftClipped) {
if (mopts.allowSoftclip && remainedSoftClipLen >= numSoftClipped) {
cigar_char = 'S';
} else {
cigar_char = 'I';
Expand All @@ -801,6 +825,10 @@ bool PuffAligner::alignRead(std::string& read, std::string& read_rc, const std::
cigarGen.add_item(readWindow.length(), cigar_char);
}
}
remainedSoftClipLenGeneral -= numSoftClipped;
remainedSoftClipLenOverhang -= numSoftClipped;
logger_->debug("\t\t\t#soft-clipped={}", numSoftClipped);

arOut.softclip_end = numSoftClipped;
logger_->debug("\t\t\tscore_sofar: {}", alignmentScore);
logger_->debug("\t\t\tcigar_sofar: {}", cigarGen.get_cigar());
Expand Down

0 comments on commit cf54093

Please sign in to comment.