-
Notifications
You must be signed in to change notification settings - Fork 160
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #617 from OceanGenomics/ont
Ont! Implementation of `--ont` flag, including disabling of the length correction and inclusion of the new ONT, long-read error model. This PR also includes checks for the relevant requirements for long-read alignments (i.e. issues regarding only recording certain information in the primary alignment, which doesn't seem to be an issue with short read data).
- Loading branch information
Showing
25 changed files
with
1,699 additions
and
1,052 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
#ifndef __ALIGNMENTCOMMON_H__ | ||
#define __ALIGNMENTCOMMON_H__ | ||
|
||
#include <atomic> | ||
#include <memory> | ||
#include <mutex> | ||
|
||
|
||
// logger includes | ||
#include "spdlog/spdlog.h" | ||
|
||
extern "C" { | ||
#include "io_lib/os.h" | ||
#include "io_lib/scram.h" | ||
#undef max | ||
#undef min | ||
} | ||
|
||
struct UnpairedRead; | ||
struct ReadPair; | ||
class Transcript; | ||
|
||
|
||
// Common functionalities to different alignment models | ||
class AlignmentCommon { | ||
public: | ||
AlignmentCommon() | ||
: burnedIn_(false) | ||
{ } | ||
|
||
bool burnedIn() { return burnedIn_; } | ||
void burnedIn(bool burnedIn) { burnedIn_ = burnedIn; } | ||
|
||
void setLogger(std::shared_ptr<spdlog::logger> logger) { logger_ = logger; } | ||
bool hasLogger() { return (logger_) ? true : false; } | ||
|
||
|
||
static bool hasIndel(ReadPair& hit); | ||
static bool hasIndel(UnpairedRead& hit); | ||
|
||
protected: | ||
enum AlignmentModelChar { | ||
ALN_A = 0, | ||
ALN_C = 1, | ||
ALN_G = 2, | ||
ALN_T = 3, | ||
ALN_DASH = 4, | ||
ALN_SOFT_CLIP = 5, | ||
ALN_HARD_CLIP = 6, | ||
ALN_PAD = 7, | ||
ALN_REF_SKIP = 8 | ||
}; | ||
|
||
static bool hasIndel(bam_seq_t* read); | ||
static void setBasesFromCIGAROp_(enum cigar_op op, size_t& curRefBase, size_t& curReadBase); | ||
static char opToChr(enum cigar_op op); | ||
|
||
template<typename T> | ||
static int32_t alnLen(const T& aln, const T& primary) { | ||
const auto l = aln.readLen(); | ||
return l != 0 ? l : primary.readLen(); | ||
} | ||
|
||
struct ErrorCount { | ||
protected: | ||
int32_t insertions_, deletions_, mismatches_, matches_; | ||
int32_t sclips_, hclips_; // soft and hard clips | ||
int32_t fclips_, bclips_; // clips at front and at back | ||
friend AlignmentCommon; | ||
|
||
public: | ||
inline int32_t clips() const { return sclips_ + hclips_; } | ||
// Indels + mismatches | ||
inline int32_t ims() const { return insertions_ + deletions_ + mismatches_; } | ||
// Should be equal to the length of the query sequence | ||
inline int32_t length() const { return insertions_ + mismatches_ + matches_ + sclips_; } | ||
void clear() { | ||
insertions_ = deletions_ = mismatches_ = matches_ = sclips_ = hclips_ = fclips_ = bclips_ = 0; | ||
} | ||
inline int32_t insertions() const { return insertions_; } | ||
inline int32_t deletions() const { return deletions_; } | ||
inline int32_t mismatches() const { return mismatches_; } | ||
inline int32_t matches() const { return matches_; } | ||
inline int32_t sclips() const { return sclips_; } | ||
inline int32_t hclips() const { return hclips_; } | ||
inline int32_t fclips() const { return fclips_; } | ||
inline int32_t bclips() const { return bclips_; } | ||
|
||
bool computeErrorCount(bam_seq_t* read, bam_seq_t* primary, Transcript& ref, | ||
const char* src); | ||
}; | ||
bool computeErrorCount(bam_seq_t* read, bam_seq_t* primary, Transcript& ref, | ||
ErrorCount& counts, const char* src); | ||
|
||
std::shared_ptr<spdlog::logger> logger_; | ||
std::atomic<bool> burnedIn_; | ||
|
||
std::mutex throwMutex_; | ||
}; | ||
|
||
#endif /* __ALIGNMENTCOMMON_H__ */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
#ifndef ONT_ALIGNMENT_MODEL | ||
#define ONT_ALIGNMENT_MODEL | ||
|
||
#include <atomic> | ||
#include <memory> | ||
#include <mutex> | ||
|
||
// logger includes | ||
#include "spdlog/spdlog.h" | ||
|
||
#include "AlignmentCommon.hpp" | ||
|
||
// #include "AtomicMatrix.hpp" | ||
// #include "tbb/concurrent_vector.h" | ||
|
||
|
||
class ONTAlignmentModel | ||
: public AlignmentCommon | ||
{ | ||
public: | ||
static const uint32_t maxReadLen = 50000; // XXX: That should be a paramater. Read longer than that are binned together | ||
static const uint32_t binLen = 100; // XXX: That should be a parameter | ||
|
||
ONTAlignmentModel(double alpha, uint32_t readBins = 4); | ||
~ONTAlignmentModel() { } | ||
|
||
/** | ||
* For unpaired reads, update the error model to account for errors | ||
* we've observed in this read pair. primaryAln contains the first | ||
* alignment in the alignment group. | ||
*/ | ||
void update(const UnpairedRead& aln, const UnpairedRead& primaryAln, | ||
Transcript& ref, double p, double mass); | ||
|
||
/** | ||
* Compute the log-likelihood of the observed unpaired alignment | ||
* given the current error model. primaryAln contains the first | ||
* alignment in the alignment group. | ||
*/ | ||
double logLikelihood(const UnpairedRead& aln, const UnpairedRead& primaryAln, Transcript& ref); | ||
|
||
void normalize(); | ||
|
||
void printModel(std::ostream&); | ||
|
||
private: | ||
// void ONTAlignmentModel::update(bam_seq_t* read, Transcript& ref, double p, double mass, | ||
// std::vector<AtomicMatrix<double>>& transitionProbs); | ||
bool isEnabled_; | ||
// size_t maxLen_; | ||
size_t readBins_; | ||
|
||
// Maintain a mutex in case the error model wants to talk to the | ||
// console / log. | ||
bool printed; | ||
std::mutex outputMutex_; | ||
|
||
struct average { | ||
std::atomic<double> mass; | ||
std::atomic<double> sum; | ||
average() : mass(0.0), sum(0.0) { } | ||
}; | ||
// Error model. Probability parameter p of the binomial distribution | ||
// B(p,n) for each read in a bin (based on length n). | ||
std::vector<average> errorModel_; | ||
|
||
// Clip length model. Geometric distribution with parameter | ||
// p. Binned for read size. | ||
// Separate models are considered for front and back clips | ||
std::vector<average> frontClipModel_; | ||
std::vector<average> backClipModel_; | ||
}; | ||
|
||
#endif // ONT_ALIGNMENT_MODEL |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.