Skip to content

Commit

Permalink
Merge pull request #617 from OceanGenomics/ont
Browse files Browse the repository at this point in the history
Ont!

Implementation of `--ont` flag, including disabling of the length correction and 
inclusion of the new ONT, long-read error model.  This PR also includes checks 
for the relevant requirements for long-read alignments (i.e. issues regarding only 
recording certain information in the primary alignment, which doesn't seem to 
be an issue with short read data).
  • Loading branch information
rob-p committed May 12, 2021
2 parents 728403c + 98d4ba3 commit 787fdb3
Show file tree
Hide file tree
Showing 25 changed files with 1,699 additions and 1,052 deletions.
101 changes: 101 additions & 0 deletions include/AlignmentCommon.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#ifndef __ALIGNMENTCOMMON_H__
#define __ALIGNMENTCOMMON_H__

#include <atomic>
#include <memory>
#include <mutex>


// logger includes
#include "spdlog/spdlog.h"

extern "C" {
#include "io_lib/os.h"
#include "io_lib/scram.h"
#undef max
#undef min
}

struct UnpairedRead;
struct ReadPair;
class Transcript;


// Common functionalities to different alignment models
class AlignmentCommon {
public:
AlignmentCommon()
: burnedIn_(false)
{ }

bool burnedIn() { return burnedIn_; }
void burnedIn(bool burnedIn) { burnedIn_ = burnedIn; }

void setLogger(std::shared_ptr<spdlog::logger> logger) { logger_ = logger; }
bool hasLogger() { return (logger_) ? true : false; }


static bool hasIndel(ReadPair& hit);
static bool hasIndel(UnpairedRead& hit);

protected:
enum AlignmentModelChar {
ALN_A = 0,
ALN_C = 1,
ALN_G = 2,
ALN_T = 3,
ALN_DASH = 4,
ALN_SOFT_CLIP = 5,
ALN_HARD_CLIP = 6,
ALN_PAD = 7,
ALN_REF_SKIP = 8
};

static bool hasIndel(bam_seq_t* read);
static void setBasesFromCIGAROp_(enum cigar_op op, size_t& curRefBase, size_t& curReadBase);
static char opToChr(enum cigar_op op);

template<typename T>
static int32_t alnLen(const T& aln, const T& primary) {
const auto l = aln.readLen();
return l != 0 ? l : primary.readLen();
}

struct ErrorCount {
protected:
int32_t insertions_, deletions_, mismatches_, matches_;
int32_t sclips_, hclips_; // soft and hard clips
int32_t fclips_, bclips_; // clips at front and at back
friend AlignmentCommon;

public:
inline int32_t clips() const { return sclips_ + hclips_; }
// Indels + mismatches
inline int32_t ims() const { return insertions_ + deletions_ + mismatches_; }
// Should be equal to the length of the query sequence
inline int32_t length() const { return insertions_ + mismatches_ + matches_ + sclips_; }
void clear() {
insertions_ = deletions_ = mismatches_ = matches_ = sclips_ = hclips_ = fclips_ = bclips_ = 0;
}
inline int32_t insertions() const { return insertions_; }
inline int32_t deletions() const { return deletions_; }
inline int32_t mismatches() const { return mismatches_; }
inline int32_t matches() const { return matches_; }
inline int32_t sclips() const { return sclips_; }
inline int32_t hclips() const { return hclips_; }
inline int32_t fclips() const { return fclips_; }
inline int32_t bclips() const { return bclips_; }

bool computeErrorCount(bam_seq_t* read, bam_seq_t* primary, Transcript& ref,
const char* src);
};
bool computeErrorCount(bam_seq_t* read, bam_seq_t* primary, Transcript& ref,
ErrorCount& counts, const char* src);

std::shared_ptr<spdlog::logger> logger_;
std::atomic<bool> burnedIn_;

std::mutex throwMutex_;
};

#endif /* __ALIGNMENTCOMMON_H__ */
5 changes: 3 additions & 2 deletions include/AlignmentGroup.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ extern "C" {

#include "ReadPair.hpp"
#include "SalmonMath.hpp"
#include "UtilityFunctions.hpp"
#include <vector>

struct EmptyCellInfo {};
Expand Down Expand Up @@ -50,8 +51,8 @@ template <typename FragT, typename BCType = EmptyCellInfo, typename UMIType = Em
}

inline bool& isUniquelyMapped() { return isUniquelyMapped_; }
inline size_t numAlignments() { return alignments_.size(); }
inline size_t size() { return numAlignments(); }
inline size_t numAlignments() const { return alignments_.size(); }
inline size_t size() const { return numAlignments(); }

template <typename Archive> void serialize(Archive& archive) {
archive(alignments_);
Expand Down
11 changes: 6 additions & 5 deletions include/AlignmentLibrary.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ extern "C" {
// Our includes
#include "AlignmentGroup.hpp"
#include "AlignmentModel.hpp"
#include "ONTAlignmentModel.hpp"
#include "BAMQueue.hpp"
#include "BAMUtils.hpp"
#include "ClusterForest.hpp"
Expand Down Expand Up @@ -50,7 +51,7 @@ template <typename T> class NullFragmentFilter;
* It is used to group them together and track information about them
* during the quantification procedure.
*/
template <typename FragT, typename EQBuilderT> class AlignmentLibrary {
template <typename FragT, typename EQBuilderT, typename AlignModelT> class AlignmentLibrary {

public:
AlignmentLibrary(std::vector<boost::filesystem::path>& alnFiles,
Expand Down Expand Up @@ -208,7 +209,7 @@ for (auto& txp : transcripts_) {
fragLenStd, fragLenKernelN,
fragLenKernelP, 1));

alnMod_.reset(new AlignmentModel(1.0, salmonOpts.numErrorBins));
alnMod_.reset(new AlignModelT(1.0, salmonOpts.numErrorBins));
alnMod_->setLogger(salmonOpts.jointLog);

if (libFmt.type == ReadType::SINGLE_END) {
Expand Down Expand Up @@ -290,7 +291,7 @@ for (auto& txp : transcripts_) {
fragLenStd, fragLenKernelN,
fragLenKernelP, 1));

alnMod_.reset(new AlignmentModel(1.0, salmonOpts.numErrorBins));
alnMod_.reset(new AlignModelT(1.0, salmonOpts.numErrorBins));
alnMod_->setLogger(salmonOpts.jointLog);
salmon::utils::markAuxiliaryTargets(salmonOpts, transcripts_);
}
Expand Down Expand Up @@ -386,7 +387,7 @@ for (auto& txp : transcripts_) {
return flDist_.get();
}

inline AlignmentModel& alignmentModel() { return *alnMod_.get(); }
inline AlignModelT& alignmentModel() { return *alnMod_.get(); }

// SequenceBiasModel& sequenceBiasModel() { return seqBiasModel_; }

Expand Down Expand Up @@ -642,7 +643,7 @@ for (auto& txp : transcripts_) {
/**
* The emperical error model
*/
std::unique_ptr<AlignmentModel> alnMod_;
std::unique_ptr<AlignModelT> alnMod_;

/** Keeps track of the number of passes that have been
* made through the alignment file.
Expand Down
75 changes: 22 additions & 53 deletions include/AlignmentModel.hpp
Original file line number Diff line number Diff line change
@@ -1,80 +1,52 @@
#ifndef ALIGNMENT_MODEL
#define ALIGNMENT_MODEL

#include <atomic>
#include <memory>
#include <mutex>

// logger includes
#include "spdlog/spdlog.h"

#include "AlignmentCommon.hpp"
#include "AtomicMatrix.hpp"
#include "tbb/concurrent_vector.h"

extern "C" {
#include "io_lib/os.h"
#include "io_lib/scram.h"
#undef max
#undef min
}

struct UnpairedRead;
struct ReadPair;
class Transcript;

class AlignmentModel {
class AlignmentModel
: public AlignmentCommon
{
public:
AlignmentModel(double alpha, uint32_t readBins = 4);
bool burnedIn();
void burnedIn(bool burnedIn);

/**
* For unpaired reads, update the error model to account
* for errors we've observed in this read pair.
* For unpaired reads, update the error model to account for errors
* we've observed in this read pair. primaryAln contains the first
* alignment in the alignment group.
*/
void update(const UnpairedRead&, Transcript& ref, double p, double mass);
void update(const UnpairedRead& aln, const UnpairedRead& primaryAln,
Transcript& ref, double p, double mass);

/**
* Compute the log-likelihood of the observed unpaired alignment given the
* current error model.
* Compute the log-likelihood of the observed unpaired alignment
* given the current error model. primaryAln contains the first
* alignment in the alignment group.
*/
double logLikelihood(const UnpairedRead&, Transcript& ref);
double logLikelihood(const UnpairedRead&, const UnpairedRead& primaryAln, Transcript& ref);

// The primaryAln is ignored by the ReadPair overlaods (at least for now)

/**
* For paired-end reads, update the error model to account
* for errors we've observed in this read pair.
* For paired-end reads, update the error model to account for
* errors we've observed in this read pair.
*/
void update(const ReadPair&, Transcript& ref, double p, double mass);
void update(const ReadPair& aln, const ReadPair& primaryAln,
Transcript& ref, double p, double mass);

/**
* Compute the log-likelihood of the observed paire-end alignment given the
* current error model.
*/
double logLikelihood(const ReadPair&, Transcript& ref);

bool hasIndel(UnpairedRead& r);
bool hasIndel(ReadPair& r);

void setLogger(std::shared_ptr<spdlog::logger> logger);
bool hasLogger();
double logLikelihood(const ReadPair& aln, const ReadPair& primaryAln, Transcript& ref);

void normalize();

private:
enum AlignmentModelChar {
ALN_A = 0,
ALN_C = 1,
ALN_G = 2,
ALN_T = 3,
ALN_DASH = 4,
ALN_SOFT_CLIP = 5,
ALN_HARD_CLIP = 6,
ALN_PAD = 7,
ALN_REF_SKIP = 8
};

void setBasesFromCIGAROp_(enum cigar_op op, size_t& curRefBase,
size_t& curReadBase);
// std::stringstream& readStr, std::stringstream& matchStr,
// std::stringstream& refstr);

Expand All @@ -93,11 +65,10 @@ class AlignmentModel {
* These functions, which work directly on bam_seq_t* types, drive the
* update() and logLikelihood() methods above.
*/
void update(bam_seq_t* read, Transcript& ref, double p, double mass,
void update(bam_seq_t* read, bam_seq_t* primary, Transcript& ref, double p, double mass,
std::vector<AtomicMatrix<double>>& mismatchProfile);
AlnModelProb logLikelihood(bam_seq_t* read, Transcript& ref,
AlnModelProb logLikelihood(bam_seq_t* read, bam_seq_t* primary, Transcript& ref,
std::vector<AtomicMatrix<double>>& mismatchProfile);
bool hasIndel(bam_seq_t* r);

// NOTE: Do these need to be concurrent_vectors as before?
// Store the mismatch probability tables for the left and right reads
Expand All @@ -107,8 +78,6 @@ class AlignmentModel {
bool isEnabled_;
// size_t maxLen_;
size_t readBins_;
std::atomic<bool> burnedIn_;
std::shared_ptr<spdlog::logger> logger_;
// Maintain a mutex in case the error model wants to talk to the
// console / log.
std::mutex outputMutex_;
Expand Down
74 changes: 74 additions & 0 deletions include/ONTAlignmentModel.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#ifndef ONT_ALIGNMENT_MODEL
#define ONT_ALIGNMENT_MODEL

#include <atomic>
#include <memory>
#include <mutex>

// logger includes
#include "spdlog/spdlog.h"

#include "AlignmentCommon.hpp"

// #include "AtomicMatrix.hpp"
// #include "tbb/concurrent_vector.h"


class ONTAlignmentModel
: public AlignmentCommon
{
public:
static const uint32_t maxReadLen = 50000; // XXX: That should be a paramater. Read longer than that are binned together
static const uint32_t binLen = 100; // XXX: That should be a parameter

ONTAlignmentModel(double alpha, uint32_t readBins = 4);
~ONTAlignmentModel() { }

/**
* For unpaired reads, update the error model to account for errors
* we've observed in this read pair. primaryAln contains the first
* alignment in the alignment group.
*/
void update(const UnpairedRead& aln, const UnpairedRead& primaryAln,
Transcript& ref, double p, double mass);

/**
* Compute the log-likelihood of the observed unpaired alignment
* given the current error model. primaryAln contains the first
* alignment in the alignment group.
*/
double logLikelihood(const UnpairedRead& aln, const UnpairedRead& primaryAln, Transcript& ref);

void normalize();

void printModel(std::ostream&);

private:
// void ONTAlignmentModel::update(bam_seq_t* read, Transcript& ref, double p, double mass,
// std::vector<AtomicMatrix<double>>& transitionProbs);
bool isEnabled_;
// size_t maxLen_;
size_t readBins_;

// Maintain a mutex in case the error model wants to talk to the
// console / log.
bool printed;
std::mutex outputMutex_;

struct average {
std::atomic<double> mass;
std::atomic<double> sum;
average() : mass(0.0), sum(0.0) { }
};
// Error model. Probability parameter p of the binomial distribution
// B(p,n) for each read in a bin (based on length n).
std::vector<average> errorModel_;

// Clip length model. Geometric distribution with parameter
// p. Binned for read size.
// Separate models are considered for front and back clips
std::vector<average> frontClipModel_;
std::vector<average> backClipModel_;
};

#endif // ERROR_MODEL
4 changes: 3 additions & 1 deletion include/ReadPair.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ struct ReadPair {

inline char* getName() const { return bam_name(read1); }

inline uint32_t getNameLength() {
inline uint32_t getNameLength() const {
uint32_t l = bam_name_len(read1);
char* r = getName();
if (l > 2 and r[l - 2] == '/') {
Expand Down Expand Up @@ -207,6 +207,8 @@ struct ReadPair {
return bam_pos(read1) + bam_seq_len(read1);
}
}
inline bool isPrimary() const { return !(bam_flag(read1) & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY)); }
inline bool isSecondary() const { return (bam_flag(read1) & BAM_FSECONDARY) || (bam_flag(read2) & BAM_FSECONDARY); }

inline ReadType fragType() const { return ReadType::PAIRED_END; }
inline int32_t transcriptID() const { return bam_ref(read1); }
Expand Down

0 comments on commit 787fdb3

Please sign in to comment.