Merge pull request #617 from OceanGenomics/ont

Ont! Implementation of `--ont` flag, including disabling of the length correction and inclusion of the new ONT, long-read error model. This PR also includes checks for the relevant requirements for long-read alignments (i.e. issues regarding only recording certain information in the primary alignment, which doesn't seem to be an issue with short read data).
COMBINE-lab · May 12, 2021 · 787fdb3 · 787fdb3
2 parents 728403c + 98d4ba3
commit 787fdb3
Show file tree

Hide file tree

Showing 25 changed files with 1,699 additions and 1,052 deletions.
diff --git a/include/AlignmentCommon.hpp b/include/AlignmentCommon.hpp
@@ -0,0 +1,101 @@
+#ifndef __ALIGNMENTCOMMON_H__
+#define __ALIGNMENTCOMMON_H__
+
+#include <atomic>
+#include <memory>
+#include <mutex>
+
+
+// logger includes
+#include "spdlog/spdlog.h"
+
+extern "C" {
+#include "io_lib/os.h"
+#include "io_lib/scram.h"
+#undef max
+#undef min
+}
+
+struct UnpairedRead;
+struct ReadPair;
+class Transcript;
+
+
+// Common functionalities to different alignment models
+class AlignmentCommon {
+public:
+  AlignmentCommon()
+    : burnedIn_(false)
+  { }
+
+  bool burnedIn() { return burnedIn_; }
+  void burnedIn(bool burnedIn) { burnedIn_ = burnedIn; }
+
+  void setLogger(std::shared_ptr<spdlog::logger> logger) { logger_ = logger; }
+  bool hasLogger() { return (logger_) ? true : false; }
+
+
+  static bool hasIndel(ReadPair& hit);
+  static bool hasIndel(UnpairedRead& hit);
+
+protected:
+  enum AlignmentModelChar {
+    ALN_A = 0,
+    ALN_C = 1,
+    ALN_G = 2,
+    ALN_T = 3,
+    ALN_DASH = 4,
+    ALN_SOFT_CLIP = 5,
+    ALN_HARD_CLIP = 6,
+    ALN_PAD = 7,
+    ALN_REF_SKIP = 8
+  };
+
+  static bool hasIndel(bam_seq_t* read);
+  static void setBasesFromCIGAROp_(enum cigar_op op, size_t& curRefBase, size_t& curReadBase);
+  static char opToChr(enum cigar_op op);
+
+  template<typename T>
+  static int32_t alnLen(const T& aln, const T& primary) {
+    const auto l = aln.readLen();
+    return l != 0 ? l : primary.readLen();
+  }
+
+  struct ErrorCount {
+  protected:
+    int32_t insertions_, deletions_, mismatches_, matches_;
+    int32_t sclips_, hclips_; // soft and hard clips
+    int32_t fclips_, bclips_; // clips at front and at back
+    friend AlignmentCommon;
+
+  public:
+    inline int32_t clips() const { return sclips_ + hclips_; }
+    // Indels + mismatches
+    inline int32_t ims() const { return insertions_ + deletions_ + mismatches_; }
+    // Should be equal to the length of the query sequence
+    inline int32_t length() const { return insertions_ + mismatches_ + matches_ + sclips_; }
+    void clear() {
+      insertions_ = deletions_ = mismatches_ = matches_ = sclips_ = hclips_ = fclips_ = bclips_ = 0;
+    }
+    inline int32_t insertions() const { return insertions_; }
+    inline int32_t deletions() const { return deletions_; }
+    inline int32_t mismatches() const { return mismatches_; }
+    inline int32_t matches() const { return matches_; }
+    inline int32_t sclips() const { return sclips_; }
+    inline int32_t hclips() const { return hclips_; }
+    inline int32_t fclips() const { return fclips_; }
+    inline int32_t bclips() const { return bclips_; }
+
+    bool computeErrorCount(bam_seq_t* read, bam_seq_t* primary, Transcript& ref,
+                           const char* src);
+  };
+  bool computeErrorCount(bam_seq_t* read, bam_seq_t* primary, Transcript& ref,
+                         ErrorCount& counts, const char* src);
+
+  std::shared_ptr<spdlog::logger> logger_;
+  std::atomic<bool> burnedIn_;
+
+  std::mutex throwMutex_;
+};
+
+#endif /* __ALIGNMENTCOMMON_H__ */
diff --git a/include/AlignmentGroup.hpp b/include/AlignmentGroup.hpp
@@ -14,6 +14,7 @@ extern "C" {
 
 #include "ReadPair.hpp"
 #include "SalmonMath.hpp"
+#include "UtilityFunctions.hpp"
 #include <vector>
 
 struct EmptyCellInfo {};
@@ -50,8 +51,8 @@ template <typename FragT, typename BCType = EmptyCellInfo, typename UMIType = Em
   }
 
   inline bool& isUniquelyMapped() { return isUniquelyMapped_; }
-  inline size_t numAlignments() { return alignments_.size(); }
-  inline size_t size() { return numAlignments(); }
+  inline size_t numAlignments() const { return alignments_.size(); }
+  inline size_t size() const { return numAlignments(); }
 
   template <typename Archive> void serialize(Archive& archive) {
     archive(alignments_);

diff --git a/include/AlignmentLibrary.hpp b/include/AlignmentLibrary.hpp
@@ -11,6 +11,7 @@ extern "C" {
 // Our includes
 #include "AlignmentGroup.hpp"
 #include "AlignmentModel.hpp"
+#include "ONTAlignmentModel.hpp"
 #include "BAMQueue.hpp"
 #include "BAMUtils.hpp"
 #include "ClusterForest.hpp"
@@ -50,7 +51,7 @@ template <typename T> class NullFragmentFilter;
  *  It is used to group them together and track information about them
  *  during the quantification procedure.
  */
-template <typename FragT, typename EQBuilderT> class AlignmentLibrary {
+template <typename FragT, typename EQBuilderT, typename AlignModelT> class AlignmentLibrary {
 
 public:
   AlignmentLibrary(std::vector<boost::filesystem::path>& alnFiles,
@@ -208,7 +209,7 @@ for (auto& txp : transcripts_) {
                                                  fragLenStd, fragLenKernelN,
                                                  fragLenKernelP, 1));
 
-    alnMod_.reset(new AlignmentModel(1.0, salmonOpts.numErrorBins));
+    alnMod_.reset(new AlignModelT(1.0, salmonOpts.numErrorBins));
     alnMod_->setLogger(salmonOpts.jointLog);
 
     if (libFmt.type == ReadType::SINGLE_END) {
@@ -290,7 +291,7 @@ for (auto& txp : transcripts_) {
                                                  fragLenStd, fragLenKernelN,
                                                  fragLenKernelP, 1));
 
-    alnMod_.reset(new AlignmentModel(1.0, salmonOpts.numErrorBins));
+    alnMod_.reset(new AlignModelT(1.0, salmonOpts.numErrorBins));
     alnMod_->setLogger(salmonOpts.jointLog);
     salmon::utils::markAuxiliaryTargets(salmonOpts, transcripts_);
   }
@@ -386,7 +387,7 @@ for (auto& txp : transcripts_) {
     return flDist_.get();
   }
 
-  inline AlignmentModel& alignmentModel() { return *alnMod_.get(); }
+  inline AlignModelT& alignmentModel() { return *alnMod_.get(); }
 
   // SequenceBiasModel& sequenceBiasModel() { return seqBiasModel_; }
 
@@ -642,7 +643,7 @@ for (auto& txp : transcripts_) {
   /**
    * The emperical error model
    */
-  std::unique_ptr<AlignmentModel> alnMod_;
+  std::unique_ptr<AlignModelT> alnMod_;
 
   /** Keeps track of the number of passes that have been
    *  made through the alignment file.

diff --git a/include/AlignmentModel.hpp b/include/AlignmentModel.hpp
@@ -1,80 +1,52 @@
 #ifndef ALIGNMENT_MODEL
 #define ALIGNMENT_MODEL
 
-#include <atomic>
-#include <memory>
 #include <mutex>
 
-// logger includes
-#include "spdlog/spdlog.h"
-
+#include "AlignmentCommon.hpp"
 #include "AtomicMatrix.hpp"
 #include "tbb/concurrent_vector.h"
 
-extern "C" {
-#include "io_lib/os.h"
-#include "io_lib/scram.h"
-#undef max
-#undef min
-}
-
-struct UnpairedRead;
-struct ReadPair;
-class Transcript;
-
-class AlignmentModel {
+class AlignmentModel
+  : public AlignmentCommon
+{
 public:
   AlignmentModel(double alpha, uint32_t readBins = 4);
-  bool burnedIn();
-  void burnedIn(bool burnedIn);
 
   /**
-   *  For unpaired reads, update the error model to account
-   *  for errors we've observed in this read pair.
+   *  For unpaired reads, update the error model to account for errors
+   *  we've observed in this read pair. primaryAln contains the first
+   *  alignment in the alignment group.
    */
-  void update(const UnpairedRead&, Transcript& ref, double p, double mass);
+  void update(const UnpairedRead& aln, const UnpairedRead& primaryAln,
+              Transcript& ref, double p, double mass);
 
   /**
-   * Compute the log-likelihood of the observed unpaired alignment given the
-   * current error model.
+   * Compute the log-likelihood of the observed unpaired alignment
+   * given the current error model. primaryAln contains the first
+   * alignment in the alignment group.
    */
-  double logLikelihood(const UnpairedRead&, Transcript& ref);
+  double logLikelihood(const UnpairedRead&, const UnpairedRead& primaryAln, Transcript& ref);
+
+  // The primaryAln is ignored by the ReadPair overlaods (at least for now)
 
   /**
-   *  For paired-end reads, update the error model to account
-   *  for errors we've observed in this read pair.
+   *  For paired-end reads, update the error model to account for
+   *  errors we've observed in this read pair.
    */
-  void update(const ReadPair&, Transcript& ref, double p, double mass);
+  void update(const ReadPair& aln, const ReadPair& primaryAln,
+              Transcript& ref, double p, double mass);
 
   /**
    * Compute the log-likelihood of the observed paire-end alignment given the
    * current error model.
    */
-  double logLikelihood(const ReadPair&, Transcript& ref);
-
-  bool hasIndel(UnpairedRead& r);
-  bool hasIndel(ReadPair& r);
-
-  void setLogger(std::shared_ptr<spdlog::logger> logger);
-  bool hasLogger();
+  double logLikelihood(const ReadPair& aln, const ReadPair& primaryAln, Transcript& ref);
 
   void normalize();
 
 private:
-  enum AlignmentModelChar {
-    ALN_A = 0,
-    ALN_C = 1,
-    ALN_G = 2,
-    ALN_T = 3,
-    ALN_DASH = 4,
-    ALN_SOFT_CLIP = 5,
-    ALN_HARD_CLIP = 6,
-    ALN_PAD = 7,
-    ALN_REF_SKIP = 8
-  };
 
-  void setBasesFromCIGAROp_(enum cigar_op op, size_t& curRefBase,
-                            size_t& curReadBase);
   // std::stringstream& readStr, std::stringstream& matchStr,
   // std::stringstream& refstr);
 
@@ -93,11 +65,10 @@ class AlignmentModel {
    * These functions, which work directly on bam_seq_t* types, drive the
    * update() and logLikelihood() methods above.
    */
-  void update(bam_seq_t* read, Transcript& ref, double p, double mass,
+  void update(bam_seq_t* read, bam_seq_t* primary, Transcript& ref, double p, double mass,
               std::vector<AtomicMatrix<double>>& mismatchProfile);
-  AlnModelProb logLikelihood(bam_seq_t* read, Transcript& ref,
+  AlnModelProb logLikelihood(bam_seq_t* read, bam_seq_t* primary, Transcript& ref,
                        std::vector<AtomicMatrix<double>>& mismatchProfile);
-  bool hasIndel(bam_seq_t* r);
 
   // NOTE: Do these need to be concurrent_vectors as before?
   // Store the mismatch probability tables for the left and right reads
@@ -107,8 +78,6 @@ class AlignmentModel {
   bool isEnabled_;
   // size_t maxLen_;
   size_t readBins_;
-  std::atomic<bool> burnedIn_;
-  std::shared_ptr<spdlog::logger> logger_;
   // Maintain a mutex in case the error model wants to talk to the
   // console / log.
   std::mutex outputMutex_;

diff --git a/include/ONTAlignmentModel.hpp b/include/ONTAlignmentModel.hpp
@@ -0,0 +1,74 @@
+#ifndef ONT_ALIGNMENT_MODEL
+#define ONT_ALIGNMENT_MODEL
+
+#include <atomic>
+#include <memory>
+#include <mutex>
+
+// logger includes
+#include "spdlog/spdlog.h"
+
+#include "AlignmentCommon.hpp"
+
+// #include "AtomicMatrix.hpp"
+// #include "tbb/concurrent_vector.h"
+
+
+class ONTAlignmentModel
+  : public AlignmentCommon
+{
+public:
+  static const uint32_t maxReadLen = 50000; // XXX: That should be a paramater. Read longer than that are binned together
+  static const uint32_t binLen = 100; // XXX: That should be a parameter
+
+  ONTAlignmentModel(double alpha, uint32_t readBins = 4);
+  ~ONTAlignmentModel() { }
+
+  /**
+   *  For unpaired reads, update the error model to account for errors
+   *  we've observed in this read pair. primaryAln contains the first
+   *  alignment in the alignment group.
+   */
+  void update(const UnpairedRead& aln, const UnpairedRead& primaryAln,
+              Transcript& ref, double p, double mass);
+
+  /**
+   * Compute the log-likelihood of the observed unpaired alignment
+   * given the current error model. primaryAln contains the first
+   * alignment in the alignment group.
+   */
+  double logLikelihood(const UnpairedRead& aln, const UnpairedRead& primaryAln, Transcript& ref);
+
+  void normalize();
+
+  void printModel(std::ostream&);
+
+private:
+  // void ONTAlignmentModel::update(bam_seq_t* read, Transcript& ref, double p, double mass,
+  //                                std::vector<AtomicMatrix<double>>& transitionProbs);
+  bool isEnabled_;
+  // size_t maxLen_;
+  size_t readBins_;
+
+  // Maintain a mutex in case the error model wants to talk to the
+  // console / log.
+  bool printed;
+  std::mutex outputMutex_;
+
+  struct average {
+    std::atomic<double> mass;
+    std::atomic<double> sum;
+    average() : mass(0.0), sum(0.0) { }
+  };
+  // Error model. Probability parameter p of the binomial distribution
+  // B(p,n) for each read in a bin (based on length n).
+  std::vector<average> errorModel_;
+
+  // Clip length model. Geometric distribution with parameter
+  // p. Binned for read size.
+  // Separate models are considered for front and back clips
+  std::vector<average> frontClipModel_;
+  std::vector<average> backClipModel_;
+};
+
+#endif // ERROR_MODEL
diff --git a/include/ReadPair.hpp b/include/ReadPair.hpp
@@ -135,7 +135,7 @@ struct ReadPair {
 
   inline char* getName() const { return bam_name(read1); }
 
-  inline uint32_t getNameLength() {
+  inline uint32_t getNameLength() const {
     uint32_t l = bam_name_len(read1);
     char* r = getName();
     if (l > 2 and r[l - 2] == '/') {
@@ -207,6 +207,8 @@ struct ReadPair {
       return bam_pos(read1) + bam_seq_len(read1);
     }
   }
+  inline bool isPrimary() const { return !(bam_flag(read1) & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY)); }
+  inline bool isSecondary() const { return (bam_flag(read1) & BAM_FSECONDARY) || (bam_flag(read2) & BAM_FSECONDARY); }
 
   inline ReadType fragType() const { return ReadType::PAIRED_END; }
   inline int32_t transcriptID() const { return bam_ref(read1); }