Some consts and code cleanup

COMBINE-lab · Jun 3, 2015 · 76db4a7 · 76db4a7
1 parent f7ab62d
commit 76db4a7
Show file tree

Hide file tree

Showing 7 changed files with 114 additions and 72 deletions.
diff --git a/include/EquivalenceClassBuilder.hpp b/include/EquivalenceClassBuilder.hpp
@@ -53,19 +53,10 @@ class EquivalenceClassBuilder {
 
         bool finish() {
             active_ = false;
-            /*
-            // unordered_map implementation
-            for (auto& kv : countMap_) {
-                kv.second.normalizeAux();
-                countVec_.push_back(kv);
-            }
-            */
-
             for (auto kv = countMap_.begin(); !kv.is_end(); ++kv) {
                 kv->second.normalizeAux();
                 countVec_.push_back(*kv);
             }
-            //countVec_ = countMap_.snapshot_table();
 
     	    logger_->info("Computed {} weighted equivalence classes "
 			  "for further processing", countVec_.size());
@@ -75,37 +66,18 @@ class EquivalenceClassBuilder {
         inline void addGroup(TranscriptGroup&& g,
                              std::vector<double>& weights) {
 
-            /*
-            // unordered_map implementation
-            std::lock_guard<std::mutex> lock(mapMut_);
-            auto it = countMap_.find(g);
-            if (it == countMap_.end()) {
-                TGValue v(weights, 1);
-                countMap_.emplace(g, v);
-            } else {
-                auto& x = it->second;
-                x.count++;
-                for (size_t i = 0; i < x.weights.size(); ++i) {
-                    x.weights[i] =
-                        salmon::math::logAdd(x.weights[i], weights[i]);
-                }
-            }
-            */
-
             auto upfn = [&weights](TGValue& x) -> TGValue& {
-
+                // update the count
                 x.count++;
-
+                // update the weights
                 for (size_t i = 0; i < x.weights.size(); ++i) {
                     x.weights[i] =
                         salmon::math::logAdd(x.weights[i], weights[i]);
                 }
-
                 return x;
             };
             TGValue v(weights, 1);
             countMap_.upsert(g, upfn, v);
-
         }
 
         std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec() {
@@ -115,12 +87,43 @@ class EquivalenceClassBuilder {
     private:
         std::atomic<bool> active_;
 	    cuckoohash_map<TranscriptGroup, TGValue, TranscriptGroupHasher> countMap_;
-        //std::unordered_map<TranscriptGroup, TGValue, TranscriptGroupHasher> countMap_;
-
         std::vector<std::pair<const TranscriptGroup, TGValue>> countVec_;
     	std::shared_ptr<spdlog::logger> logger_;
-        std::mutex mapMut_;
 };
 
 #endif // EQUIVALENCE_CLASS_BUILDER_HPP
 
+/** Unordered map implementation */
+//std::unordered_map<TranscriptGroup, TGValue, TranscriptGroupHasher> countMap_;
+//std::mutex mapMut_;
+/*
+bool finish() {
+    // unordered_map implementation
+    for (auto& kv : countMap_) {
+        kv.second.normalizeAux();
+        countVec_.push_back(kv);
+    }
+    return true;
+}
+*/
+
+/*
+inline void addGroup(TranscriptGroup&& g,
+        std::vector<double>& weights) {
+
+    // unordered_map implementation
+    std::lock_guard<std::mutex> lock(mapMut_);
+    auto it = countMap_.find(g);
+    if (it == countMap_.end()) {
+        TGValue v(weights, 1);
+        countMap_.emplace(g, v);
+    } else {
+        auto& x = it->second;
+        x.count++;
+        for (size_t i = 0; i < x.weights.size(); ++i) {
+            x.weights[i] =
+                salmon::math::logAdd(x.weights[i], weights[i]);
+        }
+    }
+}
+*/
diff --git a/include/ReadPair.hpp b/include/ReadPair.hpp
@@ -87,7 +87,7 @@ struct ReadPair {
         return l;//bam_name_len(read1);
     }
 
-    inline uint32_t fragLen() {
+    inline uint32_t fragLen() const {
         if (!isPaired()) { return 0; }
         auto leftmost1 = bam_pos(read1);
         auto leftmost2 = bam_pos(read2);
@@ -99,18 +99,18 @@ struct ReadPair {
         //return std::abs(read1->core.isize) + std::abs(read1->core.l_qseq) + std::abs(read2->core.l_qseq);
     }
 
-     inline bool isRight() { return isPaired() ? false : (isRightOrphan() ? true : false) ; }
-     inline bool isLeft() { return isPaired() ? false : (isLeftOrphan() ? true : false); }
+     inline bool isRight() const { return isPaired() ? false : (isRightOrphan() ? true : false) ; }
+     inline bool isLeft() const { return isPaired() ? false : (isLeftOrphan() ? true : false); }
 
-     inline int32_t left() {
+     inline int32_t left() const {
          if (isPaired()) {
              return std::min(bam_pos(read1), bam_pos(read2));
          } else {
              return bam_pos(read1);
          }
      }
 
-    inline int32_t right() {
+    inline int32_t right() const {
         if (isPaired()) {
             return std::max(bam_pos(read1) + bam_seq_len(read1),
                     bam_pos(read2) + bam_seq_len(read2));
@@ -119,7 +119,7 @@ struct ReadPair {
         }
     }
 
-    inline ReadType fragType() { return ReadType::PAIRED_END; }
+    inline ReadType fragType() const { return ReadType::PAIRED_END; }
     inline int32_t transcriptID() const { return bam_ref(read1); }
 
     inline double logQualProb() {

diff --git a/include/SalmonOpts.hpp b/include/SalmonOpts.hpp
@@ -43,6 +43,10 @@ struct SalmonOpts {
 
     bool noSeqBiasModel; // Don't learn and use a sequence-specific bias model.
 
+    bool noRichEqClasses; // Don't use rich equivalence classes --- forget the
+                          // aggregate weights for each transcript to each
+                          // equivalence class of fragments.
+
     bool useReadCompat; // Give a fragment assignment a likelihood based on the compatibility
                         // between the manner in which it mapped and the expected read
                         // library format.

diff --git a/include/UnpairedRead.hpp b/include/UnpairedRead.hpp
@@ -67,15 +67,15 @@ struct UnpairedRead {
         return bam_name_len(read);
     }
 
-   inline bool isRight() { return bam_flag(read) & BAM_FREVERSE; }
-   inline bool isLeft()  { return !isRight(); }
-   inline int32_t left() { return bam_pos(read); }
-   inline int32_t right() { return left() + bam_seq_len(read); }
-   inline uint32_t fragLen() { return 0; }
-   inline ReadType fragType() { return ReadType::SINGLE_END; }
+   inline bool isRight() const { return bam_flag(read) & BAM_FREVERSE; }
+   inline bool isLeft()  const { return !isRight(); }
+   inline int32_t left() const { return bam_pos(read); }
+   inline int32_t right() const { return left() + bam_seq_len(read); }
+   inline uint32_t fragLen() const { return 0; }
+   inline ReadType fragType() const { return ReadType::SINGLE_END; }
    inline int32_t transcriptID() const { return bam_ref(read); }
 
-    inline double logQualProb() {
+    inline double logQualProb() const {
         return salmon::math::LOG_1;
         /*
         int q = bam_map_qual(read);

diff --git a/src/CollapsedEMOptimizer.cpp b/src/CollapsedEMOptimizer.cpp
@@ -55,7 +55,7 @@ double normalize(std::vector<tbb::atomic<double>>& vec) {
 }
 
 /*
- * Use atomic compare-and-swap to update val to 
+ * Use atomic compare-and-swap to update val to
  * val + inc.  Update occurs in a loop in case other
  * threads update in the meantime.
  */
@@ -71,7 +71,7 @@ void incLoop(tbb::atomic<double>& val, double inc) {
 }
 
 /*
- * Use the "standard" EM algorithm over equivalence 
+ * Use the "standard" EM algorithm over equivalence
  * classes to estimate the latent variables (alphaOut)
  * given the current estimates (alphaIn).
  */
@@ -127,7 +127,7 @@ void EMUpdate_(
 }
 
 /*
- * Use the Variational Bayesian EM algorithm over equivalence 
+ * Use the Variational Bayesian EM algorithm over equivalence
  * classes to estimate the latent variables (alphaOut)
  * given the current estimates (alphaIn).
  */
@@ -176,13 +176,10 @@ void VBEMUpdate_(
                 const std::vector<uint32_t>& txps = tgroup.txps;
                 const std::vector<double>& auxs = kv.second.weights;
 
-
                 double denom = 0.0;
                 for (size_t i = 0; i < txps.size(); ++i) {
                     auto tid = txps[i];
                     auto aux = auxs[i];
-                    //double el = effLens(tid);
-                    //if (el <= 0) { el = 1.0; }
                     if (expTheta[tid] > 0.0) {
                         double v = expTheta[tid] * aux;
                         denom += v;
@@ -279,15 +276,6 @@ bool CollapsedEMOptimizer::optimize(ExpT& readExp,
     std::vector<Transcript>& transcripts = readExp.transcripts();
 
     using VecT = CollapsedEMOptimizer::VecType;
-    // With Eigen
-    /*
-    Eigen::VectorXd alphas(transcripts.size());
-    Eigen::VectorXd alphasPrime(transcripts.size());
-    alphasPrime.setZero();
-
-    Eigen::VectorXd effLens(transcripts.size());
-    Eigen::VectorXd expTheta(transcripts.size());
-    */
     // With atomics
     VecType alphas(transcripts.size(), 0.0);
     VecType alphasPrime(transcripts.size(), 0.0);
@@ -300,7 +288,6 @@ bool CollapsedEMOptimizer::optimize(ExpT& readExp,
     bool useVBEM{sopt.useVBOpt};
     auto jointLog = sopt.jointLog;
 
-    double logTotalMass = salmon::math::LOG_0;
     double totalLen{0.0};
 
     for (size_t i = 0; i < transcripts.size(); ++i) {
@@ -313,6 +300,34 @@ bool CollapsedEMOptimizer::optimize(ExpT& readExp,
         totalLen += effLens(i);
     }
 
+    // If the user requested *not* to use "rich" equivalence classes,
+    // then wipe out all of the weight information here and simply replace
+    // the weights with the effective length terms (here, the *inverse* of
+    // the effective length).
+    if (sopt.noRichEqClasses) {
+        tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(eqVec.size())),
+                [&eqVec, &effLens]( const BlockedIndexRange& range) -> void {
+                // For each index in the equivalence class vector
+                for (auto eqID : boost::irange(range.begin(), range.end())) {
+                    // The vector entry
+                    auto& kv = eqVec[eqID];
+                    // The label of the equivalence class
+                    const TranscriptGroup& k = kv.first;
+                    // The size of the label
+                    size_t classSize = k.txps.size();
+                    // The weights of the label
+                    TGValue& v = kv.second;
+
+                    // Iterate over each weight and set it equal to
+                    // 1 / effLen of the corresponding transcript
+                    for (size_t i = 0; i < classSize; ++i) {
+                        double el = effLens(k.txps[i]);
+                        v.weights[i] = (el <= 0.0) ? 1.0 : (1.0 / el);
+                    }
+                }
+        });
+    }
+
     auto numRemoved = markDegenerateClasses(eqVec, alphas, sopt.jointLog);
     sopt.jointLog->info("Marked {} weighted equivalence classes as degenerate",
             numRemoved);
@@ -403,6 +418,17 @@ bool CollapsedEMOptimizer::optimize<AlignmentLibrary<ReadPair>>(
 
 // Unused / old
 //
+
+// With Eigen
+/*
+   Eigen::VectorXd alphas(transcripts.size());
+   Eigen::VectorXd alphasPrime(transcripts.size());
+   alphasPrime.setZero();
+
+   Eigen::VectorXd effLens(transcripts.size());
+   Eigen::VectorXd expTheta(transcripts.size());
+*/
+
 /*
  * Serial implementation of EM update
  *

diff --git a/src/SalmonQuantify.cpp b/src/SalmonQuantify.cpp
@@ -506,21 +506,12 @@ void processMiniBatch(
                                                                hitPos,
                                                                aln.libFormat());
                         }
-                        /*
-                        if ( i % 1000 == 0 ) {
-                            std::cerr << "biasProb = " << std::exp(logBiasProb) << "\n";
-                        }
-                        */
+
                     }
 
                     // Increment the count of this type of read that we've seen
                     ++libTypeCounts[aln.libFormat().formatID()];
 
-                    // pre FSPD
-		    /*
-                    aln.logProb = (transcriptLogCount - logRefLength) +
-                                  logFragProb + logAlignCompatProb;
-		    */
                     double auxProb = startPosProb + logFragProb +
                                      logAlignCompatProb + logBiasProb;
 
@@ -2899,6 +2890,7 @@ int salmonQuantify(int argc, char *argv[]) {
     sopt.disableMappingCache = true;
     // no sequence bias for now
     sopt.noSeqBiasModel = true;
+    sopt.noRichEqClasses = false;
 
     po::options_description advanced("\n"
 		    		     "advanced options");
@@ -2937,6 +2929,14 @@ int salmonQuantify(int argc, char *argv[]) {
     ("noFragStartPosDist", po::bool_switch(&(sopt.noFragStartPosDist))->default_value(false), "[Currently Experimental] : "
                         "Don't consider / model non-uniformity in the fragment start positions "
                         "across the transcript.")
+    /*
+    // Don't expose this yet
+    ("noRichEqClasses", po::bool_switch(&(sopt.noRichEqClasses))->default_value(false),
+                        "Disable \"rich\" equivalent classes.  If this flag is passed, then "
+                        "all information about the relative weights for each transcript in the "
+                        "label of an equivalence class will be ignored, and only the relative "
+                        "abundance and effective length of each transcript will be considered.")
+    */
     //("noSeqBiasModel", po::bool_switch(&(sopt.noSeqBiasModel))->default_value(false),
     //                    "Don't learn and apply a model of sequence-specific bias")
     ("numAuxModelSamples", po::value<uint32_t>(&(sopt.numBurninFrags))->default_value(5000000), "The first <numAuxModelSamples> are used to train the "

diff --git a/src/SalmonQuantifyAlignments.cpp b/src/SalmonQuantifyAlignments.cpp
@@ -835,6 +835,7 @@ int salmonAlignmentQuantify(int argc, char* argv[]) {
 
     // no sequence bias for now
     sopt.noSeqBiasModel = true;
+    sopt.noRichEqClasses = false;
 
     po::options_description advanced("\nadvanced options");
     advanced.add_options()
@@ -863,6 +864,14 @@ int salmonAlignmentQuantify(int argc, char* argv[]) {
     ("noFragStartPosDist", po::bool_switch(&(sopt.noFragStartPosDist))->default_value(false), "[Currently Experimental] : "
                         "Don't consider / model non-uniformity in the fragment start positions "
                         "across the transcript.")
+    /*
+    // Don't expose this yet
+    ("noRichEqClasses", po::bool_switch(&(sopt.noRichEqClasses))->default_value(false),
+                        "Disable \"rich\" equivalent classes.  If this flag is passed, then "
+                        "all information about the relative weights for each transcript in the "
+                        "label of an equivalence class will be ignored, and only the relative "
+                        "abundance and effective length of each transcript will be considered.")
+                        */
     ("numErrorBins", po::value<uint32_t>(&(sopt.numErrorBins))->default_value(6), "The number of bins into which to divide "
                         "each read when learning and applying the error model.  For example, a value of 10 would mean that "
                         "effectively, a separate error model is leared and applied to each 10th of the read, while a value of "