Actually *use* the non-uniform start distribution

Also, implemented the non-uniform start distribution for alignment-based salmon.
COMBINE-lab · May 15, 2015 · 0e46903 · 0e46903
1 parent a3f294b
commit 0e46903
Show file tree

Hide file tree

Showing 3 changed files with 71 additions and 3 deletions.
diff --git a/include/AlignmentLibrary.hpp b/include/AlignmentLibrary.hpp
@@ -16,6 +16,7 @@ extern "C" {
 #include "LibraryFormat.hpp"
 #include "SalmonOpts.hpp"
 #include "FragmentLengthDistribution.hpp"
+#include "FragmentStartPositionDistribution.hpp"
 #include "AlignmentGroup.hpp"
 #include "ErrorModel.hpp"
 #include "AlignmentModel.hpp"
@@ -53,6 +54,7 @@ class AlignmentLibrary {
         transcriptFile_(transcriptFile),
         libFmt_(libFmt),
         transcripts_(std::vector<Transcript>()),
+	fragStartDists_(5),
         quantificationPasses_(0) {
             namespace bfs = boost::filesystem;
 
@@ -104,6 +106,26 @@ class AlignmentLibrary {
             fmt::print(stderr, "Populating targets from aln = {}, fasta = {} . . .",
                        alnFiles.front(), transcriptFile_);
             fp.populateTargets(transcripts_);
+	    for (auto& txp : transcripts_) {
+		    // Length classes taken from
+		    // ======
+		    // Roberts, Adam, et al.
+		    // "Improving RNA-Seq expression estimates by correcting for fragment bias."
+		    // Genome Biol 12.3 (2011): R22.
+		    // ======
+		    // perhaps, define these in a more data-driven way
+		    if (txp.RefLength <= 1334) {
+			    txp.lengthClassIndex(0);
+		    } else if (txp.RefLength <= 2104) {
+			    txp.lengthClassIndex(0);
+		    } else if (txp.RefLength <= 2988) {
+			    txp.lengthClassIndex(0);
+		    } else if (txp.RefLength <= 4389) {
+			    txp.lengthClassIndex(0);
+		    } else {
+			    txp.lengthClassIndex(0);
+		    }
+	    }
             fmt::print(stderr, "done\n");
 
             // Create the cluster forest for this set of transcripts
@@ -140,6 +162,10 @@ class AlignmentLibrary {
 
     inline SAM_hdr* header() { return bq->header(); }
 
+    inline std::vector<FragmentStartPositionDistribution>& fragmentStartPositionDistributions() {
+	    return fragStartDists_;
+    }
+
     inline FragmentLengthDistribution& fragmentLengthDistribution() {
         return *flDist_.get();
     }
@@ -224,6 +250,12 @@ class AlignmentLibrary {
      * in the same cluster.
      */
     std::unique_ptr<ClusterForest> clusters_;
+
+    /**
+      * The emperical fragment start position distribution
+      */
+    std::vector<FragmentStartPositionDistribution> fragStartDists_;
+
     /**
      * The emperical fragment length distribution.
      *

diff --git a/src/SalmonQuantify.cpp b/src/SalmonQuantify.cpp
@@ -477,12 +477,13 @@ void processMiniBatch(
                     ++libTypeCounts[aln.libFormat().formatID()];
 
                     // pre FSPD
+		    /*
                     aln.logProb = (transcriptLogCount - logRefLength) +
                                   logFragProb + logAlignCompatProb;
-                    /*
+		    */
                     aln.logProb = (transcriptLogCount + startPosProb) +
                                   logFragProb + logAlignCompatProb;
-                                  */
+
 
                     if (std::abs(aln.logProb) == LOG_0) { continue; }
 

diff --git a/src/SalmonQuantifyAlignments.cpp b/src/SalmonQuantifyAlignments.cpp
@@ -146,9 +146,15 @@ void processMiniBatch(AlignmentLibrary<FragT>& alnLib,
     auto& fragmentQueue = alnLib.fragmentQueue();
     auto& alignmentGroupQueue = alnLib.alignmentGroupQueue();
 
+    std::vector<FragmentStartPositionDistribution>& fragStartDists =
+        alnLib.fragmentStartPositionDistributions();
+
     auto& fragLengthDist = alnLib.fragmentLengthDistribution();
     auto& alnMod = alnLib.alignmentModel();
 
+    bool useFSPD{!salmonOpts.noFragStartPosDist};
+    bool useFragLengthDist{!salmonOpts.noFragLengthDist};
+
     double startingCumulativeMass = fmCalc.cumulativeLogMassAt(firstTimestepOfRound);
     const auto expectedLibraryFormat = alnLib.format();
     uint32_t numBurninFrags{salmonOpts.numBurninFrags};
@@ -318,10 +324,25 @@ void processMiniBatch(AlignmentLibrary<FragT>& alnLib,
                             errLike = alnMod.logLikelihood(*aln, transcript);
                         }
 
+			// Allow for a non-uniform fragment start position distribution
+                        double startPosProb = -logRefLength;
+                        auto hitPos = aln->left();
+			if (useFSPD and burnedIn and hitPos < refLength) {
+			  auto& fragStartDist =
+				  fragStartDists[transcript.lengthClassIndex()];
+			  startPosProb = fragStartDist(hitPos, refLength, logRefLength);
+			}
+
+			// Pre FSPD 
+			/*
                         double qualProb = -logRefLength + logFragProb +
                                           aln->logQualProb() +
                                           errLike + logAlignCompatProb;
-
+		        */	
+			double qualProb = startPosProb + logFragProb +
+                                          aln->logQualProb() +
+                                          errLike + logAlignCompatProb;
+
 
 
                         // The overall mass of this transcript, which is used to
@@ -380,13 +401,24 @@ void processMiniBatch(AlignmentLibrary<FragT>& alnLib,
 
                         double r = uni(eng);
                         if (!burnedIn and r < std::exp(aln->logProb)) {
+			    // Update the error model
                             if (salmonOpts.useErrorModel) {
                                 alnMod.update(*aln, transcript, LOG_1, logForgettingMass);
                             }
+			    // Update the fragment length distribution 
                             if (aln->isPaired() and !salmonOpts.noFragLengthDist) {
                                 double fragLength = aln->fragLen();
                                 fragLengthDist.addVal(fragLength, logForgettingMass);
                             }
+			    // Update the fragment start position distribution 
+			    if (useFSPD) {
+				    auto hitPos = aln->left();
+				    auto& fragStartDist =
+					    fragStartDists[transcript.lengthClassIndex()];
+				    fragStartDist.addVal(hitPos,
+						    transcript.RefLength,
+						    logForgettingMass);
+			    }
                         }
                     }
 
@@ -792,6 +824,9 @@ int salmonAlignmentQuantify(int argc, char* argv[]) {
                          "unlikely lengths will be assigned a smaller relative probability than those with more likely "
                         "lengths.  When this flag is passed in, the observed fragment length has no effect on that fragment's "
                         "a priori probability.")
+    ("noFragStartPosDist", po::bool_switch(&(sopt.noFragStartPosDist))->default_value(false), "[Currently Experimental] : "
+                        "Don't consider / model non-uniformity in the fragment start positions "
+                        "across the transcript.")
     ("numErrorBins", po::value<uint32_t>(&(sopt.numErrorBins))->default_value(6), "The number of bins into which to divide "
                         "each read when learning and applying the error model.  For example, a value of 10 would mean that "
                         "effectively, a separate error model is leared and applied to each 10th of the read, while a value of "