Skip to content

Commit

Permalink
Numerous changes and enhancements
Browse files Browse the repository at this point in the history
-- Significant speed-up to lightweight alignment.  The index
   was previously built with a sampling factor of 32.  This
   is now 1 and is a user-specifiable parameter during index
   construction.

-- Initial implementation of non-uniform fragment start position
   distribution.  Not yet enabled in alignment-based salmon, but
   this is not difficulty.  The estimation and use of the non-uniform
   fragment start position distribution can be disabled via a
   command line flag.
  • Loading branch information
rob-p committed May 15, 2015
1 parent 6e30b92 commit a3f294b
Show file tree
Hide file tree
Showing 12 changed files with 613 additions and 264 deletions.
1 change: 1 addition & 0 deletions include/AlignmentLibrary.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ class AlignmentLibrary {

inline BAMQueue<FragT>& getAlignmentGroupQueue() { return *bq.get(); }

inline size_t upperBoundHits() { return bq->numMappedReads(); }
inline size_t numMappedReads() { return bq->numMappedReads(); }
inline size_t numUniquelyMappedReads() { return bq->numUniquelyMappedReads(); }

Expand Down
78 changes: 34 additions & 44 deletions include/FragmentStartPositionDistribution.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* This class is similar to and inspired by the FragmentLengthDistribution
* class, which was itself modified from lengthdistribution.h ---
* originally written by Adam Roberts as part of the eXpress software.
* Rob Patro; 2014
* updated by Rob Patro; 2014, 2015
*/

#ifndef FRAGMENT_START_POSITION_DISTRIBUTION
Expand All @@ -13,6 +13,7 @@
#include <atomic>
#include <vector>
#include <string>
#include <mutex>

/**
* The FragmentStartPositionDistribution class keeps track of the observed fragment
Expand All @@ -22,88 +23,77 @@
* in to_string).
*/
class FragmentStartPositionDistribution {
/**
* A private vector that stores the (logged) kernel values.
**/
std::vector<double> kernel_;
/**
* A private vector that stores the observed (logged) mass for each length.
*/
std::vector<tbb::atomic<double>> hist_;
std::vector<tbb::atomic<double>> pmf_;
std::vector<tbb::atomic<double>> cmf_;
/**
* A private double that stores the total observed (logged) mass.
*/
tbb::atomic<double> totMass_;
tbb::atomic<double> totMass_;
/**
* A private double that stores the (logged) sum of the product of observed
* lengths and masses for quick mean calculations.
*/
tbb::atomic<double> sum_;
tbb::atomic<double> sum_;
/**
* The number of bins we consider within each transcript.
*/
size_t numBins_;

// Mutex for this distribution
std::mutex fspdMut_;
bool isUpdated_;

public:
/**
* LengthDistribution Constructor.
* @param alpha double that sets the average pseudo-counts (logged).
* @param max_val an integer that sets the maximum allowable length.
* @param prior_mu a size_t for the mean of the prior gaussian distribution.
If 0, a uniform distribution is used instead.
* @param prior_sigma a size_t for the standard deviation of the prior
* gaussian distribution.
* @param kernel_n a size_t specifying the number of trials in the kernel
* binomial distribution. Must be odd.
* @param kernel_p a double specifying the success probability for the kernel
* binomial distribution.
* @param bin_size a size_t specifying the size of bins to use internally to
* reduce the number of parameters in the distribution.
*/
FragmentLengthDistribution(double alpha, size_t max_val, size_t prior_mu,
size_t prior_sigma, size_t kernel_n, double kernel_p,
size_t bin_size = 1);
/**
* An accessor for the maximum allowed length.
* @return Max allowed length.
* FragmentStartPositionDistribution constructor:
* @param numBins The number of bins to consider for
* each transcript.
*
*/
size_t maxVal() const;
/**
* An accessor for the minimum observed length (1 initially).
* @return Minimum observed length.
*/
size_t minVal() const;
/**
* An accessor for the mean length in the distribution.
* @return Mean observed length.
*/
double mean() const;
/**
FragmentStartPositionDistribution(uint32_t numBins=20);

/**
* A member function that updates the distribution based on a new length
* observation.
* @param len an integer for the observed length.
* @param mass a double for the mass (logged) to add.
*/
void addVal(size_t len, double mass);
void addVal(int32_t hitPos, uint32_t txpLen, double mass);
/**
* A member function that returns the probability that a hit
* starts at the specified position within the given transcript length.
* @param hitPos The position where the fragment begins
* @param txpLen The length of the transcript
*/
double operator()(int32_t hitPos, uint32_t txpLen, double effLen);
// Evaluate the CDF between two points
double evalCDF(int32_t hitPos, uint32_t txpLen);
// Update the distribution (compute the CDF) and
// set isUpdated_;
void update();

/**
* An accessor for the (logged) probability of a given length.
* @param len an integer for the length to return the probability of.
* @return (logged) probability of observing the given length.
*/
double pmf(size_t len) const;
//double pmf(size_t len) const;
/**
* A member function that returns a (logged) cumulative mass for a given
* length.
* @param len an integer for the length to return the cmf value of.
* @return (Logged) cmf value of length.
*/
double cmf(size_t len) const;
//double cmf(size_t len) const;
/**
* A member function that returns a vector containing the (logged) cumulative
* mass function *for the bins*.
* @return (Logged) cmf of bins.
*/
std::vector<double> cmf() const;
//std::vector<double> cmf() const;
/**
* An accessor for the (logged) observation mass (including pseudo-counts).
* @return Total observation mass.
Expand Down
37 changes: 34 additions & 3 deletions include/ReadExperiment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ extern "C" {
#include "Transcript.hpp"
#include "ReadLibrary.hpp"
#include "FragmentLengthDistribution.hpp"
#include "FragmentStartPositionDistribution.hpp"

// Logger includes
#include "spdlog/spdlog.h"
Expand Down Expand Up @@ -42,7 +43,8 @@ class ReadExperiment {
readLibraries_(readLibraries),
//transcriptFile_(transcriptFile),
transcripts_(std::vector<Transcript>()),
totalAssignedFragments_(0) {
totalAssignedFragments_(0),
fragStartDists_(5){
namespace bfs = boost::filesystem;

// Make sure the read libraries are valid.
Expand All @@ -58,7 +60,6 @@ class ReadExperiment {
fragLenKernelN,
fragLenKernelP, 1));


// Make sure the transcript file exists.
/*
if (!bfs::exists(transcriptFile_)) {
Expand Down Expand Up @@ -120,7 +121,26 @@ class ReadExperiment {
if (rseq != 0) {
for (size_t i = 0; i < compLen; ++i) { seq[i] = nucTab[rseq[i]]; }
}
transcripts_.back().Sequence = salmon::stringtools::encodeSequenceInSAM(seq.c_str(), t.RefLength);
auto& txp = transcripts_.back();
txp.Sequence = salmon::stringtools::encodeSequenceInSAM(seq.c_str(), t.RefLength);
// Length classes taken from
// ======
// Roberts, Adam, et al.
// "Improving RNA-Seq expression estimates by correcting for fragment bias."
// Genome Biol 12.3 (2011): R22.
// ======
// perhaps, define these in a more data-driven way
if (t.RefLength <= 1334) {
txp.lengthClassIndex(0);
} else if (t.RefLength <= 2104) {
txp.lengthClassIndex(0);
} else if (t.RefLength <= 2988) {
txp.lengthClassIndex(0);
} else if (t.RefLength <= 4389) {
txp.lengthClassIndex(0);
} else {
txp.lengthClassIndex(0);
}
/*
std::cerr << "TS = " << t.RefName << " : \n";
std::cerr << seq << "\n VS \n";
Expand All @@ -147,6 +167,8 @@ class ReadExperiment {
uint64_t numAssignedFragments() { return numAssignedFragments_; }
uint64_t numMappedReads() { return numAssignedFragments_; }

uint64_t upperBoundHits() { return upperBoundHits_; }
void setUpperBoundHits(uint64_t ubh) { upperBoundHits_ = ubh; }

std::atomic<uint64_t>& numAssignedFragmentsAtomic() { return numAssignedFragments_; }

Expand Down Expand Up @@ -198,6 +220,9 @@ class ReadExperiment {
return numObservedFragsInFirstPass_;
}

std::vector<FragmentStartPositionDistribution>& fragmentStartPositionDistributions() {
return fragStartDists_;
}

bool softReset() {
if (quantificationPasses_ == 0) {
Expand Down Expand Up @@ -384,6 +409,11 @@ class ReadExperiment {
* in the same cluster.
*/
std::unique_ptr<ClusterForest> clusters_;
/**
*
*
*/
std::vector<FragmentStartPositionDistribution> fragStartDists_;
/** Keeps track of the number of passes that have been
* made through the alignment file.
*/
Expand All @@ -393,6 +423,7 @@ class ReadExperiment {
size_t quantificationPasses_{0};
uint64_t numAssignedFragsInFirstPass_{0};
uint64_t numObservedFragsInFirstPass_{0};
uint64_t upperBoundHits_{0};
std::unique_ptr<FragmentLengthDistribution> fragLengthDist_;
};

Expand Down
2 changes: 2 additions & 0 deletions include/SalmonOpts.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ struct SalmonOpts {
// account when computing the probability that a
// fragment was generated from a transcript.

bool noFragStartPosDist; // Don't learn a non-uniform start distribution

bool useReadCompat; // Give a fragment assignment a likelihood based on the compatibility
// between the manner in which it mapped and the expected read
// librarry format.
Expand Down
8 changes: 5 additions & 3 deletions include/SalmonUtils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,14 +160,16 @@ void generateGeneLevelEstimates(boost::filesystem::path& geneMapPath,
* Given the information about the position and strand from which a paired-end
* read originated, return the library format with which it is compatible.
*/
LibraryFormat hitType(uint32_t end1Start, bool end1Fwd,
uint32_t end2Start, bool end2Fwd);
LibraryFormat hitType(int32_t end1Start, bool end1Fwd,
int32_t end2Start, bool end2Fwd);
LibraryFormat hitType(int32_t end1Start, bool end1Fwd, uint32_t len1,
int32_t end2Start, bool end2Fwd, uint32_t len2, bool canDovetail);
/**
* Given the information about the position and strand from which the
* single-end read originated, return the library format with which it
* is compatible.
*/
LibraryFormat hitType(uint32_t readStart, bool isForward);
LibraryFormat hitType(int32_t readStart, bool isForward);

}
}
Expand Down
6 changes: 6 additions & 0 deletions include/Transcript.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class Transcript {
mass_.store(other.mass_.load());
lastUpdate_.store(other.lastUpdate_.load());
cachedEffectiveLength_.store(other.cachedEffectiveLength_.load());
lengthClassIndex_ = other.lengthClassIndex_;
logPerBasePrior_ = other.logPerBasePrior_;
priorMass_ = other.priorMass_;
}
Expand All @@ -52,6 +53,7 @@ class Transcript {
mass_.store(other.mass_.load());
lastUpdate_.store(other.lastUpdate_.load());
cachedEffectiveLength_.store(other.cachedEffectiveLength_.load());
lengthClassIndex_ = other.lengthClassIndex_;
logPerBasePrior_ = other.logPerBasePrior_;
priorMass_ = other.priorMass_;
return *this;
Expand Down Expand Up @@ -186,6 +188,9 @@ class Transcript {
double perBasePrior() { return std::exp(logPerBasePrior_); }
inline size_t lastTimestepUpdated() { return lastTimestepUpdated_.load(); }

void lengthClassIndex(uint32_t ind) { lengthClassIndex_ = ind; }
uint32_t lengthClassIndex() { return lengthClassIndex_; }

std::string RefName;
uint32_t RefLength;
uint32_t id;
Expand All @@ -207,6 +212,7 @@ class Transcript {
tbb::atomic<double> sharedCount_;
tbb::atomic<double> cachedEffectiveLength_;
tbb::atomic<size_t> lastUpdate_;
uint32_t lengthClassIndex_;
double logPerBasePrior_;
};

Expand Down

0 comments on commit a3f294b

Please sign in to comment.