Incorporated a few bug fixes, mainly:
(1) Made bias correction code aware of updated output format
    (thanks, Ido & Jason).

(2) Fixed bug in alignment-based Salmon where the alignment
    error model was incorrectly applied to orphaned reads
    in paired-end data.
rob-p committed Jun 11, 2015
1 parent b2f1d91 commit 56120af
Showing 14 changed files with 513 additions and 150 deletions.
8 changes: 5 additions & 3 deletions CMakeLists.txt
@@ -343,6 +343,7 @@ include(ExternalProject)
ExternalProject_Add(libbwa
DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
DOWNLOAD_COMMAND curl -k -L https://github.com/COMBINE-lab/bwa/archive/0.7.12.3.tar.gz -o bwa-master.tar.gz &&
mkdir -p bwa-master &&
tar -xzvf bwa-master.tar.gz --strip-components=1 -C bwa-master
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/bwa-master
INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
@@ -464,10 +465,11 @@ message("Build system will compile Staden IOLib")
message("==================================================================")
ExternalProject_Add(libstadenio
DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
DOWNLOAD_COMMAND curl -k -L https://github.com/COMBINE-lab/staden-io_lib/releases/download/v1.13.9/staden-io_lib-1.13.9.tar.gz -o staden-io_lib-v1.13.9.tar.gz &&
tar -xzf staden-io_lib-v1.13.9.tar.gz &&
DOWNLOAD_COMMAND curl -k -L https://github.com/COMBINE-lab/staden-io_lib/releases/download/v1.13.10/io_lib-1.13.10.tar.gz -o staden-io_lib-v1.13.10.tar.gz &&
mkdir -p staden-io_lib-1.13.10 &&
tar -xzf staden-io_lib-v1.13.10.tar.gz --strip-components=1 -C staden-io_lib-1.13.10 &&
rm -fr staden-io_lib &&
mv -f staden-io_lib-1.13.9 staden-io_lib
mv -f staden-io_lib-1.13.10 staden-io_lib
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/staden-io_lib
INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
CONFIGURE_COMMAND ./configure --enable-shared=no --without-libcurl --prefix=<INSTALL_DIR> LDFLAGS=${LIBSTADEN_LDFLAGS} CFLAGS=${LIBSTADEN_CFLAGS} CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER}
27 changes: 27 additions & 0 deletions include/CollapsedGibbsSampler.hpp
@@ -0,0 +1,27 @@
#ifndef COLLAPSED_GIBBS_SAMPLER_HPP
#define COLLAPSED_GIBBS_SAMPLER_HPP

#include <unordered_map>

#include "tbb/atomic.h"
#include "tbb/task_scheduler_init.h"

#include "SalmonOpts.hpp"

#include "cuckoohash_map.hh"
#include "Eigen/Dense"

class CollapsedGibbsSampler {
public:
using VecType = std::vector<double>;
CollapsedGibbsSampler();

template <typename ExpT>
bool sample(ExpT& readExp,
SalmonOpts& sopt,
uint32_t numSamples = 500);

};

#endif // COLLAPSED_GIBBS_SAMPLER_HPP
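As context for the new header, here is a minimal sketch of how a caller might drive the sampler after the main optimization pass. The ExpT template argument, the runPosteriorSampling wrapper, and the error handling are assumptions for illustration; only the CollapsedGibbsSampler interface above and the useGSOpt/numGibbsSamples options (added to SalmonOpts.hpp further down) come from this commit.

// Hypothetical usage sketch; runPosteriorSampling and the surrounding
// plumbing are illustrative, not part of this commit.
#include "CollapsedGibbsSampler.hpp"
#include "SalmonOpts.hpp"

template <typename ExpT>
void runPosteriorSampling(ExpT& experiment, SalmonOpts& sopt) {
    if (sopt.useGSOpt) {
        CollapsedGibbsSampler sampler;
        // Draw the requested number of posterior samples (default 500).
        bool ok = sampler.sample(experiment, sopt, sopt.numGibbsSamples);
        if (!ok) {
            // How a failed sampling pass is reported is left to the caller here.
        }
    }
}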

53 changes: 53 additions & 0 deletions include/MultinomialSampler.hpp
@@ -0,0 +1,53 @@
#ifndef _MULTINOMIAL_SAMPLER_HPP_
#define _MULTINOMIAL_SAMPLER_HPP_

#include <random>
#include <vector>

class MultinomialSampler {
public:
MultinomialSampler(std::random_device& rd) :
gen_(rd()), u01_(0.0, 1.0) {}

void operator()(
std::vector<int>::iterator sampleBegin,
uint32_t n,
uint32_t k,
std::vector<double>::iterator probsBegin,
bool clearCounts = true) {
int i, j;
double u, sum;
std::vector<double> z(k+1, 0.0);

if (clearCounts) {
for (uint32_t i = 0; i < k; i++) {
*(sampleBegin + i) = 0;
}
}

z[0] = 0;
for (i = 1; i <= k; i++) {
sum = 0;
for (j = 0; j < i; j++) sum+= *(probsBegin + j);
z[i] = sum;
}

for (j = 0; j < n; j++) {
u = u01_(gen_);

for (i = 0; i < k; i++) {
if ((z[i] < u) && (u <= z[i+1])) {
(*(sampleBegin + i))++;
}
}
}
}


private:
std::mt19937 gen_;
std::uniform_real_distribution<> u01_;
};

#endif //_MULTINOMIAL_SAMPLER_HPP_
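The sampler distributes n draws over k categories by building the cumulative vector z once and then bucketing each uniform draw with a linear scan. A minimal, self-contained usage sketch follows; the probability vector is made up for illustration.

// Illustrative only; the probabilities below are invented for this example.
#include <cstdint>
#include <random>
#include <vector>
#include "MultinomialSampler.hpp"

int main() {
    std::random_device rd;
    MultinomialSampler ms(rd);

    std::vector<double> probs{0.5, 0.3, 0.2}; // assumed to sum to 1
    std::vector<int> counts(probs.size(), 0);

    // Distribute 1000 draws across the 3 categories; counts ends up
    // roughly proportional to probs (e.g. ~{500, 300, 200}).
    ms(counts.begin(), 1000,
       static_cast<uint32_t>(probs.size()), probs.begin());
    return 0;
}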

2 changes: 1 addition & 1 deletion include/ReadPair.hpp
@@ -74,7 +74,7 @@ struct ReadPair {
}
}

inline char* getName() {
inline char* getName() const {
return bam_name(read1);
}

3 changes: 3 additions & 0 deletions include/SalmonOpts.hpp
@@ -87,6 +87,9 @@ struct SalmonOpts {
bool useMassBanking; // Bank unique mass in subsequent epochs of inference

bool useVBOpt; // Use Variational Bayesian EM instead of "regular" EM in the batch passes
bool useGSOpt; // Do Gibbs Sampling after optimization

uint32_t numGibbsSamples; // Number of rounds of Gibbs sampling to perform

// Related to the fragment length distribution
size_t fragLenDistMax;
34 changes: 33 additions & 1 deletion include/SalmonUtils.hpp
@@ -21,6 +21,7 @@ extern "C" {
#include "format.h"

#include "SalmonOpts.hpp"
#include "SalmonMath.hpp"

#include "LibraryFormat.hpp"
#include "ReadLibrary.hpp"
@@ -73,7 +74,38 @@ TranscriptGeneMap readTranscriptToGeneMap( std::ifstream &ifile );

TranscriptGeneMap transcriptToGeneMapFromFasta( const std::string& transcriptsFile );

void incLoop(tbb::atomic<double>& val, double inc);
/*
* Use atomic compare-and-swap to update val to
* val + inc (*in log-space*). Update occurs in a loop in case other
* threads update in the meantime.
*/
inline void incLoopLog(tbb::atomic<double>& val, double inc) {
double oldMass = val.load();
double returnedMass = oldMass;
double newMass{salmon::math::LOG_0};
do {
oldMass = returnedMass;
newMass = salmon::math::logAdd(oldMass, inc);
returnedMass = val.compare_and_swap(newMass, oldMass);
} while (returnedMass != oldMass);
}

/*
* Use atomic compare-and-swap to update val to
* val + inc. Update occurs in a loop in case other
* threads update in the meantime.
*/
inline void incLoop(tbb::atomic<double>& val, double inc) {
double oldMass = val.load();
double returnedMass = oldMass;
double newMass{oldMass + inc};
do {
oldMass = returnedMass;
newMass = oldMass + inc;
returnedMass = val.compare_and_swap(newMass, oldMass);
} while (returnedMass != oldMass);
}


void aggregateEstimatesToGeneLevel(TranscriptGeneMap& tgm, boost::filesystem::path& inputPath);

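A short sketch of how these helpers are intended to be used from multiple threads. The thread layout is illustrative; the salmon::utils namespace qualification matches the calls introduced in Transcript.hpp below.

// Illustrative sketch of concurrent log-space accumulation with incLoopLog.
#include <cmath>
#include <thread>
#include <vector>
#include "tbb/atomic.h"
#include "SalmonMath.hpp"
#include "SalmonUtils.hpp"

int main() {
    tbb::atomic<double> totalMass;
    totalMass = salmon::math::LOG_0; // identity element in log space

    std::vector<std::thread> workers;
    for (int t = 0; t < 4; ++t) {
        workers.emplace_back([&totalMass]() {
            // Each thread contributes log(0.25); the CAS loop inside
            // incLoopLog keeps the update correct under contention.
            salmon::utils::incLoopLog(totalMass, std::log(0.25));
        });
    }
    for (auto& w : workers) { w.join(); }
    // totalMass is now approximately log(4 * 0.25) == 0.
    return 0;
}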
28 changes: 4 additions & 24 deletions include/Transcript.hpp
@@ -5,6 +5,7 @@
#include <cmath>
#include <limits>
#include "SalmonStringUtils.hpp"
#include "SalmonUtils.hpp"
#include "SalmonMath.hpp"
#include "SequenceBiasModel.hpp"
#include "FragmentLengthDistribution.hpp"
@@ -109,14 +110,7 @@ class Transcript {
}

inline void addSharedCount(double sc) {
double oldMass = sharedCount_;
double returnedMass = oldMass;
double newMass{0.0};
do {
oldMass = returnedMass;
newMass = oldMass + sc;
returnedMass = sharedCount_.compare_and_swap(newMass, oldMass);
} while (returnedMass != oldMass);
salmon::utils::incLoop(sharedCount_, sc);
}

inline void setLastTimestepUpdated(uint64_t currentTimestep) {
@@ -127,25 +121,11 @@
}

inline void addBias(double bias) {
double oldVal = avgMassBias_;
double returnedVal = oldVal;
double newVal{0.0};
do {
oldVal = returnedVal;
newVal = salmon::math::logAdd(oldVal, bias);
returnedVal = avgMassBias_.compare_and_swap(newVal, oldVal);
} while (returnedVal != oldVal);
salmon::utils::incLoopLog(avgMassBias_, bias);
}

inline void addMass(double mass) {
double oldMass = mass_;
double returnedMass = oldMass;
double newMass{0.0};
do {
oldMass = returnedMass;
newMass = salmon::math::logAdd(oldMass, mass);
returnedMass = mass_.compare_and_swap(newMass, oldMass);
} while (returnedMass != oldMass);
salmon::utils::incLoopLog(mass_, mass);
}

inline void setMass(double mass) {
50 changes: 24 additions & 26 deletions src/AlignmentModel.cpp
@@ -267,7 +267,7 @@ double AlignmentModel::logLikelihood(bam_seq_t* read, Transcript& ref,
// Shouldn't happen!
if (readIdx >= readLen) {
if (logger_) {
logger_->warn("CIGAR string for read [{}] "
logger_->warn("(in logLikelihood()) CIGAR string for read [{}] "
"seems inconsistent. It refers to non-existant "
"positions in the read!", bam_name(read));
std::stringstream cigarStream;
@@ -276,7 +276,7 @@
enum cigar_op op = static_cast<enum cigar_op>(cigar[j] & BAM_CIGAR_MASK);
cigarStream << opLen << opToChr(op);
}
logger_->warn("CIGAR = {}", cigarStream.str());
logger_->warn("(in logLikelihood()) CIGAR = {}", cigarStream.str());
}
return logLike;
}
@@ -288,9 +288,11 @@
// Shouldn't happen!
if (uTranscriptIdx >= transcriptLen) {
if (logger_) {
logger_->warn("CIGAR string for read [{}] "
"seems inconsistent. It refers to non-existant "
"positions in the reference!", bam_name(read));
logger_->warn("(in logLikelihood()) CIGAR string for read [{}] "
"seems inconsistent. It refers to non-existant "
"positions in the reference! Transcript name "
"is {}, length is {}, id is {}. Read things refid is {}",
bam_name(read), ref.RefName, transcriptLen, ref.id, bam_ref(read));
}
return logLike;
}
@@ -347,14 +349,6 @@ double AlignmentModel::logLikelihood(const ReadPair& hit, Transcript& ref){
size_t leftLen = static_cast<size_t>(bam_seq_len(leftRead));
size_t rightLen = static_cast<size_t>(bam_seq_len(rightRead));

// NOTE: Raise a warning in this case?
/*
if (BOOST_UNLIKELY((leftLen > maxExpectedLen_) or
(rightLen > maxExpectedLen_))) {
return logLike;
}
*/

if (leftRead) {
logLike += logLikelihood(leftRead, ref, transitionProbsLeft_);
}
@@ -451,7 +445,7 @@ void AlignmentModel::update(bam_seq_t* read, Transcript& ref, double p, double m
// Shouldn't happen!
if (readIdx >= readLen) {
if (logger_) {
logger_->warn("CIGAR string for read [{}] "
logger_->warn("(in update()) CIGAR string for read [{}] "
"seems inconsistent. It refers to non-existant "
"positions in the read!", bam_name(read));
std::stringstream cigarStream;
@@ -472,10 +466,12 @@
// Shouldn't happen!
if (uTranscriptIdx >= transcriptLen) {
if (logger_) {
logger_->warn("CIGAR string for read [{}] "
"seems inconsistent. It refers to non-existant "
"positions in the reference!", bam_name(read));
}
logger_->warn("(in update()) CIGAR string for read [{}] "
"seems inconsistent. It refers to non-existant "
"positions in the reference! Transcript name "
"is {}, length is {}, id is {}. Read things refid is {}",
bam_name(read), ref.RefName, transcriptLen, ref.id, bam_ref(read));
}
return;
}
curRefBase = samToTwoBit[ref.baseAt(uTranscriptIdx, readStrand)];
@@ -528,15 +524,17 @@ void AlignmentModel::update(const ReadPair& hit, Transcript& ref, double p, doub
if (mass == salmon::math::LOG_0) { return; }
if (BOOST_UNLIKELY(!isEnabled_)) { return; }

bam_seq_t* leftRead = (bam_pos(hit.read1) < bam_pos(hit.read2)) ? hit.read1 : hit.read2;
bam_seq_t* rightRead = (bam_pos(hit.read1) < bam_pos(hit.read2)) ? hit.read2 : hit.read1;

if (leftRead) {
update(leftRead, ref, p, mass, transitionProbsLeft_);
}

if (rightRead) {
if (hit.isPaired()){
bam_seq_t* leftRead = (bam_pos(hit.read1) < bam_pos(hit.read2)) ? hit.read1 : hit.read2;
bam_seq_t* rightRead = (bam_pos(hit.read1) < bam_pos(hit.read2)) ? hit.read2 : hit.read1;
update(leftRead, ref, p, mass, transitionProbsLeft_);
update(rightRead, ref, p, mass, transitionProbsRight_);
} else if (hit.isLeftOrphan()) {
bam_seq_t* read = hit.read1;
update(read, ref, p, mass, transitionProbsLeft_);
} else if (hit.isRightOrphan()) {
bam_seq_t* read = hit.read1;
update(read, ref, p, mass, transitionProbsRight_);
}
}

1 change: 1 addition & 0 deletions src/CMakeLists.txt
@@ -4,6 +4,7 @@ is.c
bwt_gen.c
bwtindex.c
CollapsedEMOptimizer.cpp
CollapsedGibbsSampler.cpp
Salmon.cpp
BuildSalmonIndex.cpp
SalmonQuantify.cpp
