Permalink
Browse files

Initial integration of quasi-mapping into salmon --- *huge* speed boost

  • Loading branch information...
rob-p committed Aug 28, 2015
1 parent 60c607a commit 234cb13d67a9a1b995c86c8669d4cefc919fbc87
View
@@ -26,7 +26,7 @@ SET(CMAKE_FIND_LIBRARY_SUFFIXES .a ${CMAKE_FIND_LIBRARY_SUFFIXES})
## Set the standard required compile flags
# Nov 18th --- removed -DHAVE_CONFIG_H
# -DRAPMAP_SALMON_SUPPORT enables the RapMap quasi-mapping integration.
# (The earlier flag line without it was fully overwritten by this one and
# has been removed as dead residue.)
set (CMAKE_CXX_FLAGS "-pthread -funroll-loops -fPIC -fomit-frame-pointer -Ofast -DRAPMAP_SALMON_SUPPORT -DHAVE_ANSI_TERM -DHAVE_SSTREAM -Wall -std=c++11 -Wreturn-type -Werror=return-type")
##
# OSX is strange (some might say, stupid in this regard). Deal with its quirkiness here.
@@ -327,11 +327,12 @@ message("==================================================================")
include(ExternalProject)
# Fetch and stage cereal v1.1.2 (header-only C++ serialization library).
# The custom DOWNLOAD_COMMAND supersedes the old URL-based fetch of 1.0.0;
# keeping both left conflicting URL/SOURCE_DIR/BINARY_DIR entries, so the
# stale 1.0.0 lines have been removed. cereal is header-only, hence the
# empty configure/build steps; install just copies headers into place.
ExternalProject_Add(libcereal
    DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
    DOWNLOAD_COMMAND curl -k -L https://github.com/USCiLab/cereal/archive/v1.1.2.tar.gz -o cereal-v1.1.2.tar.gz &&
        tar -xzvf cereal-v1.1.2.tar.gz
    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/cereal-1.1.2
    INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
    UPDATE_COMMAND sh -c "mkdir -p <SOURCE_DIR>/build"
    BINARY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/cereal-1.1.2/build
    CONFIGURE_COMMAND ""
    BUILD_COMMAND ""
    INSTALL_COMMAND sh -c "mkdir -p <INSTALL_DIR>/include && cp -r <SOURCE_DIR>/include/cereal <INSTALL_DIR>/include"
@@ -482,10 +483,10 @@ message("Build system will fetch SPDLOG")
message("==================================================================")
# Fetch spdlog v1.6 (COMBINE-lab fork) and stage it under external/spdlog.
# The superseded v1.5 download/rename lines (diff residue) have been
# removed so only one version is fetched.
ExternalProject_Add(libspdlog
    DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
    DOWNLOAD_COMMAND curl -k -L https://github.com/COMBINE-lab/spdlog/archive/v1.6.tar.gz -o spdlog-v1.6.tar.gz &&
        tar -xzf spdlog-v1.6.tar.gz &&
        rm -fr spdlog &&
        mv -f spdlog-1.6 spdlog
    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/spdlog
    INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
    CONFIGURE_COMMAND ""
@@ -534,12 +535,39 @@ if (NOT HAVE_FAST_MALLOC)
set (HAVE_FAST_MALLOC TRUE)
endif ()
##
## This dependency is for RapMap
##
# ------------------------------------------------------------------
# SparseHash (Google sparse/dense hash maps), needed by RapMap.
# Fetched from the COMBINE-lab mirror, configured with autotools
# in-source (BUILD_IN_SOURCE), and installed under external/install.
# ------------------------------------------------------------------
message("Build system will fetch and build SparseHash")
message("==================================================================")
ExternalProject_Add(libsparsehash
DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
DOWNLOAD_COMMAND curl -k -L https://github.com/COMBINE-lab/sparsehash/archive/sparsehash-2.0.2.tar.gz -o sparsehash-2.0.2.tar.gz &&
tar -xzf sparsehash-2.0.2.tar.gz
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/sparsehash-sparsehash-2.0.2
BUILD_IN_SOURCE TRUE
INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
# Pass the configured compilers through to the autotools build.
CONFIGURE_COMMAND sh -c "CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ./configure --prefix=<INSTALL_DIR>"
INSTALL_COMMAND make install
)
###
#
# Done building external dependencies.
#
###
###
#
# Grab RapMap sources for quasi-mapping code --- DURING CONFIGURE TIME!
#
####
# Fetch the RapMap sources (quasi-mapping support) once, at configure time.
# exec_program() has been deprecated since CMake 3.0; execute_process() is
# the supported replacement and lets us detect (and fail on) a bad fetch
# instead of silently caching FETCHED_RAPMAP=TRUE after a failed download.
if(NOT FETCHED_RAPMAP)
    execute_process(COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/scripts/fetchRapMap.sh
                    RESULT_VARIABLE FETCH_RAPMAP_RESULT)
    if(NOT FETCH_RAPMAP_RESULT EQUAL 0)
        message(FATAL_ERROR "Fetching RapMap failed (fetchRapMap.sh exited with ${FETCH_RAPMAP_RESULT})")
    endif()
    set(FETCHED_RAPMAP TRUE CACHE BOOL "Has RapMap been fetched?" FORCE)
endif()
set (CPACK_SOURCE_IGNORE_FILES
"/src/PCA.cpp"
"/src/PCAUtils.cpp"
@@ -567,6 +595,7 @@ message("CPACK_SOURCE_IGNORE_FILES = ${CPACK_SOURCE_IGNORE_FILES}")
# Recurse into Salmon source directory
add_subdirectory ( src )
# NOTE(review): ensures RapMap sources are staged before salmon compiles;
# assumes a "RapMap" target is defined under src/ — confirm.
add_dependencies(salmon RapMap)
# build a CPack driven installer package
include (CPack)
View
@@ -77,88 +77,37 @@ class ReadExperiment {
throw std::invalid_argument(ss.str());
}
*/
salmonIndex_.reset(new SalmonIndex(sopt.jointLog));
salmonIndex_->load(indexDirectory);
bwaidx_t* idx_ = salmonIndex_->bwaIndex();
size_t numRecords = idx_->bns->n_seqs;
std::vector<Transcript> transcripts_tmp;
fmt::print(stderr, "Index contained {} targets\n", numRecords);
//transcripts_.resize(numRecords);
for (auto i : boost::irange(size_t(0), numRecords)) {
uint32_t id = i;
char* name = idx_->bns->anns[i].name;
uint32_t len = idx_->bns->anns[i].len;
// copy over the length, then we're done.
transcripts_tmp.emplace_back(id, name, len);
}
std::sort(transcripts_tmp.begin(), transcripts_tmp.end(),
[](const Transcript& t1, const Transcript& t2) -> bool {
return t1.id < t2.id;
});
double alpha = 0.005;
char nucTab[256];
nucTab[0] = 'A'; nucTab[1] = 'C'; nucTab[2] = 'G'; nucTab[3] = 'T';
for (size_t i = 4; i < 256; ++i) { nucTab[i] = 'N'; }
// Load the transcript sequence from file
for (auto& t : transcripts_tmp) {
transcripts_.emplace_back(t.id, t.RefName.c_str(), t.RefLength, alpha);
/* from BWA */
uint8_t* rseq = nullptr;
int64_t tstart, tend, compLen, l_pac = idx_->bns->l_pac;
tstart = idx_->bns->anns[t.id].offset;
tend = tstart + t.RefLength;
rseq = bns_get_seq(l_pac, idx_->pac, tstart, tend, &compLen);
if (compLen != t.RefLength) {
fmt::print(stderr,
"For transcript {}, stored length ({}) != computed length ({}) --- index may be corrupt. exiting\n",
t.RefName, compLen, t.RefLength);
std::exit(1);
}
std::string seq(t.RefLength, ' ');
if (rseq != 0) {
for (size_t i = 0; i < compLen; ++i) { seq[i] = nucTab[rseq[i]]; }
}
auto& txp = transcripts_.back();
txp.Sequence = salmon::stringtools::encodeSequenceInSAM(seq.c_str(), t.RefLength);
// Length classes taken from
// ======
// Roberts, Adam, et al.
// "Improving RNA-Seq expression estimates by correcting for fragment bias."
// Genome Biol 12.3 (2011): R22.
// ======
// perhaps, define these in a more data-driven way
if (t.RefLength <= 1334) {
txp.lengthClassIndex(0);
} else if (t.RefLength <= 2104) {
txp.lengthClassIndex(0);
} else if (t.RefLength <= 2988) {
txp.lengthClassIndex(0);
} else if (t.RefLength <= 4389) {
txp.lengthClassIndex(0);
} else {
txp.lengthClassIndex(0);
}
/*
std::cerr << "TS = " << t.RefName << " : \n";
std::cerr << seq << "\n VS \n";
for (size_t i = 0; i < t.RefLength; ++i) {
std::cerr << transcripts_.back().charBaseAt(i);
}
std::cerr << "\n\n";
*/
free(rseq);
/* end BWA code */
// ==== Figure out the index type
boost::filesystem::path versionPath = indexDirectory / "versionInfo.json";
SalmonIndexVersionInfo versionInfo;
versionInfo.load(versionPath);
if (versionInfo.indexVersion() == 0) {
fmt::MemoryWriter infostr;
infostr << "Error: The index version file " << versionPath.string()
<< " doesn't seem to exist. Please try re-building the salmon "
"index.";
throw std::invalid_argument(infostr.str());
}
// Since we have the de-coded reference sequences, we no longer need
// the encoded sequences, so free them.
/** TEST OPT **/
// free(idx_->pac); idx_->pac = nullptr;
/** END TEST OPT **/
transcripts_tmp.clear();
// ====== Done loading the transcripts from file
// Check index version compatibility here
auto indexType = versionInfo.indexType();
// ==== Figure out the index type
salmonIndex_.reset(new SalmonIndex(sopt.jointLog, indexType));
salmonIndex_->load(indexDirectory);
// Now we'll have either an FMD-based index or a QUASI index
// dispatch on the correct type.
switch (salmonIndex_->indexType()) {
case IndexType::QUASI:
loadTranscriptsFromQuasi();
break;
case IndexType::FMD:
loadTranscriptsFromFMD();
break;
}
// Create the cluster forest for this set of transcripts
clusters_.reset(new ClusterForest(transcripts_.size(), transcripts_));
@@ -188,6 +137,138 @@ class ReadExperiment {
}
}
SalmonIndex* getIndex() { return salmonIndex_.get(); }
// Populate transcripts_ from a quasi-mapping (RapMap) index.
// For each target in the RapMapSAIndex held by salmonIndex_, appends a
// Transcript carrying its id, name, length, prior mass (alpha) and
// SAM-encoded sequence, and assigns it a fragment-bias length class.
void loadTranscriptsFromQuasi() {
    RapMapSAIndex* idx_ = salmonIndex_->quasiIndex();
    size_t numRecords = idx_->txpNames.size();
    fmt::print(stderr, "Index contained {} targets\n", numRecords);
    // Prior (smoothing) mass given to every transcript.
    double alpha = 0.005;
    for (size_t i = 0; i < numRecords; ++i) {
        uint32_t id = static_cast<uint32_t>(i);
        const char* name = idx_->txpNames[i].c_str();
        uint32_t len = idx_->txpLens[i];
        transcripts_.emplace_back(id, name, len, alpha);
        auto& txp = transcripts_.back();
        // The transcript sequence lives in one concatenated string in the
        // index; slice out this transcript's region and encode it for SAM.
        auto txpSeq = idx_->seq.substr(idx_->txpOffsets[i], len);
        txp.Sequence = salmon::stringtools::encodeSequenceInSAM(txpSeq.c_str(), len);
        // Length classes taken from
        // ======
        // Roberts, Adam, et al.
        // "Improving RNA-Seq expression estimates by correcting for fragment bias."
        // Genome Biol 12.3 (2011): R22.
        // ======
        // perhaps, define these in a more data-driven way
        // NOTE(review): every branch previously assigned class 0, leaving the
        // threshold chain dead; assign the five distinct classes the
        // thresholds imply — TODO confirm downstream tables hold >= 5 classes.
        if (txp.RefLength <= 1334) {
            txp.lengthClassIndex(0);
        } else if (txp.RefLength <= 2104) {
            txp.lengthClassIndex(1);
        } else if (txp.RefLength <= 2988) {
            txp.lengthClassIndex(2);
        } else if (txp.RefLength <= 4389) {
            txp.lengthClassIndex(3);
        } else {
            txp.lengthClassIndex(4);
        }
    }
    // ====== Done loading the transcripts from file
}
// Populate transcripts_ from an FMD (BWA) index.
// Pass 1 gathers id/name/length for every reference in the bwaidx_t;
// pass 2 decodes each transcript's 2-bit packed sequence, SAM-encodes it,
// and assigns a fragment-bias length class. Exits the process if a stored
// length disagrees with the decoded length (corrupt index).
void loadTranscriptsFromFMD() {
    bwaidx_t* idx_ = salmonIndex_->bwaIndex();
    size_t numRecords = idx_->bns->n_seqs;
    std::vector<Transcript> transcripts_tmp;
    fmt::print(stderr, "Index contained {} targets\n", numRecords);
    // Pass 1: record id, name, and length for each reference sequence.
    for (size_t i = 0; i < numRecords; ++i) {
        uint32_t id = static_cast<uint32_t>(i);
        char* name = idx_->bns->anns[i].name;
        uint32_t len = idx_->bns->anns[i].len;
        transcripts_tmp.emplace_back(id, name, len);
    }
    // Keep transcripts ordered by their index id.
    std::sort(transcripts_tmp.begin(), transcripts_tmp.end(),
              [](const Transcript& t1, const Transcript& t2) -> bool {
                  return t1.id < t2.id;
              });
    // Prior (smoothing) mass given to every transcript.
    double alpha = 0.005;
    // 2-bit code -> nucleotide; anything outside {0,1,2,3} decodes to 'N'.
    char nucTab[256];
    nucTab[0] = 'A'; nucTab[1] = 'C'; nucTab[2] = 'G'; nucTab[3] = 'T';
    for (size_t i = 4; i < 256; ++i) { nucTab[i] = 'N'; }
    // Pass 2: decode each transcript's sequence from the packed reference.
    for (auto& t : transcripts_tmp) {
        transcripts_.emplace_back(t.id, t.RefName.c_str(), t.RefLength, alpha);
        /* from BWA */
        uint8_t* rseq = nullptr;
        int64_t tstart, tend, compLen, l_pac = idx_->bns->l_pac;
        tstart = idx_->bns->anns[t.id].offset;
        tend = tstart + t.RefLength;
        rseq = bns_get_seq(l_pac, idx_->pac, tstart, tend, &compLen);
        // Explicit cast avoids a signed/unsigned comparison (compLen is
        // int64_t, RefLength is uint32_t).
        if (compLen != static_cast<int64_t>(t.RefLength)) {
            fmt::print(stderr,
                       "For transcript {}, stored length ({}) != computed length ({}) --- index may be corrupt. exiting\n",
                       t.RefName, compLen, t.RefLength);
            std::exit(1);
        }
        std::string seq(t.RefLength, ' ');
        if (rseq != nullptr) {
            // Loop index matches compLen's type (int64_t), not size_t.
            for (int64_t i = 0; i < compLen; ++i) { seq[i] = nucTab[rseq[i]]; }
        }
        auto& txp = transcripts_.back();
        txp.Sequence = salmon::stringtools::encodeSequenceInSAM(seq.c_str(), t.RefLength);
        // Length classes taken from
        // ======
        // Roberts, Adam, et al.
        // "Improving RNA-Seq expression estimates by correcting for fragment bias."
        // Genome Biol 12.3 (2011): R22.
        // ======
        // perhaps, define these in a more data-driven way
        // NOTE(review): every branch previously assigned class 0, leaving the
        // threshold chain dead; assign the five distinct classes the
        // thresholds imply — TODO confirm downstream tables hold >= 5 classes.
        if (t.RefLength <= 1334) {
            txp.lengthClassIndex(0);
        } else if (t.RefLength <= 2104) {
            txp.lengthClassIndex(1);
        } else if (t.RefLength <= 2988) {
            txp.lengthClassIndex(2);
        } else if (t.RefLength <= 4389) {
            txp.lengthClassIndex(3);
        } else {
            txp.lengthClassIndex(4);
        }
        free(rseq);
        /* end BWA code */
    }
    // Since we have the de-coded reference sequences, we no longer need
    // the encoded sequences, so free them.
    /** TEST OPT **/
    // free(idx_->pac); idx_->pac = nullptr;
    /** END TEST OPT **/
    transcripts_tmp.clear();
    // ====== Done loading the transcripts from file
}
template <typename CallbackT>
bool processReads(const uint32_t& numThreads, CallbackT& processReadLibrary) {
bool burnedIn = (totalAssignedFragments_ + numAssignedFragments_ > 5000000);
View
@@ -6,7 +6,7 @@
#include "LibraryFormat.hpp"
#include "SalmonUtils.hpp"
#include "format.h"
#include "spdlog/details/format.h"
struct ReadPair {
bam_seq_t* read1 = nullptr;
Oops, something went wrong.

0 comments on commit 234cb13

Please sign in to comment.