Skip to content

Commit

Permalink
Merge pull request #665 from OceanGenomics/parserefactor
Browse files Browse the repository at this point in the history
Refactoring
  • Loading branch information
rob-p committed Sep 27, 2021
2 parents 96ea980 + 373ff32 commit 98870fd
Show file tree
Hide file tree
Showing 9 changed files with 280 additions and 287 deletions.
31 changes: 8 additions & 23 deletions include/ReadExperiment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

// Boost includes
#include <boost/filesystem.hpp>
#include <boost/filesystem/path.hpp>
#include <boost/range/irange.hpp>

// Cereal includes
Expand All @@ -48,10 +49,12 @@ class ReadExperiment {
public:
ReadExperiment(std::vector<ReadLibrary>& readLibraries,
// const boost::filesystem::path& transcriptFile,
const boost::filesystem::path& indexDirectory,
SalmonIndex* salmonIndex,
// const boost::filesystem::path& indexDirectory,
SalmonOpts& sopt)
: readLibraries_(readLibraries),
// transcriptFile_(transcriptFile),
salmonIndex_(salmonIndex),
transcripts_(std::vector<Transcript>()), totalAssignedFragments_(0),
fragStartDists_(5), posBiasFW_(5), posBiasRC_(5), posBiasExpectFW_(5),
posBiasExpectRC_(5), /*seqBiasModel_(1.0),*/ eqBuilder_(sopt.jointLog, sopt.maxHashResizeThreads),
Expand Down Expand Up @@ -115,24 +118,6 @@ class ReadExperiment {
}
*/

// ==== Figure out the index type
boost::filesystem::path versionPath = indexDirectory / "versionInfo.json";
SalmonIndexVersionInfo versionInfo;
versionInfo.load(versionPath);
if (versionInfo.indexVersion() == 0) {
fmt::MemoryWriter infostr;
infostr << "Error: The index version file " << versionPath.string()
<< " doesn't seem to exist. Please try re-building the salmon "
"index.";
throw std::invalid_argument(infostr.str());
}
// Check index version compatibility here
auto indexType = versionInfo.indexType();
// ==== Figure out the index type

salmonIndex_.reset(new SalmonIndex(sopt.jointLog, indexType));
salmonIndex_->load(indexDirectory);

// Now we'll have either an FMD-based index or a QUASI index
// dispatch on the correct type.
fmt::MemoryWriter infostr;
Expand All @@ -159,7 +144,7 @@ class ReadExperiment {
// Create the cluster forest for this set of transcripts
clusters_.reset(new ClusterForest(transcripts_.size(), transcripts_));
}

EQBuilderT& equivalenceClassBuilder() { return eqBuilder_; }

std::string getIndexSeqHash256() const { return salmonIndex_->seqHash256(); }
Expand Down Expand Up @@ -262,7 +247,7 @@ class ReadExperiment {
}
}

SalmonIndex* getIndex() { return salmonIndex_.get(); }
SalmonIndex* getIndex() { return salmonIndex_; }

template <typename PuffIndexT>
void loadTranscriptsFromPuff(PuffIndexT* idx_, const SalmonOpts& sopt) {
Expand Down Expand Up @@ -416,7 +401,7 @@ class ReadExperiment {
std::atomic<bool> burnedIn{
totalAssignedFragments_ + numAssignedFragments_ >= sopt.numBurninFrags};
for (auto& rl : readLibraries_) {
processReadLibrary(rl, salmonIndex_.get(), transcripts_, clusterForest(),
processReadLibrary(rl, salmonIndex_, transcripts_, clusterForest(),
*(fragLengthDist_.get()), numAssignedFragments_,
numThreads, burnedIn);
}
Expand Down Expand Up @@ -806,7 +791,7 @@ class ReadExperiment {
/**
* The index we've built on the set of transcripts.
*/
std::unique_ptr<SalmonIndex> salmonIndex_{nullptr};
SalmonIndex* salmonIndex_{nullptr};
/**
* The cluster forest maintains the dynamic relationship
* defined by transcripts and reads --- if two transcripts
Expand Down
5 changes: 5 additions & 0 deletions include/SalmonIndex.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,4 +246,9 @@ class SalmonIndex {
std::string decoyNameHash256_;
};

// Convenience function to load an index
std::unique_ptr<SalmonIndex>
checkLoadIndex(const boost::filesystem::path& indexDirectory,
std::shared_ptr<spdlog::logger>& logger);

#endif //__SALMON_INDEX_HPP
52 changes: 26 additions & 26 deletions src/Alevin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
<HEADER
**/

#include <memory>
#include <random>
#include <algorithm>
#include <atomic>
Expand Down Expand Up @@ -63,11 +64,11 @@

// salmon includes
#include "FastxParser.hpp"
#include "ProgramOptionsGenerator.hpp"
#include "SalmonConfig.hpp"
#include "SalmonDefaults.hpp"
#include "SalmonOpts.hpp"
#include "SalmonUtils.hpp"
#include "ProgramOptionsGenerator.hpp"

using paired_parser_qual = fastx_parser::FastxParser<fastx_parser::ReadQualPair>;
using single_parser = fastx_parser::FastxParser<fastx_parser::ReadSeq>;
Expand All @@ -78,20 +79,18 @@ namespace apt = alevin::protocols;
namespace aut = alevin::utils;

template <typename ProtocolT>
int alevin_sc_align(AlevinOpts<ProtocolT>& aopt,
SalmonOpts& sopt,
boost::program_options::parsed_options& orderedOptions);
int alevin_sc_align(AlevinOpts<ProtocolT>& aopt, SalmonOpts& sopt,
boost::program_options::parsed_options& orderedOptions,
std::unique_ptr<SalmonIndex>& salmonIndex);

template <typename ProtocolT>
int alevinQuant(AlevinOpts<ProtocolT>& aopt,
SalmonOpts& sopt,
SoftMapT& barcodeMap,
TrueBcsT& trueBarcodes,
int alevinQuant(AlevinOpts<ProtocolT>& aopt, SalmonOpts& sopt,
SoftMapT& barcodeMap, TrueBcsT& trueBarcodes,
spp::sparse_hash_map<uint32_t, uint32_t>& txpToGeneMap,
spp::sparse_hash_map<std::string, uint32_t>& geneIdxMap,
boost::program_options::parsed_options& orderedOptions,
CFreqMapT& freqCounter,
size_t numLowConfidentBarcode);
CFreqMapT& freqCounter, size_t numLowConfidentBarcode,
std::unique_ptr<SalmonIndex>& salmonIndex);

//colors for progress monitoring
const char RESET_COLOR[] = "\x1b[0m";
Expand Down Expand Up @@ -835,7 +834,8 @@ void initiatePipeline(AlevinOpts<ProtocolT>& aopt,
boost::program_options::variables_map& vm,
std::string commentString, bool noTgMap,
std::vector<std::string> barcodeFiles,
std::vector<std::string> readFiles){
std::vector<std::string> readFiles,
std::unique_ptr<SalmonIndex>& salmonIndex){
bool isOptionsOk = aut::processAlevinOpts(aopt, sopt, noTgMap, vm);
if (!isOptionsOk){
aopt.jointLog->flush();
Expand Down Expand Up @@ -894,14 +894,14 @@ void initiatePipeline(AlevinOpts<ProtocolT>& aopt,
// write out the cmd_info.json to make sure we have that
boost::filesystem::path outputDirectory = vm["output"].as<std::string>();
bool isWriteOk = aut::writeCmdInfo(outputDirectory / "cmd_info.json", orderedOptions);

if(!isWriteOk){
fmt::print(stderr, "Writing cmd_info.json in output directory failed.\nExiting now.");
exit(1);
}

// do the actual mapping
auto rc = alevin_sc_align(aopt, sopt, orderedOptions);

auto rc = alevin_sc_align(aopt, sopt, orderedOptions, salmonIndex);
if (rc == 0) {
aopt.jointLog->info("sc-align successful.");
} else {
Expand Down Expand Up @@ -949,7 +949,7 @@ void initiatePipeline(AlevinOpts<ProtocolT>& aopt,
aopt.jointLog->info("Done with Barcode Processing; Moving to Quantify\n");
alevinQuant(aopt, sopt, barcodeSoftMap, trueBarcodes,
txpToGeneMap, geneIdxMap, orderedOptions,
freqCounter, numLowConfidentBarcode);
freqCounter, numLowConfidentBarcode, salmonIndex);
}
else{
boost::filesystem::path cmdInfoPath = vm["output"].as<std::string>();
Expand All @@ -962,7 +962,7 @@ void initiatePipeline(AlevinOpts<ProtocolT>& aopt,
}
}

int salmonBarcoding(int argc, const char* argv[]) {
int salmonBarcoding(int argc, const char* argv[], std::unique_ptr<SalmonIndex>& salmonIndex) {
namespace bfs = boost::filesystem;
namespace po = boost::program_options;

Expand Down Expand Up @@ -1077,7 +1077,7 @@ salmon-based processing of single-cell RNA-seq data.
//aopt.jointLog->warn("Using DropSeq Setting for Alevin");
initiatePipeline(aopt, sopt, orderedOptions,
vm, commentString, noTgMap,
barcodeFiles, readFiles);
barcodeFiles, readFiles, salmonIndex);
}
else if(indrop){
std::cout<<"Indrop get neighbors removed, please use other protocols";
Expand All @@ -1089,7 +1089,7 @@ salmon-based processing of single-cell RNA-seq data.
//aopt.jointLog->warn("Using InDrop Setting for Alevin");
initiatePipeline(aopt, sopt, orderedOptions,
vm, commentString, noTgMap,
barcodeFiles, readFiles);
barcodeFiles, readFiles, salmonIndex);
}
else{
fmt::print(stderr, "ERROR: indrop needs w1 flag too.\n Exiting Now");
Expand All @@ -1105,7 +1105,7 @@ salmon-based processing of single-cell RNA-seq data.
//aopt.jointLog->warn("Using InDrop Setting for Alevin");
initiatePipeline(aopt, sopt, orderedOptions,
vm, commentString, noTgMap,
barcodeFiles, readFiles);
barcodeFiles, readFiles, salmonIndex);
}
else{
fmt::print(stderr, "ERROR: citeseq needs featureStart and featureLength flag too.\n Exiting Now");
Expand All @@ -1117,54 +1117,54 @@ salmon-based processing of single-cell RNA-seq data.
//aopt.jointLog->warn("Using 10x v3 Setting for Alevin");
initiatePipeline(aopt, sopt, orderedOptions,
vm, commentString, noTgMap,
barcodeFiles, readFiles);
barcodeFiles, readFiles, salmonIndex);
}
else if(chrom){
AlevinOpts<apt::Chromium> aopt;
//aopt.jointLog->warn("Using 10x v2 Setting for Alevin");
initiatePipeline(aopt, sopt, orderedOptions,
vm, commentString, noTgMap,
barcodeFiles, readFiles);
barcodeFiles, readFiles, salmonIndex);
}
else if(gemcode){
AlevinOpts<apt::Gemcode> aopt;
//aopt.jointLog->warn("Using 10x v1 Setting for Alevin");
initiatePipeline(aopt, sopt, orderedOptions,
vm, commentString, noTgMap,
unmateFiles, readFiles);
unmateFiles, readFiles, salmonIndex);
}
else if(celseq){
AlevinOpts<apt::CELSeq> aopt;
//aopt.jointLog->warn("Using CEL-Seq Setting for Alevin");
initiatePipeline(aopt, sopt, orderedOptions,
vm, commentString, noTgMap,
barcodeFiles, readFiles);
barcodeFiles, readFiles, salmonIndex);
}
else if(celseq2){
AlevinOpts<apt::CELSeq2> aopt;
//aopt.jointLog->warn("Using CEL-Seq2 Setting for Alevin");
initiatePipeline(aopt, sopt, orderedOptions,
vm, commentString, noTgMap,
barcodeFiles, readFiles);
barcodeFiles, readFiles, salmonIndex);
}
else if(quartzseq2){
AlevinOpts<apt::QuartzSeq2> aopt;
//aopt.jointLog->warn("Using Quartz-Seq2 Setting for Alevin");
initiatePipeline(aopt, sopt, orderedOptions,
vm, commentString, noTgMap,
barcodeFiles, readFiles);
barcodeFiles, readFiles, salmonIndex);
} else if (custom_old) {
AlevinOpts<apt::Custom> aopt;
//aopt.jointLog->warn("Using Custom Setting for Alevin");
initiatePipeline(aopt, sopt, orderedOptions,
vm, commentString, noTgMap,
barcodeFiles, readFiles);
barcodeFiles, readFiles, salmonIndex);
} else if (custom_new) {
AlevinOpts<apt::CustomGeometry> aopt;
//aopt.jointLog->warn("Using Custom Setting for Alevin");
initiatePipeline(aopt, sopt, orderedOptions,
vm, commentString, noTgMap,
barcodeFiles, readFiles);
barcodeFiles, readFiles, salmonIndex);
}

} catch (po::error& e) {
Expand Down
24 changes: 23 additions & 1 deletion src/BuildSalmonIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
// http://stackoverflow.com/questions/108318/whats-the-simplest-way-to-test-whether-a-number-is-a-power-of-2-in-c
bool isPowerOfTwo(uint32_t n) { return (n > 0 and (n & (n - 1)) == 0); }

int salmonIndex(int argc, const char* argv[]) {
int salmonIndex(int argc, const char* argv[], std::unique_ptr<SalmonIndex>& /* salmonIndex */) {

using std::string;
namespace bfs = boost::filesystem;
Expand Down Expand Up @@ -256,3 +256,25 @@ Creates a salmon index.
}
return ret;
}

std::unique_ptr<SalmonIndex> checkLoadIndex(const boost::filesystem::path& indexDirectory, std::shared_ptr<spdlog::logger>& logger) {
// ==== Figure out the index type
boost::filesystem::path versionPath =
indexDirectory / "versionInfo.json";
SalmonIndexVersionInfo versionInfo;
versionInfo.load(versionPath);
if (versionInfo.indexVersion() == 0) {
fmt::MemoryWriter infostr;
infostr
<< "Error: The index version file " << versionPath.string()
<< " doesn't seem to exist. Please try re-building the salmon "
"index.";
throw std::invalid_argument(infostr.str());
}
// Check index version compatibility here
auto indexType = versionInfo.indexType();
// ==== Figure out the index type
std::unique_ptr<SalmonIndex> res(new SalmonIndex(logger, indexType));
res->load(indexDirectory);
return res;
}

0 comments on commit 98870fd

Please sign in to comment.