Merge branch 'develop' into memory-squeeze
rob-p committed Feb 11, 2020
2 parents b7d836f + 97287dc commit fb165d7
Showing 7 changed files with 125 additions and 94 deletions.
1 change: 1 addition & 0 deletions include/ProgOpts.hpp
@@ -24,6 +24,7 @@ class IndexOptions {
uint32_t lossy_rate{5};
int32_t filt_size{-1};
bool buildEdgeVec{false};
bool buildEqCls{false};
std::string twopaco_tmp_dir{""};
};

18 changes: 14 additions & 4 deletions include/PufferfishBinaryGFAReader.hpp
@@ -35,8 +35,9 @@ class BinaryGFAReader {
std::string id;
};

pufferfish::util::PackedContigInfoVec contigid2seq;

pufferfish::util::PackedContigInfoVec contigid2seq;

// path maps each transcript_id to a pair of <contig_id, orientation>
// orientation : +/true main, -/false reverse
spp::sparse_hash_map<uint64_t, std::vector<std::pair<uint64_t, bool>>> path;
@@ -66,19 +67,24 @@ class BinaryGFAReader {
std::vector<stx::string_view> split(stx::string_view str, char delims);

bool buildEdgeVec_{false};
bool buildEqClses_{false};
std::shared_ptr<spdlog::logger> logger_{nullptr};
compact::vector<uint64_t>* cpos_offsets;

public:
spp::sparse_hash_map<uint64_t, std::vector<pufferfish::util::Position>> contig2pos;
// spp::sparse_hash_map<uint64_t, std::vector<pufferfish::util::Position>>
std::vector<pufferfish::util::Position> contig2pos;

BinaryGFAReader(const char* gfaFileName, size_t input_k,
bool buildEdgeVEc, std::shared_ptr<spdlog::logger> logger);
bool buildEqClses, bool buildEdgeVEc,
std::shared_ptr<spdlog::logger> logger);

/*void encodeSeq(sdsl::int_vector<2>& seqVec, size_t offset,
stx::string_view str);
*/
void encodeSeq(compact::vector<uint64_t, 2>& seqVec, size_t offset,
stx::string_view str);

//spp::sparse_hash_map<uint64_t, pufferfish::util::PackedContigInfo>& getContigNameMap();
pufferfish::util::PackedContigInfoVec& getContigNameMap();

@@ -94,7 +100,11 @@ class BinaryGFAReader {
void serializeContigTable(const std::string& odir,
const std::vector<std::pair<std::string, uint16_t>>& shortRefsNameLen,
const std::vector<uint32_t>& refIdExtensions);
void deserializeContigTable();
uint64_t getContigLength(uint64_t i) {
return (i < contigid2seq.size()-1?contigid2seq[i+1]:rankVec_.size()) - contigid2seq[i];
}

void deserializeContigTable();
// void writeFile(std::string fileName);
};
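
The new getContigLength above derives each contig's length from consecutive start offsets in contigid2seq, with the total sequence length (rankVec_.size()) closing off the last contig. A minimal sketch of that idea with plain vectors (hypothetical helper, not the class method):

#include <cstdint>
#include <iostream>
#include <vector>

// offsets[i] is the start of contig i in the concatenated sequence; total_len plays the
// role of rankVec_.size() and bounds the final contig.
uint64_t contig_length(const std::vector<uint64_t>& offsets, uint64_t total_len, uint64_t i) {
    uint64_t end = (i + 1 < offsets.size()) ? offsets[i + 1] : total_len;
    return end - offsets[i];
}

int main() {
    std::vector<uint64_t> offsets = {0, 4, 7};  // three contigs starting at 0, 4, and 7
    uint64_t total_len = 12;                    // length of the concatenated sequence
    for (uint64_t i = 0; i < offsets.size(); ++i)
        std::cout << "contig " << i << " length = " << contig_length(offsets, total_len, i) << "\n";
    return 0;  // prints lengths 4, 3, and 5
}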

10 changes: 10 additions & 0 deletions include/Util.hpp
@@ -1025,6 +1025,12 @@ Compile-time selection between list-like and map-like printing.
ar(transcript_id_, pos_);
}

void update(uint32_t tid, uint32_t tpos, bool torien) {
transcript_id_ = tid;
pos_ = tpos;
setOrientation(torien);
}

private:
// uint32_t orientMask_
};
@@ -1061,6 +1067,10 @@ Compile-time selection between list-like and map-like printing.
size_t fileOrder;
size_t offset;
uint32_t length;

PackedContigInfo(size_t fileOrder, size_t offset, uint32_t length) : fileOrder(fileOrder),
offset(offset),
length(length) {}
};

struct PackedContigInfoVec {
1 change: 1 addition & 0 deletions src/Pufferfish.cpp
@@ -140,6 +140,7 @@ int main(int argc, char* argv[]) {
(option("-k", "--klen") & value("kmer_length", indexOpt.k)) % "length of the k-mer with which the dBG was built (default = 31)",
(option("-p", "--threads") & value("threads", indexOpt.p)) % "total number of threads to use for building MPHF (default = 16)",
(option("-l", "--build-edges").set(indexOpt.buildEdgeVec, true) % "build and record explicit edge table for the contaigs of the ccdBG (default = false)"),
(option("-q", "--build-eqclses").set(indexOpt.buildEqCls, true) % "build and record equivalence classes (default = false)"),
(((option("-s", "--sparse").set(indexOpt.isSparse, true)) % "use the sparse pufferfish index (less space, but slower lookup)",
((option("-e", "--extension") & value("extension_size", indexOpt.extensionSize)) % "length of the extension to store in the sparse index (default = 4)")) |
((option("-x", "--lossy-rate").set(indexOpt.lossySampling, true)) & value("lossy_rate", indexOpt.lossy_rate) % "use the lossy sampling index with a sampling rate of x (less space and fast, but lower sensitivity)"))
165 changes: 86 additions & 79 deletions src/PufferfishBinaryGFAReader.cpp
@@ -81,13 +81,15 @@ namespace pufferfish {
return ret;
}

BinaryGFAReader::BinaryGFAReader(const char *binDir, size_t input_k, bool buildEdgeVec,
BinaryGFAReader::BinaryGFAReader(const char *binDir, size_t input_k,
bool buildEqClses, bool buildEdgeVec,
std::shared_ptr<spdlog::logger> logger) {
logger_ = logger;
filename_ = std::string(binDir);
logger_->info("Setting the index/BinaryGfa directory {}", binDir);
k = input_k;
buildEdgeVec_ = buildEdgeVec;
buildEqClses_ = buildEqClses;
{
CLI::AutoTimer timer{"Loading contigs", CLI::Timer::Big};
std::string bfile = filename_ + "/seq.bin";
@@ -126,14 +128,15 @@ namespace pufferfish {
// fill out contigId2Seq
std::unique_ptr<rank9sel> rankSelDict{nullptr};
rankSelDict.reset(new rank9sel(&rankVec_, rankVec_.size()));
auto contig2seqSize = rankSelDict->rank(rankVec_.size()-1) + 1;
std::cerr << contig2seqSize << "\n";
logger_->info("Done wrapping the rank vector with a rank9sel structure.");
contigid2seq = pufferfish::util::PackedContigInfoVec(
rankVec_.size(),
static_cast<uint64_t>(rankSelDict->rank(rankVec_.size()-1)));

Review comment from @fataltes (Collaborator), Feb 11, 2020:

@rob-p looks like it needs + 1 (assuming the last bit is always 1)
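
As a side note on that +1, here is a minimal, hypothetical sketch (a naive rank over a plain bit vector, not the actual rank9sel API) of why the contig count is one more than the rank taken just before the final boundary bit, assuming the last bit of rankVec_ is always set:

// Minimal illustration; assumed semantics: rank(i) counts set bits strictly before position i.
#include <cstdint>
#include <iostream>
#include <vector>

static uint64_t naive_rank(const std::vector<uint8_t>& bits, uint64_t i) {
    uint64_t r = 0;
    for (uint64_t p = 0; p < i; ++p) r += bits[p];
    return r;
}

int main() {
    // Boundary vector for three contigs of lengths 4, 3, and 5:
    // a 1 marks the last base of each contig, so the final bit is always 1.
    std::vector<uint8_t> rankVec = {0,0,0,1, 0,0,1, 0,0,0,0,1};
    uint64_t n = rankVec.size();
    std::cout << naive_rank(rankVec, n - 1) << "\n";      // 2: misses the final boundary
    std::cout << naive_rank(rankVec, n - 1) + 1 << "\n";  // 3: the actual number of contigs
    return 0;
}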

while (nextPos < rankVec_.size() and nextPos != 0) {
nextPos = static_cast<uint64_t>(rankSelDict->select(contigCntr)) + 1;// select(0) is meaningful
contigid2seq.add(prevPos);
//contigid2seq[contigCntr] = {contigCntr, prevPos, static_cast<uint32_t>(nextPos-prevPos)};
prevPos = nextPos;
contigCntr++;
}
@@ -182,7 +185,8 @@ namespace pufferfish {
uint32_t refLength{0};
bool firstContig{true};
for (auto &ctig : path[ref_cnt]) {
int32_t l = contigid2seq[ctig.first].length - (firstContig ? 0 : (k - 1));
auto len = getContigLength(ctig.first);
uint64_t l = len - (firstContig ? 0 : (k - 1));
refLength += l;
firstContig = false;
}
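
Since consecutive contigs along a path overlap by k-1 bases, this loop computes the reference length as the sum of contig lengths minus (k-1) for every contig after the first. A small self-contained check of that arithmetic (hypothetical lengths, not from this commit):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    uint32_t k = 31;
    std::vector<uint64_t> contigLens = {100, 40, 75};  // contigs along one reference path
    uint64_t refLength = 0;
    bool firstContig = true;
    for (auto len : contigLens) {
        refLength += len - (firstContig ? 0 : (k - 1));
        firstContig = false;
    }
    std::cout << refLength << "\n";  // 100 + (40 - 30) + (75 - 30) = 155
    return 0;
}
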
@@ -203,11 +207,11 @@ namespace pufferfish {
for (size_t i = 0; i < contigs.size() - 1; i++) {
auto cid = contigs[i].first;
bool ore = contigs[i].second;
size_t forder = contigid2seq[cid].fileOrder;
size_t forder = cid;//contigid2seq[cid].fileOrder;
auto nextcid = contigs[i + 1].first;
bool nextore = contigs[i + 1].second;

bool nextForder = contigid2seq[nextcid].fileOrder;
bool nextForder = nextcid;//contigid2seq[nextcid].fileOrder;
// a+,b+ end kmer of a , start kmer of b
// a+,b- end kmer of a , rc(end kmer of b)
// a-,b+ rc(start kmer of a) , start kmer of b
@@ -223,8 +227,9 @@ namespace pufferfish {
Direction nextContigDirection;
// If a is in the forward orientation, the last k-mer comes from the end, otherwise it is the reverse complement of the first k-mer
if (ore) {
auto cinfo = contigid2seq[cid];
lastKmerInContig.fromNum(
seqVec_.get_int(2 * (contigid2seq[cid].offset + contigid2seq[cid].length - k), 2 * k));
seqVec_.get_int(2 * (cinfo.offset + cinfo.length - k), 2 * k));
contigDirection = Direction::APPEND;
} else {
lastKmerInContig.fromNum(seqVec_.get_int(2 * contigid2seq[cid].offset, 2 * k));
@@ -237,8 +242,9 @@ namespace pufferfish {
firstKmerInNextContig.fromNum(seqVec_.get_int(2 * contigid2seq[nextcid].offset, 2 * k));
nextContigDirection = Direction::PREPEND;
} else {
auto cinfo = contigid2seq[nextcid];
firstKmerInNextContig.fromNum(
seqVec_.get_int(2 * (contigid2seq[nextcid].offset + contigid2seq[nextcid].length - k),
seqVec_.get_int(2 * (cinfo.offset + cinfo.length - k),
2 * k));
firstKmerInNextContig.swap();
nextContigDirection = Direction::APPEND;
@@ -259,7 +265,7 @@ namespace pufferfish {
logger_->info("Total # of numerical Contigs : {:n}", contigid2seq.size());
}

//spp::sparse_hash_map<uint64_t, pufferfish::util::PackedContigInfo> &

pufferfish::util::PackedContigInfoVec&
BinaryGFAReader::getContigNameMap() {
return contigid2seq;
@@ -273,47 +279,64 @@ namespace pufferfish {
uint64_t pos = 0;
uint64_t accumPos;
uint64_t currContigLength = 0;
uint64_t total_output_lines = 0;
std::vector<uint64_t> cposOffsetvec(contigid2seq.size() + 1, 0);
uint64_t totalPosCnt = 0;
for (auto const &ent : path) {
const uint64_t &tr = ent.first;
const std::vector<std::pair<uint64_t, bool>> &contigs = ent.second;
for (const auto &contig : contigs) {
cposOffsetvec[contig.first + 1]++;
totalPosCnt++;
}
}
for (uint64_t i = 0; i < cposOffsetvec.size() - 1; i++) {
cposOffsetvec[i+1] = cposOffsetvec[i]+cposOffsetvec[i+1];
}
logger_->info("Total # of contig vec entries: {:n}", totalPosCnt);
auto w = static_cast<uint32_t >(std::ceil(std::log2(totalPosCnt+1)));
logger_->info("bits per offset entry {:n}", w);

contig2pos.resize(totalPosCnt);
for (auto const &ent : path) {
const uint64_t tr = ent.first;
const std::vector<std::pair<uint64_t, bool>> &contigs = ent.second;
accumPos = 0;
for (size_t i = 0; i < contigs.size(); i++) {
if (contig2pos.find(contigs[i].first) == contig2pos.end()) {
contig2pos[contigs[i].first] = {};
total_output_lines += 1;
}
if (contigid2seq.find(contigs[i].first) == contigid2seq.end()) {
logger_->info("{}", contigs[i].first);
}
for (const auto &contig : contigs) {
pos = accumPos;
currContigLength = contigid2seq[contigs[i].first].length;
contig2pos[cposOffsetvec[contig.first]].update(tr, pos, contig.second);
// std::cerr << cposOffsetvec[contig.first] << ":" << tr << " " << contig2pos[cposOffsetvec[contig.first]].transcript_id() << "\n";
cposOffsetvec[contig.first]++;
currContigLength = getContigLength(contig.first);
accumPos += currContigLength - k;
(contig2pos[contigs[i].first])
.push_back(pufferfish::util::Position(tr, pos, contigs[i].second));
}
}
logger_->info("\nTotal # of segments we have position for : {:n}", total_output_lines);
// We need the +1 here because we store the last entry that is 1 greater than the last offset
// so we must be able to represent a number of size contigVecSize+1, not contigVecSize.
cpos_offsets = new compact::vector<uint64_t>(w, contigid2seq.size());
cpos_offsets->clear_mem();
(*cpos_offsets)[0] = 0;
for (uint64_t i = 0; i < cpos_offsets->size()-1; i++) {
(*cpos_offsets)[i+1] = cposOffsetvec[i];
}
cposOffsetvec.clear();
cposOffsetvec.shrink_to_fit();
logger_->info("Done constructing the contig vector. {}", cpos_offsets->size());
}
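
The rewritten mapContig2Pos replaces the per-contig hash map with a flat vector laid out by a two-pass bucket fill: count the occurrences of each contig across all paths, prefix-sum the counts into start offsets, then drop each (transcript, position, orientation) record at its contig's running offset. A minimal sketch of that pattern with simplified record types (not the actual Position class):

#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

int main() {
    // Occurrences of contigs along reference paths: (contig_id, reference_id).
    std::vector<std::pair<uint64_t, uint32_t>> occs = {{2, 0}, {0, 0}, {2, 1}, {1, 1}, {0, 1}};
    uint64_t num_contigs = 3;

    // Pass 1: count occurrences per contig, then prefix-sum into start offsets.
    std::vector<uint64_t> offsets(num_contigs + 1, 0);
    for (auto& o : occs) offsets[o.first + 1]++;
    for (uint64_t i = 0; i < num_contigs; ++i) offsets[i + 1] += offsets[i];

    // Pass 2: place each record at its contig's running offset.
    std::vector<uint32_t> flat(occs.size());
    std::vector<uint64_t> cursor(offsets.begin(), offsets.end() - 1);
    for (auto& o : occs) flat[cursor[o.first]++] = o.second;

    // flat now groups records contig by contig: [contig 0's refs][contig 1's][contig 2's].
    for (auto r : flat) std::cout << r << " ";  // 0 1 1 0 1
    std::cout << "\n";
    return 0;
}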

void BinaryGFAReader::clearContigTable() {
refMap.clear(); refMap.shrink_to_fit();
refLengths.clear(); refLengths.shrink_to_fit();
contig2pos.clear();
contig2pos.clear(); contig2pos.shrink_to_fit();
}

// Note : We assume that odir is the name of a valid (i.e., existing) directory.
void BinaryGFAReader::serializeContigTable(const std::string &odir,
const std::vector<std::pair<std::string, uint16_t>>& shortRefsNameLen,
const std::vector<uint32_t>& refIdExtensions) {
std::string ofile = odir + "/ctable.bin";
std::string eqfile = odir + "/eqtable.bin";
std::string rlfile = odir + "/reflengths.bin";
std::ofstream ct(ofile);
std::ofstream et(eqfile);
std::ofstream rl(rlfile);
cereal::BinaryOutputArchive ar(ct);
cereal::BinaryOutputArchive eqAr(et);
cereal::BinaryOutputArchive rlAr(rl);
decltype(refLengths) tmpRefLen;
tmpRefLen.reserve(refLengths.size() + shortRefsNameLen.size());
@@ -377,65 +400,49 @@ namespace pufferfish {
std::vector<uint32_t> eqIDs;
//std::vector<std::vector<pufferfish::util::Position>> cpos;

// Compute sizes to reserve
size_t contigVecSize{0};
size_t contigOffsetSize{1};
for (auto &kv : contigid2seq) {
contigOffsetSize++;
contigVecSize += contig2pos[kv.first].size();
}

logger_->info("total contig vec entries {:n}", contigVecSize);
std::vector<pufferfish::util::Position> cpos;
cpos.reserve(contigVecSize);

// We need the +1 here because we store the last entry that is 1 greater than the last offset
// so we must be able to represent a number of size contigVecSize+1, not contigVecSize.
size_t w = std::ceil(std::log2(contigVecSize+1));
logger_->info("bits per offset entry {:n}", w);
compact::vector<uint64_t> cpos_offsets(w, contigOffsetSize);

cpos_offsets[0] = 0;
for (uint64_t idx = 0; idx < contigid2seq.size(); idx++) {
auto &b = contig2pos[idx];
cpos_offsets[idx+1] = cpos_offsets[idx] + b.size();
cpos.insert(cpos.end(), std::make_move_iterator(b.begin()), std::make_move_iterator(b.end()));
std::vector<uint32_t> tlist;
for (auto &p : contig2pos[idx]) {
tlist.push_back(p.transcript_id());
// Build and store equivalence classes
if (buildEqClses_) {
std::string eqfile = odir + "/eqtable.bin";
std::ofstream et(eqfile);
cereal::BinaryOutputArchive eqAr(et);
uint64_t offset = 0;
for (uint64_t idx = 0; idx < cpos_offsets->size(); idx++) {
std::vector<uint32_t> tlist;
while (offset < (*cpos_offsets)[idx]) {
tlist.push_back(contig2pos[offset].transcript_id());
offset++;
}
std::sort(tlist.begin(), tlist.end());
tlist.erase(std::unique(tlist.begin(), tlist.end()), tlist.end());
auto eqID = static_cast<uint32_t >(eqMap.size());
if (eqMap.contains(tlist)) {
eqID = eqMap[tlist];
} else {
eqMap[tlist] = eqID;
}
eqIDs.push_back(eqID);
}
std::sort(tlist.begin(), tlist.end());
tlist.erase(std::unique(tlist.begin(), tlist.end()), tlist.end());
size_t eqID = eqMap.size();
if (eqMap.contains(tlist)) {
eqID = eqMap[tlist];
} else {
eqMap[tlist] = eqID;
logger_->info("there were {:n} equivalence classes", eqMap.size());
eqAr(eqIDs);
eqIDs.clear();
eqIDs.shrink_to_fit();
std::vector<std::vector<uint32_t>> eqLabels;
eqLabels.reserve(eqMap.size());
for (auto &kv : eqMap) {
eqLabels.push_back(kv.first);
}
eqIDs.push_back(eqID);
// ar(contig2pos[kv.first]);
std::sort(eqLabels.begin(), eqLabels.end(),
[&](const std::vector<uint32_t> &l1,
const std::vector<uint32_t> &l2) -> bool {
return eqMap[l1] < eqMap[l2];
});
eqAr(eqLabels);
}
logger_->info("there were {:n} equivalence classes", eqMap.size());
eqAr(eqIDs);
eqIDs.clear();
eqIDs.shrink_to_fit();
std::vector<std::vector<uint32_t>> eqLabels;
eqLabels.reserve(eqMap.size());
for (auto &kv : eqMap) {
eqLabels.push_back(kv.first);
}
std::sort(eqLabels.begin(), eqLabels.end(),
[&](const std::vector<uint32_t> &l1,
const std::vector<uint32_t> &l2) -> bool {
return eqMap[l1] < eqMap[l2];
});
eqAr(eqLabels);
ar(cpos);

ar(contig2pos);
{
std::string fname = odir + "/" + pufferfish::util::CONTIG_OFFSETS;
std::ofstream bfile(fname, std::ios::binary);
cpos_offsets.serialize(bfile);
cpos_offsets->serialize(bfile);
bfile.close();
}
}
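
When buildEqClses_ is set, the pass above walks the flat contig2pos array in offset order, collects the sorted, deduplicated set of transcripts containing each contig, and assigns one equivalence-class ID per distinct set. A minimal sketch of that grouping with standard containers (std::map in place of the sparse hash map used here):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

int main() {
    // Per-contig transcript id lists (possibly unsorted, possibly with repeats).
    std::vector<std::vector<uint32_t>> contig_tids = {{1, 0, 1}, {0, 1}, {2}, {2}};

    std::map<std::vector<uint32_t>, uint32_t> eqMap;  // label -> equivalence class id
    std::vector<uint32_t> eqIDs;                      // per-contig class id
    for (auto tlist : contig_tids) {
        std::sort(tlist.begin(), tlist.end());
        tlist.erase(std::unique(tlist.begin(), tlist.end()), tlist.end());
        auto it = eqMap.find(tlist);
        uint32_t id;
        if (it == eqMap.end()) {
            id = static_cast<uint32_t>(eqMap.size());
            eqMap[tlist] = id;
        } else {
            id = it->second;
        }
        eqIDs.push_back(id);
    }
    // Contigs 0 and 1 share label {0,1}; contigs 2 and 3 share label {2}.
    for (auto id : eqIDs) std::cout << id << " ";  // 0 0 1 1
    std::cout << "\n";
    return 0;
}
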
(Diffs for the remaining changed files were not loaded on this page.)
