Skip to content

Commit

Permalink
Merge pull request #9 from COMBINE-lab/memory-squeeze
Browse files Browse the repository at this point in the history
PackedContigInfoVec
  • Loading branch information
rob-p committed Feb 11, 2020
2 parents 97287dc + a08d274 commit 95ac0ac
Show file tree
Hide file tree
Showing 5 changed files with 120 additions and 42 deletions.
15 changes: 6 additions & 9 deletions include/PufferfishBinaryGFAReader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,9 @@ class BinaryGFAReader {
std::string id;
};

// spp::sparse_hash_map<uint64_t, pufferfish::util::PackedContigInfo>
std::vector<uint64_t>
contigid2seq; // map of contig_id to # of letters in contig (contig
// length)

pufferfish::util::PackedContigInfoVec contigid2seq;

// path maps each transcript_id to a pair of <contig_id, orientation>
// orientation : +/true main, -/false reverse
spp::sparse_hash_map<uint64_t, std::vector<std::pair<uint64_t, bool>>> path;
Expand Down Expand Up @@ -85,8 +84,9 @@ class BinaryGFAReader {
*/
void encodeSeq(compact::vector<uint64_t, 2>& seqVec, size_t offset,
stx::string_view str);
// spp::sparse_hash_map<uint64_t, pufferfish::util::PackedContigInfo>&
std::vector<uint64_t> & getContigNameMap();

//spp::sparse_hash_map<uint64_t, pufferfish::util::PackedContigInfo>& getContigNameMap();
pufferfish::util::PackedContigInfoVec& getContigNameMap();

std::vector<std::string>& getRefIDs();
std::vector<uint32_t>& getRefLengths();
Expand All @@ -100,9 +100,6 @@ class BinaryGFAReader {
void serializeContigTable(const std::string& odir,
const std::vector<std::pair<std::string, uint16_t>>& shortRefsNameLen,
const std::vector<uint32_t>& refIdExtensions);
// Length of contig `i`, derived from consecutive start offsets: the span from
// this contig's offset to the next contig's offset, or to rankVec_.size()
// when `i` is the last contig.
uint64_t getContigLength(uint64_t i) {
  const auto begin_off = contigid2seq[i];
  if (i < contigid2seq.size() - 1) {
    return contigid2seq[i + 1] - begin_off;
  }
  return rankVec_.size() - begin_off;
}

void deserializeContigTable();
// void writeFile(std::string fileName);
Expand Down
78 changes: 78 additions & 0 deletions include/Util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1073,6 +1073,84 @@ Compile-time selection between list-like and map-like printing.
length(length) {}
};

/// Compact stand-in for a contig-id -> PackedContigInfo table.
///
/// Only the start offset of every contig is stored; the index into the
/// offset vector serves as the contig id. A contig's length is computed on
/// demand as the distance to the next stored offset, or to `useq_len_` for
/// the last contig. Iteration yields `std::pair<id, PackedContigInfo>`
/// values, so callers can read `kv.first` / `kv.second`.
///
/// NOTE(review): length derivation presumably relies on offsets being added
/// in increasing order — confirm against the code that calls add().
struct PackedContigInfoVec {
  /// Total length used as the end boundary of the last contig.
  uint64_t useq_len_{0};
  /// Start offsets, one per contig; null until constructed with a size hint
  /// (or until the first add()) and again after clear().
  std::unique_ptr<std::vector<uint64_t>> data_{nullptr};

  /// Creates an empty table with no backing storage.
  PackedContigInfoVec() {}

  /// @param useq_len end boundary used to size the last contig.
  /// @param ncontig  expected number of contigs; capacity is reserved up
  ///                 front so add() does not reallocate.
  PackedContigInfoVec(uint64_t useq_len, size_t ncontig)
      : useq_len_(useq_len), data_(new std::vector<uint64_t>) {
    data_->reserve(ncontig);
  }

  /// Appends the start offset of the next contig.
  void add(uint64_t o) {
    // A default-constructed (or cleared) table has no storage yet; allocate
    // lazily instead of dereferencing a null pointer.
    if (!data_) {
      data_.reset(new std::vector<uint64_t>);
    }
    data_->push_back(o);
  }

  /// Materializes the record for contig `i`.
  /// Precondition: i < size(). The length is truncated to 32 bits.
  PackedContigInfo operator[](uint64_t i) const {
    const std::vector<uint64_t>& offsets = *data_;
    const uint64_t start = offsets[i];
    // `i + 1 < size()` rather than `i < size() - 1`: the latter wraps around
    // when the vector is empty.
    const uint64_t stop =
        (i + 1 < offsets.size()) ? offsets[i + 1] : useq_len_;
    const uint32_t len = static_cast<uint32_t>(stop - start);
    return PackedContigInfo{i, start, len};
  }

  /// Drops all offsets and releases their memory. Safe to call on an empty,
  /// default-constructed, or already-cleared table.
  void clear() {
    useq_len_ = 0;
    // Destroying the vector frees its buffer; no separate clear/shrink needed.
    data_.reset(nullptr);
  }

  /// Number of contigs stored; 0 when there is no backing storage.
  size_t size() const { return data_ ? data_->size() : 0; }

  /// Forward cursor over the table. Dereferencing builds the
  /// (id, PackedContigInfo) pair into `p_` and returns a reference to it, so
  /// the reference is only valid until the next dereference of this iterator.
  struct PackedContigInfoVecIterator {
    const PackedContigInfoVec* pci_{nullptr};
    uint64_t it{std::numeric_limits<uint64_t>::max()};
    std::pair<uint64_t, PackedContigInfo> p_{
        std::numeric_limits<uint64_t>::max(),
        {size_t(0), size_t(0), uint32_t(0)}};

    PackedContigInfoVecIterator& operator++() {
      ++it;
      return *this;
    }

    // Equality compares positions only, not the table being iterated.
    bool operator==(const PackedContigInfoVecIterator& o) const {
      return it == o.it;
    }

    bool operator!=(const PackedContigInfoVecIterator& other) const {
      return !(*this == other);
    }

    std::pair<uint64_t, PackedContigInfo>& operator*() {
      p_.second = (*pci_)[it];
      p_.first = it;
      return p_;
    }
  };

  /// Iterator positioned at contig `idx`, or end() if `idx` is out of range.
  const PackedContigInfoVecIterator find(uint64_t idx) const {
    const uint64_t n = size();
    PackedContigInfoVecIterator i;
    i.pci_ = this;
    i.it = (idx >= n) ? n : idx;
    return i;
  }

  /// Iterator at the first contig (equal to end() when the table is empty).
  const PackedContigInfoVecIterator begin() const {
    PackedContigInfoVecIterator i;
    i.pci_ = this;
    i.it = 0;
    return i;
  }

  /// Past-the-end iterator.
  const PackedContigInfoVecIterator end() const {
    PackedContigInfoVecIterator i;
    i.pci_ = this;
    i.it = size();
    return i;
  }
};

struct RefPos {
uint32_t pos;
bool isFW;
Expand Down
39 changes: 22 additions & 17 deletions src/PufferfishBinaryGFAReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,15 +130,16 @@ namespace pufferfish {
rankSelDict.reset(new rank9sel(&rankVec_, rankVec_.size()));
auto contig2seqSize = rankSelDict->rank(rankVec_.size()-1) + 1;
std::cerr << contig2seqSize << "\n";
contigid2seq.resize(contig2seqSize);
logger_->info("Done wrapping the rank vector with a rank9sel structure.");
contigid2seq = pufferfish::util::PackedContigInfoVec(
rankVec_.size(),contig2seqSize);
while (nextPos < rankVec_.size() and nextPos != 0) {
nextPos = static_cast<uint64_t>(rankSelDict->select(contigCntr)) + 1;// select(0) is meaningful
contigid2seq[contigCntr] = prevPos;// {contigCntr, prevPos, static_cast<uint32_t>(nextPos-prevPos)};
contigid2seq.add(prevPos);
prevPos = nextPos;
contigCntr++;
}
logger_->info("contig count for validation: {}", contigCntr);
logger_->info("contig count for validation: {:n}", contigCntr);

// start and end kmer-hash over the contigs
// might get deprecated later
Expand All @@ -152,11 +153,13 @@ namespace pufferfish {
refIdLen = 0;
file.read(reinterpret_cast<char *>(&refIdLen), refIdSize);
if (!file.good()) break;
char* temp = new char[refIdLen+1];
file.read(temp, refIdLen);
temp[refIdLen] = '\0';
refId = temp;
delete [] temp;
refId.assign(refIdLen+1, '\0');
//char* temp = new char[refIdLen+1];
//file.read(temp, refIdLen);
file.read(&(refId[0]), refIdLen);
//temp[refIdLen] = '\0';
//refId = temp;
//delete [] temp;
file.read(reinterpret_cast<char *>(&contigCntPerPath), sizeof(contigCntPerPath));
// std::cerr << "pathlen: " << contigCntPerPath << "\n";
// std::cerr << refId << " " << contigCntPerPath << "\n";
Expand All @@ -181,8 +184,7 @@ namespace pufferfish {
uint32_t refLength{0};
bool firstContig{true};
for (auto &ctig : path[ref_cnt]) {
auto len = getContigLength(ctig.first);
uint64_t l = len - (firstContig ? 0 : (k - 1));
uint64_t l = contigid2seq[ctig.first].length - (firstContig ? 0 : (k - 1));
refLength += l;
firstContig = false;
}
Expand Down Expand Up @@ -223,22 +225,24 @@ namespace pufferfish {
Direction nextContigDirection;
// If a is in the forward orientation, the last k-mer comes from the end, otherwise it is the reverse complement of the first k-mer
if (ore) {
auto cinfo = contigid2seq[cid];
lastKmerInContig.fromNum(
seqVec_.get_int(2 * (contigid2seq[cid] + getContigLength(cid)-k), 2 * k));// (contigid2seq[cid].offset + contigid2seq[cid].length - k), 2 * k));
seqVec_.get_int(2 * (cinfo.offset + cinfo.length - k), 2 * k));
contigDirection = Direction::APPEND;
} else {
lastKmerInContig.fromNum(seqVec_.get_int(2 * contigid2seq[cid], 2 * k));
lastKmerInContig.fromNum(seqVec_.get_int(2 * contigid2seq[cid].offset, 2 * k));
lastKmerInContig.swap();
contigDirection = Direction::PREPEND;
}

// If a is in the forward orientation, the first k-mer comes from the beginning, otherwise it is the reverse complement of the last k-mer
if (nextore) {
firstKmerInNextContig.fromNum(seqVec_.get_int(2 * contigid2seq[nextcid], 2 * k));
firstKmerInNextContig.fromNum(seqVec_.get_int(2 * contigid2seq[nextcid].offset, 2 * k));
nextContigDirection = Direction::PREPEND;
} else {
auto cinfo = contigid2seq[nextcid];
firstKmerInNextContig.fromNum(
seqVec_.get_int(2 * (contigid2seq[nextcid] + getContigLength(nextcid) - k),
seqVec_.get_int(2 * (cinfo.offset + cinfo.length - k),
2 * k));
firstKmerInNextContig.swap();
nextContigDirection = Direction::APPEND;
Expand All @@ -259,8 +263,8 @@ namespace pufferfish {
logger_->info("Total # of numerical Contigs : {:n}", contigid2seq.size());
}

// spp::sparse_hash_map<uint64_t, pufferfish::util::PackedContigInfo> &
std::vector<uint64_t> &

pufferfish::util::PackedContigInfoVec&
BinaryGFAReader::getContigNameMap() {
return contigid2seq;
}
Expand Down Expand Up @@ -299,7 +303,8 @@ namespace pufferfish {
contig2pos[cposOffsetvec[contig.first]].update(tr, pos, contig.second);
// std::cerr << cposOffsetvec[contig.first] << ":" << tr << " " << contig2pos[cposOffsetvec[contig.first]].transcript_id() << "\n";
cposOffsetvec[contig.first]++;
currContigLength = getContigLength(contig.first);
currContigLength = contigid2seq[contig.first].length;
//currContigLength = getContigLength(contig.first);
accumPos += currContigLength - k;
}
}
Expand Down
25 changes: 11 additions & 14 deletions src/PufferfishIndexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -482,13 +482,10 @@ int pufferfishIndex(pufferfish::IndexOptions& indexOpts) {
pf.serializeContigTable(outdir, shortRefsNameLen, refIdExtensions);
{
auto& cnmap = pf.getContigNameMap();
for (uint64_t idx = 0; idx < cnmap.size(); idx++) {
// auto& r1 = kv.second;
// tlen += r1.length;
// numKmers += r1.length - k + 1;
auto len = pf.getContigLength(idx);
tlen += len;
numKmers += len - k + 1;
for (auto& kv : cnmap) {
const auto& r1 = kv.second;
tlen += r1.length;
numKmers += r1.length - k + 1;
++nread;
}
jointLog->info("# segments = {:n}", nread);
Expand Down Expand Up @@ -757,11 +754,11 @@ int pufferfishIndex(pufferfish::IndexOptions& indexOpts) {
size_t ncontig = cnmap.size();
std::vector<size_t> sampledInds ;
for(size_t i = 0; i < ncontig; ++i) {//}auto& kv : cnmap){
auto len = pf.getContigLength(i);//cnmap[i];
const auto& r1 = cnmap[i];
sampledInds.clear();
computeSampledPositions(len, k, sampleSize, sampledInds) ;
computeSampledPositions(r1.length, k, sampleSize, sampledInds) ;
sampledKmers += sampledInds.size() ;
contigLengths.push_back(len) ;
contigLengths.push_back(r1.length) ;
}
jointLog->info("# sampled kmers = {:n}", sampledKmers) ;
jointLog->info("# skipped kmers = {:n}", numKmers - sampledKmers) ;
Expand Down Expand Up @@ -989,12 +986,12 @@ int pufferfishIndex(pufferfish::IndexOptions& indexOpts) {
{
auto& cnmap = pf.getContigNameMap() ;
std::vector<size_t> sampledInds ;
for(uint64_t idx = 0; idx < cnmap.size(); idx++){
auto len = pf.getContigLength(idx);
for(auto& kv : cnmap){
auto& r1 = kv.second ;
sampledInds.clear();
computeSampledPositionsLossy(len, k, sampleSize, sampledInds) ;
computeSampledPositionsLossy(r1.length, k, sampleSize, sampledInds) ;
sampledKmers += sampledInds.size() ;
contigLengths.push_back(len) ;
contigLengths.push_back(r1.length) ;
}
jointLog->info("# sampled kmers = {:n}", sampledKmers) ;
jointLog->info("# skipped kmers = {:n}", numKmers - sampledKmers) ;
Expand Down
5 changes: 3 additions & 2 deletions src/PufferfishValidate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ int doPufferfishInternalValidate(IndexT& pi, pufferfish::ValidateOptions& valida
uint64_t totalKmersSearched{0};
uint64_t validCnt = pi.getIndexedRefCount();
for (uint64_t i = 0; i < validCnt; i++) {

auto refLen = pi.refLength(i);
uint32_t posWithinRef{0};

Expand Down Expand Up @@ -179,8 +180,8 @@ int doPufferfishValidate(IndexT& pi, pufferfish::ValidateOptions& validateOpts)
auto& contigid2seq = pf.getContigNameMap() ;

//auto& paths = pf.getPaths() ;
for(uint64_t idx = 0; idx < contigid2seq.size(); idx++){
auto ctgInfo = pufferfish::util::PackedContigInfo(idx, contigid2seq[idx], pf.getContigLength(idx));//ctg.second ;
for(auto& ctg : contigid2seq){
auto& ctgInfo = ctg.second ;

uint64_t kbi, kei ;
kbi = seq.get_int(2*ctgInfo.offset, 2*k) ;
Expand Down

0 comments on commit 95ac0ac

Please sign in to comment.