Skip to content

Commit

Permalink
do not cache sequences until needed for the first time. also keep names and coordinates in separate caches
Browse files Browse the repository at this point in the history
  • Loading branch information
glennhickey committed Jul 11, 2013
1 parent 1c88375 commit e0cd41c
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 24 deletions.
129 changes: 106 additions & 23 deletions api/hdf5_impl/hdf5Genome.cpp
Expand Up @@ -63,6 +63,12 @@ HDF5Genome::HDF5Genome(const string& name,
_metaData = new HDF5MetaData(&_group, metaGroupName);
_rup = new HDF5MetaData(&_group, rupGroupName);

_totalSequenceLength = _dnaArray.getSize() * 2;
if (_totalSequenceLength > 0 && _rup->get(rupGroupName) == "1")
{
_totalSequenceLength -= 1;
}

hsize_t chunk;
_dcprops.getChunk(1, &chunk);
}
Expand Down Expand Up @@ -176,6 +182,8 @@ void HDF5Genome::setDimensions(
void HDF5Genome::updateTopDimensions(
const vector<Sequence::UpdateInfo>& topDimensions)
{
loadSequencePosCache();
loadSequenceNameCache();
vector<Sequence::UpdateInfo>::const_iterator i;
map<string, HDF5Sequence*>::iterator cacheIt;
map<string, const Sequence::UpdateInfo*> inputMap;
Expand Down Expand Up @@ -244,6 +252,8 @@ void HDF5Genome::updateTopDimensions(
void HDF5Genome::updateBottomDimensions(
const vector<Sequence::UpdateInfo>& bottomDimensions)
{
loadSequencePosCache();
loadSequenceNameCache();
vector<Sequence::UpdateInfo>::const_iterator i;
map<string, HDF5Sequence*>::iterator cacheIt;
map<string, const Sequence::UpdateInfo*> inputMap;
Expand Down Expand Up @@ -375,6 +385,7 @@ hal_size_t HDF5Genome::getNumSequences() const

Sequence* HDF5Genome::getSequence(const string& name)
{
loadSequenceNameCache();
Sequence* sequence = NULL;
map<string, HDF5Sequence*>::iterator mapIt = _sequenceNameCache.find(name);
if (mapIt != _sequenceNameCache.end())
Expand All @@ -386,6 +397,7 @@ Sequence* HDF5Genome::getSequence(const string& name)

const Sequence* HDF5Genome::getSequence(const string& name) const
{
loadSequenceNameCache();
const Sequence* sequence = NULL;
map<string, HDF5Sequence*>::const_iterator mapIt =
_sequenceNameCache.find(name);
Expand All @@ -398,6 +410,7 @@ const Sequence* HDF5Genome::getSequence(const string& name) const

Sequence* HDF5Genome::getSequenceBySite(hal_size_t position)
{
loadSequencePosCache();
map<hal_size_t, HDF5Sequence*>::iterator i;
i = _sequencePosCache.upper_bound(position);
if (i != _sequencePosCache.end())
Expand All @@ -412,6 +425,7 @@ Sequence* HDF5Genome::getSequenceBySite(hal_size_t position)

const Sequence* HDF5Genome::getSequenceBySite(hal_size_t position) const
{
loadSequencePosCache();
map<hal_size_t, HDF5Sequence*>::const_iterator i;
i = _sequencePosCache.upper_bound(position);
if (i != _sequencePosCache.end())
Expand Down Expand Up @@ -762,6 +776,7 @@ void HDF5Genome::read()
_dnaArray.load(&_group, dnaArrayName, _numChunksInArrayBuffer);
}
catch (H5::Exception){}

try
{
_group.openDataSet(topArrayName);
Expand Down Expand Up @@ -799,43 +814,111 @@ void HDF5Genome::read()
// Drops any cached HDF5Sequence objects through deleteSequenceCache() and
// sets _totalSequenceLength to zero.
// NOTE(review): this text comes from a rendered diff whose +/- markers were
// lost; the reset of _totalSequenceLength may be a removed line -- confirm
// against the actual file.
void HDF5Genome::readSequences()
{
deleteSequenceCache();
_totalSequenceLength = 0;
}

/** Frees every cached HDF5Sequence and empties both lookup caches.
 *
 * _sequencePosCache and _sequenceNameCache store the same HDF5Sequence
 * pointers, so each object has to be deleted through exactly one of the
 * two maps.  The maps are not guaranteed to be the same size, though:
 * the position cache is keyed by start position + length, and
 * std::map::insert ignores an entry whose key is already present, so a
 * cache that was derived from the other one can hold fewer pointers.
 * Deleting only through the smaller map would leak the remainder, hence
 * the larger map is walked.  On a tie the position cache is used.
 *
 * NOTE(review): assumes the larger map contains every live pointer, which
 * holds when one cache was built from the other -- confirm that no other
 * code path fills the two maps independently.
 */
void HDF5Genome::deleteSequenceCache()
{
  if (_sequenceNameCache.size() > _sequencePosCache.size())
  {
    // The name cache is the superset: free the objects through it.
    map<string, HDF5Sequence*>::iterator i;
    for (i = _sequenceNameCache.begin(); i != _sequenceNameCache.end(); ++i)
    {
      delete i->second;
    }
  }
  else
  {
    // The position cache holds at least as many pointers (or both maps
    // are empty, in which case the loop does nothing).
    map<hal_size_t, HDF5Sequence*>::iterator i;
    for (i = _sequencePosCache.begin(); i != _sequencePosCache.end(); ++i)
    {
      delete i->second;
    }
  }
  // Both maps now contain only dangling pointers; drop them together.
  _sequencePosCache.clear();
  _sequenceNameCache.clear();
}

// Fills _sequencePosCache on first use; returns immediately when the cache
// already holds entries.  Each entry is keyed by
// getStartPosition() + getSequenceLength() of the sequence it points to.
// Throws hal_exception when the summed sequence lengths differ from
// _totalSequenceLength.
// NOTE(review): this listing is a rendered diff without +/- markers, so lines
// of the pre-change readSequences() body appear interleaved with the new code
// below -- confirm statement order against the actual file.
void HDF5Genome::loadSequencePosCache() const
{
if (_sequencePosCache.size() > 0)
{
return;
}
hal_size_t totalReadLen = 0;
hal_size_t numSequences = _sequenceNameArray.getSize();
for (hal_size_t i = 0; i < numSequences; ++i)

// Name cache already populated: reuse its HDF5Sequence pointers instead of
// allocating new objects.
if (_sequenceNameCache.size() > 0)
{
HDF5Sequence* seq = new HDF5Sequence(this, &_sequenceIdxArray,
&_sequenceNameArray, i);
_sequencePosCache.insert(
pair<hal_size_t, HDF5Sequence*>(seq->getStartPosition() +
seq->getSequenceLength(), seq));
_sequenceNameCache.insert(
pair<string, HDF5Sequence*>(seq->getName(), seq));
_totalSequenceLength += seq->getSequenceLength();
assert(_sequenceNameCache.size() == numSequences);
map<std::string, HDF5Sequence*>::iterator i;
for (i = _sequenceNameCache.begin(); i != _sequenceNameCache.end(); ++i)
{
// NOTE(review): map::insert ignores a key that is already present, so two
// sequences with the same start + length would share one entry -- verify
// whether zero-length sequences can occur.
_sequencePosCache.insert(pair<hal_size_t, HDF5Sequence*>(
i->second->getStartPosition() +
i->second->getSequenceLength(), i->second));
totalReadLen += i->second->getSequenceLength();
}
}
else
{
// Neither cache is filled yet: construct one HDF5Sequence per entry of
// _sequenceNameArray.
for (hal_size_t i = 0; i < numSequences; ++i)
{
// The casts strip constness because this method is const while the
// constructor is handed non-const pointers.
HDF5Sequence* seq =
new HDF5Sequence(const_cast<HDF5Genome*>(this),
const_cast<HDF5ExternalArray*>(&_sequenceIdxArray),
const_cast<HDF5ExternalArray*>(&_sequenceNameArray),
i);
_sequencePosCache.insert(
pair<hal_size_t, HDF5Sequence*>(seq->getStartPosition() +
seq->getSequenceLength(), seq));
totalReadLen += seq->getSequenceLength();
}
}
hal_size_t seqLenFromArray = _dnaArray.getSize() * 2;
if (seqLenFromArray > 0 && (seqLenFromArray != _totalSequenceLength &&
seqLenFromArray-1 != _totalSequenceLength))
// Consistency check: the lengths read from the sequences must add up to
// _totalSequenceLength.
if (totalReadLen != _totalSequenceLength)
{
stringstream ss;
ss << "Sequences for genome " << getName() << " have total length "
<< _totalSequenceLength << " but the (non-zero) DNA array contains "
<< seqLenFromArray << " elements. This is an internal error or the "
<< "file is corrupt.";
<< totalReadLen << " but the (non-zero) DNA array contains "
<< _totalSequenceLength << " elements. This is an internal error "
<< "or the file is corrupt.";
throw hal_exception(ss.str());
}
}

void HDF5Genome::deleteSequenceCache()
// Fills _sequenceNameCache on first use; returns immediately when the cache
// already holds entries.  Each entry maps getName() of a sequence to its
// HDF5Sequence pointer.
// NOTE(review): this listing is a rendered diff without +/- markers, so lines
// of the pre-change deleteSequenceCache() body (the delete loop and the two
// clear() calls) appear interleaved with the new code below -- confirm against
// the actual file.
void HDF5Genome::loadSequenceNameCache() const
{
map<hal_size_t, HDF5Sequence*>::iterator i;
for (i = _sequencePosCache.begin(); i != _sequencePosCache.end(); ++i)
if (_sequenceNameCache.size() > 0)
{
delete i->second;
return;
}
hal_size_t numSequences = _sequenceNameArray.getSize();

// Position cache already populated: reuse its HDF5Sequence pointers instead
// of allocating new objects.
if (_sequencePosCache.size() > 0)
{
assert(_sequencePosCache.size() == numSequences);
map<hal_size_t, HDF5Sequence*>::iterator i;
for (i = _sequencePosCache.begin(); i != _sequencePosCache.end(); ++i)
{
_sequenceNameCache.insert(pair<string, HDF5Sequence*>(
i->second->getName(), i->second));
}
}
else
{
// Neither cache is filled yet: construct one HDF5Sequence per entry of
// _sequenceNameArray.
for (hal_size_t i = 0; i < numSequences; ++i)
{
// The casts strip constness because this method is const while the
// constructor is handed non-const pointers.
HDF5Sequence* seq =
new HDF5Sequence(const_cast<HDF5Genome*>(this),
const_cast<HDF5ExternalArray*>(&_sequenceIdxArray),
const_cast<HDF5ExternalArray*>(&_sequenceNameArray),
i);

_sequenceNameCache.insert(
pair<string, HDF5Sequence*>(seq->getName(), seq));
}
}
_sequencePosCache.clear();
_sequenceNameCache.clear(); // I share my pointers with above.
}

void HDF5Genome::writeSequences(const vector<Sequence::Info>&
sequenceDimensions)
{
Expand Down
2 changes: 2 additions & 0 deletions api/hdf5_impl/hdf5Genome.h
Expand Up @@ -171,6 +171,8 @@ class HDF5Genome : public Genome
void writeSequences(const std::vector<hal::Sequence::Info>&
sequenceDimensions);
void deleteSequenceCache();
void loadSequencePosCache() const;
void loadSequenceNameCache() const;
void setGenomeTopDimensions(
const std::vector<hal::Sequence::UpdateInfo>& sequenceDimensions);

Expand Down
2 changes: 1 addition & 1 deletion api/hdf5_impl/hdf5Sequence.cpp
Expand Up @@ -331,7 +331,7 @@ void HDF5Sequence::set(hal_size_t startPosition,
char* arrayBuffer = _nameArray->getUpdate(_index);
strcpy(arrayBuffer, sequenceInfo._name.c_str());

assert(getStartPosition() == startPosition);
assert(getStartPosition() == (hal_index_t)startPosition);
assert(getNumTopSegments() == sequenceInfo._numTopSegments);
assert(getNumBottomSegments() == sequenceInfo._numBottomSegments);
assert(getSequenceLength() == sequenceInfo._length);
Expand Down

0 comments on commit e0cd41c

Please sign in to comment.