Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions nthash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -616,5 +616,66 @@ inline void NTMS64(const char *kmerSeq, const std::vector<std::vector<unsigned>
}
}

// strand-aware multihash spaced seed ntHash with multiple hashes per seed
inline bool NTMSM64(const char *kmerSeq, const std::vector<std::vector<unsigned> > &seedSeq, const unsigned k, const unsigned m, const unsigned m2,
uint64_t& fhVal, uint64_t& rhVal, unsigned& locN, uint64_t* hVal, bool *hStn)
{
fhVal=rhVal=0;
locN=0;
for(int i=k-1; i>=0; i--) {
if(seedTab[(unsigned char)kmerSeq[i]]==seedN) {
locN=i;
return false;
}
fhVal = rol1(fhVal);
fhVal = swapbits033(fhVal);
fhVal ^= seedTab[(unsigned char)kmerSeq[k-1-i]];

rhVal = rol1(rhVal);
rhVal = swapbits033(rhVal);
rhVal ^= seedTab[(unsigned char)kmerSeq[i]&cpOff];
}

for(unsigned j=0; j<m; j++) {
uint64_t fsVal=fhVal, rsVal=rhVal;
for(std::vector<unsigned>::const_iterator i=seedSeq[j].begin(); i!=seedSeq[j].end(); ++i) {
fsVal ^= (msTab31l[(unsigned char)kmerSeq[*i]][(k-1-*i)%31] | msTab33r[(unsigned char)kmerSeq[*i]][(k-1-*i)%33]);
rsVal ^= (msTab31l[(unsigned char)kmerSeq[*i]&cpOff][*i%31] | msTab33r[(unsigned char)kmerSeq[*i]&cpOff][*i%33]);
}
hStn[j * m2] = rsVal<fsVal;
hVal[j * m2] = hStn[j * m2]? rsVal : fsVal;
for(unsigned j2=1; j2<m2; j2++) {
uint64_t tVal = hVal[j * m2] * (j2 ^ k * multiSeed);
tVal ^= tVal >> multiShift;
hStn[j * m2 + j2] = hStn[j * m2];
hVal[j * m2 + j2] = tVal;
}
}
return true;
}

// strand-aware multihash spaced seed ntHash for sliding k-mers with multiple hashes per seed
inline void NTMSM64(const char *kmerSeq, const std::vector<std::vector<unsigned> > &seedSeq, const unsigned char charOut, const unsigned char charIn,
const unsigned k, const unsigned m, const unsigned m2, uint64_t& fhVal, uint64_t& rhVal, uint64_t *hVal, bool *hStn)
{
fhVal = NTF64(fhVal,k,charOut,charIn);
rhVal = NTR64(rhVal,k,charOut,charIn);
for(unsigned j=0; j<m; j++) {
uint64_t fsVal=fhVal, rsVal=rhVal;
for(std::vector<unsigned>::const_iterator i=seedSeq[j].begin(); i!=seedSeq[j].end(); ++i) {
fsVal ^= (msTab31l[(unsigned char)kmerSeq[*i]][(k-1-*i)%31] | msTab33r[(unsigned char)kmerSeq[*i]][(k-1-*i)%33]);
rsVal ^= (msTab31l[(unsigned char)kmerSeq[*i]&cpOff][*i%31] | msTab33r[(unsigned char)kmerSeq[*i]&cpOff][*i%33]);
}
hStn[j * m2] = rsVal<fsVal;
hVal[j * m2] = hStn[j * m2]? rsVal : fsVal;
for(unsigned j2=1; j2<m2; j2++) {
uint64_t tVal = hVal[j * m2] * (j2 ^ k * multiSeed);
tVal ^= tVal >> multiShift;
hStn[j * m2 + j2] = hStn[j * m2];
hVal[j * m2 + j2] = tVal;
}
}
}


#endif
42 changes: 23 additions & 19 deletions stHashIterator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class stHashIterator
{

public:

static std::vector<std::vector<unsigned> > parseSeed(const std::vector<std::string> &seedString) {
std::vector<std::vector<unsigned> > seedSet;
for (unsigned i=0; i< seedString.size(); i++) {
Expand All @@ -31,7 +31,7 @@ class stHashIterator
}
return seedSet;
}

/**
* Default constructor. Creates an iterator pointing to
* the end of the iterator range.
Expand All @@ -47,14 +47,15 @@ class stHashIterator
* @param seq address of DNA sequence to be hashed
* @param seed address of spaced seed
* @param k k-mer size
* @param h number of hashes
* @param h number of seeds
* @param h2 number of hashes per seed
*/
stHashIterator(const std::string& seq, const std::vector<std::vector<unsigned> >& seed, unsigned h, unsigned k):
m_seq(seq), m_seed(seed), m_h(h), m_k(k), m_hVec(new uint64_t[h]), m_hStn(new bool[h]), m_pos(0)
stHashIterator(const std::string& seq, const std::vector<std::vector<unsigned> >& seed, unsigned h, unsigned h2, unsigned k):
m_seq(seq), m_seed(seed), m_h(h), m_h2(h2), m_k(k), m_hVec(new uint64_t[h * h2]), m_hStn(new bool[h * h2]), m_pos(0)
{
init();
}

/** Initialize internal state of iterator */
void init()
{
Expand All @@ -63,7 +64,7 @@ class stHashIterator
return;
}
unsigned locN=0;
while (m_pos<m_seq.length()-m_k+1 && !NTMS64(m_seq.data()+m_pos, m_seed, m_k, m_h, m_fhVal, m_rhVal, locN, m_hVec, m_hStn))
while (m_pos<m_seq.length()-m_k+1 && !NTMSM64(m_seq.data()+m_pos, m_seed, m_k, m_h, m_h2, m_fhVal, m_rhVal, locN, m_hVec, m_hStn))
m_pos+=locN+1;
if (m_pos >= m_seq.length()-m_k+1)
m_pos = std::numeric_limits<std::size_t>::max();
Expand All @@ -82,9 +83,9 @@ class stHashIterator
init();
}
else
NTMS64(m_seq.data()+m_pos, m_seed, m_seq.at(m_pos-1), m_seq.at(m_pos-1+m_k), m_k, m_h, m_fhVal, m_rhVal, m_hVec, m_hStn);
NTMSM64(m_seq.data()+m_pos, m_seed, m_seq.at(m_pos-1), m_seq.at(m_pos-1+m_k), m_k, m_h, m_h2, m_fhVal, m_rhVal, m_hVec, m_hStn);
}

size_t pos() const{
return m_pos;
}
Expand All @@ -95,13 +96,13 @@ class stHashIterator
return m_hStn;
}


/** get pointer to hash values for current k-mer */
const uint64_t* operator*() const
{
return m_hVec;
}


/** test equality with another iterator */
bool operator==(const stHashIterator& it) const
Expand All @@ -114,20 +115,20 @@ class stHashIterator
{
return !(*this == it);
}

/** pre-increment operator */
stHashIterator& operator++()
{
next();
return *this;
}

/** iterator pointing to one past last element */
static const stHashIterator end()
{
return stHashIterator();
}

/** destructor */
~stHashIterator() {
if(m_hVec!=NULL) {
Expand All @@ -137,16 +138,19 @@ class stHashIterator
}

private:

/** DNA sequence */
std::string m_seq;

/** Spaced Seed sequence */
std::vector<std::vector<unsigned> > m_seed;
/** number of hashes */

/** number of seeds */
unsigned m_h;

/** number of hashes per seed */
unsigned m_h2;

/** k-mer size */
unsigned m_k;

Expand All @@ -155,7 +159,7 @@ class stHashIterator

/** hash strands, forward = 0, reverse-complement = 1 */
bool *m_hStn;

/** position of current k-mer */
size_t m_pos;

Expand Down
10 changes: 5 additions & 5 deletions unittest/UnitTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ TEST_CASE("test fixture", "[UnitTests]")
std::vector<std::string> seedString;
seedString.push_back("11111100000000111111");

auto h = seedString.size();
auto numSeeds = seedString.size();
auto k = seedString[0].size();

std::vector<std::vector<unsigned>> seedSet = stHashIterator::parseSeed(seedString);
Expand All @@ -107,11 +107,11 @@ TEST_CASE("test fixture", "[UnitTests]")
std::string kmerM2 = "ACGTACACTGTACTGAGTCT";
std::string kmerM3 = "ACGTACACTGCACTGAGTCT";

stHashIterator ssItr(kmer, seedSet, h, k);
stHashIterator ssItr(kmer, seedSet, numSeeds, 1, k);

stHashIterator ssM1Itr(kmerM1, seedSet, h, k);
stHashIterator ssM2Itr(kmerM2, seedSet, h, k);
stHashIterator ssM3Itr(kmerM3, seedSet, h, k);
stHashIterator ssM1Itr(kmerM1, seedSet, numSeeds, 1, k);
stHashIterator ssM2Itr(kmerM2, seedSet, numSeeds, 1, k);
stHashIterator ssM3Itr(kmerM3, seedSet, numSeeds, 1, k);

assert((*ssItr)[0] == (*ssM1Itr)[0]);
assert((*ssItr)[0] == (*ssM2Itr)[0]);
Expand Down