Skip to content

Commit

Permalink
Simplify construction to sorting plus a single sweep
Browse files Browse the repository at this point in the history
  • Loading branch information
ByteHamster committed Mar 19, 2024
1 parent 7e2c0ba commit a46a650
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 54 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ add_subdirectory(extlib/util EXCLUDE_FROM_ALL)
target_link_libraries(SicHash INTERFACE ByteHamsterUtil)

add_subdirectory(extlib/simpleRibbon)
target_link_libraries(SicHash INTERFACE SimpleRibbon)
target_link_libraries(SicHash INTERFACE SimpleRibbon ips2ra)

if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
################### Benchmark build targets ###################
Expand Down
5 changes: 5 additions & 0 deletions include/IrregularCuckooHashTable.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ class IrregularCuckooHashTable {

void prepare(HashedKey hash) {
assert(numEntries < config.maxEntries);
heap[numEntries] = TableEntry();
heap[numEntries].hash = hash;
if (hash.mhc <= config.threshold1) {
heap[numEntries].hashFunctionMask = 0b001;
Expand All @@ -108,6 +109,10 @@ class IrregularCuckooHashTable {
numEntries++;
}

// Discards all previously prepared entries by resetting the entry counter.
// Only the counter is changed here; the heap storage is left as is.
// prepare() assigns a fresh TableEntry to heap[numEntries] before filling it,
// so the slots are overwritten starting from index 0 by later prepare() calls.
void clear() {
numEntries = 0;
}

bool construct(size_t M_, size_t seed_) {
M = M_;
seed = seed_;
Expand Down
94 changes: 41 additions & 53 deletions include/SicHash.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "IrregularCuckooHashTable.h"
#include <SimpleRibbon.h>
#include <EliasFano.h>
#include <ips2ra.hpp>

namespace sichash {
using seed_t = uint8_t;
Expand Down Expand Up @@ -110,34 +111,26 @@ class SicHash {
SimpleRibbon<1, ribbonWidth> *ribbon1 = nullptr;
SimpleRibbon<2, ribbonWidth> *ribbon2 = nullptr;
SimpleRibbon<3, ribbonWidth> *ribbon3 = nullptr;
std::vector<size_t> emptySlots;
util::EliasFano<minimalFanoLowerBits> minimalRemap;
util::EliasFano<minimalFanoLowerBits> *minimalRemap = nullptr;
size_t unnecessaryConstructions = 0;

// Keys parameter must be an std::vector<std::string> or an std::vector<HashedKey>.
SicHash(const auto &keys, SicHashConfig _config)
: config(_config), N(keys.size()),
minimalRemap(minimal ? (N / config.loadFactor - N) : 0, minimal ? N : 0) {
SicHash(const auto &keys, SicHashConfig _config) : config(_config), N(keys.size()) {
std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
numSmallTables = keys.size() / config.smallTableSize + 1;
std::vector<IrregularCuckooHashTable> tables;
tables.reserve(numSmallTables);
IrregularCuckooHashTableConfig cuckooConfig;
cuckooConfig.threshold1 = config.threshold1;
cuckooConfig.threshold2 = config.threshold2;
cuckooConfig.maxEntries = config.smallTableSize * 1.2 + 100;
for (size_t i = 0; i < numSmallTables; i++) {
tables.emplace_back(cuckooConfig);
}

if (!config.silent) {
std::cout<<"Creating MHCs"<<std::endl;
}
bucketInfo.reserve(numSmallTables);
std::vector<std::pair<size_t, HashedKey>> hashedKeys;
hashedKeys.reserve(N);
for (const auto &key : keys) {
HashedKey hash = HashedKey(key);
size_t smallTable = hash.hash(HASH_FUNCTION_BUCKET_ASSIGNMENT, numSmallTables);
tables[smallTable].prepare(hash);
hashedKeys.emplace_back(smallTable, hash);
}
ips2ra::sort(hashedKeys.begin(), hashedKeys.end(),
[](const std::pair<size_t, HashedKey> &pair) { return pair.first; });

if (!config.silent) {
std::cout<<"Inserting into Cuckoo"<<std::endl;
Expand All @@ -149,10 +142,27 @@ class SicHash {
maps[0b111].reserve(keys.size() * config.class3Percentage());
size_t sizePrefix = 0;
unnecessaryConstructions = 0;
for (IrregularCuckooHashTable &table : tables) {
size_t tableM = table.size() / config.loadFactor;
hashedKeys.emplace_back(numSmallTables + 1, 0); // Sentinel
size_t keyIdx = 0;
IrregularCuckooHashTableConfig cuckooConfig;
cuckooConfig.threshold1 = config.threshold1;
cuckooConfig.threshold2 = config.threshold2;
cuckooConfig.maxEntries = config.smallTableSize * 1.2 + 100;
IrregularCuckooHashTable irregularCuckooHashTable(cuckooConfig);
std::vector<size_t> emptySlots;
if constexpr (minimal) {
emptySlots.reserve(N / config.loadFactor - N);
}

for (size_t tableIdx = 0; tableIdx < numSmallTables; tableIdx++) {
irregularCuckooHashTable.clear();
while (hashedKeys[keyIdx].first == tableIdx) {
irregularCuckooHashTable.prepare(hashedKeys[keyIdx].second);
keyIdx++;
}
size_t tableM = irregularCuckooHashTable.size() / config.loadFactor;
size_t seed = 0;
while (!table.construct(tableM, seed)) {
while (!irregularCuckooHashTable.construct(tableM, seed)) {
unnecessaryConstructions++;
seed++;
if (seed >= std::numeric_limits<seed_t>::max()) {
Expand All @@ -161,14 +171,14 @@ class SicHash {
}
bucketInfo.emplace_back(sizePrefix, seed);

for (size_t i = 0; i < table.size(); i++) {
IrregularCuckooHashTable::TableEntry &entry = table.heap[i];
for (size_t k = 0; k < irregularCuckooHashTable.size(); k++) {
IrregularCuckooHashTable::TableEntry &entry = irregularCuckooHashTable.heap[k];
maps[entry.hashFunctionMask].emplace_back(entry.hash.mhc, entry.hashFunctionIndex & entry.hashFunctionMask);
}
if constexpr (minimal) {
for (size_t i = 0; i < tableM; i++) {
if (table.cells[i] == nullptr) {
emptySlots.push_back(sizePrefix + i);
for (size_t k = 0; k < tableM; k++) {
if (irregularCuckooHashTable.cells[k] == nullptr) {
emptySlots.push_back(sizePrefix + k);
}
}
}
Expand All @@ -186,33 +196,11 @@ class SicHash {
ribbon3 = new SimpleRibbon<3, ribbonWidth>(maps[0b111]);

if constexpr (minimal) {
if (!config.silent) {
std::cout<<"Making minimal"<<std::endl;
}
size_t smallTableToRemap = 0;
while (bucketInfo[smallTableToRemap].offset < N) {
smallTableToRemap++;
}
smallTableToRemap--;
// Iterate over last few tables and remap filled positions
size_t emptyIndex = 0;
size_t i = keys.size() - bucketInfo[smallTableToRemap].offset;
for (;smallTableToRemap < numSmallTables; smallTableToRemap++) {
for (; i < tables[smallTableToRemap].M; i++) {
minimalRemap.push_back(emptySlots[emptyIndex]);
if (tables[smallTableToRemap].cells[i] != nullptr) {
emptyIndex++;
if (emptySlots[emptyIndex] >= N) {
// No more empty slots left. We do not need to write the following items to
// the EF sequence because they are never queried
emptyIndex--;
break;
}
}
}
i = 0;
minimalRemap = new util::EliasFano<minimalFanoLowerBits>(emptySlots.size(), emptySlots.back() + 1);
for (size_t slot : emptySlots) {
minimalRemap->push_back(slot);
}
minimalRemap.buildRankSelect();
minimalRemap->buildRankSelect();
emptySlots.clear();
emptySlots.shrink_to_fit();
}
Expand All @@ -229,7 +217,7 @@ class SicHash {
size_t bytes = ribbon1->size() + ribbon2->size() + ribbon3->size()
+ bucketInfo.size() * sizeof(bucketInfo.at(0));
if constexpr (minimal) {
bytes += minimalRemap.space();
bytes += minimalRemap->space();
}
return bytes * 8;
}
Expand All @@ -238,7 +226,7 @@ class SicHash {
[[nodiscard]] size_t spaceUsageTheory() const {
size_t bytes = ribbon1->size() + ribbon2->size() + ribbon3->size();
if constexpr (minimal) {
bytes += minimalRemap.space();
bytes += minimalRemap->space();
}

size_t efN = bucketInfo.size();
Expand Down Expand Up @@ -274,7 +262,7 @@ class SicHash {
size_t result = hash.hash(hashFunction + bucketInfo[smallTable].seed, M) + bucketInfo[smallTable].offset;
if constexpr (minimal) {
if (result >= N) {
return *minimalRemap.at(result - N);
return *minimalRemap->at(result - N);
}
}
return result;
Expand Down

0 comments on commit a46a650

Please sign in to comment.