From a46a650eec25b80fbf558c84d6b1621a84efeb23 Mon Sep 17 00:00:00 2001 From: ByteHamster Date: Tue, 19 Mar 2024 10:49:56 +0100 Subject: [PATCH] Simplify construction to sorting plus a single sweep --- CMakeLists.txt | 2 +- include/IrregularCuckooHashTable.h | 5 ++ include/SicHash.h | 94 +++++++++++++----------------- 3 files changed, 47 insertions(+), 54 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 32f359f..ae02df2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,7 +17,7 @@ add_subdirectory(extlib/util EXCLUDE_FROM_ALL) target_link_libraries(SicHash INTERFACE ByteHamsterUtil) add_subdirectory(extlib/simpleRibbon) -target_link_libraries(SicHash INTERFACE SimpleRibbon) +target_link_libraries(SicHash INTERFACE SimpleRibbon ips2ra) if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) ################### Benchmark build targets ################### diff --git a/include/IrregularCuckooHashTable.h b/include/IrregularCuckooHashTable.h index 3399245..955cebc 100644 --- a/include/IrregularCuckooHashTable.h +++ b/include/IrregularCuckooHashTable.h @@ -97,6 +97,7 @@ class IrregularCuckooHashTable { void prepare(HashedKey hash) { assert(numEntries < config.maxEntries); + heap[numEntries] = TableEntry(); heap[numEntries].hash = hash; if (hash.mhc <= config.threshold1) { heap[numEntries].hashFunctionMask = 0b001; @@ -108,6 +109,10 @@ class IrregularCuckooHashTable { numEntries++; } + void clear() { + numEntries = 0; + } + bool construct(size_t M_, size_t seed_) { M = M_; seed = seed_; diff --git a/include/SicHash.h b/include/SicHash.h index ce5149d..b6a5901 100644 --- a/include/SicHash.h +++ b/include/SicHash.h @@ -3,6 +3,7 @@ #include "IrregularCuckooHashTable.h" #include #include +#include namespace sichash { using seed_t = uint8_t; @@ -110,34 +111,26 @@ class SicHash { SimpleRibbon<1, ribbonWidth> *ribbon1 = nullptr; SimpleRibbon<2, ribbonWidth> *ribbon2 = nullptr; SimpleRibbon<3, ribbonWidth> *ribbon3 = nullptr; - std::vector emptySlots; - util::EliasFano minimalRemap; + util::EliasFano *minimalRemap = nullptr; size_t unnecessaryConstructions = 0; // Keys parameter must be an std::vector or an std::vector. - SicHash(const auto &keys, SicHashConfig _config) - : config(_config), N(keys.size()), - minimalRemap(minimal ? (N / config.loadFactor - N) : 0, minimal ? N : 0) { + SicHash(const auto &keys, SicHashConfig _config) : config(_config), N(keys.size()) { + std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); numSmallTables = keys.size() / config.smallTableSize + 1; - std::vector tables; - tables.reserve(numSmallTables); - IrregularCuckooHashTableConfig cuckooConfig; - cuckooConfig.threshold1 = config.threshold1; - cuckooConfig.threshold2 = config.threshold2; - cuckooConfig.maxEntries = config.smallTableSize * 1.2 + 100; - for (size_t i = 0; i < numSmallTables; i++) { - tables.emplace_back(cuckooConfig); - } - if (!config.silent) { std::cout<<"Creating MHCs"<> hashedKeys; + hashedKeys.reserve(N); for (const auto &key : keys) { HashedKey hash = HashedKey(key); size_t smallTable = hash.hash(HASH_FUNCTION_BUCKET_ASSIGNMENT, numSmallTables); - tables[smallTable].prepare(hash); + hashedKeys.emplace_back(smallTable, hash); } + ips2ra::sort(hashedKeys.begin(), hashedKeys.end(), + [](const std::pair &pair) { return pair.first; }); if (!config.silent) { std::cout<<"Inserting into Cuckoo"< emptySlots; + if constexpr (minimal) { + emptySlots.reserve(N / config.loadFactor - N); + } + + for (size_t tableIdx = 0; tableIdx < numSmallTables; tableIdx++) { + irregularCuckooHashTable.clear(); + while (hashedKeys[keyIdx].first == tableIdx) { + irregularCuckooHashTable.prepare(hashedKeys[keyIdx].second); + keyIdx++; + } + size_t tableM = irregularCuckooHashTable.size() / config.loadFactor; size_t seed = 0; - while (!table.construct(tableM, seed)) { + while (!irregularCuckooHashTable.construct(tableM, seed)) { unnecessaryConstructions++; seed++; if (seed >= std::numeric_limits::max()) { @@ -161,14 +171,14 @@ class SicHash { } bucketInfo.emplace_back(sizePrefix, seed); - for (size_t i = 0; i < table.size(); i++) { - IrregularCuckooHashTable::TableEntry &entry = table.heap[i]; + for (size_t k = 0; k < irregularCuckooHashTable.size(); k++) { + IrregularCuckooHashTable::TableEntry &entry = irregularCuckooHashTable.heap[k]; maps[entry.hashFunctionMask].emplace_back(entry.hash.mhc, entry.hashFunctionIndex & entry.hashFunctionMask); } if constexpr (minimal) { - for (size_t i = 0; i < tableM; i++) { - if (table.cells[i] == nullptr) { - emptySlots.push_back(sizePrefix + i); + for (size_t k = 0; k < tableM; k++) { + if (irregularCuckooHashTable.cells[k] == nullptr) { + emptySlots.push_back(sizePrefix + k); } } } @@ -186,33 +196,11 @@ class SicHash { ribbon3 = new SimpleRibbon<3, ribbonWidth>(maps[0b111]); if constexpr (minimal) { - if (!config.silent) { - std::cout<<"Making minimal"<= N) { - // No more empty slots left. We do not need to write the following items to - // the EF sequence because they are never queried - emptyIndex--; - break; - } - } - } - i = 0; + minimalRemap = new util::EliasFano(emptySlots.size(), emptySlots.back() + 1); + for (size_t slot : emptySlots) { + minimalRemap->push_back(slot); } - minimalRemap.buildRankSelect(); + minimalRemap->buildRankSelect(); emptySlots.clear(); emptySlots.shrink_to_fit(); } @@ -229,7 +217,7 @@ class SicHash { size_t bytes = ribbon1->size() + ribbon2->size() + ribbon3->size() + bucketInfo.size() * sizeof(bucketInfo.at(0)); if constexpr (minimal) { - bytes += minimalRemap.space(); + bytes += minimalRemap->space(); } return bytes * 8; } @@ -238,7 +226,7 @@ class SicHash { [[nodiscard]] size_t spaceUsageTheory() const { size_t bytes = ribbon1->size() + ribbon2->size() + ribbon3->size(); if constexpr (minimal) { - bytes += minimalRemap.space(); + bytes += minimalRemap->space(); } size_t efN = bucketInfo.size(); @@ -274,7 +262,7 @@ class SicHash { size_t result = hash.hash(hashFunction + bucketInfo[smallTable].seed, M) + bucketInfo[smallTable].offset; if constexpr (minimal) { if (result >= N) { - return *minimalRemap.at(result - N); + return *minimalRemap->at(result - N); } } return result;