From 0fc569a3e3d152f6f91103773b05a5afaf73481b Mon Sep 17 00:00:00 2001 From: jsc0218 Date: Thu, 28 Dec 2023 22:21:51 +0000 Subject: [PATCH] support range hashed short circuit --- src/Dictionaries/FlatDictionary.cpp | 4 +- src/Dictionaries/RangeHashedDictionary.h | 249 +++++++++++++++++- .../02950_dictionary_short_circuit.reference | 3 + .../02950_dictionary_short_circuit.sql | 66 ++--- 4 files changed, 282 insertions(+), 40 deletions(-) diff --git a/src/Dictionaries/FlatDictionary.cpp b/src/Dictionaries/FlatDictionary.cpp index 8314ecd06fbd..06847ca872a0 100644 --- a/src/Dictionaries/FlatDictionary.cpp +++ b/src/Dictionaries/FlatDictionary.cpp @@ -154,7 +154,7 @@ ColumnPtr FlatDictionary::getColumn( ColumnPtr FlatDictionary::getColumnOrDefaultShortCircuit( const std::string & attribute_name, - const DataTypePtr & atribute_type, + const DataTypePtr & attribute_type, const Columns & key_columns, const DataTypes & key_types [[maybe_unused]], IColumn::Filter & default_mask) const @@ -166,7 +166,7 @@ ColumnPtr FlatDictionary::getColumnOrDefaultShortCircuit( auto size = ids.size(); - const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, atribute_type); + const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, attribute_type); size_t attribute_index = dict_struct.attribute_name_to_index.find(attribute_name)->second; const auto & attribute = attributes[attribute_index]; diff --git a/src/Dictionaries/RangeHashedDictionary.h b/src/Dictionaries/RangeHashedDictionary.h index 9be9fa1d0d42..a41e50efb8a6 100644 --- a/src/Dictionaries/RangeHashedDictionary.h +++ b/src/Dictionaries/RangeHashedDictionary.h @@ -136,6 +136,13 @@ class RangeHashedDictionary final : public IDictionary const DataTypes & key_types, const ColumnPtr & default_values_column) const override; + ColumnPtr getColumnOrDefaultShortCircuit( + const std::string & attribute_name, + const DataTypePtr & attribute_type, + const Columns & key_columns, + const DataTypes & key_types, + IColumn::Filter & default_mask) const override; + ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; Pipe read(const Names & column_names, size_t max_block_size, size_t num_streams) const override; @@ -245,6 +252,13 @@ class RangeHashedDictionary final : public IDictionary ValueSetter && set_value, DefaultValueExtractor & default_value_extractor) const; + template + void getItemsShortCircuitImpl( + const Attribute & attribute, + const Columns & key_columns, + ValueSetter && set_value, + IColumn::Filter & default_mask) const; + ColumnPtr getColumnInternal( const std::string & attribute_name, const DataTypePtr & result_type, @@ -456,6 +470,127 @@ ColumnPtr RangeHashedDictionary::getColumn( return result; } +template +ColumnPtr RangeHashedDictionary::getColumnOrDefaultShortCircuit( + const std::string & attribute_name, + const DataTypePtr & attribute_type, + const Columns & key_columns, + const DataTypes & key_types, + IColumn::Filter & default_mask) const +{ + if (dictionary_key_type == DictionaryKeyType::Complex) + { + auto key_types_copy = key_types; + key_types_copy.pop_back(); + dict_struct.validateKeyTypes(key_types_copy); + } + + ColumnPtr result; + + const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, attribute_type); + const size_t attribute_index = dict_struct.attribute_name_to_index.find(attribute_name)->second; + const auto & attribute = attributes[attribute_index]; + + /// Cast range column to storage type + Columns modified_key_columns = key_columns; + const ColumnPtr & range_storage_column = key_columns.back(); + ColumnWithTypeAndName column_to_cast = {range_storage_column->convertToFullColumnIfConst(), key_types.back(), ""}; + modified_key_columns.back() = castColumnAccurate(column_to_cast, dict_struct.range_min->type); + + size_t keys_size = key_columns.front()->size(); + bool is_attribute_nullable = attribute.is_value_nullable.has_value(); + + ColumnUInt8::MutablePtr col_null_map_to; + ColumnUInt8::Container * vec_null_map_to = nullptr; + if (is_attribute_nullable) + { + col_null_map_to = ColumnUInt8::create(keys_size, false); + vec_null_map_to = &col_null_map_to->getData(); + } + + auto type_call = [&](const auto & dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; + using ColumnProvider = DictionaryAttributeColumnProvider; + + auto column = ColumnProvider::getColumn(dictionary_attribute, keys_size); + + if constexpr (std::is_same_v) + { + auto * out = column.get(); + + getItemsShortCircuitImpl( + attribute, + modified_key_columns, + [&](size_t, const Array & value, bool) + { + out->insert(value); + }, + default_mask); + } + else if constexpr (std::is_same_v) + { + auto * out = column.get(); + + if (is_attribute_nullable) + getItemsShortCircuitImpl( + attribute, + modified_key_columns, + [&](size_t row, StringRef value, bool is_null) + { + (*vec_null_map_to)[row] = is_null; + out->insertData(value.data, value.size); + }, + default_mask); + else + getItemsShortCircuitImpl( + attribute, + modified_key_columns, + [&](size_t, StringRef value, bool) + { + out->insertData(value.data, value.size); + }, + default_mask); + } + else + { + auto & out = column->getData(); + + if (is_attribute_nullable) + getItemsShortCircuitImpl( + attribute, + modified_key_columns, + [&](size_t row, const auto value, bool is_null) + { + (*vec_null_map_to)[row] = is_null; + out[row] = value; + }, + default_mask); + else + getItemsShortCircuitImpl( + attribute, + modified_key_columns, + [&](size_t row, const auto value, bool) + { + out[row] = value; + }, + default_mask); + } + + result = std::move(column); + }; + + callOnDictionaryAttributeType(attribute.type, type_call); + + result = result->filter(default_mask, found_count); + if (is_attribute_nullable) + result = ColumnNullable::create(result, col_null_map_to->filter(default_mask, found_count)); + + return result; +} + template ColumnPtr RangeHashedDictionary::getColumnInternal( const std::string & attribute_name, @@ -838,11 +973,7 @@ void RangeHashedDictionary::getItemsImpl( if constexpr (is_nullable) { bool is_null = (*attribute.is_value_nullable)[value_index]; - - if (!is_null) - set_value(key_index, value, false); - else - set_value(key_index, default_value_extractor[key_index], true); + set_value(key_index, value, is_null); } else { @@ -867,6 +998,114 @@ void RangeHashedDictionary::getItemsImpl( found_count.fetch_add(keys_found, std::memory_order_relaxed); } +template +template +void RangeHashedDictionary::getItemsShortCircuitImpl( + const Attribute & attribute, + const Columns & key_columns, + ValueSetter && set_value, + IColumn::Filter & default_mask) const +{ + const auto & attribute_container = std::get>(attribute.container); + + size_t keys_found = 0; + + const ColumnPtr & range_column = key_columns.back(); + auto key_columns_copy = key_columns; + key_columns_copy.pop_back(); + + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor keys_extractor(key_columns_copy, arena_holder.getComplexKeyArena()); + const size_t keys_size = keys_extractor.getKeysSize(); + default_mask.resize(keys_size); + + callOnRangeType(dict_struct.range_min->type, [&](const auto & types) + { + using Types = std::decay_t; + using RangeColumnType = typename Types::LeftType; + using RangeStorageType = typename RangeColumnType::ValueType; + using RangeInterval = Interval; + + const auto * range_column_typed = typeid_cast(range_column.get()); + if (!range_column_typed) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Dictionary {} range column type should be equal to {}", + getFullName(), + dict_struct.range_min->type->getName()); + + const auto & range_column_data = range_column_typed->getData(); + + const auto & key_attribute_container = std::get>(key_attribute.container); + + for (size_t key_index = 0; key_index < keys_size; ++key_index) + { + auto key = keys_extractor.extractCurrentKey(); + const auto it = key_attribute_container.find(key); + + if (it) + { + const auto date = range_column_data[key_index]; + const auto & interval_tree = it->getMapped(); + + size_t value_index = 0; + std::optional range; + + interval_tree.find(date, [&](auto & interval, auto & interval_value_index) + { + if (range) + { + if (likely(configuration.lookup_strategy == RangeHashedDictionaryLookupStrategy::min) && interval < *range) + { + range = interval; + value_index = interval_value_index; + } + else if (configuration.lookup_strategy == RangeHashedDictionaryLookupStrategy::max && interval > * range) + { + range = interval; + value_index = interval_value_index; + } + } + else + { + range = interval; + value_index = interval_value_index; + } + + return true; + }); + + if (range.has_value()) + { + default_mask[key_index] = 1; + ++keys_found; + + AttributeType value = attribute_container[value_index]; + + if constexpr (is_nullable) + { + bool is_null = (*attribute.is_value_nullable)[value_index]; + set_value(key_index, value, is_null); + } + else + { + set_value(key_index, value, false); + } + + keys_extractor.rollbackCurrentKey(); + continue; + } + } + + default_mask[key_index] = 0; + + keys_extractor.rollbackCurrentKey(); + } + }); + + query_count.fetch_add(keys_size, std::memory_order_relaxed); + found_count.fetch_add(keys_found, std::memory_order_relaxed); +} + template template void RangeHashedDictionary::getItemsInternalImpl( diff --git a/tests/queries/0_stateless/02950_dictionary_short_circuit.reference b/tests/queries/0_stateless/02950_dictionary_short_circuit.reference index 6720ddf2d09c..3dc2f1a62902 100644 --- a/tests/queries/0_stateless/02950_dictionary_short_circuit.reference +++ b/tests/queries/0_stateless/02950_dictionary_short_circuit.reference @@ -13,3 +13,6 @@ Hashed array dictionary ('zero','zero') \N \N +Range hashed dictionary +\N +\N diff --git a/tests/queries/0_stateless/02950_dictionary_short_circuit.sql b/tests/queries/0_stateless/02950_dictionary_short_circuit.sql index cee5005b32e4..64a30b4c4052 100644 --- a/tests/queries/0_stateless/02950_dictionary_short_circuit.sql +++ b/tests/queries/0_stateless/02950_dictionary_short_circuit.sql @@ -25,7 +25,7 @@ LAYOUT(FLAT()); SELECT 'Flat dictionary'; SELECT dictGetOrDefault('flat_dictionary', ('v1', 'v2'), 0, (intDiv(1, id), intDiv(1, id))) FROM dictionary_source_table; -SELECT dictGetOrDefault('flat_dictionary', 'v2', id+1, intDiv(NULL, id)) +SELECT dictGetOrDefault('flat_dictionary', 'v2', id+1, intDiv(NULL, id)) FROM dictionary_source_table; DROP DICTIONARY flat_dictionary; @@ -45,7 +45,7 @@ LAYOUT(HASHED()); SELECT 'Hashed dictionary'; SELECT dictGetOrDefault('hashed_dictionary', ('v1', 'v2'), 0, (intDiv(1, id), intDiv(1, id))) FROM dictionary_source_table; -SELECT dictGetOrDefault('hashed_dictionary', 'v2', id+1, intDiv(NULL, id)) +SELECT dictGetOrDefault('hashed_dictionary', 'v2', id+1, intDiv(NULL, id)) FROM dictionary_source_table; DROP DICTIONARY hashed_dictionary; @@ -70,37 +70,37 @@ FROM dictionary_source_table; DROP DICTIONARY hashed_array_dictionary; --- DROP TABLE IF EXISTS range_dictionary_source_table; --- CREATE TABLE range_dictionary_source_table --- ( --- id UInt64, --- start Date, --- end Nullable(Date), --- val UInt64 --- ) ENGINE=TinyLog; - --- INSERT INTO range_dictionary_source_table VALUES (0, '2023-01-01', Null, 0), (1, '2023-12-26', '2023-12-27', 1); - --- DROP DICTIONARY IF EXISTS range_hashed_dictionary; --- CREATE DICTIONARY range_hashed_dictionary --- ( --- id UInt64, --- start Date, --- end Nullable(Date), --- val UInt64 --- ) --- PRIMARY KEY id --- SOURCE(CLICKHOUSE(TABLE 'range_dictionary_source_table')) --- LIFETIME(MIN 0 MAX 0) --- LAYOUT(RANGE_HASHED()) --- RANGE(MIN start MAX end); - --- SELECT 'Range hashed dictionary'; --- SELECT dictGetOrDefault('range_hashed_dictionary', 'val', id, toDate('2023-01-02'), toDate('1933-01-22')) --- FROM range_dictionary_source_table; --- DROP DICTIONARY range_hashed_dictionary; - --- DROP TABLE range_dictionary_source_table; +DROP TABLE IF EXISTS range_dictionary_source_table; +CREATE TABLE range_dictionary_source_table +( + id UInt64, + start Date, + end Nullable(Date), + val Nullable(UInt64) +) ENGINE=TinyLog; + +INSERT INTO range_dictionary_source_table VALUES (0, '2023-01-01', Null, Null), (1, '2022-11-09', '2022-12-08', 1); + +DROP DICTIONARY IF EXISTS range_hashed_dictionary; +CREATE DICTIONARY range_hashed_dictionary +( + id UInt64, + start Date, + end Nullable(Date), + val Nullable(UInt64) +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(TABLE 'range_dictionary_source_table')) +LIFETIME(MIN 0 MAX 0) +LAYOUT(RANGE_HASHED()) +RANGE(MIN start MAX end); + +SELECT 'Range hashed dictionary'; +SELECT dictGetOrDefault('range_hashed_dictionary', 'val', id, toDate('2023-01-02'), intDiv(NULL, id)) +FROM range_dictionary_source_table; +DROP DICTIONARY range_hashed_dictionary; + +DROP TABLE range_dictionary_source_table; DROP TABLE dictionary_source_table; \ No newline at end of file