Skip to content

Commit

Permalink
support range hashed short circuit
Browse files Browse the repository at this point in the history
  • Loading branch information
jsc0218 committed Dec 28, 2023
1 parent 17f391a commit 0fc569a
Show file tree
Hide file tree
Showing 4 changed files with 282 additions and 40 deletions.
4 changes: 2 additions & 2 deletions src/Dictionaries/FlatDictionary.cpp
Expand Up @@ -154,7 +154,7 @@ ColumnPtr FlatDictionary::getColumn(

ColumnPtr FlatDictionary::getColumnOrDefaultShortCircuit(
const std::string & attribute_name,
const DataTypePtr & atribute_type,
const DataTypePtr & attribute_type,
const Columns & key_columns,
const DataTypes & key_types [[maybe_unused]],
IColumn::Filter & default_mask) const
Expand All @@ -166,7 +166,7 @@ ColumnPtr FlatDictionary::getColumnOrDefaultShortCircuit(

auto size = ids.size();

const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, atribute_type);
const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, attribute_type);

size_t attribute_index = dict_struct.attribute_name_to_index.find(attribute_name)->second;
const auto & attribute = attributes[attribute_index];
Expand Down
249 changes: 244 additions & 5 deletions src/Dictionaries/RangeHashedDictionary.h
Expand Up @@ -136,6 +136,13 @@ class RangeHashedDictionary final : public IDictionary
const DataTypes & key_types,
const ColumnPtr & default_values_column) const override;

/// Short-circuit counterpart of getColumnOrDefault: no default values column is passed in.
/// The last element of `key_columns` / `key_types` is the range value; the preceding ones form the dictionary key.
/// `default_mask` is resized to the number of keys and filled with 1 for rows that were found
/// in the dictionary and 0 for rows that were not found.
/// The returned column is meant to hold values only for the rows whose mask is 1.
ColumnPtr getColumnOrDefaultShortCircuit(
const std::string & attribute_name,
const DataTypePtr & attribute_type,
const Columns & key_columns,
const DataTypes & key_types,
IColumn::Filter & default_mask) const override;

ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override;

Pipe read(const Names & column_names, size_t max_block_size, size_t num_streams) const override;
Expand Down Expand Up @@ -245,6 +252,13 @@ class RangeHashedDictionary final : public IDictionary
ValueSetter && set_value,
DefaultValueExtractor & default_value_extractor) const;

/// Lookup helper for the short-circuit path.
/// For every key it writes 1 (found) or 0 (not found) into `default_mask`, and calls
/// `set_value(key_index, value, is_null)` only for found keys.
/// The last element of `key_columns` is the range column; the rest are the dictionary key columns.
template <typename AttributeType, bool is_nullable, typename ValueSetter>
void getItemsShortCircuitImpl(
const Attribute & attribute,
const Columns & key_columns,
ValueSetter && set_value,
IColumn::Filter & default_mask) const;

ColumnPtr getColumnInternal(
const std::string & attribute_name,
const DataTypePtr & result_type,
Expand Down Expand Up @@ -456,6 +470,127 @@ ColumnPtr RangeHashedDictionary<dictionary_key_type>::getColumn(
return result;
}

/// Short-circuit lookup of `attribute_name`.
///
/// The last element of `key_columns` / `key_types` is the range value; it is cast to the
/// dictionary range storage type before the lookup. The preceding columns form the key.
///
/// On return `default_mask` has one entry per key: 1 if the key and range were found,
/// 0 if the caller has to supply a default value for that row.
/// The returned column contains values only for the found rows (rows with mask == 1),
/// wrapped into ColumnNullable when the attribute is nullable.
template <DictionaryKeyType dictionary_key_type>
ColumnPtr RangeHashedDictionary<dictionary_key_type>::getColumnOrDefaultShortCircuit(
    const std::string & attribute_name,
    const DataTypePtr & attribute_type,
    const Columns & key_columns,
    const DataTypes & key_types,
    IColumn::Filter & default_mask) const
{
    if (dictionary_key_type == DictionaryKeyType::Complex)
    {
        /// The trailing type belongs to the range column, not to the key.
        auto key_types_copy = key_types;
        key_types_copy.pop_back();
        dict_struct.validateKeyTypes(key_types_copy);
    }

    ColumnPtr result;

    const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, attribute_type);
    const size_t attribute_index = dict_struct.attribute_name_to_index.find(attribute_name)->second;
    const auto & attribute = attributes[attribute_index];

    /// Cast range column to storage type
    Columns modified_key_columns = key_columns;
    const ColumnPtr & range_storage_column = key_columns.back();
    ColumnWithTypeAndName column_to_cast = {range_storage_column->convertToFullColumnIfConst(), key_types.back(), ""};
    modified_key_columns.back() = castColumnAccurate(column_to_cast, dict_struct.range_min->type);

    size_t keys_size = key_columns.front()->size();
    bool is_attribute_nullable = attribute.is_value_nullable.has_value();

    /// The null map is always addressed by row index, so it has one entry per key.
    ColumnUInt8::MutablePtr col_null_map_to;
    ColumnUInt8::Container * vec_null_map_to = nullptr;
    if (is_attribute_nullable)
    {
        col_null_map_to = ColumnUInt8::create(keys_size, false);
        vec_null_map_to = &col_null_map_to->getData();
    }

    /// True when the value column is written by row index (one slot per key) and therefore
    /// still has to be narrowed down to the found rows. Columns that are filled by appending
    /// (arrays, strings) receive a value only for found keys, so they are already compact and
    /// must not be filtered with a mask that has one entry per key.
    bool result_is_row_aligned = false;

    auto type_call = [&](const auto & dictionary_attribute_type)
    {
        using Type = std::decay_t<decltype(dictionary_attribute_type)>;
        using AttributeType = typename Type::AttributeType;
        using ValueType = DictionaryValueType<AttributeType>;
        using ColumnProvider = DictionaryAttributeColumnProvider<AttributeType>;

        auto column = ColumnProvider::getColumn(dictionary_attribute, keys_size);

        if constexpr (std::is_same_v<ValueType, Array>)
        {
            auto * out = column.get();

            getItemsShortCircuitImpl<ValueType, false>(
                attribute,
                modified_key_columns,
                [&](size_t, const Array & value, bool)
                {
                    out->insert(value);
                },
                default_mask);
        }
        else if constexpr (std::is_same_v<ValueType, StringRef>)
        {
            auto * out = column.get();

            if (is_attribute_nullable)
                getItemsShortCircuitImpl<ValueType, true>(
                    attribute,
                    modified_key_columns,
                    [&](size_t row, StringRef value, bool is_null)
                    {
                        (*vec_null_map_to)[row] = is_null;
                        out->insertData(value.data, value.size);
                    },
                    default_mask);
            else
                getItemsShortCircuitImpl<ValueType, false>(
                    attribute,
                    modified_key_columns,
                    [&](size_t, StringRef value, bool)
                    {
                        out->insertData(value.data, value.size);
                    },
                    default_mask);
        }
        else
        {
            auto & out = column->getData();

            if (is_attribute_nullable)
                getItemsShortCircuitImpl<ValueType, true>(
                    attribute,
                    modified_key_columns,
                    [&](size_t row, const auto value, bool is_null)
                    {
                        (*vec_null_map_to)[row] = is_null;
                        out[row] = value;
                    },
                    default_mask);
            else
                getItemsShortCircuitImpl<ValueType, false>(
                    attribute,
                    modified_key_columns,
                    [&](size_t row, const auto value, bool)
                    {
                        out[row] = value;
                    },
                    default_mask);

            result_is_row_aligned = true;
        }

        result = std::move(column);
    };

    callOnDictionaryAttributeType(attribute.type, type_call);

    /// Use the number of rows found by this call as the reserve hint for filter().
    /// The member `found_count` is a statistic accumulated over all queries, so it is not
    /// a meaningful size for a single result column.
    size_t found_rows = 0;
    for (const auto flag : default_mask)
        found_rows += (flag != 0);
    const auto found_rows_hint = static_cast<ssize_t>(found_rows);

    if (result_is_row_aligned)
        result = result->filter(default_mask, found_rows_hint);

    if (is_attribute_nullable)
        result = ColumnNullable::create(result, col_null_map_to->filter(default_mask, found_rows_hint));

    return result;
}

template <DictionaryKeyType dictionary_key_type>
ColumnPtr RangeHashedDictionary<dictionary_key_type>::getColumnInternal(
const std::string & attribute_name,
Expand Down Expand Up @@ -838,11 +973,7 @@ void RangeHashedDictionary<dictionary_key_type>::getItemsImpl(
if constexpr (is_nullable)
{
bool is_null = (*attribute.is_value_nullable)[value_index];

if (!is_null)
set_value(key_index, value, false);
else
set_value(key_index, default_value_extractor[key_index], true);
set_value(key_index, value, is_null);
}
else
{
Expand All @@ -867,6 +998,114 @@ void RangeHashedDictionary<dictionary_key_type>::getItemsImpl(
found_count.fetch_add(keys_found, std::memory_order_relaxed);
}

/// Looks up every key (with its range value) and reports the outcome through `default_mask`:
/// 1 for a found key, 0 for a key that needs a default value.
/// `set_value(row, value, is_null)` is invoked only for found keys.
/// The last column of `key_columns` is the range column; the others form the dictionary key.
template <DictionaryKeyType dictionary_key_type>
template <typename AttributeType, bool is_nullable, typename ValueSetter>
void RangeHashedDictionary<dictionary_key_type>::getItemsShortCircuitImpl(
    const Attribute & attribute,
    const Columns & key_columns,
    ValueSetter && set_value,
    IColumn::Filter & default_mask) const
{
    const auto & attribute_container = std::get<AttributeContainerType<AttributeType>>(attribute.container);

    /// Split the range column off from the key columns.
    const ColumnPtr & range_column = key_columns.back();
    auto only_key_columns = key_columns;
    only_key_columns.pop_back();

    DictionaryKeysArenaHolder<dictionary_key_type> arena_holder;
    DictionaryKeysExtractor<dictionary_key_type> keys_extractor(only_key_columns, arena_holder.getComplexKeyArena());
    const size_t keys_size = keys_extractor.getKeysSize();
    default_mask.resize(keys_size);

    size_t found_total = 0;

    callOnRangeType(dict_struct.range_min->type, [&](const auto & types)
    {
        using Types = std::decay_t<decltype(types)>;
        using RangeColumnType = typename Types::LeftType;
        using RangeStorageType = typename RangeColumnType::ValueType;
        using RangeInterval = Interval<RangeStorageType>;

        const auto * range_column_typed = typeid_cast<const RangeColumnType *>(range_column.get());
        if (!range_column_typed)
            throw Exception(ErrorCodes::TYPE_MISMATCH,
                "Dictionary {} range column type should be equal to {}",
                getFullName(),
                dict_struct.range_min->type->getName());

        const auto & range_values = range_column_typed->getData();
        const auto & key_attribute_container = std::get<KeyAttributeContainerType<RangeStorageType>>(key_attribute.container);

        for (size_t row = 0; row < keys_size; ++row)
        {
            auto key = keys_extractor.extractCurrentKey();
            const auto it = key_attribute_container.find(key);

            /// Best matching interval for this row (if any) and the index of its value.
            std::optional<RangeInterval> chosen_interval;
            size_t chosen_value_index = 0;

            if (it)
            {
                const auto range_value = range_values[row];
                const auto & interval_tree = it->getMapped();

                interval_tree.find(range_value, [&](auto & interval, auto & interval_value_index)
                {
                    /// The first match is always taken; later matches replace it only when
                    /// they are better according to the configured lookup strategy.
                    const bool take_candidate = !chosen_interval
                        || (likely(configuration.lookup_strategy == RangeHashedDictionaryLookupStrategy::min) && interval < *chosen_interval)
                        || (configuration.lookup_strategy == RangeHashedDictionaryLookupStrategy::max && interval > *chosen_interval);

                    if (take_candidate)
                    {
                        chosen_interval = interval;
                        chosen_value_index = interval_value_index;
                    }

                    return true;
                });
            }

            if (chosen_interval.has_value())
            {
                default_mask[row] = 1;
                ++found_total;

                AttributeType value = attribute_container[chosen_value_index];

                if constexpr (is_nullable)
                {
                    bool is_null = (*attribute.is_value_nullable)[chosen_value_index];
                    set_value(row, value, is_null);
                }
                else
                {
                    set_value(row, value, false);
                }
            }
            else
            {
                default_mask[row] = 0;
            }

            keys_extractor.rollbackCurrentKey();
        }
    });

    query_count.fetch_add(keys_size, std::memory_order_relaxed);
    found_count.fetch_add(found_total, std::memory_order_relaxed);
}

template <DictionaryKeyType dictionary_key_type>
template <typename AttributeType, bool is_nullable, typename ValueSetter>
void RangeHashedDictionary<dictionary_key_type>::getItemsInternalImpl(
Expand Down
Expand Up @@ -13,3 +13,6 @@ Hashed array dictionary
('zero','zero')
\N
\N
Range hashed dictionary
\N
\N
66 changes: 33 additions & 33 deletions tests/queries/0_stateless/02950_dictionary_short_circuit.sql
Expand Up @@ -25,7 +25,7 @@ LAYOUT(FLAT());
SELECT 'Flat dictionary';
SELECT dictGetOrDefault('flat_dictionary', ('v1', 'v2'), 0, (intDiv(1, id), intDiv(1, id)))
FROM dictionary_source_table;
SELECT dictGetOrDefault('flat_dictionary', 'v2', id+1, intDiv(NULL, id))
SELECT dictGetOrDefault('flat_dictionary', 'v2', id+1, intDiv(NULL, id))
FROM dictionary_source_table;
DROP DICTIONARY flat_dictionary;

Expand All @@ -45,7 +45,7 @@ LAYOUT(HASHED());
SELECT 'Hashed dictionary';
SELECT dictGetOrDefault('hashed_dictionary', ('v1', 'v2'), 0, (intDiv(1, id), intDiv(1, id)))
FROM dictionary_source_table;
SELECT dictGetOrDefault('hashed_dictionary', 'v2', id+1, intDiv(NULL, id))
SELECT dictGetOrDefault('hashed_dictionary', 'v2', id+1, intDiv(NULL, id))
FROM dictionary_source_table;
DROP DICTIONARY hashed_dictionary;

Expand All @@ -70,37 +70,37 @@ FROM dictionary_source_table;
DROP DICTIONARY hashed_array_dictionary;


-- DROP TABLE IF EXISTS range_dictionary_source_table;
-- CREATE TABLE range_dictionary_source_table
-- (
-- id UInt64,
-- start Date,
-- end Nullable(Date),
-- val UInt64
-- ) ENGINE=TinyLog;

-- INSERT INTO range_dictionary_source_table VALUES (0, '2023-01-01', Null, 0), (1, '2023-12-26', '2023-12-27', 1);

-- DROP DICTIONARY IF EXISTS range_hashed_dictionary;
-- CREATE DICTIONARY range_hashed_dictionary
-- (
-- id UInt64,
-- start Date,
-- end Nullable(Date),
-- val UInt64
-- )
-- PRIMARY KEY id
-- SOURCE(CLICKHOUSE(TABLE 'range_dictionary_source_table'))
-- LIFETIME(MIN 0 MAX 0)
-- LAYOUT(RANGE_HASHED())
-- RANGE(MIN start MAX end);

-- SELECT 'Range hashed dictionary';
-- SELECT dictGetOrDefault('range_hashed_dictionary', 'val', id, toDate('2023-01-02'), toDate('1933-01-22'))
-- FROM range_dictionary_source_table;
-- DROP DICTIONARY range_hashed_dictionary;

-- DROP TABLE range_dictionary_source_table;
-- Short-circuit dictGetOrDefault against a RANGE_HASHED dictionary with a Nullable attribute.
DROP TABLE IF EXISTS range_dictionary_source_table;
CREATE TABLE range_dictionary_source_table
(
id UInt64,
start Date,
end Nullable(Date),
val Nullable(UInt64)
) ENGINE=TinyLog;

-- id 0: range starts at 2023-01-01, has a NULL end and a NULL val.
-- id 1: range 2022-11-09 .. 2022-12-08 with val 1.
INSERT INTO range_dictionary_source_table VALUES (0, '2023-01-01', Null, Null), (1, '2022-11-09', '2022-12-08', 1);

DROP DICTIONARY IF EXISTS range_hashed_dictionary;
CREATE DICTIONARY range_hashed_dictionary
(
id UInt64,
start Date,
end Nullable(Date),
val Nullable(UInt64)
)
PRIMARY KEY id
SOURCE(CLICKHOUSE(TABLE 'range_dictionary_source_table'))
LIFETIME(MIN 0 MAX 0)
LAYOUT(RANGE_HASHED())
RANGE(MIN start MAX end);

SELECT 'Range hashed dictionary';
-- Looks up 'val' at date 2023-01-02 for every id of the source table;
-- the default argument is the expression intDiv(NULL, id).
SELECT dictGetOrDefault('range_hashed_dictionary', 'val', id, toDate('2023-01-02'), intDiv(NULL, id))
FROM range_dictionary_source_table;
DROP DICTIONARY range_hashed_dictionary;

DROP TABLE range_dictionary_source_table;


DROP TABLE dictionary_source_table;

0 comments on commit 0fc569a

Please sign in to comment.