Skip to content

Commit

Permalink
Merge pull request #51958 from ClickHouse/prewhere_and_pk_columns
Browse files Browse the repository at this point in the history
Move conditions with columns from PK to the end of PREWHERE chain
  • Loading branch information
robot-clickhouse committed Jul 21, 2023
2 parents 69b897d + 0c2ea94 commit 9280f4a
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 4 deletions.
1 change: 1 addition & 0 deletions src/Core/Settings.h
Expand Up @@ -129,6 +129,7 @@ class IColumn;
M(Bool, optimize_move_to_prewhere_if_final, false, "If query has `FINAL`, the optimization `move_to_prewhere` is not always correct and it is enabled only if both settings `optimize_move_to_prewhere` and `optimize_move_to_prewhere_if_final` are turned on", 0) \
M(Bool, move_all_conditions_to_prewhere, true, "Move all viable conditions from WHERE to PREWHERE", 0) \
M(Bool, enable_multiple_prewhere_read_steps, true, "Move more conditions from WHERE to PREWHERE and do reads from disk and filtering in multiple steps if there are multiple conditions combined with AND", 0) \
M(Bool, move_primary_key_columns_to_end_of_prewhere, true, "Move PREWHERE conditions containing primary key columns to the end of AND chain. It is likely that these conditions are taken into account during primary key analysis and thus will not contribute a lot to PREWHERE filtering.", 0) \
\
M(UInt64, alter_sync, 1, "Wait for actions to manipulate the partitions. 0 - do not wait, 1 - wait for execution only of itself, 2 - wait for everyone.", 0) ALIAS(replication_alter_partitions_sync) \
M(Int64, replication_wait_for_inactive_replica_timeout, 120, "Wait for inactive replica to execute ALTER/OPTIMIZE. Time in seconds, 0 - do not wait, negative - wait for unlimited time.", 0) \
Expand Down
38 changes: 38 additions & 0 deletions src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp
Expand Up @@ -22,6 +22,33 @@ namespace DB
/// This is used to assume that condition is likely to have good selectivity.
static constexpr auto threshold = 2;

/// Builds a lookup table that maps every name in `names` to its index within `names`.
/// Names are assigned in order, so a repeated name ends up with the index of its last
/// occurrence (assuming NameToIndexMap has the usual map `operator[]` semantics).
static NameToIndexMap fillNamesPositions(const Names & names)
{
    NameToIndexMap index_by_name;

    size_t index = 0;
    for (const auto & current_name : names)
    {
        index_by_name[current_name] = index;
        ++index;
    }

    return index_by_name;
}

/// Returns the smallest primary key position among the columns used by a condition.
/// Columns that are not found in `primary_key_positions` are skipped. If none of the columns
/// is found, the initial value `std::numeric_limits<Int64>::max() - 1` is returned unchanged.
static Int64 findMinPosition(const NameSet & condition_table_columns, const NameToIndexMap & primary_key_positions)
{
    Int64 smallest = std::numeric_limits<Int64>::max() - 1;

    for (const auto & column_name : condition_table_columns)
    {
        const auto found = primary_key_positions.find(column_name);
        if (found == primary_key_positions.end())
            continue;

        /// Positions are stored as unsigned indices; compare them as signed Int64.
        const auto position = static_cast<Int64>(found->second);
        if (position < smallest)
            smallest = position;
    }

    return smallest;
}

MergeTreeWhereOptimizer::MergeTreeWhereOptimizer(
std::unordered_map<std::string, UInt64> column_sizes_,
Expand All @@ -35,6 +62,7 @@ MergeTreeWhereOptimizer::MergeTreeWhereOptimizer(
, supported_columns{supported_columns_}
, sorting_key_names{NameSet(
metadata_snapshot->getSortingKey().column_names.begin(), metadata_snapshot->getSortingKey().column_names.end())}
, primary_key_names_positions(fillNamesPositions(metadata_snapshot->getPrimaryKey().column_names))
, log{log_}
, column_sizes{std::move(column_sizes_)}
{
Expand All @@ -60,6 +88,7 @@ void MergeTreeWhereOptimizer::optimize(SelectQueryInfo & select_query_info, cons
where_optimizer_context.context = context;
where_optimizer_context.array_joined_names = determineArrayJoinedNames(select);
where_optimizer_context.move_all_conditions_to_prewhere = context->getSettingsRef().move_all_conditions_to_prewhere;
where_optimizer_context.move_primary_key_columns_to_end_of_prewhere = context->getSettingsRef().move_primary_key_columns_to_end_of_prewhere;
where_optimizer_context.is_final = select.final();

RPNBuilderTreeContext tree_context(context, std::move(block_with_constants), {} /*prepared_sets*/);
Expand Down Expand Up @@ -89,6 +118,7 @@ std::optional<MergeTreeWhereOptimizer::FilterActionsOptimizeResult> MergeTreeWhe
where_optimizer_context.context = context;
where_optimizer_context.array_joined_names = {};
where_optimizer_context.move_all_conditions_to_prewhere = context->getSettingsRef().move_all_conditions_to_prewhere;
where_optimizer_context.move_primary_key_columns_to_end_of_prewhere = context->getSettingsRef().move_primary_key_columns_to_end_of_prewhere;
where_optimizer_context.is_final = is_final;

RPNBuilderTreeContext tree_context(context);
Expand Down Expand Up @@ -234,6 +264,14 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const RPNBuilderTree
if (cond.viable)
cond.good = isConditionGood(node, table_columns);

if (where_optimizer_context.move_primary_key_columns_to_end_of_prewhere)
{
/// Consider all conditions good with this setting enabled.
cond.good = cond.viable;
/// Find min position in PK of any column that is used in this condition.
cond.min_position_in_primary_key = findMinPosition(cond.table_columns, primary_key_names_positions);
}

res.emplace_back(std::move(cond));
}
}
Expand Down
9 changes: 8 additions & 1 deletion src/Storages/MergeTree/MergeTreeWhereOptimizer.h
Expand Up @@ -72,9 +72,14 @@ class MergeTreeWhereOptimizer : private boost::noncopyable
/// Does the condition presumably have good selectivity?
bool good = false;

/// Does the condition contain a primary key column?
/// If so, it is better to move it towards the end of the PREWHERE chain, ordered by the minimal position in the PK
/// of any column in this condition, because such a condition is more likely to be already satisfied by PK analysis.
Int64 min_position_in_primary_key = std::numeric_limits<Int64>::max() - 1;

auto tuple() const
{
return std::make_tuple(!viable, !good, columns_size, table_columns.size());
return std::make_tuple(!viable, !good, -min_position_in_primary_key, columns_size, table_columns.size());
}

/// Is condition a better candidate for moving to PREWHERE?
Expand All @@ -91,6 +96,7 @@ class MergeTreeWhereOptimizer : private boost::noncopyable
ContextPtr context;
NameSet array_joined_names;
bool move_all_conditions_to_prewhere = false;
bool move_primary_key_columns_to_end_of_prewhere = false;
bool is_final = false;
};

Expand Down Expand Up @@ -141,6 +147,7 @@ class MergeTreeWhereOptimizer : private boost::noncopyable
const Names queried_columns;
const std::optional<NameSet> supported_columns;
const NameSet sorting_key_names;
const NameToIndexMap primary_key_names_positions;
Poco::Logger * log;
std::unordered_map<std::string, UInt64> column_sizes;
UInt64 total_size_of_queried_columns = 0;
Expand Down
@@ -1,6 +1,6 @@
SELECT count()
FROM t_02156_merge1
PREWHERE (k = 3) AND notEmpty(v)
PREWHERE notEmpty(v) AND (k = 3)
2
SELECT count()
FROM t_02156_merge2
Expand Down
5 changes: 3 additions & 2 deletions tests/queries/0_stateless/02156_storage_merge_prewhere.sql
@@ -1,4 +1,5 @@
SET optimize_move_to_prewhere = 1;
SET enable_multiple_prewhere_read_steps = 1;

DROP TABLE IF EXISTS t_02156_mt1;
DROP TABLE IF EXISTS t_02156_mt2;
Expand All @@ -8,8 +9,8 @@ DROP TABLE IF EXISTS t_02156_merge1;
DROP TABLE IF EXISTS t_02156_merge2;
DROP TABLE IF EXISTS t_02156_merge3;

CREATE TABLE t_02156_mt1 (k UInt32, v String) ENGINE = MergeTree ORDER BY k;
CREATE TABLE t_02156_mt2 (k UInt32, v String) ENGINE = MergeTree ORDER BY k;
CREATE TABLE t_02156_mt1 (k UInt32, v String) ENGINE = MergeTree ORDER BY k SETTINGS min_bytes_for_wide_part=0;
CREATE TABLE t_02156_mt2 (k UInt32, v String) ENGINE = MergeTree ORDER BY k SETTINGS min_bytes_for_wide_part=0;
CREATE TABLE t_02156_log (k UInt32, v String) ENGINE = Log;

CREATE TABLE t_02156_dist (k UInt32, v String) ENGINE = Distributed(test_shard_localhost, currentDatabase(), t_02156_mt1);
Expand Down

0 comments on commit 9280f4a

Please sign in to comment.