Skip to content

Commit

Permalink
Merge pull request #60586 from Avogar/fix-filter-virtual-columns
Browse files Browse the repository at this point in the history
Fix reading from MergeTree with non-deterministic functions in filter
  • Loading branch information
Avogar committed Mar 13, 2024
2 parents 5b27b82 + ecc3044 commit 843e2dc
Show file tree
Hide file tree
Showing 8 changed files with 130 additions and 0 deletions.
Expand Up @@ -18,6 +18,7 @@

#include <Common/logger_useful.h>
#include <Storages/StorageDummy.h>
#include <Storages/VirtualColumnUtils.h>
#include <Planner/PlannerExpressionAnalysis.h>
#include <Interpreters/InterpreterSelectQuery.h>
#include <Interpreters/InterpreterSelectQueryAnalyzer.h>
Expand Down Expand Up @@ -464,6 +465,9 @@ AggregateProjectionCandidates getAggregateProjectionCandidates(
// LOG_TRACE(getLogger("optimizeUseProjections"), "Query DAG: {}", dag.dag->dumpDAG());

candidates.has_filter = dag.filter_node;
/// We can't use minmax projection if filter has non-deterministic functions.
if (dag.filter_node && !VirtualColumnUtils::isDeterministicInScopeOfQuery(dag.filter_node))
can_use_minmax_projection = false;

if (can_use_minmax_projection)
{
Expand Down
2 changes: 2 additions & 0 deletions src/Storages/MergeTree/MergeTreeData.cpp
Expand Up @@ -1071,6 +1071,8 @@ std::optional<UInt64> MergeTreeData::totalRowsByPartitionPredicateImpl(
auto virtual_columns_block = getBlockWithVirtualsForFilter({parts[0]});

auto filter_dag = VirtualColumnUtils::splitFilterDagForAllowedInputs(filter_actions_dag->getOutputs().at(0), nullptr);
if (!filter_dag)
return {};

/// Generate valid expressions for filtering
bool valid = true;
Expand Down
21 changes: 21 additions & 0 deletions src/Storages/VirtualColumnUtils.cpp
Expand Up @@ -238,6 +238,23 @@ static bool canEvaluateSubtree(const ActionsDAG::Node * node, const Block & allo
return true;
}

/// Tells whether the expression rooted at `node` is free of functions that are
/// not deterministic in scope of query.
/// Every child is inspected recursively before the node itself; the first
/// non-deterministic function found makes the whole result false.
bool isDeterministicInScopeOfQuery(const ActionsDAG::Node * node)
{
    /// Arguments first: a single non-deterministic descendant is enough to give up.
    for (const auto * argument : node->children)
        if (!isDeterministicInScopeOfQuery(argument))
            return false;

    /// Only FUNCTION nodes are asked about determinism; any other node type
    /// is accepted once all of its children have been accepted.
    const bool is_function = node->type == ActionsDAG::ActionType::FUNCTION;
    return !is_function || node->function_base->isDeterministicInScopeOfQuery();
}

static const ActionsDAG::Node * splitFilterNodeForAllowedInputs(
const ActionsDAG::Node * node,
const Block * allowed_inputs,
Expand Down Expand Up @@ -313,6 +330,10 @@ static const ActionsDAG::Node * splitFilterNodeForAllowedInputs(
}
}
}
else if (!isDeterministicInScopeOfQuery(node))
{
return nullptr;
}
}

if (allowed_inputs && !canEvaluateSubtree(node, *allowed_inputs))
Expand Down
3 changes: 3 additions & 0 deletions src/Storages/VirtualColumnUtils.h
Expand Up @@ -25,6 +25,9 @@ void filterBlockWithPredicate(const ActionsDAG::Node * predicate, Block & block,
/// Just filters block. Block should contain all the required columns.
void filterBlockWithDAG(ActionsDAGPtr dag, Block & block, ContextPtr context);

/// Recursively checks if all functions used in DAG are deterministic in scope of query.
/// Walks the given node and everything reachable through its children. Returns false as soon as
/// a FUNCTION node whose function is not deterministic in scope of query is found, true otherwise.
/// Nodes of any other type never make the result false by themselves.
bool isDeterministicInScopeOfQuery(const ActionsDAG::Node * node);

/// Extract a part of predicate that can be evaluated using only columns from input_names.
ActionsDAGPtr splitFilterDagForAllowedInputs(const ActionsDAG::Node * predicate, const Block * allowed_inputs);

Expand Down
@@ -0,0 +1,11 @@
0
0
0
0
0
0
0
0
0
0
1
@@ -0,0 +1,6 @@
-- Reads from a MergeTree table with a non-deterministic function (RAND) in WHERE.
create table test (number UInt64) engine=MergeTree order by number;
insert into test select * from numbers(100000000);
-- NOTE(review): the threshold is presumably close to the largest value RAND() can return,
-- so only a small share of rows should pass the filter -- confirm.
-- ignore(number) keeps the printed rows independent of which rows happened to pass;
-- LIMIT 10 caps the output at ten rows.
select ignore(number) from test where RAND() > 4292390314 limit 10;
-- Prints 1 if at least one row of the table passed the random filter, 0 otherwise.
select count() > 0 from test where RAND() > 4292390314;
drop table test;

@@ -0,0 +1,55 @@
-- count
100000 all_10_10_0
100000 all_1_1_0
100000 all_2_2_0
100000 all_3_3_0
100000 all_4_4_0
100000 all_5_5_0
100000 all_6_6_0
100000 all_7_7_0
100000 all_8_8_0
100000 all_9_9_0
-- rand()%2=0:
1 all_10_10_0
1 all_1_1_0
1 all_2_2_0
1 all_3_3_0
1 all_4_4_0
1 all_5_5_0
1 all_6_6_0
1 all_7_7_0
1 all_8_8_0
1 all_9_9_0
-- optimize_use_implicit_projections=0
1 all_10_10_0
1 all_1_1_0
1 all_2_2_0
1 all_3_3_0
1 all_4_4_0
1 all_5_5_0
1 all_6_6_0
1 all_7_7_0
1 all_8_8_0
1 all_9_9_0
-- optimize_trivial_count_query=0
1 all_10_10_0
1 all_1_1_0
1 all_2_2_0
1 all_3_3_0
1 all_4_4_0
1 all_5_5_0
1 all_6_6_0
1 all_7_7_0
1 all_8_8_0
1 all_9_9_0
-- optimize_trivial_count_query=0, optimize_use_implicit_projections=0
1 all_10_10_0
1 all_1_1_0
1 all_2_2_0
1 all_3_3_0
1 all_4_4_0
1 all_5_5_0
1 all_6_6_0
1 all_7_7_0
1 all_8_8_0
1 all_9_9_0
@@ -0,0 +1,28 @@
-- Checks that a rand()-based WHERE filter is applied to the rows of every part,
-- under each combination of optimize_trivial_count_query / optimize_use_implicit_projections.
create table test (number UInt64) engine=MergeTree order by number;
-- Merges are stopped so that, presumably, each INSERT below stays a separate part
-- (the queries group by _part) -- confirm.
system stop merges test;
-- Ten identical inserts of 100000 rows each.
INSERT INTO test select number from numbers(100000);
INSERT INTO test select number from numbers(100000);
INSERT INTO test select number from numbers(100000);
INSERT INTO test select number from numbers(100000);
INSERT INTO test select number from numbers(100000);
INSERT INTO test select number from numbers(100000);
INSERT INTO test select number from numbers(100000);
INSERT INTO test select number from numbers(100000);
INSERT INTO test select number from numbers(100000);
INSERT INTO test select number from numbers(100000);

-- Baseline: unfiltered row count per part.
select '-- count';
SELECT count(), _part FROM test GROUP BY _part ORDER BY _part;

-- Each query below prints 1 for a part only when some, but not all, of its rows passed the filter,
-- i.e. the random predicate neither dropped nor kept the part as a whole.
-- NOTE(review): the label says %2=0 while the query filters on %2=1; the label is emitted as output,
-- so it is left unchanged here.
-- NOTE(review): rand(1)..rand(4) presumably only make each rand call distinct -- confirm.
select '-- rand()%2=0:';
SELECT count() > 0 AND count() < 100000, _part FROM test WHERE rand(1)%2=1 GROUP BY _part ORDER BY _part;

-- Same check with implicit projections disabled.
select '-- optimize_use_implicit_projections=0';
SELECT count() > 0 AND count() < 100000, _part FROM test WHERE rand(2)%2=1 GROUP BY _part ORDER BY _part settings optimize_use_implicit_projections=0;

-- Same check with the trivial count optimization disabled.
select '-- optimize_trivial_count_query=0';
SELECT count() > 0 AND count() < 100000, _part FROM test WHERE rand(3)%2=1 GROUP BY _part ORDER BY _part settings optimize_trivial_count_query=0;

-- Same check with both settings disabled.
select '-- optimize_trivial_count_query=0, optimize_use_implicit_projections=0';
SELECT count() > 0 AND count() < 100000, _part FROM test WHERE rand(4)%2=1 GROUP BY _part ORDER BY _part settings optimize_trivial_count_query=0,optimize_use_implicit_projections=0;

0 comments on commit 843e2dc

Please sign in to comment.