diff --git a/src/Interpreters/misc.h b/src/Interpreters/misc.h index b77fc5aee1ec..5526b85be82c 100644 --- a/src/Interpreters/misc.h +++ b/src/Interpreters/misc.h @@ -16,9 +16,14 @@ inline bool functionIsInOperator(const std::string & name) return name == "in" || name == "notIn" || name == "nullIn" || name == "notNullIn"; } +inline bool functionIsGlobalInOperator(const std::string & name) +{ + return name == "globalIn" || name == "globalNotIn" || name == "globalNullIn" || name == "globalNotNullIn"; +} + inline bool functionIsInOrGlobalInOperator(const std::string & name) { - return functionIsInOperator(name) || name == "globalIn" || name == "globalNotIn" || name == "globalNullIn" || name == "globalNotNullIn"; + return functionIsInOperator(name) || functionIsGlobalInOperator(name); } inline bool functionIsLikeOperator(const std::string & name) diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index b8408213d57c..84c495b4bd3d 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -2910,6 +2910,23 @@ void ReadFromMergeTree::updatePrewhereInfo(const PrewhereInfoPtr & prewhere_info { query_info.prewhere_info = prewhere_info_value; + /// Build sets for the new PREWHERE synchronously. PREWHERE is evaluated at the + /// storage level during data reading, before the pipeline-level CreatingSetsStep + /// has a chance to execute. If a condition with IN (subquery) was moved to PREWHERE + /// by optimizePrewhere after applyFilters already ran, the set would remain unbuilt + /// and cause a "Not-ready Set" error. + /// We must skip sets used in GLOBAL IN functions because ReadFromRemote needs to + /// attach external tables to those sets before they are built. Building them here + /// would cause "Trying to attach external table to a ready set" errors. 
+ /// Only build sets when applyFilters has already been called for this step (indicated by + /// `indexes` being populated). The plan built by `considerEnablingParallelReplicas` for + /// statistics collection runs `optimizePrewhere` without `optimizePrimaryKeyConditionAndLimit`, + /// so `applyFilters` is skipped there and sets must not be built — the original plan's + /// `CreatingSetsStep` (added later via `addStepsToBuildSets`) handles them. Building here + /// would re-execute the IN-subquery and double-count its rows against `max_rows_to_read`. + if (query_info.prewhere_info && indexes.has_value()) + VirtualColumnUtils::buildSetsForDAGExcludingGlobalIn(query_info.prewhere_info->prewhere_actions, context); + output_header = std::make_shared(MergeTreeSelectProcessor::transformHeader( storage_snapshot->getSampleBlockForColumns(all_column_names), query_info.row_level_filter, diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index a9630d4a5fad..73f498c81e7a 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -602,6 +602,13 @@ bool MergeTreeWhereOptimizer::cannotBeMoved(const RPNBuilderTreeNode & node, con if (function_name == "arrayJoin") return true; + /// Disallow GLOBAL IN conditions from being moved to PREWHERE. + /// GLOBAL IN sets are populated via external tables attached by `ReadFromRemote`; + /// they cannot be built synchronously during PREWHERE evaluation, which runs + /// before the pipeline-level `CreatingSetsStep` has a chance to execute. 
+ if (functionIsGlobalInOperator(function_name)) + return true; + size_t arguments_size = function_node.getArgumentsSize(); for (size_t i = 0; i < arguments_size; ++i) { diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 4c94916cbab4..2e4783ef654a 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -1,5 +1,6 @@ #include #include +#include #include @@ -7,6 +8,7 @@ #include #include +#include #include #include #include @@ -97,6 +99,57 @@ void buildSetsForDAG(const ActionsDAG & dag, const ContextPtr & context) buildSetsForDagImpl(dag, context, /* ordered = */ false); } +void buildSetsForDAGExcludingGlobalIn(const ActionsDAG & dag, const ContextPtr & context) +{ + /// Collect ColumnSet nodes that are arguments to GLOBAL IN functions (globalIn, globalNotIn, globalNullIn, globalNotNullIn). + /// These sets must NOT be built synchronously here because ReadFromRemote needs to + /// attach external tables to them first (via setExternalTable). Building them early + /// would make the set "created" without explicit elements, causing a LOGICAL_ERROR. 
+ std::unordered_set global_in_set_nodes; + for (const auto & node : dag.getNodes()) + { + if (node.type == ActionsDAG::ActionType::FUNCTION && node.function_base) + { + auto name = node.function_base->getName(); + if (functionIsGlobalInOperator(name)) + { + /// The set is the second argument (index 1) + if (node.children.size() >= 2) + global_in_set_nodes.insert(node.children[1]); + } + } + } + + for (const auto & node : dag.getNodes()) + { + if (node.type == ActionsDAG::ActionType::COLUMN && !global_in_set_nodes.contains(&node)) + { + const ColumnSet * column_set = checkAndGetColumnConstData(node.column.get()); + if (!column_set) + column_set = checkAndGetColumn(node.column.get()); + + if (column_set) + { + auto future_set = column_set->getData(); + if (!future_set->get()) + { + if (auto * set_from_subquery = typeid_cast(future_set.get())) + { + /// Prefer ordered build so that the set retains explicit elements, + /// which `KeyCondition` and skip-index analysis require to use the set + /// for primary-key / skip-index filtering (via `buildOrderedSetInplace`). + /// If `use_index_for_in_with_subqueries` is disabled, the ordered build + /// returns `nullptr` without building; fall back to unordered so the set + /// is still ready when PREWHERE is evaluated at read time. + if (!set_from_subquery->buildOrderedSetInplace(context)) + set_from_subquery->buildSetInplace(context); + } + } + } + } + } +} + void buildOrderedSetsForDAG(const ActionsDAG & dag, const ContextPtr & context) { buildSetsForDagImpl(dag, context, /* ordered = */ true); diff --git a/src/Storages/VirtualColumnUtils.h b/src/Storages/VirtualColumnUtils.h index fa02acf5025a..8f16d2782285 100644 --- a/src/Storages/VirtualColumnUtils.h +++ b/src/Storages/VirtualColumnUtils.h @@ -51,6 +51,11 @@ void filterBlockWithExpression(const ExpressionActionsPtr & actions, Block & blo /// Builds sets used by ActionsDAG inplace. 
void buildSetsForDAG(const ActionsDAG & dag, const ContextPtr & context); +/// Builds sets used by ActionsDAG inplace, but skips sets that are arguments to +/// GLOBAL IN functions (globalIn, globalNotIn, globalNullIn, globalNotNullIn). +/// Those sets need external tables set up by ReadFromRemote before they can be built. +void buildSetsForDAGExcludingGlobalIn(const ActionsDAG & dag, const ContextPtr & context); + /// Builds ordered sets used by ActionsDAG inplace. void buildOrderedSetsForDAG(const ActionsDAG & dag, const ContextPtr & context); diff --git a/tests/queries/0_stateless/02967_parallel_replicas_joins_and_analyzer.reference b/tests/queries/0_stateless/02967_parallel_replicas_joins_and_analyzer.reference index 6be4160043ac..72f729ad0a4d 100644 --- a/tests/queries/0_stateless/02967_parallel_replicas_joins_and_analyzer.reference +++ b/tests/queries/0_stateless/02967_parallel_replicas_joins_and_analyzer.reference @@ -407,10 +407,6 @@ Expression Expression Expression ReadFromMergeTree - CreatingSet - Expression - Filter - ReadFromSystemNumbers Expression Expression ReadFromMemoryStorage @@ -466,10 +462,6 @@ Expression Expression Expression ReadFromMergeTree - CreatingSet - Expression - Filter - ReadFromSystemNumbers Expression Union Expression @@ -894,10 +886,6 @@ Expression Expression Expression ReadFromMergeTree - CreatingSet - Expression - Filter - ReadFromSystemNumbers Expression Expression Expression @@ -955,10 +943,6 @@ Expression Expression Expression ReadFromMergeTree - CreatingSet - Expression - Filter - ReadFromSystemNumbers Expression Union Expression diff --git a/tests/queries/0_stateless/03302_analyzer_distributed_filter_push_down.reference b/tests/queries/0_stateless/03302_analyzer_distributed_filter_push_down.reference index 0c36f390a029..eb7ae7a95e86 100644 --- a/tests/queries/0_stateless/03302_analyzer_distributed_filter_push_down.reference +++ b/tests/queries/0_stateless/03302_analyzer_distributed_filter_push_down.reference @@ -312,7 +312,7 
@@ CreatingSets (Create sets before main query execution) ReadFromRemote (Read from remote replica) CreatingSets (Create sets before main query execution) Expression ((Project names + Projection)) - Expression ((WHERE + Change column names to column identifiers)) + Filter ((WHERE + Change column names to column identifiers)) ReadFromMergeTree (default.tab0) Indexes: PrimaryKey @@ -331,7 +331,7 @@ CreatingSets (Create sets before main query execution) Expression ((Project names + Projection)) Aggregating Expression (Before GROUP BY) - Expression ((WHERE + (Change column names to column identifiers + (Project names + (Projection + Change column names to column identifiers))))) + Filter ((WHERE + (Change column names to column identifiers + (Project names + (Projection + Change column names to column identifiers))))) ReadFromMergeTree (default.tab0) Indexes: PrimaryKey @@ -351,7 +351,7 @@ CreatingSets (Create sets before main query execution) ReadFromRemote (Read from remote replica) CreatingSets (Create sets before main query execution) Expression ((Project names + Projection)) - Expression ((WHERE + Change column names to column identifiers)) + Filter ((WHERE + Change column names to column identifiers)) ReadFromMergeTree (default.tab0) Indexes: PrimaryKey @@ -364,7 +364,7 @@ CreatingSets (Create sets before main query execution) Ranges: 0 CreatingSets (Create sets before main query execution) Expression ((Project names + Projection)) - Expression ((WHERE + Change column names to column identifiers)) + Filter ((WHERE + Change column names to column identifiers)) ReadFromMergeTree (default.tab0) Indexes: PrimaryKey @@ -384,7 +384,7 @@ CreatingSets (Create sets before main query execution) Aggregating Union Expression (Before GROUP BY) - Expression ((WHERE + (Change column names to column identifiers + (Project names + (Projection + Change column names to column identifiers))))) + Filter ((WHERE + (Change column names to column identifiers + (Project names + 
(Projection + Change column names to column identifiers))))) ReadFromMergeTree (default.tab0) Indexes: PrimaryKey @@ -400,7 +400,7 @@ CreatingSets (Create sets before main query execution) ReadFromRemote (Read from remote replica) CreatingSets (Create sets before main query execution) Expression ((Project names + Projection)) - Expression ((WHERE + Change column names to column identifiers)) + Filter ((WHERE + Change column names to column identifiers)) ReadFromMergeTree (default.tab0) Indexes: PrimaryKey @@ -417,7 +417,7 @@ CreatingSets (Create sets before main query execution) Aggregating Union Expression (Before GROUP BY) - Expression ((WHERE + (Change column names to column identifiers + (Project names + (Projection + Change column names to column identifiers))))) + Filter ((WHERE + (Change column names to column identifiers + (Project names + (Projection + Change column names to column identifiers))))) ReadFromMergeTree (default.tab0) Indexes: PrimaryKey @@ -433,7 +433,7 @@ CreatingSets (Create sets before main query execution) ReadFromRemote (Read from remote replica) CreatingSets (Create sets before main query execution) Expression ((Project names + Projection)) - Expression ((WHERE + Change column names to column identifiers)) + Filter ((WHERE + Change column names to column identifiers)) ReadFromMergeTree (default.tab0) Indexes: PrimaryKey @@ -446,7 +446,7 @@ CreatingSets (Create sets before main query execution) Ranges: 1 CreatingSets (Create sets before main query execution) Expression ((Project names + Projection)) - Expression ((WHERE + Change column names to column identifiers)) + Filter ((WHERE + Change column names to column identifiers)) ReadFromMergeTree (default.tab0) Indexes: PrimaryKey @@ -463,7 +463,7 @@ CreatingSets (Create sets before main query execution) Aggregating Union Expression (Before GROUP BY) - Expression ((WHERE + (Change column names to column identifiers + (Project names + (Projection + Change column names to column 
identifiers))))) + Filter ((WHERE + (Change column names to column identifiers + (Project names + (Projection + Change column names to column identifiers))))) ReadFromMergeTree (default.tab0) Indexes: PrimaryKey @@ -476,7 +476,7 @@ CreatingSets (Create sets before main query execution) ReadFromRemote (Read from remote replica) CreatingSets (Create sets before main query execution) Expression ((Project names + Projection)) - Expression ((WHERE + Change column names to column identifiers)) + Filter ((WHERE + Change column names to column identifiers)) ReadFromMergeTree (default.tab0) Indexes: PrimaryKey @@ -489,7 +489,7 @@ CreatingSets (Create sets before main query execution) ReadFromMemoryStorage CreatingSets (Create sets before main query execution) Expression ((Project names + Projection)) - Expression ((WHERE + Change column names to column identifiers)) + Filter ((WHERE + Change column names to column identifiers)) ReadFromMergeTree (default.tab0) Indexes: PrimaryKey diff --git a/tests/queries/0_stateless/03457_move_global_in_to_prewhere.reference b/tests/queries/0_stateless/03457_move_global_in_to_prewhere.reference index 6b3f681fc90b..f7486d4b5aca 100644 --- a/tests/queries/0_stateless/03457_move_global_in_to_prewhere.reference +++ b/tests/queries/0_stateless/03457_move_global_in_to_prewhere.reference @@ -1,17 +1,13 @@ 3 2048 23 2048 -Prewhere filter column: globalIn(key, ) (removed) 3 2048 -Prewhere filter column: globalIn(key, ) (removed) 0 2048 1 2048 2 2048 4 2048 5 2048 -Prewhere filter column: globalNotIn(key, ) (removed) 0 2048 1 2048 2 2048 4 2048 5 2048 -Prewhere filter column: globalNotIn(key, ) (removed) diff --git a/tests/queries/0_stateless/03620_analyzer_distributed_global_in.reference b/tests/queries/0_stateless/03620_analyzer_distributed_global_in.reference index 070fc644d1c5..689e784cc037 100644 --- a/tests/queries/0_stateless/03620_analyzer_distributed_global_in.reference +++ 
b/tests/queries/0_stateless/03620_analyzer_distributed_global_in.reference @@ -22,7 +22,7 @@ CreatingSets (Create sets before main query execution) Expression ((Project names + Projection)) Aggregating Expression (Before GROUP BY) - Expression ((WHERE + (Change column names to column identifiers + (Project names + (Projection + Change column names to column identifiers))))) + Filter ((WHERE + (Change column names to column identifiers + (Project names + (Projection + Change column names to column identifiers))))) ReadFromMergeTree (default.tab0) Indexes: PrimaryKey @@ -44,7 +44,7 @@ CreatingSets (Create sets before main query execution) ReadFromRemote (Read from remote replica) CreatingSets (Create sets before main query execution) Expression ((Project names + Projection)) - Expression ((WHERE + Change column names to column identifiers)) + Filter ((WHERE + Change column names to column identifiers)) ReadFromMergeTree (default.tab0) Indexes: PrimaryKey diff --git a/tests/queries/0_stateless/04053_in_subquery_prewhere_not_ready_set.reference b/tests/queries/0_stateless/04053_in_subquery_prewhere_not_ready_set.reference new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/queries/0_stateless/04053_in_subquery_prewhere_not_ready_set.sql b/tests/queries/0_stateless/04053_in_subquery_prewhere_not_ready_set.sql new file mode 100644 index 000000000000..5c8ce07bdd0b --- /dev/null +++ b/tests/queries/0_stateless/04053_in_subquery_prewhere_not_ready_set.sql @@ -0,0 +1,27 @@ +-- Regression test for "Not-ready Set" error when IN (subquery) condition +-- gets moved to PREWHERE by optimizePrewhere after applyFilters already ran. 
+-- https://github.com/ClickHouse/ClickHouse/issues/100318 + +CREATE TABLE t_100318_log (v0 UInt32) ENGINE = Log; +CREATE TABLE t_100318_mt (v0 UInt32, v1 UInt32, v2 DateTime, PRIMARY KEY(v1)) ENGINE = SummingMergeTree; +CREATE TABLE t_100318_rmt (v0 UInt32, v1 UInt32, PRIMARY KEY(v0)) ENGINE = ReplacingMergeTree; + +INSERT INTO t_100318_mt VALUES (13, 23000, '2100-01-05'); +INSERT INTO t_100318_mt VALUES (16, 26000, '2066-10-07'); +INSERT INTO t_100318_rmt VALUES (91, 101000); + +SELECT 1 FROM (SELECT 1 FROM t_100318_log) +WHERE EXISTS ( + SELECT 1 + UNION ALL + SELECT ref_4.v0 FROM ( + SELECT row_number() OVER (PARTITION BY t_100318_mt.v0) AS c_1 + FROM t_100318_mt + WHERE t_100318_mt.v2 IN (SELECT 1 FROM t_100318_log) + ) AS ref_3 + INNER JOIN t_100318_rmt AS ref_4 ON (ref_3.c_1 = ref_4.v0) +); + +DROP TABLE t_100318_log; +DROP TABLE t_100318_mt; +DROP TABLE t_100318_rmt; diff --git a/tests/queries/0_stateless/04060_explain_pretty_joins_sets.reference b/tests/queries/0_stateless/04060_explain_pretty_joins_sets.reference index e11392145b2a..c93f28b61e4f 100644 --- a/tests/queries/0_stateless/04060_explain_pretty_joins_sets.reference +++ b/tests/queries/0_stateless/04060_explain_pretty_joins_sets.reference @@ -84,18 +84,12 @@ CreatingSets (Create sets before main query execution) Output: a, b CreatingSets (Create sets before main query execution) -├──ReadFromMergeTree (default.t1) -│ Read type: Default -│ Parts: 1 | Granules: 1 -│ Output: a, b -│ Prewhere filter -│ Prewhere filter column: b IN subquery1 AND a IN subquery2 -└──CreatingSet (Create set for subquery) - │ Set: subquery1 - └──ReadFromMergeTree (default.t2) - Read type: Default - Parts: 1 | Granules: 1 - Output: y +└──ReadFromMergeTree (default.t1) + Read type: Default + Parts: 1 | Granules: 1 + Output: a, b + Prewhere filter + Prewhere filter column: b IN subquery1 AND a IN subquery2 --- IN with Set engine --- Output: a, b diff --git 
a/tests/queries/0_stateless/04070_global_in_subquery_prewhere.reference b/tests/queries/0_stateless/04070_global_in_subquery_prewhere.reference new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/queries/0_stateless/04070_global_in_subquery_prewhere.sql b/tests/queries/0_stateless/04070_global_in_subquery_prewhere.sql new file mode 100644 index 000000000000..3010240f1315 --- /dev/null +++ b/tests/queries/0_stateless/04070_global_in_subquery_prewhere.sql @@ -0,0 +1,28 @@ +-- Regression test: GLOBAL IN (subquery) must not be moved to PREWHERE, +-- because GLOBAL IN sets are populated via external tables attached by ReadFromRemote +-- and cannot be built synchronously during PREWHERE evaluation. +-- Also covers null-aware variants (globalNullIn/globalNotNullIn) via transform_null_in. +-- https://github.com/ClickHouse/ClickHouse/pull/100375 + +SET transform_null_in = 1; + +CREATE TABLE t_100375_mt (v0 UInt32, v1 UInt32, v2 Nullable(DateTime), PRIMARY KEY(v1)) ENGINE = SummingMergeTree; +CREATE TABLE t_100375_log (v0 UInt32) ENGINE = Log; + +INSERT INTO t_100375_mt VALUES (13, 23000, '2100-01-05'); +INSERT INTO t_100375_mt VALUES (16, 26000, '2066-10-07'); + +SELECT 1 FROM (SELECT 1 FROM t_100375_log) +WHERE EXISTS ( + SELECT 1 + UNION ALL + SELECT ref_4.v0 FROM ( + SELECT row_number() OVER (PARTITION BY t_100375_mt.v0) AS c_1 + FROM t_100375_mt + WHERE t_100375_mt.v2 GLOBAL IN (SELECT 1 FROM t_100375_log) + ) AS ref_3 + INNER JOIN t_100375_mt AS ref_4 ON (ref_3.c_1 = ref_4.v0) +); + +DROP TABLE t_100375_mt; +DROP TABLE t_100375_log;