diff --git a/src/Storages/MergeTree/MergeTreeReadPool.cpp b/src/Storages/MergeTree/MergeTreeReadPool.cpp index d9a250e3f7ac..5f51cfc93247 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.cpp +++ b/src/Storages/MergeTree/MergeTreeReadPool.cpp @@ -231,6 +231,19 @@ std::vector MergeTreeReadPool::fillPerPartInfo( auto [required_columns, required_pre_columns, should_reorder] = getReadTaskColumns(data, metadata_snapshot, part.data_part, column_names, prewhere_info, check_columns); + if (predict_block_size_bytes) + { + const auto & required_column_names = required_columns.getNames(); + const auto & required_pre_column_names = required_pre_columns.getNames(); + NameSet complete_column_names(required_column_names.begin(), required_column_names.end()); + complete_column_names.insert(required_pre_column_names.begin(), required_pre_column_names.end()); + + per_part_size_predictor.emplace_back(std::make_unique( + part.data_part, Names(complete_column_names.begin(), complete_column_names.end()), sample_block)); + } + else + per_part_size_predictor.emplace_back(nullptr); + /// will be used to distinguish between PREWHERE and WHERE columns when applying filter const auto & required_column_names = required_columns.getNames(); per_part_column_name_set.emplace_back(required_column_names.begin(), required_column_names.end()); @@ -240,14 +253,6 @@ std::vector MergeTreeReadPool::fillPerPartInfo( per_part_should_reorder.push_back(should_reorder); parts_with_idx.push_back({ part.data_part, part.part_index_in_query }); - - if (predict_block_size_bytes) - { - per_part_size_predictor.emplace_back(std::make_unique( - part.data_part, column_names, sample_block)); - } - else - per_part_size_predictor.emplace_back(nullptr); } return per_part_sum_marks; diff --git a/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.cpp index 027529085765..fc728e4feaba 100644 --- a/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.cpp @@ -93,9 +93,17 @@ try MarkRanges mark_ranges_for_task = { all_mark_ranges.back() }; all_mark_ranges.pop_back(); - auto size_predictor = (preferred_block_size_bytes == 0) - ? nullptr - : std::make_unique(data_part, ordered_names, metadata_snapshot->getSampleBlock()); + std::unique_ptr size_predictor; + if (preferred_block_size_bytes) + { + const auto & required_column_names = task_columns.columns.getNames(); + const auto & required_pre_column_names = task_columns.pre_columns.getNames(); + NameSet complete_column_names(required_column_names.begin(), required_column_names.end()); + complete_column_names.insert(required_pre_column_names.begin(), required_pre_column_names.end()); + + size_predictor = std::make_unique( + data_part, Names(complete_column_names.begin(), complete_column_names.end()), metadata_snapshot->getSampleBlock()); + } task = std::make_unique( data_part, mark_ranges_for_task, part_index_in_query, ordered_names, column_name_set, diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp index 65f9b1eba3b9..9b84a5cac044 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp @@ -71,9 +71,17 @@ try storage, metadata_snapshot, data_part, required_columns, prewhere_info, check_columns); - auto size_predictor = (preferred_block_size_bytes == 0) - ? nullptr - : std::make_unique(data_part, ordered_names, metadata_snapshot->getSampleBlock()); + std::unique_ptr size_predictor; + if (preferred_block_size_bytes) + { + const auto & required_column_names = task_columns.columns.getNames(); + const auto & required_pre_column_names = task_columns.pre_columns.getNames(); + NameSet complete_column_names(required_column_names.begin(), required_column_names.end()); + complete_column_names.insert(required_pre_column_names.begin(), required_pre_column_names.end()); + + size_predictor = std::make_unique( + data_part, Names(complete_column_names.begin(), complete_column_names.end()), metadata_snapshot->getSampleBlock()); + } /// will be used to distinguish between PREWHERE and WHERE columns when applying filter const auto & column_names = task_columns.columns.getNames(); diff --git a/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.reference b/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.reference new file mode 100644 index 000000000000..b70a1cb7c752 --- /dev/null +++ b/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.reference @@ -0,0 +1,3 @@ +8 +4 +4 diff --git a/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.sql b/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.sql new file mode 100644 index 000000000000..7aa1b0112a65 --- /dev/null +++ b/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.sql @@ -0,0 +1,11 @@ +CREATE TABLE test_extract(str String, arr Array(Array(String)) ALIAS extractAllGroupsHorizontal(str, '\\W(\\w+)=("[^"]*?"|[^",}]*)')) ENGINE=MergeTree() PARTITION BY tuple() ORDER BY tuple(); + +INSERT INTO test_extract (str) WITH range(8) as range_arr, arrayMap(x-> concat(toString(x),'Id'), range_arr) as key, arrayMap(x -> rand() % 8, range_arr) as val, arrayStringConcat(arrayMap((x,y) -> concat(x,'=',toString(y)), key, val),',') as str SELECT str FROM numbers(500000); + +ALTER TABLE test_extract ADD COLUMN `15Id` Nullable(UInt16) DEFAULT toUInt16OrNull(arrayFirst((v, k) -> (k = '4Id'), arr[2], arr[1])); + +SELECT uniq(15Id) FROM test_extract SETTINGS max_threads=1, max_memory_usage=100000000; + +SELECT uniq(15Id) FROM test_extract PREWHERE 15Id < 4 SETTINGS max_threads=1, max_memory_usage=100000000; + +SELECT uniq(15Id) FROM test_extract WHERE 15Id < 4 SETTINGS max_threads=1, max_memory_usage=100000000;