Merge pull request #59701 from ClickHouse/revert-59669-backport/24.1/59650

Revert "Backport #59650 to 24.1: MergeTree FINAL optimization diagnostics and settings"
Algunenano committed Feb 7, 2024
2 parents 1c34aa0 + 5e1ab25 commit 7a7b61b
Showing 5 changed files with 11 additions and 42 deletions.
2 changes: 0 additions & 2 deletions src/Core/Settings.h
@@ -215,8 +215,6 @@ class IColumn;
M(UInt64, merge_tree_max_rows_to_use_cache, (128 * 8192), "The maximum number of rows per request, to use the cache of uncompressed data. If the request is large, the cache is not used. (For large queries not to flush out the cache.)", 0) \
M(UInt64, merge_tree_max_bytes_to_use_cache, (192 * 10 * 1024 * 1024), "The maximum number of bytes per request, to use the cache of uncompressed data. If the request is large, the cache is not used. (For large queries not to flush out the cache.)", 0) \
M(Bool, do_not_merge_across_partitions_select_final, false, "Merge parts only in one partition in select final", 0) \
- M(Bool, split_parts_ranges_into_intersecting_and_non_intersecting_final, true, "Split parts ranges into intersecting and non intersecting during FINAL optimization", 0) \
- M(Bool, split_intersecting_parts_ranges_into_layers_final, true, "Split intersecting parts ranges into layers during FINAL optimization", 0) \
M(Bool, allow_experimental_inverted_index, false, "If it is set to true, allow to use experimental inverted index.", 0) \
\
M(UInt64, mysql_max_rows_to_insert, 65536, "The maximum number of rows in MySQL batch insertion of the MySQL storage engine", 0) \
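Note on the pattern: Settings.h declares every setting exactly once through an X-macro list, and different expansions of M(...) generate different artifacts from the same list. A minimal, hypothetical sketch of the technique (simplified: the real M also takes a flags argument, and the entries below are only illustrative):

#include <cstdint>
#include <iostream>

// Hypothetical two-entry list in the same style; the names are illustrative only.
#define APPLY_FOR_SETTINGS(M) \
    M(bool, do_not_merge_across_partitions_select_final, false, "Merge parts only in one partition in select final") \
    M(uint64_t, merge_tree_max_rows_to_use_cache, (128 * 8192), "Max rows per request that may still use the uncompressed cache")

// Expansion 1: one struct member per setting, initialized to its default.
struct Settings
{
#define DECLARE(TYPE, NAME, DEFAULT, DESCRIPTION) TYPE NAME = DEFAULT;
    APPLY_FOR_SETTINGS(DECLARE)
#undef DECLARE
};

// Expansion 2: the same list emits name/description metadata.
int main()
{
#define PRINT(TYPE, NAME, DEFAULT, DESCRIPTION) std::cout << #NAME << ": " << DESCRIPTION << '\n';
    APPLY_FOR_SETTINGS(PRINT)
#undef PRINT
    return Settings{}.do_not_merge_across_partitions_select_final ? 1 : 0;
}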
4 changes: 1 addition & 3 deletions src/Core/SettingsChangesHistory.h
@@ -108,9 +108,7 @@ static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> sett
{"23.12", {{"allow_suspicious_ttl_expressions", true, false, "It is a new setting, and in previous versions the behavior was equivalent to allowing."},
{"input_format_parquet_allow_missing_columns", false, true, "Allow missing columns in Parquet files by default"},
{"input_format_orc_allow_missing_columns", false, true, "Allow missing columns in ORC files by default"},
{"input_format_arrow_allow_missing_columns", false, true, "Allow missing columns in Arrow files by default"},
{"split_parts_ranges_into_intersecting_and_non_intersecting_final", false, true, "Allow to split parts ranges into intersecting and non intersecting during FINAL optimization"},
{"split_intersecting_parts_ranges_into_layers_final", true, true, "Allow to split intersecting parts ranges into layers during FINAL optimization"}}},
{"input_format_arrow_allow_missing_columns", false, true, "Allow missing columns in Arrow files by default"}}},
{"23.9", {{"optimize_group_by_constant_keys", false, true, "Optimize group by constant keys by default"},
{"input_format_json_try_infer_named_tuples_from_objects", false, true, "Try to infer named Tuples from JSON objects by default"},
{"input_format_json_read_numbers_as_strings", false, true, "Allow to read numbers as strings in JSON formats by default"},
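SettingsChangesHistory.h maps each release to the settings whose defaults changed in it (name, old default, new default, reason), which is what powers the compatibility setting. A hedged sketch with stand-in types (not the real ClickHouse API) of how such a history can be replayed for a client pinned to an older version:

#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-ins for the real ClickHouseVersion / SettingsChangesHistory types.
using Version = std::pair<int, int>;  // {major, minor}; compares correctly, unlike "23.12" vs "23.9" as strings
struct SettingChange { std::string name, previous_value, new_value, reason; };
using ChangesHistory = std::map<Version, std::vector<SettingChange>>;

// To honor a compatibility setting pinned at an older version, walk every release
// newer than that version and roll each changed setting back to its previous default.
void applyCompatibility(const ChangesHistory & history, Version compatibility)
{
    for (auto it = history.upper_bound(compatibility); it != history.end(); ++it)
        for (const auto & change : it->second)
            std::cout << "restore " << change.name << " = " << change.previous_value
                      << " (changed in " << it->first.first << "." << it->first.second
                      << ": " << change.reason << ")\n";
}

int main()
{
    ChangesHistory history;
    history[{23, 12}] = {{"input_format_parquet_allow_missing_columns", "false", "true",
                          "Allow missing columns in Parquet files by default"}};
    applyCompatibility(history, {23, 9});  // a 23.9 client gets the pre-23.12 default back
}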
38 changes: 7 additions & 31 deletions src/Processors/QueryPlan/PartsSplitter.cpp
@@ -228,7 +228,7 @@ struct SplitPartsRangesResult
RangesInDataParts intersecting_parts_ranges;
};

- SplitPartsRangesResult splitPartsRanges(RangesInDataParts ranges_in_data_parts, const LoggerPtr & logger)
+ SplitPartsRangesResult splitPartsRanges(RangesInDataParts ranges_in_data_parts)
{
/** Split ranges in data parts into intersecting ranges in data parts and non intersecting ranges in data parts.
*
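For context, the (truncated) comment above describes the core of the optimization whose toggle this revert removes: ranges that intersect no other range need no merging on FINAL and can be read directly. A toy sketch of that partitioning over integer [begin, end) ranges — the real code works on primary-key tuples and mark ranges, so this is only illustrative:

#include <algorithm>
#include <iostream>
#include <limits>
#include <utility>
#include <vector>

using Range = std::pair<int, int>;  // [begin, end) in primary-key order; a stand-in for a part's mark range

// Partition ranges into those overlapping no other range (they can skip the
// merging step of FINAL) and those overlapping at least one other range.
void splitRanges(std::vector<Range> ranges, std::vector<Range> & non_intersecting, std::vector<Range> & intersecting)
{
    std::sort(ranges.begin(), ranges.end());
    int max_end_so_far = std::numeric_limits<int>::min();
    for (size_t i = 0; i < ranges.size(); ++i)
    {
        // With begins sorted, the only overlap candidates are an earlier range
        // reaching past our begin, or the next range starting before our end.
        bool hits_earlier = max_end_so_far > ranges[i].first;
        bool hits_later = i + 1 < ranges.size() && ranges[i + 1].first < ranges[i].second;
        (hits_earlier || hits_later ? intersecting : non_intersecting).push_back(ranges[i]);
        max_end_so_far = std::max(max_end_so_far, ranges[i].second);
    }
}

int main()
{
    std::vector<Range> alone, overlapping;
    splitRanges({{0, 10}, {5, 15}, {20, 30}}, alone, overlapping);
    std::cout << alone.size() << " non-intersecting, " << overlapping.size() << " intersecting\n";  // 1, 2
}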
@@ -483,15 +483,10 @@ SplitPartsRangesResult splitPartsRanges(RangesInDataParts ranges_in_data_parts,
intersecting_ranges_in_data_parts.end(),
[](const auto & lhs, const auto & rhs) { return lhs.part_index_in_query < rhs.part_index_in_query; });

LOG_TEST(logger, "Non intersecting ranges in data parts {}", non_intersecting_ranges_in_data_parts.getDescriptions().describe());
LOG_TEST(logger, "Intersecting ranges in data parts {}", intersecting_ranges_in_data_parts.getDescriptions().describe());

return {std::move(non_intersecting_ranges_in_data_parts), std::move(intersecting_ranges_in_data_parts)};
}

- std::pair<std::vector<RangesInDataParts>, std::vector<Values>> splitIntersectingPartsRangesIntoLayers(RangesInDataParts intersecting_ranges_in_data_parts,
-     size_t max_layers,
-     const LoggerPtr & logger)
+ std::pair<std::vector<RangesInDataParts>, std::vector<Values>> splitIntersectingPartsRangesIntoLayers(RangesInDataParts intersecting_ranges_in_data_parts, size_t max_layers)
{
// We will advance the iterator pointing to the mark with the smallest PK value until
// there will be not less than rows_per_layer rows in the current layer (roughly speaking).
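A toy sketch of the layering idea described in the comment above: walk marks in primary-key order, cut a new layer whenever roughly total_rows / max_layers rows have accumulated, and record the primary-key border at each cut (stand-in types; the real code operates on PK tuples and mark ranges):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// A hypothetical, simplified mark: rows between two primary-key values.
struct Mark { int pk_begin; size_t rows; };

// Greedily cut a PK-sorted stream of marks into at most max_layers layers of
// roughly equal row counts, remembering the PK border where each cut happened.
// Layers can then be merged independently and concatenated in border order.
std::vector<int> chooseBorders(const std::vector<Mark> & marks, size_t max_layers)
{
    size_t total_rows = 0;
    for (const auto & m : marks)
        total_rows += m.rows;
    size_t rows_per_layer = std::max<size_t>(1, total_rows / max_layers);

    std::vector<int> borders;
    size_t rows_in_layer = 0;
    for (const auto & m : marks)
    {
        rows_in_layer += m.rows;
        if (rows_in_layer >= rows_per_layer && borders.size() + 1 < max_layers)
        {
            borders.push_back(m.pk_begin);  // cut here: this layer takes values up to the border
            rows_in_layer = 0;
        }
    }
    return borders;  // N borders define N+1 layers: (-inf, b0], (b0, b1], ..., (b_last, +inf)
}

int main()
{
    std::vector<int> borders = chooseBorders({{0, 100}, {10, 100}, {20, 100}, {30, 100}}, 2);
    for (int b : borders)
        std::cout << "border at pk <= " << b << '\n';  // one cut, two layers
}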
@@ -596,18 +591,8 @@ std::pair<std::vector<RangesInDataParts>, std::vector<Values>> splitIntersecting
result_layers.back() = std::move(current_layer_builder.getCurrentRangesInDataParts());
}

- size_t result_layers_size = result_layers.size();
- LOG_TEST(logger, "Split intersecting ranges into {} layers", result_layers_size);
-
- for (size_t i = 0; i < result_layers_size; ++i)
+ for (auto & layer : result_layers)
{
- auto & layer = result_layers[i];
-
- LOG_TEST(logger, "Layer {} {} filter values in ({}, {}])",
-     i,
-     layer.getDescriptions().describe(),
-     i ? ::toString(borders[i - 1]) : "-inf", i < borders.size() ? ::toString(borders[i]) : "+inf");
-
std::stable_sort(
layer.begin(),
layer.end(),
@@ -727,32 +712,23 @@ SplitPartsWithRangesByPrimaryKeyResult splitPartsWithRangesByPrimaryKey(
size_t max_layers,
ContextPtr context,
ReadingInOrderStepGetter && in_order_reading_step_getter,
- bool split_parts_ranges_into_intersecting_and_non_intersecting_final,
- bool split_intersecting_parts_ranges_into_layers)
+ bool force_process_all_ranges)
{
if (max_layers <= 1)
throw Exception(ErrorCodes::LOGICAL_ERROR, "max_layer should be greater than 1");

- auto logger = getLogger("PartsSplitter");
-
SplitPartsWithRangesByPrimaryKeyResult result;

RangesInDataParts intersecting_parts_ranges = std::move(parts);

- if (split_parts_ranges_into_intersecting_and_non_intersecting_final)
+ if (!force_process_all_ranges)
{
- SplitPartsRangesResult split_result = splitPartsRanges(intersecting_parts_ranges, logger);
+ SplitPartsRangesResult split_result = splitPartsRanges(intersecting_parts_ranges);
result.non_intersecting_parts_ranges = std::move(split_result.non_intersecting_parts_ranges);
intersecting_parts_ranges = std::move(split_result.intersecting_parts_ranges);
}

- if (!split_intersecting_parts_ranges_into_layers)
- {
-     result.merging_pipes.emplace_back(in_order_reading_step_getter(intersecting_parts_ranges));
-     return result;
- }
-
- auto && [layers, borders] = splitIntersectingPartsRangesIntoLayers(intersecting_parts_ranges, max_layers, logger);
+ auto && [layers, borders] = splitIntersectingPartsRangesIntoLayers(intersecting_parts_ranges, max_layers);
auto filters = buildFilters(primary_key, borders);
result.merging_pipes.resize(layers.size());

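The borders chosen by the layering step feed buildFilters, which (per the removed LOG_TEST line above) assigns layer i the half-open interval (borders[i - 1], borders[i]]. A toy sketch of that interval semantics over int keys — the real buildFilters constructs expressions over the primary-key columns:

#include <functional>
#include <iostream>
#include <vector>

// One predicate per layer so that every primary-key value lands in exactly one
// layer: layer i keeps values in (borders[i - 1], borders[i]], with the first
// layer open below and the last open above.
std::vector<std::function<bool(int)>> buildLayerFilters(const std::vector<int> & borders)
{
    std::vector<std::function<bool(int)>> filters;
    for (size_t i = 0; i <= borders.size(); ++i)
    {
        bool has_lower = i > 0;
        bool has_upper = i < borders.size();
        int lower = has_lower ? borders[i - 1] : 0;
        int upper = has_upper ? borders[i] : 0;
        filters.push_back([=](int pk)
        {
            return (!has_lower || pk > lower) && (!has_upper || pk <= upper);
        });
    }
    return filters;
}

int main()
{
    auto filters = buildLayerFilters({10, 20});
    std::cout << filters[0](5) << filters[1](15) << filters[2](25) << '\n';  // 111: each value in its own layer
    std::cout << filters[1](5) << '\n';                                      // 0: 5 is below layer 1's range
}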
3 changes: 1 addition & 2 deletions src/Processors/QueryPlan/PartsSplitter.h
@@ -34,6 +34,5 @@ SplitPartsWithRangesByPrimaryKeyResult splitPartsWithRangesByPrimaryKey(
size_t max_layers,
ContextPtr context,
ReadingInOrderStepGetter && in_order_reading_step_getter,
- bool split_parts_ranges_into_intersecting_and_non_intersecting,
- bool split_intersecting_parts_ranges_into_layers);
+ bool force_process_all_ranges);
}
6 changes: 2 additions & 4 deletions src/Processors/QueryPlan/ReadFromMergeTree.cpp
@@ -1175,8 +1175,7 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal(

/// Parts of non-zero level may still contain duplicate PK values to merge on FINAL if there's an is_deleted column,
/// so we have to process all ranges. It would be more optimal to remove this flag and add an extra filtering step.
- bool split_parts_ranges_into_intersecting_and_non_intersecting_final = settings.split_parts_ranges_into_intersecting_and_non_intersecting_final &&
-     data.merging_params.is_deleted_column.empty();
+ bool force_process_all_ranges = !data.merging_params.is_deleted_column.empty();

SplitPartsWithRangesByPrimaryKeyResult split_ranges_result = splitPartsWithRangesByPrimaryKey(
metadata_for_reading->getPrimaryKey(),
@@ -1185,8 +1184,7 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal(
num_streams,
context,
std::move(in_order_reading_step_getter),
- split_parts_ranges_into_intersecting_and_non_intersecting_final,
- settings.split_intersecting_parts_ranges_into_layers_final);
+ force_process_all_ranges);

for (auto && non_intersecting_parts_range : split_ranges_result.non_intersecting_parts_ranges)
non_intersecting_parts_by_primary_key.push_back(std::move(non_intersecting_parts_range));
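A toy model of why an is_deleted column forces processing all ranges (hypothetical code, only to illustrate the comment in this hunk): even a part range that intersects nothing else can still hold rows whose latest version is marked deleted, so returning it verbatim would leak them — FINAL must still run its filtering step:

#include <iostream>
#include <map>
#include <vector>

// Simplified stand-in for ReplacingMergeTree rows with an is_deleted column.
struct Row { int key; int version; bool is_deleted; };

std::vector<Row> selectFinal(const std::vector<Row> & part)
{
    // Keep only the latest version of each key...
    std::map<int, Row> latest;
    for (const auto & row : part)
        if (auto [it, inserted] = latest.try_emplace(row.key, row); !inserted && row.version > it->second.version)
            it->second = row;

    // ...and drop keys whose latest version is a deletion. This is the step a
    // range would skip if it bypassed FINAL processing entirely.
    std::vector<Row> result;
    for (const auto & [key, row] : latest)
        if (!row.is_deleted)
            result.push_back(row);
    return result;
}

int main()
{
    // One already-merged part, overlapping nothing else; key 2 was deleted.
    std::vector<Row> rows = selectFinal({{1, 1, false}, {2, 1, false}, {2, 2, true}});
    for (const auto & row : rows)
        std::cout << row.key << '\n';  // prints only 1
}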
