From 59c6219f2dffd12d2a464b170a9b87766eb29005 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Mon, 24 Mar 2025 20:39:44 +0100 Subject: [PATCH 1/8] autogenerate MergeTree settings --- scripts/settings/autogenerate-settings.sh | 40 +++++++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/scripts/settings/autogenerate-settings.sh b/scripts/settings/autogenerate-settings.sh index bc4e97a9c17..673b258344e 100755 --- a/scripts/settings/autogenerate-settings.sh +++ b/scripts/settings/autogenerate-settings.sh @@ -94,8 +94,19 @@ settings_from_cpp AS ), main_content AS ( - SELECT format('## {} {}\\n{}\\n{}\\n\\nType: {}\\n\\nDefault value: {}\\n\\n{}\\n\\n', - name, '{#'||name||'}', multiIf(tier == 'Experimental', '', tier == 'Beta', '', ''), if(description LIKE '%Only has an effect in ClickHouse Cloud%', '\\n', ''), type, default, replaceOne(trim(BOTH '\\n' FROM description), ' and [MaterializedMySQL](../../engines/database-engines/materialized-mysql.md)','')) + SELECT format( + '## {} {}\\n{}\\n{}\\n\\nType: {}\\n\\nDefault value: {}\\n\\n{}\\n\\n', + name, + '{#'||name||'}', + multiIf(tier == 'Experimental', '', tier == 'Beta', '', ''), + if(description LIKE '%Only has an effect in ClickHouse Cloud%', '\\n', ''), + type, + default, + replaceOne( + trim(BOTH '\\n' FROM description), + ' and [MaterializedMySQL](../../engines/database-engines/materialized-mysql.md)','' + ) + ) FROM system.settings WHERE name IN settings_from_cpp ORDER BY name ), @@ -119,12 +130,35 @@ SELECT prefix || (SELECT groupConcat(*) FROM main_content) INTO OUTFILE 'settings.md' TRUNCATE FORMAT LineAsString " > /dev/null || { echo "Failed to Autogenerate Core settings"; exit 1; } +# Autogenerate MergeTree settings +./clickhouse -q " + WITH + merge_tree_settings AS + ( + SELECT format( + '## {} {} {} \n{} \nType: {} \nDefault value: {} \n\n{} \n', + name, + '{#'||name||'}', + multiIf(tier == 'Experimental', '\n', tier == 'Beta', '\n', 
''), + if(description LIKE '%Only has an effect in ClickHouse Cloud%', '\\n', ''), + type, + default, + description + ) + FROM system.merge_tree_settings ORDER BY name + ) + SELECT * FROM merge_tree_settings + INTO OUTFILE 'generated_merge_tree_settings.md' TRUNCATE FORMAT LineAsString +" > /dev/null || { echo "Failed to Autogenerate MergeTree settings"; exit 1; } + mv settings-formats.md "$root/docs/operations/settings" || { echo "Failed to move generated settings-format.md"; exit 1; } mv settings.md "$root/docs/operations/settings" || { echo "Failed to move generated settings.md"; exit 1; } +cat generated_merge_tree_settings.md +cat generated_merge_tree_settings.md >> "$root/docs/operations/settings/merge-tree-settings.md" || { echo "Failed to append MergeTree settings.md"; exit 1; } echo "[$SCRIPT_NAME] Auto-generation of settings markdown pages completed successfully" # perform cleanup -rm -rf "$tmp_dir"/{settings-formats.md,settings.md,FormatFactorySettings.h,Settings.cpp,clickhouse} +rm -rf "$tmp_dir"/{settings-formats.md,settings.md,FormatFactorySettings.h,Settings.cpp,generated_merge_tree_settings.md,clickhouse} echo "[$SCRIPT_NAME] Autogenerating settings completed" From 9308658554270d3315721b17c36b43f038415d27 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Sat, 29 Mar 2025 18:34:23 +0100 Subject: [PATCH 2/8] add mergetree-settings --- scripts/settings/mergetree-settings.sql | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/scripts/settings/mergetree-settings.sql b/scripts/settings/mergetree-settings.sql index e69de29bb2d..ab092a1efaf 100644 --- a/scripts/settings/mergetree-settings.sql +++ b/scripts/settings/mergetree-settings.sql @@ -0,0 +1,16 @@ +WITH + merge_tree_settings AS + ( + SELECT format( + '## {} {} \n{}\nType: \`{}\`\n\nDefault: \`{}\`\n{}', + name, + '{#'||name||'}', + multiIf(tier == 'Experimental', '\n\n', tier == 'Beta', '\n\n', ''), + type, + default, 
replaceRegexpAll(description, '(?m)(^[ \t]+|[ \t]+$)', '') + ) + FROM system.merge_tree_settings ORDER BY name + ) +SELECT * FROM merge_tree_settings +INTO OUTFILE 'generated_merge_tree_settings.md' TRUNCATE FORMAT LineAsString \ No newline at end of file From a4a7e8e761d377e7bc957691117b6f4bcf6b1380 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Sat, 29 Mar 2025 18:50:12 +0100 Subject: [PATCH 3/8] fix links --- docs/managing-data/core-concepts/parts.md | 2 +- docs/migrations/bigquery/migrating-to-clickhouse-cloud.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/managing-data/core-concepts/parts.md b/docs/managing-data/core-concepts/parts.md index 9cd9e172e6d..dc243a29654 100644 --- a/docs/managing-data/core-concepts/parts.md +++ b/docs/managing-data/core-concepts/parts.md @@ -55,7 +55,7 @@ Data parts are self-contained, including all metadata needed to interpret their ## Part merges {#part-merges} -To manage the number of parts per table, a [background merge](/merges) job periodically combines smaller parts into larger ones until they reach a [configurable](/operations/settings/merge-tree-settings#max-bytes-to-merge-at-max-space-in-pool) compressed size (typically ~150 GB). Merged parts are marked as inactive and deleted after a [configurable](/operations/settings/merge-tree-settings#old-parts-lifetime) time interval. Over time, this process creates a hierarchical structure of merged parts, which is why it’s called a MergeTree table: +To manage the number of parts per table, a [background merge](/merges) job periodically combines smaller parts into larger ones until they reach a [configurable](/operations/settings/merge-tree-settings#max_bytes_to_merge_at_max_space_in_pool) compressed size (typically ~150 GB). Merged parts are marked as inactive and deleted after a [configurable](/operations/settings/merge-tree-settings#old_parts_lifetime) time interval. 
Over time, this process creates a hierarchical structure of merged parts, which is why it’s called a MergeTree table: diff --git a/docs/migrations/bigquery/migrating-to-clickhouse-cloud.md b/docs/migrations/bigquery/migrating-to-clickhouse-cloud.md index 1db6f5d9d7f..94a95068bce 100644 --- a/docs/migrations/bigquery/migrating-to-clickhouse-cloud.md +++ b/docs/migrations/bigquery/migrating-to-clickhouse-cloud.md @@ -242,7 +242,7 @@ Users should consider partitioning a data management technique. It is ideal when Important: Ensure your partitioning key expression does not result in a high cardinality set i.e. creating more than 100 partitions should be avoided. For example, do not partition your data by high cardinality columns such as client identifiers or names. Instead, make a client identifier or name the first column in the `ORDER BY` expression. -> Internally, ClickHouse [creates parts](/guides/best-practices/sparse-primary-indexes#clickhouse-index-design) for inserted data. As more data is inserted, the number of parts increases. In order to prevent an excessively high number of parts, which will degrade query performance (because there are more files to read), parts are merged together in a background asynchronous process. If the number of parts exceeds a [pre-configured limit](/operations/settings/merge-tree-settings#parts-to-throw-insert), then ClickHouse will throw an exception on insert as a ["too many parts" error](/knowledgebase/exception-too-many-parts). This should not happen under normal operation and only occurs if ClickHouse is misconfigured or used incorrectly e.g. many small inserts. Since parts are created per partition in isolation, increasing the number of partitions causes the number of parts to increase i.e. it is a multiple of the number of partitions. High cardinality partitioning keys can, therefore, cause this error and should be avoided. 
+> Internally, ClickHouse [creates parts](/guides/best-practices/sparse-primary-indexes#clickhouse-index-design) for inserted data. As more data is inserted, the number of parts increases. In order to prevent an excessively high number of parts, which will degrade query performance (because there are more files to read), parts are merged together in a background asynchronous process. If the number of parts exceeds a [pre-configured limit](/operations/settings/merge-tree-settings#parts_to_throw_insert), then ClickHouse will throw an exception on insert as a ["too many parts" error](/knowledgebase/exception-too-many-parts). This should not happen under normal operation and only occurs if ClickHouse is misconfigured or used incorrectly e.g. many small inserts. Since parts are created per partition in isolation, increasing the number of partitions causes the number of parts to increase i.e. it is a multiple of the number of partitions. High cardinality partitioning keys can, therefore, cause this error and should be avoided. 
## Materialized views vs projections {#materialized-views-vs-projections} From 8ac3093f4d19050987e6ebd001f9ec63adcc97ec Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Sat, 29 Mar 2025 19:05:01 +0100 Subject: [PATCH 4/8] tabulate type and default --- scripts/settings/mergetree-settings.sql | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/settings/mergetree-settings.sql b/scripts/settings/mergetree-settings.sql index ab092a1efaf..c50691679f3 100644 --- a/scripts/settings/mergetree-settings.sql +++ b/scripts/settings/mergetree-settings.sql @@ -2,12 +2,11 @@ WITH merge_tree_settings AS ( SELECT format( - '## {} {} \n{}\nType: \`{}\`\n\nDefault: \`{}\`\n{}', + '## {} {} \n{}\n{}{}', name, '{#'||name||'}', multiIf(tier == 'Experimental', '\n\n', tier == 'Beta', '\n\n', ''), - type, - default, + if(type != '' AND default != '', format('|Type|Default|\n|---|---|\n|`{}`|`{}`|\n\n',type, default), ''), replaceRegexpAll(description, '(?m)(^[ \t]+|[ \t]+$)', '') ) FROM system.merge_tree_settings ORDER BY name From 6bb29c23526f202f4d988317ca7f2b7ecfa04529 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Sat, 29 Mar 2025 19:22:06 +0100 Subject: [PATCH 5/8] fix links --- docs/integrations/data-ingestion/s3/performance.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/integrations/data-ingestion/s3/performance.md b/docs/integrations/data-ingestion/s3/performance.md index e64b0b03122..6199bbc002d 100644 --- a/docs/integrations/data-ingestion/s3/performance.md +++ b/docs/integrations/data-ingestion/s3/performance.md @@ -60,13 +60,13 @@ Note that the `min_insert_block_size_bytes` value denotes the uncompressed in-me #### Be aware of merges {#be-aware-of-merges} -The smaller the configured insert block size is, the more initial parts get created for a large data load, and the more background part merges are executed concurrently 
with the data ingestion. This can cause resource contention (CPU and memory) and require additional time (for reaching a [healthy](/operations/settings/merge-tree-settings#parts-to-throw-insert) (3000) number of parts) after the ingestion is finished. +The smaller the configured insert block size is, the more initial parts get created for a large data load, and the more background part merges are executed concurrently with the data ingestion. This can cause resource contention (CPU and memory) and require additional time (for reaching a [healthy](/operations/settings/merge-tree-settings#parts_to_throw_insert) (3000) number of parts) after the ingestion is finished. :::important ClickHouse query performance will be negatively impacted if the part count exceeds the [recommended limits](/operations/settings/merge-tree-settings#parts-to-throw-insert). ::: -ClickHouse will continuously [merge parts](https://clickhouse.com/blog/asynchronous-data-inserts-in-clickhouse#data-needs-to-be-batched-for-optimal-performance) into larger parts until they [reach](/operations/settings/merge-tree-settings#max-bytes-to-merge-at-max-space-in-pool) a compressed size of ~150 GiB. This diagram shows how a ClickHouse server merges parts: +ClickHouse will continuously [merge parts](https://clickhouse.com/blog/asynchronous-data-inserts-in-clickhouse#data-needs-to-be-batched-for-optimal-performance) into larger parts until they [reach](/operations/settings/merge-tree-settings#max_bytes_to_merge_at_max_space_in_pool) a compressed size of ~150 GiB. This diagram shows how a ClickHouse server merges parts: Background merges in ClickHouse @@ -84,7 +84,7 @@ Go to ① Note that [increasing](https://clickhouse.com/blog/supercharge-your-clickhouse-data-loads-part1#hardware-size) the number of CPU cores and the size of RAM increases the background merge throughput. 
-Parts that were merged into larger parts are marked as [inactive](/operations/system-tables/parts) and finally deleted after a [configurable](/operations/settings/merge-tree-settings#old-parts-lifetime) number of minutes. Over time, this creates a tree of merged parts (hence the name [`MergeTree`](/engines/table-engines/mergetree-family) table). +Parts that were merged into larger parts are marked as [inactive](/operations/system-tables/parts) and finally deleted after a [configurable](/operations/settings/merge-tree-settings#old_parts_lifetime) number of minutes. Over time, this creates a tree of merged parts (hence the name [`MergeTree`](/engines/table-engines/mergetree-family) table). ### Insert Parallelism {#insert-parallelism} From 73ce44b37ef22322d5344058a16599fcec38ac30 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Sun, 30 Mar 2025 23:25:04 +0200 Subject: [PATCH 6/8] fix link --- docs/managing-data/core-concepts/merges.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/managing-data/core-concepts/merges.md b/docs/managing-data/core-concepts/merges.md index b6ee079ede5..2b975ffd693 100644 --- a/docs/managing-data/core-concepts/merges.md +++ b/docs/managing-data/core-concepts/merges.md @@ -28,7 +28,7 @@ ClickHouse [is fast](/concepts/why-clickhouse-is-so-fast) not just for queries b This makes data writes lightweight and [highly efficient](/concepts/why-clickhouse-is-so-fast#storage-layer-concurrent-inserts-are-isolated-from-each-other). -To control the number of parts per table and implement ② above, ClickHouse continuously merges ([per partition](/partitions#per-partition-merges)) smaller parts into larger ones in the background until they reach a compressed size of approximately [~150 GB](/operations/settings/merge-tree-settings#max-bytes-to-merge-at-max-space-in-pool). 
+To control the number of parts per table and implement ② above, ClickHouse continuously merges ([per partition](/partitions#per-partition-merges)) smaller parts into larger ones in the background until they reach a compressed size of approximately [~150 GB](/operations/settings/merge-tree-settings#max_bytes_to_merge_at_max_space_in_pool). The following diagram sketches this background merge process: @@ -36,7 +36,7 @@ The following diagram sketches this background merge process:
-The `merge level` of a part is incremented by one with each additional merge. A level of `0` means the part is new and has not been merged yet. Parts that were merged into larger parts are marked as [inactive](/operations/system-tables/parts) and finally deleted after a [configurable](/operations/settings/merge-tree-settings#old-parts-lifetime) time (8 minutes by default). Over time, this creates a **tree** of merged parts. Hence the name [merge tree](/engines/table-engines/mergetree-family) table. +The `merge level` of a part is incremented by one with each additional merge. A level of `0` means the part is new and has not been merged yet. Parts that were merged into larger parts are marked as [inactive](/operations/system-tables/parts) and finally deleted after a [configurable](/operations/settings/merge-tree-settings#old_parts_lifetime) time (8 minutes by default). Over time, this creates a **tree** of merged parts. Hence the name [merge tree](/engines/table-engines/mergetree-family) table. ## Monitoring merges {#monitoring-merges} From 3f20bad5bd61c199be504fce15b1010425520eeb Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Sun, 30 Mar 2025 23:25:39 +0200 Subject: [PATCH 7/8] fix links --- docs/integrations/data-ingestion/s3/performance.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/integrations/data-ingestion/s3/performance.md b/docs/integrations/data-ingestion/s3/performance.md index 6199bbc002d..584bd9d35a8 100644 --- a/docs/integrations/data-ingestion/s3/performance.md +++ b/docs/integrations/data-ingestion/s3/performance.md @@ -63,7 +63,7 @@ Note that the `min_insert_block_size_bytes` value denotes the uncompressed in-me The smaller the configured insert block size is, the more initial parts get created for a large data load, and the more background part merges are executed concurrently with the data ingestion. 
This can cause resource contention (CPU and memory) and require additional time (for reaching a [healthy](/operations/settings/merge-tree-settings#parts_to_throw_insert) (3000) number of parts) after the ingestion is finished. :::important -ClickHouse query performance will be negatively impacted if the part count exceeds the [recommended limits](/operations/settings/merge-tree-settings#parts-to-throw-insert). +ClickHouse query performance will be negatively impacted if the part count exceeds the [recommended limits](/operations/settings/merge-tree-settings#parts_to_throw_insert). ::: ClickHouse will continuously [merge parts](https://clickhouse.com/blog/asynchronous-data-inserts-in-clickhouse#data-needs-to-be-batched-for-optimal-performance) into larger parts until they [reach](/operations/settings/merge-tree-settings#max_bytes_to_merge_at_max_space_in_pool) a compressed size of ~150 GiB. This diagram shows how a ClickHouse server merges parts: From 9e01403cdca17456252d0f3726aa4f2f88bfccb4 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Sun, 30 Mar 2025 23:38:26 +0200 Subject: [PATCH 8/8] Update docusaurus.config.en.js --- docusaurus.config.en.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docusaurus.config.en.js b/docusaurus.config.en.js index 02f742800bd..5a9a9d503c4 100644 --- a/docusaurus.config.en.js +++ b/docusaurus.config.en.js @@ -59,7 +59,7 @@ const config = { onBrokenLinks: "throw", onBrokenMarkdownLinks: "warn", onDuplicateRoutes: "throw", - onBrokenAnchors: "throw", + onBrokenAnchors: "warn", favicon: "img/docs_favicon.ico", organizationName: "ClickHouse", trailingSlash: false,