From 1d9509046336d58a53b603b62f7ee30c9a2cb2c8 Mon Sep 17 00:00:00 2001 From: Dale Mcdiarmid Date: Mon, 3 Mar 2025 18:32:48 +0000 Subject: [PATCH 1/4] move more images to static --- docs/data-modeling/backfilling.md | 35 +++++++++--------- docs/data-modeling/denormalization.md | 13 +++---- docs/data-modeling/schema-design.md | 33 +++++++---------- docs/guides/developer/deduplication.md | 5 ++- ...nding-query-execution-with-the-analyzer.md | 26 ++++++++----- .../postgres/data-modeling-techniques.md | 21 +++++------ docs/migrations/postgres/dataset.md | 7 ++-- docs/migrations/postgres/designing-schemas.md | 15 +++----- .../postgres/replacing-merge-tree.md | 10 ++--- docs/migrations/snowflake.md | 6 ++- .../denormalization-diagram.png | Bin .../data-modeling}/denormalization-schema.png | Bin .../images/data-modeling}/null_table_mv.png | Bin .../data-modeling}/schema-design-indices.png | Bin .../data-modeling}/schema-design-types.png | Bin .../data-modeling}/stackoverflow-schema.png | Bin .../guides/developer}/Deduplication.png | Bin .../images/guides/developer}/analyzer1.png | Bin .../images/guides/developer}/analyzer2.png | Bin .../images/guides/developer}/analyzer3.png | Bin .../images/guides/developer}/analyzer4.png | Bin .../images/guides/developer}/analyzer5.png | Bin .../migrate_snowflake_clickhouse.png | Bin .../images/migrations}/postgres-b-tree.png | Bin .../migrations}/postgres-partitions.png | Bin .../migrations}/postgres-projections.png | Bin .../postgres-replacingmergetree.png | Bin .../migrations}/postgres-sparse-index.png | Bin .../postgres-stackoverflow-schema.png | Bin .../images/quickstart/SQLConsole.png | Bin .../images/quickstart/ServiceDetails.png | Bin .../images/quickstart/Services.png | Bin .../images/quickstart/ShowDatabases.png | Bin 33 files changed, 81 insertions(+), 90 deletions(-) rename {docs/data-modeling/images => static/images/data-modeling}/denormalization-diagram.png (100%) rename {docs/data-modeling/images => static/images/data-modeling}/denormalization-schema.png (100%) rename {docs/data-modeling/images => static/images/data-modeling}/null_table_mv.png (100%) rename {docs/data-modeling/images => static/images/data-modeling}/schema-design-indices.png (100%) rename {docs/data-modeling/images => static/images/data-modeling}/schema-design-types.png (100%) rename {docs/data-modeling/images => static/images/data-modeling}/stackoverflow-schema.png (100%) rename {docs/guides/developer/images => static/images/guides/developer}/Deduplication.png (100%) rename {docs/guides/developer/images => static/images/guides/developer}/analyzer1.png (100%) rename {docs/guides/developer/images => static/images/guides/developer}/analyzer2.png (100%) rename {docs/guides/developer/images => static/images/guides/developer}/analyzer3.png (100%) rename {docs/guides/developer/images => static/images/guides/developer}/analyzer4.png (100%) rename {docs/guides/developer/images => static/images/guides/developer}/analyzer5.png (100%) rename {docs/migrations/images => static/images/migrations}/migrate_snowflake_clickhouse.png (100%) rename {docs/migrations/images => static/images/migrations}/postgres-b-tree.png (100%) rename {docs/migrations/images => static/images/migrations}/postgres-partitions.png (100%) rename {docs/migrations/images => static/images/migrations}/postgres-projections.png (100%) rename {docs/migrations/images => static/images/migrations}/postgres-replacingmergetree.png (100%) rename {docs/migrations/images => static/images/migrations}/postgres-sparse-index.png (100%) rename 
{docs/migrations/images => static/images/migrations}/postgres-stackoverflow-schema.png (100%) rename {docs => static}/images/quickstart/SQLConsole.png (100%) rename {docs => static}/images/quickstart/ServiceDetails.png (100%) rename {docs => static}/images/quickstart/Services.png (100%) rename {docs => static}/images/quickstart/ShowDatabases.png (100%) diff --git a/docs/data-modeling/backfilling.md b/docs/data-modeling/backfilling.md index c851e612c19..c86db96c7d1 100644 --- a/docs/data-modeling/backfilling.md +++ b/docs/data-modeling/backfilling.md @@ -5,6 +5,8 @@ description: How to use backfill large datasets in ClickHouse keywords: [materialized views, backfilling, inserting data, resilient data load] --- +import nullTableMV from '@site/static/images/data-modeling/null_table_mv.png'; + # Backfilling Data Whether new to ClickHouse or responsible for an existing deployment, users will invariably need to backfill tables with historical data. In some cases, this is relatively simple but can become more complex when materialized views need to be populated. This guide documents some processes for this task that users can apply to their use case. @@ -15,7 +17,7 @@ This guide assumes users are already familiar with the concept of [Incremental M ## Example dataset {#example-dataset} -Throughout this guide, we use a PyPI dataset. Each row in this dataset represents a Python package download using a tool such as `pip`. +Throughout this guide, we use a PyPI dataset. Each row in this dataset represents a Python package download using a tool such as `pip`. For example, the subset covers a single day - `2024-12-17` and is available publicly at `https://datasets-documentation.s3.eu-west-3.amazonaws.com/pypi/2024-12-17/`. Users can query with: @@ -66,12 +68,12 @@ The full PyPI dataset, consisting of over 1 trillion rows, is available in our p ## Backfilling scenarios {#backfilling-scenarios} -Backfilling is typically needed when a stream of data is being consumed from a point in time. This data is being inserted into ClickHouse tables with [incremental materialized views](/materialized-view/incremental-materialized-view), triggering on blocks as they are inserted. These views may be transforming the data prior to insert or computing aggregates and sending results to target tables for later use in downstream applications. +Backfilling is typically needed when a stream of data is being consumed from a point in time. This data is being inserted into ClickHouse tables with [incremental materialized views](/materialized-view/incremental-materialized-view), triggering on blocks as they are inserted. These views may be transforming the data prior to insert or computing aggregates and sending results to target tables for later use in downstream applications. We will attempt to cover the following scenarios: 1. **Backfilling data with existing data ingestion** - New data is being loaded, and historical data needs to be backfilled. This historical data has been identified. -2. **Adding materialized views to existing tables** - New materialized views need to be added to a setup for which historical data has been populated and data is already streaming. +2. **Adding materialized views to existing tables** - New materialized views need to be added to a setup for which historical data has been populated and data is already streaming. We assume data will be backfilled from object storage. In all cases, we aim to avoid pauses in data insertion. @@ -141,7 +143,7 @@ FROM pypi_downloads Peak memory usage: 682.38 KiB. 
``` -Suppose we wish to load another subset `{101..200}`. While we could insert directly into `pypi`, we can do this backfill in isolation by creating duplicate tables. +Suppose we wish to load another subset `{101..200}`. While we could insert directly into `pypi`, we can do this backfill in isolation by creating duplicate tables. Should the backfill fail, we have not impacted our main tables and can simply [truncate](/managing-data/truncate) our duplicate tables and repeat. @@ -236,9 +238,9 @@ FROM pypi_v2 Importantly, the `MOVE PARTITION` operation is both lightweight (exploiting hard links) and atomic, i.e. it either fails or succeeds with no intermediate state. -We exploit this process heavily in our backfilling scenarios below. +We exploit this process heavily in our backfilling scenarios below. -Notice how this process requires users to choose the size of each insert operation. +Notice how this process requires users to choose the size of each insert operation. Larger inserts i.e. more rows, will mean fewer `MOVE PARTITION` operations are required. However, this must be balanced against the cost in the event of an insert failure e.g. due to network interruption, to recover. Users can complement this process with batching files to reduce the risk. This can be performed with either range queries e.g. `WHERE timestamp BETWEEN 2024-12-17 09:00:00 AND 2024-12-17 10:00:00` or glob patterns. For example, @@ -258,7 +260,7 @@ ClickPipes uses this approach when loading data from object storage, automatical ## Scenario 1: Backfilling data with existing data ingestion {#scenario-1-backfilling-data-with-existing-data-ingestion} -In this scenario, we assume that the data to backfill is not in an isolated bucket and thus filtering is required. Data is already inserting and a timestamp or monotonically increasing column can be identified from which historical data needs to be backfilled. +In this scenario, we assume that the data to backfill is not in an isolated bucket and thus filtering is required. Data is already inserting and a timestamp or monotonically increasing column can be identified from which historical data needs to be backfilled. This process follows the following steps: @@ -317,7 +319,7 @@ ALTER TABLE pypi_downloads If the historical data is an isolated bucket, the above time filter is not required. If a time or monotonic column is unavailable, isolate your historical data. :::note Just use ClickPipes in ClickHouse Cloud -ClickHouse Cloud users should use ClickPipes for restoring historical backups if the data can be isolated in its own bucket (and a filter is not required). As well as parallelizing the load with multiple workers, thus reducing the load time, ClickPipes automates the above process - creating duplicate tables for both the main table and materialized views. +ClickHouse Cloud users should use ClickPipes for restoring historical backups if the data can be isolated in its own bucket (and a filter is not required). As well as parallelizing the load with multiple workers, thus reducing the load time, ClickPipes automates the above process - creating duplicate tables for both the main table and materialized views. 
::: ## Scenario 2: Adding materialized views to existing tables {#scenario-2-adding-materialized-views-to-existing-tables} @@ -339,7 +341,7 @@ Our simplest approach involves the following steps: This can be further enhanced to target subsets of data in step (2) and/or use a duplicate target table for the materialized view (attach partitions to the original once the insert is complete) for easier recovery after failure. -Consider the following materialized view, which computes the most popular projects per hour. +Consider the following materialized view, which computes the most popular projects per hour. ```sql CREATE TABLE pypi_downloads_per_day @@ -372,7 +374,7 @@ AS SELECT project, count() AS count FROM pypi WHERE timestamp >= '2024-12-17 09:00:00' GROUP BY hour, project -``` +``` Once this view is added, we can backfill all data for the materialized view prior to this data. @@ -403,7 +405,7 @@ In our case, this is a relatively lightweight aggregation that completes in unde Often materialized view's query can be more complex (not uncommon as otherwise users wouldn't use a view!) and consume resources. In rarer cases, the resources for the query are beyond that of the server. This highlights one of the advantages of ClickHouse materialized views - they are incremental and don't process the entire dataset in one go! -In this case, users have several options: +In this case, users have several options: 1. Modify your query to backfill ranges e.g. `WHERE timestamp BETWEEN 2024-12-17 08:00:00 AND 2024-12-17 09:00:00`, `WHERE timestamp BETWEEN 2024-12-17 07:00:00 AND 2024-12-17 08:00:00` etc. 2. Use a [Null table engine](/engines/table-engines/special/null) to fill the materialized view. This replicates the typical incremental population of a materialized view, executing it's query over blocks of data (of configurable size). @@ -418,10 +420,7 @@ The [Null table engine](/engines/table-engines/special/null) provides a storage Importantly, any materialized views attached to the table engine still execute over blocks of data as its inserted - sending their results to a target table. These blocks are of a configurable size. While larger blocks can potentially be more efficient (and faster to process), they consume more resources (principally memory). Use of this table engine means we can build our materialized view incrementally i.e. a block at a time, avoiding the need to hold the entire aggregation in memory. -Denormalization in ClickHouse +Denormalization in ClickHouse
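Before the full walkthrough below, here is a minimal, hypothetical sketch of the Null-table pattern. It assumes the simplified `pypi` columns (`timestamp`, `project`) and the `pypi_downloads_per_day` target table used earlier in this guide; the names and exact definitions in the complete example may differ.

```sql
-- Hypothetical sketch: the Null table stores nothing, it only triggers the view
CREATE TABLE pypi_v2
(
    `timestamp` DateTime,
    `project` String
)
ENGINE = Null;

-- The materialized view aggregates each inserted block and writes the result
-- to the real target table
CREATE MATERIALIZED VIEW pypi_downloads_per_day_mv TO pypi_downloads_per_day
AS SELECT
    toStartOfHour(timestamp) AS hour,
    project,
    count() AS count
FROM pypi_v2
GROUP BY hour, project;

-- Backfill by inserting into the Null table; each block is aggregated as it
-- arrives and then discarded, so only one block is held in memory at a time
INSERT INTO pypi_v2 SELECT timestamp, project FROM pypi;
```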
@@ -449,7 +448,7 @@ GROUP BY Here, we create a Null table, `pypi_v2,` to receive the rows that will be used to build our materialized view. Note how we limit the schema to only the columns we need. Our materialized view performs an aggregation over rows inserted into this table (one block at a time), sending the results to our target table, `pypi_downloads_per_day`. :::note -We have used `pypi_downloads_per_day` as our target table here. For additional resiliency, users could create a duplicate table, `pypi_downloads_per_day_v2`, and use this as the target table of the view, as shown in previous examples. On completion of the insert, partitions in `pypi_downloads_per_day_v2` could, in turn, be moved to `pypi_downloads_per_day.` This would allow recovery in the case our insert fails due to memory issues or server interruptions i.e. we just truncate `pypi_downloads_per_day_v2`, tune settings, and retry. +We have used `pypi_downloads_per_day` as our target table here. For additional resiliency, users could create a duplicate table, `pypi_downloads_per_day_v2`, and use this as the target table of the view, as shown in previous examples. On completion of the insert, partitions in `pypi_downloads_per_day_v2` could, in turn, be moved to `pypi_downloads_per_day.` This would allow recovery in the case our insert fails due to memory issues or server interruptions i.e. we just truncate `pypi_downloads_per_day_v2`, tune settings, and retry. ::: To populate this materialized view, we simply insert the relevant data to backfill into `pypi_v2` from `pypi.` @@ -467,8 +466,8 @@ Notice our memory usage here is `639.47 MiB`. Several factors will determine the performance and resources used in the above scenario. We recommend readers understand insert mechanics documented in detail [here](/integrations/s3/performance#using-threads-for-reads) prior to attempting to tune. In summary: -- **Read Parallelism** - The number of threads used to read. Controlled through [`max_threads`](/operations/settings/settings#max_threads). In ClickHouse Cloud this is determined by the instance size with it defaulting to the number of vCPUs. Increasing this value may improve read performance at the expense of greater memory usage. -- **Insert Parallelism** - The number of insert threads used to insert. Controlled through [`max_insert_threads`](/operations/settings/settings#max_insert_threads). In ClickHouse Cloud this is determined by the instance size (between 2 and 4) and is set to 1 in OSS. Increasing this value may improve performance at the expense of greater memory usage. +- **Read Parallelism** - The number of threads used to read. Controlled through [`max_threads`](/operations/settings/settings#max_threads). In ClickHouse Cloud this is determined by the instance size with it defaulting to the number of vCPUs. Increasing this value may improve read performance at the expense of greater memory usage. +- **Insert Parallelism** - The number of insert threads used to insert. Controlled through [`max_insert_threads`](/operations/settings/settings#max_insert_threads). In ClickHouse Cloud this is determined by the instance size (between 2 and 4) and is set to 1 in OSS. Increasing this value may improve performance at the expense of greater memory usage. - **Insert Block Size** - data is processed in a loop where it is pulled, parsed, and formed into in-memory insert blocks based on the [partitioning key](/engines/table-engines/mergetree-family/custom-partitioning-key). 
These blocks are sorted, optimized, compressed, and written to storage as new [data parts](/parts). The size of the insert block, controlled by settings [`min_insert_block_size_rows`](/operations/settings/settings#min_insert_block_size_rows) and [`min_insert_block_size_bytes`](/operations/settings/settings#min_insert_block_size_bytes) (uncompressed), impacts memory usage and disk I/O. Larger blocks use more memory but create fewer parts, reducing I/O and background merges. These settings represent minimum thresholds (whichever is reached first triggers a flush). - **Materialized view block size** - As well as the above mechanics for the main insert, prior to insertion into materialized views, blocks are also squashed for more efficient processing. The size of these blocks is determined by the settings [`min_insert_block_size_bytes_for_materialized_views`](/operations/settings/settings#min_insert_block_size_bytes_for_materialized_views) and [`min_insert_block_size_rows_for_materialized_views`](/operations/settings/settings#min_insert_block_size_rows_for_materialized_views). Larger blocks allow more efficient processing at the expense of greater memory usage. By default, these settings revert to the values of the source table settings [`min_insert_block_size_rows`](/operations/settings/settings#min_insert_block_size_rows) and [`min_insert_block_size_bytes`](/operations/settings/settings#min_insert_block_size_bytes), respectively. diff --git a/docs/data-modeling/denormalization.md b/docs/data-modeling/denormalization.md index 83443e7ff53..b725c8dadd7 100644 --- a/docs/data-modeling/denormalization.md +++ b/docs/data-modeling/denormalization.md @@ -5,6 +5,9 @@ description: How to use denormalization to improve query performance keywords: [data denormalization, denormalize, query optimization] --- +import denormalizationDiagram from '@site/static/images/data-modeling/denormalization-diagram.png'; +import denormalizationSchema from '@site/static/images/data-modeling/denormalization-schema.png'; + # Denormalizing Data Data denormalization is a technique in ClickHouse to use flattened tables to help minimize query latency by avoiding joins. @@ -15,10 +18,7 @@ Denormalizing data involves intentionally reversing the normalization process to This process reduces the need for complex joins at query time and can significantly speed up read operations, making it ideal for applications with heavy read requirements and complex queries. However, it can increase the complexity of write operations and maintenance, as any changes to the duplicated data must be propagated across all instances to maintain consistency. -Denormalization in ClickHouse +Denormalization in ClickHouse
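As a concrete, hypothetical illustration of this trade-off, the sketch below flattens a one-to-many relationship at insert time. The table and column names (`posts`, `users`, `OwnerUserId`, `DisplayName`) are assumed from the Stack Overflow dataset used in these guides; adapt them to your own schema.

```sql
-- Hypothetical sketch: a denormalized posts table carrying the owner's display
-- name, so queries no longer need to join to users
CREATE TABLE posts_denormalized
(
    `Id` Int32,
    `Title` String,
    `CreationDate` DateTime,
    `OwnerUserId` Int32,
    `OwnerDisplayName` String
)
ENGINE = MergeTree
ORDER BY (OwnerUserId, CreationDate);

INSERT INTO posts_denormalized
SELECT
    p.Id,
    p.Title,
    p.CreationDate,
    p.OwnerUserId,
    u.DisplayName
FROM posts AS p
LEFT JOIN users AS u ON p.OwnerUserId = u.Id;
```

Reads avoid the join, but a change to a user's display name must now be re-propagated to every matching post row, which is the write-side cost described above.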
@@ -131,10 +131,7 @@ The main observation here is that aggregated vote statistics for each post would Now let's consider our `Users` and `Badges`: -Users and Badges schema +Users and Badges schema

We first insert the data with the following command: diff --git a/docs/data-modeling/schema-design.md b/docs/data-modeling/schema-design.md index d1f9bcde038..4d3dd52c54a 100644 --- a/docs/data-modeling/schema-design.md +++ b/docs/data-modeling/schema-design.md @@ -5,6 +5,10 @@ description: Optimizing ClickHouse schema for query performance keywords: [schema, schema design, query optimization] --- +import stackOverflowSchema from '@site/static/images/data-modeling/stackoverflow-schema.png'; +import schemaDesignTypes from '@site/static/images/data-modeling/schema-design-types.png'; +import schemaDesignIndices from '@site/static/images/data-modeling/schema-design-indices.png'; + Understanding effective schema design is key to optimizing ClickHouse performance and includes choices that often involve trade-offs, with the optimal approach depending on the queries being served as well as factors such as data update frequency, latency requirements, and data volume. This guide provides an overview of schema design best practices and data modeling techniques for optimizing ClickHouse performance. ## Stack Overflow dataset {#stack-overflow-dataset} @@ -13,21 +17,18 @@ For the examples in this guide, we use a subset of the Stack Overflow dataset. T > The primary keys and relationships indicated are not enforced through constraints (Parquet is file not table format) and purely indicate how the data is related and the unique keys it possesses. -Stack Overflow Schema +Stack Overflow Schema
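Before committing to a schema, it can help to inspect what the Parquet files actually contain. As a hedged example, schema inference lets you describe the files directly; the URL below is the public bucket referenced in the next section.

```sql
-- Inspect the inferred schema of the public Stack Overflow posts files
DESCRIBE TABLE s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/posts/*.parquet');
```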
-The Stack Overflow dataset contains a number of related tables. In any data modeling task, we recommend users focus on loading their primary table first. This may not necessarily be the largest table but rather the one on which you expect to receive most analytical queries. This will allow you to familiarize yourself with the main ClickHouse concepts and types, especially important if coming from a predominantly OLTP background. This table may require remodeling as additional tables are added to fully exploit ClickHouse features and obtain optimal performance. +The Stack Overflow dataset contains a number of related tables. In any data modeling task, we recommend users focus on loading their primary table first. This may not necessarily be the largest table but rather the one on which you expect to receive most analytical queries. This will allow you to familiarize yourself with the main ClickHouse concepts and types, especially important if coming from a predominantly OLTP background. This table may require remodeling as additional tables are added to fully exploit ClickHouse features and obtain optimal performance. The above schema is intentionally not optimal for the purposes of this guide. ## Establish initial schema {#establish-initial-schema} -Since the `posts` table will be the target for most analytics queries, we focus on establishing a schema for this table. This data is available in the public S3 bucket `s3://datasets-documentation/stackoverflow/parquet/posts/*.parquet` with a file per year. +Since the `posts` table will be the target for most analytics queries, we focus on establishing a schema for this table. This data is available in the public S3 bucket `s3://datasets-documentation/stackoverflow/parquet/posts/*.parquet` with a file per year. > Loading data from S3 in Parquet format represents the most common and preferred way to load data into ClickHouse. ClickHouse is optimized for processing Parquet and can potentially read and insert 10s of millions of rows from S3 per second. @@ -127,7 +128,7 @@ INSERT INTO posts SELECT * FROM s3('https://datasets-documentation.s3.eu-west-3. ## Optimizing Types {#optimizing-types} -One of the secrets to ClickHouse query performance is compression. +One of the secrets to ClickHouse query performance is compression. Less data on disk means less I/O and thus faster queries and inserts. The overhead of any compression algorithm with respect to CPU will in most cases be out weighted by the reduction in IO. Improving the compression of the data should therefore be the first focus when working on ensuring ClickHouse queries are fast. @@ -141,18 +142,15 @@ The largest initial improvement in compression and query performance can be obta - **Avoid Nullable Columns** - By default the above columns have been assumed to be Null. The Nullable type allows queries to determine the difference between an empty and Null value. This creates a separate column of UInt8 type. This additional column has to be processed every time a user works with a nullable column. This leads to additional storage space used and almost always negatively affects query performance. Only use Nullable if there is a difference between the default empty value for a type and Null. For example, a value of 0 for empty values in the `ViewCount` column will likely be sufficient for most queries and not impact results. If empty values should be treated differently, they can often also be excluded from queries with a filter. 
Use the minimal precision for numeric types - ClickHouse has a number of numeric types designed for different numeric ranges and precision. Always aim to minimize the number of bits used to represent a column. As well as integers of different size e.g. Int16, ClickHouse offers unsigned variants whose minimum value is 0. These can allow fewer bits to be used for a column e.g. UInt16 has a maximum value of 65535, twice that of an Int16. Prefer these types over larger signed variants if possible. - **Minimal precision for date types** - ClickHouse supports a number of date and datetime types. Date and Date32 can be used for storing pure dates, with the latter supporting a larger date range at the expense of more bits. DateTime and DateTime64 provide support for date times. DateTime is limited to second granularity and uses 32 bits. DateTime64, as the name suggests, uses 64 bits but provides support up to nanosecond granularity. As ever, choose the more coarse version acceptable for queries, minimizing the number of bits needed. -- **Use LowCardinality** - Numbers, strings, Date or DateTime columns with a low number of unique values can potentially be encoded using the LowCardinality type. This dictionary encodes values, reducing the size on disk. Consider this for columns with less than 10k unique values. +- **Use LowCardinality** - Numbers, strings, Date or DateTime columns with a low number of unique values can potentially be encoded using the LowCardinality type. This dictionary encodes values, reducing the size on disk. Consider this for columns with less than 10k unique values. FixedString for special cases - Strings which have a fixed length can be encoded with the FixedString type e.g. language and currency codes. This is efficient when data has the length of precisely N bytes. In all other cases, it is likely to reduce efficiency and LowCardinality is preferred. -- **Enums for data validation** - The Enum type can be used to efficiently encode enumerated types. Enums can either be 8 or 16 bits, depending on the number of unique values they are required to store. Consider using this if you need either the associated validation at insert time (undeclared values will be rejected) or wish to perform queries which exploit a natural ordering in the Enum values e.g. imagine a feedback column containing user responses `Enum(':(' = 1, ':|' = 2, ':)' = 3)`. +- **Enums for data validation** - The Enum type can be used to efficiently encode enumerated types. Enums can either be 8 or 16 bits, depending on the number of unique values they are required to store. Consider using this if you need either the associated validation at insert time (undeclared values will be rejected) or wish to perform queries which exploit a natural ordering in the Enum values e.g. imagine a feedback column containing user responses `Enum(':(' = 1, ':|' = 2, ':)' = 3)`. > Tip: To find the range of all columns, and the number of distinct values, users can use the simple query `SELECT * APPLY min, * APPLY max, * APPLY uniq FROM table FORMAT Vertical`. We recommend performing this over a smaller subset of the data as this can be expensive. This query requires numerics to be at least defined as such for an accurate result i.e. not a String. By applying these simple rules to our posts table, we can identify an optimal type for each column: -Stack Overflow Schema +Schema Design - Optimized Types
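To make this concrete, the following hypothetical, heavily simplified `posts` variant shows the kind of substitutions involved (smaller and unsigned integers, an `Enum`, `LowCardinality`, no `Nullable`). The column list and the fully optimized schema used later in this guide differ.

```sql
-- Hypothetical sketch: tightened types for a cut-down posts table
CREATE TABLE posts_typed
(
    `Id` Int32,
    `PostTypeId` Enum8('Question' = 1, 'Answer' = 2), -- only two of the real values shown
    `CreationDate` DateTime,                           -- second precision is sufficient
    `Score` Int32,
    `ViewCount` UInt32,                                -- counts are never negative
    `Title` String,
    `ContentLicense` LowCardinality(String)            -- a handful of distinct values
)
ENGINE = MergeTree
ORDER BY (PostTypeId, CreationDate);
```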
@@ -205,14 +203,11 @@ Users coming from OLTP databases often look for the equivalent concept in ClickH At the scale at which ClickHouse is often used, memory and disk efficiency are paramount. Data is written to ClickHouse tables in chunks known as parts, with rules applied for merging the parts in the background. In ClickHouse, each part has its own primary index. When parts are merged, the merged part's primary indexes are also merged. The primary index for a part has one index entry per group of rows - this technique is called sparse indexing. -Sparse Indexing in ClickHouse +Sparse Indexing in ClickHouse
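Because the index is sparse, the per-part primary indexes stay small enough to keep in memory. As a hedged check (assuming the `posts` table from this guide; `system.parts` column availability may vary slightly by version), you can see this directly:

```sql
-- How large is the in-memory primary index for each active part of posts?
SELECT
    partition,
    rows,
    marks,
    formatReadableSize(primary_key_bytes_in_memory) AS pk_in_memory
FROM system.parts
WHERE table = 'posts' AND active;
```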
-The selected key in ClickHouse will determine not only the index, but also order in which data is written on disk. Because of this, it can dramatically impact compression levels which can in turn affect query performance. An ordering key which causes the values of most columns to be written in contiguous order will allow the selected compression algorithm (and codecs) to compress the data more effectively. +The selected key in ClickHouse will determine not only the index, but also order in which data is written on disk. Because of this, it can dramatically impact compression levels which can in turn affect query performance. An ordering key which causes the values of most columns to be written in contiguous order will allow the selected compression algorithm (and codecs) to compress the data more effectively. > All columns in a table will be sorted based on the value of the specified ordering key, regardless of whether they are included in the key itself. For instance, if `CreationDate` is used as the key, the order of values in all other columns will correspond to the order of values in the `CreationDate` column. Multiple ordering keys can be specified - this will order with the same semantics as an `ORDER BY` clause in a `SELECT` query. @@ -315,7 +310,7 @@ For users interested in the compression improvements achieved by using specific ## Next: Data Modelling Techniques {#next-data-modelling-techniques} -Until now, we've migrated only a single table. While this has allowed us to introduce some core ClickHouse concepts, most schemas are unfortunately not this simple. +Until now, we've migrated only a single table. While this has allowed us to introduce some core ClickHouse concepts, most schemas are unfortunately not this simple. In the other guides listed below, we will explore a number of techniques to restructure our wider schema for optimal ClickHouse querying. Throughout this process we aim for `Posts` to remain our central table through which most analytical queries are performed. While other tables can still be queried in isolation, we assume most analytics want to be performed in the context of `posts`. diff --git a/docs/guides/developer/deduplication.md b/docs/guides/developer/deduplication.md index dae1bf09849..0e063565405 100644 --- a/docs/guides/developer/deduplication.md +++ b/docs/guides/developer/deduplication.md @@ -5,6 +5,8 @@ sidebar_position: 3 description: Use deduplication when you need to perform frequent upserts, updates and deletes. --- +import deduplication from '@site/static/images/guides/developer/deduplication.png'; + # Deduplication Strategies **Deduplication** refers to the process of ***removing duplicate rows of a dataset***. In an OLTP database, this is done easily because each row has a unique primary key-but at the cost of slower inserts. Every inserted row needs to first be searched for and, if found, needs to be replaced. @@ -19,7 +21,7 @@ ClickHouse is built for speed when it comes to data insertion. The storage files ||| |------|----| -|Cassandra logo|ClickHouse provides free training on deduplication and many other topics. The [Deleting and Updating Data training module](https://learn.clickhouse.com/visitor_catalog_class/show/1328954/?utm_source=clickhouse&utm_medium=docs) is a good place to start.| +|Cassandra logo|ClickHouse provides free training on deduplication and many other topics. 
The [Deleting and Updating Data training module](https://learn.clickhouse.com/visitor_catalog_class/show/1328954/?utm_source=clickhouse&utm_medium=docs) is a good place to start.| @@ -338,4 +340,3 @@ A `VersionedCollapsingMergeTree` table is quite handy when you want to implement One reason inserted rows may not be deduplicated is if you are using a non-idempotent function or expression in your `INSERT` statement. For example, if you are inserting rows with the column `createdAt DateTime64(3) DEFAULT now()`, your rows are guaranteed to be unique because each row will have a unique default value for the `createdAt` column. The MergeTree / ReplicatedMergeTree table engine will not know to deduplicate the rows as each inserted row will generate a unique checksum. In this case, you can specify your own `insert_deduplication_token` for each batch of rows to ensure that multiple inserts of the same batch will not result in the same rows being re-inserted. Please see the [documentation on `insert_deduplication_token`](/operations/settings/settings#insert_deduplication_token) for more details about how to use this setting. - diff --git a/docs/guides/developer/understanding-query-execution-with-the-analyzer.md b/docs/guides/developer/understanding-query-execution-with-the-analyzer.md index c095882822c..4bd910690fa 100644 --- a/docs/guides/developer/understanding-query-execution-with-the-analyzer.md +++ b/docs/guides/developer/understanding-query-execution-with-the-analyzer.md @@ -4,6 +4,12 @@ sidebar_label: Understanding Query Execution with the Analyzer title: Understanding Query Execution with the Analyzer --- +import analyzer1 from '@site/static/images/guides/developer/analyzer1.png'; +import analyzer2 from '@site/static/images/guides/developer/analyzer2.png'; +import analyzer3 from '@site/static/images/guides/developer/analyzer3.png'; +import analyzer4 from '@site/static/images/guides/developer/analyzer4.png'; +import analyzer5 from '@site/static/images/guides/developer/analyzer5.png'; + # Understanding Query Execution with the Analyzer ClickHouse processes queries extremely quickly, but the execution of a query is not a simple story. Let’s try to understand how a `SELECT` query gets executed. To illustrate it, let’s add some data in a table in ClickHouse: @@ -26,7 +32,7 @@ INSERT INTO session_events SELECT * FROM generateRandom('clientId UUID, Now that we have some data in ClickHouse, we want to run some queries and understand their execution. The execution of a query is decomposed into many steps. Each step of the query execution can be analyzed and troubleshooted using the corresponding `EXPLAIN` query. These steps are summarized in the chart below: -![Explain query steps](./images/analyzer1.png) +Explain query steps Let’s look at each entity in action during query execution. We are going to take a few queries and then examine them using the `EXPLAIN` statement. @@ -57,18 +63,18 @@ EXPLAIN AST SELECT min(timestamp), max(timestamp) FROM session_events; The output is an Abstract Syntax Tree that can be visualized as shown below: -![AST output](./images/analyzer2.png) +AST output -Each node has corresponding children and the overall tree represents the overall structure of your query. This is a logical structure to help processing a query. From an end-user standpoint (unless interested in query execution), it is not super useful; this tool is mainly used by developers. +Each node has corresponding children and the overall tree represents the overall structure of your query. 
This is a logical structure to help processing a query. From an end-user standpoint (unless interested in query execution), it is not super useful; this tool is mainly used by developers. ## Analyzer {#analyzer} ClickHouse currently has two architectures for the Analyzer. You can use the old architecture by setting: `enable_analyzer=0`. The new architecture is enabled by default. We are going to describe only the new architecture here, given the old one is going to be deprecated once the new analyzer is generally available. :::note -The new architecture should provide us with a better framework to improve ClickHouse's performance. However, given it is a fundamental component of the query processing steps, it also might have a negative impact on some queries and there are [known incompatibilities](/operations/analyzer#known-incompatibilities). You can revert back to the old analyzer by changing the `enable_analyzer` setting at the query or user level. +The new architecture should provide us with a better framework to improve ClickHouse's performance. However, given it is a fundamental component of the query processing steps, it also might have a negative impact on some queries and there are [known incompatibilities](/operations/analyzer#known-incompatibilities). You can revert back to the old analyzer by changing the `enable_analyzer` setting at the query or user level. ::: - + The analyzer is an important step of the query execution. It takes an AST and transforms it into a query tree. The main benefit of a query tree over an AST is that a lot of the components will be resolved, like the storage for instance. We also know from which table to read, aliases are also resolved, and the tree knows the different data types used. With all these benefits, the analyzer can apply optimizations. The way these optimizations work is via “passes”. Every pass is going to look for different optimizations. You can see all the passes [here](https://github.com/ClickHouse/ClickHouse/blob/76578ebf92af3be917cd2e0e17fea2965716d958/src/Analyzer/QueryTreePassManager.cpp#L249), let’s see it in practice with our previous query: @@ -327,7 +333,7 @@ digraph You can then copy this output and paste it [here](https://dreampuf.github.io/GraphvizOnline) and that will generate the following graph: -![Graph output](./images/analyzer3.png) +Graph output A white rectangle corresponds to a pipeline node, the gray rectangle corresponds to the query plan steps, and the `x` followed by a number corresponds to the number of inputs/outputs that are being used. If you do not want to see them in a compact form, you can always add `compact=0`: @@ -367,9 +373,9 @@ digraph } ``` -![Compact graph output](./images/analyzer4.png) +Compact graph output -Why does ClickHouse not read from the table using multiple threads? Let's try to add more data to our table: +Why does ClickHouse not read from the table using multiple threads? Let's try to add more data to our table: ```sql INSERT INTO session_events SELECT * FROM generateRandom('clientId UUID, @@ -379,7 +385,7 @@ INSERT INTO session_events SELECT * FROM generateRandom('clientId UUID, type Enum(\'type1\', \'type2\')', 1, 10, 2) LIMIT 1000000; ``` -Now let's run our `EXPLAIN` query again: +Now let's run our `EXPLAIN` query again: ```sql EXPLAIN PIPELINE graph = 1, compact = 0 @@ -426,7 +432,7 @@ digraph } ``` -![Parallel graph](./images/analyzer5.png) +Parallel graph output So the executor decided not to parallelize operations because the volume of data was not high enough. 
By adding more rows, the executor then decided to use multiple threads as shown in the graph. diff --git a/docs/migrations/postgres/data-modeling-techniques.md b/docs/migrations/postgres/data-modeling-techniques.md index 8c8e07543ff..3f5162fafc3 100644 --- a/docs/migrations/postgres/data-modeling-techniques.md +++ b/docs/migrations/postgres/data-modeling-techniques.md @@ -5,6 +5,9 @@ description: Data modeling for migrating from PostgreSQL to ClickHouse keywords: [postgres, postgresql, migrate, migration, data modeling] --- +import postgres_partitions from '@site/static/images/migrations/postgres-partitions.png'; +import postgres_projections from '@site/static/images/migrations/postgres-projections.png'; + > This is **Part 3** of a guide on migrating from PostgreSQL to ClickHouse. This content can be considered introductory, with the aim of helping users deploy an initial functional system that adheres to ClickHouse best practices. It avoids complex topics and will not result in a fully optimized schema; rather, it provides a solid foundation for users to build a production system and base their learning. We recommend users migrating from Postgres read [the guide for modeling data in ClickHouse](/data-modeling/schema-design). This guide uses the same Stack Overflow dataset and explores multiple approaches using ClickHouse features. @@ -17,10 +20,7 @@ In ClickHouse, partitioning is specified on a table when it is initially defined
-NEEDS ALT +PostgreSQL partitions to ClickHouse partitions
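As a sketch of what this looks like in practice (a trimmed-down `posts` table is assumed; the schema used in this guide has more columns), the partition is declared with `PARTITION BY` at creation time, and whole partitions can later be dropped to expire data:

```sql
-- Hypothetical sketch: partition posts by the year they were created
CREATE TABLE posts_partitioned
(
    `Id` Int32,
    `PostTypeId` UInt8,
    `CreationDate` DateTime,
    `Title` String
)
ENGINE = MergeTree
ORDER BY (PostTypeId, CreationDate)
PARTITION BY toYear(CreationDate);

-- Expire the oldest year of data as a single lightweight metadata operation
ALTER TABLE posts_partitioned DROP PARTITION '2008';
```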
@@ -73,7 +73,7 @@ WHERE `table` = 'posts' └───────────┘ 17 rows in set. Elapsed: 0.002 sec. - + ALTER TABLE posts (DROP PARTITION '2008') @@ -88,7 +88,7 @@ Ok. Users should consider partitioning a data management technique. It is ideal when data needs to be expired from the cluster when operating with time series data e.g. the oldest partition can [simply be dropped](/sql-reference/statements/alter/partition#alter_drop-partition). -**Important:** Ensure your partitioning key expression does not result in a high cardinality set i.e. creating more than 100 partitions should be avoided. For example, do not partition your data by high cardinality columns such as client identifiers or names. Instead, make a client identifier or name the first column in the ORDER BY expression. +**Important:** Ensure your partitioning key expression does not result in a high cardinality set i.e. creating more than 100 partitions should be avoided. For example, do not partition your data by high cardinality columns such as client identifiers or names. Instead, make a client identifier or name the first column in the ORDER BY expression. > Internally, ClickHouse [creates parts](/guides/best-practices/sparse-primary-indexes#clickhouse-index-design) for inserted data. As more data is inserted, the number of parts increases. In order to prevent an excessively high number of parts, which will degrade query performance (more files to read), parts are merged together in a background asynchronous process. If the number of parts exceeds a pre-configured limit, then ClickHouse will throw an exception on insert - as a "too many parts" error. This should not happen under normal operation and only occurs if ClickHouse is misconfigured or used incorrectly e.g. many small inserts. @@ -98,7 +98,7 @@ Users should consider partitioning a data management technique. It is ideal when Postgres allows for the creation of multiple indices on a single table, enabling optimization for a variety of access patterns. This flexibility allows administrators and developers to tailor database performance to specific queries and operational needs. ClickHouse’s concept of projections, while not fully analogous to this, allows users to specify multiple `ORDER BY` clauses for a table. -In ClickHouse [data modeling docs](/data-modeling/schema-design), we explore how materialized views can be used in ClickHouse to pre-compute aggregations, transform rows, and optimize queries for different access patterns. +In ClickHouse [data modeling docs](/data-modeling/schema-design), we explore how materialized views can be used in ClickHouse to pre-compute aggregations, transform rows, and optimize queries for different access patterns. For the latter of these, we provided [an example](/materialized-view/incremental-materialized-view#lookup-table) where the materialized view sends rows to a target table with a different ordering key than the original table receiving inserts. @@ -208,14 +208,11 @@ WHERE UserId = 8592047 ## When to use projections {#when-to-use-projections} -Projections are an appealing feature for new users as they are automatically maintained as data is inserted. Furthermore, queries can just be sent to a single table where the projections are exploited where possible to speed up the response time. +Projections are an appealing feature for new users as they are automatically maintained as data is inserted. Furthermore, queries can just be sent to a single table where the projections are exploited where possible to speed up the response time.
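For reference, a hedged sketch of how such a projection is declared is shown below. It assumes the Stack Overflow `comments` table with a `UserId` column, as used for the lookup query above; the projection name is illustrative.

```sql
-- Hypothetical sketch: store an additional copy of the data ordered by UserId
ALTER TABLE comments
    ADD PROJECTION comments_by_user
    (
        SELECT *
        ORDER BY UserId
    );

-- Build the projection for rows already present in the table
ALTER TABLE comments MATERIALIZE PROJECTION comments_by_user;
```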
-NEEDS ALT +PostgreSQL projections in ClickHouse
diff --git a/docs/migrations/postgres/dataset.md b/docs/migrations/postgres/dataset.md index 767b4102e11..1f19b960747 100644 --- a/docs/migrations/postgres/dataset.md +++ b/docs/migrations/postgres/dataset.md @@ -5,6 +5,8 @@ description: Dataset example to migrate from PostgreSQL to ClickHouse keywords: [postgres, postgresql, migrate, migration] --- +import postgres_stackoverflow_schema from '@site/static/images/migrations/postgres-stackoverflow-schema.png'; + > This is **Part 1** of a guide on migrating from PostgreSQL to ClickHouse. This content can be considered introductory, with the aim of helping users deploy an initial functional system that adheres to ClickHouse best practices. It avoids complex topics and will not result in a fully optimized schema; rather, it provides a solid foundation for users to build a production system and base their learning. ## Dataset {#dataset} @@ -13,10 +15,7 @@ As an example dataset to show a typical migration from Postgres to ClickHouse, w
-NEEDS ALT +PostgreSQL Stack Overflow schema
diff --git a/docs/migrations/postgres/designing-schemas.md b/docs/migrations/postgres/designing-schemas.md index cb968392477..e69c01f0926 100644 --- a/docs/migrations/postgres/designing-schemas.md +++ b/docs/migrations/postgres/designing-schemas.md @@ -5,6 +5,9 @@ description: Designing schemas when migrating from PostgreSQL to ClickHouse keywords: [postgres, postgresql, migrate, migration, schema] --- +import postgres_b_tree from '@site/static/images/migrations/postgres-b-tree.png'; +import postgres_sparse_index from '@site/static/images/migrations/postgres-sparse-index.png'; + > This is **Part 2** of a guide on migrating from PostgreSQL to ClickHouse. This content can be considered introductory, with the aim of helping users deploy an initial functional system that adheres to ClickHouse best practices. It avoids complex topics and will not result in a fully optimized schema; rather, it provides a solid foundation for users to build a production system and base their learning. The Stack Overflow dataset contains a number of related tables. We recommend migrations focus on migrating their primary table first. This may not necessarily be the largest table but rather the one on which you expect to receive the most analytical queries. This will allow you to familiarize yourself with the main ClickHouse concepts, which are especially important if you come from a predominantly OLTP background. This table may require remodeling as additional tables are added to fully exploit ClickHouse features and obtain optimal performance. We explore this modeling process in our [Data Modeling docs](/data-modeling/schema-design#next-data-modelling-techniques). @@ -174,22 +177,16 @@ Users coming from OLTP databases often look for the equivalent concept in ClickH To understand why using your OLTP primary key in ClickHouse is not appropriate, users should understand the basics of ClickHouse indexing. We use Postgres as an example comparison, but these general concepts apply to other OLTP databases. - Postgres primary keys are, by definition, unique per row. The use of [B-tree structures](/guides/best-practices/sparse-primary-indexes#an-index-design-for-massive-data-scales) allows the efficient lookup of single rows by this key. While ClickHouse can be optimized for the lookup of a single row value, analytics workloads will typically require the reading of a few columns but for many rows. Filters will more often need to identify **a subset of rows** on which an aggregation will be performed. -- Memory and disk efficiency are paramount to the scale at which ClickHouse is often used. Data is written to ClickHouse tables in chunks known as parts, with rules applied for merging the parts in the background. In ClickHouse, each part has its own primary index. When parts are merged, the merged part's primary indexes are also merged. Unlike Postgres, these indexes are not built for each row. Instead, the primary index for a part has one index entry per group of rows - this technique is called **sparse indexing**. +- Memory and disk efficiency are paramount to the scale at which ClickHouse is often used. Data is written to ClickHouse tables in chunks known as parts, with rules applied for merging the parts in the background. In ClickHouse, each part has its own primary index. When parts are merged, the merged part's primary indexes are also merged. Unlike Postgres, these indexes are not built for each row. 
Instead, the primary index for a part has one index entry per group of rows - this technique is called **sparse indexing**. - **Sparse indexing** is possible because ClickHouse stores the rows for a part on disk ordered by a specified key. Instead of directly locating single rows (like a B-Tree-based index), the sparse primary index allows it to quickly (via a binary search over index entries) identify groups of rows that could possibly match the query. The located groups of potentially matching rows are then, in parallel, streamed into the ClickHouse engine in order to find the matches. This index design allows for the primary index to be small (it completely fits into the main memory) whilst still significantly speeding up query execution times, especially for range queries that are typical in data analytics use cases. For more details, we recommend this [in-depth guide](/guides/best-practices/sparse-primary-indexes).
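A quick, hedged way to see the effect (assuming the `posts` table from this guide has a date or datetime column such as `CreationDate` in its ordering key) is `EXPLAIN` with index analysis enabled; the granule counts reported will depend on your key and data.

```sql
-- Show which parts and granules survive the primary index analysis
EXPLAIN indexes = 1
SELECT count()
FROM posts
WHERE CreationDate >= '2024-01-01 00:00:00';
```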
-NEEDS ALT +PostgreSQL B-Tree Index
-NEEDS ALT +PostgreSQL Sparse Index
diff --git a/docs/migrations/postgres/replacing-merge-tree.md b/docs/migrations/postgres/replacing-merge-tree.md index 80035fcc604..6cc2122bf50 100644 --- a/docs/migrations/postgres/replacing-merge-tree.md +++ b/docs/migrations/postgres/replacing-merge-tree.md @@ -5,6 +5,7 @@ description: Using the ReplacingMergeTree engine in ClickHouse keywords: [replacingmergetree, inserts, deduplication] --- +import postgres_replacingmergetree from '@site/static/images/migrations/postgres-replacingmergetree.png'; While transactional databases are optimized for transactional update and delete workloads, OLAP databases offer reduced guarantees for such operations. Instead, they optimize for immutable data inserted in batches for the benefit of significantly faster analytical queries. While ClickHouse offers update operations through mutations, as well as a lightweight means of deleting rows, its column-orientated structure means these operations should be scheduled with care, as described above. These operations are handled asynchronously, processed with a single thread, and require (in the case of updates) data to be rewritten on disk. They should thus not be used for high numbers of small changes. In order to process a stream of update and delete rows while avoiding the above usage patterns, we can use the ClickHouse table engine ReplacingMergeTree. @@ -27,10 +28,7 @@ As a result of this merge process, we have four rows representing the final stat
-NEEDS ALT +ReplacingMergeTree process
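As a hedged sketch of the mechanics (a toy table, not the schema used later in this guide), a version column and a deleted flag determine which row survives a merge:

```sql
-- Hypothetical sketch: keep only the highest-version row per Id; a Deleted
-- flag of 1 marks the row for removal
CREATE TABLE posts_updates
(
    `Id` Int32,
    `Title` String,
    `Version` UInt64,
    `Deleted` UInt8
)
ENGINE = ReplacingMergeTree(Version, Deleted)
ORDER BY Id;

-- FINAL applies the replacement logic at query time, before merges complete
SELECT *
FROM posts_updates
FINAL
WHERE Deleted = 0;
```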
@@ -49,7 +47,7 @@ We recommend pausing inserts once (1) is guaranteed and until this command and t > Handling deletes with the ReplacingMergeTree is only recommended for tables with a low to moderate number of deletes (less than 10%) unless periods can be scheduled for cleanup with the above conditions. -> Tip: Users may also be able to issue `OPTIMIZE FINAL CLEANUP` against selective partitions no longer subject to changes. +> Tip: Users may also be able to issue `OPTIMIZE FINAL CLEANUP` against selective partitions no longer subject to changes. ## Choosing a primary/deduplication key {#choosing-a-primarydeduplication-key} @@ -229,7 +227,7 @@ If the `WHERE` condition does not use a key column, ClickHouse does not currentl Merging of data in ClickHouse occurs at a partition level. When using ReplacingMergeTree, we recommend users partition their table according to best practices, provided users can ensure this **partitioning key does not change for a row**. This will ensure updates pertaining to the same row will be sent to the same ClickHouse partition. You may reuse the same partition key as Postgres provided you adhere to the best practices outlined here. -Assuming this is the case, users can use the setting `do_not_merge_across_partitions_select_final=1` to improve `FINAL` query performance. This setting causes partitions to be merged and processed independently when using FINAL. +Assuming this is the case, users can use the setting `do_not_merge_across_partitions_select_final=1` to improve `FINAL` query performance. This setting causes partitions to be merged and processed independently when using FINAL. Consider the following posts table, where we use no partitioning: diff --git a/docs/migrations/snowflake.md b/docs/migrations/snowflake.md index fcd78c1b901..17af2f784a1 100644 --- a/docs/migrations/snowflake.md +++ b/docs/migrations/snowflake.md @@ -6,6 +6,8 @@ description: Migrating from Snowflake to ClickHouse keywords: [migrate, migration, migrating, data, etl, elt, snowflake] --- +import migrate_snowflake_clickhouse from '@site/static/images/migrations/migrate_snowflake_clickhouse.png'; + # Migrating from Snowflake to ClickHouse This guide shows how to migrate data from Snowflake to ClickHouse. @@ -14,7 +16,7 @@ Migrating data between Snowflake and ClickHouse requires the use of an object st ## 1. Exporting data from Snowflake {#1-exporting-data-from-snowflake} -Migrating from Snowflake to ClickHouse +Migrating from Snowflake to ClickHouse Exporting data from Snowflake requires the use of an external stage, as shown in the diagram above. 
@@ -37,7 +39,7 @@ In the example below, we create a named file format in Snowflake to represent Pa CREATE FILE FORMAT my_parquet_format TYPE = parquet; -- Create the external stage that specifies the S3 bucket to copy into -CREATE OR REPLACE STAGE external_stage +CREATE OR REPLACE STAGE external_stage URL='s3://mybucket/mydataset' CREDENTIALS=(AWS_KEY_ID='' AWS_SECRET_KEY='') FILE_FORMAT = my_parquet_format; diff --git a/docs/data-modeling/images/denormalization-diagram.png b/static/images/data-modeling/denormalization-diagram.png similarity index 100% rename from docs/data-modeling/images/denormalization-diagram.png rename to static/images/data-modeling/denormalization-diagram.png diff --git a/docs/data-modeling/images/denormalization-schema.png b/static/images/data-modeling/denormalization-schema.png similarity index 100% rename from docs/data-modeling/images/denormalization-schema.png rename to static/images/data-modeling/denormalization-schema.png diff --git a/docs/data-modeling/images/null_table_mv.png b/static/images/data-modeling/null_table_mv.png similarity index 100% rename from docs/data-modeling/images/null_table_mv.png rename to static/images/data-modeling/null_table_mv.png diff --git a/docs/data-modeling/images/schema-design-indices.png b/static/images/data-modeling/schema-design-indices.png similarity index 100% rename from docs/data-modeling/images/schema-design-indices.png rename to static/images/data-modeling/schema-design-indices.png diff --git a/docs/data-modeling/images/schema-design-types.png b/static/images/data-modeling/schema-design-types.png similarity index 100% rename from docs/data-modeling/images/schema-design-types.png rename to static/images/data-modeling/schema-design-types.png diff --git a/docs/data-modeling/images/stackoverflow-schema.png b/static/images/data-modeling/stackoverflow-schema.png similarity index 100% rename from docs/data-modeling/images/stackoverflow-schema.png rename to static/images/data-modeling/stackoverflow-schema.png diff --git a/docs/guides/developer/images/Deduplication.png b/static/images/guides/developer/Deduplication.png similarity index 100% rename from docs/guides/developer/images/Deduplication.png rename to static/images/guides/developer/Deduplication.png diff --git a/docs/guides/developer/images/analyzer1.png b/static/images/guides/developer/analyzer1.png similarity index 100% rename from docs/guides/developer/images/analyzer1.png rename to static/images/guides/developer/analyzer1.png diff --git a/docs/guides/developer/images/analyzer2.png b/static/images/guides/developer/analyzer2.png similarity index 100% rename from docs/guides/developer/images/analyzer2.png rename to static/images/guides/developer/analyzer2.png diff --git a/docs/guides/developer/images/analyzer3.png b/static/images/guides/developer/analyzer3.png similarity index 100% rename from docs/guides/developer/images/analyzer3.png rename to static/images/guides/developer/analyzer3.png diff --git a/docs/guides/developer/images/analyzer4.png b/static/images/guides/developer/analyzer4.png similarity index 100% rename from docs/guides/developer/images/analyzer4.png rename to static/images/guides/developer/analyzer4.png diff --git a/docs/guides/developer/images/analyzer5.png b/static/images/guides/developer/analyzer5.png similarity index 100% rename from docs/guides/developer/images/analyzer5.png rename to static/images/guides/developer/analyzer5.png diff --git a/docs/migrations/images/migrate_snowflake_clickhouse.png 
b/static/images/migrations/migrate_snowflake_clickhouse.png similarity index 100% rename from docs/migrations/images/migrate_snowflake_clickhouse.png rename to static/images/migrations/migrate_snowflake_clickhouse.png diff --git a/docs/migrations/images/postgres-b-tree.png b/static/images/migrations/postgres-b-tree.png similarity index 100% rename from docs/migrations/images/postgres-b-tree.png rename to static/images/migrations/postgres-b-tree.png diff --git a/docs/migrations/images/postgres-partitions.png b/static/images/migrations/postgres-partitions.png similarity index 100% rename from docs/migrations/images/postgres-partitions.png rename to static/images/migrations/postgres-partitions.png diff --git a/docs/migrations/images/postgres-projections.png b/static/images/migrations/postgres-projections.png similarity index 100% rename from docs/migrations/images/postgres-projections.png rename to static/images/migrations/postgres-projections.png diff --git a/docs/migrations/images/postgres-replacingmergetree.png b/static/images/migrations/postgres-replacingmergetree.png similarity index 100% rename from docs/migrations/images/postgres-replacingmergetree.png rename to static/images/migrations/postgres-replacingmergetree.png diff --git a/docs/migrations/images/postgres-sparse-index.png b/static/images/migrations/postgres-sparse-index.png similarity index 100% rename from docs/migrations/images/postgres-sparse-index.png rename to static/images/migrations/postgres-sparse-index.png diff --git a/docs/migrations/images/postgres-stackoverflow-schema.png b/static/images/migrations/postgres-stackoverflow-schema.png similarity index 100% rename from docs/migrations/images/postgres-stackoverflow-schema.png rename to static/images/migrations/postgres-stackoverflow-schema.png diff --git a/docs/images/quickstart/SQLConsole.png b/static/images/quickstart/SQLConsole.png similarity index 100% rename from docs/images/quickstart/SQLConsole.png rename to static/images/quickstart/SQLConsole.png diff --git a/docs/images/quickstart/ServiceDetails.png b/static/images/quickstart/ServiceDetails.png similarity index 100% rename from docs/images/quickstart/ServiceDetails.png rename to static/images/quickstart/ServiceDetails.png diff --git a/docs/images/quickstart/Services.png b/static/images/quickstart/Services.png similarity index 100% rename from docs/images/quickstart/Services.png rename to static/images/quickstart/Services.png diff --git a/docs/images/quickstart/ShowDatabases.png b/static/images/quickstart/ShowDatabases.png similarity index 100% rename from docs/images/quickstart/ShowDatabases.png rename to static/images/quickstart/ShowDatabases.png From 2daa8831b1efc82cddaac0d098c4021aaf709d27 Mon Sep 17 00:00:00 2001 From: Dale Mcdiarmid Date: Mon, 3 Mar 2025 18:36:49 +0000 Subject: [PATCH 2/4] ch mode images --- docs/deployment-modes.md | 17 +++++++++++------ .../images/deployment-modes/ch-cloud.png | Bin .../images/deployment-modes/ch-local.png | Bin .../images/deployment-modes/ch-server.png | Bin .../images/deployment-modes/chdb.png | Bin 5 files changed, 11 insertions(+), 6 deletions(-) rename {docs => static}/images/deployment-modes/ch-cloud.png (100%) rename {docs => static}/images/deployment-modes/ch-local.png (100%) rename {docs => static}/images/deployment-modes/ch-server.png (100%) rename {docs => static}/images/deployment-modes/chdb.png (100%) diff --git a/docs/deployment-modes.md b/docs/deployment-modes.md index abd35b57006..e46fbaf1a75 100644 --- a/docs/deployment-modes.md +++ 
b/docs/deployment-modes.md @@ -4,6 +4,11 @@ description: "ClickHouse offers four deployment options that all use the same po title: Deployment modes --- +import chServer from '@site/static/images/deployment-modes/ch-server.png'; +import chCloud from '@site/static/images/deployment-modes/ch-cloud.png'; +import chLocal from '@site/static/images/deployment-modes/ch-local.png'; +import chDB from '@site/static/images/deployment-modes/chdb.png'; + ClickHouse is a versatile database system that can be deployed in several different ways depending on your needs. At its core, all deployment options **use the same powerful ClickHouse database engine** – what differs is how you interact with it and where it runs. Whether you're running large-scale analytics in production, doing local data analysis, or building applications, there's a deployment option designed for your use case. The consistency of the underlying engine means you get the same high performance and SQL compatibility across all deployment modes. @@ -14,7 +19,7 @@ This guide explores the four main ways to deploy and use ClickHouse: * clickhouse-local for command-line data processing * chDB for embedding ClickHouse directly in applications -Each deployment mode has its own strengths and ideal use cases, which we'll explore in detail below. +Each deployment mode has its own strengths and ideal use cases, which we'll explore in detail below. @@ -22,18 +27,18 @@ Each deployment mode has its own strengths and ideal use cases, which we'll expl ClickHouse Server represents the traditional client/server architecture and is ideal for production deployments. This deployment mode provides the full OLAP database capabilities with high throughput and low latency queries that ClickHouse is known for. -ClickHouse Cloud +ClickHouse Server
When it comes to deployment flexibility, ClickHouse Server can be installed on your local machine for development or testing, deployed to major cloud providers like AWS, GCP, or Azure for cloud-based operations, or set up on your own on-premises hardware. For larger scale operations, it can be configured as a distributed cluster to handle increased load and provide high availability. -This deployment mode is the go-to choice for production environments where reliability, performance, and full feature access are crucial. +This deployment mode is the go-to choice for production environments where reliability, performance, and full feature access are crucial. ## ClickHouse Cloud {#clickhouse-cloud} [ClickHouse Cloud](/cloud/overview) is a fully managed version of ClickHouse that removes the operational overhead of running your own deployment. While it maintains all the core capabilities of ClickHouse Server, it enhances the experience with additional features designed to streamline development and operations. -ClickHouse Cloud +ClickHouse Cloud
@@ -49,7 +54,7 @@ The managed nature of the service means you don't need to worry about updates, b [clickhouse-local](/operations/utilities/clickhouse-local) is a powerful command-line tool that provides the complete functionality of ClickHouse in a standalone executable. It's essentially the same database as ClickHouse Server, but packaged in a way that lets you harness all of ClickHouse's capabilities directly from the command line without running a server instance. -ClickHouse Cloud +clickhouse-local
This tool excels at ad-hoc data analysis, particularly when working with local files or data stored in cloud storage services. You can directly query files in various formats (CSV, JSON, Parquet, etc.) using ClickHouse's SQL dialect, making it an excellent choice for quick data exploration or one-off analysis tasks. @@ -62,7 +67,7 @@ The combination of remote table functions and access to the local file system ma [chDB](/chdb) is ClickHouse embedded as an in-process database engine, with Python being the primary implementation, though it's also available for Go, Rust, NodeJS, and Bun. This deployment option brings ClickHouse's powerful OLAP capabilities directly into your application's process, eliminating the need for a separate database installation. -ClickHouse Cloud +chDB - Embedded ClickHouse
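To make the ad-hoc file querying described above concrete, here is a minimal sketch of the kind of statement that clickhouse-local (and, equally, chDB) can run against a local file; the file name `events.parquet` is hypothetical:

```sql
-- Query a local Parquet file in place; the format is inferred from the file extension
SELECT count() AS total_rows
FROM file('events.parquet');
```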
diff --git a/docs/images/deployment-modes/ch-cloud.png b/static/images/deployment-modes/ch-cloud.png similarity index 100% rename from docs/images/deployment-modes/ch-cloud.png rename to static/images/deployment-modes/ch-cloud.png diff --git a/docs/images/deployment-modes/ch-local.png b/static/images/deployment-modes/ch-local.png similarity index 100% rename from docs/images/deployment-modes/ch-local.png rename to static/images/deployment-modes/ch-local.png diff --git a/docs/images/deployment-modes/ch-server.png b/static/images/deployment-modes/ch-server.png similarity index 100% rename from docs/images/deployment-modes/ch-server.png rename to static/images/deployment-modes/ch-server.png diff --git a/docs/images/deployment-modes/chdb.png b/static/images/deployment-modes/chdb.png similarity index 100% rename from docs/images/deployment-modes/chdb.png rename to static/images/deployment-modes/chdb.png From f3e7a1189dd3e40d31c76d221460e50dd80a2fb6 Mon Sep 17 00:00:00 2001 From: Dale Mcdiarmid Date: Mon, 3 Mar 2025 18:38:41 +0000 Subject: [PATCH 3/4] de-duplication --- docs/guides/developer/deduplication.md | 2 +- .../{Deduplication.png => de_duplication.png} | Bin 2 files changed, 1 insertion(+), 1 deletion(-) rename static/images/guides/developer/{Deduplication.png => de_duplication.png} (100%) diff --git a/docs/guides/developer/deduplication.md b/docs/guides/developer/deduplication.md index 0e063565405..d2b1fa2db12 100644 --- a/docs/guides/developer/deduplication.md +++ b/docs/guides/developer/deduplication.md @@ -5,7 +5,7 @@ sidebar_position: 3 description: Use deduplication when you need to perform frequent upserts, updates and deletes. --- -import deduplication from '@site/static/images/guides/developer/deduplication.png'; +import deduplication from '@site/static/images/guides/developer/de_duplication.png'; # Deduplication Strategies diff --git a/static/images/guides/developer/Deduplication.png b/static/images/guides/developer/de_duplication.png similarity index 100% rename from static/images/guides/developer/Deduplication.png rename to static/images/guides/developer/de_duplication.png From 33dd8ec4141a852a12b8ad1354aa597ecbd94d0f Mon Sep 17 00:00:00 2001 From: Dale Mcdiarmid Date: Mon, 3 Mar 2025 18:41:26 +0000 Subject: [PATCH 4/4] mvs images --- .../incremental-materialized-view.md | 13 +++++----- .../refreshable-materialized-view.md | 24 ++++++++++-------- .../materialized-view-diagram.png | Bin .../refreshable-materialized-view-diagram.png | Bin 4 files changed, 20 insertions(+), 17 deletions(-) rename {docs/materialized-view/images => static/images/materialized-view}/materialized-view-diagram.png (100%) rename {docs/materialized-view/images => static/images/materialized-view}/refreshable-materialized-view-diagram.png (100%) diff --git a/docs/materialized-view/incremental-materialized-view.md b/docs/materialized-view/incremental-materialized-view.md index 3ce30cb5e24..b8c36216bcf 100644 --- a/docs/materialized-view/incremental-materialized-view.md +++ b/docs/materialized-view/incremental-materialized-view.md @@ -6,6 +6,8 @@ keywords: [incremental materialized views, speed up queries, query optimization] score: 10000 --- +import materializedViewDiagram from '@site/static/images/materialized-view/materialized-view-diagram.png'; + # Incremental Materialized Views Incremental Materialized Views (Materialized Views) allow users to shift the cost of computation from query time to insert time, resulting in faster `SELECT` queries. 
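As a minimal sketch of this pattern (the `events` source table and `events_per_day` target table are hypothetical and not taken from the guide), an incremental materialized view pairs a target table with a `SELECT` that runs against each block of rows inserted into the source table:

```sql
-- Hypothetical source table receiving raw inserts
CREATE TABLE events
(
    ts DateTime,
    user_id UInt64
)
ENGINE = MergeTree
ORDER BY ts;

-- Target table holding the incrementally maintained aggregate
CREATE TABLE events_per_day
(
    day Date,
    users AggregateFunction(uniq, UInt64)
)
ENGINE = AggregatingMergeTree
ORDER BY day;

-- The view executes this SELECT over each inserted block and writes the result to the target
CREATE MATERIALIZED VIEW events_per_day_mv TO events_per_day AS
SELECT
    toDate(ts) AS day,
    uniqState(user_id) AS users
FROM events
GROUP BY day;
```

Queries against `events_per_day` would then combine the partial aggregation states, for example with `uniqMerge(users)`.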
@@ -16,11 +18,10 @@ The principal motivation for materialized views is that the results inserted int Materialized views in ClickHouse are updated in real time as data flows into the table they are based on, functioning more like continually updating indexes. This is in contrast to other databases where materialized views are typically static snapshots of a query that must be refreshed (similar to ClickHouse [refreshable materialized views](/sql-reference/statements/create/view#refreshable-materialized-view)). - - +Materialized view diagram ## Example {#example} @@ -410,7 +411,7 @@ In this example, our materialized view can be very simple, selecting only the `P CREATE TABLE comments_posts_users ( PostId UInt32, UserId Int32 -) ENGINE = MergeTree ORDER BY UserId +) ENGINE = MergeTree ORDER BY UserId CREATE TABLE comments_null AS comments diff --git a/docs/materialized-view/refreshable-materialized-view.md b/docs/materialized-view/refreshable-materialized-view.md index 41a7ed7ebbf..40e21abd715 100644 --- a/docs/materialized-view/refreshable-materialized-view.md +++ b/docs/materialized-view/refreshable-materialized-view.md @@ -5,14 +5,16 @@ description: How to use materialized views to speed up queries keywords: [refreshable materialized view, refresh, materialized views, speed up queries, query optimization] --- +import refreshableMaterializedViewDiagram from '@site/static/images/materialized-view/refreshable-materialized-view-diagram.png'; + [Refreshable materialized views](/sql-reference/statements/create/view#refreshable-materialized-view) are conceptually similar to materialized views in traditional OLTP databases, storing the result of a specified query for quick retrieval and reducing the need to repeatedly execute resource-intensive queries. Unlike ClickHouse’s [incremental materialized views](/materialized-view/incremental-materialized-view), this requires the periodic execution of the query over the full dataset - the results of which are stored in a target table for querying. This result set should, in theory, be smaller than the original dataset, allowing the subsequent query to execute faster. The diagram explains how Refreshable Materialized Views work: -Refreshable materialized view diagram +Refreshable materialized view diagram You can also see the following video: @@ -44,7 +46,7 @@ If you want to force refresh a materialized view, you can use the `SYSTEM REFRES SYSTEM REFRESH VIEW table_name_mv; ``` -You can also cancel, stop, or start a view. +You can also cancel, stop, or start a view. For more details, see the [managing refreshable materialized views](/sql-reference/statements/system#refreshable-materialized-views) documentation. ## When was a refreshable materialized view last refreshed? 
{#when-was-a-refreshable-materialized-view-last-refreshed} @@ -52,7 +54,7 @@ To find out when a refreshable materialized view was last refreshed, you can query the [`system.view_refreshes`](/operations/system-tables/view_refreshes) system table, as shown below: ```sql -SELECT database, view, status, +SELECT database, view, status, last_success_time, last_refresh_time, next_refresh_time, read_rows, written_rows FROM system.view_refreshes; @@ -140,8 +142,8 @@ CREATE TABLE events_snapshot ( ts DateTime32, uuid String, count UInt64 -) -ENGINE = MergeTree +) +ENGINE = MergeTree ORDER BY uuid; ``` @@ -240,7 +242,7 @@ In the [dbt and ClickHouse integration guide](/integrations/dbt#dbt) we populate We can then write the following query to compute a summary of each actor, ordered by the most movie appearances. ```sql -SELECT +SELECT id, any(actor_name) AS name, uniqExact(movie_id) AS movies, round(avg(rank), 2) AS avg_rank, uniqExact(genre) AS genres, uniqExact(director_name) AS directors, max(created_at) AS updated_at @@ -248,8 +250,8 @@ FROM ( SELECT imdb.actors.id AS id, concat(imdb.actors.first_name, ' ', imdb.actors.last_name) AS actor_name, - imdb.movies.id AS movie_id, imdb.movies.rank AS rank, genre, - concat(imdb.directors.first_name, ' ', imdb.directors.last_name) AS director_name, + imdb.movies.id AS movie_id, imdb.movies.rank AS rank, genre, + concat(imdb.directors.first_name, ' ', imdb.directors.last_name) AS director_name, created_at FROM imdb.actors INNER JOIN imdb.roles ON imdb.roles.actor_id = imdb.actors.id diff --git a/docs/materialized-view/images/materialized-view-diagram.png b/static/images/materialized-view/materialized-view-diagram.png similarity index 100% rename from docs/materialized-view/images/materialized-view-diagram.png rename to static/images/materialized-view/materialized-view-diagram.png diff --git a/docs/materialized-view/images/refreshable-materialized-view-diagram.png b/static/images/materialized-view/refreshable-materialized-view-diagram.png similarity index 100% rename from docs/materialized-view/images/refreshable-materialized-view-diagram.png rename to static/images/materialized-view/refreshable-materialized-view-diagram.png
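Returning to the refreshable materialized view changes above, a minimal sketch of how such a view might be declared follows; the table names and refresh schedule are illustrative and assume that an `events` source table and an `events_summary` target table already exist:

```sql
-- Re-run the full query every hour and replace the contents of the target table with the result
CREATE MATERIALIZED VIEW events_summary_mv
REFRESH EVERY 1 HOUR TO events_summary AS
SELECT
    toDate(ts) AS day,
    count() AS total_events
FROM events
GROUP BY day;
```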