From 12c7c466bd34bb844f1457a2d27824e41fcae264 Mon Sep 17 00:00:00 2001
From: Paultagoras
Date: Tue, 25 Mar 2025 17:26:59 -0400
Subject: [PATCH 1/2] Add ignorePartitionsWhenBatching property information

---
 .../kafka/kafka-clickhouse-connect-sink.md | 53 ++++++++++---------
 1 file changed, 27 insertions(+), 26 deletions(-)

diff --git a/docs/integrations/data-ingestion/kafka/kafka-clickhouse-connect-sink.md b/docs/integrations/data-ingestion/kafka/kafka-clickhouse-connect-sink.md
index bd7cb1eb6ff..af28b2dc79c 100644
--- a/docs/integrations/data-ingestion/kafka/kafka-clickhouse-connect-sink.md
+++ b/docs/integrations/data-ingestion/kafka/kafka-clickhouse-connect-sink.md
@@ -89,33 +89,34 @@ To connect the ClickHouse Sink to the ClickHouse server, you need to provide:
 
 The full table of configuration options:
 
-| Property Name | Description | Default Value |
-|---------------|-------------|---------------|
-| `hostname` (Required) | The hostname or IP address of the server | N/A |
-| `port` | The ClickHouse port - default is 8443 (for HTTPS in the cloud), but for HTTP (the default for self-hosted) it should be 8123 | `8443` |
-| `ssl` | Enable ssl connection to ClickHouse | `true` |
-| `jdbcConnectionProperties` | Connection properties when connecting to Clickhouse. Must start with `?` and joined by `&` between `param=value` | `""` |
-| `username` | ClickHouse database username | `default` |
-| `password` (Required) | ClickHouse database password | N/A |
-| `database` | ClickHouse database name | `default` |
-| `connector.class` (Required) | Connector Class(explicit set and keep as the default value) | `"com.clickhouse.kafka.connect.ClickHouseSinkConnector"` |
-| `tasks.max` | The number of Connector Tasks | `"1"` |
-| `errors.retry.timeout` | ClickHouse JDBC Retry Timeout | `"60"` |
-| `exactlyOnce` | Exactly Once Enabled | `"false"` |
-| `topics` (Required) | The Kafka topics to poll - topic names must match table names | `""` |
-| `key.converter` (Required* - See Description) | Set according to the types of your keys. Required here if you are passing keys (and not defined in worker config). | `"org.apache.kafka.connect.storage.StringConverter"` |
-| `value.converter` (Required* - See Description) | Set based on the type of data on your topic. Supported: - JSON, String, Avro or Protobuf formats. Required here if not defined in worker config. | `"org.apache.kafka.connect.json.JsonConverter"` |
-| `value.converter.schemas.enable` | Connector Value Converter Schema Support | `"false"` |
-| `errors.tolerance` | Connector Error Tolerance. Supported: none, all | `"none"` |
-| `errors.deadletterqueue.topic.name` | If set (with errors.tolerance=all), a DLQ will be used for failed batches (see [Troubleshooting](#troubleshooting)) | `""` |
-| `errors.deadletterqueue.context.headers.enable` | Adds additional headers for the DLQ | `""` |
-| `clickhouseSettings` | Comma-separated list of ClickHouse settings (e.g. "insert_quorum=2, etc...") | `""` |
-| `topic2TableMap` | Comma-separated list that maps topic names to table names (e.g. "topic1=table1, topic2=table2, etc...") | `""` |
-| `tableRefreshInterval` | Time (in seconds) to refresh the table definition cache | `0` |
+| Property Name | Description | Default Value |
+|---------------|-------------|---------------|
+| `hostname` (Required) | The hostname or IP address of the server | N/A |
+| `port` | The ClickHouse port - default is 8443 (for HTTPS in the cloud), but for HTTP (the default for self-hosted) it should be 8123 | `8443` |
+| `ssl` | Enable ssl connection to ClickHouse | `true` |
+| `jdbcConnectionProperties` | Connection properties when connecting to Clickhouse. Must start with `?` and joined by `&` between `param=value` | `""` |
+| `username` | ClickHouse database username | `default` |
+| `password` (Required) | ClickHouse database password | N/A |
+| `database` | ClickHouse database name | `default` |
+| `connector.class` (Required) | Connector Class(explicit set and keep as the default value) | `"com.clickhouse.kafka.connect.ClickHouseSinkConnector"` |
+| `tasks.max` | The number of Connector Tasks | `"1"` |
+| `errors.retry.timeout` | ClickHouse JDBC Retry Timeout | `"60"` |
+| `exactlyOnce` | Exactly Once Enabled | `"false"` |
+| `topics` (Required) | The Kafka topics to poll - topic names must match table names | `""` |
+| `key.converter` (Required* - See Description) | Set according to the types of your keys. Required here if you are passing keys (and not defined in worker config). | `"org.apache.kafka.connect.storage.StringConverter"` |
+| `value.converter` (Required* - See Description) | Set based on the type of data on your topic. Supported: - JSON, String, Avro or Protobuf formats. Required here if not defined in worker config. | `"org.apache.kafka.connect.json.JsonConverter"` |
+| `value.converter.schemas.enable` | Connector Value Converter Schema Support | `"false"` |
+| `errors.tolerance` | Connector Error Tolerance. Supported: none, all | `"none"` |
+| `errors.deadletterqueue.topic.name` | If set (with errors.tolerance=all), a DLQ will be used for failed batches (see [Troubleshooting](#troubleshooting)) | `""` |
+| `errors.deadletterqueue.context.headers.enable` | Adds additional headers for the DLQ | `""` |
+| `clickhouseSettings` | Comma-separated list of ClickHouse settings (e.g. "insert_quorum=2, etc...") | `""` |
+| `topic2TableMap` | Comma-separated list that maps topic names to table names (e.g. "topic1=table1, topic2=table2, etc...") | `""` |
+| `tableRefreshInterval` | Time (in seconds) to refresh the table definition cache | `0` |
 | `keeperOnCluster` | Allows configuration of ON CLUSTER parameter for self-hosted instances (e.g. `ON CLUSTER clusterNameInConfigFileDefinition`) for exactly-once connect_state table (see [Distributed DDL Queries](/sql-reference/distributed-ddl) | `""` |
-| `bypassRowBinary` | Allows disabling use of RowBinary and RowBinaryWithDefaults for Schema-based data (Avro, Protobuf, etc.) - should only be used when data will have missing columns, and Nullable/Default are unacceptable | `"false"` |
-| `dateTimeFormats` | Date time formats for parsing DateTime64 schema fields, separated by `;` (e.g. `someDateField=yyyy-MM-dd HH:mm:ss.SSSSSSSSS;someOtherDateField=yyyy-MM-dd HH:mm:ss`). | `""` |
| `""` | -| `tolerateStateMismatch` | Allows the connector to drop records "earlier" than the current offset stored AFTER_PROCESSING (e.g. if offset 5 is sent, and offset 250 was the last recorded offset) | `"false"` | +| `bypassRowBinary` | Allows disabling use of RowBinary and RowBinaryWithDefaults for Schema-based data (Avro, Protobuf, etc.) - should only be used when data will have missing columns, and Nullable/Default are unacceptable | `"false"` | +| `dateTimeFormats` | Date time formats for parsing DateTime64 schema fields, separated by `;` (e.g. `someDateField=yyyy-MM-dd HH:mm:ss.SSSSSSSSS;someOtherDateField=yyyy-MM-dd HH:mm:ss`). | `""` | +| `tolerateStateMismatch` | Allows the connector to drop records "earlier" than the current offset stored AFTER_PROCESSING (e.g. if offset 5 is sent, and offset 250 was the last recorded offset) | `"false"` | +| `ignorePartitionsWhenBatching` | Will ignore partition when collecting messages for insert (though only if exactlyOnce is `false`). Performance Note: The more connector tasks, the fewer kafka partitions assigned per task - this can mean diminishing returns. | `"false"` | ### Target Tables {#target-tables} From 4ad6fc2260fd47be9e575820d747e5d515c721d2 Mon Sep 17 00:00:00 2001 From: Paultagoras Date: Wed, 26 Mar 2025 10:53:41 -0400 Subject: [PATCH 2/2] Update kafka-clickhouse-connect-sink.md --- .../kafka/kafka-clickhouse-connect-sink.md | 56 +++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/docs/integrations/data-ingestion/kafka/kafka-clickhouse-connect-sink.md b/docs/integrations/data-ingestion/kafka/kafka-clickhouse-connect-sink.md index af28b2dc79c..c60128158ac 100644 --- a/docs/integrations/data-ingestion/kafka/kafka-clickhouse-connect-sink.md +++ b/docs/integrations/data-ingestion/kafka/kafka-clickhouse-connect-sink.md @@ -89,34 +89,34 @@ To connect the ClickHouse Sink to the ClickHouse server, you need to provide: The full table of configuration options: -| Property Name | Description | Default Value | -|-------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------| -| `hostname` (Required) | The hostname or IP address of the server | N/A | -| `port` | The ClickHouse port - default is 8443 (for HTTPS in the cloud), but for HTTP (the default for self-hosted) it should be 8123 | `8443` | -| `ssl` | Enable ssl connection to ClickHouse | `true` | -| `jdbcConnectionProperties` | Connection properties when connecting to Clickhouse. Must start with `?` and joined by `&` between `param=value` | `""` | -| `username` | ClickHouse database username | `default` | -| `password` (Required) | ClickHouse database password | N/A | -| `database` | ClickHouse database name | `default` | -| `connector.class` (Required) | Connector Class(explicit set and keep as the default value) | `"com.clickhouse.kafka.connect.ClickHouseSinkConnector"` | -| `tasks.max` | The number of Connector Tasks | `"1"` | -| `errors.retry.timeout` | ClickHouse JDBC Retry Timeout | `"60"` | -| `exactlyOnce` | Exactly Once Enabled | `"false"` | -| `topics` (Required) | The Kafka topics to poll - topic names must match table names | `""` | -| `key.converter` (Required* - See Description) | Set according to the types of your keys. 
Required here if you are passing keys (and not defined in worker config). | `"org.apache.kafka.connect.storage.StringConverter"` | -| `value.converter` (Required* - See Description) | Set based on the type of data on your topic. Supported: - JSON, String, Avro or Protobuf formats. Required here if not defined in worker config. | `"org.apache.kafka.connect.json.JsonConverter"` | -| `value.converter.schemas.enable` | Connector Value Converter Schema Support | `"false"` | -| `errors.tolerance` | Connector Error Tolerance. Supported: none, all | `"none"` | -| `errors.deadletterqueue.topic.name` | If set (with errors.tolerance=all), a DLQ will be used for failed batches (see [Troubleshooting](#troubleshooting)) | `""` | -| `errors.deadletterqueue.context.headers.enable` | Adds additional headers for the DLQ | `""` | -| `clickhouseSettings` | Comma-separated list of ClickHouse settings (e.g. "insert_quorum=2, etc...") | `""` | -| `topic2TableMap` | Comma-separated list that maps topic names to table names (e.g. "topic1=table1, topic2=table2, etc...") | `""` | -| `tableRefreshInterval` | Time (in seconds) to refresh the table definition cache | `0` | -| `keeperOnCluster` | Allows configuration of ON CLUSTER parameter for self-hosted instances (e.g. `ON CLUSTER clusterNameInConfigFileDefinition`) for exactly-once connect_state table (see [Distributed DDL Queries](/sql-reference/distributed-ddl) | `""` | -| `bypassRowBinary` | Allows disabling use of RowBinary and RowBinaryWithDefaults for Schema-based data (Avro, Protobuf, etc.) - should only be used when data will have missing columns, and Nullable/Default are unacceptable | `"false"` | -| `dateTimeFormats` | Date time formats for parsing DateTime64 schema fields, separated by `;` (e.g. `someDateField=yyyy-MM-dd HH:mm:ss.SSSSSSSSS;someOtherDateField=yyyy-MM-dd HH:mm:ss`). | `""` | -| `tolerateStateMismatch` | Allows the connector to drop records "earlier" than the current offset stored AFTER_PROCESSING (e.g. if offset 5 is sent, and offset 250 was the last recorded offset) | `"false"` | -| `ignorePartitionsWhenBatching` | Will ignore partition when collecting messages for insert (though only if exactlyOnce is `false`). Performance Note: The more connector tasks, the fewer kafka partitions assigned per task - this can mean diminishing returns. | `"false"` | +| Property Name | Description | Default Value | +|-------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------| +| `hostname` (Required) | The hostname or IP address of the server | N/A | +| `port` | The ClickHouse port - default is 8443 (for HTTPS in the cloud), but for HTTP (the default for self-hosted) it should be 8123 | `8443` | +| `ssl` | Enable ssl connection to ClickHouse | `true` | +| `jdbcConnectionProperties` | Connection properties when connecting to Clickhouse. 
+| `username` | ClickHouse database username | `default` |
+| `password` (Required) | ClickHouse database password | N/A |
+| `database` | ClickHouse database name | `default` |
+| `connector.class` (Required) | Connector class (set explicitly and keep the default value) | `"com.clickhouse.kafka.connect.ClickHouseSinkConnector"` |
+| `tasks.max` | The number of Connector Tasks | `"1"` |
+| `errors.retry.timeout` | ClickHouse JDBC Retry Timeout | `"60"` |
+| `exactlyOnce` | Exactly Once Enabled | `"false"` |
+| `topics` (Required) | The Kafka topics to poll - topic names must match table names | `""` |
+| `key.converter` (Required* - See Description) | Set according to the types of your keys. Required here if you are passing keys (and not defined in worker config). | `"org.apache.kafka.connect.storage.StringConverter"` |
+| `value.converter` (Required* - See Description) | Set based on the type of data on your topic. Supported: JSON, String, Avro, or Protobuf formats. Required here if not defined in worker config. | `"org.apache.kafka.connect.json.JsonConverter"` |
+| `value.converter.schemas.enable` | Connector Value Converter Schema Support | `"false"` |
+| `errors.tolerance` | Connector Error Tolerance. Supported: none, all | `"none"` |
+| `errors.deadletterqueue.topic.name` | If set (with errors.tolerance=all), a DLQ will be used for failed batches (see [Troubleshooting](#troubleshooting)) | `""` |
+| `errors.deadletterqueue.context.headers.enable` | Adds additional headers for the DLQ | `""` |
+| `clickhouseSettings` | Comma-separated list of ClickHouse settings (e.g. "insert_quorum=2, etc...") | `""` |
+| `topic2TableMap` | Comma-separated list that maps topic names to table names (e.g. "topic1=table1, topic2=table2, etc...") | `""` |
+| `tableRefreshInterval` | Time (in seconds) to refresh the table definition cache | `0` |
+| `keeperOnCluster` | Allows configuration of ON CLUSTER parameter for self-hosted instances (e.g. `ON CLUSTER clusterNameInConfigFileDefinition`) for the exactly-once connect_state table (see [Distributed DDL Queries](/sql-reference/distributed-ddl)) | `""` |
+| `bypassRowBinary` | Allows disabling use of RowBinary and RowBinaryWithDefaults for Schema-based data (Avro, Protobuf, etc.) - should only be used when data will have missing columns, and Nullable/Default are unacceptable | `"false"` |
+| `dateTimeFormats` | Date time formats for parsing DateTime64 schema fields, separated by `;` (e.g. `someDateField=yyyy-MM-dd HH:mm:ss.SSSSSSSSS;someOtherDateField=yyyy-MM-dd HH:mm:ss`) | `""` |
+| `tolerateStateMismatch` | Allows the connector to drop records "earlier" than the current offset stored AFTER_PROCESSING (e.g. if offset 5 is sent, and offset 250 was the last recorded offset) | `"false"` |
+| `ignorePartitionsWhenBatching` | Ignores the partition when collecting messages for insert (though only if `exactlyOnce` is `false`). Performance note: the more connector tasks, the fewer Kafka partitions assigned per task - this can mean diminishing returns. | `"false"` |
 
 ### Target Tables {#target-tables}
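For reference, a minimal sink configuration exercising the new `ignorePartitionsWhenBatching` property might look like the sketch below. The connector name, hostname, credentials, topic, and topic-to-table mapping are placeholders to adapt to your deployment; only the property names come from the table above.

```json
{
  "name": "clickhouse-sink",
  "config": {
    "connector.class": "com.clickhouse.kafka.connect.ClickHouseSinkConnector",
    "tasks.max": "1",
    "topics": "events",
    "hostname": "<your-clickhouse-host>",
    "port": "8443",
    "ssl": "true",
    "username": "default",
    "password": "<password>",
    "database": "default",
    "value.converter": "org.apache.kafka.connect.json.JsonConverter",
    "value.converter.schemas.enable": "false",
    "exactlyOnce": "false",
    "ignorePartitionsWhenBatching": "true",
    "topic2TableMap": "events=events_table"
  }
}
```

Note that `ignorePartitionsWhenBatching` only takes effect here because `exactlyOnce` is left at `false`; per the table above, the setting does not apply when exactly-once is enabled.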