From a44869746f4d8c70861c20cfb163e63e5ede2934 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Wed, 17 Sep 2025 20:36:16 +0200 Subject: [PATCH 1/9] add data masking draft --- docs/cloud/guides/security/data_masking.md | 185 +++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 docs/cloud/guides/security/data_masking.md diff --git a/docs/cloud/guides/security/data_masking.md b/docs/cloud/guides/security/data_masking.md new file mode 100644 index 00000000000..27372012edc --- /dev/null +++ b/docs/cloud/guides/security/data_masking.md @@ -0,0 +1,185 @@ +--- +slug: /cloud/guides/data-masking +sidebar_label: 'Data masking' +title: 'Data masking in ClickHouse Cloud' +description: 'A guide to data masking in ClickHouse Cloud' +keywords: ['data masking'] +--- + +# Data masking in ClickHouse Cloud + +Data masking is a technique used for data protection, in which the original data +is replaced with a version of the data which maintains its format and structure +while removing any personally identifiable information (PII) or sensitive information. + +This guide shows you how you can mask data in ClickHouse. + +## Use String functions {#using-string-functions} + +For basic data masking use cases, the `replace` family of functions can be used: + +| Function | Description | +|------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------| +| [`replaceOne`](/sql-reference/functions/string-replace-functions#replaceone) | Replaces the first occurrence of a pattern in a haystack string by the provided replacement string. | +| [`replaceAll`](/sql-reference/functions/string-replace-functions#replaceall) | Replaces all occurrences of a pattern in a haystack string by the provided replacement string. | +| [`replaceRegexpOne`](/sql-reference/functions/string-replace-functions#replaceregexpone) | Replaces the first occurrence of a substring matching a regular expression pattern (in re2 syntax) in a haystack by the provided replacement string. | +| [`replaceRegexpAll`](/sql-reference/functions/string-replace-functions#replaceregexpall) | Replaces all occurrences of a substring matching a regular expression pattern (in re2 syntax) in a haystack by the provided replacement string. | + +For example, you can replace customer names with a placeholder `[CUSTOMER_NAME]` using the `replaceOne` function: + +```sql +SELECT replaceOne( + 'Customer John Smith called about his account', + 'John Smith', + '[CUSTOMER_NAME]' +) AS anonymized_text; +``` + +```response +┌─anonymized_text───────────────────────────────────┐ +│ Customer [CUSTOMER_NAME] called about his account │ +└───────────────────────────────────────────────────┘ +``` + +Or mask a social security number, leaving only the last 4 digits using the `replaceRegexpAll` function: + +```sql +SELECT replaceRegexpAll( + 'SSN: 123-45-6789', + '(\d{3})-(\d{2})-(\d{4})', + 'XXX-XX-\3' +) AS masked_ssn; +``` + +```response +┌─masked_ssn───────┐ +│ SSN: XXX-XX-6789 │ +└──────────────────┘ +``` + +## Create masked `VIEW`s {#masked-views} + +A [`VIEW`](/sql-reference/statements/create/view) can be used in conjunction with +the aforementioned functions to apply transformations to columns containing sensitive data, before they are presented to the user. +In this way, the original data remains unchanged, and users querying the view see the masked data. + +To demonstrate, let's imagine that we have a table which stores records of customer orders. +We want to make sure that certain employees can view the information without exposing +personal data of the customers. + +First, create the following table for the data, and insert some rows into it: + +```sql +CREATE TABLE orders ( + user_id UInt32, + name String, + email String, + phone String, + total_amount Decimal(10,2), + order_date Date, + shipping_address String +) +ENGINE = MergeTree() +ORDER BY user_id; + +INSERT INTO orders VALUES + (1001, 'John Smith', 'john.smith@gmail.com', '555-123-4567', 299.99, '2024-01-15', '123 Main St, New York, NY 10001'), + (1002, 'Sarah Johnson', 'sarah.johnson@outlook.com', '555-987-6543', 149.50, '2024-01-16', '456 Oak Ave, Los Angeles, CA 90210'), + (1003, 'Michael Brown', 'mbrown@company.com', '555-456-7890', 599.00, '2024-01-17', '789 Pine Rd, Chicago, IL 60601'), + (1004, 'Emily Rogers', 'emily.rogers@yahoo.com', '555-321-0987', 89.99, '2024-01-18', '321 Elm St, Houston, TX 77001'), + (1005, 'David Wilson', 'dwilson@email.net', '555-654-3210', 449.75, '2024-01-19', '654 Cedar Blvd, Phoenix, AZ 85001'); +``` + +Create a view called `masked_orders`: + +```sql +CREATE VIEW masked_orders AS +SELECT + user_id, + replaceRegexpOne(name, '^([A-Za-z]+)\\s+(.*)$', '\\1 ****') AS name, + replaceRegexpOne(email, '^(.{2})[^@]*(@.*)$', '\\1****\\2') AS email, + replaceRegexpOne(phone, '^(\\d{3})-(\\d{3})-(\\d{4})$', '\\1-***-\\3') AS phone, + total_amount, + order_date, + replaceRegexpOne(shipping_address, '^[^,]+,\\s*(.*)$', '*** \\1') AS shipping_address +FROM orders; +``` + +In the `SELECT` clause of the view, transformations on the `name`, `email`, `phone` and `shipping_address` +fields are defined in order to partially mask the data. + +Select the data from the view: + +```sql +SELECT * FROM masked_orders +``` + +```response +┌─user_id─┬─name─────────┬─email──────────────┬─phone────────┬─total_amount─┬─order_date─┬─shipping_address──────────┐ +│ 1001 │ John **** │ jo****@gmail.com │ 555-***-4567 │ 299.99 │ 2024-01-15 │ *** New York, NY 10001 │ +│ 1002 │ Sarah **** │ sa****@outlook.com │ 555-***-6543 │ 149.5 │ 2024-01-16 │ *** Los Angeles, CA 90210 │ +│ 1003 │ Michael **** │ mb****@company.com │ 555-***-7890 │ 599 │ 2024-01-17 │ *** Chicago, IL 60601 │ +│ 1004 │ Emily **** │ em****@yahoo.com │ 555-***-0987 │ 89.99 │ 2024-01-18 │ *** Houston, TX 77001 │ +│ 1005 │ David **** │ dw****@email.net │ 555-***-3210 │ 449.75 │ 2024-01-19 │ *** Phoenix, AZ 85001 │ +└─────────┴──────────────┴────────────────────┴──────────────┴──────────────┴────────────┴───────────────────────────┘ +``` + +The data which is returned is masked, hiding sensitive information. +You can also create multiple views, with differing levels of obfuscation depending +on the level of privilege of the viewer. + +To ensure that users are only able to access the view returning the masked data, +you can use ClickHouse's [Role Based Access Control](/cloud/security/cloud-access-management/overview) +to ensure that the view is tied to a specific role. + +First create the role: + +```sql +CREATE ROLE masked_orders_viewer; +``` + +Grant `SELECT` privileges on the view to the role: + +```sql +GRANT SELECT ON masked_orders TO masked_orders_viewer; +``` + +Because ClickHouse roles are additive, you must ensure that users who should only see the masked view do not have any SELECT privilege on the base table via any role. + +As such, you should explicitly revoke base-table access to be safe: + +```sql +REVOKE SELECT ON orders FROM masked_orders_viewer; +``` + +Finally, assign the role to the appropriate users: + +```sql +GRANT masked_orders_viewer TO your_user; +``` + +This ensures that users with the `masked_orders_viewer` role are only able to see +the masked data from the view and not the original unmasked data from the table. + +## Use query masking rules for log data {#use-query-masking-rules} + +For users of ClickHouse OSS wishing to mask log data specifically, you can make use of query masking rules (log masking) +to mask data. + +To do so you can define regular expression-based masking rules in the server configuration. +These rules are applied to queries and all log messages before they are stored in server logs or system tables +(such as `system.query_log`, `system.text_log`, and `system.processes`). + +This helps prevent sensitive data from leaking into logs, but does not mask data in query results. + +For example, to mask a social security number, you could add the following rule to your server configuration: + +```yaml + + + hide SSN + (^|\D)\d{3}-\d{2}-\d{4}($|\D) + 000-00-0000 + + +``` From 1cab27e57e23a481adf3a1ab9f5a665a6d6c299e Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Wed, 17 Sep 2025 20:44:10 +0200 Subject: [PATCH 2/9] make generic to ClickHouse --- docs/cloud/guides/security/data_masking.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/cloud/guides/security/data_masking.md b/docs/cloud/guides/security/data_masking.md index 27372012edc..0da6bcfa0a4 100644 --- a/docs/cloud/guides/security/data_masking.md +++ b/docs/cloud/guides/security/data_masking.md @@ -1,12 +1,12 @@ --- slug: /cloud/guides/data-masking sidebar_label: 'Data masking' -title: 'Data masking in ClickHouse Cloud' +title: 'Data masking in ClickHouse' description: 'A guide to data masking in ClickHouse Cloud' keywords: ['data masking'] --- -# Data masking in ClickHouse Cloud +# Data masking in ClickHouse Data masking is a technique used for data protection, in which the original data is replaced with a version of the data which maintains its format and structure From 529ce59f9175d03d8435f1ca96cca0694060b3da Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Thu, 18 Sep 2025 10:02:11 +0200 Subject: [PATCH 3/9] improvements --- docs/cloud/guides/security/data_masking.md | 56 ++++++++++------------ 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/docs/cloud/guides/security/data_masking.md b/docs/cloud/guides/security/data_masking.md index 0da6bcfa0a4..c9693ec013d 100644 --- a/docs/cloud/guides/security/data_masking.md +++ b/docs/cloud/guides/security/data_masking.md @@ -2,28 +2,25 @@ slug: /cloud/guides/data-masking sidebar_label: 'Data masking' title: 'Data masking in ClickHouse' -description: 'A guide to data masking in ClickHouse Cloud' +description: 'A guide to data masking in ClickHouse' keywords: ['data masking'] --- # Data masking in ClickHouse -Data masking is a technique used for data protection, in which the original data -is replaced with a version of the data which maintains its format and structure -while removing any personally identifiable information (PII) or sensitive information. - +Data masking is a technique used for data protection, in which the original data is replaced with a version of the data which maintains its format and structure while removing any personally identifiable information (PII) or sensitive information. This guide shows you how you can mask data in ClickHouse. -## Use String functions {#using-string-functions} +## Use string replacement functions {#using-string-functions} -For basic data masking use cases, the `replace` family of functions can be used: +For basic data masking use cases, the `replace` family of functions offers a convenient way to mask data: -| Function | Description | -|------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------| -| [`replaceOne`](/sql-reference/functions/string-replace-functions#replaceone) | Replaces the first occurrence of a pattern in a haystack string by the provided replacement string. | -| [`replaceAll`](/sql-reference/functions/string-replace-functions#replaceall) | Replaces all occurrences of a pattern in a haystack string by the provided replacement string. | -| [`replaceRegexpOne`](/sql-reference/functions/string-replace-functions#replaceregexpone) | Replaces the first occurrence of a substring matching a regular expression pattern (in re2 syntax) in a haystack by the provided replacement string. | -| [`replaceRegexpAll`](/sql-reference/functions/string-replace-functions#replaceregexpall) | Replaces all occurrences of a substring matching a regular expression pattern (in re2 syntax) in a haystack by the provided replacement string. | +| Function | Description | +|------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------| +| [`replaceOne`](/sql-reference/functions/string-replace-functions#replaceone) | Replaces the first occurrence of a pattern in a haystack string with the provided replacement string. | +| [`replaceAll`](/sql-reference/functions/string-replace-functions#replaceall) | Replaces all occurrences of a pattern in a haystack string with the provided replacement string. | +| [`replaceRegexpOne`](/sql-reference/functions/string-replace-functions#replaceregexpone) | Replaces the first occurrence of a substring matching a regular expression pattern (in re2 syntax) in a haystack with the provided replacement string. | +| [`replaceRegexpAll`](/sql-reference/functions/string-replace-functions#replaceregexpall) | Replaces all occurrences of a substring matching a regular expression pattern (in re2 syntax) in a haystack with the provided replacement string. | For example, you can replace customer names with a placeholder `[CUSTOMER_NAME]` using the `replaceOne` function: @@ -41,7 +38,7 @@ SELECT replaceOne( └───────────────────────────────────────────────────┘ ``` -Or mask a social security number, leaving only the last 4 digits using the `replaceRegexpAll` function: +Or mask a social security number, leaving only the last 4 digits using the `replaceRegexpAll` function. ```sql SELECT replaceRegexpAll( @@ -51,6 +48,8 @@ SELECT replaceRegexpAll( ) AS masked_ssn; ``` +In the query below `\3` is used to substitute the third capture group into the resulting string, which produces: + ```response ┌─masked_ssn───────┐ │ SSN: XXX-XX-6789 │ @@ -60,14 +59,14 @@ SELECT replaceRegexpAll( ## Create masked `VIEW`s {#masked-views} A [`VIEW`](/sql-reference/statements/create/view) can be used in conjunction with -the aforementioned functions to apply transformations to columns containing sensitive data, before they are presented to the user. -In this way, the original data remains unchanged, and users querying the view see the masked data. +the aforementioned string functions to apply transformations to columns containing sensitive data, before they are presented to the user. +In this way, the original data remains unchanged, and users querying the view see only the masked data. To demonstrate, let's imagine that we have a table which stores records of customer orders. -We want to make sure that certain employees can view the information without exposing -personal data of the customers. +We want to make sure that a group of employees can view the information, but we don't want +them to see the full information of the customers. -First, create the following table for the data, and insert some rows into it: +Run the query below to create an example table `orders` and insert some fictional customer order recordords into it: ```sql CREATE TABLE orders ( @@ -97,7 +96,7 @@ CREATE VIEW masked_orders AS SELECT user_id, replaceRegexpOne(name, '^([A-Za-z]+)\\s+(.*)$', '\\1 ****') AS name, - replaceRegexpOne(email, '^(.{2})[^@]*(@.*)$', '\\1****\\2') AS email, + replaceRegexpOne(email, '^(.{0})[^@]*(@.*)$', '\\1****\\2') AS email, replaceRegexpOne(phone, '^(\\d{3})-(\\d{3})-(\\d{4})$', '\\1-***-\\3') AS phone, total_amount, order_date, @@ -105,8 +104,8 @@ SELECT FROM orders; ``` -In the `SELECT` clause of the view, transformations on the `name`, `email`, `phone` and `shipping_address` -fields are defined in order to partially mask the data. +In the `SELECT` clause of the view creation query above, we define transformations using the `replaceRegexpOne` on the `name`, `email`, `phone` and `shipping_address` +fields, which are the fields containing sensitive information that we wish to partially mask. Select the data from the view: @@ -124,13 +123,10 @@ SELECT * FROM masked_orders └─────────┴──────────────┴────────────────────┴──────────────┴──────────────┴────────────┴───────────────────────────┘ ``` -The data which is returned is masked, hiding sensitive information. -You can also create multiple views, with differing levels of obfuscation depending -on the level of privilege of the viewer. +Notice that the data returned from the view is partially masked, obfuscating the sensitive information. +You can also create multiple views, with differing levels of obfuscation depending on the level of privileged access to information the viewer has. -To ensure that users are only able to access the view returning the masked data, -you can use ClickHouse's [Role Based Access Control](/cloud/security/cloud-access-management/overview) -to ensure that the view is tied to a specific role. +To ensure that users are only able to access the view returning the masked data, and not the table with the original unmasked data, you should use [Role Based Access Control](/cloud/security/cloud-access-management/overview) to ensure that specific roles only have grants to select from the view. First create the role: @@ -138,13 +134,13 @@ First create the role: CREATE ROLE masked_orders_viewer; ``` -Grant `SELECT` privileges on the view to the role: +Next grant `SELECT` privileges on the view to the role: ```sql GRANT SELECT ON masked_orders TO masked_orders_viewer; ``` -Because ClickHouse roles are additive, you must ensure that users who should only see the masked view do not have any SELECT privilege on the base table via any role. +Because ClickHouse roles are additive, you must ensure that users who should only see the masked view do not have any `SELECT` privilege on the base table via any role. As such, you should explicitly revoke base-table access to be safe: From 7e6f83ffa9ecd9f952d3918eac2b90bdbe68fe3b Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Thu, 18 Sep 2025 10:05:44 +0200 Subject: [PATCH 4/9] add a link to query masking rules section --- docs/cloud/guides/security/data_masking.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/cloud/guides/security/data_masking.md b/docs/cloud/guides/security/data_masking.md index c9693ec013d..164a3c4f142 100644 --- a/docs/cloud/guides/security/data_masking.md +++ b/docs/cloud/guides/security/data_masking.md @@ -159,14 +159,14 @@ the masked data from the view and not the original unmasked data from the table. ## Use query masking rules for log data {#use-query-masking-rules} -For users of ClickHouse OSS wishing to mask log data specifically, you can make use of query masking rules (log masking) +For users of ClickHouse OSS wishing to mask log data specifically, you can make use of [query masking rules](/operations/server-configuration-parameters/settings#query_masking_rules) (log masking) to mask data. -To do so you can define regular expression-based masking rules in the server configuration. +To do so, you can define regular expression-based masking rules in the server configuration. These rules are applied to queries and all log messages before they are stored in server logs or system tables (such as `system.query_log`, `system.text_log`, and `system.processes`). -This helps prevent sensitive data from leaking into logs, but does not mask data in query results. +This helps prevent sensitive data from leaking into **logs** only. Note that it does not mask data in query results. For example, to mask a social security number, you could add the following rule to your server configuration: From 245f9baf6623b52af385cf137ea312073bf266a7 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Thu, 18 Sep 2025 10:07:18 +0200 Subject: [PATCH 5/9] fixes --- docs/cloud/guides/security/data_masking.md | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/docs/cloud/guides/security/data_masking.md b/docs/cloud/guides/security/data_masking.md index 164a3c4f142..c63f5160d5c 100644 --- a/docs/cloud/guides/security/data_masking.md +++ b/docs/cloud/guides/security/data_masking.md @@ -58,13 +58,11 @@ In the query below `\3` is used to substitute the third capture group into the r ## Create masked `VIEW`s {#masked-views} -A [`VIEW`](/sql-reference/statements/create/view) can be used in conjunction with -the aforementioned string functions to apply transformations to columns containing sensitive data, before they are presented to the user. +A [`VIEW`](/sql-reference/statements/create/view) can be used in conjunction with the aforementioned string functions to apply transformations to columns containing sensitive data, before they are presented to the user. In this way, the original data remains unchanged, and users querying the view see only the masked data. To demonstrate, let's imagine that we have a table which stores records of customer orders. -We want to make sure that a group of employees can view the information, but we don't want -them to see the full information of the customers. +We want to make sure that a group of employees can view the information, but we don't want them to see the full information of the customers. Run the query below to create an example table `orders` and insert some fictional customer order recordords into it: @@ -104,8 +102,7 @@ SELECT FROM orders; ``` -In the `SELECT` clause of the view creation query above, we define transformations using the `replaceRegexpOne` on the `name`, `email`, `phone` and `shipping_address` -fields, which are the fields containing sensitive information that we wish to partially mask. +In the `SELECT` clause of the view creation query above, we define transformations using the `replaceRegexpOne` on the `name`, `email`, `phone` and `shipping_address` fields, which are the fields containing sensitive information that we wish to partially mask. Select the data from the view: @@ -159,14 +156,13 @@ the masked data from the view and not the original unmasked data from the table. ## Use query masking rules for log data {#use-query-masking-rules} -For users of ClickHouse OSS wishing to mask log data specifically, you can make use of [query masking rules](/operations/server-configuration-parameters/settings#query_masking_rules) (log masking) -to mask data. +For users of ClickHouse OSS wishing to mask log data specifically, you can make use of [query masking rules](/operations/server-configuration-parameters/settings#query_masking_rules) (log masking) to mask data. To do so, you can define regular expression-based masking rules in the server configuration. -These rules are applied to queries and all log messages before they are stored in server logs or system tables -(such as `system.query_log`, `system.text_log`, and `system.processes`). +These rules are applied to queries and all log messages before they are stored in server logs or system tables (such as `system.query_log`, `system.text_log`, and `system.processes`). -This helps prevent sensitive data from leaking into **logs** only. Note that it does not mask data in query results. +This helps prevent sensitive data from leaking into **logs** only. +Note that it does not mask data in query results. For example, to mask a social security number, you could add the following rule to your server configuration: From 5035c72b47396c169eb05e93e59ae5d91a76eca7 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Thu, 18 Sep 2025 10:10:01 +0200 Subject: [PATCH 6/9] fix spelling --- docs/cloud/guides/security/data_masking.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/cloud/guides/security/data_masking.md b/docs/cloud/guides/security/data_masking.md index c63f5160d5c..369b617ccfe 100644 --- a/docs/cloud/guides/security/data_masking.md +++ b/docs/cloud/guides/security/data_masking.md @@ -64,7 +64,7 @@ In this way, the original data remains unchanged, and users querying the view se To demonstrate, let's imagine that we have a table which stores records of customer orders. We want to make sure that a group of employees can view the information, but we don't want them to see the full information of the customers. -Run the query below to create an example table `orders` and insert some fictional customer order recordords into it: +Run the query below to create an example table `orders` and insert some fictional customer order records into it: ```sql CREATE TABLE orders ( From 3185c90e1cc14c6d40ee2e621cd3551ba496b32d Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Thu, 18 Sep 2025 10:28:47 +0200 Subject: [PATCH 7/9] Incorporate review feedback --- docs/cloud/guides/security/data_masking.md | 27 ++++++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/docs/cloud/guides/security/data_masking.md b/docs/cloud/guides/security/data_masking.md index 369b617ccfe..fa59226ca2a 100644 --- a/docs/cloud/guides/security/data_masking.md +++ b/docs/cloud/guides/security/data_masking.md @@ -22,7 +22,7 @@ For basic data masking use cases, the `replace` family of functions offers a con | [`replaceRegexpOne`](/sql-reference/functions/string-replace-functions#replaceregexpone) | Replaces the first occurrence of a substring matching a regular expression pattern (in re2 syntax) in a haystack with the provided replacement string. | | [`replaceRegexpAll`](/sql-reference/functions/string-replace-functions#replaceregexpall) | Replaces all occurrences of a substring matching a regular expression pattern (in re2 syntax) in a haystack with the provided replacement string. | -For example, you can replace customer names with a placeholder `[CUSTOMER_NAME]` using the `replaceOne` function: +For example, you can replace the name "John Smith" with a placeholder `[CUSTOMER_NAME]` using the `replaceOne` function: ```sql SELECT replaceOne( @@ -38,7 +38,24 @@ SELECT replaceOne( └───────────────────────────────────────────────────┘ ``` -Or mask a social security number, leaving only the last 4 digits using the `replaceRegexpAll` function. +More generically, you can use the `replaceRegexpOne` to replace any customer name: + +```sql +SELECT + replaceRegexpAll( + 'Customer John Smith called. Later, Mary Johnson and Bob Wilson also called.', + '\\b[A-Z][a-z]+ [A-Z][a-z]+\\b', + '[CUSTOMER_NAME]' + ) AS anonymized_text; +``` + +```response +┌─anonymized_text───────────────────────────────────────────────────────────────────────┐ +│ [CUSTOMER_NAME] Smith called. Later, [CUSTOMER_NAME] and [CUSTOMER_NAME] also called. │ +└───────────────────────────────────────────────────────────────────────────────────────┘ +``` + +Or you could mask a social security number, leaving only the last 4 digits using the `replaceRegexpAll` function. ```sql SELECT replaceRegexpAll( @@ -48,7 +65,7 @@ SELECT replaceRegexpAll( ) AS masked_ssn; ``` -In the query below `\3` is used to substitute the third capture group into the resulting string, which produces: +In the query above `\3` is used to substitute the third capture group into the resulting string, which produces: ```response ┌─masked_ssn───────┐ @@ -164,9 +181,9 @@ These rules are applied to queries and all log messages before they are stored i This helps prevent sensitive data from leaking into **logs** only. Note that it does not mask data in query results. -For example, to mask a social security number, you could add the following rule to your server configuration: +For example, to mask a social security number, you could add the following rule to your [server configuration](/operations/configuration-files): -```yaml +```yaml title="" hide SSN From ca3f9feda03995d40d14fcde970ce22fac7a945d Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Thu, 18 Sep 2025 11:43:25 +0200 Subject: [PATCH 8/9] add a section on using materialized columns for data masking --- docs/cloud/guides/security/data_masking.md | 88 ++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/docs/cloud/guides/security/data_masking.md b/docs/cloud/guides/security/data_masking.md index fa59226ca2a..517a01b4d1a 100644 --- a/docs/cloud/guides/security/data_masking.md +++ b/docs/cloud/guides/security/data_masking.md @@ -171,6 +171,94 @@ GRANT masked_orders_viewer TO your_user; This ensures that users with the `masked_orders_viewer` role are only able to see the masked data from the view and not the original unmasked data from the table. +## Use `MATERIALIZED` columns and column-level access restrictions {#materialized-ephemeral-column-restrictions} + +In cases where you don't want to create a separate view, you can store masked versions of your data alongside the original data. +To do so, you can use [materialized columns](/sql-reference/statements/create/table#materialized). +Values of such columns are automatically calculated according to the specified materialized expression when rows are inserted, +and we can use them to create new columns with masked versions of the data. + +Taking the example before, instead of creating a separate `VIEW` for the masked data, we'll now create masked columns using `MATERIALIZED`: + +```sql +CREATE TABLE orders ( + user_id UInt32, + name String, + name_masked String MATERIALIZED replaceRegexpOne(name, '^([A-Za-z]+)\\s+(.*)$', '\\1 ****'), + email String, + email_masked String MATERIALIZED replaceRegexpOne(email, '^(.{0})[^@]*(@.*)$', '\\1****\\2'), + phone String, + phone_masked String MATERIALIZED replaceRegexpOne(phone, '^(\\d{3})-(\\d{3})-(\\d{4})$', '\\1-***-\\3'), + total_amount Decimal(10,2), + order_date Date, + shipping_address String, + shipping_address_masked String MATERIALIZED replaceRegexpOne(shipping_address, '^[^,]+,\\s*(.*)$', '*** \\1') +) +ENGINE = MergeTree() +ORDER BY user_id; + +INSERT INTO orders VALUES + (1001, 'John Smith', 'john.smith@gmail.com', '555-123-4567', 299.99, '2024-01-15', '123 Main St, New York, NY 10001'), + (1002, 'Sarah Johnson', 'sarah.johnson@outlook.com', '555-987-6543', 149.50, '2024-01-16', '456 Oak Ave, Los Angeles, CA 90210'), + (1003, 'Michael Brown', 'mbrown@company.com', '555-456-7890', 599.00, '2024-01-17', '789 Pine Rd, Chicago, IL 60601'), + (1004, 'Emily Rogers', 'emily.rogers@yahoo.com', '555-321-0987', 89.99, '2024-01-18', '321 Elm St, Houston, TX 77001'), + (1005, 'David Wilson', 'dwilson@email.net', '555-654-3210', 449.75, '2024-01-19', '654 Cedar Blvd, Phoenix, AZ 85001'); +``` + +If you now run the following select query, you will see that the masked data is 'materialized' at insert time and stored alongside the original, unmasked data. +It is necessary to explicitly select the masked columns as ClickHouse doesn't automatically include materialized columns in `SELECT *` queries by default. + +```sql +SELECT + *, + name_masked, + email_masked, + phone_masked, + shipping_address_masked +FROM orders +ORDER BY user_id ASC +``` + +```response + ┌─user_id─┬─name──────────┬─email─────────────────────┬─phone────────┬─total_amount─┬─order_date─┬─shipping_address───────────────────┬─name_masked──┬─email_masked───────┬─phone_masked─┬─shipping_address_masked────┐ +1. │ 1001 │ John Smith │ john.smith@gmail.com │ 555-123-4567 │ 299.99 │ 2024-01-15 │ 123 Main St, New York, NY 10001 │ John **** │ jo****@gmail.com │ 555-***-4567 │ **** New York, NY 10001 │ +2. │ 1002 │ Sarah Johnson │ sarah.johnson@outlook.com │ 555-987-6543 │ 149.5 │ 2024-01-16 │ 456 Oak Ave, Los Angeles, CA 90210 │ Sarah **** │ sa****@outlook.com │ 555-***-6543 │ **** Los Angeles, CA 90210 │ +3. │ 1003 │ Michael Brown │ mbrown@company.com │ 555-456-7890 │ 599 │ 2024-01-17 │ 789 Pine Rd, Chicago, IL 60601 │ Michael **** │ mb****@company.com │ 555-***-7890 │ **** Chicago, IL 60601 │ +4. │ 1004 │ Emily Rogers │ emily.rogers@yahoo.com │ 555-321-0987 │ 89.99 │ 2024-01-18 │ 321 Elm St, Houston, TX 77001 │ Emily **** │ em****@yahoo.com │ 555-***-0987 │ **** Houston, TX 77001 │ +5. │ 1005 │ David Wilson │ dwilson@email.net │ 555-654-3210 │ 449.75 │ 2024-01-19 │ 654 Cedar Blvd, Phoenix, AZ 85001 │ David **** │ dw****@email.net │ 555-***-3210 │ **** Phoenix, AZ 85001 │ + └─────────┴───────────────┴───────────────────────────┴──────────────┴──────────────┴────────────┴────────────────────────────────────┴──────────────┴────────────────────┴──────────────┴────────────────────────────┘ +``` + +To ensure that users are only able to access columns containing the masked data, you can again use [Role Based Access Control](/cloud/security/cloud-access-management/overview) to ensure that specific roles only have grants to select on masked columns from `orders`. + +Recreate the role that we made previously: + +```sql +DROP ROLE IF EXISTS masked_order_viewer; +CREATE ROLE masked_order_viewer; +``` + +Next, grant `SELECT` permission to the `orders` table: + +```sql +GRANT SELECT ON orders TO masked_data_reader; +``` + +Revoke access to any sensitive columns: + +```sql +REVOKE SELECT(name) ON orders FROM masked_data_reader; +REVOKE SELECT(email) ON orders FROM masked_data_reader; +REVOKE SELECT(phone) ON orders FROM masked_data_reader; +REVOKE SELECT(shipping_address) ON orders FROM masked_data_reader; +``` + +Finally, assign the role to the appropriate users: + +```sql +GRANT masked_orders_viewer TO your_user; +``` + ## Use query masking rules for log data {#use-query-masking-rules} For users of ClickHouse OSS wishing to mask log data specifically, you can make use of [query masking rules](/operations/server-configuration-parameters/settings#query_masking_rules) (log masking) to mask data. From 287cb658f0ffbdc430e087c6ac7dbafbb1de5f2c Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Thu, 18 Sep 2025 11:53:10 +0200 Subject: [PATCH 9/9] add a section on using EPHEMERAL --- docs/cloud/guides/security/data_masking.md | 76 ++++++++++++++++++---- 1 file changed, 65 insertions(+), 11 deletions(-) diff --git a/docs/cloud/guides/security/data_masking.md b/docs/cloud/guides/security/data_masking.md index 517a01b4d1a..e22e4d72edb 100644 --- a/docs/cloud/guides/security/data_masking.md +++ b/docs/cloud/guides/security/data_masking.md @@ -24,7 +24,7 @@ For basic data masking use cases, the `replace` family of functions offers a con For example, you can replace the name "John Smith" with a placeholder `[CUSTOMER_NAME]` using the `replaceOne` function: -```sql +```sql title="Query" SELECT replaceOne( 'Customer John Smith called about his account', 'John Smith', @@ -32,7 +32,7 @@ SELECT replaceOne( ) AS anonymized_text; ``` -```response +```response title="Response" ┌─anonymized_text───────────────────────────────────┐ │ Customer [CUSTOMER_NAME] called about his account │ └───────────────────────────────────────────────────┘ @@ -40,7 +40,7 @@ SELECT replaceOne( More generically, you can use the `replaceRegexpOne` to replace any customer name: -```sql +```sql title="Query" SELECT replaceRegexpAll( 'Customer John Smith called. Later, Mary Johnson and Bob Wilson also called.', @@ -49,7 +49,7 @@ SELECT ) AS anonymized_text; ``` -```response +```response title="Response" ┌─anonymized_text───────────────────────────────────────────────────────────────────────┐ │ [CUSTOMER_NAME] Smith called. Later, [CUSTOMER_NAME] and [CUSTOMER_NAME] also called. │ └───────────────────────────────────────────────────────────────────────────────────────┘ @@ -57,7 +57,7 @@ SELECT Or you could mask a social security number, leaving only the last 4 digits using the `replaceRegexpAll` function. -```sql +```sql title="Query" SELECT replaceRegexpAll( 'SSN: 123-45-6789', '(\d{3})-(\d{2})-(\d{4})', @@ -67,7 +67,7 @@ SELECT replaceRegexpAll( In the query above `\3` is used to substitute the third capture group into the resulting string, which produces: -```response +```response title="Response" ┌─masked_ssn───────┐ │ SSN: XXX-XX-6789 │ └──────────────────┘ @@ -123,11 +123,11 @@ In the `SELECT` clause of the view creation query above, we define transformatio Select the data from the view: -```sql +```sql title="Query" SELECT * FROM masked_orders ``` -```response +```response title="Response" ┌─user_id─┬─name─────────┬─email──────────────┬─phone────────┬─total_amount─┬─order_date─┬─shipping_address──────────┐ │ 1001 │ John **** │ jo****@gmail.com │ 555-***-4567 │ 299.99 │ 2024-01-15 │ *** New York, NY 10001 │ │ 1002 │ Sarah **** │ sa****@outlook.com │ 555-***-6543 │ 149.5 │ 2024-01-16 │ *** Los Angeles, CA 90210 │ @@ -181,6 +181,7 @@ and we can use them to create new columns with masked versions of the data. Taking the example before, instead of creating a separate `VIEW` for the masked data, we'll now create masked columns using `MATERIALIZED`: ```sql +DROP TABLE IF EXISTS orders; CREATE TABLE orders ( user_id UInt32, name String, @@ -208,7 +209,7 @@ INSERT INTO orders VALUES If you now run the following select query, you will see that the masked data is 'materialized' at insert time and stored alongside the original, unmasked data. It is necessary to explicitly select the masked columns as ClickHouse doesn't automatically include materialized columns in `SELECT *` queries by default. -```sql +```sql title="Query" SELECT *, name_masked, @@ -219,7 +220,7 @@ FROM orders ORDER BY user_id ASC ``` -```response +```response title="Response" ┌─user_id─┬─name──────────┬─email─────────────────────┬─phone────────┬─total_amount─┬─order_date─┬─shipping_address───────────────────┬─name_masked──┬─email_masked───────┬─phone_masked─┬─shipping_address_masked────┐ 1. │ 1001 │ John Smith │ john.smith@gmail.com │ 555-123-4567 │ 299.99 │ 2024-01-15 │ 123 Main St, New York, NY 10001 │ John **** │ jo****@gmail.com │ 555-***-4567 │ **** New York, NY 10001 │ 2. │ 1002 │ Sarah Johnson │ sarah.johnson@outlook.com │ 555-987-6543 │ 149.5 │ 2024-01-16 │ 456 Oak Ave, Los Angeles, CA 90210 │ Sarah **** │ sa****@outlook.com │ 555-***-6543 │ **** Los Angeles, CA 90210 │ @@ -259,6 +260,59 @@ Finally, assign the role to the appropriate users: GRANT masked_orders_viewer TO your_user; ``` +In the case where you want to store only the masked data in the `orders` table, +you can mark the sensitive unmasked columns as [`EPHEMERAL`](/sql-reference/statements/create/table#ephemeral), +which will ensure that columns of this type are not stored in the table. + +```sql +DROP TABLE IF EXISTS orders; +CREATE TABLE orders ( + user_id UInt32, + name String EPHEMERAL, + name_masked String MATERIALIZED replaceRegexpOne(name, '^([A-Za-z]+)\\s+(.*)$', '\\1 ****'), + email String EPHEMERAL, + email_masked String MATERIALIZED replaceRegexpOne(email, '^(.{2})[^@]*(@.*)$', '\\1****\\2'), + phone String EPHEMERAL, + phone_masked String MATERIALIZED replaceRegexpOne(phone, '^(\\d{3})-(\\d{3})-(\\d{4})$', '\\1-***-\\3'), + total_amount Decimal(10,2), + order_date Date, + shipping_address String EPHEMERAL, + shipping_address_masked String MATERIALIZED replaceRegexpOne(shipping_address, '^([^,]+),\\s*(.*)$', '*** \\2') +) +ENGINE = MergeTree() +ORDER BY user_id; + +INSERT INTO orders (user_id, name, email, phone, total_amount, order_date, shipping_address) VALUES + (1001, 'John Smith', 'john.smith@gmail.com', '555-123-4567', 299.99, '2024-01-15', '123 Main St, New York, NY 10001'), + (1002, 'Sarah Johnson', 'sarah.johnson@outlook.com', '555-987-6543', 149.50, '2024-01-16', '456 Oak Ave, Los Angeles, CA 90210'), + (1003, 'Michael Brown', 'mbrown@company.com', '555-456-7890', 599.00, '2024-01-17', '789 Pine Rd, Chicago, IL 60601'), + (1004, 'Emily Rogers', 'emily.rogers@yahoo.com', '555-321-0987', 89.99, '2024-01-18', '321 Elm St, Houston, TX 77001'), + (1005, 'David Wilson', 'dwilson@email.net', '555-654-3210', 449.75, '2024-01-19', '654 Cedar Blvd, Phoenix, AZ 85001'); +``` + +If we run the same query as before, you'll now see that only the materialized masked data was inserted into the table: + +```sql title="Query" +SELECT + *, + name_masked, + email_masked, + phone_masked, + shipping_address_masked +FROM orders +ORDER BY user_id ASC +``` + +```response title="Response" + ┌─user_id─┬─total_amount─┬─order_date─┬─name_masked──┬─email_masked───────┬─phone_masked─┬─shipping_address_masked───┐ +1. │ 1001 │ 299.99 │ 2024-01-15 │ John **** │ jo****@gmail.com │ 555-***-4567 │ *** New York, NY 10001 │ +2. │ 1002 │ 149.5 │ 2024-01-16 │ Sarah **** │ sa****@outlook.com │ 555-***-6543 │ *** Los Angeles, CA 90210 │ +3. │ 1003 │ 599 │ 2024-01-17 │ Michael **** │ mb****@company.com │ 555-***-7890 │ *** Chicago, IL 60601 │ +4. │ 1004 │ 89.99 │ 2024-01-18 │ Emily **** │ em****@yahoo.com │ 555-***-0987 │ *** Houston, TX 77001 │ +5. │ 1005 │ 449.75 │ 2024-01-19 │ David **** │ dw****@email.net │ 555-***-3210 │ *** Phoenix, AZ 85001 │ + └─────────┴──────────────┴────────────┴──────────────┴────────────────────┴──────────────┴───────────────────────────┘ +``` + ## Use query masking rules for log data {#use-query-masking-rules} For users of ClickHouse OSS wishing to mask log data specifically, you can make use of [query masking rules](/operations/server-configuration-parameters/settings#query_masking_rules) (log masking) to mask data. @@ -271,7 +325,7 @@ Note that it does not mask data in query results. For example, to mask a social security number, you could add the following rule to your [server configuration](/operations/configuration-files): -```yaml title="" +```yaml hide SSN