From 60789efbc89250f498a283e1fbdd720d03004a3f Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Fri, 19 Sep 2025 12:57:37 +0200 Subject: [PATCH] start generating JSON functions --- docs/best-practices/json_type.md | 2 +- .../01_migration_guides/04_snowflake/02_migration_guide.md | 2 +- docs/getting-started/index.md | 1 + docs/integrations/data-ingestion/data-formats/json/other.md | 2 +- scripts/settings/autogenerate-settings.sh | 3 +++ 5 files changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/best-practices/json_type.md b/docs/best-practices/json_type.md index 22fad8198fb..aa339dace77 100644 --- a/docs/best-practices/json_type.md +++ b/docs/best-practices/json_type.md @@ -224,7 +224,7 @@ ORDER BY doc.update_date We provide a type hint for the `update_date` column in the JSON definition, as we use it in the ordering/primary key. This helps ClickHouse to know that this column won't be null and ensures it knows which `update_date` sub-column to use (there may be multiple for each type, so this is ambiguous otherwise). 
::: -We can insert into this table and view the subsequently inferred schema using the [`JSONAllPathsWithTypes`](/sql-reference/functions/json-functions#jsonallpathswithtypes) function and [`PrettyJSONEachRow`](/interfaces/formats/PrettyJSONEachRow) output format: +We can insert into this table and view the subsequently inferred schema using the [`JSONAllPathsWithTypes`](/sql-reference/functions/json-functions#JSONAllPathsWithTypes) function and [`PrettyJSONEachRow`](/interfaces/formats/PrettyJSONEachRow) output format: ```sql INSERT INTO arxiv FORMAT JSONAsObject diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/02_migration_guide.md b/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/02_migration_guide.md index 468a8b6193b..eb0415fcb18 100644 --- a/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/02_migration_guide.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/02_migration_guide.md @@ -101,7 +101,7 @@ input_format_parquet_case_insensitive_column_matching = 1 -- Column matching bet :::note Note on nested column structures The `VARIANT` and `OBJECT` columns in the original Snowflake table schema will be output as JSON strings by default, forcing us to cast these when inserting them into ClickHouse. -Nested structures such as `some_file` are converted to JSON strings on copy by Snowflake. Importing this data requires us to transform these structures to Tuples at insert time in ClickHouse, using the [JSONExtract function](/sql-reference/functions/json-functions#jsonextract) as shown above. +Nested structures such as `some_file` are converted to JSON strings on copy by Snowflake. Importing this data requires us to transform these structures to Tuples at insert time in ClickHouse, using the [JSONExtract function](/sql-reference/functions/json-functions#JSONExtract) as shown above. 
::: ## Test successful data export {#3-testing-successful-data-export} diff --git a/docs/getting-started/index.md b/docs/getting-started/index.md index 02ff69186f1..560b4275435 100644 --- a/docs/getting-started/index.md +++ b/docs/getting-started/index.md @@ -42,6 +42,7 @@ by https://github.com/ClickHouse/clickhouse-docs/blob/main/scripts/autogenerate- | [Foursquare places](/getting-started/example-datasets/foursquare-places) | Dataset with over 100 million records containing information about places on a map, such as shops, restaurants, parks, playgrounds, and monuments. | | [GitHub Events Dataset](/getting-started/example-datasets/github-events) | Dataset containing all events on GitHub from 2011 to Dec 6 2020, with a size of 3.1 billion records. | | [Hacker News dataset](/getting-started/example-datasets/hacker-news) | Dataset containing 28 million rows of hacker news data. | +| [Hacker News Vector Search dataset](/getting-started/example-datasets/hackernews-vector-search-dataset) | Dataset containing 28+ million Hacker News postings & their vector embeddings | | [LAION 5B dataset](/getting-started/example-datasets/laion-5b-dataset) | Dataset containing 100 million vectors from the LAION 5B dataset | | [Laion-400M dataset](/getting-started/example-datasets/laion-400m-dataset) | Dataset containing 400 million images with English image captions | | [New York Public Library "What's on the Menu?" Dataset](/getting-started/example-datasets/menus) | Dataset containing 1.3 million records of historical data on the menus of hotels, restaurants and cafes with the dishes along with their prices. 
| diff --git a/docs/integrations/data-ingestion/data-formats/json/other.md b/docs/integrations/data-ingestion/data-formats/json/other.md index 95411202514..c481db5f53d 100644 --- a/docs/integrations/data-ingestion/data-formats/json/other.md +++ b/docs/integrations/data-ingestion/data-formats/json/other.md @@ -70,7 +70,7 @@ SELECT JSONExtractString(tags, 'holidays') AS holidays FROM people 1 row in set. Elapsed: 0.002 sec. ``` -Notice how the functions require both a reference to the `String` column `tags` and a path in the JSON to extract. Nested paths require functions to be nested e.g. `JSONExtractUInt(JSONExtractString(tags, 'car'), 'year')` which extracts the column `tags.car.year`. The extraction of nested paths can be simplified through the functions [`JSON_QUERY`](/sql-reference/functions/json-functions#json_query) and [`JSON_VALUE`](/sql-reference/functions/json-functions#json_value). +Notice how the functions require both a reference to the `String` column `tags` and a path in the JSON to extract. Nested paths require functions to be nested e.g. `JSONExtractUInt(JSONExtractString(tags, 'car'), 'year')` which extracts the column `tags.car.year`. The extraction of nested paths can be simplified through the functions [`JSON_QUERY`](/sql-reference/functions/json-functions#JSON_QUERY) and [`JSON_VALUE`](/sql-reference/functions/json-functions#JSON_VALUE). Consider the extreme case with the `arxiv` dataset where we consider the entire body to be a `String`. 
diff --git a/scripts/settings/autogenerate-settings.sh b/scripts/settings/autogenerate-settings.sh index f31890a2319..2d1e3cedf44 100755 --- a/scripts/settings/autogenerate-settings.sh +++ b/scripts/settings/autogenerate-settings.sh @@ -266,6 +266,7 @@ if [ -f "$FUNCTION_SQL_FILE" ]; then "Encryption" "Hash" "Introspection" + "JSON" ) for CATEGORY in "${FUNCTION_CATEGORIES[@]}"; do @@ -376,6 +377,7 @@ insert_src_files=( "encryption-functions.md" "hash-functions.md" "introspection-functions.md" + "json-functions.md" ) insert_dest_files=( @@ -394,6 +396,7 @@ insert_dest_files=( "docs/sql-reference/functions/encryption-functions.md" "docs/sql-reference/functions/hash-functions.md" "docs/sql-reference/functions/introspection.md" + "docs/sql-reference/functions/json-functions.md" ) echo "[$SCRIPT_NAME] Inserting generated markdown content between AUTOGENERATED_START and AUTOGENERATED_END tags"