From 2385431ae7489bec732f11f344c20cdc7803c8d5 Mon Sep 17 00:00:00 2001 From: Michael Stetsyuk Date: Tue, 12 May 2026 19:28:32 +0000 Subject: [PATCH 1/4] Fix dangling references in cached AvroConfluent deserializer `AvroConfluentRowInputFormat::getOrCreateDeserializer` built an `AvroDeserializer` on the stack and copied it into `deserializer_cache`. The deserializer holds `symbolic_skip_fn_map`, and skip lambdas in its action tree capture references into that map. Copying deep-copies the map to a new address while the lambdas still reference the original; once the stack local is destroyed those references dangle, and the next row deserialization invokes a `std::function` over freed memory. Move into the cache instead of copying. `std::map` move-construction transfers existing nodes without relocating them, so the captured references remain valid. --- src/Processors/Formats/Impl/AvroRowInputFormat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index c58ccde70b7f..f884a730c2da 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -1327,7 +1327,7 @@ const AvroDeserializer & AvroConfluentRowInputFormat::getOrCreateDeserializer(Sc auto schema = schema_registry->getSchema(schema_id); AvroDeserializer deserializer( output.getHeader(), schema, format_settings.avro.allow_missing_fields, format_settings.null_as_default, format_settings); - it = deserializer_cache.emplace(schema_id, deserializer).first; + it = deserializer_cache.emplace(schema_id, std::move(deserializer)).first; } return it->second; } From b409e4943393493da11ad85ceb853900e1988e1d Mon Sep 17 00:00:00 2001 From: Michael Stetsyuk Date: Tue, 12 May 2026 19:41:16 +0000 Subject: [PATCH 2/4] Delete copy operations on AvroDeserializer Prevent reintroduction of the use-after-free fixed in the previous commit. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/Processors/Formats/Impl/AvroRowInputFormat.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.h b/src/Processors/Formats/Impl/AvroRowInputFormat.h index dc26ee35ffd5..e9b2a4b625d7 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.h +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.h @@ -52,6 +52,12 @@ class AvroDeserializer public: AvroDeserializer(const Block & header, avro::ValidSchema schema, bool allow_missing_fields, bool null_as_default_, const FormatSettings & settings_); AvroDeserializer(DataTypePtr data_type, const std::string & column_name, avro::ValidSchema schema, bool allow_missing_fields, bool null_as_default_, const FormatSettings & settings_); + + AvroDeserializer(const AvroDeserializer &) = delete; + AvroDeserializer & operator=(const AvroDeserializer &) = delete; + AvroDeserializer(AvroDeserializer &&) = default; + AvroDeserializer & operator=(AvroDeserializer &&) = delete; + void deserializeRow(MutableColumns & columns, avro::Decoder & decoder, RowReadExtension & ext) const; using DeserializeFn = std::function; From 6a40d3a559571882d0aa2c178e3649874eb371d1 Mon Sep 17 00:00:00 2001 From: Michael Stetsyuk Date: Wed, 13 May 2026 05:50:15 +0000 Subject: [PATCH 3/4] add regression test --- .../test_format_avro_confluent/test.py | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/tests/integration/test_format_avro_confluent/test.py b/tests/integration/test_format_avro_confluent/test.py index 18f763ed974d..f19a18af9458 100644 --- a/tests/integration/test_format_avro_confluent/test.py +++ b/tests/integration/test_format_avro_confluent/test.py @@ -81,6 +81,56 @@ def test_select(started_cluster): ] +def test_select_skip_symbolic(started_cluster): + # type: (ClickHouseCluster) -> None + + reg_url = "http://localhost:{}".format(started_cluster.schema_registry_port) + schema_registry_client = CachedSchemaRegistryClient({"url": reg_url}) + serializer = MessageSerializer(schema_registry_client) + + schema = avro.schema.make_avsc_object( + { + "name": "Outer", + "type": "record", + "fields": [ + {"name": "value", "type": "long"}, + { + "name": "a", + "type": { + "type": "record", + "name": "Inner", + "fields": [{"name": "x", "type": "int"}], + }, + }, + {"name": "b", "type": "Inner"}, + ], + } + ) + + data = serializer.encode_record_with_schema( + "test_subject_skip_symbolic", + schema, + {"value": 0, "a": {"x": 0}, "b": {"x": 0}}, + ) + + instance = started_cluster.instances["dummy"] # type: ClickHouseInstance + schema_registry_url = "http://{}:{}".format( + started_cluster.schema_registry_host, started_cluster.schema_registry_port + ) + settings = {"format_avro_schema_registry_url": schema_registry_url} + run_query( + instance, + "create table avro_data_skip_symbolic(value Int64) engine = Memory()", + ) + run_query( + instance, + "insert into avro_data_skip_symbolic format AvroConfluent", + data, + settings, + ) + assert run_query(instance, "select value from avro_data_skip_symbolic").strip() == "0" + + def test_select_auth(started_cluster): # type: (ClickHouseCluster) -> None From 5448d84d55a45f108462b4167308c9dd01a89901 Mon Sep 17 00:00:00 2001 From: Michael Stetsyuk Date: Wed, 13 May 2026 06:25:24 +0000 Subject: [PATCH 4/4] Fix regression test to actually trigger the use-after-free The previous schema's symbolic reference sat at the top level of a skipped field, so createAction resolved it before createSkipFn was called, never building the reference-capturing lambda. A recursive Node schema with `next: ["null", "Node"]` puts the symbolic node inside a union branch, which is one of the spots createSkipFn recurses into on its own. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../test_format_avro_confluent/test.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/tests/integration/test_format_avro_confluent/test.py b/tests/integration/test_format_avro_confluent/test.py index f19a18af9458..33eb1c103b73 100644 --- a/tests/integration/test_format_avro_confluent/test.py +++ b/tests/integration/test_format_avro_confluent/test.py @@ -90,27 +90,18 @@ def test_select_skip_symbolic(started_cluster): schema = avro.schema.make_avsc_object( { - "name": "Outer", + "name": "Node", "type": "record", "fields": [ {"name": "value", "type": "long"}, - { - "name": "a", - "type": { - "type": "record", - "name": "Inner", - "fields": [{"name": "x", "type": "int"}], - }, - }, - {"name": "b", "type": "Inner"}, + {"name": "next", "type": ["null", "Node"]}, ], } ) + record = {"value": 0, "next": {"value": 1, "next": {"value": 2, "next": None}}} data = serializer.encode_record_with_schema( - "test_subject_skip_symbolic", - schema, - {"value": 0, "a": {"x": 0}, "b": {"x": 0}}, + "test_subject_skip_symbolic", schema, record ) instance = started_cluster.instances["dummy"] # type: ClickHouseInstance