From 7443dc925cb52eb19a6b42116e3acc3d8336a942 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 17 May 2023 13:12:00 +0000 Subject: [PATCH 1/2] Fix possible Logical error on bad Nullable parsing for text formats --- .../Serializations/SerializationNullable.cpp | 8 ++++++++ ...51_text_formats_bad_nullable_parsing.reference | 3 +++ .../02751_text_formats_bad_nullable_parsing.sh | 15 +++++++++++++++ 3 files changed, 26 insertions(+) create mode 100644 tests/queries/0_stateless/02751_text_formats_bad_nullable_parsing.reference create mode 100755 tests/queries/0_stateless/02751_text_formats_bad_nullable_parsing.sh diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index 20188f7cec50..6e4402740d95 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -355,6 +355,9 @@ ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & col /// It can happen only if there is a string instead of a number /// or if someone uses tab or LF in TSV null_representation. /// In the first case we cannot continue reading anyway. The second case seems to be unlikely. + /// We also should delete incorrectly deserialized value from nested column. + nested_column.popBack(1); + if (null_representation.find('\t') != std::string::npos || null_representation.find('\n') != std::string::npos) throw DB::ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "TSV custom null representation " "containing '\\t' or '\\n' may not work correctly for large input."); @@ -447,6 +450,8 @@ ReturnType SerializationNullable::deserializeTextQuotedImpl(IColumn & column, Re /// We have some unread data in PeekableReadBuffer own memory. /// It can happen only if there is an unquoted string instead of a number. + /// We also should delete incorrectly deserialized value from nested column. + nested_column.popBack(1); throw DB::ParsingException( ErrorCodes::CANNOT_READ_ALL_DATA, "Error while parsing Nullable: got an unquoted string {} instead of a number", @@ -579,6 +584,9 @@ ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadB /// It can happen only if there is an unquoted string instead of a number /// or if someone uses csv delimiter, LF or CR in CSV null representation. /// In the first case we cannot continue reading anyway. The second case seems to be unlikely. + /// We also should delete incorrectly deserialized value from nested column. + nested_column.popBack(1); + if (null_representation.find(settings.csv.delimiter) != std::string::npos || null_representation.find('\r') != std::string::npos || null_representation.find('\n') != std::string::npos) throw DB::ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "CSV custom null representation containing " diff --git a/tests/queries/0_stateless/02751_text_formats_bad_nullable_parsing.reference b/tests/queries/0_stateless/02751_text_formats_bad_nullable_parsing.reference new file mode 100644 index 000000000000..65e15e19c8b6 --- /dev/null +++ b/tests/queries/0_stateless/02751_text_formats_bad_nullable_parsing.reference @@ -0,0 +1,3 @@ +1 \N +1 \N +1 \N diff --git a/tests/queries/0_stateless/02751_text_formats_bad_nullable_parsing.sh b/tests/queries/0_stateless/02751_text_formats_bad_nullable_parsing.sh new file mode 100755 index 000000000000..e51079071ec2 --- /dev/null +++ b/tests/queries/0_stateless/02751_text_formats_bad_nullable_parsing.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +# NOTE: this sh wrapper is required because of shell_config + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "drop table if exists test" +$CLICKHOUSE_CLIENT -q "create table test (x UInt32, y Nullable(UInt32)) engine=MergeTree order by x" +$CLICKHOUSE_CLIENT -q "select '1\t\\\N\n2\t\\\' format RawBLOB" | $CLICKHOUSE_CLIENT -q "insert into test settings input_format_allow_errors_num=1 format TSV" +$CLICKHOUSE_CLIENT -q "select '1,\\\N\n2,\\\' format RawBLOB" | $CLICKHOUSE_CLIENT -q "insert into test settings input_format_allow_errors_num=1 format CSV" +$CLICKHOUSE_CLIENT -q "select '1\tNULL\n2\tN' format RawBLOB" | $CLICKHOUSE_CLIENT -q "insert into test settings input_format_allow_errors_num=2, format_custom_escaping_rule='Quoted' format CustomSeparated" +$CLICKHOUSE_CLIENT -q "select * from test" +$CLICKHOUSE_CLIENT -q "drop table test"; From bf19765c9bd69d78bf8d6113563e0a1f48339e54 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 22 May 2023 19:34:19 +0000 Subject: [PATCH 2/2] Fix possible use-of-uninitialized-value --- src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp index 6f73ede5d4d2..1c2efe3a41d7 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp @@ -44,7 +44,7 @@ CustomSeparatedRowInputFormat::CustomSeparatedRowInputFormat( format_settings_, std::make_unique(*buf_, ignore_spaces_, format_settings_), format_settings_.custom.try_detect_header) - , buf(std::move(buf_)) + , buf(std::move(buf_)), ignore_spaces(ignore_spaces_) { /// In case of CustomSeparatedWithNames(AndTypes) formats and enabled setting input_format_with_names_use_header we don't know /// the exact number of columns in data (because it can contain unknown columns). So, if field_delimiter and row_after_delimiter are