Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Backport #43038 to 22.11: Fix regression on SingleValueDataString #43488

Merged
merged 2 commits into from
Nov 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/AggregateFunctions/AggregateFunctionArgMinMax.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ struct Settings;
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int CORRUPTED_DATA;
}


Expand Down Expand Up @@ -89,6 +90,13 @@ class AggregateFunctionArgMinMax final : public IAggregateFunctionDataHelper<Dat
{
this->data(place).result.read(buf, *serialization_res, arena);
this->data(place).value.read(buf, *serialization_val, arena);
if (unlikely(this->data(place).value.has() != this->data(place).result.has()))
throw Exception(
ErrorCodes::CORRUPTED_DATA,
"Invalid state of the aggregate function {}: has_value ({}) != has_result ({})",
getName(),
this->data(place).value.has(),
this->data(place).result.has());
}

bool allocatesMemoryInArena() const override
Expand Down
139 changes: 102 additions & 37 deletions src/AggregateFunctions/AggregateFunctionMinMaxAny.h
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,34 @@ struct SingleValueDataFixed

};

struct Compatibility
{
/// Old versions used to store terminating null-character in SingleValueDataString.
/// Then -WithTerminatingZero methods were removed from IColumn interface,
/// because these methods are quite dangerous and easy to misuse. It introduced incompatibility.
/// See https://github.com/ClickHouse/ClickHouse/pull/41431 and https://github.com/ClickHouse/ClickHouse/issues/42916
/// Here we keep these functions for compatibility.
/// It's safe because there's no way unsanitized user input (without \0 at the end) can reach these functions.

static StringRef getDataAtWithTerminatingZero(const ColumnString & column, size_t n)
{
auto res = column.getDataAt(n);
/// ColumnString always reserves extra byte for null-character after string.
/// But getDataAt returns StringRef without the null-character. Let's add it.
chassert(res.data[res.size] == '\0');
++res.size;
return res;
}

static void insertDataWithTerminatingZero(ColumnString & column, const char * pos, size_t length)
{
/// String already has terminating null-character.
/// But insertData will add another one unconditionally. Trim existing null-character to avoid duplication.
chassert(0 < length);
chassert(pos[length - 1] == '\0');
column.insertData(pos, length - 1);
}
};

/** For strings. Short strings are stored in the object itself, and long strings are allocated separately.
* NOTE It could also be suitable for arrays of numbers.
Expand Down Expand Up @@ -477,20 +505,31 @@ struct SingleValueDataString //-V730
return size >= 0;
}

const char * getData() const
private:
char * getDataMutable()
{
return size <= MAX_SMALL_STRING_SIZE ? small_data : large_data;
}

const char * getData() const
{
const char * data_ptr = size <= MAX_SMALL_STRING_SIZE ? small_data : large_data;
/// It must always be terminated with null-character
chassert(0 < size);
chassert(data_ptr[size - 1] == '\0');
return data_ptr;
}

StringRef getStringRef() const
{
return StringRef(getData(), size);
}

public:
void insertResultInto(IColumn & to) const
{
if (has())
assert_cast<ColumnString &>(to).insertData(getData(), size);
Compatibility::insertDataWithTerminatingZero(assert_cast<ColumnString &>(to), getData(), size);
else
assert_cast<ColumnString &>(to).insertDefault();
}
Expand All @@ -502,44 +541,76 @@ struct SingleValueDataString //-V730
buf.write(getData(), size);
}

void allocateLargeDataIfNeeded(Int64 size_to_reserve, Arena * arena)
{
if (capacity < size_to_reserve)
{
capacity = static_cast<Int32>(roundUpToPowerOfTwoOrZero(size_to_reserve));
/// It might happen if the size was too big and the rounded value does not fit a size_t
if (unlikely(capacity < size_to_reserve))
throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "String size is too big ({})", size_to_reserve);

/// Don't free large_data here.
large_data = arena->alloc(capacity);
}
}

void read(ReadBuffer & buf, const ISerialization & /*serialization*/, Arena * arena)
{
Int32 rhs_size;
readBinary(rhs_size, buf);

if (rhs_size >= 0)
if (rhs_size < 0)
{
if (rhs_size <= MAX_SMALL_STRING_SIZE)
{
/// Don't free large_data here.

size = rhs_size;
/// Don't free large_data here.
size = rhs_size;
return;
}

if (size > 0)
buf.readStrict(small_data, size);
}
else
{
if (capacity < rhs_size)
{
capacity = static_cast<Int32>(roundUpToPowerOfTwoOrZero(rhs_size));
/// It might happen if the size was too big and the rounded value does not fit a size_t
if (unlikely(capacity < rhs_size))
throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "String size is too big ({})", rhs_size);
if (rhs_size <= MAX_SMALL_STRING_SIZE)
{
/// Don't free large_data here.

/// Don't free large_data here.
large_data = arena->alloc(capacity);
}
size = rhs_size;

size = rhs_size;
buf.readStrict(large_data, size);
}
if (size > 0)
buf.readStrict(small_data, size);
}
else
{
/// Don't free large_data here.
/// Reserve one byte more for null-character
Int64 rhs_size_to_reserve = rhs_size;
rhs_size_to_reserve += 1; /// Avoid overflow
allocateLargeDataIfNeeded(rhs_size_to_reserve, arena);
size = rhs_size;
buf.readStrict(large_data, size);
}

/// Check if the string we read is null-terminated (getDataMutable does not have the assertion)
if (0 < size && getDataMutable()[size - 1] == '\0')
return;

/// It's not null-terminated, but it must be (for historical reasons). There are two variants:
/// - The value was serialized by one of the incompatible versions of ClickHouse. We had some range of versions
/// that used to serialize SingleValueDataString without terminating '\0'. Let's just append it.
/// - An attacker sent crafted data. Sanitize it and append '\0'.
/// In all other cases the string must be already null-terminated.

/// NOTE We cannot add '\0' unconditionally, because it will be duplicated.
/// NOTE It's possible that a string that actually ends with '\0' was written by one of the incompatible versions.
/// Unfortunately, we cannot distinguish it from normal string written by normal version.
/// So such strings will be trimmed.

if (size == MAX_SMALL_STRING_SIZE)
{
/// Special case: We have to move value to large_data
allocateLargeDataIfNeeded(size + 1, arena);
memcpy(large_data, small_data, size);
}

/// We have enough space to append
++size;
getDataMutable()[size - 1] = '\0';
}

/// Assuming to.has()
Expand All @@ -557,21 +628,15 @@ struct SingleValueDataString //-V730
}
else
{
if (capacity < value_size)
{
/// Don't free large_data here.
capacity = static_cast<Int32>(roundUpToPowerOfTwoOrZero(value_size));
large_data = arena->alloc(capacity);
}

allocateLargeDataIfNeeded(value_size, arena);
size = value_size;
memcpy(large_data, value.data, size);
}
}

void change(const IColumn & column, size_t row_num, Arena * arena)
{
changeImpl(assert_cast<const ColumnString &>(column).getDataAt(row_num), arena);
changeImpl(Compatibility::getDataAtWithTerminatingZero(assert_cast<const ColumnString &>(column), row_num), arena);
}

void change(const Self & to, Arena * arena)
Expand Down Expand Up @@ -620,7 +685,7 @@ struct SingleValueDataString //-V730

bool changeIfLess(const IColumn & column, size_t row_num, Arena * arena)
{
if (!has() || assert_cast<const ColumnString &>(column).getDataAt(row_num) < getStringRef())
if (!has() || Compatibility::getDataAtWithTerminatingZero(assert_cast<const ColumnString &>(column), row_num) < getStringRef())
{
change(column, row_num, arena);
return true;
Expand All @@ -642,7 +707,7 @@ struct SingleValueDataString //-V730

bool changeIfGreater(const IColumn & column, size_t row_num, Arena * arena)
{
if (!has() || assert_cast<const ColumnString &>(column).getDataAt(row_num) > getStringRef())
if (!has() || Compatibility::getDataAtWithTerminatingZero(assert_cast<const ColumnString &>(column), row_num) > getStringRef())
{
change(column, row_num, arena);
return true;
Expand All @@ -669,7 +734,7 @@ struct SingleValueDataString //-V730

bool isEqualTo(const IColumn & column, size_t row_num) const
{
return has() && assert_cast<const ColumnString &>(column).getDataAt(row_num) == getStringRef();
return has() && Compatibility::getDataAtWithTerminatingZero(assert_cast<const ColumnString &>(column), row_num) == getStringRef();
}

static bool allocatesMemoryInArena()
Expand Down
23 changes: 16 additions & 7 deletions tests/integration/test_backward_compatibility/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,11 @@
backward = cluster.add_instance(
"backward",
image="clickhouse/clickhouse-server",
tag="22.9",
# Note that a bug changed the string representation of several aggregations in 22.9 and 22.10 and some minor
# releases of 22.8, 22.7 and 22.3
# See https://github.com/ClickHouse/ClickHouse/issues/42916
# Affected at least: singleValueOrNull, last_value, min, max, any, anyLast, anyHeavy, first_value, argMin, argMax
tag="22.6",
with_installed_binary=True,
)

Expand Down Expand Up @@ -139,6 +143,9 @@ def test_string_functions(start_cluster):
"substring",
"CAST",
# NOTE: no need to ignore now()/now64() since they will fail because they don't accept any argument
# 22.8 Backward Incompatible Change: Extended range of Date32
"toDate32OrZero",
"toDate32OrDefault",
]
functions = filter(lambda x: x not in excludes, functions)

Expand All @@ -149,14 +156,15 @@ def test_string_functions(start_cluster):
failed = 0
passed = 0

def get_function_value(node, function_name, value="foo"):
def get_function_value(node, function_name, value):
return node.query(f"select {function_name}('{value}')").strip()

v = "foo"
for function in functions:
logging.info("Checking %s", function)
logging.info("Checking %s('%s')", function, v)

try:
backward_value = get_function_value(backward, function)
backward_value = get_function_value(backward, function, v)
except QueryRuntimeException as e:
error_message = str(e)
allowed_errors = [
Expand Down Expand Up @@ -199,11 +207,12 @@ def get_function_value(node, function_name, value="foo"):
failed += 1
continue

upstream_value = get_function_value(upstream, function)
upstream_value = get_function_value(upstream, function, v)
if upstream_value != backward_value:
logging.info(
"Failed %s, %s (backward) != %s (upstream)",
logging.warning(
"Failed %s('%s') %s (backward) != %s (upstream)",
function,
v,
backward_value,
upstream_value,
)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
1
22.8.5.29 10
22.8.6.71 10
1
22.8.5.29 52
22.8.6.71 52
1
22.8.5.29 0
22.8.6.71 0
46_OK 0123456789012345678901234567890123456789012345
46_KO 0123456789012345678901234567890123456789012345
47_OK 01234567890123456789012345678901234567890123456
47_KO 01234567890123456789012345678901234567890123456
48_OK 012345678901234567890123456789012345678901234567
48_KO 012345678901234567890123456789012345678901234567
63_OK 012345678901234567890123456789012345678901234567890123456789012
63_KO 012345678901234567890123456789012345678901234567890123456789012
64_OK 0123456789012345678901234567890123456789012345678901234567890123
64_KO 0123456789012345678901234567890123456789012345678901234567890123
-1 0
-2 0
-2^31 0
1M without 0 1048576
1M with 0 1048575
fuzz2 0123 4