Skip to content

Commit

Permalink
Merge pull request #43488 from ClickHouse/backport/22.11/43038
Browse files Browse the repository at this point in the history
Backport #43038 to 22.11: Fix regression on SingleValueDataString
  • Loading branch information
tavplubix committed Nov 23, 2022
2 parents 81e231a + f6dd9c1 commit eb069ec
Show file tree
Hide file tree
Showing 5 changed files with 260 additions and 44 deletions.
8 changes: 8 additions & 0 deletions src/AggregateFunctions/AggregateFunctionArgMinMax.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ struct Settings;
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int CORRUPTED_DATA;
}


Expand Down Expand Up @@ -89,6 +90,13 @@ class AggregateFunctionArgMinMax final : public IAggregateFunctionDataHelper<Dat
{
this->data(place).result.read(buf, *serialization_res, arena);
this->data(place).value.read(buf, *serialization_val, arena);
if (unlikely(this->data(place).value.has() != this->data(place).result.has()))
throw Exception(
ErrorCodes::CORRUPTED_DATA,
"Invalid state of the aggregate function {}: has_value ({}) != has_result ({})",
getName(),
this->data(place).value.has(),
this->data(place).result.has());
}

bool allocatesMemoryInArena() const override
Expand Down
139 changes: 102 additions & 37 deletions src/AggregateFunctions/AggregateFunctionMinMaxAny.h
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,34 @@ struct SingleValueDataFixed

};

struct Compatibility
{
/// Old versions used to store terminating null-character in SingleValueDataString.
/// Then -WithTerminatingZero methods were removed from IColumn interface,
/// because these methods are quite dangerous and easy to misuse. It introduced incompatibility.
/// See https://github.com/ClickHouse/ClickHouse/pull/41431 and https://github.com/ClickHouse/ClickHouse/issues/42916
/// Here we keep these functions for compatibility.
/// It's safe because there's no way unsanitized user input (without \0 at the end) can reach these functions.

static StringRef getDataAtWithTerminatingZero(const ColumnString & column, size_t n)
{
auto res = column.getDataAt(n);
/// ColumnString always reserves extra byte for null-character after string.
/// But getDataAt returns StringRef without the null-character. Let's add it.
chassert(res.data[res.size] == '\0');
++res.size;
return res;
}

static void insertDataWithTerminatingZero(ColumnString & column, const char * pos, size_t length)
{
/// String already has terminating null-character.
/// But insertData will add another one unconditionally. Trim existing null-character to avoid duplication.
chassert(0 < length);
chassert(pos[length - 1] == '\0');
column.insertData(pos, length - 1);
}
};

/** For strings. Short strings are stored in the object itself, and long strings are allocated separately.
* NOTE It could also be suitable for arrays of numbers.
Expand Down Expand Up @@ -477,20 +505,31 @@ struct SingleValueDataString //-V730
return size >= 0;
}

const char * getData() const
private:
char * getDataMutable()
{
return size <= MAX_SMALL_STRING_SIZE ? small_data : large_data;
}

const char * getData() const
{
const char * data_ptr = size <= MAX_SMALL_STRING_SIZE ? small_data : large_data;
/// It must always be terminated with null-character
chassert(0 < size);
chassert(data_ptr[size - 1] == '\0');
return data_ptr;
}

StringRef getStringRef() const
{
return StringRef(getData(), size);
}

public:
void insertResultInto(IColumn & to) const
{
if (has())
assert_cast<ColumnString &>(to).insertData(getData(), size);
Compatibility::insertDataWithTerminatingZero(assert_cast<ColumnString &>(to), getData(), size);
else
assert_cast<ColumnString &>(to).insertDefault();
}
Expand All @@ -502,44 +541,76 @@ struct SingleValueDataString //-V730
buf.write(getData(), size);
}

void allocateLargeDataIfNeeded(Int64 size_to_reserve, Arena * arena)
{
if (capacity < size_to_reserve)
{
capacity = static_cast<Int32>(roundUpToPowerOfTwoOrZero(size_to_reserve));
/// It might happen if the size was too big and the rounded value does not fit a size_t
if (unlikely(capacity < size_to_reserve))
throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "String size is too big ({})", size_to_reserve);

/// Don't free large_data here.
large_data = arena->alloc(capacity);
}
}

void read(ReadBuffer & buf, const ISerialization & /*serialization*/, Arena * arena)
{
Int32 rhs_size;
readBinary(rhs_size, buf);

if (rhs_size >= 0)
if (rhs_size < 0)
{
if (rhs_size <= MAX_SMALL_STRING_SIZE)
{
/// Don't free large_data here.

size = rhs_size;
/// Don't free large_data here.
size = rhs_size;
return;
}

if (size > 0)
buf.readStrict(small_data, size);
}
else
{
if (capacity < rhs_size)
{
capacity = static_cast<Int32>(roundUpToPowerOfTwoOrZero(rhs_size));
/// It might happen if the size was too big and the rounded value does not fit a size_t
if (unlikely(capacity < rhs_size))
throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "String size is too big ({})", rhs_size);
if (rhs_size <= MAX_SMALL_STRING_SIZE)
{
/// Don't free large_data here.

/// Don't free large_data here.
large_data = arena->alloc(capacity);
}
size = rhs_size;

size = rhs_size;
buf.readStrict(large_data, size);
}
if (size > 0)
buf.readStrict(small_data, size);
}
else
{
/// Don't free large_data here.
/// Reserve one byte more for null-character
Int64 rhs_size_to_reserve = rhs_size;
rhs_size_to_reserve += 1; /// Avoid overflow
allocateLargeDataIfNeeded(rhs_size_to_reserve, arena);
size = rhs_size;
buf.readStrict(large_data, size);
}

/// Check if the string we read is null-terminated (getDataMutable does not have the assertion)
if (0 < size && getDataMutable()[size - 1] == '\0')
return;

/// It's not null-terminated, but it must be (for historical reasons). There are two variants:
/// - The value was serialized by one of the incompatible versions of ClickHouse. We had some range of versions
/// that used to serialize SingleValueDataString without terminating '\0'. Let's just append it.
/// - An attacker sent crafted data. Sanitize it and append '\0'.
/// In all other cases the string must be already null-terminated.

/// NOTE We cannot add '\0' unconditionally, because it will be duplicated.
/// NOTE It's possible that a string that actually ends with '\0' was written by one of the incompatible versions.
/// Unfortunately, we cannot distinguish it from normal string written by normal version.
/// So such strings will be trimmed.

if (size == MAX_SMALL_STRING_SIZE)
{
/// Special case: We have to move value to large_data
allocateLargeDataIfNeeded(size + 1, arena);
memcpy(large_data, small_data, size);
}

/// We have enough space to append
++size;
getDataMutable()[size - 1] = '\0';
}

/// Assuming to.has()
Expand All @@ -557,21 +628,15 @@ struct SingleValueDataString //-V730
}
else
{
if (capacity < value_size)
{
/// Don't free large_data here.
capacity = static_cast<Int32>(roundUpToPowerOfTwoOrZero(value_size));
large_data = arena->alloc(capacity);
}

allocateLargeDataIfNeeded(value_size, arena);
size = value_size;
memcpy(large_data, value.data, size);
}
}

void change(const IColumn & column, size_t row_num, Arena * arena)
{
changeImpl(assert_cast<const ColumnString &>(column).getDataAt(row_num), arena);
changeImpl(Compatibility::getDataAtWithTerminatingZero(assert_cast<const ColumnString &>(column), row_num), arena);
}

void change(const Self & to, Arena * arena)
Expand Down Expand Up @@ -620,7 +685,7 @@ struct SingleValueDataString //-V730

bool changeIfLess(const IColumn & column, size_t row_num, Arena * arena)
{
if (!has() || assert_cast<const ColumnString &>(column).getDataAt(row_num) < getStringRef())
if (!has() || Compatibility::getDataAtWithTerminatingZero(assert_cast<const ColumnString &>(column), row_num) < getStringRef())
{
change(column, row_num, arena);
return true;
Expand All @@ -642,7 +707,7 @@ struct SingleValueDataString //-V730

bool changeIfGreater(const IColumn & column, size_t row_num, Arena * arena)
{
if (!has() || assert_cast<const ColumnString &>(column).getDataAt(row_num) > getStringRef())
if (!has() || Compatibility::getDataAtWithTerminatingZero(assert_cast<const ColumnString &>(column), row_num) > getStringRef())
{
change(column, row_num, arena);
return true;
Expand All @@ -669,7 +734,7 @@ struct SingleValueDataString //-V730

bool isEqualTo(const IColumn & column, size_t row_num) const
{
return has() && assert_cast<const ColumnString &>(column).getDataAt(row_num) == getStringRef();
return has() && Compatibility::getDataAtWithTerminatingZero(assert_cast<const ColumnString &>(column), row_num) == getStringRef();
}

static bool allocatesMemoryInArena()
Expand Down
23 changes: 16 additions & 7 deletions tests/integration/test_backward_compatibility/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,11 @@
backward = cluster.add_instance(
"backward",
image="clickhouse/clickhouse-server",
tag="22.9",
# Note that a bug changed the string representation of several aggregations in 22.9 and 22.10 and some minor
# releases of 22.8, 22.7 and 22.3
# See https://github.com/ClickHouse/ClickHouse/issues/42916
# Affected at least: singleValueOrNull, last_value, min, max, any, anyLast, anyHeavy, first_value, argMin, argMax
tag="22.6",
with_installed_binary=True,
)

Expand Down Expand Up @@ -139,6 +143,9 @@ def test_string_functions(start_cluster):
"substring",
"CAST",
# NOTE: no need to ignore now()/now64() since they will fail because they don't accept any argument
# 22.8 Backward Incompatible Change: Extended range of Date32
"toDate32OrZero",
"toDate32OrDefault",
]
functions = filter(lambda x: x not in excludes, functions)

Expand All @@ -149,14 +156,15 @@ def test_string_functions(start_cluster):
failed = 0
passed = 0

def get_function_value(node, function_name, value="foo"):
def get_function_value(node, function_name, value):
return node.query(f"select {function_name}('{value}')").strip()

v = "foo"
for function in functions:
logging.info("Checking %s", function)
logging.info("Checking %s('%s')", function, v)

try:
backward_value = get_function_value(backward, function)
backward_value = get_function_value(backward, function, v)
except QueryRuntimeException as e:
error_message = str(e)
allowed_errors = [
Expand Down Expand Up @@ -199,11 +207,12 @@ def get_function_value(node, function_name, value="foo"):
failed += 1
continue

upstream_value = get_function_value(upstream, function)
upstream_value = get_function_value(upstream, function, v)
if upstream_value != backward_value:
logging.info(
"Failed %s, %s (backward) != %s (upstream)",
logging.warning(
"Failed %s('%s') %s (backward) != %s (upstream)",
function,
v,
backward_value,
upstream_value,
)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
1
22.8.5.29 10
22.8.6.71 10
1
22.8.5.29 52
22.8.6.71 52
1
22.8.5.29 0
22.8.6.71 0
46_OK 0123456789012345678901234567890123456789012345
46_KO 0123456789012345678901234567890123456789012345
47_OK 01234567890123456789012345678901234567890123456
47_KO 01234567890123456789012345678901234567890123456
48_OK 012345678901234567890123456789012345678901234567
48_KO 012345678901234567890123456789012345678901234567
63_OK 012345678901234567890123456789012345678901234567890123456789012
63_KO 012345678901234567890123456789012345678901234567890123456789012
64_OK 0123456789012345678901234567890123456789012345678901234567890123
64_KO 0123456789012345678901234567890123456789012345678901234567890123
-1 0
-2 0
-2^31 0
1M without 0 1048576
1M with 0 1048575
fuzz2 0123 4

0 comments on commit eb069ec

Please sign in to comment.