Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix crash in FPC codec #56795

Merged
merged 2 commits into from
Nov 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
170 changes: 89 additions & 81 deletions src/Compression/CompressionCodecFPC.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,23 +153,23 @@ void registerCodecFPC(CompressionCodecFactory & factory)
namespace
{

template <std::unsigned_integral TUint>
requires (sizeof(TUint) >= 4)
template <std::unsigned_integral TUInt>
requires (sizeof(TUInt) >= 4)
class DfcmPredictor
{
public:
explicit DfcmPredictor(std::size_t table_size)
explicit DfcmPredictor(size_t table_size)
: table(table_size, 0), prev_value{0}, hash{0}
{
}

[[nodiscard]]
TUint predict() const noexcept
TUInt predict() const noexcept
{
return table[hash] + prev_value;
}

void add(TUint value) noexcept
void add(TUInt value) noexcept
{
table[hash] = value - prev_value;
recalculateHash();
Expand All @@ -180,38 +180,38 @@ class DfcmPredictor
void recalculateHash() noexcept
{
auto value = table[hash];
if constexpr (sizeof(TUint) >= 8)
if constexpr (sizeof(TUInt) >= 8)
{
hash = ((hash << 2) ^ static_cast<std::size_t>(value >> 40)) & (table.size() - 1);
hash = ((hash << 2) ^ static_cast<size_t>(value >> 40)) & (table.size() - 1);
}
else
{
hash = ((hash << 4) ^ static_cast<std::size_t>(value >> 23)) & (table.size() - 1);
hash = ((hash << 4) ^ static_cast<size_t>(value >> 23)) & (table.size() - 1);
}
}

std::vector<TUint> table;
TUint prev_value;
std::size_t hash;
std::vector<TUInt> table;
TUInt prev_value;
size_t hash;
};

template <std::unsigned_integral TUint>
requires (sizeof(TUint) >= 4)
template <std::unsigned_integral TUInt>
requires (sizeof(TUInt) >= 4)
class FcmPredictor
{
public:
explicit FcmPredictor(std::size_t table_size)
explicit FcmPredictor(size_t table_size)
: table(table_size, 0), hash{0}
{
}

[[nodiscard]]
TUint predict() const noexcept
TUInt predict() const noexcept
{
return table[hash];
}

void add(TUint value) noexcept
void add(TUInt value) noexcept
{
table[hash] = value;
recalculateHash();
Expand All @@ -221,44 +221,44 @@ class FcmPredictor
void recalculateHash() noexcept
{
auto value = table[hash];
if constexpr (sizeof(TUint) >= 8)
if constexpr (sizeof(TUInt) >= 8)
{
hash = ((hash << 6) ^ static_cast<std::size_t>(value >> 48)) & (table.size() - 1);
hash = ((hash << 6) ^ static_cast<size_t>(value >> 48)) & (table.size() - 1);
}
else
{
hash = ((hash << 1) ^ static_cast<std::size_t>(value >> 22)) & (table.size() - 1);
hash = ((hash << 1) ^ static_cast<size_t>(value >> 22)) & (table.size() - 1);
}
}

std::vector<TUint> table;
std::size_t hash;
std::vector<TUInt> table;
size_t hash;
};

template <std::unsigned_integral TUint>
template <std::unsigned_integral TUInt>
class FPCOperation
{
static constexpr auto VALUE_SIZE = sizeof(TUint);
static constexpr size_t VALUE_SIZE = sizeof(TUInt);
static constexpr std::byte FCM_BIT{0};
static constexpr std::byte DFCM_BIT{1u << 3};
static constexpr auto DFCM_BIT_1 = DFCM_BIT << 4;
static constexpr auto DFCM_BIT_2 = DFCM_BIT;
static constexpr unsigned MAX_ZERO_BYTE_COUNT = 0b111u;
static constexpr std::byte DFCM_BIT_1 = DFCM_BIT << 4;
static constexpr std::byte DFCM_BIT_2 = DFCM_BIT;
static constexpr UInt32 MAX_ZERO_BYTE_COUNT = 0b111u;
static constexpr std::endian ENDIAN = std::endian::little;
static constexpr std::size_t CHUNK_SIZE = 64;
static constexpr size_t CHUNK_SIZE = 64;

public:
FPCOperation(std::span<std::byte> destination, UInt8 compression_level)
: dfcm_predictor(1u << compression_level), fcm_predictor(1u << compression_level), chunk{}, result{destination}
{
}

std::size_t encode(std::span<const std::byte> data) &&
size_t encode(std::span<const std::byte> data) &&
{
auto initial_size = result.size();

std::span chunk_view(chunk);
for (std::size_t i = 0; i < data.size(); i += chunk_view.size_bytes())
for (size_t i = 0; i < data.size(); i += chunk_view.size_bytes())
{
auto written_values_count = importChunk(data.subspan(i), chunk_view);
encodeChunk(chunk_view.subspan(0, written_values_count));
Expand All @@ -267,12 +267,12 @@ class FPCOperation
return initial_size - result.size();
}

void decode(std::span<const std::byte> values, std::size_t decoded_size) &&
void decode(std::span<const std::byte> values, size_t decoded_size) &&
{
std::size_t read_bytes = 0;
size_t read_bytes = 0;

std::span<TUint> chunk_view(chunk);
for (std::size_t i = 0; i < decoded_size; i += chunk_view.size_bytes())
std::span<TUInt> chunk_view(chunk);
for (size_t i = 0; i < decoded_size; i += chunk_view.size_bytes())
{
if (i + chunk_view.size_bytes() > decoded_size)
chunk_view = chunk_view.first(ceilBytesToEvenValues(decoded_size - i));
Expand All @@ -282,50 +282,50 @@ class FPCOperation
}

private:
static std::size_t ceilBytesToEvenValues(std::size_t bytes_count)
static size_t ceilBytesToEvenValues(size_t bytes_count)
{
auto values_count = (bytes_count + VALUE_SIZE - 1) / VALUE_SIZE;
size_t values_count = (bytes_count + VALUE_SIZE - 1) / VALUE_SIZE;
return values_count % 2 == 0 ? values_count : values_count + 1;
}

std::size_t importChunk(std::span<const std::byte> values, std::span<TUint> chnk)
size_t importChunk(std::span<const std::byte> values, std::span<TUInt> current_chunk)
{
if (auto chunk_view = std::as_writable_bytes(chnk); chunk_view.size() <= values.size())
if (auto chunk_view = std::as_writable_bytes(current_chunk); chunk_view.size() <= values.size())
{
std::memcpy(chunk_view.data(), values.data(), chunk_view.size());
memcpy(chunk_view.data(), values.data(), chunk_view.size());
return chunk_view.size() / VALUE_SIZE;
}
else
{
std::memset(chunk_view.data(), 0, chunk_view.size());
std::memcpy(chunk_view.data(), values.data(), values.size());
memset(chunk_view.data(), 0, chunk_view.size());
memcpy(chunk_view.data(), values.data(), values.size());
return ceilBytesToEvenValues(values.size());
}
}

void exportChunk(std::span<const TUint> chnk)
void exportChunk(std::span<const TUInt> current_chunk)
{
auto chunk_view = std::as_bytes(chnk).first(std::min(result.size(), chnk.size_bytes()));
std::memcpy(result.data(), chunk_view.data(), chunk_view.size());
auto chunk_view = std::as_bytes(current_chunk).first(std::min(result.size(), current_chunk.size_bytes()));
memcpy(result.data(), chunk_view.data(), chunk_view.size());
result = result.subspan(chunk_view.size());
}

void encodeChunk(std::span<const TUint> seq)
void encodeChunk(std::span<const TUInt> sequence)
{
for (std::size_t i = 0; i < seq.size(); i += 2)
for (size_t i = 0; i < sequence.size(); i += 2)
{
encodePair(seq[i], seq[i + 1]);
encodePair(sequence[i], sequence[i + 1]);
}
}

struct CompressedValue
{
TUint value;
unsigned compressed_size;
TUInt value;
UInt32 compressed_size;
std::byte predictor;
};

unsigned encodeCompressedZeroByteCount(unsigned compressed)
UInt32 encodeCompressedZeroByteCount(UInt32 compressed)
{
if constexpr (VALUE_SIZE == MAX_ZERO_BYTE_COUNT + 1)
{
Expand All @@ -335,7 +335,7 @@ class FPCOperation
return std::min(compressed, MAX_ZERO_BYTE_COUNT);
}

unsigned decodeCompressedZeroByteCount(unsigned encoded_size)
UInt32 decodeCompressedZeroByteCount(UInt32 encoded_size)
{
if constexpr (VALUE_SIZE == MAX_ZERO_BYTE_COUNT + 1)
{
Expand All @@ -345,22 +345,22 @@ class FPCOperation
return encoded_size;
}

CompressedValue compressValue(TUint value) noexcept
CompressedValue compressValue(TUInt value) noexcept
{
static constexpr auto BITS_PER_BYTE = std::numeric_limits<unsigned char>::digits;

TUint compressed_dfcm = dfcm_predictor.predict() ^ value;
TUint compressed_fcm = fcm_predictor.predict() ^ value;
TUInt compressed_dfcm = dfcm_predictor.predict() ^ value;
TUInt compressed_fcm = fcm_predictor.predict() ^ value;
dfcm_predictor.add(value);
fcm_predictor.add(value);
auto zeroes_dfcm = std::countl_zero(compressed_dfcm);
auto zeroes_fcm = std::countl_zero(compressed_fcm);
if (zeroes_dfcm > zeroes_fcm)
return {compressed_dfcm, encodeCompressedZeroByteCount(static_cast<unsigned>(zeroes_dfcm) / BITS_PER_BYTE), DFCM_BIT};
return {compressed_fcm, encodeCompressedZeroByteCount(static_cast<unsigned>(zeroes_fcm) / BITS_PER_BYTE), FCM_BIT};
return {compressed_dfcm, encodeCompressedZeroByteCount(static_cast<UInt32>(zeroes_dfcm) / BITS_PER_BYTE), DFCM_BIT};
return {compressed_fcm, encodeCompressedZeroByteCount(static_cast<UInt32>(zeroes_fcm) / BITS_PER_BYTE), FCM_BIT};
}

void encodePair(TUint first, TUint second)
void encodePair(TUInt first, TUInt second)
{
auto [compressed_value1, zero_byte_count1, predictor1] = compressValue(first);
auto [compressed_value2, zero_byte_count2, predictor2] = compressValue(second);
Expand All @@ -374,24 +374,24 @@ class FPCOperation
auto tail_size1 = VALUE_SIZE - zero_byte_count1;
auto tail_size2 = VALUE_SIZE - zero_byte_count2;

std::memcpy(result.data() + 1, valueTail(compressed_value1, zero_byte_count1), tail_size1);
std::memcpy(result.data() + 1 + tail_size1, valueTail(compressed_value2, zero_byte_count2), tail_size2);
memcpy(result.data() + 1, valueTail(compressed_value1, zero_byte_count1), tail_size1);
memcpy(result.data() + 1 + tail_size1, valueTail(compressed_value2, zero_byte_count2), tail_size2);
result = result.subspan(1 + tail_size1 + tail_size2);
}

std::size_t decodeChunk(std::span<const std::byte> values, std::span<TUint> seq)
size_t decodeChunk(std::span<const std::byte> values, std::span<TUInt> sequence)
{
std::size_t read_bytes = 0;
for (std::size_t i = 0; i < seq.size(); i += 2)
size_t read_bytes = 0;
for (size_t i = 0; i < sequence.size(); i += 2)
{
read_bytes += decodePair(values.subspan(read_bytes), seq[i], seq[i + 1]);
read_bytes += decodePair(values.subspan(read_bytes), sequence[i], sequence[i + 1]);
}
return read_bytes;
}

TUint decompressValue(TUint value, bool isDfcmPredictor)
TUInt decompressValue(TUInt value, bool isDfcmPredictor)
{
TUint decompressed;
TUInt decompressed;
if (isDfcmPredictor)
{
decompressed = dfcm_predictor.predict() ^ value;
Expand All @@ -405,37 +405,45 @@ class FPCOperation
return decompressed;
}

std::size_t decodePair(std::span<const std::byte> bytes, TUint& first, TUint& second)
size_t decodePair(std::span<const std::byte> bytes, TUInt & first, TUInt & second)
{
if (bytes.empty()) [[unlikely]]
throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Unexpected end of encoded sequence");

auto zero_byte_count1 = decodeCompressedZeroByteCount(
std::to_integer<unsigned>(bytes.front() >> 4) & MAX_ZERO_BYTE_COUNT);
auto zero_byte_count2 = decodeCompressedZeroByteCount(
std::to_integer<unsigned>(bytes.front()) & MAX_ZERO_BYTE_COUNT);
UInt32 zero_byte_count1 = decodeCompressedZeroByteCount(
std::to_integer<UInt32>(bytes.front() >> 4) & MAX_ZERO_BYTE_COUNT);
UInt32 zero_byte_count2 = decodeCompressedZeroByteCount(
std::to_integer<UInt32>(bytes.front()) & MAX_ZERO_BYTE_COUNT);

auto tail_size1 = VALUE_SIZE - zero_byte_count1;
auto tail_size2 = VALUE_SIZE - zero_byte_count2;
if (zero_byte_count1 > VALUE_SIZE || zero_byte_count2 > VALUE_SIZE) [[unlikely]]
throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Invalid compressed data");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This causes 00002_log_and_exception_messages_formatting to fail because of exceptions shorter than 30 check


size_t tail_size1 = VALUE_SIZE - zero_byte_count1;
size_t tail_size2 = VALUE_SIZE - zero_byte_count2;

size_t expected_size = 0;
if (__builtin_add_overflow(tail_size1, tail_size2, &expected_size)
|| __builtin_add_overflow(expected_size, 1, &expected_size)) [[unlikely]]
throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Invalid compressed data");

if (bytes.size() < 1 + tail_size1 + tail_size2) [[unlikely]]
if (bytes.size() < expected_size) [[unlikely]]
throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Unexpected end of encoded sequence");

TUint value1 = 0;
TUint value2 = 0;
TUInt value1 = 0;
TUInt value2 = 0;

std::memcpy(valueTail(value1, zero_byte_count1), bytes.data() + 1, tail_size1);
std::memcpy(valueTail(value2, zero_byte_count2), bytes.data() + 1 + tail_size1, tail_size2);
memcpy(valueTail(value1, zero_byte_count1), bytes.data() + 1, tail_size1);
memcpy(valueTail(value2, zero_byte_count2), bytes.data() + 1 + tail_size1, tail_size2);

auto is_dfcm_predictor1 = std::to_integer<unsigned char>(bytes.front() & DFCM_BIT_1) != 0;
auto is_dfcm_predictor2 = std::to_integer<unsigned char>(bytes.front() & DFCM_BIT_2) != 0;
first = decompressValue(value1, is_dfcm_predictor1);
second = decompressValue(value2, is_dfcm_predictor2);

return 1 + tail_size1 + tail_size2;
return expected_size;
}

static void* valueTail(TUint& value, unsigned compressed_size)
static void* valueTail(TUInt& value, UInt32 compressed_size)
{
if constexpr (ENDIAN == std::endian::little)
{
Expand All @@ -447,11 +455,11 @@ class FPCOperation
}
}

DfcmPredictor<TUint> dfcm_predictor;
FcmPredictor<TUint> fcm_predictor;
DfcmPredictor<TUInt> dfcm_predictor;
FcmPredictor<TUInt> fcm_predictor;

// memcpy the input into this buffer to align reads, this improves performance compared to unaligned reads (bit_cast) by ~10%
std::array<TUint, CHUNK_SIZE> chunk{};
std::array<TUInt, CHUNK_SIZE> chunk{};

std::span<std::byte> result{};
};
Expand Down
2 changes: 2 additions & 0 deletions tests/queries/0_stateless/02915_fpc_overflow.reference
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Exc
Exc
11 changes: 11 additions & 0 deletions tests/queries/0_stateless/02915_fpc_overflow.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/usr/bin/env bash

CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh

echo -ne 'checksumchecksum\x98\x90\x00\x00\x00\x11\x11\x11\x11\x04\x0f\x51 ' |
${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&decompress=1&http_native_compression_disable_checksumming_on_decompress=1" --data-binary @- 2>&1 | grep -oF 'Exc'

echo -ne 'checksumchecksum\x98\x90\x00\x00\x00\x11\x11\x11\x11\x04\x0f\x16 ' |
${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&decompress=1&http_native_compression_disable_checksumming_on_decompress=1" --data-binary @- 2>&1 | grep -oF 'Exc'