Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add GCD codec #53149

Merged
merged 95 commits into from Sep 5, 2023
Merged
Show file tree
Hide file tree
Changes from 84 commits
Commits
Show all changes
95 commits
Select commit Hold shift + click to select a range
181e214
Added Codec-file and shit-coded CompressionMethodByte
seshWCS Aug 7, 2023
90d0c48
Edited codec-files
seshWCS Aug 7, 2023
8339575
Merge branch 'ClickHouse:master' into gcddelta-codec
seshWCS Aug 7, 2023
877a55e
Merge branch 'master' into gcddelta-codec
seshWCS Aug 8, 2023
0a4c3f1
Fix
seshWCS Aug 8, 2023
bdc3cdb
Style
seshWCS Aug 8, 2023
78b06eb
Merge branch 'clang17_fix' of https://github.com/Algunenano/ClickHous…
seshWCS Aug 8, 2023
f533eea
Moved registerCodecGCD to ifndef
seshWCS Aug 8, 2023
f52d4b5
Style
seshWCS Aug 8, 2023
5af6a89
Style
seshWCS Aug 9, 2023
d500490
Fixed bugs
seshWCS Aug 9, 2023
086d074
Style
seshWCS Aug 9, 2023
c6214cc
Merge branch 'master' into gcddelta-codec
seshWCS Aug 9, 2023
ef857f3
Fix
seshWCS Aug 9, 2023
b52655d
Added exception for types
seshWCS Aug 9, 2023
43a8271
Deleted args
seshWCS Aug 9, 2023
c8f0ee9
Edited ErrorCodes
seshWCS Aug 10, 2023
772de17
Added test
seshWCS Aug 11, 2023
699a4fb
Merge branch 'master' into gcddelta-codec
seshWCS Aug 11, 2023
a712dd2
Added gcd_fuzzer
seshWCS Aug 11, 2023
dc6c100
Fixed tests
seshWCS Aug 11, 2023
3a3a4ac
Deleted extral if
seshWCS Aug 11, 2023
88549ff
Delta -> GCD
seshWCS Aug 11, 2023
79c8bb0
Update 02842_gcd_codec.sql
seshWCS Aug 11, 2023
5fc20dc
Edited test-name
seshWCS Aug 11, 2023
e63288e
Update 02843_gcd_codec.sql
seshWCS Aug 11, 2023
a366500
Fix
seshWCS Aug 12, 2023
9ff2005
Test-fix
seshWCS Aug 12, 2023
ba8cb25
Deleted unused error code
seshWCS Aug 12, 2023
8037dd8
Edited sign
seshWCS Aug 12, 2023
a14d731
Added support for dt and decimal
seshWCS Aug 12, 2023
ac63a9b
Fixed bug in fuzzer & 1 + 1 -> 2 in Codec
seshWCS Aug 15, 2023
cbb9aa7
Merge branch 'ClickHouse:master' into gcddelta-codec
seshWCS Aug 15, 2023
eadcfdc
Rename commit
seshWCS Aug 15, 2023
7f40157
GCD optimization
seshWCS Aug 15, 2023
2b8592c
Style
seshWCS Aug 15, 2023
539555b
Added SIMD for UInt32/64
seshWCS Aug 20, 2023
bd6b10f
Fixed bug in libdivide usage
seshWCS Aug 20, 2023
c7b5cef
Merge branch 'ClickHouse:master' into gcddelta-codec
seshWCS Aug 21, 2023
6e0d9f1
Edited CompressionMethodByte for GCDCodec
seshWCS Aug 22, 2023
c7be413
Added boost::math::gcd and improved test
seshWCS Aug 22, 2023
176fdf3
Erased args from codec_builder
seshWCS Aug 22, 2023
50e10a6
Added Docs
seshWCS Aug 22, 2023
c7f7fc5
Merge branch 'ClickHouse:master' into gcddelta-codec
seshWCS Aug 22, 2023
2e3004d
Test-rename
seshWCS Aug 22, 2023
a291444
Update table.md
seshWCS Aug 22, 2023
e8a3914
Update table.md
seshWCS Aug 22, 2023
480d2df
Added params
seshWCS Aug 22, 2023
4c16f34
Perf-test edit
seshWCS Aug 23, 2023
32a5730
Deleted extra files
seshWCS Aug 23, 2023
73a9d44
Deleted extra
seshWCS Aug 24, 2023
893de89
Edited perf-test
seshWCS Aug 24, 2023
c82e6fd
Fix
seshWCS Aug 24, 2023
2b80dbb
Fix
seshWCS Aug 24, 2023
6761ccd
UInt64 -> DateTime64
seshWCS Aug 24, 2023
f9de489
Edited while
seshWCS Aug 24, 2023
680cbb6
Fixed bugs with memory
seshWCS Aug 25, 2023
e4ce24e
Edited ENGINE
seshWCS Aug 25, 2023
49b3f20
Added result to compressDataForType
seshWCS Aug 26, 2023
ee0b2e4
Merge branch 'ClickHouse:master' into gcddelta-codec
seshWCS Aug 26, 2023
62fe4e2
UInt32 => size_t + static_cast<UInt32>(size_t)
seshWCS Aug 26, 2023
887a4da
Bug-fix
seshWCS Aug 27, 2023
6cb4e10
Bug-fix in perf-test
seshWCS Aug 29, 2023
7c2aefe
Merge branch 'ClickHouse:master' into gcddelta-codec
seshWCS Aug 29, 2023
85240ac
Docs-fix
seshWCS Aug 29, 2023
059e75b
Deleted args
seshWCS Aug 29, 2023
e0157c4
Docs-fix
seshWCS Aug 29, 2023
5396357
Docs-fix
seshWCS Aug 29, 2023
9b7d337
Deleted unused ErrorCode
seshWCS Aug 29, 2023
a8d9b5d
Merge branch 'ClickHouse:master' into gcddelta-codec
seshWCS Aug 30, 2023
5923ba0
Added Negative tests
seshWCS Aug 31, 2023
c7dacd0
Edited perf-comp
seshWCS Aug 31, 2023
bbf6792
Test-fix
seshWCS Aug 31, 2023
8ec6f6a
perf-conditions
seshWCS Aug 31, 2023
1d515ac
DateTime64 => UInt64
seshWCS Aug 31, 2023
43a3650
Merge branch 'ClickHouse:master' into gcddelta-codec
seshWCS Aug 31, 2023
eae9e66
Deleted test
seshWCS Aug 31, 2023
48783f8
Merge branch 'ClickHouse:master' into gcddelta-codec
seshWCS Sep 1, 2023
43f9efb
Added libdivide for Integers with sizeof <= 8
seshWCS Sep 1, 2023
9a71bce
Merge branch 'ClickHouse:master' into gcddelta-codec
seshWCS Sep 1, 2023
cee0bc0
deleted extra variable
seshWCS Sep 3, 2023
0ca3807
Merge branch 'master' into gcddelta-codec
seshWCS Sep 5, 2023
e687b94
Added casts
seshWCS Sep 5, 2023
39d279e
Merge branch 'ClickHouse:master' into gcddelta-codec
seshWCS Sep 5, 2023
8ca51bf
Issue-fix
seshWCS Sep 5, 2023
cba3457
calculate result directly from source_size and sizeof(T)
seshWCS Sep 5, 2023
b6c80dd
Syntax fix
seshWCS Sep 5, 2023
5a0a9ac
Moved ctor upper
seshWCS Sep 5, 2023
6cc9830
Cosmetic
seshWCS Sep 5, 2023
221919f
TUInt32Or64 => LibdivideT
seshWCS Sep 5, 2023
3d078e2
if constexpr
seshWCS Sep 5, 2023
4a1d67d
Qpl
seshWCS Sep 5, 2023
b0a2295
Cosmetic
seshWCS Sep 5, 2023
dbab2f2
Docs-fix
seshWCS Sep 5, 2023
d416aaf
New tests
seshWCS Sep 5, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/en/sql-reference/statements/create/table.md
Expand Up @@ -401,6 +401,10 @@ These codecs are designed to make compression more effective by using specific f

`DoubleDelta(bytes_size)` — Calculates delta of deltas and writes it in compact binary form. Possible `bytes_size` values: 1, 2, 4, 8, the default value is `sizeof(type)` if equal to 1, 2, 4, or 8. In all other cases, it’s 1. Optimal compression rates are achieved for monotonic sequences with a constant stride, such as time series data. Can be used with any fixed-width type. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. Uses 1 extra bit for 32-bit deltas: 5-bit prefixes instead of 4-bit prefixes. For additional information, see Compressing Time Stamps in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf).

#### GCD

`GCD()` — Calculates the greatest common divisor (GCD) of all values in the column, then divides each value by the GCD. This codec is for data preparation and is not suitable for use without an additional codec. The GCD codec can be used with Integers, Decimals and DateTime. A good use case would be to store timestamps or monetary values with high precision.
seshWCS marked this conversation as resolved.
Show resolved Hide resolved
seshWCS marked this conversation as resolved.
Show resolved Hide resolved

#### Gorilla

`Gorilla(bytes_size)` — Calculates XOR between current and previous floating point value and writes it in compact binary form. The smaller the difference between consecutive values is, i.e. the slower the values of the series changes, the better the compression rate. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. Possible `bytes_size` values: 1, 2, 4, 8, the default value is `sizeof(type)` if equal to 1, 2, 4, or 8. In all other cases, it’s 1. For additional information, see section 4.1 in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](https://doi.org/10.14778/2824032.2824078).
Expand Down
1 change: 1 addition & 0 deletions docs/ru/sql-reference/statements/create/table.md
Expand Up @@ -240,6 +240,7 @@ ClickHouse поддерживает кодеки общего назначени

- `Delta(delta_bytes)` — Метод, в котором исходные значения заменяются разностью двух соседних значений, за исключением первого значения, которое остаётся неизменным. Для хранения разниц используется до `delta_bytes`, т.е. `delta_bytes` — это максимальный размер исходных данных. Возможные значения `delta_bytes`: 1, 2, 4, 8. Значение по умолчанию для `delta_bytes` равно `sizeof(type)`, если результат 1, 2, 4, or 8. Во всех других случаях — 1.
- `DoubleDelta` — Вычисляет разницу разниц и сохраняет её в компактном бинарном виде. Оптимальная степень сжатия достигается для монотонных последовательностей с постоянным шагом, наподобие временных рядов. Можно использовать с любым типом данных фиксированного размера. Реализует алгоритм, используемый в TSDB Gorilla, поддерживает 64-битные типы данных. Использует 1 дополнительный бит для 32-битных значений: 5-битные префиксы вместо 4-битных префиксов. Подробнее читайте в разделе «Compressing Time Stamps» документа [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf).
- `GCD` - Вычисляет НОД всех чисел, а затем делит их на него. Этот кодек предназначен для подготовки данных и не подходит для использования без дополнительного кодека. GCD-кодек может использоваться с Integer, Decimal и DateTime. Хорошим вариантом использования было бы хранение временных меток или денежных значений с высокой точностью.
- `Gorilla` — Вычисляет XOR между текущим и предыдущим значением и записывает результат в компактной бинарной форме. Еффективно сохраняет ряды медленно изменяющихся чисел с плавающей запятой, поскольку наилучший коэффициент сжатия достигается, если соседние значения одинаковые. Реализует алгоритм, используемый в TSDB Gorilla, адаптируя его для работы с 64-битными значениями. Подробнее читайте в разделе «Compressing Values» документа [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf).
- `T64` — Метод сжатия который обрезает неиспользуемые старшие биты целочисленных значений (включая `Enum`, `Date` и `DateTime`). На каждом шаге алгоритма, кодек помещает блок из 64 значений в матрицу 64✕64, транспонирует её, обрезает неиспользуемые биты, а то, что осталось возвращает в виде последовательности. Неиспользуемые биты, это биты, которые не изменяются от минимального к максимальному на всём диапазоне значений куска данных.

Expand Down
284 changes: 284 additions & 0 deletions src/Compression/CompressionCodecGCD.cpp
@@ -0,0 +1,284 @@
#include <Compression/ICompressionCodec.h>
#include <Compression/CompressionInfo.h>
#include <Compression/CompressionFactory.h>
#include <base/unaligned.h>
#include <Parsers/IAST.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTFunction.h>
#include <IO/WriteHelpers.h>
#include "Common/Exception.h"
#include "base/Decimal_fwd.h"
#include "base/types.h"
#include "config.h"

#include <boost/math/common_factor_rt.hpp>
seshWCS marked this conversation as resolved.
Show resolved Hide resolved
#include <libdivide-config.h>
#include <libdivide.h>


namespace DB
{

/// Codec registered under the name "GCD" with method byte CompressionMethodByte::GCD.
/// It operates on fixed-width values whose width in bytes is given at construction time.
/// Both isCompression() and isGenericCompression() report false.
class CompressionCodecGCD : public ICompressionCodec
{
public:
    /// `gcd_bytes_size_` is the width in bytes of a single value.
    explicit CompressionCodecGCD(UInt8 gcd_bytes_size_);

    uint8_t getMethodByte() const override;

    void updateHash(SipHash & hash) const override;

protected:
    bool isCompression() const override { return false; }
    bool isGenericCompression() const override { return false; }

    UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const override;

    UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override;
    void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const override;

private:
    /// Width in bytes of one value; fixed for the lifetime of the codec.
    const UInt8 gcd_bytes_size;
};


/// Error codes thrown by the GCD codec below. They are only declared here (extern);
/// their definitions live elsewhere.
namespace ErrorCodes
{
extern const int CANNOT_COMPRESS;
extern const int CANNOT_DECOMPRESS;
extern const int ILLEGAL_SYNTAX_FOR_CODEC_TYPE;
extern const int BAD_ARGUMENTS;
}

/// Returns an upper bound for the number of bytes produced when compressing `uncompressed_size` bytes.
UInt32 CompressionCodecGCD::getMaxCompressedDataSize(UInt32 uncompressed_size) const
{
    /// Two bytes of local header.
    const UInt32 local_header_size = 2;
    /// One value-sized slot that stores the GCD.
    const UInt32 gcd_slot_size = gcd_bytes_size;
    /// Upper bound for the bytes_to_skip prefix.
    const UInt32 max_bytes_to_skip = gcd_bytes_size;

    return uncompressed_size + gcd_slot_size + max_bytes_to_skip + local_header_size;
}

CompressionCodecGCD::CompressionCodecGCD(UInt8 gcd_bytes_size_)
seshWCS marked this conversation as resolved.
Show resolved Hide resolved
: gcd_bytes_size(gcd_bytes_size_)
{
setCodecDescription("GCD", {});
}

/// Returns the numeric value of CompressionMethodByte::GCD.
uint8_t CompressionCodecGCD::getMethodByte() const
{
    constexpr auto method = CompressionMethodByte::GCD;
    return static_cast<uint8_t>(method);
}

/// Feeds the tree hash of the codec description into `hash`.
void CompressionCodecGCD::updateHash(SipHash & hash) const
{
    const auto codec_description = getCodecDesc();
    codec_description->updateTreeHash(hash);
}

namespace
{

/// Compresses the packed array of T values stored in [source, source + source_size).
///
/// Output layout in `dest`: one T holding the GCD of all values, followed by every value divided by that GCD.
/// Returns the number of bytes written, which is always sizeof(T) + source_size.
/// Throws CANNOT_COMPRESS if source_size is not a multiple of sizeof(T).
template <typename T>
UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest)
{
    if (source_size % sizeof(T) != 0)
        throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot GCD compress, data size {} is not aligned to {}", source_size, sizeof(T));

    const char * const source_end = source + source_size;

    /// Fold all values into a running GCD. Once it reaches 1 it cannot change any more, so stop early.
    T gcd_divider{};
    for (const char * cur_source = source; gcd_divider != T(1) && cur_source < source_end; cur_source += sizeof(T))
    {
        if (cur_source == source)
            gcd_divider = unalignedLoad<T>(cur_source);
        else
            gcd_divider = boost::math::gcd(gcd_divider, unalignedLoad<T>(cur_source));
    }

    unalignedStore<T>(dest, gcd_divider);
    dest += sizeof(T);

    if (gcd_divider == T(0) || gcd_divider == T(1))
    {
        /// GCD == 0 happens for empty input or when every value is zero; dividing by it is invalid
        /// (libdivide rejects a zero divisor, wide integers would divide by zero).
        /// GCD == 1 makes the division an identity.
        /// In both cases the values are copied verbatim: decompression multiplies by the stored GCD,
        /// which restores zeros for GCD == 0 and the original values for GCD == 1.
        if (source_size != 0)
            memcpy(dest, source, source_size);
    }
    else if constexpr (sizeof(T) <= 8)
    {
        /// libdivide supports only UInt32 and UInt64, so narrower types are widened to UInt32.
        using LibdivideT = std::conditional_t<sizeof(T) <= 4, UInt32, UInt64>;
        const libdivide::divider<LibdivideT> divider(static_cast<LibdivideT>(gcd_divider));

        for (const char * cur_source = source; cur_source < source_end; cur_source += sizeof(T), dest += sizeof(T))
            unalignedStore<T>(dest, static_cast<T>(static_cast<LibdivideT>(unalignedLoad<T>(cur_source)) / divider));
    }
    else
    {
        /// 128- and 256-bit values use the type's own division operator.
        for (const char * cur_source = source; cur_source < source_end; cur_source += sizeof(T), dest += sizeof(T))
            unalignedStore<T>(dest, unalignedLoad<T>(cur_source) / gcd_divider);
    }

    /// One T for the stored GCD plus one output value per input value.
    return static_cast<UInt32>(sizeof(T) + source_size);
}

/// Restores a packed array of T values from data produced by compressDataForType<T>.
///
/// `source` holds one T (the GCD) followed by the divided values; each of them is multiplied by the GCD
/// and written to `dest`. `output_size` is the number of bytes the caller expects to be written.
/// Throws CANNOT_DECOMPRESS if the source is misaligned, too short to contain the GCD,
/// or if its payload size differs from `output_size`.
template <typename T>
void decompressDataForType(const char * source, UInt32 source_size, char * dest, UInt32 output_size)
{
    if (source_size % sizeof(T) != 0)
        throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot GCD decompress, data size {} is not aligned to {}", source_size, sizeof(T));

    if (source_size < sizeof(T))
        throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot GCD decompress, data size {} is less than {}", source_size, sizeof(T));

    /// The payload must match the expected output exactly. A longer payload would overflow `dest`,
    /// a shorter one would leave the tail of `dest` unwritten. Checking once up front also avoids
    /// writing a partial result before the error is detected.
    const size_t payload_size = source_size - sizeof(T);
    if (payload_size != output_size)
        throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot GCD decompress, payload size {} does not match the expected output size {}", payload_size, output_size);

    const T gcd_multiplier = unalignedLoad<T>(source);
    const char * const source_end = source + source_size;

    for (const char * cur_source = source + sizeof(T); cur_source < source_end; cur_source += sizeof(T), dest += sizeof(T))
        unalignedStore<T>(dest, unalignedLoad<T>(cur_source) * gcd_multiplier);
}

}

/// Compresses `source_size` bytes from `source` into `dest` and returns the number of bytes written.
///
/// Output layout: byte 0 is the value width, byte 1 is the number of leading bytes copied verbatim
/// (source_size modulo the value width), then those bytes, then the output of compressDataForType
/// for the remaining, width-aligned part of the input.
/// Throws CANNOT_COMPRESS if the codec was constructed with an unsupported value width.
UInt32 CompressionCodecGCD::doCompressData(const char * source, UInt32 source_size, char * dest) const
{
    /// The constructor accepts any UInt8. Reject unsupported widths before they are used as a modulus
    /// (a width of 0 would divide by zero) or fall through the switch below and silently emit only a header.
    if (!(gcd_bytes_size == 1 || gcd_bytes_size == 2 || gcd_bytes_size == 4 || gcd_bytes_size == 8 || gcd_bytes_size == 16 || gcd_bytes_size == 32))
        throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot GCD compress, unsupported value size {}", static_cast<UInt32>(gcd_bytes_size));

    UInt8 bytes_to_skip = source_size % gcd_bytes_size;
    dest[0] = gcd_bytes_size;
    dest[1] = bytes_to_skip; /// unused (backward compatibility)
    memcpy(&dest[2], source, bytes_to_skip);

    const char * const aligned_source = &source[bytes_to_skip];
    const UInt32 aligned_size = source_size - bytes_to_skip;
    char * const aligned_dest = &dest[2 + bytes_to_skip];

    UInt32 result_size = 0;
    switch (gcd_bytes_size)
    {
        case 1:
            result_size = compressDataForType<UInt8>(aligned_source, aligned_size, aligned_dest);
            break;
        case 2:
            result_size = compressDataForType<UInt16>(aligned_source, aligned_size, aligned_dest);
            break;
        case 4:
            result_size = compressDataForType<UInt32>(aligned_source, aligned_size, aligned_dest);
            break;
        case 8:
            result_size = compressDataForType<UInt64>(aligned_source, aligned_size, aligned_dest);
            break;
        case 16:
            result_size = compressDataForType<UInt128>(aligned_source, aligned_size, aligned_dest);
            break;
        case 32:
            result_size = compressDataForType<UInt256>(aligned_source, aligned_size, aligned_dest);
            break;
    }
    return 2 + bytes_to_skip + result_size;
}

void CompressionCodecGCD::doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const
{
if (source_size < 2)
throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress. File has wrong header");

if (uncompressed_size == 0)
return;

UInt8 bytes_size = source[0];

if (!(bytes_size == 1 || bytes_size == 2 || bytes_size == 4 || bytes_size == 8 || bytes_size == 16 || bytes_size == 32))
throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress. File has wrong header");

UInt8 bytes_to_skip = uncompressed_size % bytes_size;
UInt32 output_size = uncompressed_size - bytes_to_skip;

if (static_cast<UInt32>(2 + bytes_to_skip) > source_size)
throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress. File has wrong header");

memcpy(dest, &source[2], bytes_to_skip);
UInt32 source_size_no_header = source_size - bytes_to_skip - 2;
switch (bytes_size)
{
case 1:
decompressDataForType<UInt8>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip], output_size);
break;
case 2:
decompressDataForType<UInt16>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip], output_size);
break;
case 4:
decompressDataForType<UInt32>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip], output_size);
break;
case 8:
decompressDataForType<UInt64>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip], output_size);
break;
case 16:
decompressDataForType<UInt128>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip], output_size);
break;
case 32:
decompressDataForType<UInt256>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip], output_size);
break;
}
}

namespace
{

/// Returns the in-memory size of one value of `column_type` as the value width for the GCD codec.
/// Throws BAD_ARGUMENTS if the type is not of fixed size or its size is not 1, 2, 4, 8, 16 or 32 bytes.
UInt8 getGCDBytesSize(const IDataType * column_type)
{
    if (!column_type->isValueUnambiguouslyRepresentedInFixedSizeContiguousMemoryRegion())
        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Codec GCD is not applicable for {} because the data type is not of fixed size",
            column_type->getName());

    const size_t value_size = column_type->getSizeOfValueInMemory();
    switch (value_size)
    {
        case 1:
        case 2:
        case 4:
        case 8:
        case 16:
        case 32:
            return static_cast<UInt8>(value_size);
        default:
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Codec GCD is only applicable for data types of size 1, 2, 4, 8, 16, 32 bytes. Given type {}",
                column_type->getName());
    }
}

}

/// Registers the "GCD" codec in `factory` under method byte CompressionMethodByte::GCD.
///
/// The registered builder accepts no codec arguments (throws ILLEGAL_SYNTAX_FOR_CODEC_TYPE otherwise)
/// and derives the value width from the column type when one is given.
void registerCodecGCD(CompressionCodecFactory & factory)
{
    const UInt8 method_code = static_cast<UInt8>(CompressionMethodByte::GCD);

    /// The builder is handed over to the factory and uses no local variables, so it captures nothing.
    /// A by-reference default capture here would invite dangling references if a local were ever used.
    auto codec_builder = [](const ASTPtr & arguments, const IDataType * column_type) -> CompressionCodecPtr
    {
        if (arguments && !arguments->children.empty())
            throw Exception(ErrorCodes::ILLEGAL_SYNTAX_FOR_CODEC_TYPE, "GCD codec must have 0 parameters, given {}", arguments->children.size());

        /// Default bytes size is 1; it is used when no column type is available.
        UInt8 gcd_bytes_size = 1;
        if (column_type)
            gcd_bytes_size = getGCDBytesSize(column_type);

        return std::make_shared<CompressionCodecGCD>(gcd_bytes_size);
    };
    factory.registerCompressionCodecWithType("GCD", method_code, codec_builder);
}

/// Creates a GCD codec for values of `gcd_bytes_size` bytes without going through the codec factory.
CompressionCodecPtr getCompressionCodecGCD(UInt8 gcd_bytes_size)
{
    CompressionCodecPtr codec = std::make_shared<CompressionCodecGCD>(gcd_bytes_size);
    return codec;
}

}
2 changes: 2 additions & 0 deletions src/Compression/CompressionFactory.cpp
Expand Up @@ -179,6 +179,7 @@ void registerCodecDoubleDelta(CompressionCodecFactory & factory);
void registerCodecGorilla(CompressionCodecFactory & factory);
void registerCodecEncrypted(CompressionCodecFactory & factory);
void registerCodecFPC(CompressionCodecFactory & factory);
void registerCodecGCD(CompressionCodecFactory & factory);
seshWCS marked this conversation as resolved.
Show resolved Hide resolved
#endif

CompressionCodecFactory::CompressionCodecFactory()
Expand All @@ -195,6 +196,7 @@ CompressionCodecFactory::CompressionCodecFactory()
registerCodecGorilla(*this);
registerCodecEncrypted(*this);
registerCodecFPC(*this);
registerCodecGCD(*this);
seshWCS marked this conversation as resolved.
Show resolved Hide resolved
#ifdef ENABLE_QPL_COMPRESSION
registerCodecDeflateQpl(*this);
#endif
Expand Down
1 change: 1 addition & 0 deletions src/Compression/CompressionInfo.h
Expand Up @@ -47,6 +47,7 @@ enum class CompressionMethodByte : uint8_t
AES_256_GCM_SIV = 0x97,
FPC = 0x98,
DeflateQpl = 0x99,
GCD = 0x9a,
};

}
3 changes: 3 additions & 0 deletions src/Compression/fuzzers/CMakeLists.txt
Expand Up @@ -18,3 +18,6 @@ target_link_libraries (double_delta_decompress_fuzzer PRIVATE dbms)

clickhouse_add_executable (encrypted_decompress_fuzzer encrypted_decompress_fuzzer.cpp)
target_link_libraries (encrypted_decompress_fuzzer PRIVATE dbms)

clickhouse_add_executable (gcd_decompress_fuzzer gcd_decompress_fuzzer.cpp)
target_link_libraries (gcd_decompress_fuzzer PRIVATE dbms)
45 changes: 45 additions & 0 deletions src/Compression/fuzzers/gcd_decompress_fuzzer.cpp
@@ -0,0 +1,45 @@
#include <iostream>
rschu1ze marked this conversation as resolved.
Show resolved Hide resolved
#include <string>

#include <Compression/ICompressionCodec.h>
#include <IO/BufferWithOwnMemory.h>
#include "base/types.h"

namespace DB
{
CompressionCodecPtr getCompressionCodecGCD(UInt8 gcd_bytes_size);
}

/// Parameters taken from the beginning of the fuzzer input. The harness below reinterprets the first
/// sizeof(AuxiliaryRandomData) input bytes as this struct, so the field order and types define
/// how those bytes are interpreted.
struct AuxiliaryRandomData
{
/// Value width passed to DB::getCompressionCodecGCD().
UInt8 gcd_size_bytes;
/// Requested size of the output buffer; the harness reduces it modulo 65536.
size_t decompressed_size;
};

/// Fuzzer entry point: the input starts with an AuxiliaryRandomData header, the remaining bytes are
/// fed to the GCD codec as compressed data. Returns 0 normally and 1 if anything throws.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size)
try
{
    constexpr size_t header_size = sizeof(AuxiliaryRandomData);

    /// Inputs too short to contain the header are ignored.
    if (size < header_size)
        return 0;

    const auto * header = reinterpret_cast<const AuxiliaryRandomData *>(data);
    auto codec = DB::getCompressionCodecGCD(header->gcd_size_bytes);

    /// Keep the output buffer below 64 KiB regardless of the requested size.
    const size_t output_buffer_size = header->decompressed_size % 65536;

    /// Everything after the header is the compressed payload.
    const char * const payload = reinterpret_cast<const char *>(data + header_size);
    const size_t payload_size = size - header_size;

    DB::Memory<> memory;
    memory.resize(output_buffer_size + codec->getAdditionalSizeAtTheEndOfBuffer());

    codec->doDecompressData(payload, static_cast<UInt32>(payload_size), memory.data(), static_cast<UInt32>(output_buffer_size));

    return 0;
}
catch (...)
{
    return 1;
}
@@ -0,0 +1 @@
0
15 changes: 15 additions & 0 deletions tests/queries/0_stateless/02868_gcd_codec_test_data.sql
@@ -0,0 +1,15 @@
-- Round-trip check for the GCD codec on UInt256: the same random rows are stored once with
-- CODEC(LZ4) and once with CODEC(GCD, LZ4); the final query counts rows whose values differ.

DROP TABLE IF EXISTS table_none;
CREATE TABLE table_none (id UInt64, ui UInt256 CODEC(LZ4)) ENGINE = Memory;
INSERT INTO table_none SELECT * FROM generateRandom() LIMIT 50;

DROP TABLE IF EXISTS table_gcd_codec;
CREATE TABLE table_gcd_codec (id UInt64, ui UInt256 CODEC(GCD, LZ4)) ENGINE = Memory;
INSERT INTO table_gcd_codec SELECT * FROM table_none;

SELECT COUNT(*)
FROM (
    SELECT table_none.id, table_none.ui AS ui1, table_gcd_codec.id, table_gcd_codec.ui AS ui2
    FROM table_none
    JOIN table_gcd_codec ON table_none.id = table_gcd_codec.id
)
WHERE ui1 != ui2;

-- Clean up so the test does not leave tables behind.
DROP TABLE table_gcd_codec;
DROP TABLE table_none;
Empty file.