ClickHouse · rschu1ze · Sep 5, 2023 · Aug 7, 2023 · Aug 7, 2023 · Aug 7, 2023
diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md
@@ -401,6 +401,10 @@ These codecs are designed to make compression more effective by using specific f
 
 `DoubleDelta(bytes_size)` — Calculates delta of deltas and writes it in compact binary form. Possible `bytes_size` values: 1, 2, 4, 8, the default value is `sizeof(type)` if equal to 1, 2, 4, or 8. In all other cases, it’s 1. Optimal compression rates are achieved for monotonic sequences with a constant stride, such as time series data. Can be used with any fixed-width type. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. Uses 1 extra bit for 32-bit deltas: 5-bit prefixes instead of 4-bit prefixes. For additional information, see Compressing Time Stamps in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf).
 
+#### GCD
+
+`GCD()` — Calculates the greatest common divisor (GCD) of all values in the column, then divides each value by the GCD. This codec is for data preparation and is not suitable for use without an additional codec. The GCD codec can be used with Integer, Decimal and DateTime columns. A good use case is storing timestamps or monetary values with high precision.
+
 #### Gorilla
 
 `Gorilla(bytes_size)` — Calculates XOR between current and previous floating point value and writes it in compact binary form. The smaller the difference between consecutive values is, i.e. the slower the values of the series changes, the better the compression rate. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. Possible `bytes_size` values: 1, 2, 4, 8, the default value is `sizeof(type)` if equal to 1, 2, 4, or 8. In all other cases, it’s 1. For additional information, see section 4.1 in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](https://doi.org/10.14778/2824032.2824078).

diff --git a/docs/ru/sql-reference/statements/create/table.md b/docs/ru/sql-reference/statements/create/table.md
@@ -240,6 +240,7 @@ ClickHouse поддерживает кодеки общего назначени
 
 -   `Delta(delta_bytes)` — Метод, в котором исходные значения заменяются разностью двух соседних значений, за исключением первого значения, которое остаётся неизменным. Для хранения разниц используется до `delta_bytes`, т.е. `delta_bytes` — это максимальный размер исходных данных. Возможные значения `delta_bytes`: 1, 2, 4, 8. Значение по умолчанию для `delta_bytes` равно `sizeof(type)`, если результат 1, 2, 4, or 8. Во всех других случаях — 1.
 -   `DoubleDelta` — Вычисляется разницу от разниц и сохраняет её в компакном бинарном виде. Оптимальная степень сжатия достигается для монотонных последовательностей с постоянным шагом, наподобие временных рядов. Можно использовать с любым типом данных фиксированного размера. Реализует алгоритм, используемый в TSDB Gorilla, поддерживает 64-битные типы данных. Использует 1 дополнительный бит для 32-байтовых значений: 5-битные префиксы вместо 4-битных префиксов. Подробнее читайте в разделе «Compressing Time Stamps» документа [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf).
+-   `GCD` — Вычисляет НОД всех чисел, а затем делит их на него. Этот кодек предназначен для подготовки данных и не подходит для использования без дополнительного кодека. GCD-кодек может использоваться с Integer, Decimal и DateTime. Хорошим вариантом использования было бы хранение временных меток или денежных значений с высокой точностью.
 -   `Gorilla` — Вычисляет XOR между текущим и предыдущим значением и записывает результат в компактной бинарной форме. Еффективно сохраняет ряды медленно изменяющихся чисел с плавающей запятой, поскольку наилучший коэффициент сжатия достигается, если соседние значения одинаковые. Реализует алгоритм, используемый в TSDB Gorilla, адаптируя его для работы с 64-битными значениями. Подробнее читайте в разделе «Compressing Values» документа [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf).
 -   `T64` — Метод сжатия который обрезает неиспользуемые старшие биты целочисленных значений (включая `Enum`, `Date` и `DateTime`). На каждом шаге алгоритма, кодек помещает блок из 64 значений в матрицу 64✕64, транспонирует её, обрезает неиспользуемые биты, а то, что осталось возвращает в виде последовательности. Неиспользуемые биты, это биты, которые не изменяются от минимального к максимальному на всём диапазоне значений куска данных.
 

diff --git a/src/Compression/CompressionCodecGCD.cpp b/src/Compression/CompressionCodecGCD.cpp
@@ -0,0 +1,284 @@
+#include <Compression/ICompressionCodec.h>
+#include <Compression/CompressionInfo.h>
+#include <Compression/CompressionFactory.h>
+#include <base/unaligned.h>
+#include <Parsers/IAST.h>
+#include <Parsers/ASTLiteral.h>
+#include <Parsers/ASTFunction.h>
+#include <IO/WriteHelpers.h>
+#include "Common/Exception.h"
+#include "base/Decimal_fwd.h"
+#include "base/types.h"
+#include "config.h"
+
+#include <boost/math/common_factor_rt.hpp>
+#include <libdivide-config.h>
+#include <libdivide.h>
+
+
+namespace DB
+{
+/// Codec that stores the GCD of a block of fixed-width values followed by each value divided by that GCD.
+class CompressionCodecGCD : public ICompressionCodec
+{
+public:
+    explicit CompressionCodecGCD(UInt8 gcd_bytes_size_);
+    /// Returns CompressionMethodByte::GCD.
+    uint8_t getMethodByte() const override;
+    /// Feeds the codec description into `hash`.
+    void updateHash(SipHash & hash) const override;
+
+protected:
+    UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override;
+    void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const override;
+    UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const override;
+    /// Reports that this codec neither compresses by itself nor is a generic compression codec.
+    bool isCompression() const override { return false; }
+    bool isGenericCompression() const override { return false; }
+
+private:
+    const UInt8 gcd_bytes_size; /// Width in bytes of one value (expected: 1, 2, 4, 8, 16 or 32).
+};
+
+
+namespace ErrorCodes
+{
+    extern const int CANNOT_COMPRESS;
+    extern const int CANNOT_DECOMPRESS;
+    extern const int ILLEGAL_SYNTAX_FOR_CODEC_TYPE;
+    extern const int BAD_ARGUMENTS;
+}
+
+/// Upper bound of the encoded size: the payload plus the stored GCD, the leading unaligned bytes and the local header.
+UInt32 CompressionCodecGCD::getMaxCompressedDataSize(UInt32 uncompressed_size) const
+{
+    const UInt32 local_header_bytes = 2; // Local header
+    const UInt32 gcd_and_skip_bytes = 2 * static_cast<UInt32>(gcd_bytes_size); // To store gcd + max bytes_to_skip
+    return uncompressed_size + gcd_and_skip_bytes + local_header_bytes;
+}
+/// Stores the value width and sets the codec description to "GCD" with no arguments.
+CompressionCodecGCD::CompressionCodecGCD(UInt8 gcd_bytes_size_)
+    : gcd_bytes_size(gcd_bytes_size_)
+{
+    setCodecDescription("GCD", {});
+}
+/// Returns the method byte of this codec: CompressionMethodByte::GCD converted to uint8_t.
+uint8_t CompressionCodecGCD::getMethodByte() const
+{
+    return static_cast<uint8_t>(CompressionMethodByte::GCD);
+}
+/// Updates `hash` with the tree hash of the codec description; the value width is not hashed separately.
+void CompressionCodecGCD::updateHash(SipHash & hash) const
+{
+    getCodecDesc()->updateTreeHash(hash);
+}
+
+namespace
+{
+
+/// Encodes `source`, a sequence of values of type T, as the GCD of all values followed by every value divided by it.
+/// Output layout: [GCD : T][value_0 / GCD : T] ... [value_{n-1} / GCD : T]; `dest` must have room for all of it.
+/// `source` and `dest` may be unaligned: values are accessed with unalignedLoad / unalignedStore.
+/// Returns the number of bytes written to `dest`, i.e. sizeof(T) + source_size.
+/// Throws CANNOT_COMPRESS if source_size is not a multiple of sizeof(T).
+template <typename T>
+UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest)
+{
+    if (source_size % sizeof(T) != 0)
+        throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot GCD compress, data size {}  is not aligned to {}", source_size, sizeof(T));
+
+    const char * const source_end = source + source_size;
+
+    /// GCD of all values. It stays 0 for empty or all-zero input;
+    /// the scan stops early once it reaches 1 because it cannot get any smaller.
+    T gcd_divider{};
+    const auto * cur_source = source;
+    while (gcd_divider != T(1) && cur_source < source_end)
+    {
+        if (cur_source == source)
+            gcd_divider = unalignedLoad<T>(cur_source);
+        else
+            gcd_divider = boost::math::gcd(gcd_divider, unalignedLoad<T>(cur_source));
+        cur_source += sizeof(T);
+    }
+
+    /// The GCD goes first so that the decoder can multiply the quotients back.
+    unalignedStore<T>(dest, gcd_divider);
+    dest += sizeof(T);
+
+    if (gcd_divider == T(0) || gcd_divider == T(1))
+    {
+        /// Nothing to divide: a GCD of 0 means there are no values or every value is 0 (dividing by it would be
+        /// a division by zero), a GCD of 1 leaves the values unchanged. Copy them as is; multiplying by the
+        /// stored GCD on decoding restores the original values in both cases.
+        memcpy(dest, source, source_size);
+    }
+    else if constexpr (sizeof(T) <= 8)
+    {
+        /// libdivide support only UInt32 and UInt64.
+        /// The divider object is built once and reused for every value.
+        using TUInt32Or64 = std::conditional_t<sizeof(T) <= 4, UInt32, UInt64>;
+        libdivide::divider<TUInt32Or64> divider(static_cast<TUInt32Or64>(gcd_divider));
+        for (cur_source = source; cur_source < source_end; cur_source += sizeof(T), dest += sizeof(T))
+            unalignedStore<T>(dest, static_cast<T>(static_cast<TUInt32Or64>(unalignedLoad<T>(cur_source)) / divider));
+    }
+    else
+    {
+        /// Types wider than 8 bytes are divided with their own operator/.
+        for (cur_source = source; cur_source < source_end; cur_source += sizeof(T), dest += sizeof(T))
+            unalignedStore<T>(dest, unalignedLoad<T>(cur_source) / gcd_divider);
+    }
+
+    return static_cast<UInt32>(sizeof(T) + source_size);
+}
+
+/// Decodes data produced by compressDataForType<T>: reads the GCD, then writes every stored quotient times the GCD.
+/// Throws CANNOT_DECOMPRESS if the input is misaligned, too short, or does not decode to exactly output_size bytes.
+template <typename T>
+void decompressDataForType(const char * source, UInt32 source_size, char * dest, UInt32 output_size)
+{
+    if (source_size % sizeof(T) != 0)
+        throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot GCD decompress, data size {}  is not aligned to {}", source_size, sizeof(T));
+
+    if (source_size < sizeof(T))
+        throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot GCD decompress, data size {} is less than {}", source_size, sizeof(T));
+
+    /// The payload after the GCD must match the output exactly: a longer one would overflow `dest`,
+    /// a shorter one would silently leave the tail of `dest` unwritten.
+    if (source_size - sizeof(T) != output_size) [[unlikely]]
+        throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress the data");
+
+    const char * const source_end = source + source_size;
+    const T gcd_multiplier = unalignedLoad<T>(source);
+    source += sizeof(T);
+
+    /// For valid input quotient * GCD is the original value, so the product always fits into T.
+    for (; source < source_end; source += sizeof(T), dest += sizeof(T))
+        unalignedStore<T>(dest, unalignedLoad<T>(source) * gcd_multiplier);
+}
+
+}
+
+/// Writes a 2-byte local header (value width, number of leading bytes that do not form a whole value), copies
+/// those leading bytes verbatim and GCD-encodes the remaining values with the unsigned type of that width.
+/// Layout of the output: [width : 1 byte][bytes_to_skip : 1 byte][leading bytes][encoded values].
+/// Returns the total number of bytes written to `dest`.
+UInt32 CompressionCodecGCD::doCompressData(const char * source, UInt32 source_size, char * dest) const
+{
+    const UInt8 bytes_to_skip = source_size % gcd_bytes_size;
+    dest[0] = gcd_bytes_size;
+    dest[1] = bytes_to_skip; /// unused (backward compatibility)
+    memcpy(&dest[2], source, bytes_to_skip);
+
+    /// The part of the input that consists of whole values, and the place where its encoding goes.
+    const char * const values = source + bytes_to_skip;
+    const UInt32 values_size = source_size - bytes_to_skip;
+    char * const encoded = dest + 2 + bytes_to_skip;
+
+    /// Stays 0 when the width is none of the supported ones.
+    UInt32 result_size = 0;
+    switch (gcd_bytes_size)
+    {
+    case 1: result_size = compressDataForType<UInt8>(values, values_size, encoded); break;
+    case 2: result_size = compressDataForType<UInt16>(values, values_size, encoded); break;
+    case 4: result_size = compressDataForType<UInt32>(values, values_size, encoded); break;
+    case 8: result_size = compressDataForType<UInt64>(values, values_size, encoded); break;
+    case 16: result_size = compressDataForType<UInt128>(values, values_size, encoded); break;
+    case 32: result_size = compressDataForType<UInt256>(values, values_size, encoded); break;
+    default: break;
+    }
+
+    return 2 + bytes_to_skip + result_size;
+}
+
+/// Reverses doCompressData: checks the 2-byte local header, copies the leading bytes that do not form a whole
+/// value verbatim and decodes the remaining values with the unsigned type of the width stored in the header.
+/// Throws CANNOT_DECOMPRESS if the header is truncated or holds an unsupported value width.
+void CompressionCodecGCD::doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const
+{
+    if (source_size < 2)
+        throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress. File has wrong header");
+
+    if (uncompressed_size == 0)
+        return;
+
+    /// The value width is taken from the header rather than from gcd_bytes_size.
+    const UInt8 bytes_size = source[0];
+    if (bytes_size != 1 && bytes_size != 2 && bytes_size != 4
+        && bytes_size != 8 && bytes_size != 16 && bytes_size != 32)
+        throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress. File has wrong header");
+
+    /// The second header byte is not read; the number of leading bytes is recomputed instead.
+    const UInt8 bytes_to_skip = uncompressed_size % bytes_size;
+    const UInt32 output_size = uncompressed_size - bytes_to_skip;
+
+    if (static_cast<UInt32>(2 + bytes_to_skip) > source_size)
+        throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress. File has wrong header");
+
+    /// Leading bytes that do not form a whole value were stored verbatim right after the header.
+    memcpy(dest, &source[2], bytes_to_skip);
+
+    const char * const encoded = source + 2 + bytes_to_skip;
+    const UInt32 encoded_size = source_size - bytes_to_skip - 2;
+    char * const decoded = dest + bytes_to_skip;
+
+    switch (bytes_size)
+    {
+    case 1: decompressDataForType<UInt8>(encoded, encoded_size, decoded, output_size); break;
+    case 2: decompressDataForType<UInt16>(encoded, encoded_size, decoded, output_size); break;
+    case 4: decompressDataForType<UInt32>(encoded, encoded_size, decoded, output_size); break;
+    case 8: decompressDataForType<UInt64>(encoded, encoded_size, decoded, output_size); break;
+    case 16: decompressDataForType<UInt128>(encoded, encoded_size, decoded, output_size); break;
+    case 32: decompressDataForType<UInt256>(encoded, encoded_size, decoded, output_size); break;
+    /// Not reachable: the width was validated above.
+    default: break;
+    }
+}
+
+namespace
+{
+
+/// Returns the size in bytes of one value of `column_type`; throws BAD_ARGUMENTS if the GCD codec cannot handle the type.
+UInt8 getGCDBytesSize(const IDataType * column_type)
+{
+    if (!column_type->isValueUnambiguouslyRepresentedInFixedSizeContiguousMemoryRegion())
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Codec GCD is not applicable for {} because the data type is not of fixed size", column_type->getName());
+
+    const size_t value_size = column_type->getSizeOfValueInMemory();
+    const bool is_supported_size = value_size == 1 || value_size == 2 || value_size == 4 || value_size == 8 || value_size == 16 || value_size == 32;
+    if (!is_supported_size)
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Codec GCD is only applicable for data types of size 1, 2, 4, 8, 16, 32 bytes. Given type {}", column_type->getName());
+
+    return static_cast<UInt8>(value_size);
+}
+
+}
+
+/// Registers the "GCD" codec (method byte CompressionMethodByte::GCD) in `factory`.
+/// The codec accepts no arguments: the value width is derived from the column type and is 1 byte when no
+/// column type is given. Building it with arguments throws ILLEGAL_SYNTAX_FOR_CODEC_TYPE.
+void registerCodecGCD(CompressionCodecFactory & factory)
+{
+    auto codec_builder = [&](const ASTPtr & arguments, const IDataType * column_type) -> CompressionCodecPtr
+    {
+        /// Reject any explicit argument, e.g. GCD(4).
+        const bool has_arguments = arguments && !arguments->children.empty();
+        if (has_arguments)
+            throw Exception(ErrorCodes::ILLEGAL_SYNTAX_FOR_CODEC_TYPE, "GCD codec must have 0 parameters, given {}", arguments->children.size());
+
+        /// Default bytes size is 1.
+        const UInt8 gcd_bytes_size = column_type ? getGCDBytesSize(column_type) : static_cast<UInt8>(1);
+
+        return std::make_shared<CompressionCodecGCD>(gcd_bytes_size);
+    };
+
+    const auto method_code = static_cast<UInt8>(CompressionMethodByte::GCD);
+    factory.registerCompressionCodecWithType("GCD", method_code, codec_builder);
+}
+/// Creates a GCD codec for the given value width without going through the factory; gcd_decompress_fuzzer.cpp declares and calls it.
+CompressionCodecPtr getCompressionCodecGCD(UInt8 gcd_bytes_size)
+{
+    return std::make_shared<CompressionCodecGCD>(gcd_bytes_size);
+}
+
+}
diff --git a/src/Compression/CompressionFactory.cpp b/src/Compression/CompressionFactory.cpp
@@ -179,6 +179,7 @@ void registerCodecDoubleDelta(CompressionCodecFactory & factory);
 void registerCodecGorilla(CompressionCodecFactory & factory);
 void registerCodecEncrypted(CompressionCodecFactory & factory);
 void registerCodecFPC(CompressionCodecFactory & factory);
+void registerCodecGCD(CompressionCodecFactory & factory);
 #endif
 
 CompressionCodecFactory::CompressionCodecFactory()
@@ -195,6 +196,7 @@ CompressionCodecFactory::CompressionCodecFactory()
     registerCodecGorilla(*this);
     registerCodecEncrypted(*this);
     registerCodecFPC(*this);
+    registerCodecGCD(*this);
 #ifdef ENABLE_QPL_COMPRESSION
     registerCodecDeflateQpl(*this);
 #endif

diff --git a/src/Compression/CompressionInfo.h b/src/Compression/CompressionInfo.h
@@ -47,6 +47,7 @@ enum class CompressionMethodByte : uint8_t
     AES_256_GCM_SIV = 0x97,
     FPC             = 0x98,
     DeflateQpl      = 0x99,
+    GCD             = 0x9a,
 };
 
 }
diff --git a/src/Compression/fuzzers/CMakeLists.txt b/src/Compression/fuzzers/CMakeLists.txt
@@ -18,3 +18,6 @@ target_link_libraries (double_delta_decompress_fuzzer PRIVATE dbms)
 
 clickhouse_add_executable (encrypted_decompress_fuzzer encrypted_decompress_fuzzer.cpp)
 target_link_libraries (encrypted_decompress_fuzzer PRIVATE dbms)
+
+clickhouse_add_executable (gcd_decompress_fuzzer gcd_decompress_fuzzer.cpp)
+target_link_libraries (gcd_decompress_fuzzer PRIVATE dbms)
diff --git a/src/Compression/fuzzers/gcd_decompress_fuzzer.cpp b/src/Compression/fuzzers/gcd_decompress_fuzzer.cpp
@@ -0,0 +1,45 @@
+#include <iostream>
+#include <string>
+
+#include <Compression/ICompressionCodec.h>
+#include <IO/BufferWithOwnMemory.h>
+#include "base/types.h"
+
+namespace DB
+{
+    CompressionCodecPtr getCompressionCodecGCD(UInt8 gcd_bytes_size);
+}
+/// Prefix of every fuzz input; the bytes after it are passed to the codec as the data to decompress.
+struct AuxiliaryRandomData
+{
+    UInt8 gcd_size_bytes; /// Passed to getCompressionCodecGCD() as the value width.
+    size_t decompressed_size; /// Taken modulo 65536 to get the output buffer size.
+};
+/// Fuzzer entry point: feeds arbitrary bytes to the GCD codec's doDecompressData. Returns 1 if anything throws, 0 otherwise.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size)
+try
+{
+    if (size < sizeof(AuxiliaryRandomData))
+        return 0;
+    /// The leading bytes of the input are interpreted as AuxiliaryRandomData.
+    const auto * p = reinterpret_cast<const AuxiliaryRandomData *>(data);
+    auto codec = DB::getCompressionCodecGCD(p->gcd_size_bytes);
+    /// Keep the output buffer below 64 KiB, then step over the auxiliary prefix.
+    size_t output_buffer_size = p->decompressed_size % 65536;
+    size -= sizeof(AuxiliaryRandomData);
+    data += sizeof(AuxiliaryRandomData) / sizeof(uint8_t);
+
+    // std::string input = std::string(reinterpret_cast<const char*>(data), size);
+    // fmt::print(stderr, "Using input {} of size {}, output size is {}. \n", input, size, output_buffer_size);
+
+    DB::Memory<> memory;
+    memory.resize(output_buffer_size + codec->getAdditionalSizeAtTheEndOfBuffer());
+    /// Everything after the auxiliary prefix is treated as the encoded payload.
+    codec->doDecompressData(reinterpret_cast<const char *>(data), static_cast<UInt32>(size), memory.data(), static_cast<UInt32>(output_buffer_size));
+
+    return 0;
+}
+catch (...)
+{
+    return 1;
+}
diff --git a/tests/queries/0_stateless/02868_gcd_codec_test_data.reference b/tests/queries/0_stateless/02868_gcd_codec_test_data.reference
@@ -0,0 +1 @@
+0
diff --git a/tests/queries/0_stateless/02868_gcd_codec_test_data.sql b/tests/queries/0_stateless/02868_gcd_codec_test_data.sql
@@ -0,0 +1,15 @@
+DROP TABLE IF EXISTS table_none;
+CREATE TABLE table_none (id UInt64, ui UInt256 CODEC(LZ4)) ENGINE = Memory;
+INSERT INTO table_none SELECT * FROM generateRandom() LIMIT 50;
+-- The Memory engine keeps data uncompressed, so the table under test must be MergeTree for the codec to run.
+DROP TABLE IF EXISTS table_gcd_codec;
+CREATE TABLE table_gcd_codec (id UInt64, ui UInt256 CODEC(GCD, LZ4)) ENGINE = MergeTree ORDER BY id;
+INSERT INTO table_gcd_codec SELECT * FROM table_none;
+-- Every value must survive the round trip through the codec: no joined pair may differ (expected result: 0).
+SELECT COUNT(*)
+FROM (
+    SELECT table_none.id, table_none.ui AS ui1, table_gcd_codec.id, table_gcd_codec.ui AS ui2
+    FROM table_none
+    JOIN table_gcd_codec ON table_none.id = table_gcd_codec.id
+)
+WHERE ui1 != ui2;
diff --git a/tests/queries/0_stateless/02869_gcd_codec_test_incorrect_type.reference b/tests/queries/0_stateless/02869_gcd_codec_test_incorrect_type.reference