From 42c5aa783da410a8fe0f879b58d07d1e4bde9726 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 24 Oct 2024 16:05:19 -0300 Subject: [PATCH 1/7] draft --- .../Impl/Parquet/ParquetDataValuesReader.cpp | 43 +++++++++++++++++++ .../Impl/Parquet/ParquetDataValuesReader.h | 21 +++++++++ .../Impl/Parquet/ParquetLeafColReader.cpp | 23 ++++++++++ .../Impl/Parquet/ParquetRecordReader.cpp | 2 +- 4 files changed, 88 insertions(+), 1 deletion(-) diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp index b8e4db8700cc..977f2ad298b4 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp @@ -296,6 +296,40 @@ void ParquetPlainValuesReader::readBatch( ); } +template <> +void ParquetBitPlainReader::readBatch( + MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) +{ + auto & column = *assert_cast(col_ptr.get()); + auto cursor = column.size(); + + auto & container = column.getData(); + + container.resize(cursor + num_values); + + def_level_reader->visitNullableValues( + cursor, + num_values, + max_def_level, + null_map, + /* individual_visitor */ [&](size_t nest_cursor) + { + uint8_t byte; + bit_reader->GetValue(1, &byte); + container[nest_cursor] = byte; + }, + /* repeated_visitor */ [&](size_t nest_cursor, UInt32 count) + { + for (UInt32 i = 0; i < count; i++) + { + uint8_t byte; + bit_reader->GetValue(1, &byte); + container[nest_cursor++] = byte; + } + } + ); +} + template <> void ParquetPlainValuesReader, ParquetReaderTypes::TimestampInt96>::readBatch( @@ -515,6 +549,13 @@ void ParquetRleDictReader::readBatch( ); } +template <> +void ParquetRleDictReader::readBatch( + MutableColumnPtr & , LazyNullMap &, UInt32) +{ + assert(false); +} + template void ParquetRleDictReader::readBatch( MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) @@ -561,6 +602,7 @@ template class ParquetPlainValuesReader>; template class ParquetPlainValuesReader>; template class ParquetPlainValuesReader>; template class ParquetPlainValuesReader; +template class ParquetPlainValuesReader; template class ParquetFixedLenPlainReader>; template class ParquetFixedLenPlainReader>; @@ -569,6 +611,7 @@ template class ParquetRleLCReader; template class ParquetRleLCReader; template class ParquetRleLCReader; +template class ParquetRleDictReader; template class ParquetRleDictReader; template class ParquetRleDictReader; template class ParquetRleDictReader; diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h index fbccb612b3c0..db55f7e2d6ab 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h @@ -172,6 +172,27 @@ class ParquetPlainValuesReader : public ParquetDataValuesReader ParquetDataBuffer plain_data_buffer; }; +template +class ParquetBitPlainReader : public ParquetDataValuesReader +{ +public: + ParquetBitPlainReader( + Int32 max_def_level_, + std::unique_ptr def_level_reader_, + std::unique_ptr bit_reader_) + : max_def_level(max_def_level_) + , def_level_reader(std::move(def_level_reader_)) + , bit_reader(std::move(bit_reader_)) + {} + + void readBatch(MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) override; + +private: + Int32 max_def_level; + std::unique_ptr def_level_reader; + std::unique_ptr bit_reader; +}; + /** * The data and definition level encoding are same as ParquetPlainValuesReader. * But the element size is const and bigger than primitive data type. diff --git a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp index 9e1cae9bb657..ef35502f934b 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp @@ -496,6 +496,28 @@ void ParquetLeafColReader::readPageV1(const parquet::DataPageV1 & page) } } +template <> +void ParquetLeafColReader::initDataReader( + parquet::Encoding::type enconding_type, + const uint8_t * buffer, + std::size_t max_size, + std::unique_ptr && def_level_reader) +{ + switch (enconding_type) + { + case parquet::Encoding::PLAIN: + { + auto bit_reader = std::make_unique(buffer, max_size); + data_values_reader = std::make_unique>(col_descriptor.max_definition_level(), + std::move(def_level_reader), + std::move(bit_reader)); + break; + } + default: + throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Unknown encoding type: {}", enconding_type); + } +} + template void ParquetLeafColReader::readPageV2(const parquet::DataPageV2 & /*page*/) { @@ -526,6 +548,7 @@ std::unique_ptr ParquetLeafColReader::createDi } +template class ParquetLeafColReader; template class ParquetLeafColReader; template class ParquetLeafColReader; template class ParquetLeafColReader; diff --git a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp index 18a0db7484ec..74107df4e26b 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp @@ -265,7 +265,7 @@ std::unique_ptr ColReaderFactory::makeReader() switch (col_descriptor.physical_type()) { case parquet::Type::BOOLEAN: - break; + return makeLeafReader(); case parquet::Type::INT32: return fromInt32(); case parquet::Type::INT64: From 17d95ba5b6a95b9259fe1b9d004bfc32987881de Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 25 Oct 2024 09:37:22 -0300 Subject: [PATCH 2/7] tests --- ...03254_parquet_bool_native_reader.reference | 20 +++++++++++++++++ .../03254_parquet_bool_native_reader.sh | 21 ++++++++++++++++++ .../0_stateless/data_parquet/nullbool.parquet | Bin 0 -> 508 bytes 3 files changed, 41 insertions(+) create mode 100644 tests/queries/0_stateless/03254_parquet_bool_native_reader.reference create mode 100755 tests/queries/0_stateless/03254_parquet_bool_native_reader.sh create mode 100644 tests/queries/0_stateless/data_parquet/nullbool.parquet diff --git a/tests/queries/0_stateless/03254_parquet_bool_native_reader.reference b/tests/queries/0_stateless/03254_parquet_bool_native_reader.reference new file mode 100644 index 000000000000..0c7e55ad2344 --- /dev/null +++ b/tests/queries/0_stateless/03254_parquet_bool_native_reader.reference @@ -0,0 +1,20 @@ +0 false +1 \N +2 false +3 \N +4 false +5 \N +6 false +7 \N +8 true +9 \N +0 false +1 \N +2 false +3 \N +4 false +5 \N +6 false +7 \N +8 true +9 \N diff --git a/tests/queries/0_stateless/03254_parquet_bool_native_reader.sh b/tests/queries/0_stateless/03254_parquet_bool_native_reader.sh new file mode 100755 index 000000000000..c28523b3c549 --- /dev/null +++ b/tests/queries/0_stateless/03254_parquet_bool_native_reader.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# Tags: no-ubsan, no-fasttest + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +USER_FILES_PATH=$($CLICKHOUSE_CLIENT_BINARY --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') + +WORKING_DIR="${USER_FILES_PATH}/${CLICKHOUSE_TEST_UNIQUE_NAME}" + +mkdir -p "${WORKING_DIR}" + +DATA_FILE="${CUR_DIR}/data_parquet/nullbool.parquet" + +DATA_FILE_USER_PATH="${WORKING_DIR}/nullbool.parquet" + +cp ${DATA_FILE} ${DATA_FILE_USER_PATH} + +${CLICKHOUSE_CLIENT} --query="select id, bool from file('${DATA_FILE_USER_PATH}', Parquet) order by id SETTINGS input_format_parquet_use_native_reader=false;" +${CLICKHOUSE_CLIENT} --query="select id, bool from file('${DATA_FILE_USER_PATH}', Parquet) order by id SETTINGS input_format_parquet_use_native_reader=true;" diff --git a/tests/queries/0_stateless/data_parquet/nullbool.parquet b/tests/queries/0_stateless/data_parquet/nullbool.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d9b365bbe75bcd69ccee52c5eddffd3aa17b62cb GIT binary patch literal 508 zcmZ8e%}T>S5T1>-#va5%cE~~wy)?8?OJizlBow@fry}?OsadelSWT@zm!8Cz@Z<}4 z_TWi;1>eEDM?w5HxtZN>Co|v7>^JQA@Fb8V$5=m456@ekbl}?3rs5MgEnp3(0P8>% z*Z@q*CV>2HtnaVa&&{&DYRXN?`l;AfbxXOXmSBo}OED>Su&E#g7$GFWH52x0HBuhi za>b^|<3}M!<`)_9k)7Qy&dzm~$O-~Ya;<3!2~EqbOy=_$p@FA5zU?8qNwzY)M3h&& z^6j{kQ0if76@p3+H(?U=q_|y*BYv@@!yiH(kpCz^t39fsWpPu{bi^YtG32{~xYjD{ zzfNWz;&)3j{|eb7eiQ8YHjHD&bL+SH^jhcLY@X^__ae!(yP@xr>~f~bJ-$rxtEIl) z$@0dH&KJ}9MI-12{cg}`O_tMH+K7fa)zM%-91i=vUK*ssNQXf*7=(jK5~ezgcHvEu Qw(599*mQ(f9pl~q08?vBH2?qr literal 0 HcmV?d00001 From afbe322dfcab82a4aee072d878f369ec459a5292 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 28 Oct 2024 18:06:16 -0300 Subject: [PATCH 3/7] progress --- .../Impl/Parquet/ParquetDataValuesReader.cpp | 7 --- .../Impl/Parquet/ParquetLeafColReader.cpp | 59 +++++++++---------- 2 files changed, 28 insertions(+), 38 deletions(-) diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp index 977f2ad298b4..9a79bcffad3d 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp @@ -549,13 +549,6 @@ void ParquetRleDictReader::readBatch( ); } -template <> -void ParquetRleDictReader::readBatch( - MutableColumnPtr & , LazyNullMap &, UInt32) -{ - assert(false); -} - template void ParquetRleDictReader::readBatch( MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) diff --git a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp index ef35502f934b..2dcd01737ed3 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp @@ -458,16 +458,29 @@ void ParquetLeafColReader::readPageV1(const parquet::DataPageV1 & page) degradeDictionary(); } - ParquetDataBuffer parquet_buffer = [&]() + if (col_descriptor.physical_type() == parquet::Type::BOOLEAN) { - if constexpr (!std::is_same_v, TColumn>) - return ParquetDataBuffer(buffer, max_size); - - auto scale = assert_cast(*base_data_type).getScale(); - return ParquetDataBuffer(buffer, max_size, scale); - }(); - data_values_reader = createPlainReader( - col_descriptor, std::move(def_level_reader), std::move(parquet_buffer)); + if constexpr (std::is_same_v) + { + auto bit_reader = std::make_unique(buffer, max_size); + data_values_reader = std::make_unique>(col_descriptor.max_definition_level(), + std::move(def_level_reader), + std::move(bit_reader)); + } + } + else + { + ParquetDataBuffer parquet_buffer = [&]() + { + if constexpr (!std::is_same_v, TColumn>) + return ParquetDataBuffer(buffer, max_size); + + auto scale = assert_cast(*base_data_type).getScale(); + return ParquetDataBuffer(buffer, max_size, scale); + }(); + data_values_reader = createPlainReader( + col_descriptor, std::move(def_level_reader), std::move(parquet_buffer)); + } break; } case parquet::Encoding::RLE_DICTIONARY: @@ -496,28 +509,6 @@ void ParquetLeafColReader::readPageV1(const parquet::DataPageV1 & page) } } -template <> -void ParquetLeafColReader::initDataReader( - parquet::Encoding::type enconding_type, - const uint8_t * buffer, - std::size_t max_size, - std::unique_ptr && def_level_reader) -{ - switch (enconding_type) - { - case parquet::Encoding::PLAIN: - { - auto bit_reader = std::make_unique(buffer, max_size); - data_values_reader = std::make_unique>(col_descriptor.max_definition_level(), - std::move(def_level_reader), - std::move(bit_reader)); - break; - } - default: - throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Unknown encoding type: {}", enconding_type); - } -} - template void ParquetLeafColReader::readPageV2(const parquet::DataPageV2 & /*page*/) { @@ -540,6 +531,12 @@ std::unique_ptr ParquetLeafColReader::createDi }); return res; } + + if (col_descriptor.physical_type() == parquet::Type::type::BOOLEAN) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Dictionary encoding for booleans is not supported"); + } + return std::make_unique>( col_descriptor.max_definition_level(), std::move(def_level_reader), From b2bccffb406f13b8f9653ba298c1438296201a6c Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 29 Oct 2024 09:30:44 -0300 Subject: [PATCH 4/7] getbatch --- .../Formats/Impl/Parquet/ParquetDataValuesReader.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp index 9a79bcffad3d..fa38a24fd3cd 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp @@ -320,12 +320,7 @@ void ParquetBitPlainReader::readBatch( }, /* repeated_visitor */ [&](size_t nest_cursor, UInt32 count) { - for (UInt32 i = 0; i < count; i++) - { - uint8_t byte; - bit_reader->GetValue(1, &byte); - container[nest_cursor++] = byte; - } + bit_reader->GetBatch(1, &container[nest_cursor], count); } ); } From 0babc4fee7181e4f230668d049ded06ab69290e2 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 30 Oct 2024 16:18:43 -0300 Subject: [PATCH 5/7] fix msan issue --- .../Impl/Parquet/ParquetDataValuesReader.cpp | 18 +++---- .../Impl/Parquet/ParquetFilterCondition.cpp | 5 ++ .../Impl/Parquet/ParquetFilterCondition.h | 49 +++++++++++++++++++ 3 files changed, 62 insertions(+), 10 deletions(-) create mode 100644 src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.cpp create mode 100644 src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.h diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp index fa38a24fd3cd..b471989076b4 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp @@ -296,16 +296,12 @@ void ParquetPlainValuesReader::readBatch( ); } -template <> -void ParquetBitPlainReader::readBatch( +template +void ParquetBitPlainReader::readBatch( MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) { - auto & column = *assert_cast(col_ptr.get()); - auto cursor = column.size(); - - auto & container = column.getData(); - - container.resize(cursor + num_values); + auto cursor = col_ptr->size(); + auto * column_data = getResizedPrimitiveData(*assert_cast(col_ptr.get()), cursor + num_values); def_level_reader->visitNullableValues( cursor, @@ -316,11 +312,11 @@ void ParquetBitPlainReader::readBatch( { uint8_t byte; bit_reader->GetValue(1, &byte); - container[nest_cursor] = byte; + column_data[nest_cursor] = byte; }, /* repeated_visitor */ [&](size_t nest_cursor, UInt32 count) { - bit_reader->GetBatch(1, &container[nest_cursor], count); + bit_reader->GetBatch(1, &column_data[nest_cursor], count); } ); } @@ -592,6 +588,8 @@ template class ParquetPlainValuesReader>; template class ParquetPlainValuesReader; template class ParquetPlainValuesReader; +template class ParquetBitPlainReader; + template class ParquetFixedLenPlainReader>; template class ParquetFixedLenPlainReader>; diff --git a/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.cpp b/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.cpp new file mode 100644 index 000000000000..27be594d3c24 --- /dev/null +++ b/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.cpp @@ -0,0 +1,5 @@ +// +// Created by laptop on 10/29/24. +// + +#include "ParquetFilterCondition.h" diff --git a/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.h b/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.h new file mode 100644 index 000000000000..a09eaa9ced0e --- /dev/null +++ b/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.h @@ -0,0 +1,49 @@ +#pragma once + +#include + +#if USE_PARQUET + +#include + +namespace DB +{ + +class ParquetFilterCondition +{ + struct ConditionElement + { + enum Function + { + /// Atoms of a Boolean expression. + FUNCTION_EQUALS, + FUNCTION_NOT_EQUALS, + FUNCTION_IN, + FUNCTION_NOT_IN, + /// Can take any value. + FUNCTION_UNKNOWN, + /// Operators of the logical expression. + FUNCTION_NOT, + FUNCTION_AND, + FUNCTION_OR, + /// Constants + ALWAYS_FALSE, + ALWAYS_TRUE, + }; + + using ColumnPtr = IColumn::Ptr; + using HashesForColumns = std::vector>; + using KeyColumns = std::vector; + + Function function; + // each entry represents a list of hashes per column + // suppose there are three columns with 2 rows each + // hashes_per_column.size() == 3 and hashes_per_column[0].size() == 2 + HashesForColumns hashes_per_column; + KeyColumns key_columns; + }; +}; + +} + +#endif From 912373b81c5829724af891c7ff58b0a3f0c70d43 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 30 Oct 2024 19:26:13 -0300 Subject: [PATCH 6/7] remove unrelated file --- .../Impl/Parquet/ParquetFilterCondition.h | 49 ------------------- 1 file changed, 49 deletions(-) delete mode 100644 src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.h diff --git a/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.h b/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.h deleted file mode 100644 index a09eaa9ced0e..000000000000 --- a/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.h +++ /dev/null @@ -1,49 +0,0 @@ -#pragma once - -#include - -#if USE_PARQUET - -#include - -namespace DB -{ - -class ParquetFilterCondition -{ - struct ConditionElement - { - enum Function - { - /// Atoms of a Boolean expression. - FUNCTION_EQUALS, - FUNCTION_NOT_EQUALS, - FUNCTION_IN, - FUNCTION_NOT_IN, - /// Can take any value. - FUNCTION_UNKNOWN, - /// Operators of the logical expression. - FUNCTION_NOT, - FUNCTION_AND, - FUNCTION_OR, - /// Constants - ALWAYS_FALSE, - ALWAYS_TRUE, - }; - - using ColumnPtr = IColumn::Ptr; - using HashesForColumns = std::vector>; - using KeyColumns = std::vector; - - Function function; - // each entry represents a list of hashes per column - // suppose there are three columns with 2 rows each - // hashes_per_column.size() == 3 and hashes_per_column[0].size() == 2 - HashesForColumns hashes_per_column; - KeyColumns key_columns; - }; -}; - -} - -#endif From 79d7cebc6c2d17b15ff9fd22559f8c3946a2dc2f Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 30 Oct 2024 19:26:37 -0300 Subject: [PATCH 7/7] remove unrelated file --- .../Formats/Impl/Parquet/ParquetFilterCondition.cpp | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.cpp diff --git a/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.cpp b/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.cpp deleted file mode 100644 index 27be594d3c24..000000000000 --- a/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.cpp +++ /dev/null @@ -1,5 +0,0 @@ -// -// Created by laptop on 10/29/24. -// - -#include "ParquetFilterCondition.h"