diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp index b8e4db8700cc..b471989076b4 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp @@ -296,6 +296,31 @@ void ParquetPlainValuesReader::readBatch( ); } +template +void ParquetBitPlainReader::readBatch( + MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) +{ + auto cursor = col_ptr->size(); + auto * column_data = getResizedPrimitiveData(*assert_cast(col_ptr.get()), cursor + num_values); + + def_level_reader->visitNullableValues( + cursor, + num_values, + max_def_level, + null_map, + /* individual_visitor */ [&](size_t nest_cursor) + { + uint8_t byte; + bit_reader->GetValue(1, &byte); + column_data[nest_cursor] = byte; + }, + /* repeated_visitor */ [&](size_t nest_cursor, UInt32 count) + { + bit_reader->GetBatch(1, &column_data[nest_cursor], count); + } + ); +} + template <> void ParquetPlainValuesReader, ParquetReaderTypes::TimestampInt96>::readBatch( @@ -561,6 +586,9 @@ template class ParquetPlainValuesReader>; template class ParquetPlainValuesReader>; template class ParquetPlainValuesReader>; template class ParquetPlainValuesReader; +template class ParquetPlainValuesReader; + +template class ParquetBitPlainReader; template class ParquetFixedLenPlainReader>; template class ParquetFixedLenPlainReader>; @@ -569,6 +597,7 @@ template class ParquetRleLCReader; template class ParquetRleLCReader; template class ParquetRleLCReader; +template class ParquetRleDictReader; template class ParquetRleDictReader; template class ParquetRleDictReader; template class ParquetRleDictReader; diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h index fbccb612b3c0..db55f7e2d6ab 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h @@ -172,6 +172,27 @@ class ParquetPlainValuesReader : public ParquetDataValuesReader ParquetDataBuffer plain_data_buffer; }; +template +class ParquetBitPlainReader : public ParquetDataValuesReader +{ +public: + ParquetBitPlainReader( + Int32 max_def_level_, + std::unique_ptr def_level_reader_, + std::unique_ptr bit_reader_) + : max_def_level(max_def_level_) + , def_level_reader(std::move(def_level_reader_)) + , bit_reader(std::move(bit_reader_)) + {} + + void readBatch(MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) override; + +private: + Int32 max_def_level; + std::unique_ptr def_level_reader; + std::unique_ptr bit_reader; +}; + /** * The data and definition level encoding are same as ParquetPlainValuesReader. * But the element size is const and bigger than primitive data type. diff --git a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp index 9e1cae9bb657..2dcd01737ed3 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp @@ -458,16 +458,29 @@ void ParquetLeafColReader::readPageV1(const parquet::DataPageV1 & page) degradeDictionary(); } - ParquetDataBuffer parquet_buffer = [&]() + if (col_descriptor.physical_type() == parquet::Type::BOOLEAN) { - if constexpr (!std::is_same_v, TColumn>) - return ParquetDataBuffer(buffer, max_size); - - auto scale = assert_cast(*base_data_type).getScale(); - return ParquetDataBuffer(buffer, max_size, scale); - }(); - data_values_reader = createPlainReader( - col_descriptor, std::move(def_level_reader), std::move(parquet_buffer)); + if constexpr (std::is_same_v) + { + auto bit_reader = std::make_unique(buffer, max_size); + data_values_reader = std::make_unique>(col_descriptor.max_definition_level(), + std::move(def_level_reader), + std::move(bit_reader)); + } + } + else + { + ParquetDataBuffer parquet_buffer = [&]() + { + if constexpr (!std::is_same_v, TColumn>) + return ParquetDataBuffer(buffer, max_size); + + auto scale = assert_cast(*base_data_type).getScale(); + return ParquetDataBuffer(buffer, max_size, scale); + }(); + data_values_reader = createPlainReader( + col_descriptor, std::move(def_level_reader), std::move(parquet_buffer)); + } break; } case parquet::Encoding::RLE_DICTIONARY: @@ -518,6 +531,12 @@ std::unique_ptr ParquetLeafColReader::createDi }); return res; } + + if (col_descriptor.physical_type() == parquet::Type::type::BOOLEAN) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Dictionary encoding for booleans is not supported"); + } + return std::make_unique>( col_descriptor.max_definition_level(), std::move(def_level_reader), @@ -526,6 +545,7 @@ std::unique_ptr ParquetLeafColReader::createDi } +template class ParquetLeafColReader; template class ParquetLeafColReader; template class ParquetLeafColReader; template class ParquetLeafColReader; diff --git a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp index 18a0db7484ec..74107df4e26b 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp @@ -265,7 +265,7 @@ std::unique_ptr ColReaderFactory::makeReader() switch (col_descriptor.physical_type()) { case parquet::Type::BOOLEAN: - break; + return makeLeafReader(); case parquet::Type::INT32: return fromInt32(); case parquet::Type::INT64: diff --git a/tests/queries/0_stateless/03254_parquet_bool_native_reader.reference b/tests/queries/0_stateless/03254_parquet_bool_native_reader.reference new file mode 100644 index 000000000000..0c7e55ad2344 --- /dev/null +++ b/tests/queries/0_stateless/03254_parquet_bool_native_reader.reference @@ -0,0 +1,20 @@ +0 false +1 \N +2 false +3 \N +4 false +5 \N +6 false +7 \N +8 true +9 \N +0 false +1 \N +2 false +3 \N +4 false +5 \N +6 false +7 \N +8 true +9 \N diff --git a/tests/queries/0_stateless/03254_parquet_bool_native_reader.sh b/tests/queries/0_stateless/03254_parquet_bool_native_reader.sh new file mode 100755 index 000000000000..c28523b3c549 --- /dev/null +++ b/tests/queries/0_stateless/03254_parquet_bool_native_reader.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# Tags: no-ubsan, no-fasttest + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +USER_FILES_PATH=$($CLICKHOUSE_CLIENT_BINARY --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') + +WORKING_DIR="${USER_FILES_PATH}/${CLICKHOUSE_TEST_UNIQUE_NAME}" + +mkdir -p "${WORKING_DIR}" + +DATA_FILE="${CUR_DIR}/data_parquet/nullbool.parquet" + +DATA_FILE_USER_PATH="${WORKING_DIR}/nullbool.parquet" + +cp ${DATA_FILE} ${DATA_FILE_USER_PATH} + +${CLICKHOUSE_CLIENT} --query="select id, bool from file('${DATA_FILE_USER_PATH}', Parquet) order by id SETTINGS input_format_parquet_use_native_reader=false;" +${CLICKHOUSE_CLIENT} --query="select id, bool from file('${DATA_FILE_USER_PATH}', Parquet) order by id SETTINGS input_format_parquet_use_native_reader=true;" diff --git a/tests/queries/0_stateless/data_parquet/nullbool.parquet b/tests/queries/0_stateless/data_parquet/nullbool.parquet new file mode 100644 index 000000000000..d9b365bbe75b Binary files /dev/null and b/tests/queries/0_stateless/data_parquet/nullbool.parquet differ